[Refactor] Change evaluation script path (#1165)

Deshraj Yadav
2024-01-12 21:29:59 +05:30
committed by GitHub
parent 862ff6cca6
commit affe319460
21 changed files with 50 additions and 45 deletions

@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod

from embedchain.utils.evaluation import EvalData


class BaseMetric(ABC):
    """Base class for a metric.

    This class provides a common interface for all metrics.
    """

    def __init__(self, name: str = "base_metric"):
        """
        Initialize the BaseMetric.
        """
        self.name = name

    @abstractmethod
    def evaluate(self, dataset: list[EvalData]):
        """
        Abstract method to evaluate the dataset.

        This method should be implemented by subclasses to perform the actual
        evaluation on the dataset.

        :param dataset: dataset to evaluate
        :type dataset: list[EvalData]
        """
        raise NotImplementedError()
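
For orientation only (not part of this commit): a minimal sketch of a concrete metric built on this interface. It assumes EvalData exposes the answer field, as the metrics added in this commit do.

# Hypothetical example, not in this diff: a toy metric subclassing BaseMetric.
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData


class AnswerLengthMetric(BaseMetric):
    """Toy metric: average answer length in words."""

    def __init__(self):
        super().__init__(name="answer_length")

    def evaluate(self, dataset: list[EvalData]) -> float:
        # Assumes EvalData has an `answer` attribute, as used by the metrics below.
        if not dataset:
            return 0.0
        return sum(len(data.answer.split()) for data in dataset) / len(dataset)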

@@ -0,0 +1,3 @@
from .answer_relevancy import AnswerRelevance # noqa: F401
from .context_relevancy import ContextRelevance # noqa: F401
from .groundedness import Groundedness # noqa: F401

@@ -0,0 +1,93 @@
import concurrent.futures
import logging
import os
from string import Template
from typing import Optional

import numpy as np
from openai import OpenAI
from tqdm import tqdm

from embedchain.config.evaluation.base import AnswerRelevanceConfig
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData, EvalMetric


class AnswerRelevance(BaseMetric):
    """
    Metric for evaluating the relevance of answers.
    """

    def __init__(self, config: Optional[AnswerRelevanceConfig] = AnswerRelevanceConfig()):
        super().__init__(name=EvalMetric.ANSWER_RELEVANCY.value)
        self.config = config
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("API key not found. Set 'OPENAI_API_KEY' or pass it in the config.")
        self.client = OpenAI(api_key=api_key)

    def _generate_prompt(self, data: EvalData) -> str:
        """
        Generates a prompt based on the provided data.
        """
        return Template(self.config.prompt).substitute(
            num_gen_questions=self.config.num_gen_questions, answer=data.answer
        )

    def _generate_questions(self, prompt: str) -> list[str]:
        """
        Generates questions from the prompt.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content.strip().split("\n")

    def _generate_embedding(self, question: str) -> np.ndarray:
        """
        Generates the embedding for a question.
        """
        response = self.client.embeddings.create(
            input=question,
            model=self.config.embedder,
        )
        return np.array(response.data[0].embedding)

    def _compute_similarity(self, original: np.ndarray, generated: np.ndarray) -> np.ndarray:
        """
        Computes the cosine similarity between the original embedding and each generated embedding.
        """
        original = original.reshape(1, -1)
        norm = np.linalg.norm(original) * np.linalg.norm(generated, axis=1)
        return np.dot(generated, original.T).flatten() / norm

    def _compute_score(self, data: EvalData) -> float:
        """
        Computes the relevance score for a given data item.
        """
        prompt = self._generate_prompt(data)
        generated_questions = self._generate_questions(prompt)
        original_embedding = self._generate_embedding(data.question)
        generated_embeddings = np.array([self._generate_embedding(q) for q in generated_questions])
        similarities = self._compute_similarity(original_embedding, generated_embeddings)
        return np.mean(similarities)

    def evaluate(self, dataset: list[EvalData]) -> float:
        """
        Evaluates the dataset and returns the average answer relevance score.
        """
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_data = {executor.submit(self._compute_score, data): data for data in dataset}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_data), total=len(dataset), desc="Evaluating Answer Relevancy"
            ):
                data = future_to_data[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    logging.error(f"Error evaluating answer relevancy for {data}: {e}")
        return np.mean(results) if results else 0.0
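
A hedged usage sketch for this metric (not from the diff): the import path is assumed from the package __init__ above, EvalData is assumed to accept question/contexts/answer keyword arguments, and OPENAI_API_KEY must be set.

# Usage sketch; the import path and EvalData constructor are assumptions, not shown in this diff.
from embedchain.evaluation.metrics import AnswerRelevance
from embedchain.utils.evaluation import EvalData

dataset = [
    EvalData(
        question="What is the capital of France?",
        contexts=["Paris is the capital and largest city of France."],
        answer="The capital of France is Paris.",
    )
]

metric = AnswerRelevance()  # default AnswerRelevanceConfig; reads OPENAI_API_KEY from the environment
score = metric.evaluate(dataset)  # mean cosine similarity between the question and questions generated from the answer
print(f"answer relevancy: {score:.3f}")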

@@ -0,0 +1,69 @@
import concurrent.futures
import os
from string import Template
from typing import Optional

import numpy as np
import pysbd
from openai import OpenAI
from tqdm import tqdm

from embedchain.config.evaluation.base import ContextRelevanceConfig
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData, EvalMetric


class ContextRelevance(BaseMetric):
    """
    Metric for evaluating the relevance of context in a dataset.
    """

    def __init__(self, config: Optional[ContextRelevanceConfig] = ContextRelevanceConfig()):
        super().__init__(name=EvalMetric.CONTEXT_RELEVANCY.value)
        self.config = config
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("API key not found. Set 'OPENAI_API_KEY' or pass it in the config.")
        self.client = OpenAI(api_key=api_key)
        self._sbd = pysbd.Segmenter(language=self.config.language, clean=False)

    def _sentence_segmenter(self, text: str) -> list[str]:
        """
        Segments the given text into sentences.
        """
        return self._sbd.segment(text)

    def _compute_score(self, data: EvalData) -> float:
        """
        Computes the context relevance score for a given data item.
        """
        original_context = "\n".join(data.contexts)
        prompt = Template(self.config.prompt).substitute(context=original_context, question=data.question)
        response = self.client.chat.completions.create(
            model=self.config.model, messages=[{"role": "user", "content": prompt}]
        )
        useful_context = response.choices[0].message.content.strip()
        useful_context_sentences = self._sentence_segmenter(useful_context)
        original_context_sentences = self._sentence_segmenter(original_context)
        if not original_context_sentences:
            return 0.0
        return len(useful_context_sentences) / len(original_context_sentences)

    def evaluate(self, dataset: list[EvalData]) -> float:
        """
        Evaluates the dataset and returns the average context relevance score.
        """
        scores = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self._compute_score, data) for data in dataset]
            for future in tqdm(
                concurrent.futures.as_completed(futures), total=len(dataset), desc="Evaluating Context Relevancy"
            ):
                try:
                    scores.append(future.result())
                except Exception as e:
                    print(f"Error during evaluation: {e}")
        return np.mean(scores) if scores else 0.0
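
The score here is a sentence-count ratio: sentences the model judges useful divided by sentences in the original context. A standalone sketch of just that ratio with pysbd (no OpenAI call, example texts made up), to make the arithmetic concrete:

# Ratio illustration only, mirroring _compute_score above.
import pysbd

segmenter = pysbd.Segmenter(language="en", clean=False)

original_context = (
    "Paris is the capital of France. It lies on the Seine. "
    "The city hosted the 1900 and 1924 Summer Olympics."
)
useful_context = "Paris is the capital of France."  # what the model might keep for "What is the capital of France?"

score = len(segmenter.segment(useful_context)) / len(segmenter.segment(original_context))
print(score)  # 1 useful sentence out of 3 -> ~0.33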

@@ -0,0 +1,102 @@
import concurrent.futures
import logging
import os
from string import Template
from typing import Optional

import numpy as np
from openai import OpenAI
from tqdm import tqdm

from embedchain.config.evaluation.base import GroundednessConfig
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData, EvalMetric


class Groundedness(BaseMetric):
    """
    Metric for evaluating the groundedness of an answer against the given contexts.
    """

    def __init__(self, config: Optional[GroundednessConfig] = None):
        super().__init__(name=EvalMetric.GROUNDEDNESS.value)
        self.config = config or GroundednessConfig()
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("Please set the OPENAI_API_KEY environment variable or pass the `api_key` in config.")
        self.client = OpenAI(api_key=api_key)

    def _generate_answer_claim_prompt(self, data: EvalData) -> str:
        """
        Generate the prompt for the given data.
        """
        prompt = Template(self.config.answer_claims_prompt).substitute(question=data.question, answer=data.answer)
        return prompt

    def _get_claim_statements(self, prompt: str) -> np.ndarray:
        """
        Get claim statements from the answer.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=[{"role": "user", "content": f"{prompt}"}],
        )
        result = response.choices[0].message.content.strip()
        claim_statements = np.array([statement for statement in result.split("\n") if statement])
        return claim_statements

    def _generate_claim_inference_prompt(self, data: EvalData, claim_statements: list[str]) -> str:
        """
        Generate the claim inference prompt for the given data and claim statements.
        """
        prompt = Template(self.config.claims_inference_prompt).substitute(
            context="\n".join(data.contexts), claim_statements="\n".join(claim_statements)
        )
        return prompt

    def _get_claim_verdict_scores(self, prompt: str) -> np.ndarray:
        """
        Get verdicts for claim statements.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=[{"role": "user", "content": f"{prompt}"}],
        )
        result = response.choices[0].message.content.strip()
        claim_verdicts = result.split("\n")
        verdict_score_map = {"1": 1, "0": 0, "-1": np.nan}
        verdict_scores = np.array([verdict_score_map[verdict] for verdict in claim_verdicts])
        return verdict_scores

    def _compute_score(self, data: EvalData) -> float:
        """
        Compute the groundedness score for a single data point.
        """
        answer_claims_prompt = self._generate_answer_claim_prompt(data)
        claim_statements = self._get_claim_statements(answer_claims_prompt)
        claim_inference_prompt = self._generate_claim_inference_prompt(data, claim_statements)
        verdict_scores = self._get_claim_verdict_scores(claim_inference_prompt)
        return np.sum(verdict_scores) / claim_statements.size

    def evaluate(self, dataset: list[EvalData]) -> float:
        """
        Evaluates the dataset and returns the average groundedness score.
        """
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_data = {executor.submit(self._compute_score, data): data for data in dataset}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_data),
                total=len(future_to_data),
                desc="Evaluating Groundedness",
            ):
                data = future_to_data[future]
                try:
                    score = future.result()
                    results.append(score)
                except Exception as e:
                    logging.error(f"Error while evaluating groundedness for data point {data}: {e}")
        return np.mean(results) if results else 0.0
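
Taken together with the package __init__ above, the three metrics share one evaluate(dataset) interface: claims are extracted from the answer, each claim gets a verdict, and the groundedness score is supported claims over total claims per item. A hedged end-to-end sketch (import path and EvalData constructor assumed, as in the earlier sketches; requires OPENAI_API_KEY):

# Sketch only: run all three metrics added in this commit over one dataset.
from embedchain.evaluation.metrics import AnswerRelevance, ContextRelevance, Groundedness
from embedchain.utils.evaluation import EvalData

dataset = [
    EvalData(
        question="Who wrote Hamlet?",
        contexts=["Hamlet is a tragedy written by William Shakespeare around 1600."],
        answer="Hamlet was written by William Shakespeare.",
    )
]

for metric in (AnswerRelevance(), ContextRelevance(), Groundedness()):
    # Each evaluate() call returns the mean score over the dataset.
    print(metric.name, metric.evaluate(dataset))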