[Refactor] Change evaluation script path (#1165)
0   embedchain/evaluation/__init__.py   Normal file
29  embedchain/evaluation/base.py       Normal file
@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod

from embedchain.utils.evaluation import EvalData


class BaseMetric(ABC):
    """Base class for a metric.

    This class provides a common interface for all metrics.
    """

    def __init__(self, name: str = "base_metric"):
        """
        Initialize the BaseMetric.
        """
        self.name = name

    @abstractmethod
    def evaluate(self, dataset: list[EvalData]):
        """
        Abstract method to evaluate the dataset.

        This method should be implemented by subclasses to perform the actual
        evaluation on the dataset.

        :param dataset: dataset to evaluate
        :type dataset: list[EvalData]
        """
        raise NotImplementedError()
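For orientation, a minimal sketch of a custom metric built on this interface; the AnswerLength name and its scoring rule are hypothetical and not part of this commit:

# Hypothetical example (not part of this commit): a trivial metric subclassing BaseMetric.
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData


class AnswerLength(BaseMetric):
    """Reports the average answer length in words; useful only as an interface demo."""

    def __init__(self, name: str = "answer_length"):
        super().__init__(name=name)

    def evaluate(self, dataset: list[EvalData]) -> float:
        if not dataset:
            return 0.0
        return sum(len(data.answer.split()) for data in dataset) / len(dataset)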
3  embedchain/evaluation/metrics/__init__.py  Normal file
@@ -0,0 +1,3 @@
from .answer_relevancy import AnswerRelevance  # noqa: F401
from .context_relevancy import ContextRelevance  # noqa: F401
from .groundedness import Groundedness  # noqa: F401
93  embedchain/evaluation/metrics/answer_relevancy.py  Normal file
@@ -0,0 +1,93 @@
import concurrent.futures
import logging
import os
from string import Template
from typing import Optional

import numpy as np
from openai import OpenAI
from tqdm import tqdm

from embedchain.config.evaluation.base import AnswerRelevanceConfig
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData, EvalMetric


class AnswerRelevance(BaseMetric):
    """
    Metric for evaluating the relevance of answers.
    """

    def __init__(self, config: Optional[AnswerRelevanceConfig] = AnswerRelevanceConfig()):
        super().__init__(name=EvalMetric.ANSWER_RELEVANCY.value)
        self.config = config
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("API key not found. Set 'OPENAI_API_KEY' or pass it in the config.")
        self.client = OpenAI(api_key=api_key)

    def _generate_prompt(self, data: EvalData) -> str:
        """
        Generates a prompt based on the provided data.
        """
        return Template(self.config.prompt).substitute(
            num_gen_questions=self.config.num_gen_questions, answer=data.answer
        )

    def _generate_questions(self, prompt: str) -> list[str]:
        """
        Generates questions from the prompt.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content.strip().split("\n")

    def _generate_embedding(self, question: str) -> np.ndarray:
        """
        Generates the embedding for a question.
        """
        response = self.client.embeddings.create(
            input=question,
            model=self.config.embedder,
        )
        return np.array(response.data[0].embedding)

    def _compute_similarity(self, original: np.ndarray, generated: np.ndarray) -> np.ndarray:
        """
        Computes the cosine similarity between the original embedding and each generated embedding.
        """
        original = original.reshape(1, -1)
        norm = np.linalg.norm(original) * np.linalg.norm(generated, axis=1)
        return np.dot(generated, original.T).flatten() / norm

    def _compute_score(self, data: EvalData) -> float:
        """
        Computes the relevance score for a given data item.
        """
        prompt = self._generate_prompt(data)
        generated_questions = self._generate_questions(prompt)
        original_embedding = self._generate_embedding(data.question)
        generated_embeddings = np.array([self._generate_embedding(q) for q in generated_questions])
        similarities = self._compute_similarity(original_embedding, generated_embeddings)
        return np.mean(similarities)

    def evaluate(self, dataset: list[EvalData]) -> float:
        """
        Evaluates the dataset and returns the average answer relevance score.
        """
        results = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_data = {executor.submit(self._compute_score, data): data for data in dataset}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_data), total=len(dataset), desc="Evaluating Answer Relevancy"
            ):
                data = future_to_data[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    logging.error(f"Error evaluating answer relevancy for {data}: {e}")

        return np.mean(results) if results else 0.0
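A usage sketch, not part of this commit: the question/contexts/answer field names come from the code above, while the sample strings and keyword-style EvalData construction are illustrative assumptions.

# Illustrative usage sketch; requires OPENAI_API_KEY and makes real API calls.
from embedchain.evaluation.metrics import AnswerRelevance
from embedchain.utils.evaluation import EvalData

dataset = [
    EvalData(
        question="What is the capital of France?",
        contexts=["Paris is the capital and largest city of France."],
        answer="The capital of France is Paris.",
    ),
]

metric = AnswerRelevance()  # default AnswerRelevanceConfig
score = metric.evaluate(dataset)  # mean cosine similarity in [-1, 1]; near 1 for relevant answers
print(f"answer relevancy: {score:.3f}")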
69  embedchain/evaluation/metrics/context_relevancy.py  Normal file
@@ -0,0 +1,69 @@
import concurrent.futures
import os
from string import Template
from typing import Optional

import numpy as np
import pysbd
from openai import OpenAI
from tqdm import tqdm

from embedchain.config.evaluation.base import ContextRelevanceConfig
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData, EvalMetric


class ContextRelevance(BaseMetric):
    """
    Metric for evaluating the relevance of context in a dataset.
    """

    def __init__(self, config: Optional[ContextRelevanceConfig] = ContextRelevanceConfig()):
        super().__init__(name=EvalMetric.CONTEXT_RELEVANCY.value)
        self.config = config
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("API key not found. Set 'OPENAI_API_KEY' or pass it in the config.")
        self.client = OpenAI(api_key=api_key)
        self._sbd = pysbd.Segmenter(language=self.config.language, clean=False)

    def _sentence_segmenter(self, text: str) -> list[str]:
        """
        Segments the given text into sentences.
        """
        return self._sbd.segment(text)

    def _compute_score(self, data: EvalData) -> float:
        """
        Computes the context relevance score for a given data item.
        """
        original_context = "\n".join(data.contexts)
        prompt = Template(self.config.prompt).substitute(context=original_context, question=data.question)
        response = self.client.chat.completions.create(
            model=self.config.model, messages=[{"role": "user", "content": prompt}]
        )
        useful_context = response.choices[0].message.content.strip()
        useful_context_sentences = self._sentence_segmenter(useful_context)
        original_context_sentences = self._sentence_segmenter(original_context)

        if not original_context_sentences:
            return 0.0
        return len(useful_context_sentences) / len(original_context_sentences)

    def evaluate(self, dataset: list[EvalData]) -> float:
        """
        Evaluates the dataset and returns the average context relevance score.
        """
        scores = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(self._compute_score, data) for data in dataset]
            for future in tqdm(
                concurrent.futures.as_completed(futures), total=len(dataset), desc="Evaluating Context Relevancy"
            ):
                try:
                    scores.append(future.result())
                except Exception as e:
                    print(f"Error during evaluation: {e}")

        return np.mean(scores) if scores else 0.0
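The score above is simply the ratio of sentences the judge model keeps to sentences in the original context. A standalone sketch of that arithmetic, with a hard-coded useful_context standing in for the model response (an assumption for illustration):

# Standalone sketch of the sentence-ratio computation; the "useful" text is hard-coded
# here in place of the judge model's response.
import pysbd

segmenter = pysbd.Segmenter(language="en", clean=False)

original_context = (
    "Paris is the capital of France. It lies on the Seine. "
    "The city hosts the Louvre. French cuisine is famous worldwide."
)
useful_context = "Paris is the capital of France. It lies on the Seine."

useful = segmenter.segment(useful_context)      # 2 sentences
original = segmenter.segment(original_context)  # 4 sentences

score = len(useful) / len(original) if original else 0.0
print(score)  # 0.5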
102  embedchain/evaluation/metrics/groundedness.py  Normal file
@@ -0,0 +1,102 @@
import concurrent.futures
import logging
import os
from string import Template
from typing import Optional

import numpy as np
from openai import OpenAI
from tqdm import tqdm

from embedchain.config.evaluation.base import GroundednessConfig
from embedchain.evaluation.base import BaseMetric
from embedchain.utils.evaluation import EvalData, EvalMetric


class Groundedness(BaseMetric):
    """
    Metric for evaluating the groundedness of an answer in the given contexts.
    """

    def __init__(self, config: Optional[GroundednessConfig] = None):
        super().__init__(name=EvalMetric.GROUNDEDNESS.value)
        self.config = config or GroundednessConfig()
        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("Please set the OPENAI_API_KEY environment variable or pass the `api_key` in config.")
        self.client = OpenAI(api_key=api_key)

    def _generate_answer_claim_prompt(self, data: EvalData) -> str:
        """
        Generate the prompt for the given data.
        """
        prompt = Template(self.config.answer_claims_prompt).substitute(question=data.question, answer=data.answer)
        return prompt

    def _get_claim_statements(self, prompt: str) -> np.ndarray:
        """
        Get claim statements from the answer.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=[{"role": "user", "content": f"{prompt}"}],
        )
        result = response.choices[0].message.content.strip()
        claim_statements = np.array([statement for statement in result.split("\n") if statement])
        return claim_statements

    def _generate_claim_inference_prompt(self, data: EvalData, claim_statements: list[str]) -> str:
        """
        Generate the claim inference prompt for the given data and claim statements.
        """
        prompt = Template(self.config.claims_inference_prompt).substitute(
            context="\n".join(data.contexts), claim_statements="\n".join(claim_statements)
        )
        return prompt

    def _get_claim_verdict_scores(self, prompt: str) -> np.ndarray:
        """
        Get verdicts for claim statements.
        """
        response = self.client.chat.completions.create(
            model=self.config.model,
            messages=[{"role": "user", "content": f"{prompt}"}],
        )
        result = response.choices[0].message.content.strip()
        claim_verdicts = result.split("\n")
        verdict_score_map = {"1": 1, "0": 0, "-1": np.nan}
        verdict_scores = np.array([verdict_score_map[verdict] for verdict in claim_verdicts])
        return verdict_scores

    def _compute_score(self, data: EvalData) -> float:
        """
        Compute the groundedness score for a single data point.
        """
        answer_claims_prompt = self._generate_answer_claim_prompt(data)
        claim_statements = self._get_claim_statements(answer_claims_prompt)

        claim_inference_prompt = self._generate_claim_inference_prompt(data, claim_statements)
        verdict_scores = self._get_claim_verdict_scores(claim_inference_prompt)
        return np.sum(verdict_scores) / claim_statements.size

    def evaluate(self, dataset: list[EvalData]):
        """
        Evaluate the dataset and return the average groundedness score.
        """
        results = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_data = {executor.submit(self._compute_score, data): data for data in dataset}
            for future in tqdm(
                concurrent.futures.as_completed(future_to_data),
                total=len(future_to_data),
                desc="Evaluating Groundedness",
            ):
                data = future_to_data[future]
                try:
                    score = future.result()
                    results.append(score)
                except Exception as e:
                    logging.error(f"Error while evaluating groundedness for data point {data}: {e}")

        return np.mean(results) if results else 0.0
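For intuition, the verdict post-processing above maps each claim verdict to 1 (supported), 0 (not supported), or NaN (indeterminate) and divides the sum by the number of claims. A standalone sketch with hard-coded verdict strings (the verdicts themselves are illustrative assumptions):

# Standalone sketch of the verdict-to-score step; verdict strings are hard-coded for illustration.
import numpy as np

claim_verdicts = ["1", "1", "0", "-1"]  # one verdict per claim statement
verdict_score_map = {"1": 1, "0": 0, "-1": np.nan}

verdict_scores = np.array([verdict_score_map[v] for v in claim_verdicts])
# Note: with the plain np.sum used in _compute_score above, any NaN verdict makes the
# whole score NaN; np.nansum (used here) instead ignores indeterminate claims.
score = np.nansum(verdict_scores) / verdict_scores.size
print(score)  # 0.5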