From e2cca61cd3f6d3a51f0c685b13fdeb8f1fef63e2 Mon Sep 17 00:00:00 2001
From: Deven Patel <iamdevenpatel@gmail.com>
Date: Thu, 11 Jan 2024 20:02:47 +0530
Subject: [PATCH] [Feature] Add support for RAG evaluation (#1154)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
---
 docs/api-reference/pipeline/evaluate.mdx     |  41 ++++
 docs/components/evaluation.mdx               | 208 +++++++++++++++++++
 docs/mint.json                               |   6 +-
 embedchain/app.py                            | 108 +++++++++-
 embedchain/config/eval/__init__.py           |   2 +
 embedchain/config/eval/base.py               |  92 ++++++++
 embedchain/embedchain.py                     |  12 +-
 embedchain/eval/__init__.py                  |   0
 embedchain/eval/base.py                      |  29 +++
 embedchain/eval/metrics/__init__.py          |   3 +
 embedchain/eval/metrics/answer_relevancy.py  |  93 +++++++++
 embedchain/eval/metrics/context_relevancy.py |  69 ++++++
 embedchain/eval/metrics/groundedness.py      | 102 +++++++++
 embedchain/utils/eval.py                     |  17 ++
 embedchain/utils/misc.py                     |   3 +-
 embedchain/vectordb/zilliz.py                |  11 +-
 poetry.lock                                  |  12 +-
 pyproject.toml                               |   1 +
 18 files changed, 788 insertions(+), 21 deletions(-)
 create mode 100644 docs/api-reference/pipeline/evaluate.mdx
 create mode 100644 docs/components/evaluation.mdx
 create mode 100644 embedchain/config/eval/__init__.py
 create mode 100644 embedchain/config/eval/base.py
 create mode 100644 embedchain/eval/__init__.py
 create mode 100644 embedchain/eval/base.py
 create mode 100644 embedchain/eval/metrics/__init__.py
 create mode 100644 embedchain/eval/metrics/answer_relevancy.py
 create mode 100644 embedchain/eval/metrics/context_relevancy.py
 create mode 100644 embedchain/eval/metrics/groundedness.py
 create mode 100644 embedchain/utils/eval.py
diff --git a/docs/api-reference/pipeline/evaluate.mdx b/docs/api-reference/pipeline/evaluate.mdx
new file mode 100644
index 00000000..64cb612c
--- /dev/null
+++ b/docs/api-reference/pipeline/evaluate.mdx
@@ -0,0 +1,41 @@
+---
+title: '📝 evaluate'
+---
+
+The `evaluate()` method is used to evaluate the performance of a RAG app. Its parameters and return value are described below:
+
+### Parameters
+
+<ParamField path="questions" type="Union[str, list[str]]">
+    A question or a list of questions to evaluate your app on.
+</ParamField>
+<ParamField path="metrics" type="Optional[list[Union[BaseMetric, str]]]" optional>
+    The metrics to evaluate your app on. Defaults to all metrics: `["context_relevancy", "answer_relevancy", "groundedness"]`
+</ParamField>
+<ParamField path="num_workers" type="int" optional>
+    Specify the number of threads to use for parallel processing. Defaults to `4`.
+</ParamField>
+
+### Returns
+
+<ResponseField name="metrics" type="dict">
+    Returns the metrics you have chosen to evaluate your app on as a dictionary.
+</ResponseField>
+
+## Usage
+
+```python
+from embedchain import App
+
+app = App()
+
+# add data source
+app.add("https://www.forbes.com/profile/elon-musk")
+
+# run evaluation
+app.evaluate("what is the net worth of Elon Musk?")
+# {'answer_relevancy': 0.958019958036268, 'context_relevancy': 0.12903225806451613}
+
+# or
+# app.evaluate(["what is the net worth of Elon Musk?", "which companies does Elon Musk own?"])
+```
diff --git a/docs/components/evaluation.mdx b/docs/components/evaluation.mdx
new file mode 100644
index 00000000..dd8332ea
--- /dev/null
+++ b/docs/components/evaluation.mdx
@@ -0,0 +1,208 @@
+---
+title: 🔬 Evaluation
+---
+
+## Overview
+
+We provide out-of-the-box evaluation methods for your datasets. You can use them to evaluate your models and compare them with other models.
+
+Currently, we provide the following evaluation methods:
+
+<CardGroup cols={3}>
+    <Card title="Context Relevancy" href="#context_relevancy"></Card>
+    <Card title="Answer Relevancy" href="#answer_relevancy"></Card>
+    <Card title="Groundedness" href="#groundedness"></Card>
+    <Card title="Custom" href="#custom"></Card>
+</CardGroup>
+
+More evaluation metrics are coming soon! 🏗️
+
+## Usage
+
+We have found that the best way to evaluate datasets is with the help of OpenAI's `gpt-4` model. Hence, we require you to set `OPENAI_API_KEY` as an environment variable. If you don't want to set it, you can pass it in the config argument of the respective evaluation class, as shown in the examples later below.
+
+<Accordion title="We will assume the following dataset for the examples below">
+<CodeGroup>
+```python main.py
+from embedchain.utils.eval import EvalData
+
+data = [
+    {
+        "question": "What is the net worth of Elon Musk?",
+        "contexts": [
+            """Elon Musk PROFILEElon MuskCEO, ...""",
+            """a Twitter poll on whether the journalists' ...""",
+            """2016 and run by Jared Birchall.[335]...""",
+        ],
+        "answer": "As of the information provided, Elon Musk's net worth is $241.6 billion.",
+    },
+    {
+        "question": "which companies does Elon Musk own?",
+        "contexts": [
+            """of December 2023[update], ...""",
+            """ThielCofounderView ProfileTeslaHolds ...""",
+            """Elon Musk PROFILEElon MuskCEO, ...""",
+        ],
+        "answer": "Elon Musk owns several companies, including Tesla, SpaceX, Neuralink, and The Boring Company.",
+    },
+]
+
+dataset = []
+
+for d in data:
+    dataset.append(EvalData(question=d["question"], contexts=d["contexts"], answer=d["answer"]))
+```
+</CodeGroup>
+</Accordion>
+
+## Context Relevancy <a id="context_relevancy"></a>
+
+Context relevancy is a metric to determine how relevant the context is to the question. We use OpenAI's `gpt-4` model to determine the relevancy of the context.
+We achieve this by prompting the model with the question and the context and asking it to return relevant sentences from the context. We then use the following formula to determine the score:
+
+context_relevance_score = (# of relevant sentences in context) $$\div$$ (total # of sentences in context)
+
+You can run the context relevancy evaluation with the following simple code:
+
+```python
+from embedchain.eval.metrics import ContextRelevance
+metric = ContextRelevance()
+score = metric.evaluate(dataset)    # dataset from above
+print(score)
+# 0.27975528364849833
+```
+
+In the above example, we used sensible defaults for the evaluation. However, you can also configure the evaluation metric as per your needs using the `ContextRelevanceConfig` class. 
+
+### ContextRelevanceConfig
+
+<ParamField path="model" type="str" optional>
+    The model to use for the evaluation. Defaults to `gpt-4`. We only support openai's models for now.
+</ParamField>
+<ParamField path="api_key" type="str" optional>
+    The openai api key to use for the evaluation. Defaults to `None`. If not provided, we will use the `OPENAI_API_KEY` environment variable.
+</ParamField>
+<ParamField path="language" type="str" optional>
+    The language of the dataset being evaluated. We need this to correctly split the context provided in the dataset into sentences. Defaults to `en`.
+</ParamField>
+<ParamField path="prompt" type="str" optional>
+    The prompt to extract the relevant sentences from the context. Defaults to `CONTEXT_RELEVANCY_PROMPT`, which can be found at `embedchain.config.eval.base` path.
+</ParamField>
+
+```python
+openai_api_key = "sk-xxx"
+metric = ContextRelevance(config=ContextRelevanceConfig(model='gpt-4', api_key=openai_api_key, language="en"))
+print(metric.evaluate(dataset))
+```
+
+
+## Answer Relevancy <a id="answer_relevancy"></a>
+
+Answer relevancy is a metric to determine how relevant the answer is to the question. We use OpenAI's `gpt-4` model to determine the relevancy of the answer.
+We achieve this by prompting the model with the answer and asking it to generate questions from the answer. We then use the cosine similarity between the generated questions and the original question to determine the score.
+
+answer_relevancy_score = mean(cosine_similarity(generated_questions, original_question))
+
+You can run the answer relevancy evaluation with the following simple code:
+
+```python
+from embedchain.eval.metrics import AnswerRelevance
+metric = AnswerRelevance()
+score = metric.evaluate(dataset)    # dataset from above
+print(score)
+# 0.9505334177461916
+```
+
+In the above example, we used sensible defaults for the evaluation. However, you can also configure the evaluation metric as per your needs using the `AnswerRelevanceConfig` class.
+
+### AnswerRelevanceConfig
+
+<ParamField path="model" type="str" optional>
+    The model to use for the evaluation. Defaults to `gpt-4`. We only support openai's models for now.
+</ParamField>
+<ParamField path="embedder" type="str" optional>
+    The embedder to use for embedding the text. Defaults to `text-embedding-ada-002`. We only support openai's embedders for now.
+</ParamField>
+<ParamField path="api_key" type="str" optional>
+    The openai api key to use for the evaluation. Defaults to `None`. If not provided, we will use the `OPENAI_API_KEY` environment variable.
+</ParamField>
+<ParamField path="num_gen_questions" type="int" optional>
+    The number of questions to generate for each answer. We use the generated questions to compare the similarity with the original question to determine the score. Defaults to `1`.
+</ParamField>
+<ParamField path="prompt" type="str" optional>
+    The prompt to extract the `num_gen_questions` number of questions from the provided answer. Defaults to `ANSWER_RELEVANCY_PROMPT`, which can be found at `embedchain.config.eval.base` path.
+</ParamField>
+
+```python
+openai_api_key = "sk-xxx"
+metric = AnswerRelevance(config=AnswerRelevanceConfig(model='gpt-4',
+                                                      embedder="text-embedding-ada-002",
+                                                      api_key=openai_api_key,
+                                                      num_gen_questions=2))
+print(metric.evaluate(dataset))
+```
+
+## Groundedness <a id="groundedness"></a>
+
+Groundedness is a metric to determine how grounded the answer is to the context. We use OpenAI's `gpt-4` model to determine the groundedness of the answer.
+We achieve this by prompting the model with the answer and asking it to generate claims from the answer. We then again prompt the model with the context and the generated claims to determine the verdict on the claims. We then use the following formula to determine the score:
+
+groundedness_score = (sum of all verdicts) $$\div$$ (total # of claims)
+
+You can run the groundedness evaluation with the following simple code:
+
+```python
+from embedchain.eval.metrics import Groundedness
+metric = Groundedness()
+score = metric.evaluate(dataset)    # dataset from above
+print(score)
+# 1.0
+```
+
+In the above example, we used sensible defaults for the evaluation. However, you can also configure the evaluation metric as per your needs using the `GroundednessConfig` class.
+
+### GroundednessConfig
+
+<ParamField path="model" type="str" optional>
+    The model to use for the evaluation. Defaults to `gpt-4`. We only support openai's models for now.
+</ParamField>
+<ParamField path="api_key" type="str" optional>
+    The openai api key to use for the evaluation. Defaults to `None`. If not provided, we will use the `OPENAI_API_KEY` environment variable.
+</ParamField>
+<ParamField path="answer_claims_prompt" type="str" optional>
+    The prompt to extract the claims from the provided answer. Defaults to `GROUNDEDNESS_ANSWER_CLAIMS_PROMPT`, which can be found at `embedchain.config.eval.base` path.
+</ParamField>
+<ParamField path="claims_inference_prompt" type="str" optional>
+    The prompt to get verdicts on the claims from the answer from the given context. Defaults to `GROUNDEDNESS_CLAIMS_INFERENCE_PROMPT`, which can be found at `embedchain.config.eval.base` path.
+</ParamField>
+
+```python
+openai_api_key = "sk-xxx"
+metric = Groundedness(config=GroundednessConfig(model='gpt-4',
+                                                api_key=openai_api_key))
+print(metric.evaluate(dataset))
+```
+
+## Custom <a id="custom"></a>
+
+You can also create your own evaluation metric by extending the `BaseMetric` class. You can find the source code for the existing metrics at `embedchain.eval.metrics` path.
+
+<Note>
+You must provide the `name` of your custom metric in the `__init__` method of your class. This name will be used to identify your metric in the evaluation report.
+</Note>
+
+```python
+from embedchain.eval.base import BaseMetric
+from embedchain.utils.eval import EvalData
+from embedchain.config.base_config import BaseConfig
+from typing import Optional
+
+class CustomMetric(BaseMetric):
+    def __init__(self, config: Optional[BaseConfig] = None):
+        super().__init__(name="custom_metric")
+
+    def evaluate(self, dataset: list[EvalData]):
+        score = 0.0
+        # write your evaluation logic here
+        return score
+```
diff --git a/docs/mint.json b/docs/mint.json
index 58315282..3ea46ece 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -131,7 +131,8 @@
         },
         "components/llms",
         "components/vector-databases",
-        "components/embedding-models"
+        "components/embedding-models",
+        "components/evaluation"
       ]
     },
     {
@@ -208,7 +209,8 @@
             "api-reference/pipeline/search",
             "api-reference/pipeline/deploy",
             "api-reference/pipeline/reset",
-            "api-reference/pipeline/delete"
+            "api-reference/pipeline/delete",
+            "api-reference/pipeline/evaluate"
           ]
         },
         "api-reference/store/openai-assistant",
diff --git a/embedchain/app.py b/embedchain/app.py
index c47775a2..a9e222fe 100644
--- a/embedchain/app.py
+++ b/embedchain/app.py
@@ -1,13 +1,15 @@
 import ast
+import concurrent.futures
 import json
 import logging
 import os
 import sqlite3
 import uuid
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import requests
 import yaml
+from tqdm import tqdm
 
 from embedchain.cache import (Config, ExactMatchEvaluation,
                               SearchDistanceEvaluation, cache,
@@ -18,11 +20,15 @@ from embedchain.constants import SQLITE_PATH
 from embedchain.embedchain import EmbedChain
 from embedchain.embedder.base import BaseEmbedder
 from embedchain.embedder.openai import OpenAIEmbedder
+from embedchain.eval.base import BaseMetric
+from embedchain.eval.metrics import (AnswerRelevance, ContextRelevance,
+                                     Groundedness)
 from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.llm.base import BaseLlm
 from embedchain.llm.openai import OpenAILlm
 from embedchain.telemetry.posthog import AnonymousTelemetry
+from embedchain.utils.eval import EvalData, EvalMetric
 from embedchain.utils.misc import validate_config
 from embedchain.vectordb.base import BaseVectorDB
 from embedchain.vectordb.chroma import ChromaDB
@@ -455,3 +461,103 @@ class App(EmbedChain):
             chunker=chunker_config_data,
             cache_config=cache_config,
         )
+
+    def _eval(self, dataset: list[EvalData], metric: Union[BaseMetric, str]):
+        """
+        Evaluate the app on a dataset for a given metric.
+        """
+        metric_str = metric.name if isinstance(metric, BaseMetric) else metric
+        eval_class_map = {
+            EvalMetric.CONTEXT_RELEVANCY.value: ContextRelevance,
+            EvalMetric.ANSWER_RELEVANCY.value: AnswerRelevance,
+            EvalMetric.GROUNDEDNESS.value: Groundedness,
+        }
+
+        if metric_str in eval_class_map:
+            return eval_class_map[metric_str]().evaluate(dataset)
+
+        # Handle the case for custom metrics
+        if isinstance(metric, BaseMetric):
+            return metric.evaluate(dataset)
+        else:
+            raise ValueError(f"Invalid metric: {metric}")
+
+    def evaluate(
+        self,
+        questions: Union[str, list[str]],
+        metrics: Optional[list[Union[BaseMetric, str]]] = None,
+        num_workers: int = 4,
+    ):
+        """
+        Evaluate the app on a question.
+
+        param: questions: A question or a list of questions to evaluate.
+        type: questions: Union[str, list[str]]
+        param: metrics: A list of metrics to evaluate. Defaults to all metrics.
+        type: metrics: Optional[list[Union[BaseMetric, str]]]
+        param: num_workers: Number of workers to use for parallel processing.
+        type: num_workers: int
+        return: A dictionary containing the evaluation results.
+        rtype: dict
+        """
+        if "OPENAI_API_KEY" not in os.environ:
+            raise ValueError("Please set the OPENAI_API_KEY environment variable with permission to use `gpt4` model.")
+
+        queries, answers, contexts = [], [], []
+        if isinstance(questions, list):
+            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+                future_to_data = {executor.submit(self.query, q, citations=True): q for q in questions}
+                for future in tqdm(
+                    concurrent.futures.as_completed(future_to_data),
+                    total=len(future_to_data),
+                    desc="Getting answer and contexts for questions",
+                ):
+                    question = future_to_data[future]
+                    queries.append(question)
+                    answer, context = future.result()
+                    answers.append(answer)
+                    contexts.append(list(map(lambda x: x[0], context)))
+        else:
+            answer, context = self.query(questions, citations=True)
+            queries = [questions]
+            answers = [answer]
+            contexts = [list(map(lambda x: x[0], context))]
+
+        metrics = metrics or [
+            EvalMetric.CONTEXT_RELEVANCY.value,
+            EvalMetric.ANSWER_RELEVANCY.value,
+            EvalMetric.GROUNDEDNESS.value,
+        ]
+
+        logging.info(f"Collecting data from {len(queries)} questions for evaluation...")
+        dataset = []
+        for q, a, c in zip(queries, answers, contexts):
+            dataset.append(EvalData(question=q, answer=a, contexts=c))
+
+        logging.info(f"Evaluating {len(dataset)} data points...")
+        result = {}
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            future_to_metric = {executor.submit(self._eval, dataset, metric): metric for metric in metrics}
+            for future in tqdm(
+                concurrent.futures.as_completed(future_to_metric),
+                total=len(future_to_metric),
+                desc="Evaluating metrics",
+            ):
+                metric = future_to_metric[future]
+                if isinstance(metric, BaseMetric):
+                    result[metric.name] = future.result()
+                else:
+                    result[metric] = future.result()
+
+        if self.config.collect_metrics:
+            telemetry_props = self._telemetry_props
+            metrics_names = []
+            for metric in metrics:
+                if isinstance(metric, BaseMetric):
+                    metrics_names.append(metric.name)
+                else:
+                    metrics_names.append(metric)
+            telemetry_props["metrics"] = metrics_names
+            self.telemetry.capture(event_name="evaluate", properties=telemetry_props)
+
+        return result
diff --git a/embedchain/config/eval/__init__.py b/embedchain/config/eval/__init__.py
new file mode 100644
index 00000000..ebbfcd0d
--- /dev/null
+++ b/embedchain/config/eval/__init__.py
@@ -0,0 +1,2 @@
+from .base import (AnswerRelevanceConfig, ContextRelevanceConfig,  # noqa: F401
+                   GroundednessConfig)
diff --git a/embedchain/config/eval/base.py b/embedchain/config/eval/base.py
new file mode 100644
index 00000000..942302dd
--- /dev/null
+++ b/embedchain/config/eval/base.py
@@ -0,0 +1,92 @@
+from typing import Optional
+
+from embedchain.config.base_config import BaseConfig
+
+ANSWER_RELEVANCY_PROMPT = """
+Please provide $num_gen_questions questions from the provided answer.
+You must provide the complete question, if are not able to provide the complete question, return empty string ("").
+Please only provide one question per line without numbers or bullets to distinguish them.
+You must only provide the questions and no other text.
+
+$answer
+"""  # noqa:E501
+
+
+CONTEXT_RELEVANCY_PROMPT = """
+Please extract relevant sentences from the provided context that is required to answer the given question.
+If no relevant sentences are found, or if you believe the question cannot be answered from the given context, return the empty string ("").
+While extracting candidate sentences you're not allowed to make any changes to sentences from given context or make up any sentences.
+You must only provide sentences from the given context and nothing else.
+
+Context: $context
+Question: $question
+"""  # noqa:E501
+
+GROUNDEDNESS_ANSWER_CLAIMS_PROMPT = """
+Please provide one or more statements from each sentence of the provided answer.
+You must provide the symantically equivalent statements for each sentence of the answer.
+You must provide the complete statement, if are not able to provide the complete statement, return empty string ("").
+Please only provide one statement per line WITHOUT numbers or bullets.
+If the question provided is not being answered in the provided answer, return empty string ("").
+You must only provide the statements and no other text.
+
+$question
+$answer
+"""  # noqa:E501
+
+GROUNDEDNESS_CLAIMS_INFERENCE_PROMPT = """
+Given the context and the provided claim statements, please provide a verdict for each claim statement whether it can be completely infered from the given context or not.
+Use only "1" (yes), "0" (no) and "-1" (null) for "yes", "no" or "null" respectively.
+You must provide one verdict per line, ONLY WITH "1", "0" or "-1" as per your verdict to the given statement and nothing else.
+You must provide the verdicts in the same order as the claim statements.
+
+Contexts: 
+$context
+
+Claim statements: 
+$claim_statements
+"""  # noqa:E501
+
+
+class GroundednessConfig(BaseConfig):
+    def __init__(
+        self,
+        model: str = "gpt-4",
+        api_key: Optional[str] = None,
+        answer_claims_prompt: str = GROUNDEDNESS_ANSWER_CLAIMS_PROMPT,
+        claims_inference_prompt: str = GROUNDEDNESS_CLAIMS_INFERENCE_PROMPT,
+    ):
+        self.model = model
+        self.api_key = api_key
+        self.answer_claims_prompt = answer_claims_prompt
+        self.claims_inference_prompt = claims_inference_prompt
+
+
+class AnswerRelevanceConfig(BaseConfig):
+    def __init__(
+        self,
+        model: str = "gpt-4",
+        embedder: str = "text-embedding-ada-002",
+        api_key: Optional[str] = None,
+        num_gen_questions: int = 1,
+        prompt: str = ANSWER_RELEVANCY_PROMPT,
+    ):
+        self.model = model
+        self.embedder = embedder
+        self.api_key = api_key
+        self.num_gen_questions = num_gen_questions
+        self.prompt = prompt
+
+
+class ContextRelevanceConfig(BaseConfig):
+    def __init__(
+        self,
+        model: str = "gpt-4",
+        api_key: Optional[str] = None,
+        language: str = "en",
+        prompt: str = CONTEXT_RELEVANCY_PROMPT,
+    ):
+        self.model = model
+        self.api_key = api_key
+        self.language = language
+        self.prompt = prompt
diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py
index d99f038f..4d8e4976 100644
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -7,12 +7,9 @@ from typing import Any, Optional, Union
 from dotenv import load_dotenv
 from langchain.docstore.document import Document
 
-from embedchain.cache import (
-    adapt,
-    get_gptcache_session,
-    gptcache_data_convert,
-    gptcache_update_cache_callback,
-)
+from embedchain.cache import (adapt, get_gptcache_session,
+                              gptcache_data_convert,
+                              gptcache_update_cache_callback)
 from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.config import AddConfig, BaseLlmConfig, ChunkerConfig
 from embedchain.config.base_app_config import BaseAppConfig
@@ -22,7 +19,8 @@ from embedchain.embedder.base import BaseEmbedder
 from embedchain.helpers.json_serializable import JSONSerializable
 from embedchain.llm.base import BaseLlm
 from embedchain.loaders.base_loader import BaseLoader
-from embedchain.models.data_type import DataType, DirectDataType, IndirectDataType, SpecialDataType
+from embedchain.models.data_type import (DataType, DirectDataType,
+                                         IndirectDataType, SpecialDataType)
 from embedchain.telemetry.posthog import AnonymousTelemetry
 from embedchain.utils.misc import detect_datatype, is_valid_json_string
 from embedchain.vectordb.base import BaseVectorDB
diff --git a/embedchain/eval/__init__.py b/embedchain/eval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/embedchain/eval/base.py b/embedchain/eval/base.py
new file mode 100644
index 00000000..d86a8be1
--- /dev/null
+++ b/embedchain/eval/base.py
@@ -0,0 +1,29 @@
+from abc import ABC, abstractmethod
+
+from embedchain.utils.eval import EvalData
+
+
+class BaseMetric(ABC):
+    """Base class for a metric.
+
+    This class provides a common interface for all metrics.
+    """
+
+    def __init__(self, name: str = "base_metric"):
+        """
+        Initialize the BaseMetric.
+        """
+        self.name = name
+
+    @abstractmethod
+    def evaluate(self, dataset: list[EvalData]):
+        """
+        Abstract method to evaluate the dataset.
+
+        This method should be implemented by subclasses to perform the actual
+        evaluation on the dataset.
+
+        :param dataset: dataset to evaluate
+        :type dataset: list[EvalData]
+        """
+        raise NotImplementedError()
diff --git a/embedchain/eval/metrics/__init__.py b/embedchain/eval/metrics/__init__.py
new file mode 100644
index 00000000..95f57900
--- /dev/null
+++ b/embedchain/eval/metrics/__init__.py
@@ -0,0 +1,3 @@
+from .answer_relevancy import AnswerRelevance  # noqa: F401
+from .context_relevancy import ContextRelevance  # noqa: F401
+from .groundedness import Groundedness  # noqa: F401
diff --git a/embedchain/eval/metrics/answer_relevancy.py b/embedchain/eval/metrics/answer_relevancy.py
new file mode 100644
index 00000000..5335449c
--- /dev/null
+++ b/embedchain/eval/metrics/answer_relevancy.py
@@ -0,0 +1,93 @@
+import concurrent.futures
+import logging
+import os
+from string import Template
+from typing import Optional
+
+import numpy as np
+from openai import OpenAI
+from tqdm import tqdm
+
+from embedchain.config.eval.base import AnswerRelevanceConfig
+from embedchain.eval.base import BaseMetric
+from embedchain.utils.eval import EvalData, EvalMetric
+
+
+class AnswerRelevance(BaseMetric):
+    """
+    Metric for evaluating the relevance of answers.
+    """
+
+    def __init__(self, config: Optional[AnswerRelevanceConfig] = AnswerRelevanceConfig()):
+        super().__init__(name=EvalMetric.ANSWER_RELEVANCY.value)
+        self.config = config
+        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError("API key not found. Set 'OPENAI_API_KEY' or pass it in the config.")
+        self.client = OpenAI(api_key=api_key)
+
+    def _generate_prompt(self, data: EvalData) -> str:
+        """
+        Generates a prompt based on the provided data.
+        """
+        return Template(self.config.prompt).substitute(
+            num_gen_questions=self.config.num_gen_questions, answer=data.answer
+        )
+
+    def _generate_questions(self, prompt: str) -> list[str]:
+        """
+        Generates questions from the prompt.
+        """
+        response = self.client.chat.completions.create(
+            model=self.config.model,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return response.choices[0].message.content.strip().split("\n")
+
+    def _generate_embedding(self, question: str) -> np.ndarray:
+        """
+        Generates the embedding for a question.
+        """
+        response = self.client.embeddings.create(
+            input=question,
+            model=self.config.embedder,
+        )
+        return np.array(response.data[0].embedding)
+
+    def _compute_similarity(self, original: np.ndarray, generated: np.ndarray) -> float:
+        """
+        Computes the cosine similarity between two embeddings.
+        """
+        original = original.reshape(1, -1)
+        norm = np.linalg.norm(original) * np.linalg.norm(generated, axis=1)
+        return np.dot(generated, original.T).flatten() / norm
+
+    def _compute_score(self, data: EvalData) -> float:
+        """
+        Computes the relevance score for a given data item.
+        """
+        prompt = self._generate_prompt(data)
+        generated_questions = self._generate_questions(prompt)
+        original_embedding = self._generate_embedding(data.question)
+        generated_embeddings = np.array([self._generate_embedding(q) for q in generated_questions])
+        similarities = self._compute_similarity(original_embedding, generated_embeddings)
+        return np.mean(similarities)
+
+    def evaluate(self, dataset: list[EvalData]) -> float:
+        """
+        Evaluates the dataset and returns the average answer relevance score.
+        """
+        results = []
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future_to_data = {executor.submit(self._compute_score, data): data for data in dataset}
+            for future in tqdm(
+                concurrent.futures.as_completed(future_to_data), total=len(dataset), desc="Evaluating Answer Relevancy"
+            ):
+                data = future_to_data[future]
+                try:
+                    results.append(future.result())
+                except Exception as e:
+                    logging.error(f"Error evaluating answer relevancy for {data}: {e}")
+
+        return np.mean(results) if results else 0.0
diff --git a/embedchain/eval/metrics/context_relevancy.py b/embedchain/eval/metrics/context_relevancy.py
new file mode 100644
index 00000000..44bfe75d
--- /dev/null
+++ b/embedchain/eval/metrics/context_relevancy.py
@@ -0,0 +1,69 @@
+import concurrent.futures
+import os
+from string import Template
+from typing import Optional
+
+import numpy as np
+import pysbd
+from openai import OpenAI
+from tqdm import tqdm
+
+from embedchain.config.eval.base import ContextRelevanceConfig
+from embedchain.eval.base import BaseMetric
+from embedchain.utils.eval import EvalData, EvalMetric
+
+
+class ContextRelevance(BaseMetric):
+    """
+    Metric for evaluating the relevance of context in a dataset (useful / total context sentences).
+    """
+
+    def __init__(self, config: Optional[ContextRelevanceConfig] = None):
+        super().__init__(name=EvalMetric.CONTEXT_RELEVANCY.value)
+        self.config = config or ContextRelevanceConfig()  # built per instance, not once at definition time
+        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError("API key not found. Set 'OPENAI_API_KEY' or pass it in the config.")
+        self.client = OpenAI(api_key=api_key)
+        self._sbd = pysbd.Segmenter(language=self.config.language, clean=False)
+
+    def _sentence_segmenter(self, text: str) -> list[str]:
+        """
+        Segments the given text into sentences using the configured pysbd segmenter.
+        """
+        return self._sbd.segment(text)
+
+    def _compute_score(self, data: EvalData) -> float:
+        """
+        Computes the context relevance score for one item: useful-context sentences / original-context sentences.
+        """
+        original_context = "\n".join(data.contexts)
+        prompt = Template(self.config.prompt).substitute(context=original_context, question=data.question)
+        response = self.client.chat.completions.create(
+            model=self.config.model, messages=[{"role": "user", "content": prompt}]
+        )
+        useful_context = response.choices[0].message.content.strip()  # model reply = useful context
+        useful_context_sentences = self._sentence_segmenter(useful_context)
+        original_context_sentences = self._sentence_segmenter(original_context)
+
+        if not original_context_sentences:  # avoid dividing by zero when there is no context
+            return 0.0
+        return len(useful_context_sentences) / len(original_context_sentences)
+
+    def evaluate(self, dataset: list[EvalData]) -> float:
+        """
+        Evaluates the dataset and returns the average context relevance score (0.0 if no item could be scored).
+        """
+        scores = []
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [executor.submit(self._compute_score, data) for data in dataset]
+            for future in tqdm(
+                concurrent.futures.as_completed(futures), total=len(dataset), desc="Evaluating Context Relevancy"
+            ):
+                try:
+                    scores.append(future.result())
+                except Exception as e:  # a failed item is reported and left out of the average
+                    print(f"Error during evaluation: {e}")
+
+        return np.mean(scores) if scores else 0.0
diff --git a/embedchain/eval/metrics/groundedness.py b/embedchain/eval/metrics/groundedness.py
new file mode 100644
index 00000000..4b5010f2
--- /dev/null
+++ b/embedchain/eval/metrics/groundedness.py
@@ -0,0 +1,102 @@
+import concurrent.futures
+import logging
+import os
+from string import Template
+from typing import Optional
+
+import numpy as np
+from openai import OpenAI
+from tqdm import tqdm
+
+from embedchain.config.eval.base import GroundednessConfig
+from embedchain.eval.base import BaseMetric
+from embedchain.utils.eval import EvalData, EvalMetric
+
+
+class Groundedness(BaseMetric):
+    """
+    Metric for groundedness (aka faithfulness) of answer from the given contexts.
+    """
+
+    def __init__(self, config: Optional[GroundednessConfig] = None):
+        super().__init__(name=EvalMetric.GROUNDEDNESS.value)
+        self.config = config or GroundednessConfig()
+        api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")  # None if unset, so the check below fires
+        if not api_key:
+            raise ValueError("Please set the OPENAI_API_KEY environment variable or pass the `api_key` in config.")
+        self.client = OpenAI(api_key=api_key)
+
+    def _generate_answer_claim_prompt(self, data: EvalData) -> str:
+        """
+        Generate the answer-claims prompt by filling the configured template with the question and answer.
+        """
+        prompt = Template(self.config.answer_claims_prompt).substitute(question=data.question, answer=data.answer)
+        return prompt
+
+    def _get_claim_statements(self, prompt: str) -> np.ndarray:
+        """
+        Get claim statements from the answer: one per non-empty line of the model's reply.
+        """
+        response = self.client.chat.completions.create(
+            model=self.config.model,
+            messages=[{"role": "user", "content": f"{prompt}"}],
+        )
+        result = response.choices[0].message.content.strip()
+        claim_statements = np.array([statement for statement in result.split("\n") if statement])
+        return claim_statements
+
+    def _generate_claim_inference_prompt(self, data: EvalData, claim_statements: list[str]) -> str:
+        """
+        Generate the claim inference prompt from the newline-joined contexts and claim statements.
+        """
+        prompt = Template(self.config.claims_inference_prompt).substitute(
+            context="\n".join(data.contexts), claim_statements="\n".join(claim_statements)
+        )
+        return prompt
+
+    def _get_claim_verdict_scores(self, prompt: str) -> np.ndarray:
+        """
+        Get verdicts for claim statements as scores: `1` -> 1, `0` -> 0, `-1` -> NaN; other verdicts raise KeyError.
+        """
+        response = self.client.chat.completions.create(
+            model=self.config.model,
+            messages=[{"role": "user", "content": f"{prompt}"}],
+        )
+        result = response.choices[0].message.content.strip()
+        claim_verdicts = result.split("\n")  # one verdict per line; blank lines and padding are tolerated below
+        verdict_score_map = {"1": 1, "0": 0, "-1": np.nan}
+        verdict_scores = np.array([verdict_score_map[v.strip()] for v in claim_verdicts if v.strip()])
+        return verdict_scores
+
+    def _compute_score(self, data: EvalData) -> float:
+        """
+        Compute the groundedness score (aka faithfulness) for a single data point.
+        """
+        answer_claims_prompt = self._generate_answer_claim_prompt(data)
+        claim_statements = self._get_claim_statements(answer_claims_prompt)
+
+        claim_inference_prompt = self._generate_claim_inference_prompt(data, claim_statements)
+        verdict_scores = self._get_claim_verdict_scores(claim_inference_prompt)
+        return np.sum(verdict_scores) / claim_statements.size  # a NaN ("-1") verdict makes the score NaN
+
+    def evaluate(self, dataset: list[EvalData]) -> float:
+        """
+        Evaluate the dataset and return the average groundedness score; failed items are logged and skipped.
+        """
+        results = []
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future_to_data = {executor.submit(self._compute_score, data): data for data in dataset}
+            for future in tqdm(
+                concurrent.futures.as_completed(future_to_data),
+                total=len(future_to_data),
+                desc="Evaluating groundedness (aka faithfulness)",
+            ):
+                data = future_to_data[future]
+                try:
+                    score = future.result()
+                    results.append(score)
+                except Exception as e:
+                    logging.error(f"Error while evaluating groundedness for data point {data}: {e}")
+
+        return np.mean(results) if results else 0.0
diff --git a/embedchain/utils/eval.py b/embedchain/utils/eval.py
new file mode 100644
index 00000000..62eaaeb7
--- /dev/null
+++ b/embedchain/utils/eval.py
@@ -0,0 +1,17 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class EvalMetric(Enum):  # string identifiers for the evaluation metrics; `.value` is used as a metric's name
+    CONTEXT_RELEVANCY = "context_relevancy"
+    ANSWER_RELEVANCY = "answer_relevancy"
+    GROUNDEDNESS = "groundedness"
+
+
+class EvalData(BaseModel):  # one evaluation sample consumed by the metrics' `_compute_score` methods
+    question: str  # question text substituted into the metric prompts
+    contexts: list[str]  # context passages; the metrics join them with "\n" when building prompts
+    answer: str  # answer under evaluation (substituted into the groundedness answer-claims prompt)
+    ground_truth: Optional[str] = None  # Not used as of now
diff --git a/embedchain/utils/misc.py b/embedchain/utils/misc.py
index f5b7f760..88732bb8 100644
--- a/embedchain/utils/misc.py
+++ b/embedchain/utils/misc.py
@@ -201,7 +201,8 @@ def detect_datatype(source: Any) -> DataType:
     formatted_source = format_source(str(source), 30)
 
     if url:
-        from langchain.document_loaders.youtube import ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
+        from langchain.document_loaders.youtube import \
+            ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
 
         if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
             logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
diff --git a/embedchain/vectordb/zilliz.py b/embedchain/vectordb/zilliz.py
index 65e541db..657eb644 100644
--- a/embedchain/vectordb/zilliz.py
+++ b/embedchain/vectordb/zilliz.py
@@ -6,15 +6,8 @@ from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.vectordb.base import BaseVectorDB
 
 try:
-    from pymilvus import (
-        Collection,
-        CollectionSchema,
-        DataType,
-        FieldSchema,
-        MilvusClient,
-        connections,
-        utility,
-    )
+    from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
+                          MilvusClient, connections, utility)
 except ImportError:
     raise ImportError(
         "Zilliz requires extra dependencies. Install with `pip install --upgrade embedchain[milvus]`"
diff --git a/poetry.lock b/poetry.lock
index 17711ee3..0399cf16 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5267,6 +5267,16 @@ files = [
     {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"},
 ]
 
+[[package]]
+name = "pysbd"
+version = "0.3.4"
+description = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
+optional = false
+python-versions = ">=3"
+files = [
+    {file = "pysbd-0.3.4-py3-none-any.whl", hash = "sha256:cd838939b7b0b185fcf86b0baf6636667dfb6e474743beeff878e9f42e022953"},
+]
+
 [[package]]
 name = "pytesseract"
 version = "0.3.10"
@@ -8120,4 +8130,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "e22ac2ddd59d53039c47f326504364ff3a8000df9a954d79fbe671c75e88599f"
+content-hash = "02bd85e14374a9dc9b59523b8fb4baea7068251976ba7f87722cac94a9974ccc"
diff --git a/pyproject.toml b/pyproject.toml
index 010c120a..3db85910 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,6 +102,7 @@ rich = "^13.7.0"
 beautifulsoup4 = "^4.12.2"
 pypdf = "^3.11.0"
 gptcache = "^0.1.43"
+pysbd = "^0.3.4"
 tiktoken = { version = "^0.4.0", optional = true }
 youtube-transcript-api = { version = "^0.6.1", optional = true }
 pytube = { version = "^15.0.0", optional = true }