From 59600e2a5bb05adabdb0f5c8177f3cd3dced91f3 Mon Sep 17 00:00:00 2001 From: Deven Patel Date: Fri, 19 Jan 2024 10:31:41 +0530 Subject: [PATCH] [Improvement] add vector_dimension configuration in embedder config (#1192) Co-authored-by: Deven Patel --- docs/api-reference/advanced/configuration.mdx | 6 +++++- docs/components/llms.mdx | 2 +- embedchain/config/embedder/base.py | 7 ++++++- embedchain/embedder/google.py | 2 +- embedchain/embedder/gpt4all.py | 2 +- embedchain/embedder/huggingface.py | 2 +- embedchain/embedder/openai.py | 3 ++- embedchain/embedder/vertexai.py | 2 +- embedchain/utils/misc.py | 2 ++ 9 files changed, 20 insertions(+), 8 deletions(-) diff --git a/docs/api-reference/advanced/configuration.mdx b/docs/api-reference/advanced/configuration.mdx index f1648865..365a223e 100644 --- a/docs/api-reference/advanced/configuration.mdx +++ b/docs/api-reference/advanced/configuration.mdx @@ -8,7 +8,7 @@ You can configure different components of your app (`llm`, `embedding model`, or -Embedchain applications are configurable using YAML file, JSON file or by directly passing the config dictionary. Checkout the [docs here](/api-reference/pipeline/overview#usage) on how to use other formats. +Embedchain applications are configurable using a YAML file, a JSON file, or by directly passing the config dictionary. Check out the [docs here](/api-reference/app/overview#usage) on how to use other formats. @@ -214,7 +214,11 @@ Alright, let's dive into what each key means in the yaml config above: - `provider` (String): The provider for the embedder, set to 'openai'. You can find the full list of embedding model providers in [our docs](/components/embedding-models). - `config`: - `model` (String): The specific model used for text embedding, 'text-embedding-ada-002'. + - `vector_dimension` (Integer): The vector dimension of the embedding model. 
[Defaults](https://github.com/embedchain/embedchain/blob/e572b5a3dc1b66f1e9b3357d11a88c63b5ce06e3/embedchain/models/vector_dimensions.py) - `api_key` (String): The API key for the embedding model. + - `deployment_name` (String): The deployment name for the embedding model. + - `title` (String): The title for the embedding model for Google Embedder. + - `task_type` (String): The task type for the embedding model for Google Embedder. 5. `chunker` Section: - `chunk_size` (Integer): The size of each chunk of text that is sent to the language model. - `chunk_overlap` (Integer): The amount of overlap between each chunk of text. diff --git a/docs/components/llms.mdx b/docs/components/llms.mdx index 1a58d727..bbf05a36 100644 --- a/docs/components/llms.mdx +++ b/docs/components/llms.mdx @@ -250,7 +250,7 @@ app = App.from_config(config_path="config.yaml") llm: provider: azure_openai config: - model: gpt-35-turbo + model: gpt-3.5-turbo deployment_name: your_llm_deployment_name temperature: 0.5 max_tokens: 1000 diff --git a/embedchain/config/embedder/base.py b/embedchain/config/embedder/base.py index 78bb8b30..dc14cea2 100644 --- a/embedchain/config/embedder/base.py +++ b/embedchain/config/embedder/base.py @@ -6,7 +6,11 @@ from embedchain.helpers.json_serializable import register_deserializable @register_deserializable class BaseEmbedderConfig: def __init__( - self, model: Optional[str] = None, deployment_name: Optional[str] = None, api_key: Optional[str] = None + self, + model: Optional[str] = None, + deployment_name: Optional[str] = None, + vector_dimension: Optional[int] = None, + api_key: Optional[str] = None, ): """ Initialize a new instance of an embedder config class. 
@@ -18,4 +22,5 @@ class BaseEmbedderConfig: """ self.model = model self.deployment_name = deployment_name + self.vector_dimension = vector_dimension self.api_key = api_key diff --git a/embedchain/embedder/google.py b/embedchain/embedder/google.py index e69d3b14..4d09f6be 100644 --- a/embedchain/embedder/google.py +++ b/embedchain/embedder/google.py @@ -27,5 +27,5 @@ class GoogleAIEmbedder(BaseEmbedder): embedding_fn = GoogleAIEmbeddingFunction(config=config) self.set_embedding_fn(embedding_fn=embedding_fn) - vector_dimension = VectorDimensions.GOOGLE_AI.value + vector_dimension = self.config.vector_dimension or VectorDimensions.GOOGLE_AI.value self.set_vector_dimension(vector_dimension=vector_dimension) diff --git a/embedchain/embedder/gpt4all.py b/embedchain/embedder/gpt4all.py index d078825d..9e17131a 100644 --- a/embedchain/embedder/gpt4all.py +++ b/embedchain/embedder/gpt4all.py @@ -16,5 +16,5 @@ class GPT4AllEmbedder(BaseEmbedder): embedding_fn = BaseEmbedder._langchain_default_concept(embeddings) self.set_embedding_fn(embedding_fn=embedding_fn) - vector_dimension = VectorDimensions.GPT4ALL.value + vector_dimension = self.config.vector_dimension or VectorDimensions.GPT4ALL.value self.set_vector_dimension(vector_dimension=vector_dimension) diff --git a/embedchain/embedder/huggingface.py b/embedchain/embedder/huggingface.py index 32cae44e..83433c05 100644 --- a/embedchain/embedder/huggingface.py +++ b/embedchain/embedder/huggingface.py @@ -15,5 +15,5 @@ class HuggingFaceEmbedder(BaseEmbedder): embedding_fn = BaseEmbedder._langchain_default_concept(embeddings) self.set_embedding_fn(embedding_fn=embedding_fn) - vector_dimension = VectorDimensions.HUGGING_FACE.value + vector_dimension = self.config.vector_dimension or VectorDimensions.HUGGING_FACE.value self.set_vector_dimension(vector_dimension=vector_dimension) diff --git a/embedchain/embedder/openai.py b/embedchain/embedder/openai.py index e71f9c04..5f3114a3 100644 --- a/embedchain/embedder/openai.py +++ 
b/embedchain/embedder/openai.py @@ -32,4 +32,5 @@ class OpenAIEmbedder(BaseEmbedder): model_name=self.config.model, ) self.set_embedding_fn(embedding_fn=embedding_fn) - self.set_vector_dimension(vector_dimension=VectorDimensions.OPENAI.value) + vector_dimension = self.config.vector_dimension or VectorDimensions.OPENAI.value + self.set_vector_dimension(vector_dimension=vector_dimension) diff --git a/embedchain/embedder/vertexai.py b/embedchain/embedder/vertexai.py index fe947646..c8eed7d9 100644 --- a/embedchain/embedder/vertexai.py +++ b/embedchain/embedder/vertexai.py @@ -15,5 +15,5 @@ class VertexAIEmbedder(BaseEmbedder): embedding_fn = BaseEmbedder._langchain_default_concept(embeddings) self.set_embedding_fn(embedding_fn=embedding_fn) - vector_dimension = VectorDimensions.VERTEX_AI.value + vector_dimension = self.config.vector_dimension or VectorDimensions.VERTEX_AI.value self.set_vector_dimension(vector_dimension=vector_dimension) diff --git a/embedchain/utils/misc.py b/embedchain/utils/misc.py index 0f117b32..c08665c2 100644 --- a/embedchain/utils/misc.py +++ b/embedchain/utils/misc.py @@ -438,6 +438,7 @@ def validate_config(config_data): Optional("api_key"): str, Optional("title"): str, Optional("task_type"): str, + Optional("vector_dimension"): int, }, }, Optional("embedding_model"): { @@ -448,6 +449,7 @@ def validate_config(config_data): Optional("api_key"): str, Optional("title"): str, Optional("task_type"): str, + Optional("vector_dimension"): int, }, }, Optional("chunker"): {