[Improvement] add vector_dimension configuration in embedder config (#1192)
Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
@@ -8,7 +8,7 @@ You can configure different components of your app (`llm`, `embedding model`, or
|
|||||||
|
|
||||||
|
|
||||||
<Tip>
|
<Tip>
|
||||||
Embedchain applications are configurable using a YAML file, a JSON file, or by directly passing the config dictionary. Check out the [docs here](/api-reference/pipeline/overview#usage) on how to use other formats.
|
Embedchain applications are configurable using a YAML file, a JSON file, or by directly passing the config dictionary. Check out the [docs here](/api-reference/app/overview#usage) on how to use other formats.
|
||||||
</Tip>
|
</Tip>
|
||||||
|
|
||||||
<CodeGroup>
|
<CodeGroup>
|
||||||
@@ -214,7 +214,11 @@ Alright, let's dive into what each key means in the yaml config above:
|
|||||||
- `provider` (String): The provider for the embedder, set to 'openai'. You can find the full list of embedding model providers in [our docs](/components/embedding-models).
|
- `provider` (String): The provider for the embedder, set to 'openai'. You can find the full list of embedding model providers in [our docs](/components/embedding-models).
|
||||||
- `config`:
|
- `config`:
|
||||||
- `model` (String): The specific model used for text embedding, 'text-embedding-ada-002'.
|
- `model` (String): The specific model used for text embedding, 'text-embedding-ada-002'.
|
||||||
|
- `vector_dimension` (Integer): The vector dimension of the embedding model. If not set, the provider's default is used; see the [default values](https://github.com/embedchain/embedchain/blob/e572b5a3dc1b66f1e9b3357d11a88c63b5ce06e3/embedchain/models/vector_dimensions.py).
|
||||||
- `api_key` (String): The API key for the embedding model.
|
- `api_key` (String): The API key for the embedding model.
|
||||||
|
- `deployment_name` (String): The deployment name for the embedding model.
|
||||||
|
- `title` (String): The title for the embedding model (used by the Google embedder).
|
||||||
|
- `task_type` (String): The task type for the embedding model (used by the Google embedder).
|
||||||
5. `chunker` Section:
|
5. `chunker` Section:
|
||||||
- `chunk_size` (Integer): The size of each chunk of text that is sent to the language model.
|
- `chunk_size` (Integer): The size of each chunk of text that is sent to the language model.
|
||||||
- `chunk_overlap` (Integer): The amount of overlap between each chunk of text.
|
- `chunk_overlap` (Integer): The amount of overlap between each chunk of text.
|
||||||
|
|||||||
@@ -250,7 +250,7 @@ app = App.from_config(config_path="config.yaml")
|
|||||||
llm:
|
llm:
|
||||||
provider: azure_openai
|
provider: azure_openai
|
||||||
config:
|
config:
|
||||||
model: gpt-35-turbo
|
model: gpt-3.5-turbo
|
||||||
deployment_name: your_llm_deployment_name
|
deployment_name: your_llm_deployment_name
|
||||||
temperature: 0.5
|
temperature: 0.5
|
||||||
max_tokens: 1000
|
max_tokens: 1000
|
||||||
|
|||||||
@@ -6,7 +6,11 @@ from embedchain.helpers.json_serializable import register_deserializable
|
|||||||
@register_deserializable
|
@register_deserializable
|
||||||
class BaseEmbedderConfig:
|
class BaseEmbedderConfig:
|
||||||
def __init__(
|
def __init__(
|
||||||
self, model: Optional[str] = None, deployment_name: Optional[str] = None, api_key: Optional[str] = None
|
self,
|
||||||
|
model: Optional[str] = None,
|
||||||
|
deployment_name: Optional[str] = None,
|
||||||
|
vector_dimension: Optional[int] = None,
|
||||||
|
api_key: Optional[str] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize a new instance of an embedder config class.
|
Initialize a new instance of an embedder config class.
|
||||||
@@ -18,4 +22,5 @@ class BaseEmbedderConfig:
|
|||||||
"""
|
"""
|
||||||
self.model = model
|
self.model = model
|
||||||
self.deployment_name = deployment_name
|
self.deployment_name = deployment_name
|
||||||
|
self.vector_dimension = vector_dimension
|
||||||
self.api_key = api_key
|
self.api_key = api_key
|
||||||
|
|||||||
@@ -27,5 +27,5 @@ class GoogleAIEmbedder(BaseEmbedder):
|
|||||||
embedding_fn = GoogleAIEmbeddingFunction(config=config)
|
embedding_fn = GoogleAIEmbeddingFunction(config=config)
|
||||||
self.set_embedding_fn(embedding_fn=embedding_fn)
|
self.set_embedding_fn(embedding_fn=embedding_fn)
|
||||||
|
|
||||||
vector_dimension = VectorDimensions.GOOGLE_AI.value
|
vector_dimension = self.config.vector_dimension or VectorDimensions.GOOGLE_AI.value
|
||||||
self.set_vector_dimension(vector_dimension=vector_dimension)
|
self.set_vector_dimension(vector_dimension=vector_dimension)
|
||||||
|
|||||||
@@ -16,5 +16,5 @@ class GPT4AllEmbedder(BaseEmbedder):
|
|||||||
embedding_fn = BaseEmbedder._langchain_default_concept(embeddings)
|
embedding_fn = BaseEmbedder._langchain_default_concept(embeddings)
|
||||||
self.set_embedding_fn(embedding_fn=embedding_fn)
|
self.set_embedding_fn(embedding_fn=embedding_fn)
|
||||||
|
|
||||||
vector_dimension = VectorDimensions.GPT4ALL.value
|
vector_dimension = self.config.vector_dimension or VectorDimensions.GPT4ALL.value
|
||||||
self.set_vector_dimension(vector_dimension=vector_dimension)
|
self.set_vector_dimension(vector_dimension=vector_dimension)
|
||||||
|
|||||||
@@ -15,5 +15,5 @@ class HuggingFaceEmbedder(BaseEmbedder):
|
|||||||
embedding_fn = BaseEmbedder._langchain_default_concept(embeddings)
|
embedding_fn = BaseEmbedder._langchain_default_concept(embeddings)
|
||||||
self.set_embedding_fn(embedding_fn=embedding_fn)
|
self.set_embedding_fn(embedding_fn=embedding_fn)
|
||||||
|
|
||||||
vector_dimension = VectorDimensions.HUGGING_FACE.value
|
vector_dimension = self.config.vector_dimension or VectorDimensions.HUGGING_FACE.value
|
||||||
self.set_vector_dimension(vector_dimension=vector_dimension)
|
self.set_vector_dimension(vector_dimension=vector_dimension)
|
||||||
|
|||||||
@@ -32,4 +32,5 @@ class OpenAIEmbedder(BaseEmbedder):
|
|||||||
model_name=self.config.model,
|
model_name=self.config.model,
|
||||||
)
|
)
|
||||||
self.set_embedding_fn(embedding_fn=embedding_fn)
|
self.set_embedding_fn(embedding_fn=embedding_fn)
|
||||||
self.set_vector_dimension(vector_dimension=VectorDimensions.OPENAI.value)
|
vector_dimension = self.config.vector_dimension or VectorDimensions.OPENAI.value
|
||||||
|
self.set_vector_dimension(vector_dimension=vector_dimension)
|
||||||
|
|||||||
@@ -15,5 +15,5 @@ class VertexAIEmbedder(BaseEmbedder):
|
|||||||
embedding_fn = BaseEmbedder._langchain_default_concept(embeddings)
|
embedding_fn = BaseEmbedder._langchain_default_concept(embeddings)
|
||||||
self.set_embedding_fn(embedding_fn=embedding_fn)
|
self.set_embedding_fn(embedding_fn=embedding_fn)
|
||||||
|
|
||||||
vector_dimension = VectorDimensions.VERTEX_AI.value
|
vector_dimension = self.config.vector_dimension or VectorDimensions.VERTEX_AI.value
|
||||||
self.set_vector_dimension(vector_dimension=vector_dimension)
|
self.set_vector_dimension(vector_dimension=vector_dimension)
|
||||||
|
|||||||
@@ -438,6 +438,7 @@ def validate_config(config_data):
|
|||||||
Optional("api_key"): str,
|
Optional("api_key"): str,
|
||||||
Optional("title"): str,
|
Optional("title"): str,
|
||||||
Optional("task_type"): str,
|
Optional("task_type"): str,
|
||||||
|
Optional("vector_dimension"): int,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Optional("embedding_model"): {
|
Optional("embedding_model"): {
|
||||||
@@ -448,6 +449,7 @@ def validate_config(config_data):
|
|||||||
Optional("api_key"): str,
|
Optional("api_key"): str,
|
||||||
Optional("title"): str,
|
Optional("title"): str,
|
||||||
Optional("task_type"): str,
|
Optional("task_type"): str,
|
||||||
|
Optional("vector_dimension"): int,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Optional("chunker"): {
|
Optional("chunker"): {
|
||||||
|
|||||||
Reference in New Issue
Block a user