From 38b4e06963d506f5d8f67c0888bc474eaf6bba6b Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Thu, 15 Feb 2024 13:20:14 -0800 Subject: [PATCH] [Feature] Add support for hybrid search for pinecone vector database (#1259) --- docs/_snippets/missing-vector-db-tip.mdx | 2 +- docs/components/data-sources/google-drive.mdx | 2 +- docs/components/data-sources/overview.mdx | 57 ++--- docs/components/vector-databases.mdx | 238 ------------------ docs/components/vector-databases/chromadb.mdx | 35 +++ .../vector-databases/elasticsearch.mdx | 39 +++ .../vector-databases/opensearch.mdx | 36 +++ docs/components/vector-databases/pinecone.mdx | 106 ++++++++ docs/components/vector-databases/qdrant.mdx | 23 ++ docs/components/vector-databases/weaviate.mdx | 24 ++ docs/components/vector-databases/zilliz.mdx | 39 +++ docs/deployment/embedchain_ai.mdx | 2 +- docs/mint.json | 16 +- embedchain/config/vectordb/pinecone.py | 7 + embedchain/embedchain.py | 40 --- embedchain/vectordb/pinecone.py | 35 ++- poetry.lock | 90 ++++++- pyproject.toml | 5 +- 18 files changed, 470 insertions(+), 326 deletions(-) create mode 100644 docs/components/vector-databases/chromadb.mdx create mode 100644 docs/components/vector-databases/elasticsearch.mdx create mode 100644 docs/components/vector-databases/opensearch.mdx create mode 100644 docs/components/vector-databases/pinecone.mdx create mode 100644 docs/components/vector-databases/qdrant.mdx create mode 100644 docs/components/vector-databases/weaviate.mdx create mode 100644 docs/components/vector-databases/zilliz.mdx diff --git a/docs/_snippets/missing-vector-db-tip.mdx b/docs/_snippets/missing-vector-db-tip.mdx index 6149911d..2edbbe4b 100644 --- a/docs/_snippets/missing-vector-db-tip.mdx +++ b/docs/_snippets/missing-vector-db-tip.mdx @@ -1,6 +1,6 @@ -

If you can't find the specific vector database, please feel free to request through one of the following channels and help us prioritize.

+

If you can't find a specific feature or run into issues, please feel free to reach out through one of the following channels.

diff --git a/docs/components/data-sources/google-drive.mdx b/docs/components/data-sources/google-drive.mdx index 3cb06c7b..5dcf4e45 100644 --- a/docs/components/data-sources/google-drive.mdx +++ b/docs/components/data-sources/google-drive.mdx @@ -25,4 +25,4 @@ app = App() url = "https://drive.google.com/drive/u/0/folders/xxx-xxx" app.add(url, data_type="google_drive") -``` \ No newline at end of file +``` diff --git a/docs/components/data-sources/overview.mdx b/docs/components/data-sources/overview.mdx index ed963aff..16114f52 100644 --- a/docs/components/data-sources/overview.mdx +++ b/docs/components/data-sources/overview.mdx @@ -5,34 +5,35 @@ title: Overview Embedchain comes with built-in support for various data sources. We handle the complexity of loading unstructured data from these data sources, allowing you to easily customize your app through a user-friendly interface. - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/docs/components/vector-databases.mdx b/docs/components/vector-databases.mdx index 8aa48049..c889e105 100644 --- a/docs/components/vector-databases.mdx +++ b/docs/components/vector-databases.mdx @@ -17,242 +17,4 @@ Utilizing a vector database alongside Embedchain is a seamless process. All you
-## ChromaDB - - - -```python main.py -from embedchain import App - -# load chroma configuration from yaml file -app = App.from_config(config_path="config1.yaml") -``` - -```yaml config1.yaml -vectordb: - provider: chroma - config: - collection_name: 'my-collection' - dir: db - allow_reset: true -``` - -```yaml config2.yaml -vectordb: - provider: chroma - config: - collection_name: 'my-collection' - host: localhost - port: 5200 - allow_reset: true -``` - - - - -## Elasticsearch - -Install related dependencies using the following command: - -```bash -pip install --upgrade 'embedchain[elasticsearch]' -``` - - -You can configure the Elasticsearch connection by providing either `es_url` or `cloud_id`. If you are using the Elasticsearch Service on Elastic Cloud, you can find the `cloud_id` on the [Elastic Cloud dashboard](https://cloud.elastic.co/deployments). - - -You can authorize the connection to Elasticsearch by providing either `basic_auth`, `api_key`, or `bearer_auth`. - - - -```python main.py -from embedchain import App - -# load elasticsearch configuration from yaml file -app = App.from_config(config_path="config.yaml") -``` - -```yaml config.yaml -vectordb: - provider: elasticsearch - config: - collection_name: 'es-index' - cloud_id: 'deployment-name:xxxx' - basic_auth: - - elastic - - - verify_certs: false -``` - - -## OpenSearch - -Install related dependencies using the following command: - -```bash -pip install --upgrade 'embedchain[opensearch]' -``` - - - -```python main.py -from embedchain import App - -# load opensearch configuration from yaml file -app = App.from_config(config_path="config.yaml") -``` - -```yaml config.yaml -vectordb: - provider: opensearch - config: - collection_name: 'my-app' - opensearch_url: 'https://localhost:9200' - http_auth: - - admin - - admin - vector_dimension: 1536 - use_ssl: false - verify_certs: false -``` - - - -## Zilliz - -Install related dependencies using the following command: - -```bash -pip install --upgrade 
'embedchain[milvus]' -``` - -Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN` which you can find it on their [cloud platform](https://cloud.zilliz.com/). - - - -```python main.py -import os -from embedchain import App - -os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com' -os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx' - -# load zilliz configuration from yaml file -app = App.from_config(config_path="config.yaml") -``` - -```yaml config.yaml -vectordb: - provider: zilliz - config: - collection_name: 'zilliz_app' - uri: https://xxxx.api.gcp-region.zillizcloud.com - token: xxx - vector_dim: 1536 - metric_type: L2 -``` - - - -## LanceDB - -_Coming soon_ - -## Pinecone - -Install pinecone related dependencies using the following command: - -```bash -pip install --upgrade 'embedchain[pinecone]' -``` - -In order to use Pinecone as vector database, set the environment variable `PINECONE_API_KEY` which you can find on [Pinecone dashboard](https://app.pinecone.io/). - - - -```python main.py -from embedchain import App - -# load pinecone configuration from yaml file -app = App.from_config(config_path="pod_config.yaml") -# or -app = App.from_config(config_path="serverless_config.yaml") -``` - -```yaml pod_config.yaml -vectordb: - provider: pinecone - config: - metric: cosine - vector_dimension: 1536 - index_name: my-pinecone-index - pod_config: - environment: gcp-starter - metadata_config: - indexed: - - "url" - - "hash" -``` - -```yaml serverless_config.yaml -vectordb: - provider: pinecone - config: - metric: cosine - vector_dimension: 1536 - index_name: my-pinecone-index - serverless_config: - cloud: aws - region: us-west-2 -``` - - - -
- -You can find more information about Pinecone configuration [here](https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index). -You can also optionally provide `index_name` as a config param in yaml file to specify the index name. If not provided, the index name will be `{collection_name}-{vector_dimension}`. - - -## Qdrant - -In order to use Qdrant as a vector database, set the environment variables `QDRANT_URL` and `QDRANT_API_KEY` which you can find on [Qdrant Dashboard](https://cloud.qdrant.io/). - - -```python main.py -from embedchain import App - -# load qdrant configuration from yaml file -app = App.from_config(config_path="config.yaml") -``` - -```yaml config.yaml -vectordb: - provider: qdrant - config: - collection_name: my_qdrant_index -``` - - -## Weaviate - -In order to use Weaviate as a vector database, set the environment variables `WEAVIATE_ENDPOINT` and `WEAVIATE_API_KEY` which you can find on [Weaviate dashboard](https://console.weaviate.cloud/dashboard). - - -```python main.py -from embedchain import App - -# load weaviate configuration from yaml file -app = App.from_config(config_path="config.yaml") -``` - -```yaml config.yaml -vectordb: - provider: weaviate - config: - collection_name: my_weaviate_index -``` - - diff --git a/docs/components/vector-databases/chromadb.mdx b/docs/components/vector-databases/chromadb.mdx new file mode 100644 index 00000000..783dfe89 --- /dev/null +++ b/docs/components/vector-databases/chromadb.mdx @@ -0,0 +1,35 @@ +--- +title: ChromaDB +--- + + + +```python main.py +from embedchain import App + +# load chroma configuration from yaml file +app = App.from_config(config_path="config1.yaml") +``` + +```yaml config1.yaml +vectordb: + provider: chroma + config: + collection_name: 'my-collection' + dir: db + allow_reset: true +``` + +```yaml config2.yaml +vectordb: + provider: chroma + config: + collection_name: 'my-collection' + host: localhost + port: 5200 + allow_reset: true +``` + + + + diff --git 
a/docs/components/vector-databases/elasticsearch.mdx b/docs/components/vector-databases/elasticsearch.mdx new file mode 100644 index 00000000..0a354e65 --- /dev/null +++ b/docs/components/vector-databases/elasticsearch.mdx @@ -0,0 +1,39 @@ +--- +title: Elasticsearch +--- + +Install related dependencies using the following command: + +```bash +pip install --upgrade 'embedchain[elasticsearch]' +``` + + +You can configure the Elasticsearch connection by providing either `es_url` or `cloud_id`. If you are using the Elasticsearch Service on Elastic Cloud, you can find the `cloud_id` on the [Elastic Cloud dashboard](https://cloud.elastic.co/deployments). + + +You can authorize the connection to Elasticsearch by providing either `basic_auth`, `api_key`, or `bearer_auth`. + + + +```python main.py +from embedchain import App + +# load elasticsearch configuration from yaml file +app = App.from_config(config_path="config.yaml") +``` + +```yaml config.yaml +vectordb: + provider: elasticsearch + config: + collection_name: 'es-index' + cloud_id: 'deployment-name:xxxx' + basic_auth: + - elastic + - + verify_certs: false +``` + + + diff --git a/docs/components/vector-databases/opensearch.mdx b/docs/components/vector-databases/opensearch.mdx new file mode 100644 index 00000000..8f686697 --- /dev/null +++ b/docs/components/vector-databases/opensearch.mdx @@ -0,0 +1,36 @@ +--- +title: OpenSearch +--- + +Install related dependencies using the following command: + +```bash +pip install --upgrade 'embedchain[opensearch]' +``` + + + +```python main.py +from embedchain import App + +# load opensearch configuration from yaml file +app = App.from_config(config_path="config.yaml") +``` + +```yaml config.yaml +vectordb: + provider: opensearch + config: + collection_name: 'my-app' + opensearch_url: 'https://localhost:9200' + http_auth: + - admin + - admin + vector_dimension: 1536 + use_ssl: false + verify_certs: false +``` + + + + diff --git a/docs/components/vector-databases/pinecone.mdx 
b/docs/components/vector-databases/pinecone.mdx new file mode 100644 index 00000000..29b37d71 --- /dev/null +++ b/docs/components/vector-databases/pinecone.mdx @@ -0,0 +1,106 @@ +--- +title: Pinecone +--- + +## Overview + +Install pinecone related dependencies using the following command: + +```bash +pip install --upgrade 'embedchain[pinecone]' +``` + +In order to use Pinecone as vector database, set the environment variable `PINECONE_API_KEY` which you can find on [Pinecone dashboard](https://app.pinecone.io/). + + + +```python main.py +from embedchain import App + +# Load pinecone configuration from yaml file +app = App.from_config(config_path="pod_config.yaml") +# Or +app = App.from_config(config_path="serverless_config.yaml") +``` + +```yaml pod_config.yaml +vectordb: + provider: pinecone + config: + metric: cosine + vector_dimension: 1536 + index_name: my-pinecone-index + pod_config: + environment: gcp-starter + metadata_config: + indexed: + - "url" + - "hash" +``` + +```yaml serverless_config.yaml +vectordb: + provider: pinecone + config: + metric: cosine + vector_dimension: 1536 + index_name: my-pinecone-index + serverless_config: + cloud: aws + region: us-west-2 +``` + + + +
+ +You can find more information about Pinecone configuration [here](https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index). +You can also optionally provide `index_name` as a config param in yaml file to specify the index name. If not provided, the index name will be `{collection_name}-{vector_dimension}`. + + +## Usage + +### Hybrid search + +Here is an example of how you can do hybrid search using Pinecone as a vector database through Embedchain. + +```python +import os + +from embedchain import App + +config = { + 'app': { + "config": { + "id": "ec-docs-hybrid-search" + } + }, + 'vectordb': { + 'provider': 'pinecone', + 'config': { + 'metric': 'dotproduct', + 'vector_dimension': 1536, + 'index_name': 'my-index', + 'serverless_config': { + 'cloud': 'aws', + 'region': 'us-west-2' + }, + 'hybrid_search': True, # Remember to set this for hybrid search + } + } +} + +# Initialize app +app = App.from_config(config=config) + +# Add documents +app.add("/path/to/file.pdf", data_type="pdf_file", namespace="my-namespace") + +# Query +app.query("", namespace="my-namespace") +``` + +Under the hood, Embedchain fetches the relevant chunks from the documents you added by doing hybrid search on the pinecone index. +If you have questions on how pinecone hybrid search works, please refer to their [official documentation here](https://docs.pinecone.io/docs/hybrid-search). + + diff --git a/docs/components/vector-databases/qdrant.mdx b/docs/components/vector-databases/qdrant.mdx new file mode 100644 index 00000000..cadb42e9 --- /dev/null +++ b/docs/components/vector-databases/qdrant.mdx @@ -0,0 +1,23 @@ +--- +title: Qdrant +--- + +In order to use Qdrant as a vector database, set the environment variables `QDRANT_URL` and `QDRANT_API_KEY` which you can find on [Qdrant Dashboard](https://cloud.qdrant.io/). 
+ + +```python main.py +from embedchain import App + +# load qdrant configuration from yaml file +app = App.from_config(config_path="config.yaml") +``` + +```yaml config.yaml +vectordb: + provider: qdrant + config: + collection_name: my_qdrant_index +``` + + + diff --git a/docs/components/vector-databases/weaviate.mdx b/docs/components/vector-databases/weaviate.mdx new file mode 100644 index 00000000..e5b1d5ed --- /dev/null +++ b/docs/components/vector-databases/weaviate.mdx @@ -0,0 +1,24 @@ +--- +title: Weaviate +--- + + +In order to use Weaviate as a vector database, set the environment variables `WEAVIATE_ENDPOINT` and `WEAVIATE_API_KEY` which you can find on [Weaviate dashboard](https://console.weaviate.cloud/dashboard). + + +```python main.py +from embedchain import App + +# load weaviate configuration from yaml file +app = App.from_config(config_path="config.yaml") +``` + +```yaml config.yaml +vectordb: + provider: weaviate + config: + collection_name: my_weaviate_index +``` + + + diff --git a/docs/components/vector-databases/zilliz.mdx b/docs/components/vector-databases/zilliz.mdx new file mode 100644 index 00000000..55c0dbaa --- /dev/null +++ b/docs/components/vector-databases/zilliz.mdx @@ -0,0 +1,39 @@ +--- +title: Zilliz +--- + +Install related dependencies using the following command: + +```bash +pip install --upgrade 'embedchain[milvus]' +``` + +Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN` which you can find it on their [cloud platform](https://cloud.zilliz.com/). 
+ + + +```python main.py +import os +from embedchain import App + +os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com' +os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx' + +# load zilliz configuration from yaml file +app = App.from_config(config_path="config.yaml") +``` + +```yaml config.yaml +vectordb: + provider: zilliz + config: + collection_name: 'zilliz_app' + uri: https://xxxx.api.gcp-region.zillizcloud.com + token: xxx + vector_dim: 1536 + metric_type: L2 +``` + + + + diff --git a/docs/deployment/embedchain_ai.mdx b/docs/deployment/embedchain_ai.mdx index 45f49054..d4d29dc2 100644 --- a/docs/deployment/embedchain_ai.mdx +++ b/docs/deployment/embedchain_ai.mdx @@ -5,7 +5,7 @@ description: 'Deploy your RAG application to embedchain.ai platform' ## Deploy on Embedchain Platform -Embedchain enables developers to deploy their LLM-powered apps in production using the [Embedchain platform](https://app.embedchain.ai). The platform offers free access to context on your data through its REST API. Once the pipeline is deployed, you can update your data sources anytime after deployment. +Embedchain enables developers to deploy their LLM-powered apps in production using the Embedchain platform. The platform offers free access to context on your data through its REST API. Once the pipeline is deployed, you can update your data sources anytime after deployment. Deployment to Embedchain Platform is currently available on an invitation-only basis. To request access, please submit your information via the provided [Google Form](https://forms.gle/vigN11h7b4Ywat668). We will review your request and respond promptly. 
diff --git a/docs/mint.json b/docs/mint.json index 36d104ac..a6dbe6c1 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -88,9 +88,8 @@ "pages": [ "components/introduction", { - "group": "Data sources", + "group": "πŸ—‚οΈ Data sources", "pages": [ - "components/data-sources/overview", { "group": "Data types", @@ -129,8 +128,19 @@ "components/data-sources/data-type-handling" ] }, + { + "group": "πŸ—„οΈ Vector databases", + "pages": [ + "components/vector-databases/chromadb", + "components/vector-databases/elasticsearch", + "components/vector-databases/pinecone", + "components/vector-databases/opensearch", + "components/vector-databases/qdrant", + "components/vector-databases/weaviate", + "components/vector-databases/zilliz" + ] + }, "components/llms", - "components/vector-databases", "components/embedding-models", "components/evaluation" ] diff --git a/embedchain/config/vectordb/pinecone.py b/embedchain/config/vectordb/pinecone.py index dbf0f6d1..f82da24d 100644 --- a/embedchain/config/vectordb/pinecone.py +++ b/embedchain/config/vectordb/pinecone.py @@ -15,6 +15,7 @@ class PineconeDBConfig(BaseVectorDbConfig): metric: Optional[str] = "cosine", pod_config: Optional[dict[str, any]] = None, serverless_config: Optional[dict[str, any]] = None, + hybrid_search: bool = False, **extra_params: dict[str, any], ): self.metric = metric @@ -22,6 +23,7 @@ class PineconeDBConfig(BaseVectorDbConfig): self.index_name = index_name self.vector_dimension = vector_dimension self.extra_params = extra_params + self.hybrid_search = hybrid_search if pod_config is None and serverless_config is None: # If no config is provided, use the default pod spec config pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter") @@ -33,4 +35,9 @@ class PineconeDBConfig(BaseVectorDbConfig): if self.pod_config and self.serverless_config: raise ValueError("Only one of pod_config or serverless_config can be provided.") + if self.hybrid_search and self.metric != "dotproduct": + raise ValueError( + 
"Hybrid search is only supported with dotproduct metric in Pinecone. See full docs here: https://docs.pinecone.io/docs/hybrid-search#limitations" + ) # noqa:E501 + super().__init__(collection_name=self.index_name, dir=None) diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 6623e2c0..67a97dd3 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -237,46 +237,6 @@ class EmbedChain(JSONSerializable): return source_hash - def add_local( - self, - source: Any, - data_type: Optional[DataType] = None, - metadata: Optional[dict[str, Any]] = None, - config: Optional[AddConfig] = None, - **kwargs: Optional[dict[str, Any]], - ): - """ - Adds the data from the given URL to the vector db. - Loads the data, chunks it, create embedding for each chunk - and then stores the embedding to vector database. - - Warning: - This method is deprecated and will be removed in future versions. Use `add` instead. - - :param source: The data to embed, can be a URL, local file or raw content, depending on the data type. - :type source: Any - :param data_type: Automatically detected, but can be forced with this argument. The type of the data to add, - defaults to None - :type data_type: Optional[DataType], optional - :param metadata: Metadata associated with the data source., defaults to None - :type metadata: Optional[dict[str, Any]], optional - :param config: The `AddConfig` instance to use as configuration options., defaults to None - :type config: Optional[AddConfig], optional - :raises ValueError: Invalid data type - :return: source_hash, a md5-hash of the source, in hexadecimal representation. - :rtype: str - """ - logging.warning( - "The `add_local` method is deprecated and will be removed in future versions. Please use the `add` method for both local and remote files." 
# noqa: E501 - ) - return self.add( - source=source, - data_type=data_type, - metadata=metadata, - config=config, - **kwargs, - ) - def _get_existing_doc_id(self, chunker: BaseChunker, src: Any): """ Get id of existing document for a given source, based on the data type diff --git a/embedchain/vectordb/pinecone.py b/embedchain/vectordb/pinecone.py index 200c1389..f3c05abe 100644 --- a/embedchain/vectordb/pinecone.py +++ b/embedchain/vectordb/pinecone.py @@ -1,3 +1,4 @@ +import logging import os from typing import Optional, Union @@ -8,6 +9,8 @@ except ImportError: "Pinecone requires extra dependencies. Install with `pip install --upgrade 'embedchain[pinecone]'`" ) from None +from pinecone_text.sparse import BM25Encoder + from embedchain.config.vectordb.pinecone import PineconeDBConfig from embedchain.helpers.json_serializable import register_deserializable from embedchain.utils.misc import chunks @@ -42,6 +45,14 @@ class PineconeDB(BaseVectorDB): ) self.config = config self._setup_pinecone_index() + + # Setup BM25Encoder if sparse vectors are to be used + self.bm25_encoder = None + if self.config.hybrid_search: + # TODO: Add support for fitting BM25Encoder on any corpus + logging.info("Initializing BM25Encoder for sparse vectors..") + self.bm25_encoder = BM25Encoder.default() + # Call parent init here because embedder is needed super().__init__(config=self.config) @@ -119,12 +130,17 @@ class PineconeDB(BaseVectorDB): docs = [] embeddings = self.embedder.embedding_fn(documents) for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings): + # Insert sparse vectors as well if the user wants to do the hybrid search + sparse_vector_dict = ( + {"sparse_values": self.bm25_encoder.encode_documents(text)} if self.bm25_encoder else {} + ) docs.append( { "id": id, "values": embedding, "metadata": {**metadata, "text": text}, - } + **sparse_vector_dict, + }, ) for chunk in chunks(docs, self.BATCH_SIZE, desc="Adding chunks in batches"): @@ -159,14 +175,19 @@ 
class PineconeDB(BaseVectorDB): query_filter["app_id"] = {"$eq": app_id} query_vector = self.embedder.embedding_fn([input_query])[0] - data = self.pinecone_index.query( - vector=query_vector, - filter=query_filter, - top_k=n_results, - include_metadata=True, + params = { + "vector": query_vector, + "filter": query_filter, + "top_k": n_results, + "include_metadata": True, **kwargs, - ) + } + if self.bm25_encoder: + sparse_query_vector = self.bm25_encoder.encode_queries(input_query) + params["sparse_vector"] = sparse_query_vector + + data = self.pinecone_index.query(**params) return [ (metadata.get("text"), {**metadata, "score": doc.get("score")}) if citations else metadata.get("text") for doc in data.get("matches", []) diff --git a/poetry.lock b/poetry.lock index 47da8b24..a65f466c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3561,6 +3561,50 @@ httpx = ">=0.25.2,<0.26.0" orjson = ">=3.9.10,<4.0.0" pydantic = ">=2.5.2,<3.0.0" +[[package]] +name = "mmh3" +version = "3.1.0" +description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions." 
+optional = true +python-versions = "*" +files = [ + {file = "mmh3-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ee043b1bac040b4324b8baee39df9fdca480a560a6d74f2eef66a5009a234e"}, + {file = "mmh3-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04ac865319e5b36148a4b6cdf27f8bda091c47c4ab7b355d7f353dfc2b8a3cce"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e751f5433417a21c2060b0efa1afc67cfbe29977c867336148c8edb086fae70"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdb863b89c1b34e3681d4a3b15d424734940eb8036f3457cb35ef34fb87a503c"}, + {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1230930fbf2faec4ddf5b76d0768ae73c102de173c301962bdd468177275adf9"}, + {file = "mmh3-3.1.0-cp310-cp310-win32.whl", hash = "sha256:b8ed7a2361718795a1b519a08d05f44947a20b27e202b53946561a00dde669c1"}, + {file = "mmh3-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:29e878e7467a000f34ab68c218ad7ad81312c0a94bc10df3c50a48bcad39dd83"}, + {file = "mmh3-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c271472325b70d64a4fbb1f2e964ca5b093ac10258e1390f8408890b065868fe"}, + {file = "mmh3-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0109320f7e0e262123ff4f1acd06acfbc8b3bf19cc13d98c0bc369264430aaeb"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:524e29dfe66499695f9496edcfc96782d130aabd6ba12c50c72372163cc6f3ea"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66bdb06a03074e65e614da1aa199b1d16c90608bec9d8fc3faa81d887ffe93cc"}, + {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:2a4d471eb75df8320061ab3b8cbe11c970be9f116b01bc2222ebda9c0a777520"}, + {file = "mmh3-3.1.0-cp311-cp311-win32.whl", hash = "sha256:a886d9ce995a4bdfd7a600ddf61b9015cccbc73c50b898f8ff3c78af24384710"}, + {file = "mmh3-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:5edb5ac882c04aff8a2a18ae8b74a0c339ac9b83db9820d8456f518bb558e0d8"}, + {file = "mmh3-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:190fd10981fbd6c67e10ce3b56bcc021562c0df0fee2e2864347d64e65b1783a"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd781b115cf649811cfde76368c33d2e553b6f88bb41131c314f30d8e65e9d24"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f48bb0a867077acc1f548591ad49506389f36d18f36dccd10becf071e5cbdda4"}, + {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d0936a82438e340636a11b9a938378870fc1c7a139632dac09a9a9277351704"}, + {file = "mmh3-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:d196cc035c2238493248522ae4e54c3cb790549b1564f6dea4d88dfe4b326313"}, + {file = "mmh3-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:731d37f089b6c212fab1beea24e673161146eb6c76baf9ac074a3424d1172d41"}, + {file = "mmh3-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9977fb81f8c66f4eee8439734a18dba7826fe78723d15ab53f42db977005be0f"}, + {file = "mmh3-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bf4f3f20a8b8405c08b13bc9e4ac33bf55129b50b535cd07ce1891b7f96326ac"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87cdbc6e70099ad92f17a28b4054ffb1938657e8fb7c1e4e03b194a1b4683fd6"}, + {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6dd81321d14f62aa3711f30533c85a74dc7596e0fee63c8eddd375bc92ab846c"}, + {file = 
"mmh3-3.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e6eba88e5c1a2778f3de00a9502e3c214ebb757337ece2a7d71e060d188ddfa"}, + {file = "mmh3-3.1.0-cp38-cp38-win32.whl", hash = "sha256:d91e696925f208d28f3bb7bdf29815524ce955248276af256519bd3538c411ce"}, + {file = "mmh3-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:cbc2917df568aeb86ec5aa863bfb20fa14e01039cbdce7650efbabc30960df49"}, + {file = "mmh3-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b22832d565128be83d69f5d49243bb567840a954df377c9f5b26646a6eec39b"}, + {file = "mmh3-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ced92a0e285a9111413541c197b0c17d280cee96f7c564b258caf5de5ab8ee01"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f906833753b4ddcb690c2c1b74e77725868bc3a8b762b7a77737d08be89ae41d"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72b5685832a7a87a55ebff481794bc410484d7bd4c5e80dae4d8ac50739138ef"}, + {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d2aa4d422c7c088bbc5d367b45431268ebe6742a0a64eade93fab708e25757c"}, + {file = "mmh3-3.1.0-cp39-cp39-win32.whl", hash = "sha256:4459bec818f534dc8378568ad89ab310ff47cda3e00ab322edce48dd899bba32"}, + {file = "mmh3-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:03e04b3480e71828f48d17653451a3286555f0534942cb6ba93065b10ad5f9dc"}, + {file = "mmh3-3.1.0.tar.gz", hash = "sha256:9b0f2b2ab4a915333c9d1089572e290a021ebb5b900bb7f7114dccc03995d732"}, +] + [[package]] name = "mock" version = "5.1.0" @@ -4696,6 +4740,32 @@ urllib3 = ">=1.26.0" [package.extras] grpc = ["googleapis-common-protos (>=1.53.0)", "grpc-gateway-protoc-gen-openapiv2 (==0.1.0)", "grpcio (>=1.44.0)", "lz4 (>=3.1.3)", "protobuf (>=3.20.0,<3.21.0)"] +[[package]] +name = "pinecone-text" +version = "0.8.0" 
+description = "Text utilities library by Pinecone.io" +optional = true +python-versions = ">=3.8,<4.0" +files = [ + {file = "pinecone_text-0.8.0-py3-none-any.whl", hash = "sha256:cf099c903f6bc630a2b9858bab63e291ebb361ca545b5968cd71eb0dcfbee221"}, + {file = "pinecone_text-0.8.0.tar.gz", hash = "sha256:9c386d43da7a0959452296217c3d77a6f431ff6602a06f4d413137f4ba3d82ee"}, +] + +[package.dependencies] +mmh3 = ">=3.1.0,<4.0.0" +nltk = ">=3.6.5,<4.0.0" +numpy = {version = ">=1.21.5,<2.0", markers = "python_version < \"3.12\""} +python-dotenv = ">=1.0.1,<2.0.0" +requests = ">=2.25.0,<3.0.0" +types-requests = ">=2.25.0,<3.0.0" +wget = ">=3.2,<4.0" + +[package.extras] +cohere = ["cohere (>=4.37,<5.0)"] +dense = ["openai (>=1.2.3,<2.0.0)", "sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"] +openai = ["openai (>=1.2.3,<2.0.0)"] +splade = ["sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"] + [[package]] name = "platformdirs" version = "3.11.0" @@ -5575,13 +5645,13 @@ typing-extensions = "*" [[package]] name = "python-dotenv" -version = "1.0.0" +version = "1.0.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" files = [ - {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, - {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, + {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, + {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, ] [package.extras] @@ -7985,6 +8055,16 @@ MarkupSafe = ">=2.1.1" [package.extras] watchdog = ["watchdog (>=2.3)"] +[[package]] +name = "wget" +version = "3.2" +description = "pure python download utility" +optional 
= true +python-versions = "*" +files = [ + {file = "wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061"}, +] + [[package]] name = "wheel" version = "0.41.2" @@ -8266,7 +8346,7 @@ modal = ["modal"] mysql = ["mysql-connector-python"] opensearch = ["opensearch-py"] opensource = ["gpt4all", "sentence-transformers", "torch"] -pinecone = ["pinecone-client"] +pinecone = ["pinecone-client", "pinecone-text"] poe = ["fastapi-poe"] postgres = ["psycopg", "psycopg-binary", "psycopg-pool"] qdrant = ["qdrant-client"] @@ -8282,4 +8362,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.12" -content-hash = "1dbb690590123f505675544aa0e1b3668f0d3819f4832f3f3464ff16b69e39e9" +content-hash = "f613dc1a3e9b724c95b407d4d8b9e67518e718142c77ad4723b7cb1e43eec9db" diff --git a/pyproject.toml b/pyproject.toml index 0ccc8c69..c7f41bcf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.79" +version = "0.1.80" description = "Simplest open source retrieval(RAG) framework" authors = [ "Taranjeet Singh ", @@ -124,6 +124,7 @@ together = { version = "^0.2.8", optional = true } weaviate-client = { version = "^3.24.1", optional = true } docx2txt = { version = "^0.8", optional = true } pinecone-client = { version = "^3.0.0", optional = true } +pinecone-text = { version = "^0.8.0", optional = true } qdrant-client = { version = "1.6.3", optional = true } unstructured = {extras = ["local-inference", "all-docs"], version = "^0.10.18", optional = true} huggingface_hub = { version = "^0.17.3", optional = true } @@ -178,7 +179,7 @@ discord = ["discord"] slack = ["slack-sdk", "flask"] whatsapp = ["twilio", "flask"] weaviate = ["weaviate-client"] -pinecone = ["pinecone-client"] +pinecone = ["pinecone-client", "pinecone-text"] qdrant = ["qdrant-client"] huggingface_hub=["huggingface_hub"] cohere = ["cohere"]