From 19d80914df1543c77345f92fd170bfcc234d64c8 Mon Sep 17 00:00:00 2001 From: Deven Patel Date: Fri, 29 Dec 2023 14:48:41 +0530 Subject: [PATCH] [Improvement] return all the metadata when citations flag is `True` (#1059) Co-authored-by: Deven Patel --- docs/api-reference/pipeline/chat.mdx | 9 +++------ docs/api-reference/pipeline/query.mdx | 9 +++------ docs/api-reference/pipeline/search.mdx | 12 ++++++++---- docs/use-cases/semantic-search.mdx | 18 ++++++++++++------ embedchain/pipeline.py | 8 +------- embedchain/vectordb/chroma.py | 4 +--- embedchain/vectordb/elasticsearch.py | 6 ++---- embedchain/vectordb/opensearch.py | 4 +--- embedchain/vectordb/pinecone.py | 4 +--- embedchain/vectordb/qdrant.py | 4 +--- embedchain/vectordb/weaviate.py | 4 +--- embedchain/vectordb/zilliz.py | 4 +--- tests/vectordb/test_chroma_db.py | 5 ++++- tests/vectordb/test_elasticsearch_db.py | 4 ++-- tests/vectordb/test_zilliz_db.py | 8 ++++++-- 15 files changed, 47 insertions(+), 56 deletions(-) diff --git a/docs/api-reference/pipeline/chat.mdx b/docs/api-reference/pipeline/chat.mdx index 5a606529..991585d3 100644 --- a/docs/api-reference/pipeline/chat.mdx +++ b/docs/api-reference/pipeline/chat.mdx @@ -53,18 +53,15 @@ print(sources) # [ # ( # 'Elon Musk PROFILEElon MuskCEO, Tesla$247.1B$2.3B (0.96%)Real Time Net Worthas of 12/7/23 ...', -# 'https://www.forbes.com/profile/elon-musk', -# '4651b266--4aa78839fe97' +# {'url': 'https://www.forbes.com/profile/elon-musk', ...} # ), # ( # '74% of the company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH BY YEARForbes ...', -# 'https://www.forbes.com/profile/elon-musk', -# '4651b266--4aa78839fe97' +# {'url': 'https://www.forbes.com/profile/elon-musk', ...} # ), # ( # 'founded in 2002, is worth nearly $150 billion after a $750 million tender offer in June 2023 ...', -# 'https://www.forbes.com/profile/elon-musk', -# '4651b266--4aa78839fe97' +# {'url': 'https://www.forbes.com/profile/elon-musk', ...} # ) # ] ``` diff --git 
a/docs/api-reference/pipeline/query.mdx b/docs/api-reference/pipeline/query.mdx index 5034c81d..36965d61 100644 --- a/docs/api-reference/pipeline/query.mdx +++ b/docs/api-reference/pipeline/query.mdx @@ -53,18 +53,15 @@ print(sources) # [ # ( # 'Elon Musk PROFILEElon MuskCEO, Tesla$247.1B$2.3B (0.96%)Real Time Net Worthas of 12/7/23 ...', -# 'https://www.forbes.com/profile/elon-musk', -# '4651b266--4aa78839fe97' +# {'url': 'https://www.forbes.com/profile/elon-musk', ...} # ), # ( # '74% of the company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH BY YEARForbes ...', -# 'https://www.forbes.com/profile/elon-musk', -# '4651b266--4aa78839fe97' +# {'url': 'https://www.forbes.com/profile/elon-musk', ...} # ), # ( # 'founded in 2002, is worth nearly $150 billion after a $750 million tender offer in June 2023 ...', -# 'https://www.forbes.com/profile/elon-musk', -# '4651b266--4aa78839fe97' +# {'url': 'https://www.forbes.com/profile/elon-musk', ...} # ) # ] ``` diff --git a/docs/api-reference/pipeline/search.mdx b/docs/api-reference/pipeline/search.mdx index a2e618cd..6da5ba4e 100644 --- a/docs/api-reference/pipeline/search.mdx +++ b/docs/api-reference/pipeline/search.mdx @@ -39,13 +39,17 @@ print(context) # [ # { # 'context': 'Elon Musk PROFILEElon MuskCEO, Tesla$221.9BReal Time Net Worthas of 10/29/23Reflects change since 5 pm ET of prior trading day. 
1 in the world todayPhoto by Martin Schoeller for ForbesAbout Elon MuskElon Musk cofounded six companies, including electric car maker Tesla, rocket producer SpaceX and tunneling startup Boring Company.He owns about 21% of Tesla between stock and options, but has pledged more than half his shares as collateral for personal loans of up to $3.5 billion.SpaceX, founded in', -# 'source': 'https://www.forbes.com/profile/elon-musk', -# 'document_id': 'some_document_id' +# 'metadata': { +# 'url': 'https://www.forbes.com/profile/elon-musk', +# 'doc_id': 'some_document_id' +# } # }, # { # 'context': 'company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH BY YEARForbes Lists 1Forbes 400 (2023)The Richest Person In Every State (2023) 2Billionaires (2023) 1Innovative Leaders (2019) 25Powerful People (2018) 12Richest In Tech (2017)Global Game Changers (2016)More ListsPersonal StatsAge52Source of WealthTesla, SpaceX, Self MadeSelf-Made Score8Philanthropy Score1ResidenceAustin, TexasCitizenshipUnited StatesMarital StatusSingleChildren11EducationBachelor of Arts/Science, University', -# 'source': 'https://www.forbes.com/profile/elon-musk', -# 'document_id': 'some_document_id' +# 'metadata': { +# 'url': 'https://www.forbes.com/profile/elon-musk', +# 'doc_id': 'some_document_id' +# } # } # ] ``` diff --git a/docs/use-cases/semantic-search.mdx b/docs/use-cases/semantic-search.mdx index 3a86fc3e..5f3a4582 100644 --- a/docs/use-cases/semantic-search.mdx +++ b/docs/use-cases/semantic-search.mdx @@ -48,18 +48,24 @@ app.search("Summarize the features of Next.js 14?") [ { 'context': 'Next.js 14 | Next.jsBack to BlogThursday, October 26th 2023Next.js 14Posted byLee Robinson@leeerobTim Neutkens@timneutkensAs we announced at Next.js Conf, Next.js 14 is our most focused release with: Turbopack: 5,000 tests passing for App & Pages Router 53% faster local server startup 94% faster code updates with Fast Refresh Server Actions (Stable): Progressively enhanced
mutations Integrated with caching & revalidating Simple function calls, or works natively with forms Partial Prerendering', - 'source': 'https://nextjs.org/blog/next-14', - 'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5' + 'metadata': { + 'source': 'https://nextjs.org/blog/next-14', + 'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5' + } }, { 'context': 'Next.js 13.3 | Next.jsBack to BlogThursday, April 6th 2023Next.js 13.3Posted byDelba de Oliveira@delba_oliveiraTim Neutkens@timneutkensNext.js 13.3 adds popular community-requested features, including: File-Based Metadata API: Dynamically generate sitemaps, robots, favicons, and more. Dynamic Open Graph Images: Generate OG images using JSX, HTML, and CSS. Static Export for App Router: Static / Single-Page Application (SPA) support for Server Components. Parallel Routes and Interception: Advanced', - 'source': 'https://nextjs.org/blog/next-13-3', - 'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5' + 'metadata': { + 'source': 'https://nextjs.org/blog/next-13-3', + 'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5' + } }, { 'context': 'Upgrading: Version 14 | Next.js MenuUsing App RouterFeatures available in /appApp Router.UpgradingVersion 14Version 14 Upgrading from 13 to 14 To update to Next.js version 14, run the following command using your preferred package manager: Terminalnpm i next@latest react@latest react-dom@latest eslint-config-next@latest Terminalyarn add next@latest react@latest react-dom@latest eslint-config-next@latest Terminalpnpm up next react react-dom eslint-config-next -latest Terminalbun add next@latest', - 'source': 'https://nextjs.org/docs/app/building-your-application/upgrading/version-14', - 'document_id': 
'6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5' + 'metadata': { + 'source': 'https://nextjs.org/docs/app/building-your-application/upgrading/version-14', + 'document_id': '6c8d1a7b-ea34-4927-8823-daa29dcfc5af--b83edb69b8fc7e442ff8ca311b48510e6c80bf00caa806b3a6acb34e1bcdd5d5' + } } ] ``` diff --git a/embedchain/pipeline.py b/embedchain/pipeline.py index f54e7c7a..d7809f4f 100644 --- a/embedchain/pipeline.py +++ b/embedchain/pipeline.py @@ -237,13 +237,7 @@ class Pipeline(EmbedChain): ) result = [] for c in context: - result.append( - { - "context": c[0], - "source": c[1], - "document_id": c[2], - } - ) + result.append({"context": c[0], "metadata": c[1]}) return result else: # Make API call to the backend to get the results diff --git a/embedchain/vectordb/chroma.py b/embedchain/vectordb/chroma.py index 86c5ff38..7b6fd8fe 100644 --- a/embedchain/vectordb/chroma.py +++ b/embedchain/vectordb/chroma.py @@ -250,9 +250,7 @@ class ChromaDB(BaseVectorDB): context = result[0].page_content if citations: metadata = result[0].metadata - source = metadata["url"] - doc_id = metadata["doc_id"] - contexts.append((context, source, doc_id)) + contexts.append((context, metadata)) else: contexts.append(context) return contexts diff --git a/embedchain/vectordb/elasticsearch.py b/embedchain/vectordb/elasticsearch.py index 866d3eb1..e7d4b0a1 100644 --- a/embedchain/vectordb/elasticsearch.py +++ b/embedchain/vectordb/elasticsearch.py @@ -202,7 +202,7 @@ class ElasticsearchDB(BaseVectorDB): if "app_id" in where: app_id = where["app_id"] query["script_score"]["query"] = {"match": {"metadata.app_id": app_id}} - _source = ["text", "metadata.url", "metadata.doc_id"] + _source = ["text", "metadata"] response = self.client.search(index=self._get_index(), query=query, _source=_source, size=n_results) docs = response["hits"]["hits"] contexts = [] @@ -210,9 +210,7 @@ class ElasticsearchDB(BaseVectorDB): context = doc["_source"]["text"] if 
citations: metadata = doc["_source"]["metadata"] - source = metadata["url"] - doc_id = metadata["doc_id"] - contexts.append(tuple((context, source, doc_id))) + contexts.append(tuple((context, metadata))) else: contexts.append(context) return contexts diff --git a/embedchain/vectordb/opensearch.py b/embedchain/vectordb/opensearch.py index da51b600..4cc2f8a2 100644 --- a/embedchain/vectordb/opensearch.py +++ b/embedchain/vectordb/opensearch.py @@ -218,9 +218,7 @@ class OpenSearchDB(BaseVectorDB): for doc in docs: context = doc.page_content if citations: - source = doc.metadata["url"] - doc_id = doc.metadata["doc_id"] - contexts.append(tuple((context, source, doc_id))) + contexts.append(tuple((context, doc.metadata))) else: contexts.append(context) return contexts diff --git a/embedchain/vectordb/pinecone.py b/embedchain/vectordb/pinecone.py index a794ccb8..a15d03fe 100644 --- a/embedchain/vectordb/pinecone.py +++ b/embedchain/vectordb/pinecone.py @@ -154,9 +154,7 @@ class PineconeDB(BaseVectorDB): metadata = doc["metadata"] context = metadata["text"] if citations: - source = metadata["url"] - doc_id = metadata["doc_id"] - contexts.append(tuple((context, source, doc_id))) + contexts.append(tuple((context, metadata))) else: contexts.append(context) return contexts diff --git a/embedchain/vectordb/qdrant.py b/embedchain/vectordb/qdrant.py index 6656b5d7..1ca111df 100644 --- a/embedchain/vectordb/qdrant.py +++ b/embedchain/vectordb/qdrant.py @@ -219,9 +219,7 @@ class QdrantDB(BaseVectorDB): context = result.payload["text"] if citations: metadata = result.payload["metadata"] - source = metadata["url"] - doc_id = metadata["doc_id"] - contexts.append(tuple((context, source, doc_id))) + contexts.append(tuple((context, metadata))) else: contexts.append(context) return contexts diff --git a/embedchain/vectordb/weaviate.py b/embedchain/vectordb/weaviate.py index ac3d9b57..37cd419a 100644 --- a/embedchain/vectordb/weaviate.py +++ b/embedchain/vectordb/weaviate.py @@ -271,9 
+271,7 @@ class WeaviateDB(BaseVectorDB): context = doc["text"] if citations: metadata = doc["metadata"][0] - source = metadata["url"] - doc_id = metadata["doc_id"] - contexts.append((context, source, doc_id)) + contexts.append((context, metadata)) else: contexts.append(context) return contexts diff --git a/embedchain/vectordb/zilliz.py b/embedchain/vectordb/zilliz.py index e4806817..00d7a23e 100644 --- a/embedchain/vectordb/zilliz.py +++ b/embedchain/vectordb/zilliz.py @@ -187,9 +187,7 @@ class ZillizVectorDB(BaseVectorDB): data = query[0]["entity"] context = data["text"] if citations: - source = data["url"] - doc_id = data["doc_id"] - contexts.append(tuple((context, source, doc_id))) + contexts.append(tuple((context, data))) else: contexts.append(context) return contexts diff --git a/tests/vectordb/test_chroma_db.py b/tests/vectordb/test_chroma_db.py index 051de98c..d2a57689 100644 --- a/tests/vectordb/test_chroma_db.py +++ b/tests/vectordb/test_chroma_db.py @@ -341,7 +341,10 @@ def test_chroma_db_collection_query(app_with_settings): data_with_citations = app_with_settings.db.query( input_query=[0, 0, 0], where={}, n_results=2, skip_embedding=True, citations=True ) - expected_value_with_citations = [("document", "url_1", "doc_id_1"), ("document2", "url_2", "doc_id_2")] + expected_value_with_citations = [ + ("document", {"url": "url_1", "doc_id": "doc_id_1"}), + ("document2", {"url": "url_2", "doc_id": "doc_id_2"}), + ] assert data_with_citations == expected_value_with_citations app_with_settings.db.reset() diff --git a/tests/vectordb/test_elasticsearch_db.py b/tests/vectordb/test_elasticsearch_db.py index 75c54c57..9ceaa846 100644 --- a/tests/vectordb/test_elasticsearch_db.py +++ b/tests/vectordb/test_elasticsearch_db.py @@ -66,8 +66,8 @@ class TestEsDB(unittest.TestCase): results_with_citations = self.db.query(query, n_results=2, where={}, skip_embedding=False, citations=True) expected_results_with_citations = [ - ("This is a document.", "url_1", "doc_id_1"), - 
("This is another document.", "url_2", "doc_id_2"), + ("This is a document.", {"url": "url_1", "doc_id": "doc_id_1"}), + ("This is another document.", {"url": "url_2", "doc_id": "doc_id_2"}), ] self.assertEqual(results_with_citations, expected_results_with_citations) diff --git a/tests/vectordb/test_zilliz_db.py b/tests/vectordb/test_zilliz_db.py index 80cbf205..bdc0fc15 100644 --- a/tests/vectordb/test_zilliz_db.py +++ b/tests/vectordb/test_zilliz_db.py @@ -150,7 +150,9 @@ class TestZillizDBCollection: output_fields=["text", "url", "doc_id"], ) - assert query_result_with_citations == [("result_doc", "url_1", "doc_id_1")] + assert query_result_with_citations == [ + ("result_doc", {"text": "result_doc", "url": "url_1", "doc_id": "doc_id_1"}) + ] @patch("embedchain.vectordb.zilliz.MilvusClient", autospec=True) @patch("embedchain.vectordb.zilliz.connections", autospec=True) @@ -202,4 +204,6 @@ class TestZillizDBCollection: output_fields=["text", "url", "doc_id"], ) - assert query_result_with_citations == [("result_doc", "url_1", "doc_id_1")] + assert query_result_with_citations == [ + ("result_doc", {"text": "result_doc", "url": "url_1", "doc_id": "doc_id_1"}) + ]