[Bug fix] Fix issue with gmail loader (#1228)

2024-01-29 18:36:02 +05:30
parent 31bb0e7f0f
commit 2985b667b0
4 changed files with 28 additions and 9 deletions
--- a/docs/api-reference/app/chat.mdx
+++ b/docs/api-reference/app/chat.mdx
@@ -129,3 +129,18 @@ app.chat("What is the net worth of Bill Gates?", session_id="user2")
 app.chat("What was my last question", session_id="user1")
 # 'Your last question was "What is the net worth of Elon Musk?"'
 ```
+
+### With custom context window
+
+If you want to customize the context window used during chat (the default context window is 3 document chunks), you can do so using the following code snippet:
+
+```python with custom chunks size
+from embedchain import App
+from embedchain.config import BaseLlmConfig
+
+app = App()
+app.add("https://www.forbes.com/profile/elon-musk")
+
+query_config = BaseLlmConfig(number_documents=5)
+app.chat("What is the net worth of Elon Musk?", config=query_config)
+```
--- a/docs/components/data-sources/custom.mdx
+++ b/docs/components/data-sources/custom.mdx
@@ -7,11 +7,12 @@ When we say "custom", we mean that you can customize the loader and chunker to y
 ```python
 from embedchain import App
 import your_loader
-import your_chunker
+from my_module import CustomLoader
+from my_module import CustomChunker

 app = App()
-loader = your_loader()
-chunker = your_chunker()
+loader = CustomLoader()
+chunker = CustomChunker()

 app.add("source", data_type="custom", loader=loader, chunker=chunker)
 ```
--- a/embedchain/chunkers/base_chunker.py
+++ b/embedchain/chunkers/base_chunker.py
@@ -39,11 +39,14 @@ class BaseChunker(JSONSerializable):
        for data in data_records:
            content = data["content"]

-            meta_data = data["meta_data"]
+            metadata = data["meta_data"]
            # add data type to meta data to allow query using data type
-            meta_data["data_type"] = self.data_type.value
-            meta_data["doc_id"] = doc_id
-            url = meta_data["url"]
+            metadata["data_type"] = self.data_type.value
+            metadata["doc_id"] = doc_id
+
+            # TODO: Currently defaulting to the src as the url. This is done intentionally since some
+            # data types, such as the 'gmail' loader, don't have the url in the meta data.
+            url = metadata.get("url", src)

            chunks = self.get_chunks(content)
            for chunk in chunks:
@@ -53,7 +56,7 @@ class BaseChunker(JSONSerializable):
                    id_map[chunk_id] = True
                    chunk_ids.append(chunk_id)
                    documents.append(chunk)
-                    metadatas.append(meta_data)
+                    metadatas.append(metadata)
        return {
            "documents": documents,
            "ids": chunk_ids,
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.71"
+version = "0.1.72"
 description = "Simplest open source retrieval(RAG) framework"
 authors = [
    "Taranjeet Singh <taranjeet@embedchain.ai>",