diff --git a/docs/api-reference/app/chat.mdx b/docs/api-reference/app/chat.mdx index b044777e..ad0b20c9 100644 --- a/docs/api-reference/app/chat.mdx +++ b/docs/api-reference/app/chat.mdx @@ -129,3 +129,18 @@ app.chat("What is the net worth of Bill Gates?", session_id="user2") app.chat("What was my last question", session_id="user1") # 'Your last question was "What is the net worth of Elon Musk?"' ``` + +### With custom context window + +If you want to customize the context window that you want to use during chat (default context window is 3 document chunks), you can do so using the following code snippet: + +```python with custom chunks size +from embedchain import App +from embedchain.config import BaseLlmConfig + +app = App() +app.add("https://www.forbes.com/profile/elon-musk") + +query_config = BaseLlmConfig(number_documents=5) +app.chat("What is the net worth of Elon Musk?", config=query_config) +``` diff --git a/docs/components/data-sources/custom.mdx b/docs/components/data-sources/custom.mdx index 7d151521..40a8c75e 100644 --- a/docs/components/data-sources/custom.mdx +++ b/docs/components/data-sources/custom.mdx @@ -7,11 +7,12 @@ When we say "custom", we mean that you can customize the loader and chunker to y ```python from embedchain import App import your_loader -import your_chunker +from my_module import CustomLoader +from my_module import CustomChunker app = App() -loader = your_loader() -chunker = your_chunker() +loader = CustomLoader() +chunker = CustomChunker() app.add("source", data_type="custom", loader=loader, chunker=chunker) ``` diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py index 6ce10b67..b4d50f78 100644 --- a/embedchain/chunkers/base_chunker.py +++ b/embedchain/chunkers/base_chunker.py @@ -39,11 +39,14 @@ class BaseChunker(JSONSerializable): for data in data_records: content = data["content"] - meta_data = data["meta_data"] + metadata = data["meta_data"] # add data type to meta data to allow query using
data type - meta_data["data_type"] = self.data_type.value - meta_data["doc_id"] = doc_id - url = meta_data["url"] + metadata["data_type"] = self.data_type.value + metadata["doc_id"] = doc_id + + # TODO: Currently defaulting to the src as the url. This is done intentionally since some + # data types, like the 'gmail' loader, don't have the url in the meta data. + url = metadata.get("url", src) chunks = self.get_chunks(content) for chunk in chunks: @@ -53,7 +56,7 @@ class BaseChunker(JSONSerializable): id_map[chunk_id] = True chunk_ids.append(chunk_id) documents.append(chunk) - metadatas.append(meta_data) + metadatas.append(metadata) return { "documents": documents, "ids": chunk_ids, diff --git a/pyproject.toml b/pyproject.toml index 452d0cec..0b333d85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.71" +version = "0.1.72" description = "Simplest open source retrieval(RAG) framework" authors = [ "Taranjeet Singh ",