[OpenSearch] Add chunks specific to an app_id if present (#765)

2023-10-04 15:46:22 -07:00
parent 352e71461d
commit 64a34cac32
6 changed files with 81 additions and 55 deletions
--- a/embedchain/chunkers/base_chunker.py
+++ b/embedchain/chunkers/base_chunker.py
@@ -10,7 +10,7 @@ class BaseChunker(JSONSerializable):
        self.text_splitter = text_splitter
        self.data_type = None

-    def create_chunks(self, loader, src):
+    def create_chunks(self, loader, src, app_id=None):
        """
        Loads data and chunks it.

@@ -18,13 +18,18 @@ class BaseChunker(JSONSerializable):
        the raw data.
        :param src: The data to be handled by the loader. Can be a URL for
        remote sources or local content for local loaders.
+        :param app_id: Optional app id; if not None, it is prefixed to the doc_id.
        """
        documents = []
-        ids = []
+        chunk_ids = []
        idMap = {}
        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
+        # Prefix the document id with app_id, if app_id is not None, to
+        # distinguish between different documents stored in the same
+        # Elasticsearch or OpenSearch index
+        doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
        metadatas = []
        for data in data_records:
            content = data["content"]
@@ -41,12 +46,12 @@ class BaseChunker(JSONSerializable):
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                if idMap.get(chunk_id) is None:
                    idMap[chunk_id] = True
-                    ids.append(chunk_id)
+                    chunk_ids.append(chunk_id)
                    documents.append(chunk)
                    metadatas.append(meta_data)
        return {
            "documents": documents,
-            "ids": ids,
+            "ids": chunk_ids,
            "metadatas": metadatas,
            "doc_id": doc_id,
        }
--- a/embedchain/chunkers/images.py
+++ b/embedchain/chunkers/images.py
@@ -20,7 +20,7 @@ class ImagesChunker(BaseChunker):
        )
        super().__init__(image_splitter)

-    def create_chunks(self, loader, src):
+    def create_chunks(self, loader, src, app_id=None):
        """
        Loads the image(s), and creates their corresponding embedding. This creates one chunk for each image

@@ -35,6 +35,7 @@ class ImagesChunker(BaseChunker):
        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
+        doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
        metadatas = []
        for data in data_records:
            meta_data = data["meta_data"]