Rename embedchain to mem0 and open sourcing code for long term memory (#1474)
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
This commit is contained in:
87
embedchain/embedchain/chunkers/base_chunker.py
Normal file
87
embedchain/embedchain/chunkers/base_chunker.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import hashlib
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from embedchain.config.add_config import ChunkerConfig
|
||||
from embedchain.helpers.json_serializable import JSONSerializable
|
||||
from embedchain.models.data_type import DataType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseChunker(JSONSerializable):
    """Base class for chunkers: loads raw data via a loader and splits it
    into deduplicated, id-addressable chunks for the vector store."""

    def __init__(self, text_splitter):
        """Initialize the chunker.

        :param text_splitter: Object exposing ``split_text(content)`` used to
            break raw content into chunks.
        """
        self.text_splitter = text_splitter
        # Set later via `set_data_type`; recorded on every chunk's metadata
        # so stored chunks can be queried by data type.
        self.data_type = None

    def create_chunks(self, loader, src, app_id=None, config: Optional[ChunkerConfig] = None):
        """
        Loads data and chunks it.

        :param loader: The loader whose `load_data` method is used to create
        the raw data.
        :param src: The data to be handled by the loader. Can be a URL for
        remote sources or local content for local loaders.
        :param app_id: App id used to generate the doc_id.
        :param config: Optional chunker config; its `min_chunk_size` filters
        out chunks shorter than that many characters (defaults to 1).
        :return: dict with keys "documents", "ids", "metadatas", "doc_id".
        """
        documents = []
        chunk_ids = []
        seen_chunk_ids = set()  # dedupe chunks whose content+url hash to the same id
        min_chunk_size = config.min_chunk_size if config is not None else 1
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("Skipping chunks smaller than %s characters", min_chunk_size)
        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
        # Prefix app_id in the document id if app_id is not None to
        # distinguish between different documents stored in the same
        # elasticsearch or opensearch index
        doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
        metadatas = []
        for data in data_records:
            content = data["content"]

            metadata = data["meta_data"]
            # add data type to meta data to allow query using data type
            metadata["data_type"] = self.data_type.value
            metadata["doc_id"] = doc_id

            # TODO: Currently defaulting to the src as the url. This is done intentionally
            # since some data types like the 'gmail' loader don't have the url in the meta data.
            url = metadata.get("url", src)

            chunks = self.get_chunks(content)
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                chunk_id = f"{app_id}--{chunk_id}" if app_id is not None else chunk_id
                if chunk_id not in seen_chunk_ids and len(chunk) >= min_chunk_size:
                    seen_chunk_ids.add(chunk_id)
                    chunk_ids.append(chunk_id)
                    documents.append(chunk)
                    # NOTE(review): the same metadata dict is shared by every
                    # chunk of a record; mutating one entry affects its siblings.
                    metadatas.append(metadata)
        return {
            "documents": documents,
            "ids": chunk_ids,
            "metadatas": metadatas,
            "doc_id": doc_id,
        }

    def get_chunks(self, content):
        """
        Returns chunks using text splitter instance.

        Override in child class if custom logic.
        """
        return self.text_splitter.split_text(content)

    def set_data_type(self, data_type: DataType):
        """
        set the data type of chunker
        """
        self.data_type = data_type

        # TODO: This should be done during initialization. This means it has to be done in the child classes.

    @staticmethod
    def get_word_count(documents) -> int:
        """Return the total whitespace-delimited word count across documents.

        Uses ``str.split()`` (splits on any whitespace run, no empty tokens)
        instead of ``split(" ")``, which over-counts on consecutive spaces and
        fails to split on tabs/newlines.
        """
        return sum(len(document.split()) for document in documents)
|
||||
Reference in New Issue
Block a user