Remove load_and_embed_v1 method since it is not used (#638)
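
Deletes the old load_and_embed implementation and renames load_and_embed_v2 to load_and_embed, updating the single call site in add() and dropping the now-unused Tuple import. Nothing changes for callers; below is a minimal sketch of the public flow that now routes through the renamed method (assuming the standard embedchain App entry point with an API key configured in the environment; the URL and data_type are illustrative):

    from embedchain import App

    app = App()
    # add() chunks, dedupes, and embeds internally via load_and_embed
    # (formerly load_and_embed_v2)
    app.add("https://example.com/page", data_type="web_page")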

Ayush Mishra
2023-09-27 04:58:49 +05:30
committed by GitHub
parent 84e5932ea5
commit b8a838aee1


@@ -6,7 +6,7 @@ import os
 import threading
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional

 import requests
 from dotenv import load_dotenv
@@ -200,7 +200,7 @@ class EmbedChain(JSONSerializable):

         data_formatter = DataFormatter(data_type, config)
         self.user_asks.append([source, data_type.value, metadata])
-        documents, metadatas, _ids, new_chunks = self.load_and_embed_v2(
+        documents, metadatas, _ids, new_chunks = self.load_and_embed(
             data_formatter.loader, data_formatter.chunker, source, metadata, source_id, dry_run
         )
         if data_type in {DataType.DOCS_SITE}:
@@ -255,92 +255,6 @@ class EmbedChain(JSONSerializable):
         )
         return self.add(source=source, data_type=data_type, metadata=metadata, config=config)

-    def load_and_embed(
-        self,
-        loader: BaseLoader,
-        chunker: BaseChunker,
-        src: Any,
-        metadata: Optional[Dict[str, Any]] = None,
-        source_id: Optional[str] = None,
-        dry_run=False,
-    ) -> Tuple[List[str], Dict[str, Any], List[str], int]:
-        """The loader to use to load the data.
-
-        :param loader: The loader to use to load the data.
-        :type loader: BaseLoader
-        :param chunker: The chunker to use to chunk the data.
-        :type chunker: BaseChunker
-        :param src: The data to be handled by the loader.
-            Can be a URL for remote sources or local content for local loaders.
-        :type src: Any
-        :param metadata: Metadata associated with the data source., defaults to None
-        :type metadata: Dict[str, Any], optional
-        :param source_id: Hexadecimal hash of the source., defaults to None
-        :type source_id: str, optional
-        :param dry_run: Optional. A dry run returns chunks and doesn't update DB.
-        :type dry_run: bool, defaults to False
-        :return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
-        :rtype: Tuple[List[str], Dict[str, Any], List[str], int]
-        """
-        embeddings_data = chunker.create_chunks(loader, src)
-
-        # spread chunking results
-        documents = embeddings_data["documents"]
-        metadatas = embeddings_data["metadatas"]
-        ids = embeddings_data["ids"]
-
-        # get existing ids, and discard doc if any common id exist.
-        where = {"app_id": self.config.id} if self.config.id is not None else {}
-        # where={"url": src}
-        db_result = self.db.get(
-            ids=ids,
-            where=where,  # optional filter
-        )
-        existing_ids = set(db_result["ids"])
-
-        if len(existing_ids):
-            data_dict = {id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)}
-            data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids}
-
-            if not data_dict:
-                src_copy = src
-                if len(src_copy) > 50:
-                    src_copy = src[:50] + "..."
-                print(f"All data from {src_copy} already exists in the database.")
-                # Make sure to return a matching return type
-                return [], [], [], 0
-
-            ids = list(data_dict.keys())
-            documents, metadatas = zip(*data_dict.values())
-
-        # Loop though all metadatas and add extras.
-        new_metadatas = []
-        for m in metadatas:
-            # Add app id in metadatas so that they can be queried on later
-            if self.config.id:
-                m["app_id"] = self.config.id
-
-            # Add hashed source
-            m["hash"] = source_id
-
-            # Note: Metadata is the function argument
-            if metadata:
-                # Spread whatever is in metadata into the new object.
-                m.update(metadata)
-
-            new_metadatas.append(m)
-        metadatas = new_metadatas
-
-        if dry_run:
-            return list(documents), metadatas, ids, 0
-
-        # Count before, to calculate a delta in the end.
-        chunks_before_addition = self.db.count()
-
-        self.db.add(documents=documents, metadatas=metadatas, ids=ids)
-        count_new_chunks = self.db.count() - chunks_before_addition
-        print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}"))
-        return list(documents), metadatas, ids, count_new_chunks

     def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
         """
@@ -392,7 +306,7 @@ class EmbedChain(JSONSerializable):
                 "When it should be DirectDataType, IndirectDataType or SpecialDataType."
             )

-    def load_and_embed_v2(
+    def load_and_embed(
         self,
         loader: BaseLoader,
         chunker: BaseChunker,