Remove load_and_embed_v1 method since it is not used (#638)

2023-09-27 04:58:49 +05:30
parent 84e5932ea5
commit b8a838aee1
1 changed files with 3 additions and 89 deletions
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -6,7 +6,7 @@ import os
 import threading
 import uuid
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional
 import requests
 from dotenv import load_dotenv
@@ -200,7 +200,7 @@ class EmbedChain(JSONSerializable):
        data_formatter = DataFormatter(data_type, config)
        self.user_asks.append([source, data_type.value, metadata])
-        documents, metadatas, _ids, new_chunks = self.load_and_embed_v2(
+        documents, metadatas, _ids, new_chunks = self.load_and_embed(
            data_formatter.loader, data_formatter.chunker, source, metadata, source_id, dry_run
        )
        if data_type in {DataType.DOCS_SITE}:
@@ -255,92 +255,6 @@ class EmbedChain(JSONSerializable):
        )
        return self.add(source=source, data_type=data_type, metadata=metadata, config=config)
    def load_and_embed(
        self,
        loader: BaseLoader,
        chunker: BaseChunker,
        src: Any,
        metadata: Optional[Dict[str, Any]] = None,
        source_id: Optional[str] = None,
        dry_run=False,
    ) -> Tuple[List[str], List[Dict[str, Any]], List[str], int]:
        """Chunk a source, skip chunks whose ids are already stored, and add the rest to the database.

        :param loader: The loader to use to load the data.
        :type loader: BaseLoader
        :param chunker: The chunker to use to chunk the data.
        :type chunker: BaseChunker
        :param src: The data to be handled by the loader.
        Can be a URL for remote sources or local content for local loaders.
        :type src: Any
        :param metadata: Extra metadata merged into every chunk's metadata, defaults to None
        :type metadata: Dict[str, Any], optional
        :param source_id: Hexadecimal hash of the source, stored under the "hash" key
        of every chunk's metadata, defaults to None
        :type source_id: str, optional
        :param dry_run: Optional. A dry run returns chunks and doesn't update DB.
        :type dry_run: bool, defaults to False
        :return: (List) documents (embedded text), (List) metadatas (one dict per document),
        (List) ids, (int) number of chunks added. The count is 0 for a dry run and
        when every chunk id already exists in the database.
        :rtype: Tuple[List[str], List[Dict[str, Any]], List[str], int]
        """
        embeddings_data = chunker.create_chunks(loader, src)

        # Spread chunking results. Copy into lists so that every code path below
        # hands the same container type to the database and to the caller.
        documents = list(embeddings_data["documents"])
        metadatas = list(embeddings_data["metadatas"])
        ids = list(embeddings_data["ids"])

        # Look up which of these ids are already stored (scoped to this app when an id is configured).
        where = {"app_id": self.config.id} if self.config.id is not None else {}
        db_result = self.db.get(
            ids=ids,
            where=where,  # optional filter
        )
        existing_ids = set(db_result["ids"])

        if existing_ids:
            # Keep only chunks whose id is not stored yet. Keying by id also collapses
            # duplicate ids within this batch (the last occurrence wins).
            fresh_chunks = {
                chunk_id: (document, chunk_metadata)
                for chunk_id, document, chunk_metadata in zip(ids, documents, metadatas)
                if chunk_id not in existing_ids
            }

            if not fresh_chunks:
                # `src` is typed Any, so stringify before measuring/truncating it for the message.
                src_copy = str(src)
                if len(src_copy) > 50:
                    src_copy = src_copy[:50] + "..."
                print(f"All data from {src_copy} already exists in the database.")
                # Make sure to return a matching return type
                return [], [], [], 0

            ids = list(fresh_chunks)
            documents = [document for document, _ in fresh_chunks.values()]
            metadatas = [chunk_metadata for _, chunk_metadata in fresh_chunks.values()]

        # Enrich every chunk's metadata in place.
        for chunk_metadata in metadatas:
            # Add app id in metadatas so that they can be queried on later
            if self.config.id:
                chunk_metadata["app_id"] = self.config.id

            # Add hashed source
            chunk_metadata["hash"] = source_id

            # Caller-supplied metadata is applied last, so it overrides the keys set above.
            if metadata:
                chunk_metadata.update(metadata)

        if dry_run:
            return documents, metadatas, ids, 0

        # Count before, to calculate a delta in the end.
        chunks_before_addition = self.db.count()

        self.db.add(documents=documents, metadatas=metadatas, ids=ids)
        count_new_chunks = self.db.count() - chunks_before_addition
        print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}"))
        return documents, metadatas, ids, count_new_chunks
    def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
        """
@@ -392,7 +306,7 @@ class EmbedChain(JSONSerializable):
                "When it should be  DirectDataType, IndirectDataType or SpecialDataType."
            )
-    def load_and_embed_v2(
+    def load_and_embed(
        self,
        loader: BaseLoader,
        chunker: BaseChunker,