fix: url metadata for all datatypes (#613)

cachho
2023-09-13 19:19:48 +02:00
committed by GitHub
parent 701d0b21ef
commit 79efa51941
4 changed files with 116 additions and 16 deletions


@@ -21,7 +21,8 @@ from embedchain.embedder.base import BaseEmbedder
from embedchain.helper.json_serializable import JSONSerializable
from embedchain.llm.base import BaseLlm
from embedchain.loaders.base_loader import BaseLoader
from embedchain.models.data_type import DataType
from embedchain.models.data_type import (DataType, DirectDataType,
                                          IndirectDataType, SpecialDataType)
from embedchain.utils import detect_datatype
from embedchain.vectordb.base import BaseVectorDB
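For context, the new import splits the data types into three category enums so that membership checks like data_type.value in [item.value for item in DirectDataType] work. Below is a minimal sketch of how such enums might be laid out; apart from QNA_PAIR, which the diff references, the member names are illustrative and not taken from embedchain's actual definitions.

from enum import Enum


class DirectDataType(Enum):
    # Raw content handed in directly; there is no external source to re-fetch.
    TEXT = "text"


class IndirectDataType(Enum):
    # Content identified by a stable source reference such as a URL or file path.
    WEB_PAGE = "web_page"
    PDF_FILE = "pdf_file"


class SpecialDataType(Enum):
    # Needs custom logic to decide whether the data already exists.
    QNA_PAIR = "qna_pair"


class DataType(Enum):
    # Umbrella enum combining the members of all three categories.
    TEXT = "text"
    WEB_PAGE = "web_page"
    PDF_FILE = "pdf_file"
    QNA_PAIR = "qna_pair"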
@@ -339,16 +340,53 @@ class EmbedChain(JSONSerializable):
:param source_id: Hexadecimal hash of the source.
:return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
"""
existing_embeddings_data = self.db.get(
    where={
        "url": src,
    },
    limit=1,
)
try:
    existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
except Exception:
    existing_doc_id = None
# Find existing embeddings for the source.
# How the check is done depends on the data type.
if chunker.data_type.value in [item.value for item in DirectDataType]:
    # DirectDataTypes can't be updated.
    # Think of a text: either it's identical, in which case nothing changes and there is
    # nothing to update, or it's different, in which case it is added as a new text.
    existing_doc_id = None
elif chunker.data_type.value in [item.value for item in IndirectDataType]:
    # These types have an indirect source reference.
    # As long as the reference stays the same, they can be updated.
    existing_embeddings_data = self.db.get(
        where={
            "url": src,
        },
        limit=1,
    )
    try:
        existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
    except Exception:
        existing_doc_id = None
elif chunker.data_type.value in [item.value for item in SpecialDataType]:
    # These types don't carry an indirect source reference.
    # With custom logic they can still be attributed to a source and updated.
    if chunker.data_type == DataType.QNA_PAIR:
        # QNA_PAIRs update the answer if the question already exists.
        existing_embeddings_data = self.db.get(
            where={
                "question": src[0],
            },
            limit=1,
        )
        try:
            existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
        except Exception:
            existing_doc_id = None
    else:
        raise NotImplementedError(
            f"SpecialDataType {chunker.data_type} needs custom logic to check for existing data"
        )
else:
    raise TypeError(
        f"{chunker.data_type} is of type {type(chunker.data_type)}. "
        "It should be DirectDataType, IndirectDataType or SpecialDataType."
    )
# Create chunks
embeddings_data = chunker.create_chunks(loader, src)
# spread chunking results
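Pulled out of the method, the new branching amounts to the following lookup rule, shown here as a self-contained sketch. find_existing_doc_id and first_doc_id are hypothetical helpers, not part of embedchain's API; the sketch assumes the category enums sketched after the import hunk and a Chroma-style db.get(where=..., limit=...) that returns a dict with a "metadatas" list, as the diff's try/except implies.

from typing import Any, Optional


def find_existing_doc_id(db: Any, data_type: DataType, src: Any) -> Optional[str]:
    """Return the doc_id of an existing source, or None if nothing matches."""

    def first_doc_id(result: dict) -> Optional[str]:
        try:
            return result.get("metadatas", [])[0]["doc_id"]
        except Exception:
            return None

    if data_type.value in [item.value for item in DirectDataType]:
        # Direct data is either identical (nothing to update) or brand new content.
        return None
    if data_type.value in [item.value for item in IndirectDataType]:
        # Indirect data is identified by its source reference, stored as "url".
        return first_doc_id(db.get(where={"url": src}, limit=1))
    if data_type.value in [item.value for item in SpecialDataType]:
        if data_type == DataType.QNA_PAIR:
            # src is a (question, answer) pair; match on the question.
            return first_doc_id(db.get(where={"question": src[0]}, limit=1))
        raise NotImplementedError(
            f"SpecialDataType {data_type} needs custom logic to check for existing data"
        )
    raise TypeError(
        f"{data_type} is of type {type(data_type)}. "
        "It should be DirectDataType, IndirectDataType or SpecialDataType."
    )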