fix: url metadata for all datatypes (#613)
This commit is contained in:
@@ -21,7 +21,8 @@ from embedchain.embedder.base import BaseEmbedder
|
||||
from embedchain.helper.json_serializable import JSONSerializable
|
||||
from embedchain.llm.base import BaseLlm
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.models.data_type import DataType
|
||||
from embedchain.models.data_type import (DataType, DirectDataType,
|
||||
IndirectDataType, SpecialDataType)
|
||||
from embedchain.utils import detect_datatype
|
||||
from embedchain.vectordb.base import BaseVectorDB
|
||||
|
||||
@@ -339,16 +340,53 @@ class EmbedChain(JSONSerializable):
|
||||
:param source_id: Hexadecimal hash of the source.
|
||||
:return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
|
||||
"""
|
||||
existing_embeddings_data = self.db.get(
|
||||
where={
|
||||
"url": src,
|
||||
},
|
||||
limit=1,
|
||||
)
|
||||
try:
|
||||
existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
|
||||
except Exception:
|
||||
# Find existing embeddings for the source
|
||||
# How existing embeddings are looked up depends on the data type.
|
||||
if chunker.data_type.value in [item.value for item in DirectDataType]:
|
||||
# DirectDataTypes can't be updated.
|
||||
# Think of a text:
|
||||
# Either it's the same, then it won't change, so it's not an update.
|
||||
# Or it's different, then it will be added as a new text.
|
||||
existing_doc_id = None
|
||||
elif chunker.data_type.value in [item.value for item in IndirectDataType]:
|
||||
# These types have an indirect source reference
|
||||
# As long as the reference is the same, they can be updated.
|
||||
existing_embeddings_data = self.db.get(
|
||||
where={
|
||||
"url": src,
|
||||
},
|
||||
limit=1,
|
||||
)
|
||||
try:
|
||||
existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
|
||||
except Exception:
|
||||
existing_doc_id = None
|
||||
elif chunker.data_type.value in [item.value for item in SpecialDataType]:
|
||||
# These types don't contain indirect references.
|
||||
# Through custom logic, they can be attributed to a source and be updated.
|
||||
if chunker.data_type == DataType.QNA_PAIR:
|
||||
# QNA_PAIRs update the answer if the question already exists.
|
||||
existing_embeddings_data = self.db.get(
|
||||
where={
|
||||
"question": src[0],
|
||||
},
|
||||
limit=1,
|
||||
)
|
||||
try:
|
||||
existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
|
||||
except Exception:
|
||||
existing_doc_id = None
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"SpecialDataType {chunker.data_type} must have a custom logic to check for existing data"
|
||||
)
|
||||
else:
|
||||
raise TypeError(
|
||||
f"{chunker.data_type} is type {type(chunker.data_type)}. "
|
||||
"When it should be DirectDataType, IndirectDataType or SpecialDataType."
|
||||
)
|
||||
|
||||
# Create chunks
|
||||
embeddings_data = chunker.create_chunks(loader, src)
|
||||
|
||||
# spread chunking results
|
||||
|
||||
Reference in New Issue
Block a user