[Feature/Improvements] Delete data sources from metadata db when using app.delete() (#1286)

Deshraj Yadav
2024-02-26 13:18:42 -08:00
committed by GitHub
parent 92dd7edb57
commit 752f638cfc
4 changed files with 19 additions and 7 deletions

View File

@@ -16,6 +16,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
         pod_config: Optional[dict[str, any]] = None,
         serverless_config: Optional[dict[str, any]] = None,
         hybrid_search: bool = False,
+        bm25_encoder: any = None,
         **extra_params: dict[str, any],
     ):
         self.metric = metric
@@ -24,6 +25,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
         self.vector_dimension = vector_dimension
         self.extra_params = extra_params
         self.hybrid_search = hybrid_search
         self.bm25_encoder = bm25_encoder
         if pod_config is None and serverless_config is None:
             # If no config is provided, use the default pod spec config
             pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter")
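Note: the new bm25_encoder option lets callers pass a BM25 encoder fitted on their own corpus instead of relying on the library-wide default (see the PineconeDB change further down). A minimal sketch of how it might be used, assuming pinecone-text is installed and that PineconeDBConfig is importable from the path shown (the import path and corpus are illustrative, not taken from this commit):

# Sketch only: fit a BM25 encoder on your own documents and hand it to the config.
from pinecone_text.sparse import BM25Encoder

from embedchain.config.vectordb.pinecone import PineconeDBConfig  # assumed module path

encoder = BM25Encoder()
encoder.fit(["first example document", "second example document"])  # illustrative corpus

config = PineconeDBConfig(hybrid_search=True, bm25_encoder=encoder)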

View File

@@ -6,17 +6,20 @@ from typing import Any, Optional, Union
 from dotenv import load_dotenv
 from langchain.docstore.document import Document
-from embedchain.cache import adapt, get_gptcache_session, gptcache_data_convert, gptcache_update_cache_callback
+from embedchain.cache import (adapt, get_gptcache_session,
+                              gptcache_data_convert,
+                              gptcache_update_cache_callback)
 from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.config import AddConfig, BaseLlmConfig, ChunkerConfig
 from embedchain.config.base_app_config import BaseAppConfig
-from embedchain.core.db.models import DataSource
+from embedchain.core.db.models import ChatHistory, DataSource
 from embedchain.data_formatter import DataFormatter
 from embedchain.embedder.base import BaseEmbedder
 from embedchain.helpers.json_serializable import JSONSerializable
 from embedchain.llm.base import BaseLlm
 from embedchain.loaders.base_loader import BaseLoader
-from embedchain.models.data_type import DataType, DirectDataType, IndirectDataType, SpecialDataType
+from embedchain.models.data_type import (DataType, DirectDataType,
+                                         IndirectDataType, SpecialDataType)
 from embedchain.utils.misc import detect_datatype, is_valid_json_string
 from embedchain.vectordb.base import BaseVectorDB
@@ -642,9 +645,10 @@ class EmbedChain(JSONSerializable):
         """
         try:
             self.db_session.query(DataSource).filter_by(app_id=self.config.id).delete()
+            self.db_session.query(ChatHistory).filter_by(app_id=self.config.id).delete()
             self.db_session.commit()
         except Exception as e:
-            logging.error(f"Error deleting chat history: {e}")
+            logging.error(f"Error deleting data sources: {e}")
             self.db_session.rollback()
             return None
         self.db.reset()
@@ -682,6 +686,13 @@ class EmbedChain(JSONSerializable):
         :param source_hash: The hash of the source.
         :type source_hash: str
         """
+        try:
+            self.db_session.query(DataSource).filter_by(hash=source_id, app_id=self.config.id).delete()
+            self.db_session.commit()
+        except Exception as e:
+            logging.error(f"Error deleting data sources: {e}")
+            self.db_session.rollback()
+            return None
         self.db.delete(where={"hash": source_id})
         logging.info(f"Successfully deleted {source_id}")
         # Send anonymous telemetry
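Note: with these two hunks, delete() removes the matching DataSource row from the metadata database in addition to the vector store entry, and reset() also clears ChatHistory rows for the app. A rough usage sketch, assuming the usual App entry point and that add() returns the source hash (the URL is illustrative):

# Sketch only: not taken from this commit.
from embedchain import App

app = App()
source_hash = app.add("https://example.com/some-article")  # add() is assumed to return the source hash

app.delete(source_hash)  # now also deletes the DataSource row for this hash and app_id
app.reset()              # now also deletes DataSource and ChatHistory rows for this app_id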

View File

@@ -49,9 +49,8 @@ class PineconeDB(BaseVectorDB):
         # Setup BM25Encoder if sparse vectors are to be used
         self.bm25_encoder = None
         if self.config.hybrid_search:
-            # TODO: Add support for fitting BM25Encoder on any corpus
             logging.info("Initializing BM25Encoder for sparse vectors..")
-            self.bm25_encoder = BM25Encoder.default()
+            self.bm25_encoder = self.config.bm25_encoder if self.config.bm25_encoder else BM25Encoder.default()
         # Call parent init here because embedder is needed
         super().__init__(config=self.config)

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.86"
+version = "0.1.87"
 description = "Simplest open source retrieval(RAG) framework"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",