From 536f85b78ae81293e7e2aff60abf3da97b1c54fe Mon Sep 17 00:00:00 2001 From: Deshraj Yadav Date: Thu, 11 Apr 2024 15:00:04 -0700 Subject: [PATCH] [Improvements] Improve logging and fix insertion in data_sources table (#1337) --- embedchain/app.py | 14 +++++--------- embedchain/embedchain.py | 14 +++++++++----- embedchain/llm/anthropic.py | 1 - pyproject.toml | 2 +- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/embedchain/app.py b/embedchain/app.py index 9ea4eaf8..a486eec2 100644 --- a/embedchain/app.py +++ b/embedchain/app.py @@ -9,14 +9,9 @@ import requests import yaml from tqdm import tqdm -from embedchain.cache import ( - Config, - ExactMatchEvaluation, - SearchDistanceEvaluation, - cache, - gptcache_data_manager, - gptcache_pre_function, -) +from embedchain.cache import (Config, ExactMatchEvaluation, + SearchDistanceEvaluation, cache, + gptcache_data_manager, gptcache_pre_function) from embedchain.client import Client from embedchain.config import AppConfig, CacheConfig, ChunkerConfig from embedchain.core.db.database import get_session, init_db, setup_engine @@ -25,7 +20,8 @@ from embedchain.embedchain import EmbedChain from embedchain.embedder.base import BaseEmbedder from embedchain.embedder.openai import OpenAIEmbedder from embedchain.evaluation.base import BaseMetric -from embedchain.evaluation.metrics import AnswerRelevance, ContextRelevance, Groundedness +from embedchain.evaluation.metrics import (AnswerRelevance, ContextRelevance, + Groundedness) from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory from embedchain.helpers.json_serializable import register_deserializable from embedchain.llm.base import BaseLlm diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index ee824dd0..64a021fb 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -179,6 +179,10 @@ class EmbedChain(JSONSerializable): if data_type in {DataType.DOCS_SITE}: self.is_docs_site_instance = True + # Convert the source to a string if it is not already + if not isinstance(source, str): + source = str(source) + # Insert the data into the 'ec_data_sources' table self.db_session.add( DataSource( @@ -310,12 +314,12 @@ class EmbedChain(JSONSerializable): new_doc_id = embeddings_data["doc_id"] if existing_doc_id and existing_doc_id == new_doc_id: - print("Doc content has not changed. Skipping creating chunks and embeddings") + logger.info("Doc content has not changed. Skipping creating chunks and embeddings") return [], [], [], 0 # this means that doc content has changed. if existing_doc_id and existing_doc_id != new_doc_id: - print("Doc content has changed. Recomputing chunks and embeddings intelligently.") + logger.info("Doc content has changed. Recomputing chunks and embeddings intelligently.") self.db.delete({"doc_id": existing_doc_id}) # get existing ids, and discard doc if any common id exist. @@ -341,7 +345,7 @@ class EmbedChain(JSONSerializable): src_copy = src if len(src_copy) > 50: src_copy = src[:50] + "..." - print(f"All data from {src_copy} already exists in the database.") + logger.info(f"All data from {src_copy} already exists in the database.") # Make sure to return a matching return type return [], [], [], 0 @@ -388,12 +392,12 @@ class EmbedChain(JSONSerializable): if batch_docs: self.db.add(documents=batch_docs, metadatas=batch_meta, ids=batch_ids, **kwargs) except Exception as e: - print(f"Failed to add batch due to a bad request: {e}") + logger.info(f"Failed to add batch due to a bad request: {e}") # Handle the error, e.g., by logging, retrying, or skipping pass count_new_chunks = self.db.count() - chunks_before_addition - print(f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}") + logger.info(f"Successfully saved {str(src)[:100]} ({chunker.data_type}). New chunks count: {count_new_chunks}") return list(documents), metadatas, ids, count_new_chunks diff --git a/embedchain/llm/anthropic.py b/embedchain/llm/anthropic.py index 6f71569b..a6874e47 100644 --- a/embedchain/llm/anthropic.py +++ b/embedchain/llm/anthropic.py @@ -26,7 +26,6 @@ class AnthropicLlm(BaseLlm): @staticmethod def _get_answer(prompt: str, config: BaseLlmConfig) -> str: - chat = ChatAnthropic( anthropic_api_key=os.environ["ANTHROPIC_API_KEY"], temperature=config.temperature, model_name=config.model ) diff --git a/pyproject.toml b/pyproject.toml index da8f5d96..e0e1b8b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "embedchain" -version = "0.1.99" +version = "0.1.100" description = "Simplest open source retrieval (RAG) framework" authors = [ "Taranjeet Singh ",