diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..b9e1dbe0 --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ +# Variables +PYTHON := python3 +PIP := $(PYTHON) -m pip +PROJECT_NAME := embedchain + +# Targets +.PHONY: install format lint clean test + +install: + $(PIP) install --upgrade pip + $(PIP) install .[dev] + +format: + $(PYTHON) -m black . + $(PYTHON) -m isort . + +lint: + $(PYTHON) -m ruff . + +clean: + rm -rf dist build *.egg-info + +test: + $(PYTHON) -m pytest diff --git a/README.md b/README.md index 165a0c6b..7f10b3f5 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ embedchain is a framework to easily create LLM powered bots over any dataset. If - [Reset](#reset) - [Count](#count) - [How does it work?](#how-does-it-work) +- [Contribution Guidelines](#contribution-guidelines) - [Tech Stack](#tech-stack) - [Team](#team) - [Author](#author) @@ -551,6 +552,27 @@ embedchain is a framework which takes care of all these nuances and provides a s In the first release, we are making it easier for anyone to get a chatbot over any dataset up and running in less than a minute. All you need to do is create an app instance, add the data sets using `.add` function and then use `.query` function to get the relevant answer. +# Contribution Guidelines + +Thank you for your interest in contributing to the EmbedChain project! We welcome your ideas and contributions to help improve the project. Please follow the instructions below to get started: + +1. **Fork the repository**: Click on the "Fork" button at the top right corner of this repository page. This will create a copy of the repository in your own GitHub account. + +2. **Install the required dependencies**: Ensure that you have the necessary dependencies installed in your Python environment. You can do this by running the following command: + +```bash +make install +``` + +3. **Make changes in the code**: Create a new branch in your forked repository and make your desired changes in the codebase. +4. **Format code**: Before creating a pull request, it's important to ensure that your code follows our formatting guidelines. Run the following commands to format the code: + +```bash +make lint format +``` + +5. **Create a pull request**: When you are ready to contribute your changes, submit a pull request to the EmbedChain repository. Provide a clear and descriptive title for your pull request, along with a detailed description of the changes you have made. + # Tech Stack embedchain is built on the following stack: diff --git a/embedchain/__init__.py b/embedchain/__init__.py index 24e8f1c1..1c73b73f 100644 --- a/embedchain/__init__.py +++ b/embedchain/__init__.py @@ -1 +1 @@ -from .embedchain import App, OpenSourceApp, PersonApp, PersonOpenSourceApp \ No newline at end of file +from .embedchain import App, OpenSourceApp, PersonApp, PersonOpenSourceApp diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py index 83c00d72..acee7895 100644 --- a/embedchain/chunkers/base_chunker.py +++ b/embedchain/chunkers/base_chunker.py @@ -3,15 +3,17 @@ import hashlib class BaseChunker: def __init__(self, text_splitter): - ''' Initialize the chunker. ''' + """Initialize the chunker.""" self.text_splitter = text_splitter def create_chunks(self, loader, src): """ Loads data and chunks it. - :param loader: The loader which's `load_data` method is used to create the raw data. - :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders. + :param loader: The loader which's `load_data` method is used to create + the raw data. + :param src: The data to be handled by the loader. Can be a URL for + remote sources or local content for local loaders. """ documents = [] ids = [] @@ -27,7 +29,7 @@ class BaseChunker: for chunk in chunks: chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() - if (idMap.get(chunk_id) is None): + if idMap.get(chunk_id) is None: idMap[chunk_id] = True ids.append(chunk_id) documents.append(chunk) diff --git a/embedchain/chunkers/docx_file.py b/embedchain/chunkers/docx_file.py index 3277223f..dc5ef305 100644 --- a/embedchain/chunkers/docx_file.py +++ b/embedchain/chunkers/docx_file.py @@ -1,10 +1,9 @@ from typing import Optional -from embedchain.chunkers.base_chunker import BaseChunker -from embedchain.config.AddConfig import ChunkerConfig from langchain.text_splitter import RecursiveCharacterTextSplitter - +from embedchain.chunkers.base_chunker import BaseChunker +from embedchain.config.AddConfig import ChunkerConfig TEXT_SPLITTER_CHUNK_PARAMS = { "chunk_size": 1000, @@ -14,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { class DocxFileChunker(BaseChunker): - ''' Chunker for .docx file. ''' + """Chunker for .docx file.""" + def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: config = TEXT_SPLITTER_CHUNK_PARAMS diff --git a/embedchain/chunkers/pdf_file.py b/embedchain/chunkers/pdf_file.py index 76847a3c..34ab7607 100644 --- a/embedchain/chunkers/pdf_file.py +++ b/embedchain/chunkers/pdf_file.py @@ -1,9 +1,9 @@ from typing import Optional -from embedchain.chunkers.base_chunker import BaseChunker -from embedchain.config.AddConfig import ChunkerConfig from langchain.text_splitter import RecursiveCharacterTextSplitter +from embedchain.chunkers.base_chunker import BaseChunker +from embedchain.config.AddConfig import ChunkerConfig TEXT_SPLITTER_CHUNK_PARAMS = { "chunk_size": 1000, @@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { class PdfFileChunker(BaseChunker): - ''' Chunker for PDF file. ''' + """Chunker for PDF file.""" + def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: config = TEXT_SPLITTER_CHUNK_PARAMS diff --git a/embedchain/chunkers/qna_pair.py b/embedchain/chunkers/qna_pair.py index 6ea17af5..37533c93 100644 --- a/embedchain/chunkers/qna_pair.py +++ b/embedchain/chunkers/qna_pair.py @@ -1,9 +1,9 @@ from typing import Optional -from embedchain.chunkers.base_chunker import BaseChunker -from embedchain.config.AddConfig import ChunkerConfig from langchain.text_splitter import RecursiveCharacterTextSplitter +from embedchain.chunkers.base_chunker import BaseChunker +from embedchain.config.AddConfig import ChunkerConfig TEXT_SPLITTER_CHUNK_PARAMS = { "chunk_size": 300, @@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { class QnaPairChunker(BaseChunker): - ''' Chunker for QnA pair. ''' + """Chunker for QnA pair.""" + def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: config = TEXT_SPLITTER_CHUNK_PARAMS diff --git a/embedchain/chunkers/text.py b/embedchain/chunkers/text.py index 800ed67e..36d07427 100644 --- a/embedchain/chunkers/text.py +++ b/embedchain/chunkers/text.py @@ -1,9 +1,9 @@ from typing import Optional -from embedchain.chunkers.base_chunker import BaseChunker -from embedchain.config.AddConfig import ChunkerConfig from langchain.text_splitter import RecursiveCharacterTextSplitter +from embedchain.chunkers.base_chunker import BaseChunker +from embedchain.config.AddConfig import ChunkerConfig TEXT_SPLITTER_CHUNK_PARAMS = { "chunk_size": 300, @@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { class TextChunker(BaseChunker): - ''' Chunker for text. ''' + """Chunker for text.""" + def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: config = TEXT_SPLITTER_CHUNK_PARAMS diff --git a/embedchain/chunkers/web_page.py b/embedchain/chunkers/web_page.py index 17f9665b..2ec323b5 100644 --- a/embedchain/chunkers/web_page.py +++ b/embedchain/chunkers/web_page.py @@ -1,9 +1,9 @@ from typing import Optional -from embedchain.chunkers.base_chunker import BaseChunker -from embedchain.config.AddConfig import ChunkerConfig from langchain.text_splitter import RecursiveCharacterTextSplitter +from embedchain.chunkers.base_chunker import BaseChunker +from embedchain.config.AddConfig import ChunkerConfig TEXT_SPLITTER_CHUNK_PARAMS = { "chunk_size": 500, @@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { class WebPageChunker(BaseChunker): - ''' Chunker for web page. ''' + """Chunker for web page.""" + def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: config = TEXT_SPLITTER_CHUNK_PARAMS diff --git a/embedchain/chunkers/youtube_video.py b/embedchain/chunkers/youtube_video.py index 6467ddfc..d63b2e05 100644 --- a/embedchain/chunkers/youtube_video.py +++ b/embedchain/chunkers/youtube_video.py @@ -1,9 +1,9 @@ from typing import Optional -from embedchain.chunkers.base_chunker import BaseChunker -from embedchain.config.AddConfig import ChunkerConfig from langchain.text_splitter import RecursiveCharacterTextSplitter +from embedchain.chunkers.base_chunker import BaseChunker +from embedchain.config.AddConfig import ChunkerConfig TEXT_SPLITTER_CHUNK_PARAMS = { "chunk_size": 2000, @@ -13,7 +13,8 @@ TEXT_SPLITTER_CHUNK_PARAMS = { class YoutubeVideoChunker(BaseChunker): - ''' Chunker for Youtube video. ''' + """Chunker for Youtube video.""" + def __init__(self, config: Optional[ChunkerConfig] = None): if config is None: config = TEXT_SPLITTER_CHUNK_PARAMS diff --git a/embedchain/config/AddConfig.py b/embedchain/config/AddConfig.py index f0e7b434..5ff2b3da 100644 --- a/embedchain/config/AddConfig.py +++ b/embedchain/config/AddConfig.py @@ -1,4 +1,5 @@ from typing import Callable, Optional + from embedchain.config.BaseConfig import BaseConfig @@ -6,27 +7,36 @@ class ChunkerConfig(BaseConfig): """ Config for the chunker used in `add` method """ - def __init__(self, - chunk_size: Optional[int] = 4000, - chunk_overlap: Optional[int] = 200, - length_function: Optional[Callable[[str], int]] = len): + + def __init__( + self, + chunk_size: Optional[int] = 4000, + chunk_overlap: Optional[int] = 200, + length_function: Optional[Callable[[str], int]] = len, + ): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.length_function = length_function + class LoaderConfig(BaseConfig): """ Config for the chunker used in `add` method """ + def __init__(self): pass + class AddConfig(BaseConfig): """ Config for the `add` method. """ - def __init__(self, - chunker: Optional[ChunkerConfig] = None, - loader: Optional[LoaderConfig] = None): + + def __init__( + self, + chunker: Optional[ChunkerConfig] = None, + loader: Optional[LoaderConfig] = None, + ): self.loader = loader self.chunker = chunker diff --git a/embedchain/config/BaseConfig.py b/embedchain/config/BaseConfig.py index 200bfbf4..38d53f52 100644 --- a/embedchain/config/BaseConfig.py +++ b/embedchain/config/BaseConfig.py @@ -2,6 +2,7 @@ class BaseConfig: """ Base config. """ + def __init__(self): pass diff --git a/embedchain/config/ChatConfig.py b/embedchain/config/ChatConfig.py index 1f4b72d3..4e751745 100644 --- a/embedchain/config/ChatConfig.py +++ b/embedchain/config/ChatConfig.py @@ -1,8 +1,10 @@ -from embedchain.config.QueryConfig import QueryConfig from string import Template +from embedchain.config.QueryConfig import QueryConfig + DEFAULT_PROMPT = """ - You are a chatbot having a conversation with a human. You are given chat history and context. + You are a chatbot having a conversation with a human. You are given chat + history and context. You need to answer the query considering context, chat history and your knowledge base. If you don't know the answer or the answer is neither contained in the context nor in history, then simply say "I don't know". $context @@ -12,35 +14,41 @@ DEFAULT_PROMPT = """ Query: $query Helpful Answer: -""" +""" # noqa:E501 DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT) + class ChatConfig(QueryConfig): """ Config for the `chat` method, inherits from `QueryConfig`. """ + def __init__(self, template: Template = None, stream: bool = False): """ Initializes the ChatConfig instance. - :param template: Optional. The `Template` instance to use as a template for prompt. - :param stream: Optional. Control if response is streamed back to the user - :raises ValueError: If the template is not valid as template should contain $context and $query and $history + :param template: Optional. The `Template` instance to use as a + template for prompt. + :param stream: Optional. Control if response is streamed back to the + user + :raises ValueError: If the template is not valid as template should + contain $context and $query and $history """ if template is None: template = DEFAULT_PROMPT_TEMPLATE - # History is set as 0 to ensure that there is always a history, that way, there don't have to be two templates. - # Having two templates would make it complicated because the history is not user controlled. + # History is set as 0 to ensure that there is always a history, that + # way, there don't have to be two templates. + # Having two templates would make it complicated because the history + # is not user controlled. super().__init__(template, history=[0], stream=stream) def set_history(self, history): """ Chat history is not user provided and not set at initialization time - + :param history: (string) history to set """ self.history = history return - diff --git a/embedchain/config/InitConfig.py b/embedchain/config/InitConfig.py index 3048c1af..90b562d3 100644 --- a/embedchain/config/InitConfig.py +++ b/embedchain/config/InitConfig.py @@ -1,8 +1,9 @@ -import os import logging +import os from embedchain.config.BaseConfig import BaseConfig + class InitConfig(BaseConfig): """ Config to initialize an embedchain `App` instance. @@ -10,7 +11,8 @@ class InitConfig(BaseConfig): def __init__(self, log_level=None, ef=None, db=None): """ - :param log_level: Optional. (String) Debug level ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']. + :param log_level: Optional. (String) Debug level + ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']. :param ef: Optional. Embedding function to use. :param db: Optional. (Vector) database to use for embeddings. """ @@ -19,16 +21,18 @@ class InitConfig(BaseConfig): # Embedding Function if ef is None: from chromadb.utils import embedding_functions + self.ef = embedding_functions.OpenAIEmbeddingFunction( api_key=os.getenv("OPENAI_API_KEY"), organization_id=os.getenv("OPENAI_ORGANIZATION"), - model_name="text-embedding-ada-002" + model_name="text-embedding-ada-002", ) else: self.ef = ef if db is None: from embedchain.vectordb.chroma_db import ChromaDB + self.db = ChromaDB(ef=self.ef) else: self.db = db @@ -44,9 +48,10 @@ class InitConfig(BaseConfig): if debug_level is not None: level = getattr(logging, debug_level.upper(), None) if not isinstance(level, int): - raise ValueError(f'Invalid log level: {debug_level}') + raise ValueError(f"Invalid log level: {debug_level}") - logging.basicConfig(format="%(asctime)s [%(name)s] [%(levelname)s] %(message)s", - level=level) + logging.basicConfig( + format="%(asctime)s [%(name)s] [%(levelname)s] %(message)s", level=level + ) self.logger = logging.getLogger(__name__) return diff --git a/embedchain/config/QueryConfig.py b/embedchain/config/QueryConfig.py index 2faa3fb5..c3ed210b 100644 --- a/embedchain/config/QueryConfig.py +++ b/embedchain/config/QueryConfig.py @@ -1,6 +1,7 @@ -from embedchain.config.BaseConfig import BaseConfig -from string import Template import re +from string import Template + +from embedchain.config.BaseConfig import BaseConfig DEFAULT_PROMPT = """ Use the following pieces of context to answer the query at the end. @@ -11,7 +12,7 @@ DEFAULT_PROMPT = """ Query: $query Helpful Answer: -""" +""" # noqa:E501 DEFAULT_PROMPT_WITH_HISTORY = """ Use the following pieces of context to answer the query at the end. @@ -25,7 +26,7 @@ DEFAULT_PROMPT_WITH_HISTORY = """ Query: $query Helpful Answer: -""" +""" # noqa:E501 DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT) DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY) @@ -38,14 +39,17 @@ class QueryConfig(BaseConfig): """ Config for the `query` method. """ - def __init__(self, template: Template = None, history = None, stream: bool = False): + + def __init__(self, template: Template = None, history=None, stream: bool = False): """ Initializes the QueryConfig instance. - :param template: Optional. The `Template` instance to use as a template for prompt. + :param template: Optional. The `Template` instance to use as a + template for prompt. :param history: Optional. A list of strings to consider as history. - :param stream: Optional. Control if response is streamed back to the user - :raises ValueError: If the template is not valid as template should contain $context and $query (and optionally $history). + :param stream: Optional. Control if response is streamed back to user + :raises ValueError: If the template is not valid as template should + contain $context and $query (and optionally $history). """ if not history: self.history = None @@ -67,12 +71,13 @@ class QueryConfig(BaseConfig): if self.history is None: raise ValueError("`template` should have `query` and `context` keys") else: - raise ValueError("`template` should have `query`, `context` and `history` keys") + raise ValueError( + "`template` should have `query`, `context` and `history` keys" + ) if not isinstance(stream, bool): raise ValueError("`stream` should be bool") self.stream = stream - def validate_template(self, template: Template): """ @@ -82,9 +87,12 @@ class QueryConfig(BaseConfig): :return: Boolean, valid (true) or invalid (false) """ if self.history is None: - return (re.search(query_re, template.template) \ - and re.search(context_re, template.template)) + return re.search(query_re, template.template) and re.search( + context_re, template.template + ) else: - return (re.search(query_re, template.template) \ + return ( + re.search(query_re, template.template) and re.search(context_re, template.template) - and re.search(history_re, template.template)) + and re.search(history_re, template.template) + ) diff --git a/embedchain/config/__init__.py b/embedchain/config/__init__.py index a9b7c3f8..7b52162b 100644 --- a/embedchain/config/__init__.py +++ b/embedchain/config/__init__.py @@ -1,5 +1,5 @@ -from .BaseConfig import BaseConfig from .AddConfig import AddConfig +from .BaseConfig import BaseConfig from .ChatConfig import ChatConfig from .InitConfig import InitConfig -from .QueryConfig import QueryConfig \ No newline at end of file +from .QueryConfig import QueryConfig diff --git a/embedchain/data_formatter/__init__.py b/embedchain/data_formatter/__init__.py index ebd5943a..6f635581 100644 --- a/embedchain/data_formatter/__init__.py +++ b/embedchain/data_formatter/__init__.py @@ -1 +1 @@ -from .data_formatter import DataFormatter \ No newline at end of file +from .data_formatter import DataFormatter diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index a6165da6..bff1b4f2 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -1,24 +1,25 @@ -from embedchain.config import AddConfig -from embedchain.loaders.youtube_video import YoutubeVideoLoader -from embedchain.loaders.pdf_file import PdfFileLoader -from embedchain.loaders.web_page import WebPageLoader -from embedchain.loaders.local_qna_pair import LocalQnaPairLoader -from embedchain.loaders.local_text import LocalTextLoader -from embedchain.loaders.docx_file import DocxFileLoader -from embedchain.chunkers.youtube_video import YoutubeVideoChunker +from embedchain.chunkers.docx_file import DocxFileChunker from embedchain.chunkers.pdf_file import PdfFileChunker -from embedchain.chunkers.web_page import WebPageChunker from embedchain.chunkers.qna_pair import QnaPairChunker from embedchain.chunkers.text import TextChunker -from embedchain.chunkers.docx_file import DocxFileChunker +from embedchain.chunkers.web_page import WebPageChunker +from embedchain.chunkers.youtube_video import YoutubeVideoChunker +from embedchain.config import AddConfig +from embedchain.loaders.docx_file import DocxFileLoader +from embedchain.loaders.local_qna_pair import LocalQnaPairLoader +from embedchain.loaders.local_text import LocalTextLoader +from embedchain.loaders.pdf_file import PdfFileLoader +from embedchain.loaders.web_page import WebPageLoader +from embedchain.loaders.youtube_video import YoutubeVideoLoader class DataFormatter: """ DataFormatter is an internal utility class which abstracts the mapping for loaders and chunkers to the data_type entered by the user in their - .add or .add_local method call + .add or .add_local method call """ + def __init__(self, data_type: str, config: AddConfig): self.loader = self._get_loader(data_type, config.loader) self.chunker = self._get_chunker(data_type, config.chunker) @@ -32,12 +33,12 @@ class DataFormatter: :raises ValueError: If an unsupported data type is provided. """ loaders = { - 'youtube_video': YoutubeVideoLoader(), - 'pdf_file': PdfFileLoader(), - 'web_page': WebPageLoader(), - 'qna_pair': LocalQnaPairLoader(), - 'text': LocalTextLoader(), - 'docx': DocxFileLoader(), + "youtube_video": YoutubeVideoLoader(), + "pdf_file": PdfFileLoader(), + "web_page": WebPageLoader(), + "qna_pair": LocalQnaPairLoader(), + "text": LocalTextLoader(), + "docx": DocxFileLoader(), } if data_type in loaders: return loaders[data_type] @@ -53,12 +54,12 @@ class DataFormatter: :raises ValueError: If an unsupported data type is provided. """ chunkers = { - 'youtube_video': YoutubeVideoChunker(config), - 'pdf_file': PdfFileChunker(config), - 'web_page': WebPageChunker(config), - 'qna_pair': QnaPairChunker(config), - 'text': TextChunker(config), - 'docx': DocxFileChunker(config), + "youtube_video": YoutubeVideoChunker(config), + "pdf_file": PdfFileChunker(config), + "web_page": WebPageChunker(config), + "qna_pair": QnaPairChunker(config), + "text": TextChunker(config), + "docx": DocxFileChunker(config), } if data_type in chunkers: return chunkers[data_type] diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 3685f328..f98d4ace 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -1,17 +1,16 @@ -import openai -import os import logging +import os from string import Template +import openai from chromadb.utils import embedding_functions from dotenv import load_dotenv from langchain.docstore.document import Document -from langchain.embeddings.openai import OpenAIEmbeddings from langchain.memory import ConversationBufferMemory -from embedchain.config import InitConfig, AddConfig, QueryConfig, ChatConfig + +from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig from embedchain.config.QueryConfig import DEFAULT_PROMPT from embedchain.data_formatter import DataFormatter -from string import Template gpt4all_model = None @@ -45,7 +44,8 @@ class EmbedChain: :param data_type: The type of the data to add. :param url: The URL where the data is located. - :param config: Optional. The `AddConfig` instance to use as configuration options. + :param config: Optional. The `AddConfig` instance to use as configuration + options. """ if config is None: config = AddConfig() @@ -62,22 +62,28 @@ class EmbedChain: :param data_type: The type of the data to add. :param content: The local data. Refer to the `README` for formatting. - :param config: Optional. The `AddConfig` instance to use as configuration options. + :param config: Optional. The `AddConfig` instance to use as + configuration options. """ if config is None: config = AddConfig() data_formatter = DataFormatter(data_type, config) self.user_asks.append([data_type, content]) - self.load_and_embed(data_formatter.loader, data_formatter.chunker, content) + self.load_and_embed( + data_formatter.loader, + data_formatter.chunker, + content, + ) def load_and_embed(self, loader, chunker, src): """ - Loads the data from the given URL, chunks it, and adds it to the database. + Loads the data from the given URL, chunks it, and adds it to database. :param loader: The loader to use to load the data. :param chunker: The chunker to use to chunk the data. - :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders. + :param src: The data to be handled by the loader. Can be a URL for + remote sources or local content for local loaders. """ embeddings_data = chunker.create_chunks(loader, src) documents = embeddings_data["documents"] @@ -91,8 +97,12 @@ class EmbedChain: existing_ids = set(existing_docs["ids"]) if len(existing_ids): - data_dict = {id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)} - data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids} + data_dict = { + id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas) + } + data_dict = { + id: value for id, value in data_dict.items() if id not in existing_ids + } if not data_dict: print(f"All data from {src} already exists in the database.") @@ -103,12 +113,10 @@ class EmbedChain: chunks_before_addition = self.count() - self.collection.add( - documents=documents, - metadatas=list(metadatas), - ids=ids + self.collection.add(documents=documents, metadatas=list(metadatas), ids=ids) + print( + f"Successfully saved {src}. New chunks count: {self.count() - chunks_before_addition}" # noqa:E501 ) - print(f"Successfully saved {src}. New chunks count: {self.count() - chunks_before_addition}") def _format_result(self, results): return [ @@ -132,7 +140,9 @@ class EmbedChain: :return: The content of the document that matched your query. """ result = self.collection.query( - query_texts=[input_query,], + query_texts=[ + input_query, + ], n_results=1, ) result_formatted = self._format_result(result) @@ -144,17 +154,21 @@ class EmbedChain: def generate_prompt(self, input_query, context, config: QueryConfig): """ - Generates a prompt based on the given query and context, ready to be passed to an LLM + Generates a prompt based on the given query and context, ready to be + passed to an LLM :param input_query: The query to use. :param context: Similar documents to the query used as context. - :param config: Optional. The `QueryConfig` instance to use as configuration options. + :param config: Optional. The `QueryConfig` instance to use as + configuration options. :return: The prompt """ if not config.history: - prompt = config.template.substitute(context = context, query = input_query) + prompt = config.template.substitute(context=context, query=input_query) else: - prompt = config.template.substitute(context = context, query = input_query, history = config.history) + prompt = config.template.substitute( + context=context, query=input_query, history=config.history + ) return prompt def get_answer_from_llm(self, prompt, config: ChatConfig): @@ -166,7 +180,7 @@ class EmbedChain: :param context: Similar documents to the query used as context. :return: The answer. """ - + return self.get_llm_model_answer(prompt, config) def query(self, input_query, config: QueryConfig = None): @@ -176,7 +190,8 @@ class EmbedChain: LLM as context to get the answer. :param input_query: The query to use. - :param config: Optional. The `QueryConfig` instance to use as configuration options. + :param config: Optional. The `QueryConfig` instance to use as + configuration options. :return: The answer to the query. """ if config is None: @@ -188,7 +203,6 @@ class EmbedChain: logging.info(f"Answer: {answer}") return answer - def chat(self, input_query, config: ChatConfig = None): """ Queries the vector database on the given input query. @@ -197,30 +211,31 @@ class EmbedChain: Maintains last 5 conversations in memory. :param input_query: The query to use. - :param config: Optional. The `ChatConfig` instance to use as configuration options. + :param config: Optional. The `ChatConfig` instance to use as + configuration options. :return: The answer to the query. """ context = self.retrieve_from_database(input_query) global memory chat_history = memory.load_memory_variables({})["history"] - + if config is None: config = ChatConfig() if chat_history: config.set_history(chat_history) - + prompt = self.generate_prompt(input_query, context, config) logging.info(f"Prompt: {prompt}") answer = self.get_answer_from_llm(prompt, config) memory.chat_memory.add_user_message(input_query) - + if isinstance(answer, str): memory.chat_memory.add_ai_message(answer) logging.info(f"Answer: {answer}") return answer else: - #this is a streamed response and needs to be handled differently. + # this is a streamed response and needs to be handled differently. return self._stream_chat_response(answer) def _stream_chat_response(self, answer): @@ -230,7 +245,6 @@ class EmbedChain: yield chunk memory.chat_memory.add_ai_message(streamed_answer) logging.info(f"Answer: {streamed_answer}") - def dry_run(self, input_query, config: QueryConfig = None): """ @@ -242,7 +256,8 @@ class EmbedChain: the `max_tokens` parameter. :param input_query: The query to use. - :param config: Optional. The `QueryConfig` instance to use as configuration options. + :param config: Optional. The `QueryConfig` instance to use as + configuration options. :return: The prompt that would be sent to the LLM """ if config is None: @@ -260,7 +275,6 @@ class EmbedChain: """ return self.collection.count() - def reset(self): """ Resets the database. Deletes all embeddings irreversibly. @@ -288,35 +302,31 @@ class App(EmbedChain): super().__init__(config) def get_llm_model_answer(self, prompt, config: ChatConfig): - messages = [] - messages.append({ - "role": "user", "content": prompt - }) + messages.append({"role": "user", "content": prompt}) response = openai.ChatCompletion.create( model="gpt-3.5-turbo-0613", messages=messages, temperature=0, max_tokens=1000, top_p=1, - stream=config.stream + stream=config.stream, ) if config.stream: return self._stream_llm_model_response(response) else: return response["choices"][0]["message"]["content"] - + def _stream_llm_model_response(self, response): """ This is a generator for streaming response from the OpenAI completions API """ for line in response: - chunk = line['choices'][0].get('delta', {}).get('content', '') + chunk = line["choices"][0].get("delta", {}).get("content", "") yield chunk - class OpenSourceApp(EmbedChain): """ The OpenSource app. @@ -330,20 +340,24 @@ class OpenSourceApp(EmbedChain): def __init__(self, config: InitConfig = None): """ - :param config: InitConfig instance to load as configuration. Optional. `ef` defaults to open source. + :param config: InitConfig instance to load as configuration. Optional. + `ef` defaults to open source. """ - print("Loading open source embedding model. This may take some time...") + print( + "Loading open source embedding model. This may take some time..." + ) # noqa:E501 if not config: config = InitConfig( - ef = embedding_functions.SentenceTransformerEmbeddingFunction( + ef=embedding_functions.SentenceTransformerEmbeddingFunction( model_name="all-MiniLM-L6-v2" ) ) elif not config.ef: config._set_embedding_function( - embedding_functions.SentenceTransformerEmbeddingFunction( - model_name="all-MiniLM-L6-v2" - )) + embedding_functions.SentenceTransformerEmbeddingFunction( + model_name="all-MiniLM-L6-v2" + ) + ) print("Successfully loaded open source embedding model.") super().__init__(config) @@ -353,10 +367,7 @@ class OpenSourceApp(EmbedChain): global gpt4all_model if gpt4all_model is None: gpt4all_model = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin") - response = gpt4all_model.generate( - prompt=prompt, - streaming=config.stream - ) + response = gpt4all_model.generate(prompt=prompt, streaming=config.stream) return response @@ -368,12 +379,11 @@ class EmbedChainPersonApp: :param person: name of the person, better if its a well known person. :param config: InitConfig instance to load as configuration. """ + def __init__(self, person, config: InitConfig = None): self.person = person - self.person_prompt = f"You are {person}. Whatever you say, you will always say in {person} style." - self.template = Template( - self.person_prompt + " " + DEFAULT_PROMPT - ) + self.person_prompt = f"You are {person}. Whatever you say, you will always say in {person} style." # noqa:E501 + self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT) if config is None: config = InitConfig() super().__init__(config) @@ -384,6 +394,7 @@ class PersonApp(EmbedChainPersonApp, App): The Person app. Extends functionality from EmbedChainPersonApp and App """ + def query(self, input_query, config: QueryConfig = None): query_config = QueryConfig( template=self.template, @@ -392,7 +403,7 @@ class PersonApp(EmbedChainPersonApp, App): def chat(self, input_query, config: ChatConfig = None): chat_config = ChatConfig( - template = self.template, + template=self.template, ) return super().chat(input_query, chat_config) @@ -402,6 +413,7 @@ class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): The Person app. Extends functionality from EmbedChainPersonApp and OpenSourceApp """ + def query(self, input_query, config: QueryConfig = None): query_config = QueryConfig( template=self.template, @@ -410,6 +422,6 @@ class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp): def chat(self, input_query, config: ChatConfig = None): chat_config = ChatConfig( - template = self.template, + template=self.template, ) - return super().chat(input_query, chat_config) \ No newline at end of file + return super().chat(input_query, chat_config) diff --git a/embedchain/loaders/docx_file.py b/embedchain/loaders/docx_file.py index e588c8d4..d88198be 100644 --- a/embedchain/loaders/docx_file.py +++ b/embedchain/loaders/docx_file.py @@ -1,8 +1,9 @@ from langchain.document_loaders import Docx2txtLoader + class DocxFileLoader: def load_data(self, url): - ''' Load data from a .docx file. ''' + """Load data from a .docx file.""" loader = Docx2txtLoader(url) output = [] data = loader.load() diff --git a/embedchain/loaders/local_qna_pair.py b/embedchain/loaders/local_qna_pair.py index 84302744..ceaf9779 100644 --- a/embedchain/loaders/local_qna_pair.py +++ b/embedchain/loaders/local_qna_pair.py @@ -1,13 +1,14 @@ class LocalQnaPairLoader: - def load_data(self, content): - ''' Load data from a local QnA pair. ''' + """Load data from a local QnA pair.""" question, answer = content content = f"Q: {question}\nA: {answer}" meta_data = { "url": "local", } - return [{ - "content": content, - "meta_data": meta_data, - }] + return [ + { + "content": content, + "meta_data": meta_data, + } + ] diff --git a/embedchain/loaders/local_text.py b/embedchain/loaders/local_text.py index 2f9e6a17..c61a6d85 100644 --- a/embedchain/loaders/local_text.py +++ b/embedchain/loaders/local_text.py @@ -1,11 +1,12 @@ class LocalTextLoader: - def load_data(self, content): - ''' Load data from a local text file. ''' + """Load data from a local text file.""" meta_data = { "url": "local", } - return [{ - "content": content, - "meta_data": meta_data, - }] + return [ + { + "content": content, + "meta_data": meta_data, + } + ] diff --git a/embedchain/loaders/pdf_file.py b/embedchain/loaders/pdf_file.py index 7d0c6a49..888193ba 100644 --- a/embedchain/loaders/pdf_file.py +++ b/embedchain/loaders/pdf_file.py @@ -4,9 +4,8 @@ from embedchain.utils import clean_string class PdfFileLoader: - def load_data(self, url): - ''' Load data from a PDF file. ''' + """Load data from a PDF file.""" loader = PyPDFLoader(url) output = [] pages = loader.load_and_split() @@ -17,8 +16,10 @@ class PdfFileLoader: content = clean_string(content) meta_data = page.metadata meta_data["url"] = url - output.append({ - "content": content, - "meta_data": meta_data, - }) + output.append( + { + "content": content, + "meta_data": meta_data, + } + ) return output diff --git a/embedchain/loaders/web_page.py b/embedchain/loaders/web_page.py index a9e25df3..10de41c3 100644 --- a/embedchain/loaders/web_page.py +++ b/embedchain/loaders/web_page.py @@ -1,22 +1,29 @@ import requests - from bs4 import BeautifulSoup from embedchain.utils import clean_string class WebPageLoader: - def load_data(self, url): - ''' Load data from a web page. ''' + """Load data from a web page.""" response = requests.get(url) data = response.content - soup = BeautifulSoup(data, 'html.parser') - for tag in soup([ - "nav", "aside", "form", "header", - "noscript", "svg", "canvas", - "footer", "script", "style" - ]): + soup = BeautifulSoup(data, "html.parser") + for tag in soup( + [ + "nav", + "aside", + "form", + "header", + "noscript", + "svg", + "canvas", + "footer", + "script", + "style", + ] + ): tag.string = " " output = [] content = soup.get_text() @@ -24,8 +31,10 @@ class WebPageLoader: meta_data = { "url": url, } - output.append({ - "content": content, - "meta_data": meta_data, - }) - return output \ No newline at end of file + output.append( + { + "content": content, + "meta_data": meta_data, + } + ) + return output diff --git a/embedchain/loaders/youtube_video.py b/embedchain/loaders/youtube_video.py index 601a7089..59790b57 100644 --- a/embedchain/loaders/youtube_video.py +++ b/embedchain/loaders/youtube_video.py @@ -4,9 +4,8 @@ from embedchain.utils import clean_string class YoutubeVideoLoader: - def load_data(self, url): - ''' Load data from a Youtube video. ''' + """Load data from a Youtube video.""" loader = YoutubeLoader.from_youtube_url(url, add_video_info=True) doc = loader.load() output = [] @@ -16,8 +15,10 @@ class YoutubeVideoLoader: content = clean_string(content) meta_data = doc[0].metadata meta_data["url"] = url - output.append({ - "content": content, - "meta_data": meta_data, - }) + output.append( + { + "content": content, + "meta_data": meta_data, + } + ) return output diff --git a/embedchain/utils.py b/embedchain/utils.py index d7a89301..f2b04490 100644 --- a/embedchain/utils.py +++ b/embedchain/utils.py @@ -3,30 +3,33 @@ import re def clean_string(text): """ - This function takes in a string and performs a series of text cleaning operations. + This function takes in a string and performs a series of text cleaning operations. Args: text (str): The text to be cleaned. This is expected to be a string. Returns: - cleaned_text (str): The cleaned text after all the cleaning operations have been performed. + cleaned_text (str): The cleaned text after all the cleaning operations + have been performed. """ # Replacement of newline characters: - text = text.replace('\n', ' ') + text = text.replace("\n", " ") # Stripping and reducing multiple spaces to single: - cleaned_text = re.sub(r'\s+', ' ', text.strip()) + cleaned_text = re.sub(r"\s+", " ", text.strip()) # Removing backslashes: - cleaned_text = cleaned_text.replace('\\', '') + cleaned_text = cleaned_text.replace("\\", "") # Replacing hash characters: - cleaned_text = cleaned_text.replace('#', ' ') + cleaned_text = cleaned_text.replace("#", " ") # Eliminating consecutive non-alphanumeric characters: - # This regex identifies consecutive non-alphanumeric characters (i.e., not a word character [a-zA-Z0-9_] and not a whitespace) in the string - # and replaces each group of such characters with a single occurrence of that character. + # This regex identifies consecutive non-alphanumeric characters (i.e., not + # a word character [a-zA-Z0-9_] and not a whitespace) in the string + # and replaces each group of such characters with a single occurrence of + # that character. # For example, "!!! hello !!!" would become "! hello !". - cleaned_text = re.sub(r'([^\w\s])\1*', r'\1', cleaned_text) + cleaned_text = re.sub(r"([^\w\s])\1*", r"\1", cleaned_text) return cleaned_text diff --git a/embedchain/vectordb/base_vector_db.py b/embedchain/vectordb/base_vector_db.py index fcde4542..79089063 100644 --- a/embedchain/vectordb/base_vector_db.py +++ b/embedchain/vectordb/base_vector_db.py @@ -1,12 +1,12 @@ class BaseVectorDB: - ''' Base class for vector database. ''' + """Base class for vector database.""" def __init__(self): self.client = self._get_or_create_db() self.collection = self._get_or_create_collection() def _get_or_create_db(self): - ''' Get or create the database. ''' + """Get or create the database.""" raise NotImplementedError def _get_or_create_collection(self): diff --git a/embedchain/vectordb/chroma_db.py b/embedchain/vectordb/chroma_db.py index 46b1b9e5..de4887e1 100644 --- a/embedchain/vectordb/chroma_db.py +++ b/embedchain/vectordb/chroma_db.py @@ -1,14 +1,14 @@ -import chromadb import os +import chromadb from chromadb.utils import embedding_functions from embedchain.vectordb.base_vector_db import BaseVectorDB class ChromaDB(BaseVectorDB): - ''' Vector database using ChromaDB. ''' - + """Vector database using ChromaDB.""" + def __init__(self, db_dir=None, ef=None): if ef: self.ef = ef @@ -16,23 +16,24 @@ class ChromaDB(BaseVectorDB): self.ef = embedding_functions.OpenAIEmbeddingFunction( api_key=os.getenv("OPENAI_API_KEY"), organization_id=os.getenv("OPENAI_ORGANIZATION"), - model_name="text-embedding-ada-002" + model_name="text-embedding-ada-002", ) if db_dir is None: db_dir = "db" self.client_settings = chromadb.config.Settings( chroma_db_impl="duckdb+parquet", persist_directory=db_dir, - anonymized_telemetry=False + anonymized_telemetry=False, ) super().__init__() def _get_or_create_db(self): - ''' Get or create the database. ''' + """Get or create the database.""" return chromadb.Client(self.client_settings) def _get_or_create_collection(self): - ''' Get or create the collection. ''' + """Get or create the collection.""" return self.client.get_or_create_collection( - 'embedchain_store', embedding_function=self.ef, + "embedchain_store", + embedding_function=self.ef, ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..79842bbe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,68 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.ruff] +select = ["E", "F"] +ignore = [] +fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"] +unfixable = [] +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", +] +line-length = 88 +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" +target-version = "py38" + +[tool.ruff.mccabe] +max-complexity = 10 + +[tool.black] +line-length = 88 +target-version = ["py38", "py39", "py310", "py311"] +include = '\.pyi?$' +exclude = ''' +/( + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.nox + | \.pants.d + | \.pytype + | \.ruff_cache + | \.svn + | \.tox + | \.venv + | __pypackages__ + | _build + | buck-out + | build + | dist + | node_modules + | venv +)/ +''' + +[tool.black.format] +color = true diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 00000000..8dbd6789 --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,5 @@ +pip +black==23.3.0 +isort==5.8.0 +ruff==0.0.277 +pytest==7.4.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 13a114f4..5b39583a 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setuptools.setup( version="0.0.18", author="Taranjeet Singh", author_email="reachtotj@gmail.com", - description="embedchain is a framework to easily create LLM powered bots over any dataset", + description="embedchain is a framework to easily create LLM powered bots over any dataset", # noqa:E501 long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/embedchain/embedchain", @@ -18,7 +18,7 @@ setuptools.setup( "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], - python_requires='>=3.8', + python_requires=">=3.8", py_modules=["embedchain"], install_requires=[ "langchain>=0.0.205", diff --git a/tests/test_embedchain.py b/tests/test_embedchain.py index b0534e03..3a60f10c 100644 --- a/tests/test_embedchain.py +++ b/tests/test_embedchain.py @@ -1,7 +1,7 @@ import os - import unittest -from unittest.mock import patch, MagicMock +from unittest.mock import MagicMock, patch + from embedchain import App