From cf1e000fb32672637eceeda30cb22e664759e3ea Mon Sep 17 00:00:00 2001
From: Taranjeet Singh
Date: Wed, 5 Jul 2023 02:23:23 +0530
Subject: [PATCH] Open source embedding and LLM models (#133)

* Add open source LLM model: gpt4all
* Add open source embedding model: sentence transformers
---
Review notes (text in this section is ignored when the patch is applied):
  - App's constructor is spelled `__init__` below. The copy this was restored
    from spelled it `__int__`, which Python never calls during construction,
    so the `ef = openai_ef` default in App was dead code.
  - "Has two function" in the new OpenSourceApp docstring now reads
    "Has two functions".
  - Line breaks were restored from a whitespace-collapsed copy and agree with
    every hunk header and the diffstat. Leading indentation and the contents
    of whitespace-only lines were reconstructed -- verify against the target
    tree before applying. The blob ids on the embedchain.py `index` line
    predate the two fixes above.

 embedchain/__init__.py           |  2 +-
 embedchain/embedchain.py         | 86 ++++++++++++++++++++++++--------
 embedchain/vectordb/chroma_db.py |  5 +-
 setup.py                         |  2 +
 4 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/embedchain/__init__.py b/embedchain/__init__.py
index 3c09b8d3..c023c80b 100644
--- a/embedchain/__init__.py
+++ b/embedchain/__init__.py
@@ -1 +1 @@
-from .embedchain import App
\ No newline at end of file
+from .embedchain import App, OpenSourceApp
\ No newline at end of file
diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py
index 25d68e5c..4606cb08 100644
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -1,7 +1,9 @@
 import openai
 import os
 
+from chromadb.utils import embedding_functions
 from dotenv import load_dotenv
+from gpt4all import GPT4All
 from langchain.docstore.document import Document
 from langchain.embeddings.openai import OpenAIEmbeddings
 
@@ -17,16 +19,23 @@ from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.chunkers.text import TextChunker
 from embedchain.vectordb.chroma_db import ChromaDB
 
-load_dotenv()
+openai_ef = embedding_functions.OpenAIEmbeddingFunction(
+    api_key=os.getenv("OPENAI_API_KEY"),
+    organization_id=os.getenv("OPENAI_ORGANIZATION"),
+    model_name="text-embedding-ada-002"
+)
+sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
 
-embeddings = OpenAIEmbeddings()
+gpt4all_model = None
+
+load_dotenv()
 
 ABS_PATH = os.getcwd()
 DB_DIR = os.path.join(ABS_PATH, "db")
 
 
 class EmbedChain:
-    def __init__(self, db=None):
+    def __init__(self, db=None, ef=None):
         """
         Initializes the EmbedChain instance, sets up a vector DB client and creates a
         collection.
@@ -34,7 +43,7 @@ class EmbedChain:
         :param db: The instance of the VectorDB subclass.
         """
         if db is None:
-            db = ChromaDB()
+            db = ChromaDB(ef=ef)
         self.db_client = db.client
         self.collection = db.collection
         self.user_asks = []
@@ -154,20 +163,9 @@ class EmbedChain:
             )
         ]
 
-    def get_openai_answer(self, prompt):
-        messages = []
-        messages.append({
-            "role": "user", "content": prompt
-        })
-        response = openai.ChatCompletion.create(
-            model="gpt-3.5-turbo-0613",
-            messages=messages,
-            temperature=0,
-            max_tokens=1000,
-            top_p=1,
-        )
-        return response["choices"][0]["message"]["content"]
-    
+    def get_llm_model_answer(self, prompt):
+        raise NotImplementedError
+
     def retrieve_from_database(self, input_query):
         """
         Queries the vector database based on the given input query.
@@ -186,7 +184,7 @@ class EmbedChain:
         else:
             content = ""
         return content
-    
+
     def generate_prompt(self, input_query, context):
         """
         Generates a prompt based on the given query and context, ready to be passed to an LLM
@@ -211,7 +209,7 @@ class EmbedChain:
         :param context: Similar documents to the query used as context.
         :return: The answer.
         """
-        answer = self.get_openai_answer(prompt)
+        answer = self.get_llm_model_answer(prompt)
         return answer
 
     def query(self, input_query):
@@ -237,4 +235,50 @@ class App(EmbedChain):
     adds(data_type, url): adds the data from the given URL to the vector db.
     query(query): finds answer to the given query using vector database and LLM.
     """
-    pass
+
+    def __init__(self, db=None, ef=None):
+        if ef is None:
+            ef = openai_ef
+        super().__init__(db, ef)
+
+    def get_llm_model_answer(self, prompt):
+        messages = []
+        messages.append({
+            "role": "user", "content": prompt
+        })
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo-0613",
+            messages=messages,
+            temperature=0,
+            max_tokens=1000,
+            top_p=1,
+        )
+        return response["choices"][0]["message"]["content"]
+
+
+class OpenSourceApp(EmbedChain):
+    """
+    The OpenSource app.
+    Same as App, but uses an open source embedding model and LLM.
+
+    Has two functions: add and query.
+
+    adds(data_type, url): adds the data from the given URL to the vector db.
+    query(query): finds answer to the given query using vector database and LLM.
+    """
+
+    def __init__(self, db=None, ef=None):
+        print("Loading open source embedding model. This may take some time...")
+        if ef is None:
+            ef = sentence_transformer_ef
+        print("Successfully loaded open source embedding model.")
+        super().__init__(db, ef)
+
+    def get_llm_model_answer(self, prompt):
+        global gpt4all_model
+        if gpt4all_model is None:
+            gpt4all_model = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin")
+        response = gpt4all_model.generate(
+            prompt=prompt,
+        )
+        return response
\ No newline at end of file
diff --git a/embedchain/vectordb/chroma_db.py b/embedchain/vectordb/chroma_db.py
index 30de7bac..96166f83 100644
--- a/embedchain/vectordb/chroma_db.py
+++ b/embedchain/vectordb/chroma_db.py
@@ -12,7 +12,8 @@ openai_ef = embedding_functions.OpenAIEmbeddingFunction(
 )
 
 class ChromaDB(BaseVectorDB):
-    def __init__(self, db_dir=None):
+    def __init__(self, db_dir=None, ef=None):
+        self.ef = ef if ef is not None else openai_ef
         if db_dir is None:
             db_dir = "db"
         self.client_settings = chromadb.config.Settings(
@@ -27,5 +28,5 @@ class ChromaDB(BaseVectorDB):
 
     def _get_or_create_collection(self):
         return self.client.get_or_create_collection(
-            'embedchain_store', embedding_function=openai_ef,
+            'embedchain_store', embedding_function=self.ef,
         )
diff --git a/setup.py b/setup.py
index cab2e776..c0dd2685 100644
--- a/setup.py
+++ b/setup.py
@@ -29,5 +29,7 @@ setuptools.setup(
         "beautifulsoup4",
         "pypdf",
         "pytube",
+        "gpt4all",
+        "sentence_transformers",
     ]
 )