Resolve conflicts (#208)

Author: Deshraj Yadav
Date: 2023-07-10 21:50:05 -07:00
Committed by: GitHub
Parent: 6936d6983d
Commit: 9ca836520f
32 changed files with 396 additions and 207 deletions


@@ -1,17 +1,16 @@
-import openai
-import os
 import logging
+import os
+from string import Template
+import openai
 from chromadb.utils import embedding_functions
 from dotenv import load_dotenv
 from langchain.docstore.document import Document
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.memory import ConversationBufferMemory
-from embedchain.config import InitConfig, AddConfig, QueryConfig, ChatConfig
+from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig
 from embedchain.config.QueryConfig import DEFAULT_PROMPT
 from embedchain.data_formatter import DataFormatter
-from string import Template
 gpt4all_model = None
@@ -45,7 +44,8 @@ class EmbedChain:
         :param data_type: The type of the data to add.
         :param url: The URL where the data is located.
-        :param config: Optional. The `AddConfig` instance to use as configuration options.
+        :param config: Optional. The `AddConfig` instance to use as configuration
+        options.
         """
         if config is None:
             config = AddConfig()
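
For review context, a minimal usage sketch of the `add` entry point after this change. The app setup and URL are illustrative, not part of this diff; `App` is assumed to be importable from the package root, as in the project README:

    from embedchain import App

    app = App()  # OpenAI-backed app; expects OPENAI_API_KEY in the environment
    # data_type selects a loader/chunker pair via DataFormatter
    app.add("web_page", "https://example.com/article")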
@@ -62,22 +62,28 @@ class EmbedChain:
         :param data_type: The type of the data to add.
         :param content: The local data. Refer to the `README` for formatting.
-        :param config: Optional. The `AddConfig` instance to use as configuration options.
+        :param config: Optional. The `AddConfig` instance to use as
+        configuration options.
         """
         if config is None:
             config = AddConfig()
         data_formatter = DataFormatter(data_type, config)
         self.user_asks.append([data_type, content])
-        self.load_and_embed(data_formatter.loader, data_formatter.chunker, content)
+        self.load_and_embed(
+            data_formatter.loader,
+            data_formatter.chunker,
+            content,
+        )

     def load_and_embed(self, loader, chunker, src):
         """
-        Loads the data from the given URL, chunks it, and adds it to the database.
+        Loads the data from the given URL, chunks it, and adds it to database.
         :param loader: The loader to use to load the data.
         :param chunker: The chunker to use to chunk the data.
-        :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
+        :param src: The data to be handled by the loader. Can be a URL for
+        remote sources or local content for local loaders.
         """
         embeddings_data = chunker.create_chunks(loader, src)
         documents = embeddings_data["documents"]
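
`add_local` follows the same path but skips remote loading. A sketch building on the `app` from the earlier note; the QnA tuple format follows the project README and is illustrative:

    app.add_local(
        "qna_pair",
        ("What is embedchain?", "A framework for building LLM bots over your own data."),
    )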
@@ -91,8 +97,12 @@ class EmbedChain:
         existing_ids = set(existing_docs["ids"])

         if len(existing_ids):
-            data_dict = {id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)}
-            data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids}
+            data_dict = {
+                id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)
+            }
+            data_dict = {
+                id: value for id, value in data_dict.items() if id not in existing_ids
+            }

             if not data_dict:
                 print(f"All data from {src} already exists in the database.")
@@ -103,12 +113,10 @@ class EmbedChain:
         chunks_before_addition = self.count()

-        self.collection.add(
-            documents=documents,
-            metadatas=list(metadatas),
-            ids=ids
+        self.collection.add(documents=documents, metadatas=list(metadatas), ids=ids)
+        print(
+            f"Successfully saved {src}. New chunks count: {self.count() - chunks_before_addition}"  # noqa:E501
         )
-        print(f"Successfully saved {src}. New chunks count: {self.count() - chunks_before_addition}")

     def _format_result(self, results):
         return [
@@ -132,7 +140,9 @@ class EmbedChain:
         :return: The content of the document that matched your query.
         """
         result = self.collection.query(
-            query_texts=[input_query,],
+            query_texts=[
+                input_query,
+            ],
             n_results=1,
         )
         result_formatted = self._format_result(result)
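
For reference, the shape of the Chroma call above as a standalone sketch; the client and collection setup are illustrative and assume chromadb's in-memory Client:

    import chromadb

    client = chromadb.Client()
    collection = client.create_collection("demo")
    collection.add(documents=["hello world"], metadatas=[{"url": "local"}], ids=["0"])
    result = collection.query(query_texts=["hello"], n_results=1)
    # result["documents"] -> [["hello world"]]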
@@ -144,17 +154,21 @@ class EmbedChain:
     def generate_prompt(self, input_query, context, config: QueryConfig):
         """
-        Generates a prompt based on the given query and context, ready to be passed to an LLM
+        Generates a prompt based on the given query and context, ready to be
+        passed to an LLM

         :param input_query: The query to use.
         :param context: Similar documents to the query used as context.
-        :param config: Optional. The `QueryConfig` instance to use as configuration options.
+        :param config: Optional. The `QueryConfig` instance to use as
+        configuration options.
         :return: The prompt
         """
         if not config.history:
-            prompt = config.template.substitute(context = context, query = input_query)
+            prompt = config.template.substitute(context=context, query=input_query)
         else:
-            prompt = config.template.substitute(context = context, query = input_query, history = config.history)
+            prompt = config.template.substitute(
+                context=context, query=input_query, history=config.history
+            )
         return prompt

     def get_answer_from_llm(self, prompt, config: ChatConfig):
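
Since prompt assembly relies on `string.Template`, a minimal sketch of the substitution mechanics; the template text below is an illustrative stand-in, the real one is `DEFAULT_PROMPT` from `embedchain.config.QueryConfig`:

    from string import Template

    template = Template("Use this context: $context\nQuery: $query\nAnswer:")
    prompt = template.substitute(context="<retrieved docs>", query="What is embedchain?")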
@@ -166,7 +180,7 @@ class EmbedChain:
         :param context: Similar documents to the query used as context.
         :return: The answer.
         """
         return self.get_llm_model_answer(prompt, config)

     def query(self, input_query, config: QueryConfig = None):
@@ -176,7 +190,8 @@ class EmbedChain:
LLM as context to get the answer.
:param input_query: The query to use.
:param config: Optional. The `QueryConfig` instance to use as configuration options.
:param config: Optional. The `QueryConfig` instance to use as
configuration options.
:return: The answer to the query.
"""
if config is None:
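
A matching usage sketch for `query`, building on the `app` from the earlier note (illustrative):

    answer = app.query("What does the article say about pricing?")
    print(answer)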
@@ -188,7 +203,6 @@ class EmbedChain:
         logging.info(f"Answer: {answer}")
         return answer

     def chat(self, input_query, config: ChatConfig = None):
         """
         Queries the vector database on the given input query.
@@ -197,30 +211,31 @@ class EmbedChain:
         Maintains last 5 conversations in memory.

         :param input_query: The query to use.
-        :param config: Optional. The `ChatConfig` instance to use as configuration options.
+        :param config: Optional. The `ChatConfig` instance to use as
+        configuration options.
         :return: The answer to the query.
         """
         context = self.retrieve_from_database(input_query)
         global memory
         chat_history = memory.load_memory_variables({})["history"]

         if config is None:
             config = ChatConfig()
         if chat_history:
             config.set_history(chat_history)

         prompt = self.generate_prompt(input_query, context, config)
         logging.info(f"Prompt: {prompt}")
         answer = self.get_answer_from_llm(prompt, config)

         memory.chat_memory.add_user_message(input_query)

         if isinstance(answer, str):
             memory.chat_memory.add_ai_message(answer)
             logging.info(f"Answer: {answer}")
             return answer
         else:
-            #this is a streamed response and needs to be handled differently.
+            # this is a streamed response and needs to be handled differently.
             return self._stream_chat_response(answer)

     def _stream_chat_response(self, answer):
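
Review note: `chat` returns a plain string normally but a generator when streaming is enabled, so callers must branch on the type. A consumer sketch; whether `ChatConfig` accepts a `stream` keyword is an assumption here, the diff only shows `config.stream` being read:

    from embedchain.config import ChatConfig

    response = app.chat("Summarize the doc", ChatConfig(stream=True))  # stream kwarg assumed
    if isinstance(response, str):
        print(response)
    else:
        for chunk in response:  # generator of text chunks
            print(chunk, end="", flush=True)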
@@ -230,7 +245,6 @@ class EmbedChain:
             yield chunk
         memory.chat_memory.add_ai_message(streamed_answer)
         logging.info(f"Answer: {streamed_answer}")

     def dry_run(self, input_query, config: QueryConfig = None):
         """
@@ -242,7 +256,8 @@ class EmbedChain:
         the `max_tokens` parameter.

         :param input_query: The query to use.
-        :param config: Optional. The `QueryConfig` instance to use as configuration options.
+        :param config: Optional. The `QueryConfig` instance to use as
+        configuration options.
         :return: The prompt that would be sent to the LLM
         """
         if config is None:
@@ -260,7 +275,6 @@ class EmbedChain:
         """
         return self.collection.count()

     def reset(self):
         """
         Resets the database. Deletes all embeddings irreversibly.
@@ -288,35 +302,31 @@ class App(EmbedChain):
         super().__init__(config)

     def get_llm_model_answer(self, prompt, config: ChatConfig):
         messages = []
-        messages.append({
-            "role": "user", "content": prompt
-        })
+        messages.append({"role": "user", "content": prompt})
         response = openai.ChatCompletion.create(
             model="gpt-3.5-turbo-0613",
             messages=messages,
             temperature=0,
             max_tokens=1000,
             top_p=1,
-            stream=config.stream
+            stream=config.stream,
         )

         if config.stream:
             return self._stream_llm_model_response(response)
         else:
             return response["choices"][0]["message"]["content"]

     def _stream_llm_model_response(self, response):
         """
         This is a generator for streaming response from the OpenAI completions API
         """
         for line in response:
-            chunk = line['choices'][0].get('delta', {}).get('content', '')
+            chunk = line["choices"][0].get("delta", {}).get("content", "")
             yield chunk
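
The delta-accumulation above matches OpenAI's pre-1.0 streaming interface, where each event may carry an optional `delta.content`. A standalone consumer sketch under that assumption; model and prompt are illustrative:

    import openai

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        messages=[{"role": "user", "content": "Say hi"}],
        stream=True,
    )
    for line in response:
        chunk = line["choices"][0].get("delta", {}).get("content", "")
        print(chunk, end="", flush=True)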
 class OpenSourceApp(EmbedChain):
     """
     The OpenSource app.
@@ -330,20 +340,24 @@ class OpenSourceApp(EmbedChain):
     def __init__(self, config: InitConfig = None):
         """
-        :param config: InitConfig instance to load as configuration. Optional. `ef` defaults to open source.
+        :param config: InitConfig instance to load as configuration. Optional.
+        `ef` defaults to open source.
         """
-        print("Loading open source embedding model. This may take some time...")
+        print(
+            "Loading open source embedding model. This may take some time..."
+        )  # noqa:E501
         if not config:
             config = InitConfig(
-                ef = embedding_functions.SentenceTransformerEmbeddingFunction(
+                ef=embedding_functions.SentenceTransformerEmbeddingFunction(
                     model_name="all-MiniLM-L6-v2"
                 )
             )
         elif not config.ef:
             config._set_embedding_function(
-                embedding_functions.SentenceTransformerEmbeddingFunction(
-                    model_name="all-MiniLM-L6-v2"
-                ))
+                embedding_functions.SentenceTransformerEmbeddingFunction(
+                    model_name="all-MiniLM-L6-v2"
+                )
+            )
         print("Successfully loaded open source embedding model.")
         super().__init__(config)
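
Usage sketch for the open-source path; the class names come from this diff, the rest is illustrative (first run downloads the embedding model, and `get_llm_model_answer` below lazily loads the GPT4All weights):

    from embedchain import OpenSourceApp  # assumed package-root export, as with App

    app = OpenSourceApp()  # all-MiniLM-L6-v2 embeddings, GPT4All for answers
    app.add("web_page", "https://example.com/article")
    print(app.query("What is this page about?"))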
@@ -353,10 +367,7 @@ class OpenSourceApp(EmbedChain):
         global gpt4all_model
         if gpt4all_model is None:
             gpt4all_model = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin")
-        response = gpt4all_model.generate(
-            prompt=prompt,
-            streaming=config.stream
-        )
+        response = gpt4all_model.generate(prompt=prompt, streaming=config.stream)
         return response
@@ -368,12 +379,11 @@ class EmbedChainPersonApp:
     :param person: name of the person, better if its a well known person.
     :param config: InitConfig instance to load as configuration.
     """

     def __init__(self, person, config: InitConfig = None):
         self.person = person
-        self.person_prompt = f"You are {person}. Whatever you say, you will always say in {person} style."
-        self.template = Template(
-            self.person_prompt + " " + DEFAULT_PROMPT
-        )
+        self.person_prompt = f"You are {person}. Whatever you say, you will always say in {person} style."  # noqa:E501
+        self.template = Template(self.person_prompt + " " + DEFAULT_PROMPT)
         if config is None:
             config = InitConfig()
         super().__init__(config)
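
A sketch of what the composed person template yields; the stand-in for DEFAULT_PROMPT is illustrative, only the `$context`/`$query` placeholders are implied by `generate_prompt` above:

    from string import Template

    person_prompt = "You are Naval Ravikant. Whatever you say, you will always say in Naval Ravikant style."
    default_prompt = "Use the context to answer the query. Context: $context Query: $query"  # stand-in
    template = Template(person_prompt + " " + default_prompt)
    print(template.substitute(context="<docs>", query="<question>"))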
@@ -384,6 +394,7 @@ class PersonApp(EmbedChainPersonApp, App):
     The Person app.
     Extends functionality from EmbedChainPersonApp and App
     """
+
     def query(self, input_query, config: QueryConfig = None):
         query_config = QueryConfig(
             template=self.template,
@@ -392,7 +403,7 @@ class PersonApp(EmbedChainPersonApp, App):
     def chat(self, input_query, config: ChatConfig = None):
         chat_config = ChatConfig(
-            template = self.template,
+            template=self.template,
         )
         return super().chat(input_query, chat_config)
@@ -402,6 +413,7 @@ class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp):
     The Person app.
     Extends functionality from EmbedChainPersonApp and OpenSourceApp
     """
+
     def query(self, input_query, config: QueryConfig = None):
         query_config = QueryConfig(
             template=self.template,
@@ -410,6 +422,6 @@ class PersonOpenSourceApp(EmbedChainPersonApp, OpenSourceApp):
     def chat(self, input_query, config: ChatConfig = None):
         chat_config = ChatConfig(
-            template = self.template,
+            template=self.template,
         )
         return super().chat(input_query, chat_config)