feat: add new custom app (#313)
embedchain/__init__.py
@@ -3,6 +3,6 @@ import importlib.metadata
 __version__ = importlib.metadata.version(__package__ or __name__)
 
 from embedchain.apps.App import App  # noqa: F401
+from embedchain.apps.CustomApp import CustomApp  # noqa: F401
 from embedchain.apps.OpenSourceApp import OpenSourceApp  # noqa: F401
-from embedchain.apps.PersonApp import (PersonApp,  # noqa: F401
-                                       PersonOpenSourceApp)
+from embedchain.apps.PersonApp import PersonApp, PersonOpenSourceApp  # noqa: F401
embedchain/apps/App.py
@@ -27,7 +27,7 @@ class App(EmbedChain):
         messages = []
         messages.append({"role": "user", "content": prompt})
         response = openai.ChatCompletion.create(
-            model=config.model,
+            model=config.model or "gpt-3.5-turbo-0613",
             messages=messages,
             temperature=config.temperature,
             max_tokens=config.max_tokens,
embedchain/apps/CustomApp.py (new file, 128 lines)
@@ -0,0 +1,128 @@
import logging
from typing import Iterable, List, Union

from langchain.schema import BaseMessage

from embedchain.config import ChatConfig, CustomAppConfig, OpenSourceAppConfig
from embedchain.embedchain import EmbedChain
from embedchain.models import Providers


class CustomApp(EmbedChain):
    """
    The custom EmbedChain app.
    Has two functions: add and query.

    adds(data_type, url): adds the data from the given URL to the vector db.
    query(query): finds answer to the given query using vector database and LLM.
    dry_run(query): test your prompt without consuming tokens.
    """

    def __init__(self, config: CustomAppConfig = None):
        """
        :param config: Optional. `CustomAppConfig` instance to load as configuration.
        :raises ValueError: Config must be provided for custom app
        """
        if config is None:
            raise ValueError("Config must be provided for custom app")

        self.provider = config.provider

        if config.provider == Providers.GPT4ALL:
            from embedchain import OpenSourceApp

            # Because these models run locally, they should have an instance running when the custom app is created
            self.open_source_app = OpenSourceApp(config=config.open_source_app_config)

        super().__init__(config)

    def set_llm_model(self, provider: Providers):
        self.provider = provider
        if provider == Providers.GPT4ALL:
            raise ValueError(
                "GPT4ALL needs to be instantiated with the model known, please create a new app instance instead"
            )

    def get_llm_model_answer(self, prompt, config: ChatConfig):
        # TODO: Quitting the streaming response here for now.
        # Idea: https://gist.github.com/jvelezmagic/03ddf4c452d011aae36b2a0f73d72f68
        if config.stream:
            raise NotImplementedError(
                "Streaming responses have not been implemented for this model yet. Please disable."
            )

        try:
            if self.provider == Providers.OPENAI:
                return CustomApp._get_openai_answer(prompt, config)

            if self.provider == Providers.ANTHROPHIC:
                return CustomApp._get_athrophic_answer(prompt, config)

            if self.provider == Providers.VERTEX_AI:
                return CustomApp._get_vertex_answer(prompt, config)

            if self.provider == Providers.GPT4ALL:
                return self.open_source_app._get_gpt4all_answer(prompt, config)

        except ImportError as e:
            raise ImportError(e.msg) from None

    @staticmethod
    def _get_openai_answer(prompt: str, config: ChatConfig) -> str:
        from langchain.chat_models import ChatOpenAI

        logging.info(vars(config))

        chat = ChatOpenAI(
            temperature=config.temperature,
            model=config.model or "gpt-3.5-turbo",
            max_tokens=config.max_tokens,
            streaming=config.stream,
        )

        if config.top_p and config.top_p != 1:
            logging.warning("Config option `top_p` is not supported by this model.")

        messages = CustomApp._get_messages(prompt)

        return chat(messages).content

    @staticmethod
    def _get_athrophic_answer(prompt: str, config: ChatConfig) -> str:
        from langchain.chat_models import ChatAnthropic

        chat = ChatAnthropic(temperature=config.temperature, model=config.model)

        if config.max_tokens and config.max_tokens != 1000:
            logging.warning("Config option `max_tokens` is not supported by this model.")

        messages = CustomApp._get_messages(prompt)

        return chat(messages).content

    @staticmethod
    def _get_vertex_answer(prompt: str, config: ChatConfig) -> str:
        from langchain.chat_models import ChatVertexAI

        chat = ChatVertexAI(temperature=config.temperature, model=config.model, max_output_tokens=config.max_tokens)

        if config.top_p and config.top_p != 1:
            logging.warning("Config option `top_p` is not supported by this model.")

        messages = CustomApp._get_messages(prompt)

        return chat(messages).content

    @staticmethod
    def _get_messages(prompt: str) -> List[BaseMessage]:
        from langchain.schema import HumanMessage, SystemMessage

        return [SystemMessage(content="You are a helpful assistant."), HumanMessage(content=prompt)]

    def _stream_llm_model_response(self, response):
        """
        This is a generator for streaming response from the OpenAI completions API
        """
        for line in response:
            chunk = line["choices"][0].get("delta", {}).get("content", "")
            yield chunk
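
For orientation, a minimal usage sketch of the new class. The `add`/`query` calls are inherited from `EmbedChain`; the data source and URL below are illustrative assumptions, not taken from this diff:

    from embedchain import CustomApp
    from embedchain.config import CustomAppConfig
    from embedchain.models import EmbeddingFunctions, Providers

    # provider is required; embedding_fn must be an EmbeddingFunctions member
    config = CustomAppConfig(provider=Providers.OPENAI, embedding_fn=EmbeddingFunctions.OPENAI)
    app = CustomApp(config=config)
    app.add("web_page", "https://example.com")  # illustrative source
    print(app.query("What does the page describe?"))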
embedchain/apps/OpenSourceApp.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Iterable, List, Union
 
 from embedchain.config import ChatConfig, OpenSourceAppConfig
 from embedchain.embedchain import EmbedChain
@@ -26,14 +27,39 @@ class OpenSourceApp(EmbedChain):
         if not config:
             config = OpenSourceAppConfig()
 
+        if not config.model:
+            raise ValueError("OpenSourceApp needs a model to be instantiated. Maybe you passed the wrong config type?")
+
+        self.instance = OpenSourceApp._get_instance(config.model)
+
         logging.info("Successfully loaded open source embedding model.")
         super().__init__(config)
 
     def get_llm_model_answer(self, prompt, config: ChatConfig):
-        from gpt4all import GPT4All
-
-        global gpt4all_model
-        if gpt4all_model is None:
-            gpt4all_model = GPT4All("orca-mini-3b.ggmlv3.q4_0.bin")
-        response = gpt4all_model.generate(prompt=prompt, streaming=config.stream)
+        return self._get_gpt4all_answer(prompt=prompt, config=config)
+
+    @staticmethod
+    def _get_instance(model):
+        try:
+            from gpt4all import GPT4All
+        except ModuleNotFoundError:
+            raise ValueError(
+                "The GPT4All python package is not installed. Please install it with `pip install GPT4All`"
+            ) from None
+
+        return GPT4All(model)
+
+    def _get_gpt4all_answer(self, prompt: str, config: ChatConfig) -> Union[str, Iterable]:
+        if config.model and config.model != self.config.model:
+            raise RuntimeError(
+                "OpenSourceApp does not support switching models at runtime. Please create a new app instance."
+            )
+
+        response = self.instance.generate(
+            prompt=prompt,
+            streaming=config.stream,
+            top_p=config.top_p,
+            max_tokens=config.max_tokens,
+            temp=config.temperature,
+        )
+        return response
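
A minimal sketch of the new flow, assuming the `gpt4all` package is installed; the model name is the default that `OpenSourceAppConfig` (later in this diff) falls back to:

    from embedchain import OpenSourceApp
    from embedchain.config import OpenSourceAppConfig

    # The model must be known at construction time: _get_instance loads it once,
    # and _get_gpt4all_answer refuses a different model at query time.
    config = OpenSourceAppConfig(model="orca-mini-3b.ggmlv3.q4_0.bin")
    app = OpenSourceApp(config=config)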
embedchain/apps/PersonApp.py
@@ -4,8 +4,7 @@ from embedchain.apps.App import App
 from embedchain.apps.OpenSourceApp import OpenSourceApp
 from embedchain.config import ChatConfig, QueryConfig
 from embedchain.config.apps.BaseAppConfig import BaseAppConfig
-from embedchain.config.QueryConfig import (DEFAULT_PROMPT,
-                                           DEFAULT_PROMPT_WITH_HISTORY)
+from embedchain.config.QueryConfig import DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY
 
 
 class EmbedChainPersonApp:
embedchain/config/QueryConfig.py
@@ -104,7 +104,7 @@ class QueryConfig(BaseConfig):
 
         self.temperature = temperature if temperature else 0
         self.max_tokens = max_tokens if max_tokens else 1000
-        self.model = model if model else "gpt-3.5-turbo-0613"
+        self.model = model
         self.top_p = top_p if top_p else 1
 
         if self.validate_template(template):
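
The default model now lives with each app rather than in `QueryConfig`: `App` applies `config.model or "gpt-3.5-turbo-0613"` at call time (see the hunk above). A sketch of the resulting behavior, assuming `QueryConfig` accepts no-argument construction:

    from embedchain.config import QueryConfig

    config = QueryConfig()
    assert config.model is None  # no model baked into the config any more
    model = config.model or "gpt-3.5-turbo-0613"  # App's call-time fallback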
embedchain/config/apps/AppConfig.py
@@ -18,7 +18,9 @@ class AppConfig(BaseAppConfig):
         :param host: Optional. Hostname for the database server.
         :param port: Optional. Port for the database server.
         """
-        super().__init__(log_level=log_level, ef=AppConfig.default_embedding_function(), host=host, port=port, id=id)
+        super().__init__(
+            log_level=log_level, embedding_fn=AppConfig.default_embedding_function(), host=host, port=port, id=id
+        )
 
     @staticmethod
     def default_embedding_function():
embedchain/config/apps/BaseAppConfig.py
@@ -8,11 +8,11 @@ class BaseAppConfig(BaseConfig):
     Parent config to initialize an instance of `App`, `OpenSourceApp` or `CustomApp`.
     """
 
-    def __init__(self, log_level=None, ef=None, db=None, host=None, port=None, id=None):
+    def __init__(self, log_level=None, embedding_fn=None, db=None, host=None, port=None, id=None):
         """
         :param log_level: Optional. (String) Debug level
         ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
-        :param ef: Embedding function to use.
+        :param embedding_fn: Embedding function to use.
         :param db: Optional. (Vector) database instance to use for embeddings.
         :param id: Optional. ID of the app. Document metadata will have this id.
         :param host: Optional. Hostname for the database server.
@@ -20,26 +20,26 @@ class BaseAppConfig(BaseConfig):
         """
         self._setup_logging(log_level)
 
-        self.db = db if db else BaseAppConfig.default_db(ef=ef, host=host, port=port)
+        self.db = db if db else BaseAppConfig.default_db(embedding_fn=embedding_fn, host=host, port=port)
         self.id = id
         return
 
     @staticmethod
-    def default_db(ef, host, port):
+    def default_db(embedding_fn, host, port):
         """
         Sets database to default (`ChromaDb`).
 
-        :param ef: Embedding function to use in database.
+        :param embedding_fn: Embedding function to use in database.
         :param host: Optional. Hostname for the database server.
         :param port: Optional. Port for the database server.
         :returns: Default database
         :raises ValueError: BaseAppConfig knows no default embedding function.
         """
-        if ef is None:
+        if embedding_fn is None:
             raise ValueError("ChromaDb cannot be instantiated without an embedding function")
         from embedchain.vectordb.chroma_db import ChromaDB
 
-        return ChromaDB(ef=ef, host=host, port=port)
+        return ChromaDB(embedding_fn=embedding_fn, host=host, port=port)
 
     def _setup_logging(self, debug_level):
         level = logging.WARNING  # Default level
embedchain/config/apps/CustomAppConfig.py
@@ -1,4 +1,15 @@
+import logging
+from typing import Any
+
+from chromadb.api.types import Documents, Embeddings
+from dotenv import load_dotenv
+
+from embedchain.models import EmbeddingFunctions, Providers
+
 from .BaseAppConfig import BaseAppConfig
-from embedchain.models import Providers
+
+load_dotenv()
 
 
 class CustomAppConfig(BaseAppConfig):
@@ -6,14 +17,88 @@ class CustomAppConfig(BaseAppConfig):
     Config to initialize an embedchain custom `App` instance, with extra config options.
     """
 
-    def __init__(self, log_level=None, ef=None, db=None, host=None, port=None, id=None):
+    def __init__(
+        self,
+        log_level=None,
+        embedding_fn: EmbeddingFunctions = None,
+        embedding_fn_model=None,
+        db=None,
+        host=None,
+        port=None,
+        id=None,
+        provider: Providers = None,
+        model=None,
+        open_source_app_config=None,
+    ):
         """
         :param log_level: Optional. (String) Debug level
         ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
-        :param ef: Optional. Embedding function to use.
+        :param embedding_fn: Optional. Embedding function to use.
+        :param embedding_fn_model: Optional. Model name to use for embedding function.
         :param db: Optional. (Vector) database to use for embeddings.
         :param id: Optional. ID of the app. Document metadata will have this id.
         :param host: Optional. Hostname for the database server.
         :param port: Optional. Port for the database server.
+        :param provider: Optional. (Providers): LLM Provider to use.
+        :param open_source_app_config: Optional. Config instance needed for open source apps.
         """
-        super().__init__(log_level=log_level, db=db, host=host, port=port, id=id)
+        if provider:
+            self.provider = provider
+        else:
+            raise ValueError("CustomApp must have a provider assigned.")
+
+        self.open_source_app_config = open_source_app_config
+
+        super().__init__(
+            log_level=log_level,
+            embedding_fn=CustomAppConfig.embedding_function(embedding_function=embedding_fn, model=embedding_fn_model),
+            db=db,
+            host=host,
+            port=port,
+            id=id,
+        )
+
+    @staticmethod
+    def langchain_default_concept(embeddings: Any):
+        """
+        Langchains default function layout for embeddings.
+        """
+
+        def embed_function(texts: Documents) -> Embeddings:
+            return embeddings.embed_documents(texts)
+
+        return embed_function
+
+    @staticmethod
+    def embedding_function(embedding_function: EmbeddingFunctions, model: str = None):
+        if not isinstance(embedding_function, EmbeddingFunctions):
+            raise ValueError(
+                f"Invalid option: '{embedding_function}'. Expecting one of the following options: {list(map(lambda x: x.value, EmbeddingFunctions))}"  # noqa: E501
+            )
+
+        if embedding_function == EmbeddingFunctions.OPENAI:
+            from langchain.embeddings import OpenAIEmbeddings
+
+            if model:
+                embeddings = OpenAIEmbeddings(model=model)
+            else:
+                embeddings = OpenAIEmbeddings()
+            return CustomAppConfig.langchain_default_concept(embeddings)
+
+        elif embedding_function == EmbeddingFunctions.HUGGING_FACE:
+            from langchain.embeddings import HuggingFaceEmbeddings
+
+            embeddings = HuggingFaceEmbeddings(model_name=model)
+            return CustomAppConfig.langchain_default_concept(embeddings)
+
+        elif embedding_function == EmbeddingFunctions.VERTEX_AI:
+            from langchain.embeddings import VertexAIEmbeddings
+
+            embeddings = VertexAIEmbeddings(model_name=model)
+            return CustomAppConfig.langchain_default_concept(embeddings)
+
+        elif embedding_function == EmbeddingFunctions.GPT4ALL:
+            # Note: We could use langchains GPT4ALL embedding, but it's not available in all versions.
+            from chromadb.utils import embedding_functions
+
+            return embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model)
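
What `embedding_function` returns is a plain callable from documents to vectors, which is the shape ChromaDB expects. A sketch of the OpenAI path, assuming `OPENAI_API_KEY` is set in the environment:

    from embedchain.config import CustomAppConfig
    from embedchain.models import EmbeddingFunctions

    embed_fn = CustomAppConfig.embedding_function(EmbeddingFunctions.OPENAI)
    vectors = embed_fn(["hello world"])  # one embedding vector per input text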
embedchain/config/apps/OpenSourceAppConfig.py
@@ -8,16 +8,23 @@ class OpenSourceAppConfig(BaseAppConfig):
     Config to initialize an embedchain custom `OpenSourceApp` instance, with extra config options.
     """
 
-    def __init__(self, log_level=None, host=None, port=None, id=None):
+    def __init__(self, log_level=None, host=None, port=None, id=None, model=None):
         """
         :param log_level: Optional. (String) Debug level
         ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
         :param id: Optional. ID of the app. Document metadata will have this id.
         :param host: Optional. Hostname for the database server.
         :param port: Optional. Port for the database server.
+        :param model: Optional. GPT4ALL uses the model to instantiate the class. So unlike `App`, it has to be provided before querying.
         """
+        self.model = model or "orca-mini-3b.ggmlv3.q4_0.bin"
+
         super().__init__(
-            log_level=log_level, ef=OpenSourceAppConfig.default_embedding_function(), host=host, port=port, id=id
+            log_level=log_level,
+            embedding_fn=OpenSourceAppConfig.default_embedding_function(),
+            host=host,
+            port=port,
+            id=id,
         )
 
     @staticmethod
embedchain/embedchain.py
@@ -10,7 +10,7 @@ from embedchain.config.apps.BaseAppConfig import BaseAppConfig
 from embedchain.config.QueryConfig import DOCS_SITE_PROMPT_TEMPLATE
 from embedchain.data_formatter import DataFormatter
 
-gpt4all_model = None
+from chromadb.errors import InvalidDimensionException
 
 load_dotenv()
 
@@ -26,7 +26,7 @@ class EmbedChain:
         Initializes the EmbedChain instance, sets up a vector DB client and
         creates a collection.
 
-        :param config: InitConfig instance to load as configuration.
+        :param config: BaseAppConfig instance to load as configuration.
         """
 
         self.config = config
@@ -152,14 +152,21 @@ class EmbedChain:
         :param config: The query configuration.
         :return: The content of the document that matched your query.
         """
-        where = {"app_id": self.config.id} if self.config.id is not None else {}  # optional filter
-        result = self.collection.query(
-            query_texts=[
-                input_query,
-            ],
-            n_results=config.number_documents,
-            where=where,
-        )
+        try:
+            where = {"app_id": self.config.id} if self.config.id is not None else {}  # optional filter
+            result = self.collection.query(
+                query_texts=[
+                    input_query,
+                ],
+                n_results=config.number_documents,
+                where=where,
+            )
+        except InvalidDimensionException as e:
+            raise InvalidDimensionException(
+                e.message()
+                + ". This is commonly a side-effect when an embedding function, different from the one used to add the embeddings, is used to retrieve an embedding from the database."
+            ) from None
 
         results_formatted = self._format_result(result)
         contents = [result[0].page_content for result in results_formatted]
         return contents
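
The wrapped exception exists because a Chroma collection is fixed to the dimensionality of the vectors first stored in it, and different embedding functions usually produce vectors of different dimensions. Illustrative numbers, not taken from this diff:

    stored_dim = 384   # e.g. a sentence-transformers model used at add() time
    query_dim = 1536   # e.g. OpenAI text-embedding-ada-002 used at query() time
    assert stored_dim != query_dim  # the mismatch the except-block reports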
embedchain/models/EmbeddingFunctions.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from enum import Enum


class EmbeddingFunctions(Enum):
    OPENAI = "OPENAI"
    HUGGING_FACE = "HUGGING_FACE"
    VERTEX_AI = "VERTEX_AI"
    GPT4ALL = "GPT4ALL"
embedchain/models/Providers.py (new file, 8 lines)
@@ -0,0 +1,8 @@
from enum import Enum


class Providers(Enum):
    OPENAI = "OPENAI"
    ANTHROPHIC = "ANTHPROPIC"
    VERTEX_AI = "VERTEX_AI"
    GPT4ALL = "GPT4ALL"
embedchain/models/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
from .EmbeddingFunctions import EmbeddingFunctions  # noqa: F401
from .Providers import Providers  # noqa: F401
embedchain/vectordb/chroma_db.py
@@ -8,8 +8,11 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB
 class ChromaDB(BaseVectorDB):
     """Vector database using ChromaDB."""
 
-    def __init__(self, db_dir=None, ef=None, host=None, port=None):
-        self.ef = ef
+    def __init__(self, db_dir=None, embedding_fn=None, host=None, port=None):
+        self.embedding_fn = embedding_fn
+
+        if not hasattr(embedding_fn, "__call__"):
+            raise ValueError("Embedding function is not a function")
 
         if host and port:
             logging.info(f"Connecting to ChromaDB server: {host}:{port}")
@@ -36,5 +39,5 @@ class ChromaDB(BaseVectorDB):
         """Get or create the collection."""
         return self.client.get_or_create_collection(
             "embedchain_store",
-            embedding_function=self.ef,
+            embedding_function=self.embedding_fn,
         )
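
Since the guard only checks for `__call__`, any callable can serve as `embedding_fn`, not just the enum-driven functions built by `CustomAppConfig`. A toy sketch (the constant vectors are placeholders, not real embeddings):

    from embedchain.vectordb.chroma_db import ChromaDB

    def toy_embedding_fn(texts):
        return [[0.0] * 3 for _ in texts]  # one tiny vector per input text

    db = ChromaDB(embedding_fn=toy_embedding_fn)  # passes the callable check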