[Feature] Add support to use any sql database as the metadata storage for embedchain apps (#1273)

2024-02-19 13:04:18 -08:00
parent 6c12bc9044
commit 5e2e7fb639
20 changed files with 601 additions and 202 deletions
--- a/embedchain/init.py
+++ b/embedchain/init.py
@@ -7,4 +7,4 @@ from embedchain.client import Client  # noqa: F401
 from embedchain.pipeline import Pipeline  # noqa: F401

 # Setup the user directory if doesn't exist already
-Client.setup_dir()
+Client.setup()
--- a/embedchain/alembic.ini
+++ b/embedchain/alembic.ini
@@ -0,0 +1,116 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = embedchain/migrations
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python>=3.9 or backports.zoneinfo library.
+# Any required deps can be installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to ZoneInfo()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to alembic/versions.  When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os  # Use os.pathsep. Default configuration used for new projects.
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = driver://user:pass@localhost/dbname
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = %(here)s/.venv/bin/ruff
+# ruff.options = --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = WARN
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
--- a/embedchain/app.py
+++ b/embedchain/app.py
@@ -3,7 +3,6 @@ import concurrent.futures
 import json
 import logging
 import os
-import sqlite3
 import uuid
 from typing import Any, Optional, Union

@@ -16,7 +15,8 @@ from embedchain.cache import (Config, ExactMatchEvaluation,
                              gptcache_data_manager, gptcache_pre_function)
 from embedchain.client import Client
 from embedchain.config import AppConfig, CacheConfig, ChunkerConfig
-from embedchain.constants import SQLITE_PATH
+from embedchain.core.db.database import get_session
+from embedchain.core.db.models import DataSource
 from embedchain.embedchain import EmbedChain
 from embedchain.embedder.base import BaseEmbedder
 from embedchain.embedder.openai import OpenAIEmbedder
@@ -33,9 +33,6 @@ from embedchain.utils.misc import validate_config
 from embedchain.vectordb.base import BaseVectorDB
 from embedchain.vectordb.chroma import ChromaDB

-# Set up the user directory if it doesn't exist already
-Client.setup_dir()
-

@register_deserializable
 class App(EmbedChain):
@@ -120,6 +117,9 @@ class App(EmbedChain):
        self.llm = llm or OpenAILlm()
        self._init_db()

+        # Session for the metadata db
+        self.db_session = get_session()
+
        # If cache_config is provided, initializing the cache ...
        if self.cache_config is not None:
            self._init_cache()
@@ -127,27 +127,6 @@ class App(EmbedChain):
        # Send anonymous telemetry
        self._telemetry_props = {"class": self.__class__.__name__}
        self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
-
-        # Establish a connection to the SQLite database
-        self.connection = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
-        self.cursor = self.connection.cursor()
-
-        # Create the 'data_sources' table if it doesn't exist
-        self.cursor.execute(
-            """
-            CREATE TABLE IF NOT EXISTS data_sources (
-                pipeline_id TEXT,
-                hash TEXT,
-                type TEXT,
-                value TEXT,
-                metadata TEXT,
-                is_uploaded INTEGER DEFAULT 0,
-                PRIMARY KEY (pipeline_id, hash)
-            )
-        """
-        )
-        self.connection.commit()
-        # Send anonymous telemetry
        self.telemetry.capture(event_name="init", properties=self._telemetry_props)

        self.user_asks = []
@@ -307,20 +286,14 @@ class App(EmbedChain):
            return False

    def _mark_data_as_uploaded(self, data_hash):
-        self.cursor.execute(
-            "UPDATE data_sources SET is_uploaded = 1 WHERE hash = ? AND pipeline_id = ?",
-            (data_hash, self.local_id),
-        )
-        self.connection.commit()
+        """Flag the data source with ``data_hash`` for this app as uploaded and persist the change."""
+        try:
+            self.db_session.query(DataSource).filter_by(hash=data_hash, app_id=self.local_id).update({"is_uploaded": 1})
+            # Without an explicit commit the UPDATE stays in the session's open transaction
+            self.db_session.commit()
+        except Exception:
+            # Leave the session usable, then let the caller see the failure
+            self.db_session.rollback()
+            raise

    def get_data_sources(self):
-        db_data = self.cursor.execute("SELECT * FROM data_sources WHERE pipeline_id = ?", (self.local_id,)).fetchall()
-
-        data_sources = []
-        for data in db_data:
-            data_sources.append({"data_type": data[2], "data_value": data[3], "metadata": data[4]})
-
-        return data_sources
+        """Return the data sources recorded for this app.
+
+        :return: one dict per data source with the keys ``data_type``, ``data_value`` and ``metadata``
+        :rtype: list[dict]
+        """
+        data_sources = self.db_session.query(DataSource).filter_by(app_id=self.local_id).all()
+        # DataSource maps its columns to the attributes `type`, `value` and `meta_data`;
+        # `row.metadata` would be the SQLAlchemy MetaData object, not the stored value.
+        return [
+            {"data_type": row.type, "data_value": row.value, "metadata": row.meta_data}
+            for row in data_sources
+        ]

    def deploy(self):
        if self.client is None:
@@ -329,14 +302,11 @@ class App(EmbedChain):
        pipeline_data = self._create_pipeline()
        self.id = pipeline_data["id"]

-        results = self.cursor.execute(
-            "SELECT * FROM data_sources WHERE pipeline_id = ? AND is_uploaded = 0", (self.local_id,)  # noqa:E501
-        ).fetchall()
-
+        results = self.db_session.query(DataSource).filter_by(app_id=self.local_id, is_uploaded=0).all()
        if len(results) > 0:
            print("🛠️ Adding data to your pipeline...")
        for result in results:
-            data_hash, data_type, data_value = result[1], result[2], result[3]
+            # DataSource exposes these columns as `hash`, `type` and `value`
+            data_hash, data_type, data_value = result.hash, result.type, result.value
            self._process_and_upload_data(data_hash, data_type, data_value)

        # Send anonymous telemetry
@@ -423,10 +393,6 @@ class App(EmbedChain):
        else:
            cache_config = None

-        # Send anonymous telemetry
-        event_properties = {"init_type": "config_data"}
-        AnonymousTelemetry().capture(event_name="init", properties=event_properties)
-
        return cls(
            config=app_config,
            llm=llm,
--- a/embedchain/client.py
+++ b/embedchain/client.py
@@ -5,7 +5,8 @@ import uuid

 import requests

-from embedchain.constants import CONFIG_DIR, CONFIG_FILE
+from embedchain.constants import CONFIG_DIR, CONFIG_FILE, DB_URI
+from embedchain.core.db.database import init_db, setup_engine


 class Client:
@@ -31,7 +32,7 @@ class Client:
                )

    @classmethod
-    def setup_dir(cls):
+    def setup(cls):
        """
        Loads the user id from the config file if it exists, otherwise generates a new
        one and saves it to the config file.
@@ -40,6 +41,9 @@ class Client:
        :rtype: str
        """
        os.makedirs(CONFIG_DIR, exist_ok=True)
+        setup_engine(database_uri=DB_URI)
+        init_db()
+
        if os.path.exists(CONFIG_FILE):
            with open(CONFIG_FILE, "r") as f:
                data = json.load(f)
@@ -53,7 +57,7 @@ class Client:
    @classmethod
    def load_config(cls):
        if not os.path.exists(CONFIG_FILE):
-            cls.setup_dir()
+            cls.setup()

        with open(CONFIG_FILE, "r") as config_file:
            return json.load(config_file)
--- a/embedchain/constants.py
+++ b/embedchain/constants.py
@@ -6,3 +6,4 @@ HOME_DIR = str(Path.home())
 CONFIG_DIR = os.path.join(HOME_DIR, ".embedchain")
 CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json")
 SQLITE_PATH = os.path.join(CONFIG_DIR, "embedchain.db")
+DB_URI = f"sqlite:///{SQLITE_PATH}"
--- a/embedchain/core/init.py
+++ b/embedchain/core/init.py
--- a/embedchain/core/db/init.py
+++ b/embedchain/core/db/init.py
--- a/embedchain/core/db/database.py
+++ b/embedchain/core/db/database.py
@@ -0,0 +1,83 @@
+import os
+
+from alembic import command
+from alembic.config import Config
+from sqlalchemy import create_engine
+from sqlalchemy.engine.base import Engine
+from sqlalchemy.orm import Session as SQLAlchemySession
+from sqlalchemy.orm import scoped_session, sessionmaker
+
+from .models import Base
+
+
+class DatabaseManager:
+    """Owns the SQLAlchemy engine and the scoped session factory for the metadata database.
+
+    Call ``setup_engine()`` before any other method; ``init_db()`` and
+    ``get_session()`` raise ``RuntimeError`` until the engine exists.
+    """
+
+    def __init__(self, database_uri: str = "sqlite:///embedchain.db", echo: bool = False):
+        """Store the connection settings; no engine is created yet.
+
+        :param database_uri: SQLAlchemy database URL
+        :param echo: forwarded to ``create_engine(echo=...)``
+        """
+        self.database_uri = database_uri
+        self.echo = echo
+        self.engine: Engine = None
+        self._session_factory = None
+
+    def setup_engine(self) -> None:
+        """Initializes the database engine and session factory."""
+        # `check_same_thread` is an argument of the sqlite3 driver only; passing it to
+        # another DBAPI makes the connection fail, so send it for SQLite URIs only.
+        connect_args = {}
+        if self.database_uri.startswith("sqlite"):
+            connect_args["check_same_thread"] = False
+        self.engine = create_engine(self.database_uri, echo=self.echo, connect_args=connect_args)
+        self._session_factory = scoped_session(sessionmaker(bind=self.engine))
+        Base.metadata.bind = self.engine
+
+    def init_db(self) -> None:
+        """Creates all tables defined in the Base metadata.
+
+        :raises RuntimeError: if ``setup_engine()`` has not been called
+        """
+        if not self.engine:
+            raise RuntimeError("Database engine is not initialized. Call setup_engine() first.")
+        Base.metadata.create_all(self.engine)
+
+    def get_session(self) -> SQLAlchemySession:
+        """Provides a session for database operations.
+
+        :raises RuntimeError: if ``setup_engine()`` has not been called
+        """
+        if not self._session_factory:
+            raise RuntimeError("Session factory is not initialized. Call setup_engine() first.")
+        return self._session_factory()
+
+    def close_session(self) -> None:
+        """Closes the current session."""
+        if self._session_factory:
+            self._session_factory.remove()
+
+    def execute_transaction(self, transaction_block):
+        """Executes a block of code within a database transaction.
+
+        :param transaction_block: callable that receives the session as its only argument
+        :raises Exception: whatever ``transaction_block`` or the commit raises, after a rollback
+        """
+        session = self.get_session()
+        try:
+            transaction_block(session)
+            session.commit()
+        except Exception:
+            session.rollback()
+            # Bare raise keeps the original traceback intact
+            raise
+        finally:
+            self.close_session()
+
+
+# Singleton pattern to use throughout the application
+database_manager = DatabaseManager()
+
+
+# Convenience functions for backward compatibility and ease of use
+def setup_engine(database_uri: str = "sqlite:///embedchain.db", echo: bool = False) -> None:
+    """Point the shared database manager at ``database_uri`` and build its engine."""
+    manager = database_manager
+    manager.database_uri, manager.echo = database_uri, echo
+    manager.setup_engine()
+
+
+def alembic_upgrade() -> None:
+    """Upgrades the database to the latest version."""
+    # Two levels up from this module is the package directory that holds both
+    # alembic.ini and the migrations folder.
+    package_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
+    alembic_cfg = Config(os.path.join(package_dir, "alembic.ini"))
+    # alembic.ini gives script_location as a path relative to the working directory,
+    # which only resolves when the process starts from the repository root. Override
+    # it with an absolute path; "%" is doubled because the value is interpolated.
+    migrations_dir = os.path.join(package_dir, "migrations")
+    alembic_cfg.set_main_option("script_location", migrations_dir.replace("%", "%%"))
+    command.upgrade(alembic_cfg, "head")
+
+
+def init_db() -> None:
+    """Bring the database schema up to date by running the Alembic upgrade."""
+    return alembic_upgrade()
+
+
+def get_session() -> SQLAlchemySession:
+    """Return a session from the shared database manager."""
+    session = database_manager.get_session()
+    return session
+
+
+def execute_transaction(transaction_block):
+    """Run ``transaction_block(session)`` in a transaction on the shared database manager."""
+    manager = database_manager
+    manager.execute_transaction(transaction_block)
--- a/embedchain/core/db/models.py
+++ b/embedchain/core/db/models.py
@@ -0,0 +1,31 @@
+import uuid
+
+from sqlalchemy import TIMESTAMP, Column, Integer, String, Text, func
+from sqlalchemy.orm import declarative_base
+
+Base = declarative_base()
+metadata = Base.metadata
+
+
+class DataSource(Base):
+    """ORM model for the ``ec_data_sources`` table."""
+
+    __tablename__ = "ec_data_sources"
+
+    # Primary key; defaults to a freshly generated UUID4 string
+    id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4()))
+    app_id = Column(Text, index=True)
+    hash = Column(Text, index=True)
+    type = Column(Text, index=True)
+    value = Column(Text)
+    # The attribute is `meta_data` while the database column is named "metadata";
+    # `metadata` itself is reserved on declarative classes (it is Base.metadata).
+    meta_data = Column(Text, name="metadata")
+    # Integer flag, 0 unless set otherwise
+    is_uploaded = Column(Integer, default=0)
+
+
+class ChatHistory(Base):
+    """ORM model for the ``ec_chat_history`` table."""
+
+    __tablename__ = "ec_chat_history"
+
+    # Composite primary key: (app_id, id, session_id)
+    app_id = Column(String, primary_key=True)
+    id = Column(String, primary_key=True)
+    session_id = Column(String, primary_key=True, index=True)
+    question = Column(Text)
+    answer = Column(Text)
+    # The attribute is `meta_data` while the database column is named "metadata";
+    # `metadata` itself is reserved on declarative classes (it is Base.metadata).
+    meta_data = Column(Text, name="metadata")
+    # Defaults to the database's current timestamp on insert
+    created_at = Column(TIMESTAMP, default=func.current_timestamp(), index=True)
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -1,7 +1,6 @@
 import hashlib
 import json
 import logging
-import sqlite3
 from typing import Any, Optional, Union

 from dotenv import load_dotenv
@@ -13,7 +12,7 @@ from embedchain.cache import (adapt, get_gptcache_session,
 from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.config import AddConfig, BaseLlmConfig, ChunkerConfig
 from embedchain.config.base_app_config import BaseAppConfig
-from embedchain.constants import SQLITE_PATH
+from embedchain.core.db.models import DataSource
 from embedchain.data_formatter import DataFormatter
 from embedchain.embedder.base import BaseEmbedder
 from embedchain.helpers.json_serializable import JSONSerializable
@@ -21,7 +20,6 @@ from embedchain.llm.base import BaseLlm
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.models.data_type import (DataType, DirectDataType,
                                         IndirectDataType, SpecialDataType)
-from embedchain.telemetry.posthog import AnonymousTelemetry
 from embedchain.utils.misc import detect_datatype, is_valid_json_string
 from embedchain.vectordb.base import BaseVectorDB

@@ -85,30 +83,6 @@ class EmbedChain(JSONSerializable):
        self.user_asks = []

        self.chunker: Optional[ChunkerConfig] = None
-        # Send anonymous telemetry
-        self._telemetry_props = {"class": self.__class__.__name__}
-        self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
-        # Establish a connection to the SQLite database
-        self.connection = sqlite3.connect(SQLITE_PATH, check_same_thread=False)
-        self.cursor = self.connection.cursor()
-
-        # Create the 'data_sources' table if it doesn't exist
-        self.cursor.execute(
-            """
-            CREATE TABLE IF NOT EXISTS data_sources (
-                pipeline_id TEXT,
-                hash TEXT,
-                type TEXT,
-                value TEXT,
-                metadata TEXT,
-                is_uploaded INTEGER DEFAULT 0,
-                PRIMARY KEY (pipeline_id, hash)
-            )
-        """
-        )
-        self.connection.commit()
-        # Send anonymous telemetry
-        self.telemetry.capture(event_name="init", properties=self._telemetry_props)

    @property
    def collect_metrics(self):
@@ -204,17 +178,21 @@ class EmbedChain(JSONSerializable):
        if data_type in {DataType.DOCS_SITE}:
            self.is_docs_site_instance = True

-        # Insert the data into the 'data' table
-        self.cursor.execute(
-            """
-            INSERT OR REPLACE INTO data_sources (hash, pipeline_id, type, value, metadata)
-            VALUES (?, ?, ?, ?, ?)
-        """,
-            (source_hash, self.config.id, data_type.value, str(source), json.dumps(metadata)),
+        # Insert the data into the 'ec_data_sources' table.
+        # NOTE(review): the previous SQL was INSERT OR REPLACE keyed on (pipeline_id, hash);
+        # this always inserts a new row, so re-adding a source creates a duplicate — confirm intended.
+        self.db_session.add(
+            DataSource(
+                hash=source_hash,
+                app_id=self.config.id,
+                type=data_type.value,
+                # Stored as text, as the previous implementation did with str(source)
+                value=str(source),
+                # The mapped attribute is `meta_data`; a `metadata=` kwarg would not reach the column
+                meta_data=json.dumps(metadata),
+            )
        )
-
-        # Commit the transaction
-        self.connection.commit()
+        try:
+            self.db_session.commit()
+        except Exception as e:
+            logging.error(f"Error adding data source: {e}")
+            self.db_session.rollback()

        if dry_run:
            data_chunks_info = {"chunks": documents, "metadata": metadatas, "count": len(documents), "type": data_type}
@@ -666,9 +644,14 @@ class EmbedChain(JSONSerializable):
        Resets the database. Deletes all embeddings irreversibly.
        `App` does not have to be reinitialized after using this method.
        """
+        # Remove this app's data-source rows first; stop before touching the vector
+        # database if that fails.
+        try:
+            self.db_session.query(DataSource).filter_by(app_id=self.config.id).delete()
+            self.db_session.commit()
+        except Exception as e:
+            logging.error(f"Error deleting data sources: {e}")
+            self.db_session.rollback()
+            return None
        self.db.reset()
-        self.cursor.execute("DELETE FROM data_sources WHERE pipeline_id = ?", (self.config.id,))
-        self.connection.commit()
        self.delete_all_chat_history(app_id=self.config.id)
        # Send anonymous telemetry
        self.telemetry.capture(event_name="reset", properties=self._telemetry_props)
--- a/embedchain/memory/base.py
+++ b/embedchain/memory/base.py
@@ -1,55 +1,40 @@
 import json
 import logging
-import sqlite3
 import uuid
 from typing import Any, Optional

-from embedchain.constants import SQLITE_PATH
+from embedchain.core.db.database import get_session
+from embedchain.core.db.models import ChatHistory as ChatHistoryModel
 from embedchain.memory.message import ChatMessage
 from embedchain.memory.utils import merge_metadata_dict

-CHAT_MESSAGE_CREATE_TABLE_QUERY = """
-    CREATE TABLE IF NOT EXISTS ec_chat_history (
-        app_id TEXT,
-        id TEXT,
-        session_id TEXT,
-        question TEXT,
-        answer TEXT,
-        metadata TEXT,
-        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-        PRIMARY KEY (id, app_id, session_id)
-    )
-"""
-

 class ChatHistory:
    def __init__(self) -> None:
-        with sqlite3.connect(SQLITE_PATH, check_same_thread=False) as self.connection:
-            self.cursor = self.connection.cursor()
-            self.cursor.execute(CHAT_MESSAGE_CREATE_TABLE_QUERY)
-            self.connection.commit()
+        # Session for the metadata db, obtained from the module-level database manager
+        self.db_session = get_session()

    def add(self, app_id, session_id, chat_message: ChatMessage) -> Optional[str]:
+        """Store one question/answer round and return its generated id, or None if the commit fails."""
        memory_id = str(uuid.uuid4())
        metadata_dict = merge_metadata_dict(chat_message.human_message.metadata, chat_message.ai_message.metadata)
        if metadata_dict:
            metadata = self._serialize_json(metadata_dict)
-        ADD_CHAT_MESSAGE_QUERY = """
-            INSERT INTO ec_chat_history (app_id, id, session_id, question, answer, metadata)
-            VALUES (?, ?, ?, ?, ?, ?)
-        """
-        self.cursor.execute(
-            ADD_CHAT_MESSAGE_QUERY,
-            (
-                app_id,
-                memory_id,
-                session_id,
-                chat_message.human_message.content,
-                chat_message.ai_message.content,
-                metadata if metadata_dict else "{}",
-            ),
+        # The model attribute is `meta_data` (its column is named "metadata"); a
+        # `metadata=` kwarg would leave the column NULL.
+        self.db_session.add(
+            ChatHistoryModel(
+                app_id=app_id,
+                id=memory_id,
+                session_id=session_id,
+                question=chat_message.human_message.content,
+                answer=chat_message.ai_message.content,
+                meta_data=metadata if metadata_dict else "{}",
+            )
        )
-        self.connection.commit()
+        try:
+            self.db_session.commit()
+        except Exception as e:
+            logging.error(f"Error adding chat memory to db: {e}")
+            self.db_session.rollback()
+            return None
+
        logging.info(f"Added chat memory to db with id: {memory_id}")
        return memory_id

@@ -63,15 +48,15 @@ class ChatHistory:

        :return: None
        """
+        params = {"app_id": app_id}
        if session_id:
-            DELETE_CHAT_HISTORY_QUERY = "DELETE FROM ec_chat_history WHERE app_id=? AND session_id=?"
-            params = (app_id, session_id)
-        else:
-            DELETE_CHAT_HISTORY_QUERY = "DELETE FROM ec_chat_history WHERE app_id=?"
-            params = (app_id,)
-
-        self.cursor.execute(DELETE_CHAT_HISTORY_QUERY, params)
-        self.connection.commit()
+            params["session_id"] = session_id
+        self.db_session.query(ChatHistoryModel).filter_by(**params).delete()
+        try:
+            self.db_session.commit()
+        except Exception as e:
+            logging.error(f"Error deleting chat history: {e}")
+            self.db_session.rollback()

    def get(
        self, app_id, session_id: str = "default", num_rounds=10, fetch_all: bool = False, display_format=False
@@ -85,50 +70,31 @@ class ChatHistory:
        param: fetch_all (optional) - Whether to fetch all chat history or not. Defaults to False
        param: display_format (optional) - Whether to return the chat history in display format. Defaults to False
        """
-
-        base_query = """
-            SELECT * FROM ec_chat_history
-            WHERE app_id=?
-        """
-
-        if fetch_all:
-            additional_query = "ORDER BY created_at ASC"
-            params = (app_id,)
-        else:
-            additional_query = """
-                AND session_id=?
-                ORDER BY created_at ASC
-                LIMIT ?
-            """
-            params = (app_id, session_id, num_rounds)
-
-        QUERY = base_query + additional_query
-
-        self.cursor.execute(
-            QUERY,
-            params,
+        params = {"app_id": app_id}
+        if not fetch_all:
+            params["session_id"] = session_id
+        results = (
+            self.db_session.query(ChatHistoryModel).filter_by(**params).order_by(ChatHistoryModel.created_at.asc())
        )
-
-        results = self.cursor.fetchall()
+        results = results.limit(num_rounds) if not fetch_all else results
        history = []
        for result in results:
-            app_id, _, session_id, question, answer, metadata, timestamp = result
-            metadata = self._deserialize_json(metadata=metadata)
+            metadata = self._deserialize_json(metadata=result.meta_data or "{}")
            # Return list of dict if display_format is True
            if display_format:
                history.append(
                    {
-                        "session_id": session_id,
-                        "human": question,
-                        "ai": answer,
-                        "metadata": metadata,
-                        "timestamp": timestamp,
+                        "session_id": result.session_id,
+                        "human": result.question,
+                        "ai": result.answer,
+                        # Deserialized metadata, matching what the previous implementation returned
+                        "metadata": metadata,
+                        "timestamp": result.created_at,
                    }
                )
            else:
                memory = ChatMessage()
-                memory.add_user_message(question, metadata=metadata)
-                memory.add_ai_message(answer, metadata=metadata)
+                memory.add_user_message(result.question, metadata=metadata)
+                memory.add_ai_message(result.answer, metadata=metadata)
                history.append(memory)
        return history

@@ -141,16 +107,11 @@ class ChatHistory:

        :return: The number of chat messages for a given app_id and session_id
        """
+        # Count the rows matching app_id (and session_id, when given)
+        params = {"app_id": app_id}
        if session_id:
-            QUERY = "SELECT COUNT(*) FROM ec_chat_history WHERE app_id=? AND session_id=?"
-            params = (app_id, session_id)
-        else:
-            QUERY = "SELECT COUNT(*) FROM ec_chat_history WHERE app_id=?"
-            params = (app_id,)
-
-        self.cursor.execute(QUERY, params)
-        count = self.cursor.fetchone()[0]
-        return count
+            params["session_id"] = session_id
+        return self.db_session.query(ChatHistoryModel).filter_by(**params).count()

    @staticmethod
    def _serialize_json(metadata: dict[str, Any]):
--- a/embedchain/migrations/env.py
+++ b/embedchain/migrations/env.py
@@ -0,0 +1,74 @@
+from logging.config import fileConfig
+
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+
+from embedchain.constants import DB_URI
+from embedchain.core.db.models import Base
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+target_metadata = Base.metadata
+
+# Point Alembic at the application's database instead of the placeholder URL in
+# alembic.ini. The value goes through ConfigParser interpolation, so a literal "%"
+# (e.g. in URL-encoded credentials or a path) has to be doubled.
+config.set_main_option("sqlalchemy.url", DB_URI.replace("%", "%%"))
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    Only the configured URL is handed to the context, so no Engine is
+    created. Calls to context.execute() emit the given string to the
+    script output.
+    """
+    offline_options = {
+        "url": config.get_main_option("sqlalchemy.url"),
+        "target_metadata": target_metadata,
+        "literal_binds": True,
+        "dialect_opts": {"paramstyle": "named"},
+    }
+    context.configure(**offline_options)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+
+    Builds an Engine from the "sqlalchemy."-prefixed options of the main
+    config section and runs the migrations over a connection from it.
+    """
+    section = config.get_section(config.config_ini_section, {})
+    engine = engine_from_config(section, prefix="sqlalchemy.", poolclass=pool.NullPool)
+
+    with engine.connect() as connection:
+        context.configure(connection=connection, target_metadata=target_metadata)
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+# Dispatch on the mode Alembic reports: offline emits SQL, online uses a connection
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()
--- a/embedchain/migrations/script.py.mako
+++ b/embedchain/migrations/script.py.mako
@@ -0,0 +1,26 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}
--- a/embedchain/migrations/versions/40a327b3debd_create_initial_migrations.py
+++ b/embedchain/migrations/versions/40a327b3debd_create_initial_migrations.py
@@ -0,0 +1,62 @@
+"""Create initial migrations
+
+Revision ID: 40a327b3debd
+Revises:
+Create Date: 2024-02-18 15:29:19.409064
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "40a327b3debd"
+down_revision: Union[str, None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Create the ec_chat_history and ec_data_sources tables with their indexes.
+
+    ec_chat_history may already exist in databases written before Alembic was
+    introduced, so its table and indexes are only created when missing.
+    """
+    if op.get_context().as_sql:
+        # Offline (--sql) mode has no live connection to inspect; emit everything
+        existing_tables, existing_indexes = set(), set()
+    else:
+        inspector = sa.inspect(op.get_bind())
+        existing_tables = set(inspector.get_table_names())
+        existing_indexes = (
+            {index["name"] for index in inspector.get_indexes("ec_chat_history")}
+            if "ec_chat_history" in existing_tables
+            else set()
+        )
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    if "ec_chat_history" not in existing_tables:
+        op.create_table(
+            "ec_chat_history",
+            sa.Column("app_id", sa.String(), nullable=False),
+            sa.Column("id", sa.String(), nullable=False),
+            sa.Column("session_id", sa.String(), nullable=False),
+            sa.Column("question", sa.Text(), nullable=True),
+            sa.Column("answer", sa.Text(), nullable=True),
+            sa.Column("metadata", sa.Text(), nullable=True),
+            sa.Column("created_at", sa.TIMESTAMP(), nullable=True),
+            sa.PrimaryKeyConstraint("app_id", "id", "session_id"),
+        )
+    for column in ("created_at", "session_id"):
+        index_name = op.f(f"ix_ec_chat_history_{column}")
+        if index_name not in existing_indexes:
+            op.create_index(index_name, "ec_chat_history", [column], unique=False)
+    op.create_table(
+        "ec_data_sources",
+        sa.Column("id", sa.String(), nullable=False),
+        sa.Column("app_id", sa.Text(), nullable=True),
+        sa.Column("hash", sa.Text(), nullable=True),
+        sa.Column("type", sa.Text(), nullable=True),
+        sa.Column("value", sa.Text(), nullable=True),
+        sa.Column("metadata", sa.Text(), nullable=True),
+        sa.Column("is_uploaded", sa.Integer(), nullable=True),
+        sa.PrimaryKeyConstraint("id"),
+    )
+    op.create_index(op.f("ix_ec_data_sources_hash"), "ec_data_sources", ["hash"], unique=False)
+    op.create_index(op.f("ix_ec_data_sources_app_id"), "ec_data_sources", ["app_id"], unique=False)
+    op.create_index(op.f("ix_ec_data_sources_type"), "ec_data_sources", ["type"], unique=False)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Drop the ec_data_sources and ec_chat_history tables, removing each table's indexes first."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(op.f("ix_ec_data_sources_type"), table_name="ec_data_sources")
+    op.drop_index(op.f("ix_ec_data_sources_app_id"), table_name="ec_data_sources")
+    op.drop_index(op.f("ix_ec_data_sources_hash"), table_name="ec_data_sources")
+    op.drop_table("ec_data_sources")
+    op.drop_index(op.f("ix_ec_chat_history_session_id"), table_name="ec_chat_history")
+    op.drop_index(op.f("ix_ec_chat_history_created_at"), table_name="ec_chat_history")
+    op.drop_table("ec_chat_history")
+    # ### end Alembic commands ###
--- a/embedchain/store/assistants.py
+++ b/embedchain/store/assistants.py
@@ -20,7 +20,7 @@ from embedchain.utils.misc import detect_datatype
 logging.basicConfig(level=logging.WARN)

 # Set up the user directory if it doesn't exist already
-Client.setup_dir()
+Client.setup()


 class OpenAIAssistant: