docs: update docstrings (#565)

2023-09-07 02:04:44 +02:00
parent 4754372fcd
commit 1ac8aef4de
25 changed files with 736 additions and 298 deletions
--- a/embedchain/vectordb/base_vector_db.py
+++ b/embedchain/vectordb/base_vector_db.py
@@ -7,6 +7,11 @@ class BaseVectorDB(JSONSerializable):
    """Base class for vector database."""

    def __init__(self, config: BaseVectorDbConfig):
+        """Initialize the database. Save the config and client as attributes.
+
+        :param config: Database configuration class instance.
+        :type config: BaseVectorDbConfig
+        """
        self.client = self._get_or_create_db()
        self.config: BaseVectorDbConfig = config

@@ -23,25 +28,50 @@ class BaseVectorDB(JSONSerializable):
        raise NotImplementedError

    def _get_or_create_collection(self):
+        """Get or create a named collection."""
        raise NotImplementedError

    def _set_embedder(self, embedder: BaseEmbedder):
+        """
+        The database sometimes needs access to the embedder; this method sets it persistently.
+
+        :param embedder: Embedder to be set as the embedder for this database.
+        :type embedder: BaseEmbedder
+        """
        self.embedder = embedder

    def get(self):
+        """Get database embeddings by id."""
        raise NotImplementedError

    def add(self):
+        """Add to database"""
        raise NotImplementedError

    def query(self):
+        """Query contents from vector database based on vector similarity"""
        raise NotImplementedError

-    def count(self):
+    def count(self) -> int:
+        """
+        Count number of documents/chunks embedded in the database.
+
+        :return: number of documents
+        :rtype: int
+        """
        raise NotImplementedError

    def reset(self):
+        """
+        Resets the database. Deletes all embeddings irreversibly.
+        """
        raise NotImplementedError

    def set_collection_name(self, name: str):
+        """
+        Set the name of the collection. A collection is an isolated space for vectors.
+
+        :param name: Name of the collection.
+        :type name: str
+        """
        raise NotImplementedError
--- a/embedchain/vectordb/chroma_db.py
+++ b/embedchain/vectordb/chroma_db.py
@@ -1,6 +1,7 @@
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional

+from chromadb import Collection, QueryResult
 from langchain.docstore.document import Document

 from embedchain.config import ChromaDbConfig
@@ -25,6 +26,11 @@ class ChromaDB(BaseVectorDB):
    """Vector database using ChromaDB."""

    def __init__(self, config: Optional[ChromaDbConfig] = None):
+        """Initialize a new ChromaDB instance
+
+        :param config: Configuration options for Chroma, defaults to None
+        :type config: Optional[ChromaDbConfig], optional
+        """
        if config:
            self.config = config
        else:
@@ -60,11 +66,19 @@ class ChromaDB(BaseVectorDB):
        self._get_or_create_collection(self.config.collection_name)

    def _get_or_create_db(self):
-        """Get or create the database."""
+        """Return the database client. Called during initialization."""
        return self.client

-    def _get_or_create_collection(self, name):
-        """Get or create the collection."""
+    def _get_or_create_collection(self, name: str) -> Collection:
+        """
+        Get or create a named collection.
+
+        :param name: Name of the collection
+        :type name: str
+        :raises ValueError: No embedder configured.
+        :return: Created collection
+        :rtype: Collection
+        """
        if not hasattr(self, "embedder") or not self.embedder:
            raise ValueError("Cannot create a Chroma database collection without an embedder.")
        self.collection = self.client.get_or_create_collection(
@@ -76,8 +90,13 @@ class ChromaDB(BaseVectorDB):
    def get(self, ids: List[str], where: Dict[str, any]) -> List[str]:
        """
        Get existing doc ids present in vector database
-        :param ids: list of doc ids to check for existance
+
+        :param ids: list of doc ids to check for existence
+        :type ids: List[str]
        :param where: Optional. to filter data
+        :type where: Dict[str, any]
+        :return: Ids of the given documents that already exist in the collection.
+        :rtype: Set[str]
        """
        existing_docs = self.collection.get(
            ids=ids,
@@ -86,16 +105,28 @@ class ChromaDB(BaseVectorDB):

        return set(existing_docs["ids"])

-    def add(self, documents: List[str], metadatas: List[object], ids: List[str]) -> Any:
+    def add(self, documents: List[str], metadatas: List[object], ids: List[str]):
        """
-        add data in vector database
-        :param documents: list of texts to add
-        :param metadatas: list of metadata associated with docs
-        :param ids: ids of docs
+        Add vectors to chroma database
+
+        :param documents: Documents
+        :type documents: List[str]
+        :param metadatas: Metadatas
+        :type metadatas: List[object]
+        :param ids: ids
+        :type ids: List[str]
        """
        self.collection.add(documents=documents, metadatas=metadatas, ids=ids)

-    def _format_result(self, results):
+    def _format_result(self, results: QueryResult) -> list[tuple[Document, float]]:
+        """
+        Format Chroma results
+
+        :param results: ChromaDB query results to format.
+        :type results: QueryResult
+        :return: Formatted results
+        :rtype: list[tuple[Document, float]]
+        """
        return [
            (Document(page_content=result[0], metadata=result[1] or {}), result[2])
            for result in zip(
@@ -107,11 +138,17 @@ class ChromaDB(BaseVectorDB):

    def query(self, input_query: List[str], n_results: int, where: Dict[str, any]) -> List[str]:
        """
-        query contents from vector data base based on vector similarity
+        Query contents from vector database based on vector similarity
+
        :param input_query: list of query string
+        :type input_query: List[str]
        :param n_results: no of similar documents to fetch from database
-        :param where: Optional. to filter data
+        :type n_results: int
+        :param where: to filter data
+        :type where: Dict[str, any]
+        :raises InvalidDimensionException: Dimensions do not match.
        :return: The content of the document that matched your query.
+        :rtype: List[str]
        """
        try:
            result = self.collection.query(
@@ -132,21 +169,27 @@ class ChromaDB(BaseVectorDB):
        return contents

    def set_collection_name(self, name: str):
+        """
+        Set the name of the collection. A collection is an isolated space for vectors.
+
+        :param name: Name of the collection.
+        :type name: str
+        """
        self.config.collection_name = name
        self._get_or_create_collection(self.config.collection_name)

    def count(self) -> int:
        """
-        Count the number of embeddings.
+        Count number of documents/chunks embedded in the database.

-        :return: The number of embeddings.
+        :return: number of documents
+        :rtype: int
        """
        return self.collection.count()

    def reset(self):
        """
        Resets the database. Deletes all embeddings irreversibly.
-        `App` does not have to be reinitialized after using this method.
        """
        # Delete all data from the database
        try:
--- a/embedchain/vectordb/elasticsearch_db.py
+++ b/embedchain/vectordb/elasticsearch_db.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Dict, List, Optional, Set

 try:
    from elasticsearch import Elasticsearch
@@ -15,16 +15,23 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB

@register_deserializable
 class ElasticsearchDB(BaseVectorDB):
+    """
+    Elasticsearch as vector database
+    """
+
    def __init__(
        self,
-        config: ElasticsearchDBConfig = None,
-        es_config: ElasticsearchDBConfig = None,  # Backwards compatibility
+        config: Optional[ElasticsearchDBConfig] = None,
+        es_config: Optional[ElasticsearchDBConfig] = None,  # Backwards compatibility
    ):
-        """
-        Elasticsearch as vector database
-        :param es_config. elasticsearch database config to be used for connection
-        :param embedding_fn: Function to generate embedding vectors.
-        :param vector_dim: Vector dimension generated by embedding fn
+        """Elasticsearch as vector database.
+
+        :param config: Elasticsearch database config, defaults to None
+        :type config: ElasticsearchDBConfig, optional
+        :param es_config: `es_config` is supported as an alias for `config` (for backwards compatibility),
+            defaults to None
+        :type es_config: ElasticsearchDBConfig, optional
+        :raises ValueError: No config provided
        """
        if config is None and es_config is None:
            raise ValueError("ElasticsearchDBConfig is required")
@@ -53,16 +60,22 @@ class ElasticsearchDB(BaseVectorDB):
            self.client.indices.create(index=es_index, body=index_settings)

    def _get_or_create_db(self):
+        """Return the database client. Called during initialization."""
        return self.client

    def _get_or_create_collection(self, name):
        """Note: nothing to return here. Discuss later"""

-    def get(self, ids: List[str], where: Dict[str, any]) -> List[str]:
+    def get(self, ids: List[str], where: Dict[str, any]) -> Set[str]:
        """
        Get existing doc ids present in vector database
-        :param ids: list of doc ids to check for existance
-        :param where: Optional. to filter data
+
+        :param ids: list of doc ids to check for existence
+        :type ids: List[str]
+        :param where: to filter data
+        :type where: Dict[str, any]
+        :return: ids
+        :rtype: Set[str]
        """
        query = {"bool": {"must": [{"ids": {"values": ids}}]}}
        if "app_id" in where:
@@ -73,13 +86,17 @@ class ElasticsearchDB(BaseVectorDB):
        ids = [doc["_id"] for doc in docs]
        return set(ids)

-    def add(self, documents: List[str], metadatas: List[object], ids: List[str]) -> Any:
-        """
-        add data in vector database
+    def add(self, documents: List[str], metadatas: List[object], ids: List[str]):
+        """Add data to the vector database.
+
        :param documents: list of texts to add
+        :type documents: List[str]
        :param metadatas: list of metadata associated with docs
+        :type metadatas: List[object]
        :param ids: ids of docs
+        :type ids: List[str]
        """
+
        docs = []
        embeddings = self.embedder.embedding_fn(documents)
        for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
@@ -92,14 +109,19 @@ class ElasticsearchDB(BaseVectorDB):
            )
        bulk(self.client, docs)
        self.client.indices.refresh(index=self._get_index())
-        return

    def query(self, input_query: List[str], n_results: int, where: Dict[str, any]) -> List[str]:
        """
        query contents from vector data base based on vector similarity
+
        :param input_query: list of query string
+        :type input_query: List[str]
        :param n_results: no of similar documents to fetch from database
+        :type n_results: int
        :param where: Optional. to filter data
+        :type where: Dict[str, any]
+        :return: Database contents that are the result of the query
+        :rtype: List[str]
        """
        input_query_vector = self.embedder.embedding_fn(input_query)
        query_vector = input_query_vector[0]
@@ -122,21 +144,41 @@ class ElasticsearchDB(BaseVectorDB):
        return contents

    def set_collection_name(self, name: str):
+        """
+        Set the name of the collection. A collection is an isolated space for vectors.
+
+        :param name: Name of the collection.
+        :type name: str
+        """
        self.config.collection_name = name

    def count(self) -> int:
+        """
+        Count number of documents/chunks embedded in the database.
+
+        :return: number of documents
+        :rtype: int
+        """
        query = {"match_all": {}}
        response = self.client.count(index=self._get_index(), query=query)
        doc_count = response["count"]
        return doc_count

    def reset(self):
+        """
+        Resets the database. Deletes all embeddings irreversibly.
+        """
        # Delete all data from the database
        if self.client.indices.exists(index=self._get_index()):
            # delete index in Es
            self.client.indices.delete(index=self._get_index())

-    def _get_index(self):
+    def _get_index(self) -> str:
+        """Get the Elasticsearch index for a collection
+
+        :return: Elasticsearch index
+        :rtype: str
+        """
        # NOTE: The method is preferred to an attribute, because if collection name changes,
        # it's always up-to-date.
        return f"{self.config.collection_name}_{self.embedder.vector_dimension}"