docs: update docstrings (#565)

2023-09-07 02:04:44 +02:00
parent 4754372fcd
commit 1ac8aef4de
25 changed files with 736 additions and 298 deletions
--- a/embedchain/vectordb/elasticsearch_db.py
+++ b/embedchain/vectordb/elasticsearch_db.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List
+from typing import Dict, List, Optional, Set

 try:
    from elasticsearch import Elasticsearch
@@ -15,16 +15,23 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB

@register_deserializable
 class ElasticsearchDB(BaseVectorDB):
+    """
+    Elasticsearch as vector database
+    """
+
    def __init__(
        self,
-        config: ElasticsearchDBConfig = None,
-        es_config: ElasticsearchDBConfig = None,  # Backwards compatibility
+        config: Optional[ElasticsearchDBConfig] = None,
+        es_config: Optional[ElasticsearchDBConfig] = None,  # Backwards compatibility
    ):
-        """
-        Elasticsearch as vector database
-        :param es_config. elasticsearch database config to be used for connection
-        :param embedding_fn: Function to generate embedding vectors.
-        :param vector_dim: Vector dimension generated by embedding fn
+        """Elasticsearch as vector database.
+
+        :param config: Elasticsearch database config, defaults to None
+        :type config: ElasticsearchDBConfig, optional
+        :param es_config: `es_config` is supported as an alias for `config` (for backwards compatibility),
+        defaults to None
+        :type es_config: ElasticsearchDBConfig, optional
+        :raises ValueError: No config provided
        """
        if config is None and es_config is None:
            raise ValueError("ElasticsearchDBConfig is required")
@@ -53,16 +60,22 @@ class ElasticsearchDB(BaseVectorDB):
            self.client.indices.create(index=es_index, body=index_settings)

    def _get_or_create_db(self):
+        """Called during initialization"""
        return self.client

    def _get_or_create_collection(self, name):
        """Note: nothing to return here. Discuss later"""

-    def get(self, ids: List[str], where: Dict[str, any]) -> List[str]:
+    def get(self, ids: List[str], where: Dict[str, any]) -> Set[str]:
        """
        Get existing doc ids present in vector database
-        :param ids: list of doc ids to check for existance
-        :param where: Optional. to filter data
+
+        :param ids: list of doc ids to check for existence
+        :type ids: List[str]
+        :param where: to filter data
+        :type where: Dict[str, any]
+        :return: ids
+        :rtype: Set[str]
        """
        query = {"bool": {"must": [{"ids": {"values": ids}}]}}
        if "app_id" in where:
@@ -73,13 +86,17 @@ class ElasticsearchDB(BaseVectorDB):
        ids = [doc["_id"] for doc in docs]
        return set(ids)

-    def add(self, documents: List[str], metadatas: List[object], ids: List[str]) -> Any:
-        """
-        add data in vector database
+    def add(self, documents: List[str], metadatas: List[object], ids: List[str]):
+        """Add data to the vector database
+
        :param documents: list of texts to add
+        :type documents: List[str]
        :param metadatas: list of metadata associated with docs
+        :type metadatas: List[object]
        :param ids: ids of docs
+        :type ids: List[str]
        """
+
        docs = []
        embeddings = self.embedder.embedding_fn(documents)
        for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
@@ -92,14 +109,19 @@ class ElasticsearchDB(BaseVectorDB):
            )
        bulk(self.client, docs)
        self.client.indices.refresh(index=self._get_index())
-        return

    def query(self, input_query: List[str], n_results: int, where: Dict[str, any]) -> List[str]:
        """
        query contents from vector data base based on vector similarity
+
        :param input_query: list of query string
+        :type input_query: List[str]
        :param n_results: no of similar documents to fetch from database
+        :type n_results: int
        :param where: Optional. to filter data
+        :type where: Dict[str, any]
+        :return: Database contents that are the result of the query
+        :rtype: List[str]
        """
        input_query_vector = self.embedder.embedding_fn(input_query)
        query_vector = input_query_vector[0]
@@ -122,21 +144,41 @@ class ElasticsearchDB(BaseVectorDB):
        return contents

    def set_collection_name(self, name: str):
+        """
+        Set the name of the collection. A collection is an isolated space for vectors.
+
+        :param name: Name of the collection.
+        :type name: str
+        """
        self.config.collection_name = name

    def count(self) -> int:
+        """
+        Count number of documents/chunks embedded in the database.
+
+        :return: number of documents
+        :rtype: int
+        """
        query = {"match_all": {}}
        response = self.client.count(index=self._get_index(), query=query)
        doc_count = response["count"]
        return doc_count

    def reset(self):
+        """
+        Resets the database. Deletes all embeddings irreversibly.
+        """
        # Delete all data from the database
        if self.client.indices.exists(index=self._get_index()):
            # delete index in Es
            self.client.indices.delete(index=self._get_index())

-    def _get_index(self):
+    def _get_index(self) -> str:
+        """Get the Elasticsearch index for a collection
+
+        :return: Elasticsearch index
+        :rtype: str
+        """
        # NOTE: The method is preferred to an attribute, because if collection name changes,
        # it's always up-to-date.
        return f"{self.config.collection_name}_{self.embedder.vector_dimension}"