From a94bd11a768d99d606706bbe08967aedd91cf6b3 Mon Sep 17 00:00:00 2001 From: Tibor Sloboda Date: Tue, 27 Aug 2024 13:26:01 +0200 Subject: [PATCH] Distance metric change and PGVectorScale support (#1703) --- docs/components/vectordbs/dbs/pgvector.mdx | 3 ++- mem0/configs/vector_stores/pgvector.py | 1 + mem0/vector_stores/configs.py | 3 ++- mem0/vector_stores/pgvector.py | 22 +++++++++++++++++++--- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/docs/components/vectordbs/dbs/pgvector.mdx b/docs/components/vectordbs/dbs/pgvector.mdx index 6aa8de05..57c4bdee 100644 --- a/docs/components/vectordbs/dbs/pgvector.mdx +++ b/docs/components/vectordbs/dbs/pgvector.mdx @@ -36,4 +36,5 @@ Here's the parameters available for configuring pgvector: | `user` | User name to connect to the database | `None` | | `password` | Password to connect to the database | `None` | | `host` | The host where the Postgres server is running | `None` | -| `port` | The port where the Postgres server is running | `None` | \ No newline at end of file +| `port` | The port where the Postgres server is running | `None` | +| `diskann` | Whether to use diskann for vector similarity search (requires pgvectorscale) | `True` | \ No newline at end of file diff --git a/mem0/configs/vector_stores/pgvector.py b/mem0/configs/vector_stores/pgvector.py index 839e707f..1ed59c75 100644 --- a/mem0/configs/vector_stores/pgvector.py +++ b/mem0/configs/vector_stores/pgvector.py @@ -14,6 +14,7 @@ class PGVectorConfig(BaseModel): password: Optional[str] = Field(None, description="Database password") host: Optional[str] = Field(None, description="Database host. Default is localhost") port: Optional[int] = Field(None, description="Database port. 
Default is 1536") + diskann: Optional[bool] = Field(True, description="Use diskann for approximate nearest neighbors search") @model_validator(mode="before") def check_auth_and_connection(cls, values): diff --git a/mem0/vector_stores/configs.py b/mem0/vector_stores/configs.py index fc6b102e..3b2eb293 100644 --- a/mem0/vector_stores/configs.py +++ b/mem0/vector_stores/configs.py @@ -39,7 +39,8 @@ class VectorStoreConfig(BaseModel): raise ValueError(f"Invalid config type for provider {provider}") return self - if "path" not in config: + # also check if path in allowed keys for pydantic model, and whether config extra fields are allowed + if "path" not in config and "path" in config_class.__annotations__: config["path"] = f"/tmp/{provider}" self.config = config_class(**config) diff --git a/mem0/vector_stores/pgvector.py b/mem0/vector_stores/pgvector.py index ad92624e..180792fd 100644 --- a/mem0/vector_stores/pgvector.py +++ b/mem0/vector_stores/pgvector.py @@ -22,7 +22,7 @@ class OutputData(BaseModel): class PGVector(VectorStoreBase): def __init__( - self, dbname, collection_name, embedding_model_dims, user, password, host, port + self, dbname, collection_name, embedding_model_dims, user, password, host, port, diskann ): """ Initialize the PGVector database. @@ -35,8 +35,10 @@ class PGVector(VectorStoreBase): password (str): Database password host (str, optional): Database host port (int, optional): Database port + diskann (bool, optional): Use DiskANN for faster search """ self.collection_name = collection_name + self.use_diskann = diskann self.conn = psycopg2.connect( dbname=dbname, user=user, password=password, host=host, port=port @@ -50,6 +52,7 @@ class PGVector(VectorStoreBase): def create_col(self, embedding_model_dims): """ Create a new collection (table in PostgreSQL). + Will also initialize DiskANN index if the extension is installed. Args: name (str): Name of the collection. 
@@ -64,6 +67,19 @@ class PGVector(VectorStoreBase): ); """ ) + + if self.use_diskann and embedding_model_dims < 2000: + # Check if vectorscale extension is installed + self.cur.execute("SELECT * FROM pg_extension WHERE extname = 'vectorscale'") + if self.cur.fetchone(): + # Create DiskANN index if extension is installed for faster search + self.cur.execute(f""" + CREATE INDEX IF NOT EXISTS {self.collection_name}_vector_idx + ON {self.collection_name} + USING diskann (vector); + """ + ) + self.conn.commit() def insert(self, vectors, payloads=None, ids=None): @@ -114,13 +130,13 @@ class PGVector(VectorStoreBase): self.cur.execute( f""" - SELECT id, vector <-> %s::vector AS distance, payload + SELECT id, vector <=> %s::vector AS distance, payload FROM {self.collection_name} {filter_clause} ORDER BY distance LIMIT %s """, - (query, *filter_params, limit), + (query, *filter_params, limit), ) results = self.cur.fetchall()