[Feature] Improve github and youtube channel loader (#966)

Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
Deshraj Yadav
2023-11-17 18:25:14 -08:00
committed by GitHub
parent 51df00729e
commit 9fcf2130b5
13 changed files with 117 additions and 268 deletions

View File

@@ -1,6 +1,9 @@
import logging
import time
from typing import Dict, List, Optional, Set, Tuple, Union
from tqdm import tqdm
try:
from opensearchpy import OpenSearch
from opensearchpy.helpers import bulk
@@ -23,6 +26,8 @@ class OpenSearchDB(BaseVectorDB):
OpenSearch as vector database
"""
BATCH_SIZE = 100
def __init__(self, config: OpenSearchDBConfig):
"""OpenSearch as vector database.
@@ -131,19 +136,28 @@ class OpenSearchDB(BaseVectorDB):
:type skip_embedding: bool
"""
docs = []
if not skip_embedding:
embeddings = self.embedder.embedding_fn(documents)
for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
docs.append(
{
"_index": self._get_index(),
"_id": id,
"_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
}
)
bulk(self.client, docs)
self.client.indices.refresh(index=self._get_index())
for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
if not skip_embedding:
embeddings = self.embedder.embedding_fn(documents[i : i + self.BATCH_SIZE])
docs = []
for id, text, metadata, embeddings in zip(
ids[i : i + self.BATCH_SIZE],
documents[i : i + self.BATCH_SIZE],
metadatas[i : i + self.BATCH_SIZE],
embeddings[i : i + self.BATCH_SIZE],
):
docs.append(
{
"_index": self._get_index(),
"_id": id,
"_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
}
)
bulk(self.client, docs)
self.client.indices.refresh(index=self._get_index())
# Sleep for 0.1 seconds to avoid rate limiting
time.sleep(0.1)
def query(
self,