[Feature] Improve GitHub and YouTube channel loader (#966)
Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from chromadb import Collection, QueryResult
|
||||
from langchain.docstore.document import Document
|
||||
from tqdm import tqdm
|
||||
|
||||
from embedchain.config import ChromaDbConfig
|
||||
from embedchain.helper.json_serializable import register_deserializable
|
||||
@@ -157,12 +158,7 @@ class ChromaDB(BaseVectorDB):
|
||||
" Ids size: {}".format(len(documents), len(metadatas), len(ids))
|
||||
)
|
||||
|
||||
for i in range(0, len(documents), self.BATCH_SIZE):
|
||||
print(
|
||||
"Inserting batches from {} to {} in vector database.".format(
|
||||
i, min(len(documents), i + self.BATCH_SIZE)
|
||||
)
|
||||
)
|
||||
for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in chromadb"):
|
||||
if skip_embedding:
|
||||
self.collection.add(
|
||||
embeddings=embeddings[i : i + self.BATCH_SIZE],
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
try:
|
||||
from opensearchpy import OpenSearch
|
||||
from opensearchpy.helpers import bulk
|
||||
@@ -23,6 +26,8 @@ class OpenSearchDB(BaseVectorDB):
|
||||
OpenSearch as vector database
|
||||
"""
|
||||
|
||||
BATCH_SIZE = 100
|
||||
|
||||
def __init__(self, config: OpenSearchDBConfig):
|
||||
"""OpenSearch as vector database.
|
||||
|
||||
@@ -131,19 +136,28 @@ class OpenSearchDB(BaseVectorDB):
|
||||
:type skip_embedding: bool
|
||||
"""
|
||||
|
||||
docs = []
|
||||
if not skip_embedding:
|
||||
embeddings = self.embedder.embedding_fn(documents)
|
||||
for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
|
||||
docs.append(
|
||||
{
|
||||
"_index": self._get_index(),
|
||||
"_id": id,
|
||||
"_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
|
||||
}
|
||||
)
|
||||
bulk(self.client, docs)
|
||||
self.client.indices.refresh(index=self._get_index())
|
||||
for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
|
||||
if not skip_embedding:
|
||||
embeddings = self.embedder.embedding_fn(documents[i : i + self.BATCH_SIZE])
|
||||
|
||||
docs = []
|
||||
for id, text, metadata, embeddings in zip(
|
||||
ids[i : i + self.BATCH_SIZE],
|
||||
documents[i : i + self.BATCH_SIZE],
|
||||
metadatas[i : i + self.BATCH_SIZE],
|
||||
embeddings[i : i + self.BATCH_SIZE],
|
||||
):
|
||||
docs.append(
|
||||
{
|
||||
"_index": self._get_index(),
|
||||
"_id": id,
|
||||
"_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
|
||||
}
|
||||
)
|
||||
bulk(self.client, docs)
|
||||
self.client.indices.refresh(index=self._get_index())
|
||||
# Sleep for 0.1 seconds to avoid rate limiting
|
||||
time.sleep(0.1)
|
||||
|
||||
def query(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user