refactor: Use src instead of url as argument value (#111)
This commit is contained in:
@@ -5,16 +5,24 @@ class BaseChunker:
|
|||||||
def __init__(self, text_splitter):
|
def __init__(self, text_splitter):
|
||||||
self.text_splitter = text_splitter
|
self.text_splitter = text_splitter
|
||||||
|
|
||||||
def create_chunks(self, loader, url):
|
def create_chunks(self, loader, src):
|
||||||
|
"""
|
||||||
|
Loads data and chunks it.
|
||||||
|
|
||||||
|
:param loader: The loader which's `load_data` method is used to create the raw data.
|
||||||
|
:param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
|
||||||
|
"""
|
||||||
documents = []
|
documents = []
|
||||||
ids = []
|
ids = []
|
||||||
datas = loader.load_data(url)
|
datas = loader.load_data(src)
|
||||||
metadatas = []
|
metadatas = []
|
||||||
for data in datas:
|
for data in datas:
|
||||||
content = data["content"]
|
content = data["content"]
|
||||||
meta_data = data["meta_data"]
|
meta_data = data["meta_data"]
|
||||||
chunks = self.text_splitter.split_text(content)
|
|
||||||
url = meta_data["url"]
|
url = meta_data["url"]
|
||||||
|
|
||||||
|
chunks = self.text_splitter.split_text(content)
|
||||||
|
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
|
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
|
||||||
ids.append(chunk_id)
|
ids.append(chunk_id)
|
||||||
|
|||||||
@@ -121,22 +121,22 @@ class EmbedChain:
|
|||||||
self.user_asks.append([data_type, content])
|
self.user_asks.append([data_type, content])
|
||||||
self.load_and_embed(loader, chunker, content)
|
self.load_and_embed(loader, chunker, content)
|
||||||
|
|
||||||
def load_and_embed(self, loader, chunker, url):
|
def load_and_embed(self, loader, chunker, src):
|
||||||
"""
|
"""
|
||||||
Loads the data from the given URL, chunks it, and adds it to the database.
|
Loads the data from the given URL, chunks it, and adds it to the database.
|
||||||
|
|
||||||
:param loader: The loader to use to load the data.
|
:param loader: The loader to use to load the data.
|
||||||
:param chunker: The chunker to use to chunk the data.
|
:param chunker: The chunker to use to chunk the data.
|
||||||
:param url: The URL where the data is located.
|
:param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
|
||||||
"""
|
"""
|
||||||
embeddings_data = chunker.create_chunks(loader, url)
|
embeddings_data = chunker.create_chunks(loader, src)
|
||||||
documents = embeddings_data["documents"]
|
documents = embeddings_data["documents"]
|
||||||
metadatas = embeddings_data["metadatas"]
|
metadatas = embeddings_data["metadatas"]
|
||||||
ids = embeddings_data["ids"]
|
ids = embeddings_data["ids"]
|
||||||
# get existing ids, and discard doc if any common id exist.
|
# get existing ids, and discard doc if any common id exist.
|
||||||
existing_docs = self.collection.get(
|
existing_docs = self.collection.get(
|
||||||
ids=ids,
|
ids=ids,
|
||||||
# where={"url": url}
|
# where={"url": src}
|
||||||
)
|
)
|
||||||
existing_ids = set(existing_docs["ids"])
|
existing_ids = set(existing_docs["ids"])
|
||||||
|
|
||||||
@@ -145,7 +145,7 @@ class EmbedChain:
|
|||||||
data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids}
|
data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids}
|
||||||
|
|
||||||
if not data_dict:
|
if not data_dict:
|
||||||
print(f"All data from {url} already exists in the database.")
|
print(f"All data from {src} already exists in the database.")
|
||||||
return
|
return
|
||||||
|
|
||||||
ids = list(data_dict.keys())
|
ids = list(data_dict.keys())
|
||||||
@@ -156,7 +156,7 @@ class EmbedChain:
|
|||||||
metadatas=list(metadatas),
|
metadatas=list(metadatas),
|
||||||
ids=ids
|
ids=ids
|
||||||
)
|
)
|
||||||
print(f"Successfully saved {url}. Total chunks count: {self.collection.count()}")
|
print(f"Successfully saved {src}. Total chunks count: {self.collection.count()}")
|
||||||
|
|
||||||
def _format_result(self, results):
|
def _format_result(self, results):
|
||||||
return [
|
return [
|
||||||
|
|||||||
Reference in New Issue
Block a user