From 51adc5c886c04f99f2f55645acf6d906eb168f31 Mon Sep 17 00:00:00 2001 From: cachho Date: Fri, 7 Jul 2023 12:44:44 +0200 Subject: [PATCH] refactor: Use src instead of url as argument value (#111) --- embedchain/chunkers/base_chunker.py | 14 +++++++++++--- embedchain/embedchain.py | 12 ++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py index bcd5a86d..9d4c034f 100644 --- a/embedchain/chunkers/base_chunker.py +++ b/embedchain/chunkers/base_chunker.py @@ -5,16 +5,24 @@ class BaseChunker: def __init__(self, text_splitter): self.text_splitter = text_splitter - def create_chunks(self, loader, url): + def create_chunks(self, loader, src): + """ + Loads data and chunks it. + + :param loader: The loader which's `load_data` method is used to create the raw data. + :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders. + """ documents = [] ids = [] - datas = loader.load_data(url) + datas = loader.load_data(src) metadatas = [] for data in datas: content = data["content"] meta_data = data["meta_data"] - chunks = self.text_splitter.split_text(content) url = meta_data["url"] + + chunks = self.text_splitter.split_text(content) + for chunk in chunks: chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() ids.append(chunk_id) diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 5cf833eb..66cc1e56 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -121,22 +121,22 @@ class EmbedChain: self.user_asks.append([data_type, content]) self.load_and_embed(loader, chunker, content) - def load_and_embed(self, loader, chunker, url): + def load_and_embed(self, loader, chunker, src): """ Loads the data from the given URL, chunks it, and adds it to the database. :param loader: The loader to use to load the data. :param chunker: The chunker to use to chunk the data. - :param url: The URL where the data is located. + :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders. """ - embeddings_data = chunker.create_chunks(loader, url) + embeddings_data = chunker.create_chunks(loader, src) documents = embeddings_data["documents"] metadatas = embeddings_data["metadatas"] ids = embeddings_data["ids"] # get existing ids, and discard doc if any common id exist. existing_docs = self.collection.get( ids=ids, - # where={"url": url} + # where={"url": src} ) existing_ids = set(existing_docs["ids"]) @@ -145,7 +145,7 @@ class EmbedChain: data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids} if not data_dict: - print(f"All data from {url} already exists in the database.") + print(f"All data from {src} already exists in the database.") return ids = list(data_dict.keys()) @@ -156,7 +156,7 @@ class EmbedChain: metadatas=list(metadatas), ids=ids ) - print(f"Successfully saved {url}. Total chunks count: {self.collection.count()}") + print(f"Successfully saved {src}. Total chunks count: {self.collection.count()}") def _format_result(self, results): return [