[Bug fix] Fix typos, static methods and other sanity improvements in the package (#1129)

This commit is contained in:
Sandra Serrano
2024-01-08 19:47:46 +01:00
committed by GitHub
parent 62c0c52e31
commit 2496ed133e
41 changed files with 133 additions and 103 deletions

View File

@@ -17,7 +17,7 @@ class BaseChunker(JSONSerializable):
"""
Loads data and chunks it.
:param loader: The loader which's `load_data` method is used to create
:param loader: The loader whose `load_data` method is used to create
the raw data.
:param src: The data to be handled by the loader. Can be a URL for
remote sources or local content for local loaders.
@@ -25,7 +25,7 @@ class BaseChunker(JSONSerializable):
"""
documents = []
chunk_ids = []
idMap = {}
id_map = {}
min_chunk_size = config.min_chunk_size if config is not None else 1
logging.info(f"[INFO] Skipping chunks smaller than {min_chunk_size} characters")
data_result = loader.load_data(src)
@@ -49,8 +49,8 @@ class BaseChunker(JSONSerializable):
for chunk in chunks:
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
chunk_id = f"{app_id}--{chunk_id}" if app_id is not None else chunk_id
if idMap.get(chunk_id) is None and len(chunk) >= min_chunk_size:
idMap[chunk_id] = True
if id_map.get(chunk_id) is None and len(chunk) >= min_chunk_size:
id_map[chunk_id] = True
chunk_ids.append(chunk_id)
documents.append(chunk)
metadatas.append(meta_data)
@@ -77,5 +77,6 @@ class BaseChunker(JSONSerializable):
# TODO: This should be done during initialization. This means it has to be done in the child classes.
def get_word_count(self, documents):
@staticmethod
def get_word_count(documents) -> int:
return sum([len(document.split(" ")) for document in documents])