feat: add embedchain javascript package (#576)
This commit is contained in:
44
embedchain-js/embedchain/chunkers/BaseChunker.ts
Normal file
44
embedchain-js/embedchain/chunkers/BaseChunker.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { createHash } from 'crypto';
|
||||
import type { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import type { BaseLoader } from '../loaders';
|
||||
import type { Input, LoaderResult } from '../models';
|
||||
import type { ChunkResult } from '../models/ChunkResult';
|
||||
|
||||
/**
 * Base class that turns the output of a loader into chunk documents, chunk
 * ids and per-chunk metadata, using the text splitter supplied at
 * construction time.
 */
class BaseChunker {
  textSplitter: RecursiveCharacterTextSplitter;

  /**
   * @param textSplitter Splitter whose `splitText` is called on the content
   *   of every item returned by the loader.
   */
  constructor(textSplitter: RecursiveCharacterTextSplitter) {
    this.textSplitter = textSplitter;
  }

  /**
   * Loads data for `url` through `loader`, splits each item's `content`, and
   * returns three parallel arrays: entry `i` of `documents`, `ids` and
   * `metadatas` all describe the same chunk.
   *
   * Chunks are emitted in the order the loader returned its items, and within
   * an item in the order the splitter returned them, regardless of which
   * split finishes first.
   *
   * @param loader Source of the raw items; only `loadData(url)` is called.
   * @param url Input forwarded unchanged to `loader.loadData`.
   * @returns The collected documents, ids and metadatas.
   * @throws Rejects if `loader.loadData` or any `splitText` call rejects.
   */
  async createChunks(loader: BaseLoader, url: Input): Promise<ChunkResult> {
    const datas: LoaderResult = await loader.loadData(url);

    // All splits run concurrently. Promise.all resolves to results in input
    // order, so the output no longer depends on which split settles first.
    const splitItems = await Promise.all(
      datas.map(async ({ content, metaData }) => {
        const chunks: string[] = await this.textSplitter.splitText(content);
        return { chunks, metaData };
      })
    );

    const documents: ChunkResult['documents'] = [];
    const ids: ChunkResult['ids'] = [];
    const metadatas: ChunkResult['metadatas'] = [];

    for (const { chunks, metaData } of splitItems) {
      for (const chunk of chunks) {
        // The id is the SHA-256 hex digest of the chunk text concatenated
        // with the item's metaData.url; kept as-is so existing ids stay stable.
        const chunkId = createHash('sha256')
          .update(chunk + metaData.url)
          .digest('hex');
        ids.push(chunkId);
        documents.push(chunk);
        // Every chunk of an item shares that item's metadata object.
        metadatas.push(metaData);
      }
    }

    return {
      documents,
      ids,
      metadatas,
    };
  }
}
|
||||
|
||||
export { BaseChunker };
|
||||
26
embedchain-js/embedchain/chunkers/PdfFile.ts
Normal file
26
embedchain-js/embedchain/chunkers/PdfFile.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { BaseChunker } from './BaseChunker';
|
||||
|
||||
/**
 * Shape of the options object this module passes to the
 * `RecursiveCharacterTextSplitter` constructor. How each field is
 * interpreted is defined by the splitter itself (see the langchain docs).
 */
type TextSplitterChunkParams = {
  chunkSize: number;
  chunkOverlap: number;
  keepSeparator: boolean;
};

/** Splitter options used by the chunker defined in this module. */
const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = {
  chunkSize: 1000,
  chunkOverlap: 0,
  keepSeparator: false,
};
|
||||
|
||||
/**
 * `BaseChunker` preconfigured with a `RecursiveCharacterTextSplitter` built
 * from this module's `TEXT_SPLITTER_CHUNK_PARAMS`. Named for use with PDF
 * file content.
 */
class PdfFileChunker extends BaseChunker {
  /** Builds the splitter from the module-level params and passes it to the base class. */
  constructor() {
    super(new RecursiveCharacterTextSplitter(TEXT_SPLITTER_CHUNK_PARAMS));
  }
}
|
||||
|
||||
export { PdfFileChunker };
|
||||
26
embedchain-js/embedchain/chunkers/QnaPair.ts
Normal file
26
embedchain-js/embedchain/chunkers/QnaPair.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { BaseChunker } from './BaseChunker';
|
||||
|
||||
/**
 * Shape of the options object this module passes to the
 * `RecursiveCharacterTextSplitter` constructor. How each field is
 * interpreted is defined by the splitter itself (see the langchain docs).
 */
type TextSplitterChunkParams = {
  chunkSize: number;
  chunkOverlap: number;
  keepSeparator: boolean;
};

/** Splitter options used by the chunker defined in this module. */
const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = {
  chunkSize: 300,
  chunkOverlap: 0,
  keepSeparator: false,
};
|
||||
|
||||
/**
 * `BaseChunker` preconfigured with a `RecursiveCharacterTextSplitter` built
 * from this module's `TEXT_SPLITTER_CHUNK_PARAMS`. Named for use with
 * question/answer pair content.
 */
class QnaPairChunker extends BaseChunker {
  /** Builds the splitter from the module-level params and passes it to the base class. */
  constructor() {
    super(new RecursiveCharacterTextSplitter(TEXT_SPLITTER_CHUNK_PARAMS));
  }
}
|
||||
|
||||
export { QnaPairChunker };
|
||||
26
embedchain-js/embedchain/chunkers/WebPage.ts
Normal file
26
embedchain-js/embedchain/chunkers/WebPage.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { BaseChunker } from './BaseChunker';
|
||||
|
||||
/**
 * Shape of the options object this module passes to the
 * `RecursiveCharacterTextSplitter` constructor. How each field is
 * interpreted is defined by the splitter itself (see the langchain docs).
 */
type TextSplitterChunkParams = {
  chunkSize: number;
  chunkOverlap: number;
  keepSeparator: boolean;
};

/** Splitter options used by the chunker defined in this module. */
const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = {
  chunkSize: 500,
  chunkOverlap: 0,
  keepSeparator: false,
};
|
||||
|
||||
/**
 * `BaseChunker` preconfigured with a `RecursiveCharacterTextSplitter` built
 * from this module's `TEXT_SPLITTER_CHUNK_PARAMS`. Named for use with web
 * page content.
 */
class WebPageChunker extends BaseChunker {
  /** Builds the splitter from the module-level params and passes it to the base class. */
  constructor() {
    super(new RecursiveCharacterTextSplitter(TEXT_SPLITTER_CHUNK_PARAMS));
  }
}
|
||||
|
||||
export { WebPageChunker };
|
||||
6
embedchain-js/embedchain/chunkers/index.ts
Normal file
6
embedchain-js/embedchain/chunkers/index.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
import { BaseChunker } from './BaseChunker';
|
||||
import { PdfFileChunker } from './PdfFile';
|
||||
import { QnaPairChunker } from './QnaPair';
|
||||
import { WebPageChunker } from './WebPage';
|
||||
|
||||
export { BaseChunker, PdfFileChunker, QnaPairChunker, WebPageChunker };
|
||||
Reference in New Issue
Block a user