feat: add embedchain javascript package (#576)
This commit is contained in:
5
embedchain-js/embedchain/loaders/BaseLoader.ts
Normal file
5
embedchain-js/embedchain/loaders/BaseLoader.ts
Normal file
@@ -0,0 +1,5 @@
|
||||
import type { Input, LoaderResult } from '../models';
|
||||
|
||||
/**
 * Common contract for every data loader in this package.
 *
 * Concrete loaders extend this class and provide `loadData`.
 */
export abstract class BaseLoader {
  /**
   * Loads the data identified by `src`.
   *
   * @param src - Loader-specific input describing what to load.
   * @returns A promise resolving to the loaded result.
   */
  public abstract loadData(src: Input): Promise<LoaderResult>;
}
|
||||
21
embedchain-js/embedchain/loaders/LocalQnaPair.ts
Normal file
21
embedchain-js/embedchain/loaders/LocalQnaPair.ts
Normal file
@@ -0,0 +1,21 @@
|
||||
import type { LoaderResult, QnaPair } from '../models';
|
||||
import { BaseLoader } from './BaseLoader';
|
||||
|
||||
/**
 * Loader that converts an in-memory question/answer pair into a single
 * text entry.
 */
class LocalQnaPairLoader extends BaseLoader {
  /**
   * Formats a question/answer pair as one entry.
   *
   * @param content - Pair whose first element is the question and whose
   * second element is the answer.
   * @returns A one-element result: the text `Q: <question>`, a newline, then
   * `A: <answer>`, with the metadata `url` set to the fixed value `'local'`.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(content: QnaPair): Promise<LoaderResult> {
    const [askedQuestion, givenAnswer] = content;

    return [
      {
        content: `Q: ${askedQuestion}\nA: ${givenAnswer}`,
        // There is no remote source for a locally supplied pair.
        metaData: { url: 'local' },
      },
    ];
  }
}
|
||||
|
||||
export { LocalQnaPairLoader };
|
||||
58
embedchain-js/embedchain/loaders/PdfFile.ts
Normal file
58
embedchain-js/embedchain/loaders/PdfFile.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
|
||||
|
||||
import type { LoaderResult, Metadata } from '../models';
|
||||
import { cleanString } from '../utils';
|
||||
import { BaseLoader } from './BaseLoader';
|
||||
|
||||
const pdfjsLib = require('pdfjs-dist');
|
||||
|
||||
/** Text extracted from a single PDF page. */
interface Page {
  /** Text items of the page joined with single spaces. */
  page_content: string;
}
|
||||
|
||||
/**
 * Loader that extracts the text of a PDF document, one entry per page.
 */
class PdfFileLoader extends BaseLoader {
  /**
   * Reads the text content of every page of a PDF.
   *
   * @param url - Source of the PDF, passed unchanged to `pdfjsLib.getDocument`.
   * @returns One `Page` per PDF page, in page order. Each page's text items
   * are joined with a single space; items without a `str` field contribute
   * an empty string.
   * @throws Whatever pdf.js rejects with when the document or a page cannot
   * be loaded.
   */
  static async getPagesFromPdf(url: string): Promise<Page[]> {
    const loadingTask = pdfjsLib.getDocument(url);

    try {
      const pdf = await loadingTask.promise;
      const { numPages } = pdf;

      // Pages are requested concurrently; Promise.all preserves page order.
      const promises = Array.from({ length: numPages }, async (_, i) => {
        // pdf.js page numbers are 1-based.
        const page = await pdf.getPage(i + 1);
        const pageText: TextContent = await page.getTextContent();
        const pageContent: string = pageText.items
          .map((item) => ('str' in item ? item.str : ''))
          .join(' ');

        return {
          page_content: pageContent,
        };
      });

      // Awaited inside `try` so the document is only destroyed after every
      // page has been read.
      return await Promise.all(promises);
    } finally {
      // Release the document and its worker resources; previously the
      // loading task was never destroyed and leaked on every call.
      await loadingTask.destroy();
    }
  }

  /**
   * Loads a PDF and returns its cleaned text, one entry per page.
   *
   * @param url - Source of the PDF; also recorded as each entry's
   * metadata `url`.
   * @returns One entry per page, with the page text passed through
   * `cleanString`.
   * @throws Error with message `No data found` when the PDF has no pages.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(url: string): Promise<LoaderResult> {
    const pages: Page[] = await PdfFileLoader.getPagesFromPdf(url);

    if (!pages.length) {
      throw new Error('No data found');
    }

    return pages.map((page) => {
      const metaData: Metadata = {
        url,
      };

      return {
        content: cleanString(page.page_content),
        metaData,
      };
    });
  }
}
|
||||
|
||||
export { PdfFileLoader };
|
||||
51
embedchain-js/embedchain/loaders/WebPage.ts
Normal file
51
embedchain-js/embedchain/loaders/WebPage.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
import axios from 'axios';
|
||||
import { JSDOM } from 'jsdom';
|
||||
|
||||
import { cleanString } from '../utils';
|
||||
import { BaseLoader } from './BaseLoader';
|
||||
|
||||
/**
 * Loader that downloads a web page and extracts its visible text.
 */
class WebPageLoader extends BaseLoader {
  /**
   * Fetches `url`, blanks out non-content elements and returns the cleaned
   * body text.
   *
   * @param url - Address of the page to fetch; also recorded as the
   * entry's metadata `url`.
   * @returns A one-element array holding the cleaned text and its metadata.
   * @throws Error with message `Web page content is empty.` when the body
   * has no text content.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(url: string) {
    const { data: markup } = await axios.get(url);
    const { document } = new JSDOM(markup).window;

    // Tags whose contents are navigation, chrome or non-text and should not
    // end up in the extracted text.
    const tagsToBlank = [
      'nav',
      'aside',
      'form',
      'header',
      'noscript',
      'svg',
      'canvas',
      'footer',
      'script',
      'style',
    ];

    tagsToBlank.forEach((tag) => {
      // Snapshot the live collection before mutating the matched nodes.
      const matches = Array.from(document.getElementsByTagName(tag));
      matches.forEach((node) => {
        // eslint-disable-next-line no-param-reassign
        (node as HTMLElement).textContent = ' ';
      });
    });

    const bodyText = document.body.textContent;
    if (!bodyText) {
      throw new Error('Web page content is empty.');
    }

    return [
      {
        content: cleanString(bodyText),
        metaData: {
          url,
        },
      },
    ];
  }
}
|
||||
|
||||
export { WebPageLoader };
|
||||
6
embedchain-js/embedchain/loaders/index.ts
Normal file
6
embedchain-js/embedchain/loaders/index.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
import { BaseLoader } from './BaseLoader';
|
||||
import { LocalQnaPairLoader } from './LocalQnaPair';
|
||||
import { PdfFileLoader } from './PdfFile';
|
||||
import { WebPageLoader } from './WebPage';
|
||||
|
||||
export { BaseLoader, LocalQnaPairLoader, PdfFileLoader, WebPageLoader };
|
||||
Reference in New Issue
Block a user