feat: add embedchain javascript package (#576)

This commit is contained in:
Taranjeet Singh
2023-09-06 17:22:44 -07:00
committed by GitHub
parent f582d70031
commit 3c3d98b9c3
44 changed files with 20073 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
import type { Input, LoaderResult } from '../models';
/**
 * Abstract base class for data loaders.
 *
 * It declares a single asynchronous entry point, `loadData`, which concrete
 * loaders must implement.
 */
export abstract class BaseLoader {
/**
 * Loads the data identified by `src`.
 *
 * @param src - The input to load from; its accepted shapes are defined by
 *   the `Input` type in `../models`.
 * @returns A promise resolving to a `LoaderResult` (defined in `../models`).
 */
abstract loadData(src: Input): Promise<LoaderResult>;
}

View File

@@ -0,0 +1,21 @@
import type { LoaderResult, QnaPair } from '../models';
import { BaseLoader } from './BaseLoader';
/**
 * Loader for an in-memory question/answer pair.
 *
 * The pair is rendered into a single text chunk of the form
 * `Q: <question>\nA: <answer>` and tagged with the fixed url `'local'`.
 */
class LocalQnaPairLoader extends BaseLoader {
  /**
   * Converts a question/answer pair into a one-element loader result.
   *
   * @param content - The pair to load; its first element is used as the
   *   question and its second as the answer.
   * @returns A promise resolving to a single chunk holding the formatted text
   *   and its metadata.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(content: QnaPair): Promise<LoaderResult> {
    const [q, a] = content;
    return [
      {
        content: `Q: ${q}\nA: ${a}`,
        metaData: { url: 'local' },
      },
    ];
  }
}
export { LocalQnaPairLoader };

View File

@@ -0,0 +1,58 @@
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
import type { LoaderResult, Metadata } from '../models';
import { cleanString } from '../utils';
import { BaseLoader } from './BaseLoader';
const pdfjsLib = require('pdfjs-dist');
/**
 * Text extracted from a single page of a PDF document.
 */
interface Page {
// Text items of the page joined with single spaces.
page_content: string;
}
/**
 * Loader that extracts the text of every page of a PDF document and returns
 * one cleaned chunk per page.
 */
class PdfFileLoader extends BaseLoader {
  /**
   * Opens the PDF at `url` with pdf.js and collects the text of each page.
   *
   * Pages are fetched concurrently; the resulting array is ordered by page
   * number. The loading task is always destroyed afterwards so the pdf.js
   * worker and document resources are released even when loading or text
   * extraction fails.
   *
   * @param url - Location of the PDF, passed straight to `getDocument`.
   * @returns A promise resolving to one `Page` per PDF page.
   * @throws Whatever pdf.js rejects with when the document or a page cannot
   *   be read.
   */
  static async getPagesFromPdf(url: string): Promise<Page[]> {
    const loadingTask = pdfjsLib.getDocument(url);
    try {
      const pdf = await loadingTask.promise;
      const { numPages } = pdf;
      const promises = Array.from({ length: numPages }, async (_, i) => {
        // Array indices are 0-based, so page i + 1 is requested.
        const page = await pdf.getPage(i + 1);
        const pageText: TextContent = await page.getTextContent();
        // Items without a `str` field contribute an empty string.
        const pageContent: string = pageText.items
          .map((item) => ('str' in item ? item.str : ''))
          .join(' ');
        return {
          page_content: pageContent,
        };
      });
      // `return await` keeps the document alive until every page is read;
      // only then does the `finally` block tear it down.
      return await Promise.all(promises);
    } finally {
      await loadingTask.destroy();
    }
  }

  /**
   * Loads a PDF and returns one cleaned text chunk per page.
   *
   * @param url - Location of the PDF; also recorded in each chunk's metadata.
   * @returns A promise resolving to the per-page chunks, in page order.
   * @throws Error with message `'No data found'` when the PDF has no pages.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(url: string): Promise<LoaderResult> {
    const pages: Page[] = await PdfFileLoader.getPagesFromPdf(url);
    if (!pages.length) {
      throw new Error('No data found');
    }
    return pages.map((page) => {
      // A fresh metadata object per chunk, so chunks never share state.
      const metaData: Metadata = {
        url,
      };
      return {
        content: cleanString(page.page_content),
        metaData,
      };
    });
  }
}
export { PdfFileLoader };

View File

@@ -0,0 +1,51 @@
import axios from 'axios';
import { JSDOM } from 'jsdom';
import { cleanString } from '../utils';
import { BaseLoader } from './BaseLoader';
/**
 * Loader that downloads a web page and returns its visible text as a single
 * cleaned chunk.
 */
class WebPageLoader extends BaseLoader {
  /**
   * Fetches `url`, strips boilerplate elements, and returns the remaining
   * body text.
   *
   * @param url - Address of the page to fetch; also recorded in the chunk's
   *   metadata.
   * @returns A promise resolving to a one-element array holding the cleaned
   *   page text and its metadata.
   * @throws Error with message `'Web page content is empty.'` when the body
   *   has no text left (missing, empty, or whitespace-only) after stripping.
   * @throws Whatever `axios.get` rejects with when the request fails.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(url: string) {
    const response = await axios.get(url);
    const html = response.data;
    const dom = new JSDOM(html);
    const { document } = dom.window;

    // Elements whose text is navigation, chrome, or non-content markup.
    const unwantedTags = [
      'nav',
      'aside',
      'form',
      'header',
      'noscript',
      'svg',
      'canvas',
      'footer',
      'script',
      'style',
    ];
    unwantedTags.forEach((tagName) => {
      // Snapshot the live collection before mutating the document.
      for (const element of Array.from(document.getElementsByTagName(tagName))) {
        // Replace with a space rather than '' so neighbouring words stay apart.
        element.textContent = ' ';
      }
    });

    const rawText = document.body.textContent;
    // Each stripped element leaves a single space behind, so a page made only
    // of boilerplate has whitespace-only text rather than ''. Treat it as
    // empty too instead of emitting a blank chunk.
    if (!rawText || rawText.trim() === '') {
      throw new Error('Web page content is empty.');
    }

    const content = cleanString(rawText);
    const metaData = {
      url,
    };
    return [
      {
        content,
        metaData,
      },
    ];
  }
}
export { WebPageLoader };

View File

@@ -0,0 +1,6 @@
import { BaseLoader } from './BaseLoader';
import { LocalQnaPairLoader } from './LocalQnaPair';
import { PdfFileLoader } from './PdfFile';
import { WebPageLoader } from './WebPage';
export { BaseLoader, LocalQnaPairLoader, PdfFileLoader, WebPageLoader };