feat: add embedchain javascript package (#576)
This commit is contained in:
51
embedchain-js/embedchain/loaders/WebPage.ts
Normal file
51
embedchain-js/embedchain/loaders/WebPage.ts
Normal file
@@ -0,0 +1,51 @@
|
||||
import axios from 'axios';
|
||||
import { JSDOM } from 'jsdom';
|
||||
|
||||
import { cleanString } from '../utils';
|
||||
import { BaseLoader } from './BaseLoader';
|
||||
|
||||
class WebPageLoader extends BaseLoader {
|
||||
// eslint-disable-next-line class-methods-use-this
|
||||
async loadData(url: string) {
|
||||
const response = await axios.get(url);
|
||||
const html = response.data;
|
||||
const dom = new JSDOM(html);
|
||||
const { document } = dom.window;
|
||||
const unwantedTags = [
|
||||
'nav',
|
||||
'aside',
|
||||
'form',
|
||||
'header',
|
||||
'noscript',
|
||||
'svg',
|
||||
'canvas',
|
||||
'footer',
|
||||
'script',
|
||||
'style',
|
||||
];
|
||||
unwantedTags.forEach((tagName) => {
|
||||
const elements = document.getElementsByTagName(tagName);
|
||||
Array.from(elements).forEach((element) => {
|
||||
// eslint-disable-next-line no-param-reassign
|
||||
(element as HTMLElement).textContent = ' ';
|
||||
});
|
||||
});
|
||||
|
||||
const output = [];
|
||||
let content = document.body.textContent;
|
||||
if (!content) {
|
||||
throw new Error('Web page content is empty.');
|
||||
}
|
||||
content = cleanString(content);
|
||||
const metaData = {
|
||||
url,
|
||||
};
|
||||
output.push({
|
||||
content,
|
||||
metaData,
|
||||
});
|
||||
return output;
|
||||
}
|
||||
}
|
||||
|
||||
export { WebPageLoader };
|
||||
Reference in New Issue
Block a user