Remove embedchain js (#1408)

This commit is contained in:
Taranjeet Singh
2024-06-10 13:24:56 -07:00
committed by GitHub
parent 52fd3e0dd4
commit 445fed4d3f
44 changed files with 0 additions and 20064 deletions

View File

@@ -1,2 +0,0 @@
node_modules
dist

View File

@@ -1,56 +0,0 @@
{
// Configuration for JavaScript files
"extends": [
"airbnb-base",
"plugin:prettier/recommended"
],
"rules": {
"prettier/prettier": [
"error",
{
"singleQuote": true,
"endOfLine": "auto"
}
]
},
"overrides": [
// Configuration for TypeScript files
{
"files": ["**/*.ts", "**/__tests__/*.test.ts"],
"plugins": [
"@typescript-eslint",
"unused-imports",
"simple-import-sort"
],
"extends": [
"airbnb-typescript",
"plugin:prettier/recommended"
],
"parserOptions": {
"project": "./tsconfig.json"
},
"rules": {
"prettier/prettier": [
"error",
{
"singleQuote": true,
"endOfLine": "auto"
}
],
"@typescript-eslint/comma-dangle": "off", // Avoid conflict rule between Eslint and Prettier
"@typescript-eslint/consistent-type-imports": "error", // Ensure `import type` is used when it's necessary
"import/prefer-default-export": "off", // Named export is easier to refactor automatically
"simple-import-sort/imports": "error", // Import configuration for `eslint-plugin-simple-import-sort`
"simple-import-sort/exports": "error", // Export configuration for `eslint-plugin-simple-import-sort`
"@typescript-eslint/no-unused-vars": "off",
"react/jsx-filename-extension": "off", // Gives error
"unused-imports/no-unused-imports": "error",
"unused-imports/no-unused-vars": [
"error",
{ "argsIgnorePattern": "^_" }
]
}
}
]
}

View File

@@ -1,47 +0,0 @@
name: Node.js Package
on:
release:
types: [created]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-node@v3
with:
node-version: 16
- run: npm ci
- run: npm test
- run: npm run build
- uses: actions/upload-artifact@v3
with:
name: dist
path: dist
- uses: actions/upload-artifact@v3
with:
name: types
path: types
publish-npm:
needs: build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-node@v3
with:
node-version: 16
registry-url: https://registry.npmjs.org/
- uses: actions/download-artifact@v3
with:
name: dist
path: dist
- uses: actions/download-artifact@v3
with:
name: types
path: types
- run: npm ci
- run: npm publish
env:
NODE_AUTH_TOKEN: ${{secrets.npm_token}}

View File

@@ -1,138 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
.cache
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
.ideas.md
.todos.md
# Custom
dist
types
build

View File

@@ -1,4 +0,0 @@
#!/bin/sh
. "$(dirname "$0")/_/husky.sh"
# Husky commit-msg hook: validate the commit message file ($1) against
# the commitlint rules configured in commitlint.config.js.
npx --no -- commitlint --edit $1

View File

@@ -1,5 +0,0 @@
#!/bin/sh
. "$(dirname "$0")/_/husky.sh"
# Husky pre-commit hook. Disable concurrent mode so `check-types` runs
# after ESLint in lint-staged (the tasks must run sequentially).
npx lint-staged --concurrent false

View File

@@ -1,8 +0,0 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Singh"
given-names: "Taranjeet"
title: "Embedchain"
date-released: 2023-06-25
url: "https://github.com/embedchain/embedchainjs"

View File

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -1,254 +0,0 @@
# embedchainjs
[![Discord](https://dcbadge.vercel.app/api/server/CUU9FPhRNt?style=flat)](https://discord.gg/CUU9FPhRNt)
[![Twitter](https://img.shields.io/twitter/follow/embedchain)](https://twitter.com/embedchain)
[![Substack](https://img.shields.io/badge/Substack-%23006f5c.svg?logo=substack)](https://embedchain.substack.com/)
embedchain is a framework to easily create LLM powered bots over any dataset. embedchainjs is the JavaScript version of embedchain. If you want a Python version, check out [embedchain-python](https://github.com/embedchain/embedchain)
# 🤝 Let's Talk Embedchain!
Schedule a [Feedback Session](https://cal.com/taranjeetio/ec) with Taranjeet, the founder, to discuss any issues, provide feedback, or explore improvements.
# How it works
It abstracts the entire process of loading dataset, chunking it, creating embeddings and then storing in vector database.
You can add a single dataset or multiple datasets using the `.add` and `.addLocal` functions, and then use the `.query` function to find an answer from the added datasets.
If you want to create a Naval Ravikant bot which has 2 of his blog posts, as well as a question and answer pair you supply, all you need to do is add the links to the blog posts and the QnA pair and embedchain will create a bot for you.
```javascript
const dotenv = require("dotenv");
dotenv.config();
const { App } = require("embedchain");
//Run the app commands inside an async function only
async function testApp() {
const navalChatBot = await App();
// Embed Online Resources
await navalChatBot.add("web_page", "https://nav.al/feedback");
await navalChatBot.add("web_page", "https://nav.al/agi");
await navalChatBot.add(
"pdf_file",
"https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"
);
// Embed Local Resources
await navalChatBot.addLocal("qna_pair", [
"Who is Naval Ravikant?",
"Naval Ravikant is an Indian-American entrepreneur and investor.",
]);
const result = await navalChatBot.query(
"What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"
);
console.log(result);
// answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.
}
testApp();
```
# Getting Started
## Installation
- First make sure that you have the package installed. If not, then install it using `npm`
```bash
npm install embedchain && npm install -S openai@^3.3.0
```
- Currently, it is only compatible with openai 3.X, not the latest version 4.X. Please make sure to use the right version, otherwise you will see the `ChromaDB` error `TypeError: OpenAIApi.Configuration is not a constructor`
- Make sure that dotenv package is installed and your `OPENAI_API_KEY` in a file called `.env` in the root folder. You can install dotenv by
```js
npm install dotenv
```
- Download and install Docker on your device by visiting [this link](https://www.docker.com/). You will need this to run Chroma vector database on your machine.
- Run the following commands to setup Chroma container in Docker
```bash
git clone https://github.com/chroma-core/chroma.git
cd chroma
docker-compose up -d --build
```
- Once Chroma container has been set up, run it inside Docker
## Usage
- We use OpenAI's embedding model to create embeddings for chunks and ChatGPT API as LLM to get answer given the relevant docs. Make sure that you have an OpenAI account and an API key. If you don't have an API key, you can create one by visiting [this link](https://platform.openai.com/account/api-keys).
- Once you have the API key, set it in an environment variable called `OPENAI_API_KEY`
```js
// Set this inside your .env file
OPENAI_API_KEY = "sk-xxxx";
```
- Load the environment variables inside your .js file using the following commands
```js
const dotenv = require("dotenv");
dotenv.config();
```
- Next import the `App` class from embedchain and use `.add` function to add any dataset.
- Now your app is created. You can use `.query` function to get the answer for any query.
```js
const dotenv = require("dotenv");
dotenv.config();
const { App } = require("embedchain");
async function testApp() {
const navalChatBot = await App();
// Embed Online Resources
await navalChatBot.add("web_page", "https://nav.al/feedback");
await navalChatBot.add("web_page", "https://nav.al/agi");
await navalChatBot.add(
"pdf_file",
"https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"
);
// Embed Local Resources
await navalChatBot.addLocal("qna_pair", [
"Who is Naval Ravikant?",
"Naval Ravikant is an Indian-American entrepreneur and investor.",
]);
const result = await navalChatBot.query(
"What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"
);
console.log(result);
// answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.
}
testApp();
```
- If there is any other app instance in your script or app, you can change the import as
```javascript
const { App: EmbedChainApp } = require("embedchain");
// or
const { App: ECApp } = require("embedchain");
```
## Format supported
We support the following formats:
### PDF File
To add any pdf file, use the data_type as `pdf_file`. Eg:
```javascript
await app.add("pdf_file", "a_valid_url_where_pdf_file_can_be_accessed");
```
### Web Page
To add any web page, use the data_type as `web_page`. Eg:
```javascript
await app.add("web_page", "a_valid_web_page_url");
```
### QnA Pair
To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
```javascript
await app.addLocal("qna_pair", ["Question", "Answer"]);
```
### More Formats coming soon
- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchainjs/issues) and we will add it to the list of supported formats.
## Testing
Before you consume valuable tokens, you should make sure that the embedding you have done works and that it's receiving the correct document from the database.
For this you can use the `dryRun` method.
Following the example above, add this to your script:
```js
let result = await navalChatBot.dryRun("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?");console.log(result);
'''
Use the following pieces of context to answer the query at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
terms of the unseen. And I think thats critical. That is what humans do uniquely that no other creature, no other computer, no other intelligencebiological or artificialthat we have ever encountered does. And not only do we do it uniquely, but if we were to meet an alien species that also had the power to generate these good explanations, there is no explanation that they could generate that we could not understand. We are maximally capable of understanding. There is no concept out there that is possible in this physical reality that a human being, given sufficient time and resources and
Query: What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?
Helpful Answer:
'''
```
_The embedding is confirmed to work as expected. It returns the right document, even if the question is asked slightly different. No prompt tokens have been consumed._
**The dry run will still consume tokens to embed your query, but it is only ~1/15 of the prompt.**
# How does it work?
Creating a chat bot over any dataset needs the following steps to happen
- load the data
- create meaningful chunks
- create embeddings for each chunk
- store the chunks in vector database
Whenever a user asks any query, following process happens to find the answer for the query
- create the embedding for query
- find similar documents for this query from vector database
- pass similar documents as context to LLM to get the final answer.
The process of loading the dataset and then querying involves multiple steps, and each step has its own nuances.
- How should I chunk the data? What is a meaningful chunk size?
- How should I create embeddings for each chunk? Which embedding model should I use?
- How should I store the chunks in vector database? Which vector database should I use?
- Should I store meta data along with the embeddings?
- How should I find similar documents for a query? Which ranking model should I use?
These questions may be trivial for some but for a lot of us, it needs research, experimentation and time to find out the accurate answers.
embedchain is a framework which takes care of all these nuances and provides a simple interface to create bots over any dataset.
In the first release, we are making it easier for anyone to get a chatbot over any dataset up and running in less than a minute. All you need to do is create an app instance, add the data sets using `.add` function and then use `.query` function to get the relevant answer.
# Team
## Author
- Taranjeet Singh ([@taranjeetio](https://twitter.com/taranjeetio))
## Maintainer
- [cachho](https://github.com/cachho)
- [sahilyadav902](https://github.com/sahilyadav902)
## Citation
If you utilize this repository, please consider citing it with:
```
@misc{embedchain,
author = {Taranjeet Singh},
title = {Embedchain: Framework to easily create LLM powered bots over any dataset},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/embedchain/embedchainjs}},
}
```

View File

@@ -1 +0,0 @@
// commitlint configuration: enforce the Conventional Commits message format.
module.exports = { extends: ['@commitlint/config-conventional'] };

View File

@@ -1,66 +0,0 @@
// Unit test for the EmbedChainApp public API. The real implementation is
// replaced by jest mocks, so this verifies only that add/addLocal/query are
// invoked with the expected arguments — no network, OpenAI, or vector-DB
// access happens here.
import { EmbedChainApp } from '../embedchain';
// NOTE: jest.mock() calls are hoisted above imports; referenced variables
// must be prefixed with `mock` for jest to allow them inside the factory.
const mockAdd = jest.fn();
const mockAddLocal = jest.fn();
const mockQuery = jest.fn();
jest.mock('../embedchain', () => {
  return {
    EmbedChainApp: jest.fn().mockImplementation(() => {
      return {
        add: mockAdd,
        addLocal: mockAddLocal,
        query: mockQuery,
      };
    }),
  };
});
describe('Test App', () => {
  beforeEach(() => {
    // Reset call history so assertions only see calls from this test.
    jest.clearAllMocks();
  });
  it('tests the App', async () => {
    mockQuery.mockResolvedValue(
      'Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.'
    );
    // Second constructor arg disables telemetry collection.
    const navalChatBot = await new EmbedChainApp(undefined, false);
    // Embed Online Resources
    await navalChatBot.add('web_page', 'https://nav.al/feedback');
    await navalChatBot.add('web_page', 'https://nav.al/agi');
    await navalChatBot.add(
      'pdf_file',
      'https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf'
    );
    // Embed Local Resources
    await navalChatBot.addLocal('qna_pair', [
      'Who is Naval Ravikant?',
      'Naval Ravikant is an Indian-American entrepreneur and investor.',
    ]);
    const result = await navalChatBot.query(
      'What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?'
    );
    // Each mock must have received exactly the arguments passed above.
    expect(mockAdd).toHaveBeenCalledWith('web_page', 'https://nav.al/feedback');
    expect(mockAdd).toHaveBeenCalledWith('web_page', 'https://nav.al/agi');
    expect(mockAdd).toHaveBeenCalledWith(
      'pdf_file',
      'https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf'
    );
    expect(mockAddLocal).toHaveBeenCalledWith('qna_pair', [
      'Who is Naval Ravikant?',
      'Naval Ravikant is an Indian-American entrepreneur and investor.',
    ]);
    expect(mockQuery).toHaveBeenCalledWith(
      'What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?'
    );
    expect(result).toBe(
      'Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.'
    );
  });
});

View File

@@ -1,44 +0,0 @@
import { createHash } from 'crypto';
import type { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import type { BaseLoader } from '../loaders';
import type { Input, LoaderResult } from '../models';
import type { ChunkResult } from '../models/ChunkResult';
/**
 * Base class for all chunkers: loads raw data via a loader, splits each
 * returned document into chunks, and assigns every chunk a
 * content-derived id.
 */
class BaseChunker {
  textSplitter: RecursiveCharacterTextSplitter;

  constructor(textSplitter: RecursiveCharacterTextSplitter) {
    this.textSplitter = textSplitter;
  }

  /**
   * Load `url` with `loader`, split every returned document, and return
   * parallel arrays of chunks, ids, and metadata.
   *
   * Ids are sha256(chunk + metaData.url), so the same chunk from the same
   * source always maps to the same id (enables dedup downstream).
   *
   * @param loader - data loader whose `loadData(url)` yields documents.
   * @param url - remote or local input, passed through to the loader.
   * @returns `{ documents, ids, metadatas }` with matching indices.
   */
  async createChunks(loader: BaseLoader, url: Input): Promise<ChunkResult> {
    const datas: LoaderResult = await loader.loadData(url);
    // Split all documents in parallel, but collect each document's chunks
    // separately: the original implementation pushed into shared arrays
    // from concurrent callbacks, making chunk ordering depend on promise
    // resolution order. Flattening afterwards keeps the output order
    // deterministic (loader order, then chunk order within a document).
    const perDocument = await Promise.all(
      datas.map(async ({ content, metaData }) => {
        const chunks: string[] = await this.textSplitter.splitText(content);
        return chunks.map((chunk) => ({
          id: createHash('sha256')
            .update(chunk + metaData.url)
            .digest('hex'),
          chunk,
          metaData,
        }));
      })
    );
    const documents: ChunkResult['documents'] = [];
    const ids: ChunkResult['ids'] = [];
    const metadatas: ChunkResult['metadatas'] = [];
    perDocument.flat().forEach(({ id, chunk, metaData }) => {
      ids.push(id);
      documents.push(chunk);
      metadatas.push(metaData);
    });
    return {
      documents,
      ids,
      metadatas,
    };
  }
}
export { BaseChunker };

View File

@@ -1,26 +0,0 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { BaseChunker } from './BaseChunker';
/** Shape of the splitter configuration consumed by the chunker. */
interface TextSplitterChunkParams {
  chunkSize: number;
  chunkOverlap: number;
  keepSeparator: boolean;
}

// PDF text tends to be dense prose, so use large 1000-char chunks with
// no overlap and no separator retention.
const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = {
  chunkSize: 1000,
  chunkOverlap: 0,
  keepSeparator: false,
};

/** Chunker for PDF documents (recursive character splitting). */
class PdfFileChunker extends BaseChunker {
  constructor() {
    super(new RecursiveCharacterTextSplitter(TEXT_SPLITTER_CHUNK_PARAMS));
  }
}

export { PdfFileChunker };

View File

@@ -1,26 +0,0 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { BaseChunker } from './BaseChunker';
/** Shape of the splitter configuration consumed by the chunker. */
interface TextSplitterChunkParams {
  chunkSize: number;
  chunkOverlap: number;
  keepSeparator: boolean;
}

// QnA pairs are short, so use small 300-char chunks with no overlap
// and no separator retention.
const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = {
  chunkSize: 300,
  chunkOverlap: 0,
  keepSeparator: false,
};

/** Chunker for local question/answer pairs (recursive character splitting). */
class QnaPairChunker extends BaseChunker {
  constructor() {
    super(new RecursiveCharacterTextSplitter(TEXT_SPLITTER_CHUNK_PARAMS));
  }
}

export { QnaPairChunker };

View File

@@ -1,26 +0,0 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { BaseChunker } from './BaseChunker';
/** Shape of the splitter configuration consumed by the chunker. */
interface TextSplitterChunkParams {
  chunkSize: number;
  chunkOverlap: number;
  keepSeparator: boolean;
}

// Web pages sit between PDFs and QnA pairs in density: 500-char chunks,
// no overlap, no separator retention.
const TEXT_SPLITTER_CHUNK_PARAMS: TextSplitterChunkParams = {
  chunkSize: 500,
  chunkOverlap: 0,
  keepSeparator: false,
};

/** Chunker for web pages (recursive character splitting). */
class WebPageChunker extends BaseChunker {
  constructor() {
    super(new RecursiveCharacterTextSplitter(TEXT_SPLITTER_CHUNK_PARAMS));
  }
}

export { WebPageChunker };

View File

@@ -1,6 +0,0 @@
// Barrel file: expose every chunker from a single entry point.
export { BaseChunker } from './BaseChunker';
export { PdfFileChunker } from './PdfFile';
export { QnaPairChunker } from './QnaPair';
export { WebPageChunker } from './WebPage';

View File

@@ -1,317 +0,0 @@
/* eslint-disable max-classes-per-file */
import type { Collection } from 'chromadb';
import type { QueryResponse } from 'chromadb/dist/main/types';
import * as fs from 'fs';
import { Document } from 'langchain/document';
import OpenAI from 'openai';
import * as path from 'path';
import { v4 as uuidv4 } from 'uuid';
import type { BaseChunker } from './chunkers';
import { PdfFileChunker, QnaPairChunker, WebPageChunker } from './chunkers';
import type { BaseLoader } from './loaders';
import { LocalQnaPairLoader, PdfFileLoader, WebPageLoader } from './loaders';
import type {
DataDict,
DataType,
FormattedResult,
Input,
LocalInput,
Metadata,
Method,
RemoteInput,
} from './models';
import { ChromaDB } from './vectordb';
import type { BaseVectorDB } from './vectordb/BaseVectorDb';
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
/**
 * Core embedchain pipeline: loads data from a source, chunks and embeds
 * it into a vector database (Chroma by default), and answers queries by
 * retrieving the closest stored chunk and prompting an OpenAI chat model.
 */
class EmbedChain {
  // Underlying vector-database client (set during setupChroma).
  dbClient: any;
  // TODO: Definitely assign
  collection!: Collection;
  // History of every (dataType, input) pair the user has added.
  userAsks: [DataType, Input][] = [];
  // Resolves once the vector database is connected; await before use.
  initApp: Promise<void>;
  // Whether anonymous usage telemetry is sent.
  collectMetrics: boolean;
  sId: string; // sessionId

  /**
   * @param db Optional vector-db backend; defaults to a local ChromaDB.
   * @param collectMetrics Pass false to disable anonymous telemetry.
   */
  constructor(db?: BaseVectorDB, collectMetrics: boolean = true) {
    if (!db) {
      this.initApp = this.setupChroma();
    } else {
      this.initApp = this.setupOther(db);
    }
    this.collectMetrics = collectMetrics;
    // Send anonymous telemetry
    this.sId = uuidv4();
    // NOTE(review): fire-and-forget promise; errors are caught and
    // logged inside sendTelemetryEvent itself.
    this.sendTelemetryEvent('init');
  }

  // Connects to a local Chroma server and adopts its client/collection.
  async setupChroma(): Promise<void> {
    const db = new ChromaDB();
    await db.initDb;
    this.dbClient = db.client;
    if (db.collection) {
      this.collection = db.collection;
    } else {
      // TODO: Add proper error handling
      console.error('No collection');
    }
  }

  // Waits for a caller-supplied backend to finish initializing.
  // NOTE(review): the client/collection of an unknown backend are never
  // adopted (see commented-out lines), so a custom db is not yet usable.
  async setupOther(db: BaseVectorDB): Promise<void> {
    await db.initDb;
    // TODO: Figure out how we can initialize an unknown database.
    // this.dbClient = db.client;
    // this.collection = db.collection;
    this.userAsks = [];
  }

  // Maps a data type to the loader that fetches that kind of source.
  static getLoader(dataType: DataType) {
    const loaders: { [t in DataType]: BaseLoader } = {
      pdf_file: new PdfFileLoader(),
      web_page: new WebPageLoader(),
      qna_pair: new LocalQnaPairLoader(),
    };
    return loaders[dataType];
  }

  // Maps a data type to the chunker that splits its content.
  static getChunker(dataType: DataType) {
    const chunkers: { [t in DataType]: BaseChunker } = {
      pdf_file: new PdfFileChunker(),
      web_page: new WebPageChunker(),
      qna_pair: new QnaPairChunker(),
    };
    return chunkers[dataType];
  }

  /**
   * Adds data from a remote source (addressed by URL) to the vector db.
   * @param dataType Kind of source ('pdf_file' | 'web_page' | 'qna_pair').
   * @param url Remote address of the data.
   */
  public async add(dataType: DataType, url: RemoteInput) {
    const loader = EmbedChain.getLoader(dataType);
    const chunker = EmbedChain.getChunker(dataType);
    this.userAsks.push([dataType, url]);
    const { documents, countNewChunks } = await this.loadAndEmbed(
      loader,
      chunker,
      url
    );
    if (this.collectMetrics) {
      // Word count is approximated by splitting chunks on single spaces.
      const wordCount = documents.reduce(
        (sum, document) => sum + document.split(' ').length,
        0
      );
      this.sendTelemetryEvent('add', {
        data_type: dataType,
        word_count: wordCount,
        chunks_count: countNewChunks,
      });
    }
  }

  /**
   * Adds in-memory data (e.g. a question/answer pair) to the vector db.
   * @param dataType Kind of source ('qna_pair' for local input).
   * @param content The local data itself.
   */
  public async addLocal(dataType: DataType, content: LocalInput) {
    const loader = EmbedChain.getLoader(dataType);
    const chunker = EmbedChain.getChunker(dataType);
    this.userAsks.push([dataType, content]);
    const { documents, countNewChunks } = await this.loadAndEmbed(
      loader,
      chunker,
      content
    );
    if (this.collectMetrics) {
      const wordCount = documents.reduce(
        (sum, document) => sum + document.split(' ').length,
        0
      );
      this.sendTelemetryEvent('add_local', {
        data_type: dataType,
        word_count: wordCount,
        chunks_count: countNewChunks,
      });
    }
  }

  /**
   * Chunks the source, drops chunks whose ids already exist in the
   * collection, and stores the remainder.
   * @param loader Loader that fetches the raw data.
   * @param chunker Chunker that splits it into embeddable pieces.
   * @param src The source (URL or local input) being ingested.
   * @returns The stored documents/metadatas/ids and how many chunks
   *          were actually new.
   */
  protected async loadAndEmbed(
    loader: any,
    chunker: BaseChunker,
    src: Input
  ): Promise<{
    documents: string[];
    metadatas: Metadata[];
    ids: string[];
    countNewChunks: number;
  }> {
    const embeddingsData = await chunker.createChunks(loader, src);
    let { documents, ids, metadatas } = embeddingsData;
    // Fetch any chunks with these ids that are already stored.
    const existingDocs = await this.collection.get({ ids });
    const existingIds = new Set(existingDocs.ids);
    if (existingIds.size > 0) {
      // Keep only the chunks whose ids are not yet in the collection.
      const dataDict: DataDict = {};
      for (let i = 0; i < ids.length; i += 1) {
        const id = ids[i];
        if (!existingIds.has(id)) {
          dataDict[id] = { doc: documents[i], meta: metadatas[i] };
        }
      }
      if (Object.keys(dataDict).length === 0) {
        console.log(`All data from ${src} already exists in the database.`);
        return { documents: [], metadatas: [], ids: [], countNewChunks: 0 };
      }
      ids = Object.keys(dataDict);
      const dataValues = Object.values(dataDict);
      documents = dataValues.map(({ doc }) => doc);
      metadatas = dataValues.map(({ meta }) => meta);
    }
    // The new-chunk count is derived from collection size before/after.
    const countBeforeAddition = await this.count();
    await this.collection.add({ documents, metadatas, ids });
    const countNewChunks = (await this.count()) - countBeforeAddition;
    console.log(
      `Successfully saved ${src}. New chunks count: ${countNewChunks}`
    );
    return { documents, metadatas, ids, countNewChunks };
  }

  /**
   * Converts a raw Chroma query response into [Document, distance] pairs.
   * NOTE(review): assumes the response contains at least one result set
   * (indexes documents[0]/metadatas[0]) — confirm callers guarantee this.
   */
  static async formatResult(
    results: QueryResponse
  ): Promise<FormattedResult[]> {
    return results.documents[0].map((document: any, index: number) => {
      const metadata = results.metadatas[0][index] || {};
      // TODO: Add proper error handling
      const distance = results.distances ? results.distances[0][index] : null;
      return [new Document({ pageContent: document, metadata }), distance];
    });
  }

  /**
   * Sends the prompt to gpt-3.5-turbo and returns the model's answer.
   * Falls back to a fixed message if the response has no content.
   */
  static async getOpenAiAnswer(prompt: string) {
    const messages: OpenAI.Chat.CreateChatCompletionRequestMessage[] = [
      { role: 'user', content: prompt },
    ];
    const response = await openai.chat.completions.create({
      model: 'gpt-3.5-turbo',
      messages,
      temperature: 0,
      max_tokens: 1000,
      top_p: 1,
    });
    return (
      response.choices[0].message?.content ?? 'Response could not be processed.'
    );
  }

  /**
   * Retrieves the single closest chunk's text for the given query.
   * NOTE(review): throws a TypeError if the collection is empty
   * (resultFormatted[0] would be undefined) — confirm intended.
   */
  protected async retrieveFromDatabase(inputQuery: string) {
    const result = await this.collection.query({
      nResults: 1,
      queryTexts: [inputQuery],
    });
    const resultFormatted = await EmbedChain.formatResult(result);
    const content = resultFormatted[0][0].pageContent;
    return content;
  }

  // Builds the RAG prompt: retrieved context followed by the user query.
  static generatePrompt(inputQuery: string, context: any) {
    const prompt = `Use the following pieces of context to answer the query at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n${context}\nQuery: ${inputQuery}\nHelpful Answer:`;
    return prompt;
  }

  // Thin indirection over the OpenAI call (single LLM backend for now).
  static async getAnswerFromLlm(prompt: string) {
    const answer = await EmbedChain.getOpenAiAnswer(prompt);
    return answer;
  }

  /**
   * Answers a query: retrieve context, build the prompt, ask the LLM.
   * @param inputQuery The user's question.
   * @returns The LLM's answer string.
   */
  public async query(inputQuery: string) {
    const context = await this.retrieveFromDatabase(inputQuery);
    const prompt = EmbedChain.generatePrompt(inputQuery, context);
    const answer = await EmbedChain.getAnswerFromLlm(prompt);
    this.sendTelemetryEvent('query');
    return answer;
  }

  /**
   * Like query(), but returns the prompt that WOULD be sent to the LLM
   * instead of calling it — useful for debugging retrieval.
   */
  public async dryRun(input_query: string) {
    const context = await this.retrieveFromDatabase(input_query);
    const prompt = EmbedChain.generatePrompt(input_query, context);
    return prompt;
  }

  /**
   * Count the number of embeddings.
   * @returns {Promise<number>}: The number of embeddings.
   */
  public count(): Promise<number> {
    return this.collection.count();
  }

  /**
   * POSTs an anonymous telemetry event, retrying up to 3 times.
   * Never throws: failures are logged and swallowed so telemetry can
   * never break the pipeline.
   * NOTE(review): the body is JSON but no Content-Type header is set —
   * confirm the endpoint accepts this.
   */
  protected async sendTelemetryEvent(method: Method, extraMetadata?: object) {
    if (!this.collectMetrics) {
      return;
    }
    const url = 'https://api.embedchain.ai/api/v1/telemetry/';
    // Read package version from filesystem (because it's not in the ts root dir)
    const packageJsonPath = path.join(__dirname, '..', 'package.json');
    const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8'));
    const metadata = {
      s_id: this.sId,
      version: packageJson.version,
      method,
      language: 'js',
      ...extraMetadata,
    };
    const maxRetries = 3;
    // Retry the fetch
    for (let i = 0; i < maxRetries; i += 1) {
      try {
        // eslint-disable-next-line no-await-in-loop
        const response = await fetch(url, {
          method: 'POST',
          body: JSON.stringify({ metadata }),
        });
        if (response.ok) {
          // Break out of the loop if the request was successful
          break;
        } else {
          // Log the unsuccessful response (optional)
          console.error(
            `Telemetry: Attempt ${i + 1} failed with status:`,
            response.status
          );
        }
      } catch (error) {
        // Log the error (optional)
        console.error(`Telemetry: Attempt ${i + 1} failed with error:`, error);
      }
      // If this was the last attempt, throw an error or handle the failure
      if (i === maxRetries - 1) {
        console.error('Telemetry: Max retries reached');
      }
    }
  }
}
// The public EmbedChain app: a thin alias over EmbedChain exposing its
// full interface — add/addLocal to ingest data into the vector store,
// query/dryRun to answer questions via retrieval + LLM, and count for
// the number of stored embeddings.
class EmbedChainApp extends EmbedChain {
  // Intentionally empty: all behavior is inherited from EmbedChain.
}
export { EmbedChainApp };

View File

@@ -1,7 +0,0 @@
import { EmbedChainApp } from './embedchain';
/**
 * Async factory: constructs an EmbedChainApp and waits for its
 * vector-database setup to finish before handing it back.
 */
export const App = async () => {
  const instance = new EmbedChainApp();
  await instance.initApp;
  return instance;
};

View File

@@ -1,5 +0,0 @@
import type { Input, LoaderResult } from '../models';
// Contract for all data loaders: turn a source (URL or in-memory input)
// into a list of { content, metaData } entries ready for chunking.
export abstract class BaseLoader {
  abstract loadData(src: Input): Promise<LoaderResult>;
}

View File

@@ -1,21 +0,0 @@
import type { LoaderResult, QnaPair } from '../models';
import { BaseLoader } from './BaseLoader';
/**
 * Loads an in-memory question/answer pair, formatting it as a single
 * "Q: ...\nA: ..." text entry tagged with a 'local' source url.
 */
class LocalQnaPairLoader extends BaseLoader {
  // eslint-disable-next-line class-methods-use-this
  async loadData(content: QnaPair): Promise<LoaderResult> {
    const [question, answer] = content;
    return [
      {
        content: `Q: ${question}\nA: ${answer}`,
        metaData: { url: 'local' },
      },
    ];
  }
}
export { LocalQnaPairLoader };

View File

@@ -1,58 +0,0 @@
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
import type { LoaderResult, Metadata } from '../models';
import { cleanString } from '../utils';
import { BaseLoader } from './BaseLoader';
const pdfjsLib = require('pdfjs-dist');
// Minimal page shape: the concatenated text content of one PDF page.
interface Page {
  page_content: string;
}
/**
 * Loads a PDF document and converts each page into one cleaned text
 * entry tagged with the source url.
 */
class PdfFileLoader extends BaseLoader {
  /**
   * Extracts the raw text of every page of the PDF at `url`.
   * Pages are fetched concurrently and returned in page order.
   */
  static async getPagesFromPdf(url: string): Promise<Page[]> {
    const loadingTask = pdfjsLib.getDocument(url);
    const doc = await loadingTask.promise;
    const pagePromises = Array.from(
      { length: doc.numPages },
      async (_, index) => {
        const pdfPage = await doc.getPage(index + 1);
        const textContent: TextContent = await pdfPage.getTextContent();
        const joinedText: string = textContent.items
          .map((item) => ('str' in item ? item.str : ''))
          .join(' ');
        return {
          page_content: joinedText,
        };
      }
    );
    return Promise.all(pagePromises);
  }

  // eslint-disable-next-line class-methods-use-this
  async loadData(url: string): Promise<LoaderResult> {
    const pages: Page[] = await PdfFileLoader.getPagesFromPdf(url);
    if (!pages.length) {
      throw new Error('No data found');
    }
    // One cleaned entry per page, all tagged with the source url.
    return pages.map((page) => ({
      content: cleanString(page.page_content),
      metaData: { url },
    }));
  }
}
export { PdfFileLoader };

View File

@@ -1,51 +0,0 @@
import axios from 'axios';
import { JSDOM } from 'jsdom';
import { cleanString } from '../utils';
import { BaseLoader } from './BaseLoader';
/**
 * Fetches a web page, strips boilerplate/navigation elements, and
 * returns the remaining body text as one cleaned entry.
 */
class WebPageLoader extends BaseLoader {
  // eslint-disable-next-line class-methods-use-this
  async loadData(url: string) {
    const response = await axios.get(url);
    const dom = new JSDOM(response.data);
    const { document } = dom.window;
    // Tags whose text is chrome/boilerplate rather than page content.
    const unwantedTags = [
      'nav',
      'aside',
      'form',
      'header',
      'noscript',
      'svg',
      'canvas',
      'footer',
      'script',
      'style',
    ];
    // Blank out the text of every unwanted element before extraction.
    for (const tagName of unwantedTags) {
      const elements = Array.from(document.getElementsByTagName(tagName));
      for (const element of elements) {
        (element as HTMLElement).textContent = ' ';
      }
    }
    const bodyText = document.body.textContent;
    if (!bodyText) {
      throw new Error('Web page content is empty.');
    }
    return [
      {
        content: cleanString(bodyText),
        metaData: { url },
      },
    ];
  }
}
export { WebPageLoader };

View File

@@ -1,6 +0,0 @@
// Barrel module: re-export every loader from a single entry point.
export { BaseLoader } from './BaseLoader';
export { LocalQnaPairLoader } from './LocalQnaPair';
export { PdfFileLoader } from './PdfFile';
export { WebPageLoader } from './WebPage';

View File

@@ -1,7 +0,0 @@
import type { Metadata } from './Metadata';
// Output of a chunker: parallel arrays where index i gives one chunk's
// text (documents[i]), its stable id (ids[i]), and its source metadata
// (metadatas[i]).
export type ChunkResult = {
  documents: string[];
  ids: string[];
  metadatas: Metadata[];
};

View File

@@ -1,10 +0,0 @@
import type { ChunkResult } from './ChunkResult';
// One chunk's text plus its source metadata.
type Data = {
  doc: ChunkResult['documents'][0];
  meta: ChunkResult['metadatas'][0];
};
// Chunk-id -> chunk payload map, used to de-duplicate before insertion.
export type DataDict = {
  [id: string]: Data;
};

View File

@@ -1 +0,0 @@
export type DataType = 'pdf_file' | 'web_page' | 'qna_pair';

View File

@@ -1,3 +0,0 @@
import type { Document } from 'langchain/document';
export type FormattedResult = [Document, number | null];

View File

@@ -1,7 +0,0 @@
import type { QnaPair } from './QnAPair';
// A remote source is addressed by a URL string.
export type RemoteInput = string;
// A local source is an in-memory question/answer pair.
export type LocalInput = QnaPair;
// Anything the pipeline can ingest.
export type Input = RemoteInput | LocalInput;

View File

@@ -1,3 +0,0 @@
import type { Metadata } from './Metadata';
export type LoaderResult = { content: any; metaData: Metadata }[];

View File

@@ -1,3 +0,0 @@
// Provenance attached to every chunk ('local' for in-memory sources).
export type Metadata = {
  url: string;
};

View File

@@ -1 +0,0 @@
export type Method = 'init' | 'query' | 'add' | 'add_local';

View File

@@ -1,4 +0,0 @@
// A user-supplied question/answer pair, in that order.
type Question = string;
type Answer = string;
export type QnaPair = [Question, Answer];

View File

@@ -1,21 +0,0 @@
// Barrel module: re-export all model types from a single entry point.
//
// These symbols are all type-only, so they are re-exported with
// `export type`: plain value re-exports of types break under
// isolatedModules (nothing exists at runtime to re-export) and violate
// this repo's @typescript-eslint/consistent-type-imports ESLint rule.
export type { DataDict } from './DataDict';
export type { DataType } from './DataType';
export type { FormattedResult } from './FormattedResult';
export type { Input, LocalInput, RemoteInput } from './Input';
export type { LoaderResult } from './LoaderResult';
export type { Metadata } from './Metadata';
export type { Method } from './Method';
export type { QnaPair } from './QnAPair';

View File

@@ -1,26 +0,0 @@
/**
 * Normalizes raw extracted text into clean, single-spaced prose.
 *
 * Operations, in order: newlines become spaces, backslashes are
 * removed, hash characters become spaces, each run of a repeated
 * punctuation character collapses to one occurrence (e.g.
 * "!!! hello !!!" -> "! hello !"), and finally the string is trimmed
 * with all whitespace runs reduced to single spaces.
 *
 * The whitespace collapse deliberately runs LAST: in the previous
 * ordering the '#' and '\\' substitutions ran after the collapse, so
 * they could reintroduce multiple spaces ("a # b" came out "a   b").
 *
 * @param {str} text: The text to be cleaned. This is expected to be a string.
 * @returns {str}: The cleaned text after all the cleaning operations have been performed.
 */
export function cleanString(text: string): string {
  // Newlines are treated as ordinary word separators.
  let cleanedText = text.replace(/\n/g, ' ');
  // Strip backslash characters entirely.
  cleanedText = cleanedText.replace(/\\/g, '');
  // Hash characters (e.g. markdown heading markers) become spaces.
  cleanedText = cleanedText.replace(/#/g, ' ');
  // Collapse each run of a repeated non-alphanumeric, non-space
  // character to a single occurrence: "!!! hello !!!" -> "! hello !".
  cleanedText = cleanedText.replace(/([^\w\s])\1*/g, '$1');
  // Trim and reduce all remaining whitespace runs to single spaces.
  return cleanedText.trim().replace(/\s+/g, ' ');
}

View File

@@ -1,14 +0,0 @@
/**
 * Base class for vector-database backends.
 *
 * The constructor kicks off connection setup and exposes it as the
 * `initDb` promise; callers await `initDb` before using the database.
 * Subclasses override getClientAndCollection() with the real setup.
 */
class BaseVectorDB {
  // Resolves once the client and collection are ready.
  initDb: Promise<void>;
  constructor() {
    // Virtual call: dispatches to the subclass override.
    this.initDb = this.getClientAndCollection();
  }
  // eslint-disable-next-line class-methods-use-this
  protected async getClientAndCollection(): Promise<void> {
    throw new Error('getClientAndCollection() method is not implemented');
  }
}
export { BaseVectorDB };

View File

@@ -1,38 +0,0 @@
import type { Collection } from 'chromadb';
import { ChromaClient, OpenAIEmbeddingFunction } from 'chromadb';
import { BaseVectorDB } from './BaseVectorDb';
// Shared OpenAI embedding function for the Chroma collection.
// NOTE(review): falls back to an empty-string API key when
// OPENAI_API_KEY is unset, deferring the failure to the first embed
// call — confirm this is intended.
const embedder = new OpenAIEmbeddingFunction({
  openai_api_key: process.env.OPENAI_API_KEY ?? '',
});
/**
 * Chroma-backed vector store. Connects to a local Chroma server and
 * creates the `embedchain_store` collection if it does not exist yet.
 */
class ChromaDB extends BaseVectorDB {
  client: ChromaClient | undefined;

  collection: Collection | null = null;

  // The previous explicit constructor only called super() and was
  // flagged as useless by the linter; the implicit default constructor
  // behaves identically (BaseVectorDB still starts initDb).

  protected async getClientAndCollection(): Promise<void> {
    this.client = new ChromaClient({ path: 'http://localhost:8000' });
    try {
      this.collection = await this.client.getCollection({
        name: 'embedchain_store',
        embeddingFunction: embedder,
      });
    } catch {
      // getCollection throws when the collection does not exist yet,
      // so fall back to creating it. (The old `if (!this.collection)`
      // guard here was always true: the assignment above throws before
      // assigning, so collection is still null in this branch.)
      this.collection = await this.client.createCollection({
        name: 'embedchain_store',
        embeddingFunction: embedder,
      });
    }
  }
}
export { ChromaDB };

View File

@@ -1,3 +0,0 @@
// Barrel module: re-export the vector-db backend.
export { ChromaDB } from './ChromaDb';

View File

@@ -1,9 +0,0 @@
const { EmbedChainApp } = require("./embedchain/embedchain");

/**
 * Async factory for the embedchain app: constructs it and waits for
 * the vector-database setup to finish before returning it.
 * @returns {Promise<EmbedChainApp>} the fully initialized app.
 */
async function App() {
  const app = new EmbedChainApp();
  // BUG FIX: the class exposes `initApp` (camelCase); the old code
  // awaited the non-existent `init_app`, which resolved immediately,
  // returning the app before its database setup had completed.
  await app.initApp;
  return app;
}
module.exports = { App };

View File

@@ -1,5 +0,0 @@
// Jest configuration: compile TypeScript tests via ts-jest, run them in
// a plain Node environment, and skip generated .d.ts declaration files.
module.exports = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  testPathIgnorePatterns: ['.d.ts'],
};

View File

@@ -1,5 +0,0 @@
// lint-staged configuration (run by husky on pre-commit).
module.exports = {
  // Auto-fix staged JS/TS sources, then re-run eslint as a pure check.
  '*.{js,ts}': ['eslint --fix', 'eslint'],
  // Any staged TS change triggers a full project type-check.
  '**/*.ts?(x)': () => 'npm run check-types',
  '*.json': ['prettier --write'],
};

File diff suppressed because it is too large Load Diff

View File

@@ -1,53 +0,0 @@
{
"name": "embedchain",
"version": "0.0.8",
"description": "embedchain is a framework to easily create LLM powered bots over any dataset",
"main": "dist/index.js",
"types": "types/index.d.ts",
"files": [
"dist",
"types"
],
"scripts": {
"build": "tsc -p tsconfig.build.json --listFiles",
"prepare": "husky install",
"test": "jest",
"check-types": "tsc --noEmit --pretty"
},
"author": "Taranjeet Singh",
"license": "Apache-2.0",
"dependencies": {
"axios": "^1.4.0",
"chromadb": "^1.5.6",
"jsdom": "^22.1.0",
"langchain": "^0.0.136",
"openai": "^4.3.1",
"pdfjs-dist": "^3.8.162",
"uuid": "^9.0.0"
},
"devDependencies": {
"@commitlint/cli": "^17.1.2",
"@commitlint/config-conventional": "^17.1.0",
"@commitlint/cz-commitlint": "^17.1.2",
"@types/jest": "^29.5.1",
"@types/jsdom": "^21.1.1",
"@typescript-eslint/eslint-plugin": "^5.41.0",
"@typescript-eslint/parser": "^5.41.0",
"eslint": "^8.34.0",
"eslint-config-airbnb-base": "^15.0.0",
"eslint-config-airbnb-typescript": "^17.0.0",
"eslint-config-prettier": "^8.5.0",
"eslint-plugin-import": "^2.27.5",
"eslint-plugin-prettier": "^4.2.1",
"eslint-plugin-simple-import-sort": "^8.0.0",
"eslint-plugin-testing-library": "^5.9.1",
"eslint-plugin-unused-imports": "^2.0.0",
"husky": "^8.0.1",
"jest": "^29.5.0",
"lint-staged": "^13.0.3",
"prettier": "^2.7.1",
"ts-jest": "^29.1.0",
"ts-loader": "^9.4.2",
"typescript": "^5.2.2"
}
}

View File

@@ -1,4 +0,0 @@
{
"extends": "./tsconfig.json",
"exclude": ["embedchain/__tests__"]
}

View File

@@ -1,15 +0,0 @@
{
"compilerOptions": {
"target": "es6",
"module": "CommonJS",
"strict": true,
"outDir": "dist",
"rootDir": "embedchain",
"sourceMap": true,
"declaration": true,
"declarationDir": "types",
"esModuleInterop": true
},
"include": ["embedchain/**/*.ts"],
"exclude": ["node_modules", "dist"]
}