[Pipelines] Improvements in pipelines feature (#861)

This commit is contained in:
Deshraj Yadav
2023-10-27 18:42:46 -07:00
committed by GitHub
parent 68183e9dce
commit f6c4f86986
4 changed files with 103 additions and 9 deletions

View File

@@ -71,6 +71,10 @@
"group": "Examples", "group": "Examples",
"pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"] "pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"]
}, },
{
"group": "Pipelines",
"pages": ["pipelines/quickstart"]
},
{ {
"group": "Community", "group": "Community",
"pages": [ "pages": [

View File

@@ -0,0 +1,44 @@
---
title: '🚀 Pipelines'
description: '💡 Start building LLM powered data pipelines in 1 minute'
---
Embedchain lets you build data pipelines on your own data sources and deploy them to production in less than a minute. It can load, index, retrieve, and sync any unstructured data.
Install the embedchain Python package:
```bash
pip install embedchain
```
Creating a pipeline involves 3 steps:
<Steps>
<Step title="⚙️ Import pipeline instance">
```python
from embedchain import Pipeline
p = Pipeline(name="Elon Musk")
```
</Step>
<Step title="🗃️ Add data sources">
```python
# Add different data sources
p.add("https://en.wikipedia.org/wiki/Elon_Musk")
p.add("https://www.forbes.com/profile/elon-musk")
# You can also add local data sources such as pdf, csv files etc.
# p.add("/path/to/file.pdf")
```
</Step>
<Step title="💬 Deploy your pipeline to Embedchain platform">
```python
p.deploy()
```
</Step>
</Steps>
That's it. Now head to the [Embedchain platform](https://app.embedchain.ai), where your pipeline is available. Make sure to set the `OPENAI_API_KEY` 🔑 environment variable in your code.
After you deploy your pipeline to the Embedchain platform, you can still add more data sources and update the pipeline multiple times.
Here is a Google Colab notebook for you to get started: [![Open in Colab](https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/drive/1YVXaBO4yqlHZY4ho67GCJ6aD4CHNiScD?usp=sharing)

View File

@@ -34,6 +34,8 @@ class Pipeline(EmbedChain):
def __init__( def __init__(
self, self,
id: str = None,
name: str = None,
config: PipelineConfig = None, config: PipelineConfig = None,
db: BaseVectorDB = None, db: BaseVectorDB = None,
embedding_model: BaseEmbedder = None, embedding_model: BaseEmbedder = None,
@@ -61,6 +63,15 @@ class Pipeline(EmbedChain):
:type auto_deploy: bool, optional :type auto_deploy: bool, optional
:raises Exception: If an error occurs while creating the pipeline :raises Exception: If an error occurs while creating the pipeline
""" """
if id and yaml_path:
raise Exception("Cannot provide both id and config. Please provide only one of them.")
if id and name:
raise Exception("Cannot provide both id and name. Please provide only one of them.")
if name and config:
raise Exception("Cannot provide both name and config. Please provide only one of them.")
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
@@ -71,16 +82,28 @@ class Pipeline(EmbedChain):
self.client = None self.client = None
# pipeline_id from the backend # pipeline_id from the backend
self.id = None self.id = None
if yaml_path:
with open(yaml_path, "r") as file:
config_data = yaml.safe_load(file)
self.yaml_config = config_data
self.config = config or PipelineConfig() self.config = config or PipelineConfig()
self.name = self.config.name self.name = self.config.name
self.config.id = self.local_id = str(uuid.uuid4()) if self.config.id is None else self.config.id self.config.id = self.local_id = str(uuid.uuid4()) if self.config.id is None else self.config.id
if yaml_path:
with open(yaml_path, "r") as file:
config_data = yaml.safe_load(file)
self.yaml_config = config_data
if id is not None:
# Init client first since user is trying to fetch the pipeline
# details from the platform
self._init_client()
pipeline_details = self._get_pipeline(id)
self.config.id = self.local_id = pipeline_details["metadata"]["local_id"]
self.id = id
if name is not None:
self.name = name
self.embedding_model = embedding_model or OpenAIEmbedder() self.embedding_model = embedding_model or OpenAIEmbedder()
self.db = db or ChromaDB() self.db = db or ChromaDB()
self.llm = llm or OpenAILlm() self.llm = llm or OpenAILlm()
@@ -134,6 +157,24 @@ class Pipeline(EmbedChain):
) )
self.client = Client(api_key=api_key) self.client = Client(api_key=api_key)
def _get_pipeline(self, id):
    """
    Fetch the details of an existing pipeline from the platform.

    :param id: Identifier of the pipeline on the platform
    :type id: str
    :raises Exception: If the pipeline is not found (HTTP 404) or the API
        responds with any other error status
    :return: Parsed JSON body of the API response
    """
    print("🛠️ Fetching pipeline details from the platform...")
    url = f"{self.client.host}/api/v1/pipelines/{id}/cli/"
    r = requests.get(
        url,
        headers={"Authorization": f"Token {self.client.api_key}"},
    )
    if r.status_code == 404:
        raise Exception(f"❌ Pipeline with id {id} not found!")
    # Any other error status (bad token, server error, ...) must fail here
    # with the API's own message; otherwise it would surface below as an
    # unrelated KeyError / JSON decoding error.
    if r.status_code >= 400:
        raise Exception(f"❌ Error occurred while fetching pipeline. API response: {r.text}")

    # Decode the body once and reuse it for both the message and the return.
    pipeline_details = r.json()
    print(
        f"🎉 Pipeline loaded successfully! Pipeline url: https://app.embedchain.ai/pipelines/{pipeline_details['id']}\n"  # noqa: E501
    )
    return pipeline_details
def _create_pipeline(self): def _create_pipeline(self):
""" """
Create a pipeline on the platform. Create a pipeline on the platform.
@@ -154,6 +195,11 @@ class Pipeline(EmbedChain):
if r.status_code not in [200, 201]: if r.status_code not in [200, 201]:
raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}") raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}")
if r.status_code == 200:
print(
f"🎉🎉🎉 Existing pipeline found! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501
) # noqa: E501
elif r.status_code == 201:
print( print(
f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501 f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501
) )
@@ -257,7 +303,7 @@ class Pipeline(EmbedChain):
self.id = pipeline_data["id"] self.id = pipeline_data["id"]
results = self.cursor.execute( results = self.cursor.execute(
"SELECT * FROM data_sources WHERE pipeline_id = ? AND is_uploaded = 0", (self.local_id,) "SELECT * FROM data_sources WHERE pipeline_id = ? AND is_uploaded = 0", (self.local_id,) # noqa:E501
).fetchall() ).fetchall()
if len(results) > 0: if len(results) > 0:

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "embedchain" name = "embedchain"
version = "0.0.78" version = "0.0.79"
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
authors = ["Taranjeet Singh, Deshraj Yadav"] authors = ["Taranjeet Singh, Deshraj Yadav"]
license = "Apache License" license = "Apache License"