[Pipelines] Improvements in pipelines feature (#861)
This commit is contained in:
@@ -71,6 +71,10 @@
|
|||||||
"group": "Examples",
|
"group": "Examples",
|
||||||
"pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"]
|
"pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"group": "Pipelines",
|
||||||
|
"pages": ["pipelines/quickstart"]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"group": "Community",
|
"group": "Community",
|
||||||
"pages": [
|
"pages": [
|
||||||
|
|||||||
44
docs/pipelines/quickstart.mdx
Normal file
44
docs/pipelines/quickstart.mdx
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
---
|
||||||
|
title: '🚀 Pipelines'
|
||||||
|
description: '💡 Start building LLM powered data pipelines in 1 minute'
|
||||||
|
---
|
||||||
|
|
||||||
|
Embedchain lets you build data pipelines on your own data sources and deploy them to production in less than a minute. It can load, index, retrieve, and sync any unstructured data.
|
||||||
|
|
||||||
|
Install the embedchain Python package:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install embedchain
|
||||||
|
```
|
||||||
|
|
||||||
|
Creating a pipeline involves 3 steps:
|
||||||
|
|
||||||
|
<Steps>
|
||||||
|
<Step title="⚙️ Import pipeline instance">
|
||||||
|
```python
|
||||||
|
from embedchain import Pipeline
|
||||||
|
p = Pipeline(name="Elon Musk")
|
||||||
|
```
|
||||||
|
</Step>
|
||||||
|
|
||||||
|
<Step title="🗃️ Add data sources">
|
||||||
|
```python
|
||||||
|
# Add different data sources
|
||||||
|
p.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||||
|
p.add("https://www.forbes.com/profile/elon-musk")
|
||||||
|
# You can also add local data sources such as pdf, csv files etc.
|
||||||
|
# p.add("/path/to/file.pdf")
|
||||||
|
```
|
||||||
|
</Step>
|
||||||
|
<Step title="💬 Deploy your pipeline to the Embedchain platform">
|
||||||
|
```python
|
||||||
|
p.deploy()
|
||||||
|
```
|
||||||
|
</Step>
|
||||||
|
</Steps>
|
||||||
|
|
||||||
|
That's it. Now head to the [Embedchain platform](https://app.embedchain.ai), where your pipeline is available. Make sure to set the `OPENAI_API_KEY` 🔑 environment variable in your code.
|
||||||
|
|
||||||
|
After you deploy your pipeline to the Embedchain platform, you can still add more data sources and update the pipeline multiple times.
|
||||||
|
|
||||||
|
Here is a Google Colab notebook to help you get started: [Open in Colab](https://colab.research.google.com/drive/1YVXaBO4yqlHZY4ho67GCJ6aD4CHNiScD?usp=sharing)
|
||||||
@@ -34,6 +34,8 @@ class Pipeline(EmbedChain):
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
id: str = None,
|
||||||
|
name: str = None,
|
||||||
config: PipelineConfig = None,
|
config: PipelineConfig = None,
|
||||||
db: BaseVectorDB = None,
|
db: BaseVectorDB = None,
|
||||||
embedding_model: BaseEmbedder = None,
|
embedding_model: BaseEmbedder = None,
|
||||||
@@ -61,6 +63,15 @@ class Pipeline(EmbedChain):
|
|||||||
:type auto_deploy: bool, optional
|
:type auto_deploy: bool, optional
|
||||||
:raises Exception: If an error occurs while creating the pipeline
|
:raises Exception: If an error occurs while creating the pipeline
|
||||||
"""
|
"""
|
||||||
|
if id and yaml_path:
|
||||||
|
raise Exception("Cannot provide both id and config. Please provide only one of them.")
|
||||||
|
|
||||||
|
if id and name:
|
||||||
|
raise Exception("Cannot provide both id and name. Please provide only one of them.")
|
||||||
|
|
||||||
|
if name and config:
|
||||||
|
raise Exception("Cannot provide both name and config. Please provide only one of them.")
|
||||||
|
|
||||||
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -71,16 +82,28 @@ class Pipeline(EmbedChain):
|
|||||||
self.client = None
|
self.client = None
|
||||||
# pipeline_id from the backend
|
# pipeline_id from the backend
|
||||||
self.id = None
|
self.id = None
|
||||||
if yaml_path:
|
|
||||||
with open(yaml_path, "r") as file:
|
|
||||||
config_data = yaml.safe_load(file)
|
|
||||||
self.yaml_config = config_data
|
|
||||||
|
|
||||||
self.config = config or PipelineConfig()
|
self.config = config or PipelineConfig()
|
||||||
self.name = self.config.name
|
self.name = self.config.name
|
||||||
|
|
||||||
self.config.id = self.local_id = str(uuid.uuid4()) if self.config.id is None else self.config.id
|
self.config.id = self.local_id = str(uuid.uuid4()) if self.config.id is None else self.config.id
|
||||||
|
|
||||||
|
if yaml_path:
|
||||||
|
with open(yaml_path, "r") as file:
|
||||||
|
config_data = yaml.safe_load(file)
|
||||||
|
self.yaml_config = config_data
|
||||||
|
|
||||||
|
if id is not None:
|
||||||
|
# Init client first since user is trying to fetch the pipeline
|
||||||
|
# details from the platform
|
||||||
|
self._init_client()
|
||||||
|
pipeline_details = self._get_pipeline(id)
|
||||||
|
self.config.id = self.local_id = pipeline_details["metadata"]["local_id"]
|
||||||
|
self.id = id
|
||||||
|
|
||||||
|
if name is not None:
|
||||||
|
self.name = name
|
||||||
|
|
||||||
self.embedding_model = embedding_model or OpenAIEmbedder()
|
self.embedding_model = embedding_model or OpenAIEmbedder()
|
||||||
self.db = db or ChromaDB()
|
self.db = db or ChromaDB()
|
||||||
self.llm = llm or OpenAILlm()
|
self.llm = llm or OpenAILlm()
|
||||||
@@ -134,6 +157,24 @@ class Pipeline(EmbedChain):
|
|||||||
)
|
)
|
||||||
self.client = Client(api_key=api_key)
|
self.client = Client(api_key=api_key)
|
||||||
|
|
||||||
|
def _get_pipeline(self, id):
    """
    Fetch the details of an existing pipeline from the platform.

    :param id: Id of the pipeline on the platform
    :type id: str
    :raises Exception: If the pipeline is not found (HTTP 404) or the
        platform answers with any other error status
    :return: Parsed JSON body of the platform response
    :rtype: dict
    """
    print("🛠️ Fetching pipeline details from the platform...")
    url = f"{self.client.host}/api/v1/pipelines/{id}/cli/"
    r = requests.get(
        url,
        headers={"Authorization": f"Token {self.client.api_key}"},
    )
    if r.status_code == 404:
        raise Exception(f"❌ Pipeline with id {id} not found!")

    # Any other error status (e.g. bad token, server error) would otherwise
    # surface below as a confusing KeyError / JSON decode error, so report the
    # API response explicitly, as is done when creating a pipeline.
    if not r.ok:
        raise Exception(f"❌ Error occurred while fetching pipeline. API response: {r.text}")

    # Parse the body once and reuse it for both the message and the return value.
    pipeline_details = r.json()
    print(
        f"🎉 Pipeline loaded successfully! Pipeline url: https://app.embedchain.ai/pipelines/{pipeline_details['id']}\n"  # noqa: E501
    )
    return pipeline_details
|
||||||
|
|
||||||
def _create_pipeline(self):
|
def _create_pipeline(self):
|
||||||
"""
|
"""
|
||||||
Create a pipeline on the platform.
|
Create a pipeline on the platform.
|
||||||
@@ -154,9 +195,14 @@ class Pipeline(EmbedChain):
|
|||||||
if r.status_code not in [200, 201]:
|
if r.status_code not in [200, 201]:
|
||||||
raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}")
|
raise Exception(f"❌ Error occurred while creating pipeline. API response: {r.text}")
|
||||||
|
|
||||||
print(
|
if r.status_code == 200:
|
||||||
f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501
|
print(
|
||||||
)
|
f"🎉🎉🎉 Existing pipeline found! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501
|
||||||
|
) # noqa: E501
|
||||||
|
elif r.status_code == 201:
|
||||||
|
print(
|
||||||
|
f"🎉🎉🎉 Pipeline created successfully! View your pipeline: https://app.embedchain.ai/pipelines/{r.json()['id']}\n" # noqa: E501
|
||||||
|
)
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
def _get_presigned_url(self, data_type, data_value):
|
def _get_presigned_url(self, data_type, data_value):
|
||||||
@@ -257,7 +303,7 @@ class Pipeline(EmbedChain):
|
|||||||
self.id = pipeline_data["id"]
|
self.id = pipeline_data["id"]
|
||||||
|
|
||||||
results = self.cursor.execute(
|
results = self.cursor.execute(
|
||||||
"SELECT * FROM data_sources WHERE pipeline_id = ? AND is_uploaded = 0", (self.local_id,)
|
"SELECT * FROM data_sources WHERE pipeline_id = ? AND is_uploaded = 0", (self.local_id,) # noqa:E501
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
if len(results) > 0:
|
if len(results) > 0:
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "embedchain"
|
name = "embedchain"
|
||||||
version = "0.0.78"
|
version = "0.0.79"
|
||||||
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
||||||
authors = ["Taranjeet Singh, Deshraj Yadav"]
|
authors = ["Taranjeet Singh, Deshraj Yadav"]
|
||||||
license = "Apache License"
|
license = "Apache License"
|
||||||
|
|||||||
Reference in New Issue
Block a user