Add folder and branch to GitHub (#1308)
This commit is contained in:
@@ -29,11 +29,13 @@ response = app.query("What is Embedchain?")
|
|||||||
```
|
```
|
||||||
The `add` function of the app will accept any valid github query with qualifiers. It only supports loading github code, repository, issues and pull-requests.
|
The `add` function of the app will accept any valid github query with qualifiers. It only supports loading github code, repository, issues and pull-requests.
|
||||||
<Note>
|
<Note>
|
||||||
You must provide qualifiers `type:` and `repo:` in the query. The `type:` qualifier can be a combination of `code`, `repo`, `pr`, `issue`. The `repo:` qualifier must be a valid github repository name.
|
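You must provide the qualifiers `type:` and `repo:` in the query. The `type:` qualifier can be a combination of `code`, `repo`, `pr`, `issue`, `branch`, `file`. The `repo:` qualifier must be a valid GitHub repository name. When using `type:branch`, also provide the branch with the `name:` qualifier; when using `type:file`, also provide the file location with the `path:` qualifier.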
|
||||||
</Note>
|
</Note>
|
||||||
|
|
||||||
<Card title="Valid queries" icon="lightbulb" iconType="duotone" color="#ca8b04">
|
<Card title="Valid queries" icon="lightbulb" iconType="duotone" color="#ca8b04">
|
||||||
- `repo:embedchain/embedchain type:repo` - to load the repository
|
- `repo:embedchain/embedchain type:repo` - to load the repository
|
||||||
|
- `repo:embedchain/embedchain type:branch name:feature_test` - to load the `feature_test` branch of the repository
|
||||||
|
- `repo:embedchain/embedchain type:file path:README.md` - to load a specific file (here `README.md`) from the repository
|
||||||
- `repo:embedchain/embedchain type:issue,pr` - to load the issues and pull-requests of the repository
|
- `repo:embedchain/embedchain type:issue,pr` - to load the issues and pull-requests of the repository
|
||||||
- `repo:embedchain/embedchain type:issue state:closed` - to load the closed issues of the repository
|
- `repo:embedchain/embedchain type:issue state:closed` - to load the closed issues of the repository
|
||||||
</Card>
|
</Card>
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import shlex
|
import shlex
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
@@ -14,7 +13,7 @@ from embedchain.utils.misc import clean_string
|
|||||||
GITHUB_URL = "https://github.com"
|
GITHUB_URL = "https://github.com"
|
||||||
GITHUB_API_URL = "https://api.github.com"
|
GITHUB_API_URL = "https://api.github.com"
|
||||||
|
|
||||||
VALID_SEARCH_TYPES = set(["code", "repo", "pr", "issue", "discussion"])
|
VALID_SEARCH_TYPES = set(["code", "repo", "pr", "issue", "discussion", "branch", "file"])
|
||||||
|
|
||||||
|
|
||||||
class GithubLoader(BaseLoader):
|
class GithubLoader(BaseLoader):
|
||||||
@@ -66,85 +65,56 @@ class GithubLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@staticmethod
|
def _get_github_repo_data(self, repo_name: str, branch_name: str = None, file_path: str = None) -> list[dict]:
|
||||||
def _get_github_repo_data(repo_url: str):
|
"""Get file contents from Repo"""
|
||||||
local_hash = hashlib.sha256(repo_url.encode()).hexdigest()
|
|
||||||
local_path = f"/tmp/{local_hash}"
|
|
||||||
data = []
|
data = []
|
||||||
|
|
||||||
def _get_repo_tree(repo_url: str, local_path: str):
|
repo = self.client.get_repo(repo_name)
|
||||||
try:
|
repo_contents = repo.get_contents("")
|
||||||
from git import Repo
|
|
||||||
except ImportError as e:
|
|
||||||
raise ValueError(
|
|
||||||
"GithubLoader requires extra dependencies. Install with `pip install --upgrade 'embedchain[github]'`" # noqa: E501
|
|
||||||
) from e
|
|
||||||
|
|
||||||
if os.path.exists(local_path):
|
if branch_name:
|
||||||
logging.info("Repository already exists. Fetching updates...")
|
repo_contents = repo.get_contents("", ref=branch_name)
|
||||||
repo = Repo(local_path)
|
if file_path:
|
||||||
logging.info("Fetch completed.")
|
repo_contents = [repo.get_contents(file_path)]
|
||||||
else:
|
|
||||||
logging.info("Cloning repository...")
|
|
||||||
repo = Repo.clone_from(repo_url, local_path)
|
|
||||||
logging.info("Clone completed.")
|
|
||||||
return repo.head.commit.tree
|
|
||||||
|
|
||||||
def _get_repo_tree_contents(repo_path, tree, progress_bar):
|
with tqdm(desc="Loading files:", unit="item") as progress_bar:
|
||||||
for subtree in tree:
|
while repo_contents:
|
||||||
if subtree.type == "tree":
|
file_content = repo_contents.pop(0)
|
||||||
_get_repo_tree_contents(repo_path, subtree, progress_bar)
|
if file_content.type == "dir":
|
||||||
else:
|
|
||||||
assert subtree.type == "blob"
|
|
||||||
try:
|
try:
|
||||||
contents = subtree.data_stream.read().decode("utf-8")
|
repo_contents.extend(repo.get_contents(file_content.path))
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning(f"Failed to read file: {subtree.path}")
|
logging.warning(f"Failed to read directory: {file_content.path}")
|
||||||
progress_bar.update(1) if progress_bar else None
|
progress_bar.update(1)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
file_text = file_content.decoded_content.decode()
|
||||||
|
except Exception:
|
||||||
|
logging.warning(f"Failed to read file: {file_content.path}")
|
||||||
|
progress_bar.update(1)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
url = f"{repo_url.rstrip('.git')}/blob/main/{subtree.path}"
|
file_path = file_content.path
|
||||||
data.append(
|
data.append(
|
||||||
{
|
{
|
||||||
"content": clean_string(contents),
|
"content": clean_string(file_text),
|
||||||
"meta_data": {
|
"meta_data": {
|
||||||
"url": url,
|
"path": file_path,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if progress_bar is not None:
|
|
||||||
progress_bar.update(1)
|
|
||||||
|
|
||||||
repo_tree = _get_repo_tree(repo_url, local_path)
|
progress_bar.update(1)
|
||||||
tree_list = list(repo_tree.traverse())
|
|
||||||
with tqdm(total=len(tree_list), desc="Loading files:", unit="item") as progress_bar:
|
|
||||||
_get_repo_tree_contents(local_path, repo_tree, progress_bar)
|
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _github_search_repo(self, query: str) -> list[dict]:
|
def _github_search_repo(self, query: str) -> list[dict]:
|
||||||
"""Search GitHub repo."""
|
"""Search GitHub repo."""
|
||||||
data = []
|
|
||||||
logging.info(f"Searching github repos with query: {query}")
|
|
||||||
results = self.client.search_repositories(query)
|
|
||||||
# Add repo urls and descriptions
|
|
||||||
urls = list(map(lambda x: x.html_url, results))
|
|
||||||
descriptions = list(map(lambda x: x.description, results))
|
|
||||||
data.append(
|
|
||||||
{
|
|
||||||
"content": clean_string(desc),
|
|
||||||
"meta_data": {
|
|
||||||
"url": url,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
for url, desc in zip(urls, descriptions)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add repo contents
|
logging.info(f"Searching github repos with query: {query}")
|
||||||
for result in results:
|
updated_query = query.split(":")[-1]
|
||||||
clone_url = result.clone_url
|
data = self._get_github_repo_data(updated_query)
|
||||||
logging.info(f"Cloning repository: {clone_url}")
|
|
||||||
data = self._get_github_repo_data(clone_url)
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _github_search_issues_and_pr(self, query: str, type: str) -> list[dict]:
|
def _github_search_issues_and_pr(self, query: str, type: str) -> list[dict]:
|
||||||
@@ -222,6 +192,43 @@ class GithubLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def _get_github_repo_branch(self, query: str, type: str) -> list[dict]:
|
||||||
|
"""Get file contents for specific branch"""
|
||||||
|
|
||||||
|
logging.info(f"Searching github repo for query: {query} is:{type}")
|
||||||
|
pattern = r"repo:(\S+) name:(\S+)"
|
||||||
|
match = re.search(pattern, query)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
repo_name = match.group(1)
|
||||||
|
branch_name = match.group(2)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"Repository name and Branch name not found, instead found this \
|
||||||
|
Repo: {repo_name}, Branch: {branch_name}"
|
||||||
|
)
|
||||||
|
|
||||||
|
data = self._get_github_repo_data(repo_name=repo_name, branch_name=branch_name)
|
||||||
|
return data
|
||||||
|
|
||||||
|
def _get_github_repo_file(self, query: str, type: str) -> list[dict]:
|
||||||
|
"""Get specific file content"""
|
||||||
|
|
||||||
|
logging.info(f"Searching github repo for query: {query} is:{type}")
|
||||||
|
pattern = r"repo:(\S+) path:(\S+)"
|
||||||
|
match = re.search(pattern, query)
|
||||||
|
|
||||||
|
if match:
|
||||||
|
repo_name = match.group(1)
|
||||||
|
file_path = match.group(2)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"Repository name and File name not found, instead found this Repo: {repo_name}, File: {file_path}"
|
||||||
|
)
|
||||||
|
|
||||||
|
data = self._get_github_repo_data(repo_name=repo_name, file_path=file_path)
|
||||||
|
return data
|
||||||
|
|
||||||
def _search_github_data(self, search_type: str, query: str):
|
def _search_github_data(self, search_type: str, query: str):
|
||||||
"""Search github data."""
|
"""Search github data."""
|
||||||
if search_type == "code":
|
if search_type == "code":
|
||||||
@@ -232,6 +239,10 @@ class GithubLoader(BaseLoader):
|
|||||||
data = self._github_search_issues_and_pr(query, search_type)
|
data = self._github_search_issues_and_pr(query, search_type)
|
||||||
elif search_type == "pr":
|
elif search_type == "pr":
|
||||||
data = self._github_search_issues_and_pr(query, search_type)
|
data = self._github_search_issues_and_pr(query, search_type)
|
||||||
|
elif search_type == "branch":
|
||||||
|
data = self._get_github_repo_branch(query, search_type)
|
||||||
|
elif search_type == "file":
|
||||||
|
data = self._get_github_repo_file(query, search_type)
|
||||||
elif search_type == "discussion":
|
elif search_type == "discussion":
|
||||||
raise ValueError("GithubLoader does not support searching discussions yet.")
|
raise ValueError("GithubLoader does not support searching discussions yet.")
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user