Add folder and branch to GitHub (#1308)

This commit is contained in:
Dev Khant
2024-03-13 00:45:37 +05:30
committed by GitHub
parent f77f5b996e
commit 117824b32c
2 changed files with 74 additions and 61 deletions

View File

@@ -29,11 +29,13 @@ response = app.query("What is Embedchain?")
``` ```
The `add` function of the app will accept any valid github query with qualifiers. It only supports loading github code, repository, issues and pull-requests. The `add` function of the app will accept any valid github query with qualifiers. It only supports loading github code, repository, issues and pull-requests.
<Note> <Note>
You must provide qualifiers `type:` and `repo:` in the query. The `type:` qualifier can be a combination of `code`, `repo`, `pr`, `issue`. The `repo:` qualifier must be a valid github repository name. You must provide qualifiers `type:` and `repo:` in the query. The `type:` qualifier can be a combination of `code`, `repo`, `pr`, `issue`, `branch`, `file`. The `repo:` qualifier must be a valid github repository name.
</Note> </Note>
<Card title="Valid queries" icon="lightbulb" iconType="duotone" color="#ca8b04"> <Card title="Valid queries" icon="lightbulb" iconType="duotone" color="#ca8b04">
- `repo:embedchain/embedchain type:repo` - to load the repository - `repo:embedchain/embedchain type:repo` - to load the repository
- `repo:embedchain/embedchain type:branch name:feature_test` - to load the files from a specific branch (here `feature_test`) of the repository
- `repo:embedchain/embedchain type:file path:README.md` - to load a specific file (here `README.md`) from the repository
- `repo:embedchain/embedchain type:issue,pr` - to load the issues and pull-requests of the repository - `repo:embedchain/embedchain type:issue,pr` - to load the issues and pull-requests of the repository
- `repo:embedchain/embedchain type:issue state:closed` - to load the closed issues of the repository - `repo:embedchain/embedchain type:issue state:closed` - to load the closed issues of the repository
</Card> </Card>

View File

@@ -1,7 +1,6 @@
import concurrent.futures import concurrent.futures
import hashlib import hashlib
import logging import logging
import os
import re import re
import shlex import shlex
from typing import Any, Optional from typing import Any, Optional
@@ -14,7 +13,7 @@ from embedchain.utils.misc import clean_string
GITHUB_URL = "https://github.com" GITHUB_URL = "https://github.com"
GITHUB_API_URL = "https://api.github.com" GITHUB_API_URL = "https://api.github.com"
VALID_SEARCH_TYPES = set(["code", "repo", "pr", "issue", "discussion"]) VALID_SEARCH_TYPES = set(["code", "repo", "pr", "issue", "discussion", "branch", "file"])
class GithubLoader(BaseLoader): class GithubLoader(BaseLoader):
@@ -66,85 +65,56 @@ class GithubLoader(BaseLoader):
) )
return data return data
def _get_github_repo_data(
    self, repo_name: str, branch_name: Optional[str] = None, file_path: Optional[str] = None
) -> list[dict]:
    """Load the text content of files from a GitHub repository.

    Starting from ``file_path`` (or the repository root when it is not
    given), every directory encountered is expanded and every file is
    decoded and cleaned.

    Args:
        repo_name: Repository identifier passed to ``self.client.get_repo``.
        branch_name: Optional branch to read from. When omitted, no ``ref``
            is sent and the API's default applies.
        file_path: Optional path to start from instead of the root.

    Returns:
        One dict per readable file, shaped as
        ``{"content": <cleaned text>, "meta_data": {"path": <path in repo>}}``.
        Entries that cannot be listed or decoded are logged and skipped.
    """
    repo = self.client.get_repo(repo_name)

    # Only send ``ref`` when a branch was requested.
    # NOTE(review): assumes ``self.client`` is a PyGithub client, whose
    # ``get_contents`` does not accept ``ref=None`` - confirm.
    ref_kwargs = {"ref": branch_name} if branch_name else {}

    # Fetch the starting point once, honouring both the branch and the path.
    root = repo.get_contents(file_path or "", **ref_kwargs)
    # A directory yields a list of entries; a single file yields one object.
    pending = list(root) if isinstance(root, list) else [root]

    data = []
    with tqdm(desc="Loading files:", unit="item") as progress_bar:
        while pending:
            entry = pending.pop(0)
            if entry.type == "dir":
                try:
                    # Keep the same ref while descending so nested files come
                    # from the requested branch rather than the default one.
                    pending.extend(repo.get_contents(entry.path, **ref_kwargs))
                except Exception:
                    # Best effort: one unreadable directory must not abort the load.
                    logging.warning(f"Failed to read directory: {entry.path}")
            else:
                try:
                    file_text = entry.decoded_content.decode()
                except Exception:
                    # Typically binary or otherwise undecodable content; skip it.
                    logging.warning(f"Failed to read file: {entry.path}")
                else:
                    data.append(
                        {
                            "content": clean_string(file_text),
                            "meta_data": {
                                "path": entry.path,
                            },
                        }
                    )
            # Every processed entry counts as one item, readable or not.
            progress_bar.update(1)
    return data
def _github_search_repo(self, query: str) -> list[dict]: def _github_search_repo(self, query: str) -> list[dict]:
"""Search GitHub repo.""" """Search GitHub repo."""
data = []
logging.info(f"Searching github repos with query: {query}")
results = self.client.search_repositories(query)
# Add repo urls and descriptions
urls = list(map(lambda x: x.html_url, results))
descriptions = list(map(lambda x: x.description, results))
data.append(
{
"content": clean_string(desc),
"meta_data": {
"url": url,
},
}
for url, desc in zip(urls, descriptions)
)
# Add repo contents logging.info(f"Searching github repos with query: {query}")
for result in results: updated_query = query.split(":")[-1]
clone_url = result.clone_url data = self._get_github_repo_data(updated_query)
logging.info(f"Cloning repository: {clone_url}")
data = self._get_github_repo_data(clone_url)
return data return data
def _github_search_issues_and_pr(self, query: str, type: str) -> list[dict]: def _github_search_issues_and_pr(self, query: str, type: str) -> list[dict]:
@@ -222,6 +192,43 @@ class GithubLoader(BaseLoader):
) )
return data return data
def _get_github_repo_branch(self, query: str, type: str) -> list[dict]:
"""Get file contents for specific branch"""
logging.info(f"Searching github repo for query: {query} is:{type}")
pattern = r"repo:(\S+) name:(\S+)"
match = re.search(pattern, query)
if match:
repo_name = match.group(1)
branch_name = match.group(2)
else:
raise ValueError(
f"Repository name and Branch name not found, instead found this \
Repo: {repo_name}, Branch: {branch_name}"
)
data = self._get_github_repo_data(repo_name=repo_name, branch_name=branch_name)
return data
def _get_github_repo_file(self, query: str, type: str) -> list[dict]:
"""Get specific file content"""
logging.info(f"Searching github repo for query: {query} is:{type}")
pattern = r"repo:(\S+) path:(\S+)"
match = re.search(pattern, query)
if match:
repo_name = match.group(1)
file_path = match.group(2)
else:
raise ValueError(
f"Repository name and File name not found, instead found this Repo: {repo_name}, File: {file_path}"
)
data = self._get_github_repo_data(repo_name=repo_name, file_path=file_path)
return data
def _search_github_data(self, search_type: str, query: str): def _search_github_data(self, search_type: str, query: str):
"""Search github data.""" """Search github data."""
if search_type == "code": if search_type == "code":
@@ -232,6 +239,10 @@ class GithubLoader(BaseLoader):
data = self._github_search_issues_and_pr(query, search_type) data = self._github_search_issues_and_pr(query, search_type)
elif search_type == "pr": elif search_type == "pr":
data = self._github_search_issues_and_pr(query, search_type) data = self._github_search_issues_and_pr(query, search_type)
elif search_type == "branch":
data = self._get_github_repo_branch(query, search_type)
elif search_type == "file":
data = self._get_github_repo_file(query, search_type)
elif search_type == "discussion": elif search_type == "discussion":
raise ValueError("GithubLoader does not support searching discussions yet.") raise ValueError("GithubLoader does not support searching discussions yet.")
else: else: