import os
import shutil
import time

from langchain.document_loaders import GitLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from parsers.common import file_already_exists_from_content
from utils.common import CommonsDep
from utils.file import compute_sha1_from_content
from utils.vectors import create_vector


async def process_github(commons: CommonsDep, repo, enable_summarization, user, supabase, user_openai_api_key):
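    """Clone a GitHub repository, chunk its files, and store embeddings.

    Clones `repo` into a temporary directory, splits each loaded file into
    token-based chunks, skips binary and bookkeeping file types, and creates
    a vector for every chunk the user has not already uploaded.
    """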
    # Clone the repo into a uniquely named temp directory to avoid collisions.
    random_dir_name = os.urandom(16).hex()
    repo_path = "/tmp/" + random_dir_name
    dateshort = time.strftime("%Y%m%d")
    loader = GitLoader(
        clone_url=repo,
        repo_path=repo_path,
    )
    documents = loader.load()
    # Delete the clone once its documents are loaded into memory.
    shutil.rmtree(repo_path, ignore_errors=True)

    # Chunk by token count so each piece stays within embedding-model limits.
    chunk_size = 500
    chunk_overlap = 0
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    documents = text_splitter.split_documents(documents)
    print(documents[:1])

    for doc in documents:
        # Skip binary, lock, and git bookkeeping files.
        if doc.metadata["file_type"] in [
            ".pyc", ".png", ".svg", ".env", ".lock", ".gitignore",
            ".gitmodules", ".gitattributes", ".gitkeep", ".git", ".json",
        ]:
            continue
        metadata = {
            "file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),
            # Character count * 8, as in the original: an approximate size in bits.
            "file_size": len(doc.page_content) * 8,
            "file_name": doc.metadata["file_name"],
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "date": dateshort,
            "summarization": "true" if enable_summarization else "false",
        }
        doc_with_metadata = Document(
            page_content=doc.page_content, metadata=metadata
        )
        # Only create a vector if this exact content isn't already stored for the user.
        exist = await file_already_exists_from_content(
            supabase, doc.page_content.encode("utf-8"), user
        )
        if not exist:
            create_vector(commons, user.email, doc_with_metadata, user_openai_api_key)
            print("Created vector for ", doc.metadata["file_name"])

    return {"message": f"✅ GitHub repo with {len(documents)} files has been uploaded.", "type": "success"}
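

# A minimal usage sketch (not part of the original module): `commons`,
# `current_user`, and `supabase_client` are assumed to come from the
# application's dependency wiring, and the repo URL is illustrative.
#
#     result = await process_github(
#         commons,
#         repo="https://github.com/example/repo",
#         enable_summarization=False,
#         user=current_user,
#         supabase=supabase_client,
#         user_openai_api_key=None,
#     )
#     print(result["message"])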