feat(github): new github loader

Stan Girard 2023-06-06 00:36:25 +02:00
parent 000933f5e0
commit 6a591cc92c
4 changed files with 107 additions and 15 deletions

View File

@@ -1,10 +1,11 @@
import requests
from pydantic import BaseModel
import requests
import re
import unicodedata
import tempfile
import os
import re
import tempfile
import unicodedata
import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel
class CrawlWebsite(BaseModel):
@@ -23,6 +24,7 @@ class CrawlWebsite(BaseModel):
    def process(self):
        content = self._crawl(self.url)
        ## Create a file
        file_name = slugify(self.url) + ".html"
        temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
@@ -34,6 +36,12 @@ class CrawlWebsite(BaseModel):
            return temp_file_path, file_name
        else:
            return None

    def checkGithub(self):
        if "github.com" in self.url:
            return True
        else:
            return False

def slugify(text):
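
Reviewer note: the new checkGithub() method is a plain substring test on the URL, which the /crawl endpoint below uses to decide between the HTML crawler and the Git-based loader. A minimal usage sketch (hypothetical URL, assuming url is the only required field of CrawlWebsite):

    crawler = CrawlWebsite(url="https://github.com/StanGirard/quivr")
    if crawler.checkGithub():
        # hand the URL to the Git-based loader instead of crawling HTML
        pass
    else:
        file_path, file_name = crawler.process()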

View File

@@ -13,6 +13,7 @@ from logger import get_logger
from middlewares.cors import add_cors_middleware
from models.chats import ChatMessage
from models.users import User
from parsers.github import process_github
from utils.file import convert_bytes, get_file_size
from utils.processors import filter_file
from utils.vectors import (CommonsDep, create_user, similarity_search,
@@ -114,18 +115,42 @@ async def chat_endpoint(commons: CommonsDep, chat_message: ChatMessage, credenti
@app.post("/crawl/", dependencies=[Depends(JWTBearer())])
async def crawl_endpoint(commons: CommonsDep, crawl_website: CrawlWebsite, enable_summarization: bool = False, credentials: dict = Depends(JWTBearer())):
max_brain_size = os.getenv("MAX_BRAIN_SIZE")
user = User(email=credentials.get('email', 'none'))
file_path, file_name = crawl_website.process()
user_vectors_response = commons['supabase'].table("vectors").select(
"name:metadata->>file_name, size:metadata->>file_size", count="exact") \
.filter("user_id", "eq", user.email)\
.execute()
documents = user_vectors_response.data # Access the data from the response
# Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
user_unique_vectors = [dict(t) for t in set(tuple(d.items()) for d in documents)]
# Create a SpooledTemporaryFile from the file_path
spooled_file = SpooledTemporaryFile()
with open(file_path, 'rb') as f:
shutil.copyfileobj(f, spooled_file)
current_brain_size = sum(float(doc['size']) for doc in user_unique_vectors)
# Pass the SpooledTemporaryFile to UploadFile
file = UploadFile(file=spooled_file, filename=file_name)
message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
return message
file_size = 1000000
remaining_free_space = float(max_brain_size) - (current_brain_size)
if remaining_free_space - file_size < 0:
message = {"message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is : {convert_bytes(remaining_free_space)}", "type": "error"}
else:
user = User(email=credentials.get('email', 'none'))
if not crawl_website.checkGithub():
file_path, file_name = crawl_website.process()
# Create a SpooledTemporaryFile from the file_path
spooled_file = SpooledTemporaryFile()
with open(file_path, 'rb') as f:
shutil.copyfileobj(f, spooled_file)
# Pass the SpooledTemporaryFile to UploadFile
file = UploadFile(file=spooled_file, filename=file_name)
message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
return message
else:
message = await process_github(crawl_website.url, "false", user=user, supabase=commons['supabase'])
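
Reviewer note on the deduplication one-liner above: each row dict is turned into a tuple of its items so it becomes hashable, the set drops exact duplicates, and the dicts are rebuilt afterwards. A small self-contained illustration with made-up rows:

    documents = [{"name": "a.md", "size": "10"}, {"name": "a.md", "size": "10"}]
    user_unique_vectors = [dict(t) for t in set(tuple(d.items()) for d in documents)]
    # -> [{"name": "a.md", "size": "10"}]

The capacity gate then compares remaining_free_space (MAX_BRAIN_SIZE minus the summed sizes) against a fixed 1000000-byte estimate, since the crawl's real size is not known up front.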
@app.get("/explore", dependencies=[Depends(JWTBearer())])

View File

@@ -65,3 +65,9 @@ async def file_already_exists(supabase, file, user):
    response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
        .filter("user_id", "eq", user.email).execute()
    return len(response.data) > 0

async def file_already_exists_from_content(supabase, file_content, user):
    file_sha1 = compute_sha1_from_content(file_content)
    response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
        .filter("user_id", "eq", user.email).execute()
    return len(response.data) > 0
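
Reviewer note: file_already_exists_from_content lets the GitHub parser deduplicate by hashing raw chunk content rather than an on-disk file. The shared helper is presumably a thin hashlib wrapper; a sketch under that assumption:

    import hashlib

    def compute_sha1_from_content(content: bytes) -> str:
        # assumed shape of the shared helper: SHA-1 hex digest of the raw bytes
        return hashlib.sha1(content).hexdigest()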

backend/parsers/github.py Normal file (53 lines added)
View File

@@ -0,0 +1,53 @@
import os
import time
from fastapi import UploadFile
from langchain.document_loaders import GitLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parsers.common import file_already_exists_from_content
from utils.file import compute_sha1_from_content, compute_sha1_from_file
from utils.vectors import create_summary, create_vector, documents_vector_store
from .common import process_file
async def process_github(repo, enable_summarization, user, supabase):
    # Clone the repository into a random temporary directory
    random_dir_name = os.urandom(16).hex()
    dateshort = time.strftime("%Y%m%d")
    loader = GitLoader(
        clone_url=repo,
        repo_path="/tmp/" + random_dir_name,
    )
    documents = loader.load()
    os.system("rm -rf /tmp/" + random_dir_name)

    # Split every file into token-based chunks
    chunk_size = 500
    chunk_overlap = 0
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)
    print(documents[:1])

    for doc in documents:
        # Skip compiled, lock and VCS bookkeeping files
        if doc.metadata["file_type"] in [".pyc", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git"]:
            continue
        metadata = {
            "file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),
            "file_size": len(doc.page_content) * 8,
            "file_name": doc.metadata["file_name"],
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "date": dateshort,
            "summarization": "true" if enable_summarization else "false"
        }
        doc_with_metadata = Document(
            page_content=doc.page_content, metadata=metadata)
        # Only store chunks whose content has not been vectorized before
        exist = await file_already_exists_from_content(supabase, doc.page_content.encode("utf-8"), user)
        if not exist:
            create_vector(user.email, doc_with_metadata)
            print("Created vector for ", doc.metadata["file_name"])

    return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}