fix(retrieval): reduce default similarity-search k from 16 to 8 because retrieval takes too long

This commit is contained in:
Stan Girard 2023-06-19 11:23:58 +02:00
parent f21630c70d
commit 225280f2f5
3 changed files with 3 additions and 3 deletions

View File

@ -32,7 +32,7 @@ class CustomSupabaseVectorStore(SupabaseVectorStore):
query: str,
user_id: str = "none",
table: str = "match_vectors",
k: int = 16,
k: int = 8,
threshold: float = 0.5,
**kwargs: Any
) -> List[Document]:

View File

@ -29,7 +29,7 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, user,
print(documents[:1])
for doc in documents:
if doc.metadata["file_type"] in [".pyc", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git"]:
if doc.metadata["file_type"] in [".pyc",".png",".svg", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git", ".json"]:
continue
metadata = {
"file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),

View File

@ -57,4 +57,4 @@ async def crawl_endpoint(request: Request,commons: CommonsDep, crawl_website: Cr
message = await filter_file(commons, file, enable_summarization, user=current_user, openai_api_key=request.headers.get('Openai-Api-Key', None))
return message
else:
message = await process_github(crawl_website.url, "false", user=current_user, supabase=commons['supabase'], user_openai_api_key=request.headers.get('Openai-Api-Key', None))
message = await process_github(commons,crawl_website.url, "false", user=current_user, supabase=commons['supabase'], user_openai_api_key=request.headers.get('Openai-Api-Key', None))