From 225280f2f5d7450b3ae9ea8f044b6ecc2fc4151b Mon Sep 17 00:00:00 2001
From: Stan Girard
Date: Mon, 19 Jun 2023 11:23:58 +0200
Subject: [PATCH] fix(doc): retrieval to 8 because it takes a long time

---
 backend/llm/qa.py              | 2 +-
 backend/parsers/github.py      | 2 +-
 backend/routes/crawl_routes.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/llm/qa.py b/backend/llm/qa.py
index d7fa03716..ccab97eb6 100644
--- a/backend/llm/qa.py
+++ b/backend/llm/qa.py
@@ -32,7 +32,7 @@ class CustomSupabaseVectorStore(SupabaseVectorStore):
         query: str,
         user_id: str = "none",
         table: str = "match_vectors",
-        k: int = 16,
+        k: int = 8,
         threshold: float = 0.5,
         **kwargs: Any
     ) -> List[Document]:
diff --git a/backend/parsers/github.py b/backend/parsers/github.py
index 3c156819f..e1d0a4dc1 100644
--- a/backend/parsers/github.py
+++ b/backend/parsers/github.py
@@ -29,7 +29,7 @@ async def process_github(commons: CommonsDep, repo, enable_summarization, user,
     print(documents[:1])

     for doc in documents:
-        if doc.metadata["file_type"] in [".pyc", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git"]:
+        if doc.metadata["file_type"] in [".pyc",".png",".svg", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git", ".json"]:
             continue
         metadata = {
             "file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),
diff --git a/backend/routes/crawl_routes.py b/backend/routes/crawl_routes.py
index 314d1b5f2..c288c4f60 100644
--- a/backend/routes/crawl_routes.py
+++ b/backend/routes/crawl_routes.py
@@ -57,4 +57,4 @@ async def crawl_endpoint(request: Request,commons: CommonsDep, crawl_website: Cr
         message = await filter_file(commons, file, enable_summarization, user=current_user, openai_api_key=request.headers.get('Openai-Api-Key', None))
         return message
     else:
-        message = await process_github(crawl_website.url, "false", user=current_user, supabase=commons['supabase'], user_openai_api_key=request.headers.get('Openai-Api-Key', None))
+        message = await process_github(commons,crawl_website.url, "false", user=current_user, supabase=commons['supabase'], user_openai_api_key=request.headers.get('Openai-Api-Key', None))
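
For context on the first hunk, a minimal sketch of what the new default implies, not the Quivr code itself: the real CustomSupabaseVectorStore.similarity_search queries Supabase, so the mock Document class, the scores, and the similarity_search_mock function below are illustrative assumptions, kept only to show that the result cap drops from 16 to 8 matches unless a caller overrides k.

# Illustrative only -- a self-contained mock, not the Quivr CustomSupabaseVectorStore.
# It mirrors the patched default (k reduced from 16 to 8) to show that fewer
# matches come back unless a caller passes k explicitly.
from dataclasses import dataclass
from typing import List


@dataclass
class Document:
    page_content: str
    score: float


def similarity_search_mock(
    query: str,
    candidates: List[Document],
    k: int = 8,            # patched default; was 16 before this change
    threshold: float = 0.5,
) -> List[Document]:
    """Return at most k candidates whose score clears the threshold."""
    passing = [doc for doc in candidates if doc.score >= threshold]
    passing.sort(key=lambda doc: doc.score, reverse=True)
    return passing[:k]


if __name__ == "__main__":
    docs = [Document(f"chunk {i}", score=0.4 + i * 0.05) for i in range(12)]
    print(len(similarity_search_mock("what is quivr?", docs)))        # 8: capped by the new default
    print(len(similarity_search_mock("what is quivr?", docs, k=16)))  # 10: explicit k restores the old cap

Returning fewer chunks shrinks the context passed to the model on each question, which is presumably the latency concern named in the commit subject.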