From 66bb4b1867847fd605ea1bafd76c55d905e9a69c Mon Sep 17 00:00:00 2001
From: Stan Girard
Date: Sun, 28 Jan 2024 16:01:54 -0800
Subject: [PATCH] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20sources?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

moved to own function
---
 backend/llm/knowledge_brain_qa.py            | 118 ++++++++++---------
 backend/llm/tests/test_knowledge_brain_qa.py |  31 +++++
 2 files changed, 92 insertions(+), 57 deletions(-)
 create mode 100644 backend/llm/tests/test_knowledge_brain_qa.py

diff --git a/backend/llm/knowledge_brain_qa.py b/backend/llm/knowledge_brain_qa.py
index 1987a8d73..8acf3ae4e 100644
--- a/backend/llm/knowledge_brain_qa.py
+++ b/backend/llm/knowledge_brain_qa.py
@@ -38,6 +38,59 @@ def is_valid_uuid(uuid_to_test, version=4):
     return str(uuid_obj) == uuid_to_test
 
 
+def generate_source(result, brain):
+    # Initialize an empty list for sources
+    sources_list: List[Sources] = []
+
+    # Get source documents from the result, default to an empty list if not found
+    source_documents = result.get("source_documents", [])
+
+    # If source documents exist
+    if source_documents:
+        logger.info(f"Source documents found: {source_documents}")
+        # Iterate over each document
+        for doc in source_documents:
+            # Check whether the document's original_file_name points to a URL
+            logger.info(f"Metadata: {doc['metadata']}")
+            is_url = (
+                "original_file_name" in doc["metadata"]
+                and doc["metadata"]["original_file_name"] is not None
+                and doc["metadata"]["original_file_name"].startswith("http")
+            )
+            logger.info(f"Is URL: {is_url}")
+
+            # Determine the name based on whether it's a URL or a file
+            name = (
+                doc["metadata"]["original_file_name"]
+                if is_url
+                else doc["metadata"]["file_name"]
+            )
+
+            # Determine the type based on whether it's a URL or a file
+            type_ = "url" if is_url else "file"
+
+            # Determine the source URL based on whether it's a URL or a file
+            if is_url:
+                source_url = doc["metadata"]["original_file_name"]
+            else:
+                source_url = generate_file_signed_url(
+                    f"{brain.brain_id}/{doc['metadata']['file_name']}"
+                ).get("signedURL", "")
+
+            # Append a new Sources object to the list
+            sources_list.append(
+                Sources(
+                    name=name,
+                    type=type_,
+                    source_url=source_url,
+                    original_file_name=name,
+                )
+            )
+    else:
+        logger.info("No source documents found or source_documents is not a list.")
+    return sources_list
+
+
 class KnowledgeBrainQA(BaseModel, QAInterface):
     """
     Main class for the Brain Picking functionality.
@@ -304,64 +357,15 @@ class KnowledgeBrainQA(BaseModel, QAInterface):
 
             # Await the run
             result = await run
-            # Initialize an empty list for sources
-            sources_list: List[Sources] = []
+            sources_list = generate_source(result, brain)
+            # Create metadata if it doesn't exist
+            if not streamed_chat_history.metadata:
+                streamed_chat_history.metadata = {}
+            # Serialize the sources list
+            serialized_sources_list = [source.dict() for source in sources_list]
+            streamed_chat_history.metadata["sources"] = serialized_sources_list
+            yield f"data: {json.dumps(streamed_chat_history.dict())}"
-
-            # Get source documents from the result, default to an empty list if not found
-            source_documents = result.get("source_documents", [])
-
-            # If source documents exist
-            if source_documents:
-                logger.info(f"Source documents found: {source_documents}")
-                # Iterate over each document
-                for doc in source_documents:
-                    # Check if 'url' is in the document metadata
-                    logger.info(f"Metadata: {doc.metadata}")
-                    is_url = (
-                        "original_file_name" in doc.metadata
-                        and doc.metadata["original_file_name"] is not None
-                        and doc.metadata["original_file_name"].startswith("http")
-                    )
-                    logger.info(f"Is URL: {is_url}")
-
-                    # Determine the name based on whether it's a URL or a file
-                    name = (
-                        doc.metadata["original_file_name"]
-                        if is_url
-                        else doc.metadata["file_name"]
-                    )
-
-                    # Determine the type based on whether it's a URL or a file
-                    type_ = "url" if is_url else "file"
-
-                    # Determine the source URL based on whether it's a URL or a file
-                    if is_url:
-                        source_url = doc.metadata["original_file_name"]
-                    else:
-                        source_url = generate_file_signed_url(
-                            f"{brain.brain_id}/{doc.metadata['file_name']}"
-                        ).get("signedURL", "")
-
-                    # Append a new Sources object to the list
-                    sources_list.append(
-                        Sources(
-                            name=name,
-                            type=type_,
-                            source_url=source_url,
-                            original_file_name=name,
-                        )
-                    )
-                # Create metadata if it doesn't exist
-                if not streamed_chat_history.metadata:
-                    streamed_chat_history.metadata = {}
-                # Serialize the sources list
-                serialized_sources_list = [source.dict() for source in sources_list]
-                streamed_chat_history.metadata["sources"] = serialized_sources_list
-                yield f"data: {json.dumps(streamed_chat_history.dict())}"
-            else:
-                logger.info(
-                    "No source documents found or source_documents is not a list."
-                )
 
         except Exception as e:
             logger.error("Error processing source documents: %s", e)

diff --git a/backend/llm/tests/test_knowledge_brain_qa.py b/backend/llm/tests/test_knowledge_brain_qa.py
new file mode 100644
index 000000000..5c5455441
--- /dev/null
+++ b/backend/llm/tests/test_knowledge_brain_qa.py
@@ -0,0 +1,31 @@
+from llm.knowledge_brain_qa import generate_source
+
+
+def test_generate_source_no_documents():
+    result = {"source_documents": []}
+    brain = {"brain_id": "123"}
+
+    sources = generate_source(result, brain)
+
+    assert sources == []
+
+
+def test_generate_source_with_url():
+    result = {
+        "source_documents": [
+            {
+                "metadata": {
+                    "original_file_name": "http://example.com",
+                }
+            }
+        ]
+    }
+    brain = {"brain_id": "123"}
+
+    sources = generate_source(result, brain)
+
+    assert len(sources) == 1
+    assert sources[0].name == "http://example.com"
+    assert sources[0].type == "url"
+    assert sources[0].source_url == "http://example.com"
+    assert sources[0].original_file_name == "http://example.com"
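
The new test module covers only the empty-result and URL branches of generate_source; a minimal sketch of a third test for the file branch follows, not part of the patch above. It assumes pytest's monkeypatch fixture is available, that generate_file_signed_url is resolvable as llm.knowledge_brain_qa.generate_file_signed_url so it can be stubbed out to avoid real storage calls, and it introduces a hypothetical FakeBrain helper because the file branch reads brain.brain_id as an attribute rather than a dict key.

# Sketch only (not part of this patch); FakeBrain and the stubbed URL scheme are illustrative.
from llm.knowledge_brain_qa import generate_source


class FakeBrain:
    # generate_source only reads brain.brain_id in the file branch
    brain_id = "123"


def test_generate_source_with_file(monkeypatch):
    # Stub the signed-URL helper so the test never touches storage.
    monkeypatch.setattr(
        "llm.knowledge_brain_qa.generate_file_signed_url",
        lambda path: {"signedURL": f"https://signed.example/{path}"},
    )

    result = {
        "source_documents": [
            {"metadata": {"original_file_name": None, "file_name": "doc.pdf"}}
        ]
    }

    sources = generate_source(result, FakeBrain())

    assert len(sources) == 1
    assert sources[0].type == "file"
    assert sources[0].name == "doc.pdf"
    assert sources[0].source_url == "https://signed.example/123/doc.pdf"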