quivr/backend/modules/brain/integrations/Notion/Notion_connector.py


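"""Notion integration connector: syncs Notion pages into a brain's knowledge base."""
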
import os
import tempfile
import time
from io import BytesIO
from typing import Any, Iterator, List, Optional
import requests
from celery_config import celery
from fastapi import UploadFile
from logger import get_logger
from modules.brain.entity.integration_brain import IntegrationEntity
from modules.brain.repository.integration_brains import Integration, IntegrationBrain
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.repository.knowledge_interface import KnowledgeInterface
from modules.knowledge.service.knowledge_service import KnowledgeService
from pydantic import BaseModel
from repository.files.upload_file import upload_file_storage

logger = get_logger(__name__)


class NotionPage(BaseModel):
"""Represents a Notion Page object"""
id: str
created_time: str
last_edited_time: str
archived: bool
properties: dict[str, Any]
url: str


class NotionSearchResponse(BaseModel):
"""Represents the response from the Notion Search API"""
results: list[dict[str, Any]]
next_cursor: Optional[str] = None
has_more: bool = False


class NotionConnector(IntegrationBrain, Integration):
"""A class to interact with the Notion API"""
    credentials: Optional[dict[str, str]] = None
    integration_details: Optional[IntegrationEntity] = None
    brain_id: Optional[str] = None
    user_id: Optional[str] = None
knowledge_service: KnowledgeInterface
recursive_index_enabled: bool = False
max_pages: int = 100
def __init__(self, brain_id: str, user_id: str):
super().__init__()
self.brain_id = brain_id
self.user_id = user_id
self._load_credentials()
self.knowledge_service = KnowledgeService()
    def _load_credentials(self) -> None:
"""Load the Notion credentials"""
self.integration_details = self.get_integration_brain(
self.brain_id, self.user_id
)
if self.credentials is None:
logger.info("Loading Notion credentials")
self.integration_details.credentials = {
"notion_integration_token": self.integration_details.settings.get(
"notion_integration_token", ""
)
}
self.update_integration_brain(
self.brain_id, self.user_id, self.integration_details
)
self.credentials = self.integration_details.credentials
else: # pragma: no cover
self.credentials = self.integration_details.credentials
def _headers(self) -> dict[str, str]:
"""Get the headers for the Notion API"""
return {
"Authorization": f'Bearer {self.credentials["notion_integration_token"]}',
"Content-Type": "application/json",
"Notion-Version": "2022-06-28",
}
def _search_notion(self, query_dict: dict[str, Any]) -> NotionSearchResponse:
"""
        Search for pages using the Notion Search API.
"""
# Use self.credentials to authenticate the request
headers = self._headers()
res = requests.post(
"https://api.notion.com/v1/search",
headers=headers,
json=query_dict,
# Adjust the timeout as needed
timeout=10,
)
res.raise_for_status()
return NotionSearchResponse(**res.json())
def _fetch_blocks(self, page_id: str, cursor: str | None = None) -> dict[str, Any]:
"""
Fetch the blocks of a Notion page.
"""
logger.info(f"Fetching blocks for page: {page_id}")
headers = self._headers()
query_params = None if not cursor else {"start_cursor": cursor}
res = requests.get(
f"https://api.notion.com/v1/blocks/{page_id}/children",
params=query_params,
headers=headers,
timeout=10,
)
res.raise_for_status()
return res.json()
    def _fetch_page(self, page_id: str) -> Optional[NotionPage]:
"""
Fetch a Notion page.
"""
logger.info(f"Fetching page: {page_id}")
headers = self._headers()
block_url = f"https://api.notion.com/v1/pages/{page_id}"
res = requests.get(
block_url,
headers=headers,
timeout=10,
)
try:
res.raise_for_status()
except Exception:
logger.exception(f"Error fetching page - {res.json()}")
return None
return NotionPage(**res.json())
def _read_blocks(
self, page_block_id: str
) -> tuple[list[tuple[str, str]], list[str]]:
"""Reads blocks for a page"""
result_lines: list[tuple[str, str]] = []
child_pages: list[str] = []
cursor = None
while True:
data = self._fetch_blocks(page_block_id, cursor)
for result in data["results"]:
result_block_id = result["id"]
result_type = result["type"]
result_obj = result[result_type]
cur_result_text_arr = []
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:
# skip if doesn't have text object
if "text" in rich_text:
text = rich_text["text"]["content"]
cur_result_text_arr.append(text)
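                # Child pages are collected by id for the caller; other nested blocks
                # are read recursively and their text flattened into this page's content.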
if result["has_children"]:
if result_type == "child_page":
child_pages.append(result_block_id)
else:
logger.info(f"Entering sub-block: {result_block_id}")
subblock_result_lines, subblock_child_pages = self._read_blocks(
result_block_id
)
logger.info(f"Finished sub-block: {result_block_id}")
result_lines.extend(subblock_result_lines)
child_pages.extend(subblock_child_pages)
# if result_type == "child_database" and self.recursive_index_enabled:
# child_pages.extend(self._read_pages_from_database(result_block_id))
cur_result_text = "\n".join(cur_result_text_arr)
if cur_result_text:
result_lines.append((cur_result_text, result_block_id))
if data["next_cursor"] is None:
break
cursor = data["next_cursor"]
return result_lines, child_pages
def _read_page_title(self, page: NotionPage) -> str:
"""Extracts the title from a Notion page"""
page_title = None
for _, prop in page.properties.items():
if prop["type"] == "title" and len(prop["title"]) > 0:
page_title = " ".join([t["plain_text"] for t in prop["title"]]).strip()
break
if page_title is None:
page_title = f"Untitled Page [{page.id}]"
page_title = "".join(e for e in page_title if e.isalnum())
return page_title
def _read_page_url(self, page: NotionPage) -> str:
"""Extracts the URL from a Notion page"""
return page.url
def _read_pages_from_database(self, database_id: str) -> list[str]:
"""Reads pages from a Notion database"""
headers = self._headers()
res = requests.post(
f"https://api.notion.com/v1/databases/{database_id}/query",
headers=headers,
timeout=10,
)
res.raise_for_status()
return [page["id"] for page in res.json()["results"]]
    def _read_page(
        self, page_id: str
    ) -> tuple[Optional[str], Optional[list[tuple[str, str]]], Optional[list[str]], Optional[str]]:
"""Reads a Notion page"""
page = self._fetch_page(page_id)
if page is None:
return None, None, None, None
page_title = self._read_page_title(page)
page_content, child_pages = self._read_blocks(page_id)
page_url = self._read_page_url(page)
return page_title, page_content, child_pages, page_url
def _filter_pages_by_time(
self,
pages: list[dict[str, Any]],
start: str,
filter_field: str = "last_edited_time",
) -> list[NotionPage]:
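        """Keep only the pages whose `filter_field` timestamp is strictly later than `start`."""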
filtered_pages: list[NotionPage] = []
start_time = time.mktime(
time.strptime(start, "%Y-%m-%dT%H:%M:%S.%f%z")
) # Convert `start` to a float
for page in pages:
compare_time = time.mktime(
time.strptime(page[filter_field], "%Y-%m-%dT%H:%M:%S.%f%z")
)
if compare_time > start_time: # Compare `compare_time` with `start_time`
filtered_pages += [NotionPage(**page)]
return filtered_pages
    def get_all_pages(self) -> Iterator[NotionPage]:
"""
Get all the pages from Notion.
"""
query_dict = {
"filter": {"property": "object", "value": "page"},
"page_size": 100,
}
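        # The Search API caps page_size at 100; additional results are fetched via start_cursor.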
max_pages = self.max_pages
pages_count = 0
while True:
search_response = self._search_notion(query_dict)
for page in search_response.results:
pages_count += 1
                if pages_count > max_pages:
                    # Stop fetching further search results once the page limit is reached
                    return
yield NotionPage(**page)
if search_response.has_more:
query_dict["start_cursor"] = search_response.next_cursor
else:
break
def add_file_to_knowledge(
self, page_content: List[tuple[str, str]], page_name: str, page_url: str
):
"""
Add a file to the knowledge base
"""
logger.info(f"Adding file to knowledge: {page_name}")
filename_with_brain_id = (
str(self.brain_id) + "/" + str(page_name) + "_notion.txt"
)
try:
            concatenated_page_content = ""
            if page_content:
                for content in page_content:
                    concatenated_page_content += content[0] + "\n"
            # Create a BytesIO object from the content
            content_io = BytesIO(concatenated_page_content.encode("utf-8"))
# Create a file of type UploadFile
file = UploadFile(filename=filename_with_brain_id, file=content_io)
# Write the UploadFile content to a temporary file
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
temp_file.write(file.file.read())
temp_file_path = temp_file.name
# Upload the temporary file to the knowledge base
response = upload_file_storage(
temp_file_path, filename_with_brain_id, "true"
)
logger.info(f"File {response} uploaded successfully")
# Delete the temporary file
os.remove(temp_file_path)
knowledge_to_add = CreateKnowledgeProperties(
brain_id=self.brain_id,
file_name=page_name + "_notion.txt",
extension="txt",
integration="notion",
integration_link=page_url,
)
added_knowledge = self.knowledge_service.add_knowledge(knowledge_to_add)
logger.info(f"Knowledge {added_knowledge} added successfully")
celery.send_task(
"process_file_and_notify",
kwargs={
"file_name": filename_with_brain_id,
"file_original_name": page_name + "_notion.txt",
"brain_id": self.brain_id,
"delete_file": True,
},
)
        except Exception:
            logger.exception("Error adding knowledge")
def load(self):
"""
        Read every page (and its blocks) from Notion into a single document per page
"""
all_pages = list(self.get_all_pages()) # Convert generator to list
documents = []
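        # Each page becomes one text document that is uploaded to the brain's knowledge base.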
for page in all_pages:
logger.info(f"Reading page: {page.id}")
page_title, page_content, child_pages, page_url = self._read_page(page.id)
document = {
"page_title": page_title,
"page_content": page_content,
"child_pages": child_pages,
"page_url": page_url,
}
documents.append(document)
self.add_file_to_knowledge(page_content, page_title, page_url)
return documents
def poll(self):
"""
        Update the brain with the pages edited in Notion since the last sync
"""
integration = self.get_integration_brain(self.brain_id, self.user_id)
last_synced = integration.last_synced
query_dict = {
"page_size": self.max_pages,
"sort": {"timestamp": "last_edited_time", "direction": "descending"},
"filter": {"property": "object", "value": "page"},
}
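        # Pages come back newest-first; only those edited after last_synced are re-indexed.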
documents = []
while True:
db_res = self._search_notion(query_dict)
pages = self._filter_pages_by_time(
db_res.results, last_synced, filter_field="last_edited_time"
)
for page in pages:
logger.info(f"Reading page: {page.id}")
page_title, page_content, child_pages, page_url = self._read_page(
page.id
)
document = {
"page_title": page_title,
"page_content": page_content,
"child_pages": child_pages,
"page_url": page_url,
}
documents.append(document)
self.add_file_to_knowledge(page_content, page_title, page_url)
if not db_res.has_more:
break
query_dict["start_cursor"] = db_res.next_cursor
logger.info(
f"last Synced: {self.update_last_synced(self.brain_id, self.user_id)}"
)
return documents
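

# Manual run with hard-coded brain and user ids.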
if __name__ == "__main__":
notion = NotionConnector(
brain_id="73f7d092-d596-4fd0-b24f-24031e9b53cd",
user_id="39418e3b-0258-4452-af60-7acfcc1263ff",
)
print(notion.poll())