feat(notion): added custom integration (#2268)

This pull request adds a custom integration feature and sync
functionality to the application. It includes the following changes:

- Added a new integration entity for custom integrations.

- Implemented the ability to load and poll the custom integration.

- Added a task to sync the custom integration with the user's brain.

- Updated the celery beat schedule to include the new task.

Please review and merge this pull request.
This commit is contained in:
Stan Girard 2024-02-27 21:30:25 -08:00 committed by GitHub
parent 24098e7b56
commit aeaa16dc5f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 288 additions and 31 deletions

View File

@ -0,0 +1,83 @@
{
"taskDefinitionArn": "arn:aws:ecs:eu-west-3:253053805092:task-definition/quivr-preview-beat:1",
"containerDefinitions": [
{
"name": "quivr-beat",
"image": "253053805092.dkr.ecr.eu-west-3.amazonaws.com/quivr:600ff1ede02741c66853cc3e4e7f5001aaba3bc2",
"cpu": "2048",
"memory": "4096",
"essential": true,
"command": ["celery", "-A", "celery_worker", "beat", "-l", "info"],
"environment": [],
"environmentFiles": [
{
"value": "arn:aws:s3:::quivr-env-variables/preview.env",
"type": "s3"
}
],
"mountPoints": [],
"volumesFrom": [],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-create-group": "true",
"awslogs-group": "/ecs/quivr-preview-beat",
"awslogs-region": "eu-west-3",
"awslogs-stream-prefix": "ecs"
}
}
}
],
"family": "quivr-preview-beat",
"taskRoleArn": "arn:aws:iam::253053805092:role/ecsTaskExecutionRole",
"executionRoleArn": "arn:aws:iam::253053805092:role/ecsTaskExecutionRole",
"networkMode": "awsvpc",
"revision": 1,
"volumes": [],
"status": "ACTIVE",
"requiresAttributes": [
{
"name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
},
{
"name": "ecs.capability.execution-role-awslogs"
},
{
"name": "com.amazonaws.ecs.capability.ecr-auth"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
},
{
"name": "ecs.capability.env-files.s3"
},
{
"name": "com.amazonaws.ecs.capability.task-iam-role"
},
{
"name": "ecs.capability.execution-role-ecr-pull"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
},
{
"name": "ecs.capability.task-eni"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.29"
}
],
"placementConstraints": [],
"compatibilities": ["EC2", "FARGATE"],
"requiresCompatibilities": ["FARGATE"],
"cpu": "2048",
"memory": "4096",
"runtimePlatform": {
"cpuArchitecture": "X86_64",
"operatingSystemFamily": "LINUX"
},
"registeredAt": "2023-08-18T09:01:56.187Z",
"registeredBy": "arn:aws:iam::253053805092:root",
"tags": []
}

View File

@ -85,6 +85,10 @@ jobs:
service: "preview-worker"
task_definition: ".aws/task_definition_preview_worker.json"
container: "quivr-chat"
- name: "quivr-beat"
service: "preview-beat"
task_definition: ".aws/task_definition_preview_beat.json"
container: "quivr-beat"
steps:
- name: Checkout

View File

@ -8,3 +8,4 @@
**/build/
**/.docusaurus/
**/node_modules/
**/.venv/

View File

@ -1,6 +1,7 @@
import asyncio
import io
import os
from datetime import datetime, timedelta, timezone
from celery.schedules import crontab
from celery_config import celery
@ -9,7 +10,9 @@ from logger import get_logger
from models.files import File
from models.settings import get_supabase_client
from modules.brain.integrations.Notion.Notion_connector import NotionConnector
from modules.brain.repository.integration_brains import IntegrationBrain
from modules.brain.service.brain_service import BrainService
from modules.brain.service.brain_vector_service import BrainVectorService
from modules.notification.dto.inputs import NotificationUpdatableProperties
from modules.notification.entity.notification import NotificationsStatusEnum
from modules.notification.service.notification_service import NotificationService
@ -32,6 +35,7 @@ def process_file_and_notify(
brain_id,
notification_id=None,
integration=None,
delete_file=False,
):
try:
supabase_client = get_supabase_client()
@ -50,6 +54,11 @@ def process_file_and_notify(
file_instance = File(file=upload_file)
loop = asyncio.get_event_loop()
brain_vector_service = BrainVectorService(brain_id)
if delete_file: # TODO fix bug
brain_vector_service.delete_file_from_brain(
file_original_name, only_vectors=True
)
message = loop.run_until_complete(
filter_file(
file=file_instance,
@ -156,18 +165,48 @@ def remove_onboarding_more_than_x_days_task():
onboardingService.remove_onboarding_more_than_x_days(7)
@celery.task(name="NotionConnectorLoad")
def process_integration_brain_created_initial_load(brain_id, user_id):
notion_connector = NotionConnector(brain_id=brain_id, user_id=user_id)
pages = notion_connector.load()
print("pages: ", len(pages))
@celery.task
def process_integration_brain_sync_user_brain(brain_id, user_id):
notion_connector = NotionConnector(brain_id=brain_id, user_id=user_id)
notion_connector.poll()
@celery.task
def process_integration_brain_sync():
integration = IntegrationBrain()
integrations = integration.get_integration_brain_by_type_integration("notion")
time = datetime.now(timezone.utc) # Make `time` timezone-aware
# last_synced is a string that represents a timestampz in the database
# only call process_integration_brain_sync_user_brain if more than 1 day has passed since the last sync
for integration in integrations:
print(f"last_synced: {integration.last_synced}") # Add this line
last_synced = datetime.strptime(
integration.last_synced, "%Y-%m-%dT%H:%M:%S.%f%z"
)
if last_synced < time - timedelta(hours=12):
process_integration_brain_sync_user_brain.delay(
brain_id=integration.brain_id, user_id=integration.user_id
)
celery.conf.beat_schedule = {
"remove_onboarding_more_than_x_days_task": {
"task": f"{__name__}.remove_onboarding_more_than_x_days_task",
"schedule": crontab(minute="0", hour="0"),
},
"process_integration_brain_sync": {
"task": f"{__name__}.process_integration_brain_sync",
"schedule": crontab(minute="*/5", hour="*"),
},
}
@celery.task(name="NotionConnectorLoad")
def process_integration_brain_created_initial_load(brain_id, user_id):
notion_connector = NotionConnector(brain_id=brain_id, user_id=user_id)
pages = notion_connector.compile_all_pages()
print("pages: ", len(pages))

View File

@ -27,3 +27,4 @@ class IntegrationEntity(BaseModel):
integration_id: str
settings: Optional[dict] = None
credentials: Optional[dict] = None
last_synced: str

View File

@ -1,5 +1,6 @@
import os
import tempfile
import time
from io import BytesIO
from typing import Any, List, Optional
@ -8,7 +9,7 @@ from celery_config import celery
from fastapi import UploadFile
from logger import get_logger
from modules.brain.entity.integration_brain import IntegrationEntity
from modules.brain.repository.integration_brains import IntegrationBrain
from modules.brain.repository.integration_brains import Integration, IntegrationBrain
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.repository.knowledge_interface import KnowledgeInterface
from modules.knowledge.service.knowledge_service import KnowledgeService
@ -37,7 +38,7 @@ class NotionSearchResponse(BaseModel):
has_more: bool = False
class NotionConnector(IntegrationBrain):
class NotionConnector(IntegrationBrain, Integration):
"""A class to interact with the Notion API"""
credentials: dict[str, str] = None
@ -219,6 +220,24 @@ class NotionConnector(IntegrationBrain):
page_url = self._read_page_url(page)
return page_title, page_content, child_pages, page_url
def _filter_pages_by_time(
self,
pages: list[dict[str, Any]],
start: str,
filter_field: str = "last_edited_time",
) -> list[NotionPage]:
filtered_pages: list[NotionPage] = []
start_time = time.mktime(
time.strptime(start, "%Y-%m-%dT%H:%M:%S.%f%z")
) # Convert `start` to a float
for page in pages:
compare_time = time.mktime(
time.strptime(page[filter_field], "%Y-%m-%dT%H:%M:%S.%f%z")
)
if compare_time > start_time: # Compare `compare_time` with `start_time`
filtered_pages += [NotionPage(**page)]
return filtered_pages
def get_all_pages(self) -> list[NotionPage]:
"""
Get all the pages from Notion.
@ -248,6 +267,7 @@ class NotionConnector(IntegrationBrain):
"""
Add a file to the knowledge base
"""
logger.info(f"Adding file to knowledge: {page_name}")
filename_with_brain_id = (
str(self.brain_id) + "/" + str(page_name) + "_notion.txt"
)
@ -269,7 +289,9 @@ class NotionConnector(IntegrationBrain):
temp_file_path = temp_file.name
# Upload the temporary file to the knowledge base
response = upload_file_storage(temp_file_path, filename_with_brain_id)
response = upload_file_storage(
temp_file_path, filename_with_brain_id, "true"
)
logger.info(f"File {response} uploaded successfully")
# Delete the temporary file
@ -292,12 +314,13 @@ class NotionConnector(IntegrationBrain):
"file_name": filename_with_brain_id,
"file_original_name": page_name + "_notion.txt",
"brain_id": self.brain_id,
"delete_file": True,
},
)
except Exception:
logger.error("Error adding knowledge")
def compile_all_pages(self):
def load(self):
"""
Get all the pages, blocks, databases from Notion into a single document per page
"""
@ -316,18 +339,52 @@ class NotionConnector(IntegrationBrain):
self.add_file_to_knowledge(page_content, page_title, page_url)
return documents
def poll(self):
"""
Update all the brains with the latest data from Notion
"""
integration = self.get_integration_brain(self.brain_id, self.user_id)
last_synced = integration.last_synced
query_dict = {
"page_size": self.max_pages,
"sort": {"timestamp": "last_edited_time", "direction": "descending"},
"filter": {"property": "object", "value": "page"},
}
documents = []
while True:
db_res = self._search_notion(query_dict)
pages = self._filter_pages_by_time(
db_res.results, last_synced, filter_field="last_edited_time"
)
for page in pages:
logger.info(f"Reading page: {page.id}")
page_title, page_content, child_pages, page_url = self._read_page(
page.id
)
document = {
"page_title": page_title,
"page_content": page_content,
"child_pages": child_pages,
"page_url": page_url,
}
documents.append(document)
self.add_file_to_knowledge(page_content, page_title, page_url)
if not db_res.has_more:
break
query_dict["start_cursor"] = db_res.next_cursor
logger.info(
f"last Synced: {self.update_last_synced(self.brain_id, self.user_id)}"
)
return documents
if __name__ == "__main__":
notion = NotionConnector(
brain_id="b3ab23c5-9e13-4dd8-8883-106d613e3de8",
brain_id="73f7d092-d596-4fd0-b24f-24031e9b53cd",
user_id="39418e3b-0258-4452-af60-7acfcc1263ff",
)
celery.send_task(
"NotionConnectorLoad",
kwargs={
"brain_id": "b3ab23c5-9e13-4dd8-8883-106d613e3de8",
"user_id": "39418e3b-0258-4452-af60-7acfcc1263ff",
},
)
print(notion.poll())

View File

@ -86,9 +86,11 @@ def generate_source(source_documents, brain_id):
if file_path in generated_urls:
source_url = generated_urls[file_path]
else:
source_url = generate_file_signed_url(file_path).get(
"signedURL", ""
)
generated_url = generate_file_signed_url(file_path)
if generated_url is not None:
source_url = generated_url.get("signedURL", "")
else:
source_url = ""
# Store the generated URL
generated_urls[file_path] = source_url

View File

@ -57,6 +57,7 @@ class BrainsVectors(BrainsVectorsInterface):
def delete_file_from_brain(self, brain_id, file_name: str):
# First, get the vector_ids associated with the file_name
# TODO: filter by brain_id
file_vectors = (
self.db.table("vectors")
.select("id")

View File

@ -1,3 +1,6 @@
from abc import ABC, abstractmethod
from typing import List
from models.settings import get_supabase_client
from modules.brain.entity.integration_brain import (
IntegrationDescriptionEntity,
@ -9,6 +12,17 @@ from modules.brain.repository.interfaces.integration_brains_interface import (
)
class Integration(ABC):
@abstractmethod
def load(self):
pass
@abstractmethod
def poll(self):
pass
class IntegrationBrain(IntegrationBrainInterface):
"""This is all the methods to interact with the integration brain.
@ -32,6 +46,18 @@ class IntegrationBrain(IntegrationBrainInterface):
return IntegrationEntity(**response.data[0])
def update_last_synced(self, brain_id, user_id):
response = (
self.db.table("integrations_user")
.update({"last_synced": "now()"})
.filter("brain_id", "eq", str(brain_id))
.filter("user_id", "eq", str(user_id))
.execute()
)
if len(response.data) == 0:
return None
return IntegrationEntity(**response.data[0])
def add_integration_brain(self, brain_id, user_id, integration_id, settings):
response = (
@ -70,6 +96,20 @@ class IntegrationBrain(IntegrationBrainInterface):
).filter("user_id", "eq", str(user_id)).execute()
return None
def get_integration_brain_by_type_integration(
self, integration_name
) -> List[IntegrationEntity]:
response = (
self.db.table("integrations_user")
.select("*, integrations ()")
.filter("integrations.integration_name", "eq", integration_name)
.execute()
)
if len(response.data) == 0:
return None
return [IntegrationEntity(**data) for data in response.data]
class IntegrationDescription(IntegrationDescriptionInterface):

View File

@ -44,9 +44,10 @@ class BrainVectorService:
return self.files
def delete_file_from_brain(self, file_name: str):
def delete_file_from_brain(self, file_name: str, only_vectors: bool = False):
file_name_with_brain_id = f"{self.id}/{file_name}"
storage = Storage()
if not only_vectors:
storage.remove_file(file_name_with_brain_id)
return self.repository.delete_file_from_brain(self.id, file_name) # type: ignore

View File

@ -14,6 +14,18 @@ class Knowledges(KnowledgeInterface):
"""
Add a knowledge
"""
# Check if the knowledge already exists
knowledge_exists = (
self.db.from_("knowledge")
.select("*")
.filter("brain_id", "eq", knowledge.brain_id)
.filter("file_name", "eq", knowledge.file_name)
.execute()
).data
if knowledge_exists:
return Knowledge(**knowledge_exists[0]) # TODO fix this
response = (self.db.from_("knowledge").insert(knowledge.dict()).execute()).data
return Knowledge(**response[0])

View File

@ -36,7 +36,7 @@ mime_types = {
}
def upload_file_storage(file, file_identifier: str):
def upload_file_storage(file, file_identifier: str, upsert: str = "false"):
supabase_client: Client = get_supabase_client()
response = None
@ -48,12 +48,28 @@ def upload_file_storage(file, file_identifier: str):
mime_type = mime_types.get(file_extension, "text/html")
response = supabase_client.storage.from_("quivr").upload(
file_identifier, file, file_options={"content-type": mime_type}
file_identifier,
file,
file_options={
"content-type": mime_type,
"upsert": upsert,
"cache-control": "3600",
},
)
return response
except Exception as e:
logger.error(e)
if "The resource already exists" in str(e) and upsert == "true":
response = supabase_client.storage.from_("quivr").update(
file_identifier,
file,
file_options={
"content-type": mime_type,
"upsert": upsert,
"cache-control": "3600",
},
)
else:
raise e