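"""Routes for crawling websites and adding the results to a brain's knowledge."""
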
from typing import Optional
from uuid import UUID

from celery_worker import process_crawl_and_notify
from fastapi import APIRouter, Depends, Query, Request
from logger import get_logger
from middlewares.auth import AuthBearer, get_current_user
from models import UserUsage
from modules.knowledge.dto.inputs import CreateKnowledgeProperties
from modules.knowledge.service.knowledge_service import KnowledgeService
from modules.notification.service.notification_service import NotificationService
from modules.user.entity.user_identity import UserIdentity
from packages.files.crawl.crawler import CrawlWebsite
from packages.files.file import convert_bytes

logger = get_logger(__name__)

crawl_router = APIRouter()

notification_service = NotificationService()
knowledge_service = KnowledgeService()


@crawl_router.get("/crawl/healthz", tags=["Health"])
async def healthz():
    return {"status": "ok"}


@crawl_router.post("/crawl", dependencies=[Depends(AuthBearer())], tags=["Crawl"])
async def crawl_endpoint(
    request: Request,
    crawl_website: CrawlWebsite,
    brain_id: UUID = Query(..., description="The ID of the brain"),
    chat_id: Optional[UUID] = Query(None, description="The ID of the chat"),
    current_user: UserIdentity = Depends(get_current_user),
):
"""
|
|
|
|
Crawl a website and process the crawled data.
|
|
|
|
"""
    # [TODO] check if the user is the owner/editor of the brain
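    # Fetch the user's usage record and plan settings so storage limits can be checked.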
    user_daily_usage = UserUsage(
        id=current_user.id,
        email=current_user.email,
    )
    user_settings = user_daily_usage.get_user_settings()
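    # file_size is a rough placeholder estimate; the crawl's real size is not
    # known until it has run.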
    file_size = 1000000
    remaining_free_space = user_settings.get("max_brain_size", 1000000000)
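    # Refuse the crawl if it would push the brain past its storage quota.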
    if remaining_free_space - file_size < 0:
        message = {
            "message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is: {convert_bytes(remaining_free_space)}",
            "type": "error",
        }
        return message
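    # Register the URL as a knowledge entry attached to the target brain.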
    knowledge_to_add = CreateKnowledgeProperties(
        brain_id=brain_id,
        url=crawl_website.url,
        extension="html",
    )

    added_knowledge = knowledge_service.add_knowledge(knowledge_to_add)
    logger.info(f"Knowledge {added_knowledge} added successfully")
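    # Hand the crawl itself off to Celery so the HTTP request returns immediately.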
    process_crawl_and_notify.delay(
        crawl_website_url=crawl_website.url,
        brain_id=brain_id,
        notification_id=None,
    )
    return {"message": "Crawl processing has started."}
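

# A minimal usage sketch. Host, port, and token are assumptions; CrawlWebsite is
# read from the JSON body, so at minimum a "url" field is expected (other fields
# of the model, if any, are not shown here):
#
#   curl -X POST "http://localhost:5050/crawl?brain_id=<brain-uuid>" \
#        -H "Authorization: Bearer <token>" \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com"}'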