quivr/backend/parsers/html.py
Zineb El Bachiri e53bc6807d
Feat/multiple brains backend (#340)
* 🗃️ add new tables for multiple brains

* 🗑️ remove date input from fetch_user_id_from_credentials

*  new /brain endpoints

* ♻️ refactor backend utils by splitting it into files

* 💡 comments for next actions to update /upload
2023-06-16 23:36:53 +02:00

32 lines
845 B
Python

import os
import re
import tempfile
import unicodedata
import requests
from fastapi import UploadFile
from langchain.document_loaders import UnstructuredHTMLLoader
from utils.common import CommonsDep
from .common import process_file
def process_html(commons: CommonsDep, file: UploadFile, enable_summarization, user, user_openai_api_key):
    """Process an uploaded HTML file by delegating to ``process_file``.

    The call forwards every argument unchanged and supplies the two
    HTML-specific values: ``UnstructuredHTMLLoader`` as the loader class and
    ``".html"`` as the file suffix.

    Returns:
        Whatever ``process_file`` returns for this upload.
    """
    loader_class = UnstructuredHTMLLoader
    file_suffix = ".html"
    return process_file(
        commons,
        file,
        loader_class,
        file_suffix,
        enable_summarization,
        user,
        user_openai_api_key,
    )
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``
            argument (seconds). Defaults to 30 so that a server which never
            answers cannot block the caller forever.

    Returns:
        The response text when the status code is exactly 200, otherwise
        ``None``.

    Raises:
        Exceptions raised by ``requests.get`` (for example on timeout or
        connection failure) are not caught here and propagate to the caller.
    """
    # requests applies no timeout unless one is given explicitly, so always
    # pass one to avoid hanging on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. NFKD-normalize and drop every character that cannot be encoded
         as ASCII (accents are removed, non-Latin characters vanish).
      2. Remove anything that is not a word character, whitespace or hyphen.
      3. Strip surrounding whitespace and lowercase the result.
      4. Collapse each run of hyphens and/or whitespace into a single hyphen.

    Leading or trailing hyphens present after step 3 are kept.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')

    cleaned = re.sub(r'[^\w\s-]', '', ascii_only)
    cleaned = cleaned.strip().lower()

    return re.sub(r'[-\s]+', '-', cleaned)