quivr/streamlit-demo/loaders/html.py

from .common import process_file
from langchain.document_loaders import UnstructuredHTMLLoader
import requests
import re
import unicodedata
import tempfile
import os
import streamlit as st
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile

def process_html(vector_store, file, stats_db):
    """Hand an HTML file to ``process_file`` using the HTML loader.

    Forwards ``vector_store`` and ``file`` unchanged, selects
    ``UnstructuredHTMLLoader`` as the loader class and ``".html"`` as the
    file suffix, and passes ``stats_db`` by keyword.

    Returns:
        Whatever ``process_file`` returns.
    """
    html_suffix = ".html"
    return process_file(
        vector_store,
        file,
        UnstructuredHTMLLoader,
        html_suffix,
        stats_db=stats_db,
    )


def get_html(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests.get``.

    Returns:
        The decoded response text when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests``
            on network failures, including ``Timeout`` when the server
            does not answer within ``timeout`` seconds.
    """
    # requests waits forever by default; an explicit timeout keeps one
    # unresponsive host from hanging the whole app.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text

def create_html_file(url, content):
    """Write ``content`` to a temporary ``.html`` file and wrap it as an upload.

    The file name is ``slugify(url) + ".html"`` and the file is placed in
    the directory returned by ``tempfile.gettempdir()``.

    Args:
        url: Source URL; only used to derive the file name.
        content: HTML text to store.

    Returns:
        A ``(uploaded_file, temp_file_path)`` tuple: an ``UploadedFile``
        built from the stored bytes, and the path of the file on disk.
        This function does not delete the file.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent arbitrary web-page text and would raise UnicodeEncodeError.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(content)

    # Read back through a context manager so the handle is closed promptly
    # instead of being left to the garbage collector.
    with open(temp_file_path, 'rb') as saved_file:
        data = saved_file.read()

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)

    return uploaded_file, temp_file_path

def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary file and, when ``ret`` is truthy, report the result.

    Args:
        temp_file_path: Path of the file to delete.
        url: URL shown in the status message.
        ret: When truthy, a status line is written with ``st.write``;
            when falsy, nothing is written to the Streamlit page.

    An ``OSError`` raised inside the ``try`` body is caught and printed
    rather than propagated.
    """
    try:
        os.remove(temp_file_path)
        if not ret:
            return
        st.write(f"✅ Content saved... {url}  ")
    except OSError as e:
        print(f"Error while deleting the temporary file: {str(e)}")
        if not ret:
            return
        st.write(f"❌ Error while saving content... {url}  ")

def slugify(text):
    """Turn ``text`` into a lowercase, hyphen-separated ASCII slug.

    Steps: NFKD-normalise and drop every non-ASCII character, remove
    anything that is not a word character, whitespace or hyphen, trim and
    lowercase, then collapse each run of hyphens/whitespace into a single
    hyphen.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only)
    lowered = cleaned.strip().lower()
    return re.sub(r'[-\s]+', '-', lowered)
feat(files.py): curlwebpage and add it to db 2023-05-16 02:02:52 +03:00			`from .common import process_file`
feat(url): added html loader instead of text 2023-05-16 10:56:12 +03:00			`from langchain.document_loaders import UnstructuredHTMLLoader`
feat(url): moved to own file 2023-05-16 11:17:57 +03:00			`import requests`
			`import re`
			`import unicodedata`
			`import tempfile`
			`import os`
			`import streamlit as st`
			`from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile`
feat(files.py): curlwebpage and add it to db 2023-05-16 02:02:52 +03:00
feat(demo): app can now have a demo 2023-05-17 13:12:52 +03:00			`def process_html(vector_store, file, stats_db):`
			`return process_file(vector_store, file, UnstructuredHTMLLoader, ".html", stats_db=stats_db)`
feat(url): moved to own file 2023-05-16 11:17:57 +03:00

			`def get_html(url):`
			`response = requests.get(url)`
			`if response.status_code == 200:`
			`return response.text`
			`else:`
			`return None`

			`def create_html_file(url, content):`
			`file_name = slugify(url) + ".html"`
			`temp_file_path = os.path.join(tempfile.gettempdir(), file_name)`
			`with open(temp_file_path, 'w') as temp_file:`
			`temp_file.write(content)`

			`record = UploadedFileRec(id=None, name=file_name, type='text/html', data=open(temp_file_path, 'rb').read())`
			`uploaded_file = UploadedFile(record)`

			`return uploaded_file, temp_file_path`

			`def delete_tempfile(temp_file_path, url, ret):`
			`try:`
			`os.remove(temp_file_path)`
			`if ret:`
			`st.write(f"✅ Content saved... {url} ")`
			`except OSError as e:`
			`print(f"Error while deleting the temporary file: {str(e)}")`
			`if ret:`
			`st.write(f"❌ Error while saving content... {url} ")`

			`def slugify(text):`
			`text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')`
			`text = re.sub(r'[^\w\s-]', '', text).strip().lower()`
			`text = re.sub(r'[-\s]+', '-', text)`
			`return text`