quivr/backend/core/MegaParse/notebooks/test.ipynb
Jacopo Chevallard ef90e8e672
feat: introducing configurable retrieval workflows (#3227)
# Description

Major PR which, among other things, introduces the possibility of easily
customizing the retrieval workflows. Workflows are based on LangGraph,
and can be customized using a [yaml configuration
file](core/tests/test_llm_endpoint.py) and by adding the implementation of
the node logic to
[quivr_rag_langgraph.py](1a0c98437a/backend/core/quivr_core/quivr_rag_langgraph.py).

This is a first, simple implementation that will significantly evolve in
the coming weeks to enable more complex workflows (for instance, with
conditional nodes). We also plan to adopt a similar approach for the
ingestion part, i.e. to enable users to easily customize the ingestion
pipeline.

Closes CORE-195, CORE-203, CORE-204

## Checklist before requesting a review

Please delete options that are not relevant.

- [X] My code follows the style guidelines of this project
- [X] I have performed a self-review of my code
- [X] I have commented hard-to-understand areas
- [X] I have ideally added tests that prove my fix is effective or that
my feature works
- [X] New and existing unit tests pass locally with my changes
- [X] Any dependent changes have been merged

## Screenshots (if appropriate):
2024-09-23 09:11:06 -07:00

160 lines
4.0 KiB
Plaintext
Vendored

{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from megaparse.Converter import MegaParse\n",
"from IPython.display import display_markdown\n",
"import pdfminer\n",
"from pdfminer.image import ImageWriter\n",
"from pdfminer.high_level import extract_pages\n",
"\n",
"import fitz\n",
"import io\n",
"from PIL import Image"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"file_path = \"megaparse/tests/input_tests/MegaFake_report.pdf\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"megaparse = MegaParse(file_path=file_path)\n",
"content = megaparse.convert()\n",
"megaparse.save_md(md_content=content, file_path=\"./content.md\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display_markdown(content, raw=True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# def extract_images_from_pdf(pdf_file_path, output_dir):\n",
"# iw = ImageWriter(output_dir)\n",
"# image_count = 0\n",
"\n",
"# for page_num, page_layout in enumerate(extract_pages(pdf_file_path)):\n",
"# for image in get_images_from_page(page_layout):\n",
"# image_name = f\"image_{image_count}_page_{page_num}.png\"\n",
"# iw.export_image(image)\n",
"# image_count += 1\n",
"\n",
"\n",
"# def get_images_from_page(page_layout):\n",
"# if isinstance(page_layout, pdfminer.layout.LTImage):\n",
"# return [page_layout]\n",
"# if isinstance(page_layout, pdfminer.layout.LTContainer):\n",
"# img_list = []\n",
"# for child in page_layout:\n",
"# img_list += get_images_from_page(child)\n",
"# return img_list\n",
"# else:\n",
"# return []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"def extract_images_from_pdf(pdf_file_path: str, output_dir: str):\n",
"    \"\"\"Write every image embedded in a PDF to disk.\n",
"\n",
"    Each file is saved as <output_dir>image_<n>_page_<p>.<ext>, where n counts\n",
"    from 1 within a page and p is the 0-based page index. output_dir is used\n",
"    as a plain string prefix, so pass it with a trailing separator.\n",
"    \"\"\"\n",
"    import os\n",
"\n",
"    # Create the directory part of the prefix so saving cannot fail on a missing folder.\n",
"    target_dir = os.path.dirname(output_dir)\n",
"    if target_dir:\n",
"        os.makedirs(target_dir, exist_ok=True)\n",
"\n",
"    # The context manager closes the document even if an image fails to decode.\n",
"    with fitz.open(pdf_file_path) as pdf_file:\n",
"        # Page indices start at 0; starting at 1 would skip the first page.\n",
"        for page_number in range(len(pdf_file)):\n",
"            page = pdf_file[page_number]\n",
"            for image_index, img in enumerate(page.get_images(), start=1):\n",
"                xref = img[0]  # first tuple item is passed to extract_image as the xref\n",
"                base_image = pdf_file.extract_image(xref)\n",
"                image_bytes = base_image[\"image\"]\n",
"                image_ext = base_image[\"ext\"]\n",
"                pil_image = Image.open(io.BytesIO(image_bytes))\n",
"                image_path = (\n",
"                    f\"{output_dir}image_{image_index}_page_{page_number}.{image_ext}\"\n",
"                )\n",
"                pil_image.save(image_path)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"1\n",
"1\n",
"1\n",
"1\n"
]
}
],
"source": [
"extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ENV",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}