{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import io\n",
    "from pathlib import Path\n",
    "\n",
    "import fitz\n",
    "import pdfminer\n",
    "from IPython.display import display_markdown\n",
    "from pdfminer.high_level import extract_pages\n",
    "from pdfminer.image import ImageWriter\n",
    "from PIL import Image\n",
    "\n",
    "from megaparse.Converter import MegaParse"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input PDF for the conversion and image-extraction cells below. The path is\n",
    "# relative, so it is resolved against the kernel's working directory.\n",
    "file_path = \"megaparse/tests/input_tests/MegaFake_report.pdf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the PDF with MegaParse and write the result to ./content.md.\n",
    "megaparse = MegaParse(file_path=file_path)\n",
    "content = megaparse.convert()\n",
    "megaparse.save_md(md_content=content, file_path=\"./content.md\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render the converted content as Markdown in the cell output.\n",
    "display_markdown(content, raw=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled pdfminer-based variant of extract_images_from_pdf; the active\n",
    "# definition further down uses fitz instead.\n",
    "\n",
    "# def extract_images_from_pdf(pdf_file_path, output_dir):\n",
    "#     iw = ImageWriter(output_dir)\n",
    "#     image_count = 0\n",
    "\n",
    "#     for page_num, page_layout in enumerate(extract_pages(pdf_file_path)):\n",
    "#         for image in get_images_from_page(page_layout):\n",
    "#             image_name = f\"image_{image_count}_page_{page_num}.png\"\n",
    "#             iw.export_image(image)\n",
    "#             image_count += 1\n",
    "\n",
    "\n",
    "# def get_images_from_page(page_layout):\n",
    "#     if isinstance(page_layout, pdfminer.layout.LTImage):\n",
    "#         return [page_layout]\n",
    "#     if isinstance(page_layout, pdfminer.layout.LTContainer):\n",
    "#         img_list = []\n",
    "#         for child in page_layout:\n",
    "#             img_list += get_images_from_page(child)\n",
    "#         return img_list\n",
    "#     else:\n",
    "#         return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_images_from_pdf(pdf_file_path: str, output_dir: str) -> None:\n",
    "    \"\"\"Save every image embedded in a PDF as a file inside ``output_dir``.\n",
    "\n",
    "    Files are named ``image_<n>_page_<p>.<ext>``: ``<p>`` is the 0-based page\n",
    "    index, ``<n>`` the 1-based position of the image on that page and\n",
    "    ``<ext>`` the extension reported by ``extract_image``.\n",
    "\n",
    "    Args:\n",
    "        pdf_file_path: Path of the PDF to read.\n",
    "        output_dir: Directory that receives the images; created if missing.\n",
    "    \"\"\"\n",
    "    target_dir = Path(output_dir)\n",
    "    target_dir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "    # The context manager closes the document even if saving an image fails.\n",
    "    with fitz.open(pdf_file_path) as pdf_file:\n",
    "        # Page indices are 0-based: start at 0 so the first page is not skipped.\n",
    "        for page_number in range(len(pdf_file)):\n",
    "            page = pdf_file[page_number]\n",
    "            for image_index, img in enumerate(page.get_images(), start=1):\n",
    "                xref = img[0]\n",
    "                base_image = pdf_file.extract_image(xref)\n",
    "                image_bytes = base_image[\"image\"]\n",
    "                image_ext = base_image[\"ext\"]\n",
    "                image_name = f\"image_{image_index}_page_{page_number}.{image_ext}\"\n",
    "                with Image.open(io.BytesIO(image_bytes)) as pil_image:\n",
    "                    pil_image.save(target_dir / image_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ENV",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}