mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
- Updated unit tests - Added documentation (Example notebook) Note: MyPy fails. Seems to be a known issue with BeautifulSoup: https://github.com/python/typeshed/pull/13604 Signed-off-by: Alexander Vaagan <alexander.vaagan@gmail.com> Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
314 lines
354 KiB
Plaintext
314 lines
354 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "77f5bf7983ccfc35",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Conversion of HTML files\n",
|
|
"\n",
|
|
"This example shows how to convert HTML files to a structured Docling Document.\n",
|
|
"\n",
|
|
"Three examples are given:\n",
|
|
"- HTML ignoring images\n",
|
|
"- HTML with images parsed as references\n",
|
|
"- HTML with images parsed inline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5729820db99cacbb",
|
|
"metadata": {},
|
|
"source": "## HTML ignoring images"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "ba735966c052d9ab",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.420830Z",
|
|
"start_time": "2025-04-17T11:35:06.445943Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from docling_core.types.doc import ImageRefMode\n",
|
|
"\n",
|
|
"from docling.backend.html_backend import (\n",
|
|
" HTMLDocumentBackend,\n",
|
|
" HTMLDocumentBackendImagesInline,\n",
|
|
" HTMLDocumentBackendImagesReferenced,\n",
|
|
")\n",
|
|
"from docling.datamodel.base_models import InputFormat\n",
|
|
"from docling.datamodel.pipeline_options import PdfPipelineOptions\n",
|
|
"from docling.document_converter import (\n",
|
|
" DocumentConverter,\n",
|
|
" HTMLFormatOption,\n",
|
|
" PdfFormatOption,\n",
|
|
")\n",
|
|
"from docling.pipeline.simple_pipeline import SimplePipeline"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "86f2468b5e03bd2e",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.426930Z",
|
|
"start_time": "2025-04-17T11:35:09.424303Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def pdf_pipeline():\n",
|
|
" pipeline_options = PdfPipelineOptions()\n",
|
|
" pipeline_options.do_ocr = True\n",
|
|
" pipeline_options.do_table_structure = True\n",
|
|
" pipeline_options.do_picture_description = True\n",
|
|
" pipeline_options.table_structure_options.do_cell_matching = True\n",
|
|
" pipeline_options.generate_page_images = True\n",
|
|
" return pipeline_options"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "a428f20a1724beb3",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.500904Z",
|
|
"start_time": "2025-04-17T11:35:09.498507Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using the HTMLDocumentBackend to convert HTML files. Images will be ignored\n",
|
|
"doc_converter = (\n",
|
|
" DocumentConverter( # all of the below is optional, has internal defaults.\n",
|
|
" allowed_formats=[\n",
|
|
" InputFormat.HTML,\n",
|
|
" ], # whitelist formats, non-matching files are ignored.\n",
|
|
" format_options={\n",
|
|
" InputFormat.HTML: HTMLFormatOption(\n",
|
|
" pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend\n",
|
|
" ),\n",
|
|
" InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline()),\n",
|
|
" },\n",
|
|
" )\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "8f11e488b6e81339",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.577033Z",
|
|
"start_time": "2025-04-17T11:35:09.547937Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/markdown": "# Introduction to parsing HTML files withDocling\n\nDocling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.\n\n### Supported file formats\n\nDocling supports multiple file formats..\n\n- Advanced PDF understanding\n- Microsoft Office DOCX\n- HTML files (with optional support for images)\n\n#### Three backends for handling HTML files\n\nDocling has three backends for parsing HTML files:\n\n1. HTMLDocumentBackend Ignores images\n2. HTMLDocumentBackendImagesInline Extracts images inline\n3. HTMLDocumentBackendImagesReferenced Extracts images as references",
|
|
"text/plain": [
|
|
"<IPython.core.display.Markdown object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from IPython.display import Latex, Markdown, display\n",
|
|
"\n",
|
|
"html_file = \"../../tests/data/html/example_08.html\"\n",
|
|
"result = doc_converter.convert(html_file)\n",
|
|
"document = result.document\n",
|
|
"markdown = document.export_to_markdown()\n",
|
|
"display(Markdown(markdown))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9aa94a8da08cbbf1",
|
|
"metadata": {},
|
|
"source": "## HTML with images as references"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "e6c4ea8cf2cdf8c1",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.595771Z",
|
|
"start_time": "2025-04-17T11:35:09.592457Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using the HTMLDocumentBackend to convert HTML files. Images will be extracted as references\n",
|
|
"doc_converter = (\n",
|
|
" DocumentConverter( # all of the below is optional, has internal defaults.\n",
|
|
" allowed_formats=[\n",
|
|
" InputFormat.HTML,\n",
|
|
" ], # whitelist formats, non-matching files are ignored.\n",
|
|
" format_options={\n",
|
|
" InputFormat.HTML: HTMLFormatOption(\n",
|
|
" pipeline_cls=SimplePipeline, backend=HTMLDocumentBackendImagesReferenced\n",
|
|
" ),\n",
|
|
" InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline()),\n",
|
|
" },\n",
|
|
" )\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "4f088a6b02ea54bd",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.650443Z",
|
|
"start_time": "2025-04-17T11:35:09.639985Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/markdown": "# Introduction to parsing HTML files withDocling\n\nDocling\n\n\n\nDocling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.\n\n### Supported file formats\n\nDocling supports multiple file formats..\n\n- Advanced PDF understanding\n- PDF\n\n\n- Microsoft Office DOCX\n- DOCX\n\n\n- HTML files (with optional support for images)\n- HTML\n\n\n\n#### Three backends for handling HTML files\n\nDocling has three backends for parsing HTML files:\n\n1. HTMLDocumentBackend Ignores images\n2. HTMLDocumentBackendImagesInline Extracts images inline\n3. HTMLDocumentBackendImagesReferenced Extracts images as references",
|
|
"text/plain": [
|
|
"<IPython.core.display.Markdown object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from IPython.display import Latex, Markdown, display\n",
|
|
"\n",
|
|
"html_file = \"../../tests/data/html/example_08.html\"\n",
|
|
"result = doc_converter.convert(html_file)\n",
|
|
"document = result.document\n",
|
|
"markdown = document.export_to_markdown(image_mode=ImageRefMode.REFERENCED)\n",
|
|
"# Does not show otherwise. Not sure why\n",
|
|
"markdown = markdown.replace(\"file://\", \"\")\n",
|
|
"display(Markdown(markdown))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a6017dc49bf33601",
|
|
"metadata": {},
|
|
"source": "## HTML with images inline"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "b1112e9c386805b2",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:09.691056Z",
|
|
"start_time": "2025-04-17T11:35:09.687780Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Using the HTMLDocumentBackend to convert HTML files. Images will be extracted as references\n",
|
|
"doc_converter = (\n",
|
|
" DocumentConverter( # all of the below is optional, has internal defaults.\n",
|
|
" allowed_formats=[\n",
|
|
" InputFormat.HTML,\n",
|
|
" ], # whitelist formats, non-matching files are ignored.\n",
|
|
" format_options={\n",
|
|
" InputFormat.HTML: HTMLFormatOption(\n",
|
|
" pipeline_cls=SimplePipeline, backend=HTMLDocumentBackendImagesInline\n",
|
|
" ),\n",
|
|
" InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline()),\n",
|
|
" },\n",
|
|
" )\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "c37aa8c8afcacd16",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:11.139730Z",
|
|
"start_time": "2025-04-17T11:35:09.734469Z"
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Could not load image (src=https://github.com/docling-project/docling/tree/main/docs/assets/pdf.png): cannot identify image file <_io.BytesIO object at 0x73d26d7658a0>\n",
|
|
"Could not load image (src=https://github.com/docling-project/docling/tree/main/docs/assets/docx.png): cannot identify image file <_io.BytesIO object at 0x73d2675e5170>\n",
|
|
"Could not load image (src=https://github.com/docling-project/docling/tree/main/docs/assets/html.png): cannot identify image file <_io.BytesIO object at 0x73d2675e4c20>\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/markdown": "# Introduction to parsing HTML files withDocling\n\nDocling\n\n\n\nDocling simplifies document processing, parsing diverse formats — including HTML — and providing seamless integrations with the gen AI ecosystem.\n\n### Supported file formats\n\nDocling supports multiple file formats..\n\n- Advanced PDF understanding\n- PDF\n- Microsoft Office DOCX\n- DOCX\n- HTML files (with optional support for images)\n- HTML\n\n#### Three backends for handling HTML files\n\nDocling has three backends for parsing HTML files:\n\n1. HTMLDocumentBackend Ignores images\n2. HTMLDocumentBackendImagesInline Extracts images inline\n3. HTMLDocumentBackendImagesReferenced Extracts images as references",
|
|
"text/plain": [
|
|
"<IPython.core.display.Markdown object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"from IPython.display import Latex, Markdown, display\n",
|
|
"\n",
|
|
"html_file = \"../../tests/data/html/example_09.html\"\n",
|
|
"result = doc_converter.convert(html_file)\n",
|
|
"document = result.document\n",
|
|
"markdown = document.export_to_markdown(image_mode=ImageRefMode.EMBEDDED)\n",
|
|
"display(Markdown(markdown))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "94f7cc6d7288c909",
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-04-17T11:35:11.185013Z",
|
|
"start_time": "2025-04-17T11:35:11.182838Z"
|
|
}
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 2
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython2",
|
|
"version": "2.7.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|