Merge branch 'main' into html_backend

Signed-off-by: vaaale <2428222+vaaale@users.noreply.github.com>
This commit is contained in:
vaaale 2025-05-31 08:57:01 +02:00 committed by GitHub
commit 064a236ebf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 4472 additions and 307 deletions

3
.gitattributes vendored Normal file
View File

@ -0,0 +1,3 @@
tests/data/** linguist-vendored
tests/data_scanned/** linguist-vendored
docs/** linguist-vendored

View File

@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
import rich.table
import typer
from docling_core.transforms.serializer.html import (
HTMLDocSerializer,
HTMLOutputStyle,
HTMLParams,
)
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter
@ -156,6 +162,7 @@ def export_documents(
export_json: bool,
export_html: bool,
export_html_split_page: bool,
show_layout: bool,
export_md: bool,
export_txt: bool,
export_doctags: bool,
@ -189,9 +196,27 @@ def export_documents(
if export_html_split_page:
fname = output_dir / f"{doc_filename}.html"
_log.info(f"writing HTML output to {fname}")
conv_res.document.save_as_html(
filename=fname, image_mode=image_export_mode, split_page_view=True
)
if show_layout:
ser = HTMLDocSerializer(
doc=conv_res.document,
params=HTMLParams(
image_mode=image_export_mode,
output_style=HTMLOutputStyle.SPLIT_PAGE,
),
)
visualizer = LayoutVisualizer()
visualizer.params.show_label = False
ser_res = ser.serialize(
visualizer=visualizer,
)
with open(fname, "w") as fw:
fw.write(ser_res.text)
else:
conv_res.document.save_as_html(
filename=fname,
image_mode=image_export_mode,
split_page_view=True,
)
# Export Text format:
if export_txt:
@ -250,6 +275,13 @@ def convert( # noqa: C901
to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. Defaults to Markdown."
),
show_layout: Annotated[
bool,
typer.Option(
...,
help="If enabled, the page images will show the bounding-boxes of the items.",
),
] = False,
headers: str = typer.Option(
None,
"--headers",
@ -596,6 +628,7 @@ def convert( # noqa: C901
export_json=export_json,
export_html=export_html,
export_html_split_page=export_html_split_page,
show_layout=show_layout,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,

View File

@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
) -> Optional[InputFormat]:
"""Guess the input format of a document by checking part of its content."""
input_format: Optional[InputFormat] = None
content_str = content.decode("utf-8")
if mime == "application/xml":
content_str = content.decode("utf-8")
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
if match_doctype:
xml_doctype = match_doctype.group()
@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
input_format = InputFormat.XML_JATS
elif mime == "text/plain":
content_str = content.decode("utf-8")
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO

View File

@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
).postprocess()
# processed_clusters, processed_cells = clusters, page.cells
conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_clusters])
)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Mean of empty slice|invalid value encountered in scalar divide",
RuntimeWarning,
"numpy",
)
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean([c.confidence for c in processed_cells if c.from_ocr])
)
conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_clusters])
)
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean(
[c.confidence for c in processed_cells if c.from_ocr]
)
)
page.cells = processed_cells
page.predictions.layout = LayoutPrediction(

View File

@ -1,4 +1,5 @@
import re
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
@ -7,7 +8,7 @@ import numpy as np
from PIL import ImageDraw
from pydantic import BaseModel
from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
score = self.rate_text_quality(c.text)
text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanquantile(
text_scores, q=0.10
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "Mean of empty slice", RuntimeWarning, "numpy"
)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanquantile(
text_scores, q=0.10
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
)
# DEBUG code:
def draw_text_boxes(image, cells, show: bool = False):

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
"When defined, it must point to a folder containing all models required by the pipeline."
)
self.keep_images = (
self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
)
with warnings.catch_warnings(): # deprecated generate_table_images
warnings.filterwarnings("ignore", category=DeprecationWarning)
self.keep_images = (
self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
)
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
)
# Generate images of the requested element types
if (
self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
) or (
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
page = next(
(p for p in conv_res.pages if p.page_no == page_ix),
cast("Page", None),
)
assert page is not None
assert page.size is not None
assert page.image is not None
with warnings.catch_warnings(): # deprecated generate_table_images
warnings.filterwarnings("ignore", category=DeprecationWarning)
if (
self.pipeline_options.generate_picture_images
or self.pipeline_options.generate_table_images
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
) or (
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
page_ix = element.prov[0].page_no - 1
page = next(
(p for p in conv_res.pages if p.page_no == page_ix),
cast("Page", None),
)
assert page is not None
assert page.size is not None
assert page.image is not None
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(
page_height=page.size.height * scale
)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
cropped_im, dpi=int(72 * scale)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
cropped_im, dpi=int(72 * scale)
)
# Aggregate confidence values for document:
if len(conv_res.pages) > 0:
conv_res.confidence.layout_score = float(
np.nanmean(
[c.layout_score for c in conv_res.confidence.pages.values()]
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
category=RuntimeWarning,
message="Mean of empty slice|All-NaN slice encountered",
)
)
conv_res.confidence.parse_score = float(
np.nanquantile(
[c.parse_score for c in conv_res.confidence.pages.values()],
q=0.1, # parse score should relate to worst 10% of pages.
conv_res.confidence.layout_score = float(
np.nanmean(
[c.layout_score for c in conv_res.confidence.pages.values()]
)
)
)
conv_res.confidence.table_score = float(
np.nanmean(
[c.table_score for c in conv_res.confidence.pages.values()]
conv_res.confidence.parse_score = float(
np.nanquantile(
[c.parse_score for c in conv_res.confidence.pages.values()],
q=0.1, # parse score should relate to worst 10% of pages.
)
)
)
conv_res.confidence.ocr_score = float(
np.nanmean(
[c.ocr_score for c in conv_res.confidence.pages.values()]
conv_res.confidence.table_score = float(
np.nanmean(
[c.table_score for c in conv_res.confidence.pages.values()]
)
)
conv_res.confidence.ocr_score = float(
np.nanmean(
[c.ocr_score for c in conv_res.confidence.pages.values()]
)
)
)
return conv_res

737
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
######################
python = "^3.9"
pydantic = "^2.0.0"
docling-core = {version = "^2.29.0", extras = ["chunking"]}
docling-core = {version = "^2.31.2", extras = ["chunking"]}
docling-ibm-models = "^3.4.0"
docling-parse = "^4.0.0"
filetype = "^1.2.0"

View File

@ -0,0 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-1
item-2 at level 2: section_header: Pivot table with 1 row header
item-3 at level 3: table with [6x4]
item-4 at level 2: section_header: Pivot table with 2 row headers
item-5 at level 3: table with [6x5]
item-6 at level 2: section_header: Equivalent pivot table
item-7 at level 3: table with [6x5]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
## Pivot table with 1 row header
| Year | Month | Revenue | Cost |
|--------|----------|-----------|--------|
| 2025 | January | $134 | $162 |
| 2025 | February | $150 | $155 |
| 2025 | March | $160 | $143 |
| 2025 | April | $210 | $150 |
| 2025 | May | $280 | $120 |
## Pivot table with 2 row headers
| Year | Quarter | Month | Revenue | Cost |
|--------|-----------|----------|-----------|--------|
| 2025 | Q1 | January | $134 | $162 |
| 2025 | Q1 | February | $150 | $155 |
| 2025 | Q1 | March | $160 | $143 |
| 2025 | Q2 | April | $210 | $150 |
| 2025 | Q2 | May | $280 | $120 |
## Equivalent pivot table
| Year | Quarter | Month | Revenue | Cost |
|--------|-----------|----------|-----------|--------|
| 2025 | Q1 | January | $134 | $162 |
| 2025 | Q1 | February | $150 | $155 |
| 2025 | Q1 | March | $160 | $143 |
| 2025 | Q2 | April | $210 | $150 |
| 2025 | Q2 | May | $280 | $120 |

View File

@ -0,0 +1,94 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
item-3 at level 1: paragraph:
item-4 at level 1: section: group textbox
item-5 at level 2: paragraph: Student falls ill
item-6 at level 2: paragraph:
item-7 at level 2: paragraph:
item-8 at level 2: list: group list
item-9 at level 3: list_item: Suggested Reportable Symptoms:
... sh
Blisters
Headache
Sore throat
item-10 at level 1: list_item:
item-11 at level 1: paragraph:
item-12 at level 1: paragraph:
item-13 at level 1: section: group textbox
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-15 at level 1: paragraph:
item-16 at level 1: paragraph:
item-17 at level 1: paragraph:
item-18 at level 1: paragraph:
item-19 at level 1: section: group textbox
item-20 at level 2: paragraph: Yes
item-21 at level 1: paragraph:
item-22 at level 1: paragraph:
item-23 at level 1: section: group textbox
item-24 at level 2: paragraph: • A report must be submitted wi ... saster Prevention Information Network.
item-25 at level 2: paragraph: • A report must also be submitt ... d Infectious Disease Reporting System.
item-26 at level 2: paragraph:
item-27 at level 2: paragraph:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: paragraph:
item-34 at level 1: section: group textbox
item-35 at level 2: paragraph: Health Bureau:
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-37 at level 2: list: group list
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-40 at level 2: paragraph:
item-41 at level 2: paragraph:
item-42 at level 1: list: group list
item-43 at level 2: list_item:
item-44 at level 1: paragraph:
item-45 at level 1: section: group textbox
item-46 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: paragraph:
item-53 at level 1: paragraph:
item-54 at level 1: section: group textbox
item-55 at level 2: inline: group group
item-56 at level 3: paragraph: The Health Bureau will handle
item-57 at level 3: paragraph: reporting and specimen collection
item-58 at level 3: paragraph: .
item-59 at level 2: paragraph:
item-60 at level 2: paragraph:
item-61 at level 1: paragraph:
item-62 at level 1: paragraph:
item-63 at level 1: paragraph:
item-64 at level 1: section: group textbox
item-65 at level 2: paragraph: Whether the epidemic has eased.
item-66 at level 2: paragraph:
item-67 at level 2: paragraph:
item-68 at level 1: paragraph:
item-69 at level 1: section: group textbox
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-71 at level 2: paragraph: No
item-72 at level 1: paragraph:
item-73 at level 1: paragraph:
item-74 at level 1: section: group textbox
item-75 at level 1: paragraph:
item-76 at level 1: section: group textbox
item-77 at level 1: paragraph:
item-78 at level 1: paragraph:
item-79 at level 1: section: group textbox
item-80 at level 2: paragraph: Case closed.
item-81 at level 2: paragraph:
item-82 at level 2: paragraph:
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-84 at level 1: paragraph:
item-85 at level 1: section: group textbox
item-86 at level 1: paragraph:
item-87 at level 1: paragraph:
item-88 at level 1: paragraph:

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten**
**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten**
**Student falls ill**
- Suggested Reportable Symptoms:
Fever
Cough
Diarrhea
Vomiting
Rash
Blisters
Headache
Sore throat
If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)
show the same suggested reportable symptoms
Yes
• A report must be submitted within 24 hours via the Ministry of Education's Campus Safety and Disaster Prevention Information Network.
• A report must also be submitted within 48 hours through Chiayi County's School Suspected Infectious Disease Reporting System.
**Health Bureau:**
Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.
- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.
- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.
Department of Education:
Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.
The Health Bureau will handle **reporting and specimen collection** .
**Whether the epidemic has eased.**
**Whether the test results are positive for a legally designated infectious disease.**
No
**Case closed.**
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.

View File

@ -18,4 +18,3 @@
</ol>
</body>
</html>

View File

@ -1,145 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<style>
table,
th,
td {
border: 1px solid black;
}
</style>
</head>
<body>
<h2>Pivot table with 1 row header</h2>
<table>
<tr>
<th>Year</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="6">2025</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Pivot table with 2 row headers</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="7">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
<h2>Equivalent pivot table</h2>
<table>
<tr>
<th>Year</th>
<th>Quarter</th>
<th>Month</th>
<th>Revenue</th>
<th>Cost</th>
</tr>
<tr>
<th rowspan="8">2025</th>
<th rowspan="4">Q1</th>
</tr>
<tr>
<td>January</td>
<td>$134</td>
<td>$162</td>
</tr>
<tr>
<td>February</td>
<td>$150</td>
<td>$155</td>
</tr>
<tr>
<td>March</td>
<td>$160</td>
<td>$143</td>
</tr>
<tr>
<th rowspan="3">Q2</th>
</tr>
<tr>
<td>April</td>
<td>$210</td>
<td>$150</td>
</tr>
<tr>
<td>May</td>
<td>$280</td>
<td>$120</td>
</tr>
</table>
</body>
</html>

View File

@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
print(f"converting {csv_path}")
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
conv_result: ConversionResult = converter.convert(csv_path)
if csv_path.stem in (
"csv-too-few-columns",
"csv-too-many-columns",
"csv-inconsistent-header",
):
with warns(UserWarning, match="Inconsistent column lengths"):
conv_result: ConversionResult = converter.convert(csv_path)
else:
conv_result: ConversionResult = converter.convert(csv_path)
doc: DoclingDocument = conv_result.document

View File

@ -38,17 +38,15 @@ def get_converter():
def test_compare_legacy_output(test_doc_paths):
converter = get_converter()
res = converter.convert_all(test_doc_paths, raises_on_error=True)
for conv_res in res:
print(f"Results for {conv_res.input.file}")
print(
json.dumps(
conv_res.legacy_document.model_dump(
mode="json", by_alias=True, exclude_none=True
with pytest.warns(DeprecationWarning, match="Use document instead"):
print(
json.dumps(
conv_res.legacy_document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
)
)
# assert res.legacy_output == res.legacy_output_transformed

View File

@ -4,6 +4,7 @@ import warnings
from pathlib import Path
from typing import List, Optional
import pytest
from docling_core.types.doc import (
DocItem,
DoclingDocument,
@ -302,9 +303,8 @@ def verify_conversion_result_v1(
)
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_document
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
with pytest.warns(DeprecationWarning, match="Use document instead"):
doc_pred: DsDocument = doc_result.legacy_document
doc_pred_md = doc_result.legacy_document.export_to_markdown()
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
@ -391,7 +391,7 @@ def verify_conversion_result_v2(
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DoclingDocument = doc_result.document
doc_pred_md = doc_result.document.export_to_markdown()
doc_pred_dt = doc_result.document.export_to_document_tokens()
doc_pred_dt = doc_result.document.export_to_doctags()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"