mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Merge branch 'main' of https://github.com/docling-project/docling into dev/fix_msword_backend_identify_text_after_image
Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
commit
84dc120d39
3
.gitattributes
vendored
Normal file
3
.gitattributes
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
tests/data/** linguist-vendored
|
||||||
|
tests/data_scanned/** linguist-vendored
|
||||||
|
docs/** linguist-vendored
|
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,3 +1,15 @@
|
|||||||
|
## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* **ocr:** Auto-detect rotated pages in Tesseract ([#1167](https://github.com/docling-project/docling/issues/1167)) ([`45265bf`](https://github.com/docling-project/docling/commit/45265bf8b1a6d6ad5367bb3f17fb3fa9d4366a05))
|
||||||
|
* Establish confidence estimation for document and pages ([#1313](https://github.com/docling-project/docling/issues/1313)) ([`9087524`](https://github.com/docling-project/docling/commit/90875247e5813da1de17f3cd4475937e8bd45571))
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Fix ZeroDivisionError for cell_bbox.area() ([#1636](https://github.com/docling-project/docling/issues/1636)) ([`c2f595d`](https://github.com/docling-project/docling/commit/c2f595d2830ca2e28e68c5da606e89541264f156))
|
||||||
|
* **integration:** Update the Apify Actor integration ([#1619](https://github.com/docling-project/docling/issues/1619)) ([`14d4f5b`](https://github.com/docling-project/docling/commit/14d4f5b109fa65d777ab147b3ce9b5174d020a5d))
|
||||||
|
|
||||||
## [v2.33.0](https://github.com/docling-project/docling/releases/tag/v2.33.0) - 2025-05-20
|
## [v2.33.0](https://github.com/docling-project/docling/releases/tag/v2.33.0) - 2025-05-20
|
||||||
|
|
||||||
### Feature
|
### Feature
|
||||||
|
@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|||||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
).to_top_left_origin(page_height=page_size.height * scale)
|
).to_top_left_origin(page_height=page_size.height * scale)
|
||||||
|
|
||||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
if overlap_frac > 0.5:
|
||||||
if len(text_piece) > 0:
|
if len(text_piece) > 0:
|
||||||
|
@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
|||||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||||
).to_top_left_origin(page_height=page_size.height * scale)
|
).to_top_left_origin(page_height=page_size.height * scale)
|
||||||
|
|
||||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
if overlap_frac > 0.5:
|
||||||
if len(text_piece) > 0:
|
if len(text_piece) > 0:
|
||||||
|
@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
|||||||
.scaled(scale)
|
.scaled(scale)
|
||||||
)
|
)
|
||||||
|
|
||||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
overlap_frac = cell_bbox.intersection_over_self(bbox)
|
||||||
|
|
||||||
if overlap_frac > 0.5:
|
if overlap_frac > 0.5:
|
||||||
if len(text_piece) > 0:
|
if len(text_piece) > 0:
|
||||||
|
@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
|||||||
|
|
||||||
import rich.table
|
import rich.table
|
||||||
import typer
|
import typer
|
||||||
|
from docling_core.transforms.serializer.html import (
|
||||||
|
HTMLDocSerializer,
|
||||||
|
HTMLOutputStyle,
|
||||||
|
HTMLParams,
|
||||||
|
)
|
||||||
|
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
||||||
from docling_core.types.doc import ImageRefMode
|
from docling_core.types.doc import ImageRefMode
|
||||||
from docling_core.utils.file import resolve_source_to_path
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
@ -156,6 +162,7 @@ def export_documents(
|
|||||||
export_json: bool,
|
export_json: bool,
|
||||||
export_html: bool,
|
export_html: bool,
|
||||||
export_html_split_page: bool,
|
export_html_split_page: bool,
|
||||||
|
show_layout: bool,
|
||||||
export_md: bool,
|
export_md: bool,
|
||||||
export_txt: bool,
|
export_txt: bool,
|
||||||
export_doctags: bool,
|
export_doctags: bool,
|
||||||
@ -189,9 +196,27 @@ def export_documents(
|
|||||||
if export_html_split_page:
|
if export_html_split_page:
|
||||||
fname = output_dir / f"{doc_filename}.html"
|
fname = output_dir / f"{doc_filename}.html"
|
||||||
_log.info(f"writing HTML output to {fname}")
|
_log.info(f"writing HTML output to {fname}")
|
||||||
conv_res.document.save_as_html(
|
if show_layout:
|
||||||
filename=fname, image_mode=image_export_mode, split_page_view=True
|
ser = HTMLDocSerializer(
|
||||||
)
|
doc=conv_res.document,
|
||||||
|
params=HTMLParams(
|
||||||
|
image_mode=image_export_mode,
|
||||||
|
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
visualizer = LayoutVisualizer()
|
||||||
|
visualizer.params.show_label = False
|
||||||
|
ser_res = ser.serialize(
|
||||||
|
visualizer=visualizer,
|
||||||
|
)
|
||||||
|
with open(fname, "w") as fw:
|
||||||
|
fw.write(ser_res.text)
|
||||||
|
else:
|
||||||
|
conv_res.document.save_as_html(
|
||||||
|
filename=fname,
|
||||||
|
image_mode=image_export_mode,
|
||||||
|
split_page_view=True,
|
||||||
|
)
|
||||||
|
|
||||||
# Export Text format:
|
# Export Text format:
|
||||||
if export_txt:
|
if export_txt:
|
||||||
@ -250,6 +275,13 @@ def convert( # noqa: C901
|
|||||||
to_formats: List[OutputFormat] = typer.Option(
|
to_formats: List[OutputFormat] = typer.Option(
|
||||||
None, "--to", help="Specify output formats. Defaults to Markdown."
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
||||||
),
|
),
|
||||||
|
show_layout: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help="If enabled, the page images will show the bounding-boxes of the items.",
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
headers: str = typer.Option(
|
headers: str = typer.Option(
|
||||||
None,
|
None,
|
||||||
"--headers",
|
"--headers",
|
||||||
@ -596,6 +628,7 @@ def convert( # noqa: C901
|
|||||||
export_json=export_json,
|
export_json=export_json,
|
||||||
export_html=export_html,
|
export_html=export_html,
|
||||||
export_html_split_page=export_html_split_page,
|
export_html_split_page=export_html_split_page,
|
||||||
|
show_layout=show_layout,
|
||||||
export_md=export_md,
|
export_md=export_md,
|
||||||
export_txt=export_txt,
|
export_txt=export_txt,
|
||||||
export_doctags=export_doctags,
|
export_doctags=export_doctags,
|
||||||
|
@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
) -> Optional[InputFormat]:
|
) -> Optional[InputFormat]:
|
||||||
"""Guess the input format of a document by checking part of its content."""
|
"""Guess the input format of a document by checking part of its content."""
|
||||||
input_format: Optional[InputFormat] = None
|
input_format: Optional[InputFormat] = None
|
||||||
content_str = content.decode("utf-8")
|
|
||||||
|
|
||||||
if mime == "application/xml":
|
if mime == "application/xml":
|
||||||
|
content_str = content.decode("utf-8")
|
||||||
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
||||||
if match_doctype:
|
if match_doctype:
|
||||||
xml_doctype = match_doctype.group()
|
xml_doctype = match_doctype.group()
|
||||||
@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
input_format = InputFormat.XML_JATS
|
input_format = InputFormat.XML_JATS
|
||||||
|
|
||||||
elif mime == "text/plain":
|
elif mime == "text/plain":
|
||||||
|
content_str = content.decode("utf-8")
|
||||||
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
||||||
input_format = InputFormat.XML_USPTO
|
input_format = InputFormat.XML_USPTO
|
||||||
|
|
||||||
|
@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
|
|||||||
).postprocess()
|
).postprocess()
|
||||||
# processed_clusters, processed_cells = clusters, page.cells
|
# processed_clusters, processed_cells = clusters, page.cells
|
||||||
|
|
||||||
conv_res.confidence.pages[page.page_no].layout_score = float(
|
with warnings.catch_warnings():
|
||||||
np.mean([c.confidence for c in processed_clusters])
|
warnings.filterwarnings(
|
||||||
)
|
"ignore",
|
||||||
|
"Mean of empty slice|invalid value encountered in scalar divide",
|
||||||
|
RuntimeWarning,
|
||||||
|
"numpy",
|
||||||
|
)
|
||||||
|
|
||||||
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
conv_res.confidence.pages[page.page_no].layout_score = float(
|
||||||
np.mean([c.confidence for c in processed_cells if c.from_ocr])
|
np.mean([c.confidence for c in processed_clusters])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
||||||
|
np.mean(
|
||||||
|
[c.confidence for c in processed_cells if c.from_ocr]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
page.cells = processed_cells
|
page.cells = processed_cells
|
||||||
page.predictions.layout = LayoutPrediction(
|
page.predictions.layout = LayoutPrediction(
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
|
import warnings
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@ -7,7 +8,7 @@ import numpy as np
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, ScoreValue
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
score = self.rate_text_quality(c.text)
|
score = self.rate_text_quality(c.text)
|
||||||
text_scores.append(score)
|
text_scores.append(score)
|
||||||
|
|
||||||
conv_res.confidence.pages[page.page_no].parse_score = float(
|
with warnings.catch_warnings():
|
||||||
np.nanquantile(
|
warnings.filterwarnings(
|
||||||
text_scores, q=0.10
|
"ignore", "Mean of empty slice", RuntimeWarning, "numpy"
|
||||||
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
)
|
||||||
)
|
conv_res.confidence.pages[page.page_no].parse_score = float(
|
||||||
|
np.nanquantile(
|
||||||
|
text_scores, q=0.10
|
||||||
|
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
||||||
|
)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
|
@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
"When defined, it must point to a folder containing all models required by the pipeline."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.keep_images = (
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
self.pipeline_options.generate_page_images
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
or self.pipeline_options.generate_picture_images
|
self.keep_images = (
|
||||||
or self.pipeline_options.generate_table_images
|
self.pipeline_options.generate_page_images
|
||||||
)
|
or self.pipeline_options.generate_picture_images
|
||||||
|
or self.pipeline_options.generate_table_images
|
||||||
|
)
|
||||||
|
|
||||||
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Generate images of the requested element types
|
# Generate images of the requested element types
|
||||||
if (
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
self.pipeline_options.generate_picture_images
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
or self.pipeline_options.generate_table_images
|
if (
|
||||||
):
|
self.pipeline_options.generate_picture_images
|
||||||
scale = self.pipeline_options.images_scale
|
or self.pipeline_options.generate_table_images
|
||||||
for element, _level in conv_res.document.iterate_items():
|
):
|
||||||
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
scale = self.pipeline_options.images_scale
|
||||||
continue
|
for element, _level in conv_res.document.iterate_items():
|
||||||
if (
|
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
||||||
isinstance(element, PictureItem)
|
continue
|
||||||
and self.pipeline_options.generate_picture_images
|
if (
|
||||||
) or (
|
isinstance(element, PictureItem)
|
||||||
isinstance(element, TableItem)
|
and self.pipeline_options.generate_picture_images
|
||||||
and self.pipeline_options.generate_table_images
|
) or (
|
||||||
):
|
isinstance(element, TableItem)
|
||||||
page_ix = element.prov[0].page_no - 1
|
and self.pipeline_options.generate_table_images
|
||||||
page = next(
|
):
|
||||||
(p for p in conv_res.pages if p.page_no == page_ix),
|
page_ix = element.prov[0].page_no - 1
|
||||||
cast("Page", None),
|
page = next(
|
||||||
)
|
(p for p in conv_res.pages if p.page_no == page_ix),
|
||||||
assert page is not None
|
cast("Page", None),
|
||||||
assert page.size is not None
|
)
|
||||||
assert page.image is not None
|
assert page is not None
|
||||||
|
assert page.size is not None
|
||||||
|
assert page.image is not None
|
||||||
|
|
||||||
crop_bbox = (
|
crop_bbox = (
|
||||||
element.prov[0]
|
element.prov[0]
|
||||||
.bbox.scaled(scale=scale)
|
.bbox.scaled(scale=scale)
|
||||||
.to_top_left_origin(page_height=page.size.height * scale)
|
.to_top_left_origin(
|
||||||
)
|
page_height=page.size.height * scale
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
||||||
element.image = ImageRef.from_pil(
|
element.image = ImageRef.from_pil(
|
||||||
cropped_im, dpi=int(72 * scale)
|
cropped_im, dpi=int(72 * scale)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Aggregate confidence values for document:
|
# Aggregate confidence values for document:
|
||||||
if len(conv_res.pages) > 0:
|
if len(conv_res.pages) > 0:
|
||||||
conv_res.confidence.layout_score = float(
|
with warnings.catch_warnings():
|
||||||
np.nanmean(
|
warnings.filterwarnings(
|
||||||
[c.layout_score for c in conv_res.confidence.pages.values()]
|
"ignore",
|
||||||
|
category=RuntimeWarning,
|
||||||
|
message="Mean of empty slice|All-NaN slice encountered",
|
||||||
)
|
)
|
||||||
)
|
conv_res.confidence.layout_score = float(
|
||||||
conv_res.confidence.parse_score = float(
|
np.nanmean(
|
||||||
np.nanquantile(
|
[c.layout_score for c in conv_res.confidence.pages.values()]
|
||||||
[c.parse_score for c in conv_res.confidence.pages.values()],
|
)
|
||||||
q=0.1, # parse score should relate to worst 10% of pages.
|
|
||||||
)
|
)
|
||||||
)
|
conv_res.confidence.parse_score = float(
|
||||||
conv_res.confidence.table_score = float(
|
np.nanquantile(
|
||||||
np.nanmean(
|
[c.parse_score for c in conv_res.confidence.pages.values()],
|
||||||
[c.table_score for c in conv_res.confidence.pages.values()]
|
q=0.1, # parse score should relate to worst 10% of pages.
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
conv_res.confidence.table_score = float(
|
||||||
conv_res.confidence.ocr_score = float(
|
np.nanmean(
|
||||||
np.nanmean(
|
[c.table_score for c in conv_res.confidence.pages.values()]
|
||||||
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.ocr_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
|
@ -90,17 +90,12 @@ class SpatialClusterIndex:
|
|||||||
containment_threshold: float,
|
containment_threshold: float,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Check if two bboxes overlap sufficiently."""
|
"""Check if two bboxes overlap sufficiently."""
|
||||||
area1, area2 = bbox1.area(), bbox2.area()
|
if bbox1.area() <= 0 or bbox2.area() <= 0:
|
||||||
if area1 <= 0 or area2 <= 0:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
overlap_area = bbox1.intersection_area_with(bbox2)
|
iou = bbox1.intersection_over_union(bbox2)
|
||||||
if overlap_area <= 0:
|
containment1 = bbox1.intersection_over_self(bbox2)
|
||||||
return False
|
containment2 = bbox2.intersection_over_self(bbox1)
|
||||||
|
|
||||||
iou = overlap_area / (area1 + area2 - overlap_area)
|
|
||||||
containment1 = overlap_area / area1
|
|
||||||
containment2 = overlap_area / area2
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
iou > overlap_threshold
|
iou > overlap_threshold
|
||||||
@ -321,11 +316,9 @@ class LayoutPostprocessor:
|
|||||||
for special in special_clusters:
|
for special in special_clusters:
|
||||||
contained = []
|
contained = []
|
||||||
for cluster in self.regular_clusters:
|
for cluster in self.regular_clusters:
|
||||||
overlap = cluster.bbox.intersection_area_with(special.bbox)
|
containment = cluster.bbox.intersection_over_self(special.bbox)
|
||||||
if overlap > 0:
|
if containment > 0.8:
|
||||||
containment = overlap / cluster.bbox.area()
|
contained.append(cluster)
|
||||||
if containment > 0.8:
|
|
||||||
contained.append(cluster)
|
|
||||||
|
|
||||||
if contained:
|
if contained:
|
||||||
# Sort contained clusters by minimum cell ID:
|
# Sort contained clusters by minimum cell ID:
|
||||||
@ -379,9 +372,7 @@ class LayoutPostprocessor:
|
|||||||
for regular in self.regular_clusters:
|
for regular in self.regular_clusters:
|
||||||
if regular.label == DocItemLabel.TABLE:
|
if regular.label == DocItemLabel.TABLE:
|
||||||
# Calculate overlap
|
# Calculate overlap
|
||||||
overlap = regular.bbox.intersection_area_with(wrapper.bbox)
|
overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)
|
||||||
wrapper_area = wrapper.bbox.area()
|
|
||||||
overlap_ratio = overlap / wrapper_area
|
|
||||||
|
|
||||||
conf_diff = wrapper.confidence - regular.confidence
|
conf_diff = wrapper.confidence - regular.confidence
|
||||||
|
|
||||||
@ -421,8 +412,7 @@ class LayoutPostprocessor:
|
|||||||
# Rule 2: CODE vs others
|
# Rule 2: CODE vs others
|
||||||
if candidate.label == DocItemLabel.CODE:
|
if candidate.label == DocItemLabel.CODE:
|
||||||
# Calculate how much of the other cluster is contained within the CODE cluster
|
# Calculate how much of the other cluster is contained within the CODE cluster
|
||||||
overlap = other.bbox.intersection_area_with(candidate.bbox)
|
containment = other.bbox.intersection_over_self(candidate.bbox)
|
||||||
containment = overlap / other.bbox.area()
|
|
||||||
if containment > 0.8: # other is 80% contained within CODE
|
if containment > 0.8: # other is 80% contained within CODE
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@ -586,11 +576,9 @@ class LayoutPostprocessor:
|
|||||||
if cell.rect.to_bounding_box().area() <= 0:
|
if cell.rect.to_bounding_box().area() <= 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
overlap = cell.rect.to_bounding_box().intersection_area_with(
|
overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
|
||||||
cluster.bbox
|
cluster.bbox
|
||||||
)
|
)
|
||||||
overlap_ratio = overlap / cell.rect.to_bounding_box().area()
|
|
||||||
|
|
||||||
if overlap_ratio > best_overlap:
|
if overlap_ratio > best_overlap:
|
||||||
best_overlap = overlap_ratio
|
best_overlap = overlap_ratio
|
||||||
best_cluster = cluster
|
best_cluster = cluster
|
||||||
|
737
poetry.lock
generated
737
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.33.0" # DO NOT EDIT, updated automatically
|
version = "2.34.0" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = [
|
authors = [
|
||||||
"Christoph Auer <cau@zurich.ibm.com>",
|
"Christoph Auer <cau@zurich.ibm.com>",
|
||||||
@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = {version = "^2.29.0", extras = ["chunking"]}
|
docling-core = {version = "^2.31.2", extras = ["chunking"]}
|
||||||
docling-ibm-models = "^3.4.0"
|
docling-ibm-models = "^3.4.0"
|
||||||
docling-parse = "^4.0.0"
|
docling-parse = "^4.0.0"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
8
tests/data/groundtruth/docling_v2/example_08.html.itxt
vendored
Normal file
8
tests/data/groundtruth/docling_v2/example_08.html.itxt
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: section: group header-1
|
||||||
|
item-2 at level 2: section_header: Pivot table with with 1 row header
|
||||||
|
item-3 at level 3: table with [6x4]
|
||||||
|
item-4 at level 2: section_header: Pivot table with 2 row headers
|
||||||
|
item-5 at level 3: table with [6x5]
|
||||||
|
item-6 at level 2: section_header: Equivalent pivot table
|
||||||
|
item-7 at level 3: table with [6x5]
|
2008
tests/data/groundtruth/docling_v2/example_08.html.json
vendored
Normal file
2008
tests/data/groundtruth/docling_v2/example_08.html.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
29
tests/data/groundtruth/docling_v2/example_08.html.md
vendored
Normal file
29
tests/data/groundtruth/docling_v2/example_08.html.md
vendored
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
## Pivot table with with 1 row header
|
||||||
|
|
||||||
|
| Year | Month | Revenue | Cost |
|
||||||
|
|--------|----------|-----------|--------|
|
||||||
|
| 2025 | January | $134 | $162 |
|
||||||
|
| 2025 | February | $150 | $155 |
|
||||||
|
| 2025 | March | $160 | $143 |
|
||||||
|
| 2025 | April | $210 | $150 |
|
||||||
|
| 2025 | May | $280 | $120 |
|
||||||
|
|
||||||
|
## Pivot table with 2 row headers
|
||||||
|
|
||||||
|
| Year | Quarter | Month | Revenue | Cost |
|
||||||
|
|--------|-----------|----------|-----------|--------|
|
||||||
|
| 2025 | Q1 | January | $134 | $162 |
|
||||||
|
| 2025 | Q1 | February | $150 | $155 |
|
||||||
|
| 2025 | Q1 | March | $160 | $143 |
|
||||||
|
| 2025 | Q2 | April | $210 | $150 |
|
||||||
|
| 2025 | Q2 | May | $280 | $120 |
|
||||||
|
|
||||||
|
## Equivalent pivot table
|
||||||
|
|
||||||
|
| Year | Quarter | Month | Revenue | Cost |
|
||||||
|
|--------|-----------|----------|-----------|--------|
|
||||||
|
| 2025 | Q1 | January | $134 | $162 |
|
||||||
|
| 2025 | Q1 | February | $150 | $155 |
|
||||||
|
| 2025 | Q1 | March | $160 | $143 |
|
||||||
|
| 2025 | Q2 | April | $210 | $150 |
|
||||||
|
| 2025 | Q2 | May | $280 | $120 |
|
94
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
Normal file
94
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
|
||||||
|
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
|
||||||
|
item-3 at level 1: paragraph:
|
||||||
|
item-4 at level 1: section: group textbox
|
||||||
|
item-5 at level 2: paragraph: Student falls ill
|
||||||
|
item-6 at level 2: paragraph:
|
||||||
|
item-7 at level 2: paragraph:
|
||||||
|
item-8 at level 2: list: group list
|
||||||
|
item-9 at level 3: list_item: Suggested Reportable Symptoms:
|
||||||
|
* ... sh
|
||||||
|
* Blisters
|
||||||
|
* Headache
|
||||||
|
* Sore throat
|
||||||
|
item-10 at level 1: list_item:
|
||||||
|
item-11 at level 1: paragraph:
|
||||||
|
item-12 at level 1: paragraph:
|
||||||
|
item-13 at level 1: section: group textbox
|
||||||
|
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||||
|
item-15 at level 1: paragraph:
|
||||||
|
item-16 at level 1: paragraph:
|
||||||
|
item-17 at level 1: paragraph:
|
||||||
|
item-18 at level 1: paragraph:
|
||||||
|
item-19 at level 1: section: group textbox
|
||||||
|
item-20 at level 2: paragraph: Yes
|
||||||
|
item-21 at level 1: paragraph:
|
||||||
|
item-22 at level 1: paragraph:
|
||||||
|
item-23 at level 1: section: group textbox
|
||||||
|
item-24 at level 2: paragraph: A report must be submitted wi ... saster Prevention Information Network.
|
||||||
|
item-25 at level 2: paragraph: A report must also be submitt ... d Infectious Disease Reporting System.
|
||||||
|
item-26 at level 2: paragraph:
|
||||||
|
item-27 at level 2: paragraph:
|
||||||
|
item-28 at level 1: paragraph:
|
||||||
|
item-29 at level 1: paragraph:
|
||||||
|
item-30 at level 1: paragraph:
|
||||||
|
item-31 at level 1: paragraph:
|
||||||
|
item-32 at level 1: paragraph:
|
||||||
|
item-33 at level 1: paragraph:
|
||||||
|
item-34 at level 1: section: group textbox
|
||||||
|
item-35 at level 2: paragraph: Health Bureau:
|
||||||
|
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||||
|
item-37 at level 2: list: group list
|
||||||
|
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||||
|
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||||
|
item-40 at level 2: paragraph:
|
||||||
|
item-41 at level 2: paragraph:
|
||||||
|
item-42 at level 1: list: group list
|
||||||
|
item-43 at level 2: list_item:
|
||||||
|
item-44 at level 1: paragraph:
|
||||||
|
item-45 at level 1: section: group textbox
|
||||||
|
item-46 at level 2: paragraph: Department of Education:
|
||||||
|
Collabo ... vention measures at all school levels.
|
||||||
|
item-47 at level 1: paragraph:
|
||||||
|
item-48 at level 1: paragraph:
|
||||||
|
item-49 at level 1: paragraph:
|
||||||
|
item-50 at level 1: paragraph:
|
||||||
|
item-51 at level 1: paragraph:
|
||||||
|
item-52 at level 1: paragraph:
|
||||||
|
item-53 at level 1: paragraph:
|
||||||
|
item-54 at level 1: section: group textbox
|
||||||
|
item-55 at level 2: inline: group group
|
||||||
|
item-56 at level 3: paragraph: The Health Bureau will handle
|
||||||
|
item-57 at level 3: paragraph: reporting and specimen collection
|
||||||
|
item-58 at level 3: paragraph: .
|
||||||
|
item-59 at level 2: paragraph:
|
||||||
|
item-60 at level 2: paragraph:
|
||||||
|
item-61 at level 1: paragraph:
|
||||||
|
item-62 at level 1: paragraph:
|
||||||
|
item-63 at level 1: paragraph:
|
||||||
|
item-64 at level 1: section: group textbox
|
||||||
|
item-65 at level 2: paragraph: Whether the epidemic has eased.
|
||||||
|
item-66 at level 2: paragraph:
|
||||||
|
item-67 at level 2: paragraph:
|
||||||
|
item-68 at level 1: paragraph:
|
||||||
|
item-69 at level 1: section: group textbox
|
||||||
|
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||||
|
item-71 at level 2: paragraph: No
|
||||||
|
item-72 at level 1: paragraph:
|
||||||
|
item-73 at level 1: paragraph:
|
||||||
|
item-74 at level 1: section: group textbox
|
||||||
|
item-75 at level 1: paragraph:
|
||||||
|
item-76 at level 1: section: group textbox
|
||||||
|
item-77 at level 1: paragraph:
|
||||||
|
item-78 at level 1: paragraph:
|
||||||
|
item-79 at level 1: section: group textbox
|
||||||
|
item-80 at level 2: paragraph: Case closed.
|
||||||
|
item-81 at level 2: paragraph:
|
||||||
|
item-82 at level 2: paragraph:
|
||||||
|
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||||
|
item-84 at level 1: paragraph:
|
||||||
|
item-85 at level 1: section: group textbox
|
||||||
|
item-86 at level 1: paragraph:
|
||||||
|
item-87 at level 1: paragraph:
|
||||||
|
item-88 at level 1: paragraph:
|
1470
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
Normal file
1470
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
Normal file
File diff suppressed because it is too large
Load Diff
46
tests/data/groundtruth/docling_v2/textbox.docx.md
vendored
Normal file
46
tests/data/groundtruth/docling_v2/textbox.docx.md
vendored
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten**
|
||||||
|
|
||||||
|
**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten**
|
||||||
|
|
||||||
|
**Student falls ill**
|
||||||
|
|
||||||
|
- Suggested Reportable Symptoms:
|
||||||
|
* Fever
|
||||||
|
* Cough
|
||||||
|
* Diarrhea
|
||||||
|
* Vomiting
|
||||||
|
* Rash
|
||||||
|
* Blisters
|
||||||
|
* Headache
|
||||||
|
* Sore throat
|
||||||
|
|
||||||
|
If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)
|
||||||
|
show the same suggested reportable symptoms
|
||||||
|
|
||||||
|
Yes
|
||||||
|
|
||||||
|
A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
|
||||||
|
|
||||||
|
A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
|
||||||
|
|
||||||
|
**Health Bureau:**
|
||||||
|
|
||||||
|
Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.
|
||||||
|
|
||||||
|
- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.
|
||||||
|
- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.
|
||||||
|
|
||||||
|
Department of Education:
|
||||||
|
Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.
|
||||||
|
|
||||||
|
The Health Bureau will handle **reporting and specimen collection** .
|
||||||
|
|
||||||
|
**Whether the epidemic has eased.**
|
||||||
|
|
||||||
|
**Whether the test results are positive for a legally designated infectious disease.**
|
||||||
|
|
||||||
|
No
|
||||||
|
|
||||||
|
**Case closed.**
|
||||||
|
|
||||||
|
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.
|
@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
|
|||||||
print(f"converting {csv_path}")
|
print(f"converting {csv_path}")
|
||||||
|
|
||||||
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
|
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
|
||||||
|
if csv_path.stem in (
|
||||||
conv_result: ConversionResult = converter.convert(csv_path)
|
"csv-too-few-columns",
|
||||||
|
"csv-too-many-columns",
|
||||||
|
"csv-inconsistent-header",
|
||||||
|
):
|
||||||
|
with warns(UserWarning, match="Inconsistent column lengths"):
|
||||||
|
conv_result: ConversionResult = converter.convert(csv_path)
|
||||||
|
else:
|
||||||
|
conv_result: ConversionResult = converter.convert(csv_path)
|
||||||
|
|
||||||
doc: DoclingDocument = conv_result.document
|
doc: DoclingDocument = conv_result.document
|
||||||
|
|
||||||
|
@ -38,17 +38,15 @@ def get_converter():
|
|||||||
|
|
||||||
def test_compare_legacy_output(test_doc_paths):
|
def test_compare_legacy_output(test_doc_paths):
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
res = converter.convert_all(test_doc_paths, raises_on_error=True)
|
res = converter.convert_all(test_doc_paths, raises_on_error=True)
|
||||||
|
|
||||||
for conv_res in res:
|
for conv_res in res:
|
||||||
print(f"Results for {conv_res.input.file}")
|
print(f"Results for {conv_res.input.file}")
|
||||||
print(
|
with pytest.warns(DeprecationWarning, match="Use document instead"):
|
||||||
json.dumps(
|
print(
|
||||||
conv_res.legacy_document.model_dump(
|
json.dumps(
|
||||||
mode="json", by_alias=True, exclude_none=True
|
conv_res.legacy_document.model_dump(
|
||||||
|
mode="json", by_alias=True, exclude_none=True
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
# assert res.legacy_output == res.legacy_output_transformed
|
# assert res.legacy_output == res.legacy_output_transformed
|
||||||
|
@ -4,6 +4,7 @@ import warnings
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
DocItem,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@ -302,9 +303,8 @@ def verify_conversion_result_v1(
|
|||||||
)
|
)
|
||||||
|
|
||||||
doc_pred_pages: List[Page] = doc_result.pages
|
doc_pred_pages: List[Page] = doc_result.pages
|
||||||
doc_pred: DsDocument = doc_result.legacy_document
|
with pytest.warns(DeprecationWarning, match="Use document instead"):
|
||||||
with warnings.catch_warnings():
|
doc_pred: DsDocument = doc_result.legacy_document
|
||||||
warnings.simplefilter("ignore", DeprecationWarning)
|
|
||||||
doc_pred_md = doc_result.legacy_document.export_to_markdown()
|
doc_pred_md = doc_result.legacy_document.export_to_markdown()
|
||||||
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
|
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
|
||||||
|
|
||||||
@ -391,7 +391,7 @@ def verify_conversion_result_v2(
|
|||||||
doc_pred_pages: List[Page] = doc_result.pages
|
doc_pred_pages: List[Page] = doc_result.pages
|
||||||
doc_pred: DoclingDocument = doc_result.document
|
doc_pred: DoclingDocument = doc_result.document
|
||||||
doc_pred_md = doc_result.document.export_to_markdown()
|
doc_pred_md = doc_result.document.export_to_markdown()
|
||||||
doc_pred_dt = doc_result.document.export_to_document_tokens()
|
doc_pred_dt = doc_result.document.export_to_doctags()
|
||||||
|
|
||||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user