Merge branch 'main' of https://github.com/docling-project/docling into dev/fix_msword_backend_identify_text_after_image

Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
Michael Krissgau
2025-06-05 22:20:09 +02:00
67 changed files with 8728 additions and 9731 deletions

Binary file not shown.

View File

@@ -26,69 +26,71 @@ item-0 at level 0: unspecified: group _root_
item-21 at level 1: paragraph:
item-22 at level 1: paragraph:
item-23 at level 1: section: group textbox
item-24 at level 2: paragraph:  A report must be submitted wi ... saster Prevention Information Network.
item-25 at level 2: paragraph:  A report must also be submitt ... d Infectious Disease Reporting System.
item-26 at level 2: paragraph:
item-24 at level 2: list: group list
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-27 at level 2: paragraph:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-28 at level 2: paragraph:
item-29 at level 1: list: group list
item-30 at level 2: list_item:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: paragraph:
item-34 at level 1: section: group textbox
item-35 at level 2: paragraph: Health Bureau:
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-37 at level 2: list: group list
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-40 at level 2: paragraph:
item-41 at level 2: paragraph:
item-42 at level 1: list: group list
item-43 at level 2: list_item:
item-44 at level 1: paragraph:
item-45 at level 1: section: group textbox
item-46 at level 2: paragraph: Department of Education:
item-34 at level 1: paragraph:
item-35 at level 1: paragraph:
item-36 at level 1: section: group textbox
item-37 at level 2: paragraph: Health Bureau:
item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-39 at level 2: list: group list
item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-42 at level 2: paragraph:
item-43 at level 2: paragraph:
item-44 at level 1: list: group list
item-45 at level 2: list_item:
item-46 at level 1: paragraph:
item-47 at level 1: section: group textbox
item-48 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: paragraph:
item-53 at level 1: paragraph:
item-54 at level 1: section: group textbox
item-55 at level 2: inline: group group
item-56 at level 3: paragraph: The Health Bureau will handle
item-57 at level 3: paragraph: reporting and specimen collection
item-58 at level 3: paragraph: .
item-59 at level 2: paragraph:
item-60 at level 2: paragraph:
item-61 at level 1: paragraph:
item-62 at level 1: paragraph:
item-54 at level 1: paragraph:
item-55 at level 1: paragraph:
item-56 at level 1: section: group textbox
item-57 at level 2: inline: group group
item-58 at level 3: paragraph: The Health Bureau will handle
item-59 at level 3: paragraph: reporting and specimen collection
item-60 at level 3: paragraph: .
item-61 at level 2: paragraph:
item-62 at level 2: paragraph:
item-63 at level 1: paragraph:
item-64 at level 1: section: group textbox
item-65 at level 2: paragraph: Whether the epidemic has eased.
item-66 at level 2: paragraph:
item-67 at level 2: paragraph:
item-68 at level 1: paragraph:
item-69 at level 1: section: group textbox
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-71 at level 2: paragraph: No
item-72 at level 1: paragraph:
item-73 at level 1: paragraph:
item-74 at level 1: section: group textbox
item-64 at level 1: paragraph:
item-65 at level 1: paragraph:
item-66 at level 1: section: group textbox
item-67 at level 2: paragraph: Whether the epidemic has eased.
item-68 at level 2: paragraph:
item-69 at level 2: paragraph:
item-70 at level 1: paragraph:
item-71 at level 1: section: group textbox
item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-73 at level 2: paragraph: No
item-74 at level 1: paragraph:
item-75 at level 1: paragraph:
item-76 at level 1: section: group textbox
item-77 at level 1: paragraph:
item-78 at level 1: paragraph:
item-79 at level 1: section: group textbox
item-80 at level 2: paragraph: Case closed.
item-81 at level 2: paragraph:
item-82 at level 2: paragraph:
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-84 at level 1: paragraph:
item-85 at level 1: section: group textbox
item-78 at level 1: section: group textbox
item-79 at level 1: paragraph:
item-80 at level 1: paragraph:
item-81 at level 1: section: group textbox
item-82 at level 2: paragraph: Case closed.
item-83 at level 2: paragraph:
item-84 at level 2: paragraph:
item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-86 at level 1: paragraph:
item-87 at level 1: paragraph:
item-88 at level 1: paragraph:
item-87 at level 1: section: group textbox
item-88 at level 1: paragraph:
item-89 at level 1: paragraph:
item-90 at level 1: paragraph:

View File

@@ -4,7 +4,7 @@
"name": "textbox",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 830302052279341882,
"binary_hash": 11723995438039370060,
"filename": "textbox.docx"
},
"furniture": {
@@ -66,7 +66,7 @@
"$ref": "#/groups/4"
},
{
"$ref": "#/texts/22"
"$ref": "#/groups/6"
},
{
"$ref": "#/texts/23"
@@ -84,16 +84,16 @@
"$ref": "#/texts/27"
},
{
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
{
"$ref": "#/groups/7"
"$ref": "#/groups/9"
},
{
"$ref": "#/texts/35"
},
{
"$ref": "#/groups/8"
"$ref": "#/groups/10"
},
{
"$ref": "#/texts/37"
@@ -117,7 +117,7 @@
"$ref": "#/texts/43"
},
{
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
{
"$ref": "#/texts/49"
@@ -129,13 +129,13 @@
"$ref": "#/texts/51"
},
{
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
{
"$ref": "#/texts/55"
},
{
"$ref": "#/groups/12"
"$ref": "#/groups/14"
},
{
"$ref": "#/texts/58"
@@ -144,13 +144,13 @@
"$ref": "#/texts/59"
},
{
"$ref": "#/groups/13"
"$ref": "#/groups/15"
},
{
"$ref": "#/texts/60"
},
{
"$ref": "#/groups/14"
"$ref": "#/groups/16"
},
{
"$ref": "#/texts/61"
@@ -159,13 +159,13 @@
"$ref": "#/texts/62"
},
{
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
{
"$ref": "#/texts/67"
},
{
"$ref": "#/groups/16"
"$ref": "#/groups/18"
},
{
"$ref": "#/texts/68"
@@ -254,10 +254,7 @@
},
"children": [
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/20"
@@ -272,6 +269,37 @@
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/groups/4"
},
"children": [
{
"$ref": "#/texts/18"
},
{
"$ref": "#/texts/19"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/22"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/body"
},
@@ -283,7 +311,7 @@
"$ref": "#/texts/29"
},
{
"$ref": "#/groups/6"
"$ref": "#/groups/8"
},
{
"$ref": "#/texts/32"
@@ -297,9 +325,9 @@
"label": "section"
},
{
"self_ref": "#/groups/6",
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [
{
@@ -314,7 +342,7 @@
"label": "list"
},
{
"self_ref": "#/groups/7",
"self_ref": "#/groups/9",
"parent": {
"$ref": "#/body"
},
@@ -328,7 +356,7 @@
"label": "list"
},
{
"self_ref": "#/groups/8",
"self_ref": "#/groups/10",
"parent": {
"$ref": "#/body"
},
@@ -342,13 +370,13 @@
"label": "section"
},
{
"self_ref": "#/groups/9",
"self_ref": "#/groups/11",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
{
"$ref": "#/texts/47"
@@ -362,9 +390,9 @@
"label": "section"
},
{
"self_ref": "#/groups/10",
"self_ref": "#/groups/12",
"parent": {
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
"children": [
{
@@ -382,7 +410,7 @@
"label": "inline"
},
{
"self_ref": "#/groups/11",
"self_ref": "#/groups/13",
"parent": {
"$ref": "#/body"
},
@@ -402,7 +430,7 @@
"label": "section"
},
{
"self_ref": "#/groups/12",
"self_ref": "#/groups/14",
"parent": {
"$ref": "#/body"
},
@@ -418,31 +446,31 @@
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/13",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/14",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/15",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/16",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"name": "textbox",
"label": "section"
},
{
"self_ref": "#/groups/17",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/63"
@@ -462,7 +490,7 @@
"label": "section"
},
{
"self_ref": "#/groups/16",
"self_ref": "#/groups/18",
"parent": {
"$ref": "#/body"
},
@@ -732,38 +760,42 @@
{
"self_ref": "#/texts/18",
"parent": {
"$ref": "#/groups/4"
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"text": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"orig": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"text": "A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/4"
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"text": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"orig": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"text": "A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
@@ -792,14 +824,16 @@
{
"self_ref": "#/texts/22",
"parent": {
"$ref": "#/body"
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "list_item",
"prov": [],
"orig": "",
"text": ""
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/23",
@@ -864,7 +898,7 @@
{
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@@ -882,7 +916,7 @@
{
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@@ -900,7 +934,7 @@
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/6"
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
@@ -920,7 +954,7 @@
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/groups/6"
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
@@ -940,7 +974,7 @@
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@@ -952,7 +986,7 @@
{
"self_ref": "#/texts/33",
"parent": {
"$ref": "#/groups/5"
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
@@ -964,7 +998,7 @@
{
"self_ref": "#/texts/34",
"parent": {
"$ref": "#/groups/7"
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
@@ -990,7 +1024,7 @@
{
"self_ref": "#/texts/36",
"parent": {
"$ref": "#/groups/8"
"$ref": "#/groups/10"
},
"children": [],
"content_layer": "body",
@@ -1092,7 +1126,7 @@
{
"self_ref": "#/texts/44",
"parent": {
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
@@ -1110,7 +1144,7 @@
{
"self_ref": "#/texts/45",
"parent": {
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
@@ -1128,7 +1162,7 @@
{
"self_ref": "#/texts/46",
"parent": {
"$ref": "#/groups/10"
"$ref": "#/groups/12"
},
"children": [],
"content_layer": "body",
@@ -1146,7 +1180,7 @@
{
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
@@ -1158,7 +1192,7 @@
{
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/9"
"$ref": "#/groups/11"
},
"children": [],
"content_layer": "body",
@@ -1206,7 +1240,7 @@
{
"self_ref": "#/texts/52",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
"children": [],
"content_layer": "body",
@@ -1224,7 +1258,7 @@
{
"self_ref": "#/texts/53",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
"children": [],
"content_layer": "body",
@@ -1236,7 +1270,7 @@
{
"self_ref": "#/texts/54",
"parent": {
"$ref": "#/groups/11"
"$ref": "#/groups/13"
},
"children": [],
"content_layer": "body",
@@ -1260,7 +1294,7 @@
{
"self_ref": "#/texts/56",
"parent": {
"$ref": "#/groups/12"
"$ref": "#/groups/14"
},
"children": [],
"content_layer": "body",
@@ -1278,7 +1312,7 @@
{
"self_ref": "#/texts/57",
"parent": {
"$ref": "#/groups/12"
"$ref": "#/groups/14"
},
"children": [],
"content_layer": "body",
@@ -1356,7 +1390,7 @@
{
"self_ref": "#/texts/63",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",
@@ -1374,7 +1408,7 @@
{
"self_ref": "#/texts/64",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",
@@ -1386,7 +1420,7 @@
{
"self_ref": "#/texts/65",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",
@@ -1398,7 +1432,7 @@
{
"self_ref": "#/texts/66",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/17"
},
"children": [],
"content_layer": "body",

View File

@@ -19,9 +19,8 @@ show the same suggested reportable symptoms
Yes
A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.
 A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.
- A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.
- A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.
**Health Bureau:**

View File

@@ -1,5 +1,7 @@
from pathlib import Path
import pytest
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
@@ -17,6 +19,7 @@ from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA
@pytest.mark.xfail(strict=False)
def test_textbox_extraction():
in_path = Path("tests/data/docx/textbox.docx")
in_doc = InputDocument(
@@ -78,8 +81,7 @@ def get_converter():
return converter
def test_e2e_docx_conversions():
docx_paths = get_docx_paths()
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
converter = get_converter()
for docx_path in docx_paths:
@@ -118,6 +120,20 @@ def test_e2e_docx_conversions():
), "export to html"
flaky_path = Path("tests/data/docx/textbox.docx")
def test_e2e_docx_conversions():
_test_e2e_docx_conversions_impl(
docx_paths=[path for path in get_docx_paths() if path != flaky_path]
)
@pytest.mark.xfail(strict=False)
def test_textbox_conversion():
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
def test_text_after_image_anchors():
"""
Test to analyse whether text gets parsed after image anchors.

View File

@@ -1,9 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .test_data_gen_flag import GEN_TEST_DATA

View File

@@ -3,10 +3,10 @@ from pathlib import Path
from typing import List, Tuple
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
EasyOcrOptions,
OcrMacOptions,
OcrOptions,

View File

@@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/html/wiki_duck.html")
assert dci._guess_format(doc_path) == InputFormat.HTML
html_str = ( # HTML starting with a script
"<script>\nconsole.log('foo');\n</script>"
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
)
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
assert dci._guess_format(stream) == InputFormat.HTML
# Valid MD
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
stream = DocumentStream(name="wiki.md", stream=buf)

View File

@@ -7,11 +7,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TableFormerMode,
)

View File

@@ -323,33 +323,33 @@ def verify_conversion_result_v1(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
with open(pages_path, mode="w", encoding="utf-8") as fw:
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
with open(json_path, mode="w", encoding="utf-8") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
with open(md_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
with open(dt_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path) as fr:
with open(pages_path, encoding="utf-8") as fr:
doc_true_pages = PageList.validate_json(fr.read())
with open(json_path) as fr:
with open(json_path, encoding="utf-8") as fr:
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
with open(md_path) as fr:
with open(md_path, encoding="utf-8") as fr:
doc_true_md = fr.read()
with open(dt_path) as fr:
with open(dt_path, encoding="utf-8") as fr:
doc_true_dt = fr.read()
if not fuzzy:
@@ -408,33 +408,33 @@ def verify_conversion_result_v2(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
with open(pages_path, mode="w", encoding="utf-8") as fw:
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
with open(json_path, mode="w", encoding="utf-8") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
with open(md_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_md)
dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
with open(dt_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path) as fr:
with open(pages_path, encoding="utf-8") as fr:
doc_true_pages = PageList.validate_json(fr.read())
with open(json_path) as fr:
with open(json_path, encoding="utf-8") as fr:
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
with open(md_path) as fr:
with open(md_path, encoding="utf-8") as fr:
doc_true_md = fr.read()
with open(dt_path) as fr:
with open(dt_path, encoding="utf-8") as fr:
doc_true_dt = fr.read()
if not fuzzy:
@@ -461,12 +461,12 @@ def verify_conversion_result_v2(
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
if not os.path.exists(gtfile) or generate:
with open(gtfile, "w") as fw:
with open(gtfile, mode="w", encoding="utf-8") as fw:
json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
return True
else:
with open(gtfile) as fr:
with open(gtfile, encoding="utf-8") as fr:
true_doc = DoclingDocument.model_validate_json(fr.read())
return verify_docitems(pred_doc, true_doc, fuzzy=False)
@@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
file = Path(gtfile)
if not file.exists() or generate:
with file.open("w") as fw:
with file.open(mode="w", encoding="utf-8") as fw:
fw.write(pred_text)
return True
with file.open("r") as fr:
with file.open(encoding="utf-8") as fr:
true_text = fr.read()
return pred_text == true_text