Merge branch 'main' of https://github.com/docling-project/docling into dev/fix_msword_backend_identify_text_after_image

Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
Michael Krissgau
2025-05-29 15:04:06 +02:00
23 changed files with 4498 additions and 187 deletions

View File

@@ -0,0 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-1
item-2 at level 2: section_header: Pivot table with with 1 row header
item-3 at level 3: table with [6x4]
item-4 at level 2: section_header: Pivot table with 2 row headers
item-5 at level 3: table with [6x5]
item-6 at level 2: section_header: Equivalent pivot table
item-7 at level 3: table with [6x5]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,29 @@
## Pivot table with with 1 row header
| Year | Month | Revenue | Cost |
|--------|----------|-----------|--------|
| 2025 | January | $134 | $162 |
| 2025 | February | $150 | $155 |
| 2025 | March | $160 | $143 |
| 2025 | April | $210 | $150 |
| 2025 | May | $280 | $120 |
## Pivot table with 2 row headers
| Year | Quarter | Month | Revenue | Cost |
|--------|-----------|----------|-----------|--------|
| 2025 | Q1 | January | $134 | $162 |
| 2025 | Q1 | February | $150 | $155 |
| 2025 | Q1 | March | $160 | $143 |
| 2025 | Q2 | April | $210 | $150 |
| 2025 | Q2 | May | $280 | $120 |
## Equivalent pivot table
| Year | Quarter | Month | Revenue | Cost |
|--------|-----------|----------|-----------|--------|
| 2025 | Q1 | January | $134 | $162 |
| 2025 | Q1 | February | $150 | $155 |
| 2025 | Q1 | March | $160 | $143 |
| 2025 | Q2 | April | $210 | $150 |
| 2025 | Q2 | May | $280 | $120 |

View File

@@ -0,0 +1,94 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
item-3 at level 1: paragraph:
item-4 at level 1: section: group textbox
item-5 at level 2: paragraph: Student falls ill
item-6 at level 2: paragraph:
item-7 at level 2: paragraph:
item-8 at level 2: list: group list
item-9 at level 3: list_item: Suggested Reportable Symptoms:
... sh
Blisters
Headache
Sore throat
item-10 at level 1: list_item:
item-11 at level 1: paragraph:
item-12 at level 1: paragraph:
item-13 at level 1: section: group textbox
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-15 at level 1: paragraph:
item-16 at level 1: paragraph:
item-17 at level 1: paragraph:
item-18 at level 1: paragraph:
item-19 at level 1: section: group textbox
item-20 at level 2: paragraph: Yes
item-21 at level 1: paragraph:
item-22 at level 1: paragraph:
item-23 at level 1: section: group textbox
item-24 at level 2: paragraph:  A report must be submitted wi ... saster Prevention Information Network.
item-25 at level 2: paragraph:  A report must also be submitt ... d Infectious Disease Reporting System.
item-26 at level 2: paragraph:
item-27 at level 2: paragraph:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: paragraph:
item-34 at level 1: section: group textbox
item-35 at level 2: paragraph: Health Bureau:
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-37 at level 2: list: group list
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-40 at level 2: paragraph:
item-41 at level 2: paragraph:
item-42 at level 1: list: group list
item-43 at level 2: list_item:
item-44 at level 1: paragraph:
item-45 at level 1: section: group textbox
item-46 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: paragraph:
item-53 at level 1: paragraph:
item-54 at level 1: section: group textbox
item-55 at level 2: inline: group group
item-56 at level 3: paragraph: The Health Bureau will handle
item-57 at level 3: paragraph: reporting and specimen collection
item-58 at level 3: paragraph: .
item-59 at level 2: paragraph:
item-60 at level 2: paragraph:
item-61 at level 1: paragraph:
item-62 at level 1: paragraph:
item-63 at level 1: paragraph:
item-64 at level 1: section: group textbox
item-65 at level 2: paragraph: Whether the epidemic has eased.
item-66 at level 2: paragraph:
item-67 at level 2: paragraph:
item-68 at level 1: paragraph:
item-69 at level 1: section: group textbox
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-71 at level 2: paragraph: No
item-72 at level 1: paragraph:
item-73 at level 1: paragraph:
item-74 at level 1: section: group textbox
item-75 at level 1: paragraph:
item-76 at level 1: section: group textbox
item-77 at level 1: paragraph:
item-78 at level 1: paragraph:
item-79 at level 1: section: group textbox
item-80 at level 2: paragraph: Case closed.
item-81 at level 2: paragraph:
item-82 at level 2: paragraph:
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-84 at level 1: paragraph:
item-85 at level 1: section: group textbox
item-86 at level 1: paragraph:
item-87 at level 1: paragraph:
item-88 at level 1: paragraph:

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,46 @@
**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten**
**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten**
**Student falls ill**
- Suggested Reportable Symptoms:
Fever
Cough
Diarrhea
Vomiting
Rash
Blisters
Headache
Sore throat
If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)
show the same suggested reportable symptoms
Yes
 A report must be submitted within 24 hours via the Ministry of Educations Campus Safety and Disaster Prevention Information Network.
 A report must also be submitted within 48 hours through Chiayi Countys School Suspected Infectious Disease Reporting System.
**Health Bureau:**
Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.
- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.
- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.
Department of Education:
Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.
The Health Bureau will handle **reporting and specimen collection** .
**Whether the epidemic has eased.**
**Whether the test results are positive for a legally designated infectious disease.**
No
**Case closed.**
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.

View File

@@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
print(f"converting {csv_path}")
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
conv_result: ConversionResult = converter.convert(csv_path)
if csv_path.stem in (
"csv-too-few-columns",
"csv-too-many-columns",
"csv-inconsistent-header",
):
with warns(UserWarning, match="Inconsistent column lengths"):
conv_result: ConversionResult = converter.convert(csv_path)
else:
conv_result: ConversionResult = converter.convert(csv_path)
doc: DoclingDocument = conv_result.document

View File

@@ -38,17 +38,15 @@ def get_converter():
def test_compare_legacy_output(test_doc_paths):
converter = get_converter()
res = converter.convert_all(test_doc_paths, raises_on_error=True)
for conv_res in res:
print(f"Results for {conv_res.input.file}")
print(
json.dumps(
conv_res.legacy_document.model_dump(
mode="json", by_alias=True, exclude_none=True
with pytest.warns(DeprecationWarning, match="Use document instead"):
print(
json.dumps(
conv_res.legacy_document.model_dump(
mode="json", by_alias=True, exclude_none=True
)
)
)
)
# assert res.legacy_output == res.legacy_output_transformed

View File

@@ -4,6 +4,7 @@ import warnings
from pathlib import Path
from typing import List, Optional
import pytest
from docling_core.types.doc import (
DocItem,
DoclingDocument,
@@ -302,9 +303,8 @@ def verify_conversion_result_v1(
)
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DsDocument = doc_result.legacy_document
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
with pytest.warns(DeprecationWarning, match="Use document instead"):
doc_pred: DsDocument = doc_result.legacy_document
doc_pred_md = doc_result.legacy_document.export_to_markdown()
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
@@ -391,7 +391,7 @@ def verify_conversion_result_v2(
doc_pred_pages: List[Page] = doc_result.pages
doc_pred: DoclingDocument = doc_result.document
doc_pred_md = doc_result.document.export_to_markdown()
doc_pred_dt = doc_result.document.export_to_document_tokens()
doc_pred_dt = doc_result.document.export_to_doctags()
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"