Merge branch 'main' of github.com:DS4SD/docling into cau/ocr-cells-in-segmented-page

This commit is contained in:
Christoph Auer 2025-06-13 11:10:00 +02:00
commit d99080e036
22 changed files with 4947 additions and 660 deletions

View File

@ -423,18 +423,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Handle notes slide
if slide.has_notes_slide:
notes_slide = slide.notes_slide
notes_text = notes_slide.notes_text_frame.text.strip()
if notes_text:
bbox = BoundingBox(l=0, t=0, r=0, b=0)
prov = ProvenanceItem(
page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
)
doc.add_text(
label=DocItemLabel.TEXT,
parent=parent_slide,
text=notes_text,
prov=prov,
content_layer=ContentLayer.FURNITURE,
)
if notes_slide.notes_text_frame is not None:
notes_text = notes_slide.notes_text_frame.text.strip()
if notes_text:
bbox = BoundingBox(l=0, t=0, r=0, b=0)
prov = ProvenanceItem(
page_no=slide_ind + 1,
charspan=[0, len(notes_text)],
bbox=bbox,
)
doc.add_text(
label=DocItemLabel.TEXT,
parent=parent_slide,
text=notes_text,
prov=prov,
content_layer=ContentLayer.FURNITURE,
)
return doc

View File

@ -70,7 +70,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
}

View File

@ -13,6 +13,12 @@ from docling.datamodel.pipeline_options import (
)
from docling.document_converter import DocumentConverter, PdfFormatOption
### Example of PictureDescriptionApiOptions definitions
#### Using vLLM
# Models can be launched via:
# $ vllm serve MODEL_NAME
def vllm_local_options(model: str):
options = PictureDescriptionApiOptions(
@ -28,6 +34,26 @@ def vllm_local_options(model: str):
return options
#### Using LM Studio
def lms_local_options(model: str):
    """Build picture-description API options for a local LM Studio endpoint.

    Args:
        model: Model identifier, passed through as the ``model`` entry of
            the request ``params``.

    Returns:
        A ``PictureDescriptionApiOptions`` instance targeting
        ``http://localhost:1234/v1/chat/completions`` with a 90 timeout.
    """
    # Request parameters handed to the options object as ``params``.
    request_params = {
        "model": model,
        "seed": 42,
        "max_completion_tokens": 200,
    }
    return PictureDescriptionApiOptions(
        url="http://localhost:1234/v1/chat/completions",
        params=request_params,
        prompt="Describe the image in three sentences. Be consise and accurate.",
        timeout=90,
    )
#### Using a cloud service like IBM watsonx.ai
def watsonx_vlm_options():
load_dotenv()
api_key = os.environ.get("WX_API_KEY")
@ -49,7 +75,7 @@ def watsonx_vlm_options():
options = PictureDescriptionApiOptions(
url="https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29",
params=dict(
model_id="meta-llama/llama-3-2-11b-vision-instruct",
model_id="ibm/granite-vision-3-2-2b",
project_id=project_id,
parameters=dict(
max_new_tokens=400,
@ -64,6 +90,9 @@ def watsonx_vlm_options():
return options
### Usage and conversion
def main():
logging.basicConfig(level=logging.INFO)
@ -80,20 +109,28 @@ def main():
# One possibility is self-hosting a model, e.g. via vLLM.
# $ vllm serve MODEL_NAME
# Then PictureDescriptionApiOptions can point to the localhost endpoint.
#
# Example for the Granite Vision model: (uncomment the following lines)
# Example for the Granite Vision model:
# (uncomment the following lines)
# pipeline_options.picture_description_options = vllm_local_options(
# model="ibm-granite/granite-vision-3.1-2b-preview"
# )
#
# Example for the SmolVLM model: (uncomment the following lines)
pipeline_options.picture_description_options = vllm_local_options(
model="HuggingFaceTB/SmolVLM-256M-Instruct"
# Example for the SmolVLM model:
# (uncomment the following lines)
# pipeline_options.picture_description_options = vllm_local_options(
# model="HuggingFaceTB/SmolVLM-256M-Instruct"
# )
# To use models served by LM Studio with its built-in GGUF or MLX runtimes, e.g. the SmolVLM model:
# (uncomment the following lines)
pipeline_options.picture_description_options = lms_local_options(
model="smolvlm-256m-instruct"
)
#
# Another possibility is using online services, e.g. watsonx.ai.
# This requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
# Uncomment the following line for this option:
# (uncomment the following lines)
# pipeline_options.picture_description_options = watsonx_vlm_options()
doc_converter = DocumentConverter(

View File

@ -13,6 +13,27 @@ from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, Response
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
### Example of ApiVlmOptions definitions
#### Using LM Studio
def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
    """Build ``ApiVlmOptions`` for a chat-completions endpoint on localhost:1234.

    Args:
        model: Model identifier placed in the request ``params``.
        prompt: Prompt text handed to the options unchanged.
        format: Response format, stored as ``response_format``.

    Returns:
        The configured ``ApiVlmOptions`` instance.
    """
    lm_studio_url = "http://localhost:1234/v1/chat/completions"  # the default LM Studio
    return ApiVlmOptions(
        url=lm_studio_url,
        params={"model": model},
        prompt=prompt,
        timeout=90,
        scale=1.0,
        response_format=format,
    )
#### Using Ollama
def ollama_vlm_options(model: str, prompt: str):
options = ApiVlmOptions(
@ -28,6 +49,9 @@ def ollama_vlm_options(model: str, prompt: str):
return options
#### Using a cloud service like IBM watsonx.ai
def watsonx_vlm_options(model: str, prompt: str):
load_dotenv()
api_key = os.environ.get("WX_API_KEY")
@ -65,6 +89,9 @@ def watsonx_vlm_options(model: str, prompt: str):
return options
### Usage and conversion
def main():
logging.basicConfig(level=logging.INFO)
@ -78,16 +105,34 @@ def main():
# ApiVlmOptions() allows interfacing with APIs that support
# the multi-modal chat interface. Here follow a few examples of how to configure those.
# One possibility is self-hosting model, e.g. via Ollama.
# Example using the Granite Vision model: (uncomment the following lines)
pipeline_options.vlm_options = ollama_vlm_options(
model="granite3.2-vision:2b",
prompt="OCR the full page to markdown.",
# One possibility is self-hosting a model, e.g. via LM Studio, Ollama or others.
# Example using the SmolDocling model with LM Studio:
# (uncomment the following lines)
pipeline_options.vlm_options = lms_vlm_options(
model="smoldocling-256m-preview-mlx-docling-snap",
prompt="Convert this page to docling.",
format=ResponseFormat.DOCTAGS,
)
# Example using the Granite Vision model with LM Studio:
# (uncomment the following lines)
# pipeline_options.vlm_options = lms_vlm_options(
# model="granite-vision-3.2-2b",
# prompt="OCR the full page to markdown.",
# format=ResponseFormat.MARKDOWN,
# )
# Example using the Granite Vision model with Ollama:
# (uncomment the following lines)
# pipeline_options.vlm_options = ollama_vlm_options(
# model="granite3.2-vision:2b",
# prompt="OCR the full page to markdown.",
# )
# Another possibility is using online services, e.g. watsonx.ai.
# This requires setting the env variables WX_API_KEY and WX_PROJECT_ID.
# Uncomment the following line for this option:
# (uncomment the following lines)
# pipeline_options.vlm_options = watsonx_vlm_options(
# model="ibm/granite-vision-3-2-2b", prompt="OCR the full page to markdown."
# )

View File

@ -213,10 +213,10 @@
"prov": [
{
"bbox": [
139.6674041748047,
139.66741943359375,
322.5054626464844,
475.00927734375,
454.4546203613281
454.45458984375
],
"page": 1,
"span": [

View File

@ -2705,7 +2705,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373533129692078,
"confidence": 0.9373534917831421,
"cells": [
{
"index": 0,
@ -2745,7 +2745,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858679533004761,
"confidence": 0.8858680725097656,
"cells": [
{
"index": 1,
@ -2785,7 +2785,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806435108184814,
"confidence": 0.9806433916091919,
"cells": [
{
"index": 2,
@ -2940,7 +2940,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9850425124168396,
"confidence": 0.98504239320755,
"cells": [
{
"index": 7,
@ -3155,7 +3155,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591907262802124,
"confidence": 0.9591909050941467,
"cells": [
{
"index": 15,
@ -3339,8 +3339,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -7846,7 +7846,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589295387268066,
"confidence": 0.9589294195175171,
"cells": [
{
"index": 91,
@ -7911,7 +7911,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849976301193237,
"confidence": 0.9849975109100342,
"cells": [
{
"index": 93,
@ -8243,8 +8243,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -13641,7 +13641,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373533129692078,
"confidence": 0.9373534917831421,
"cells": [
{
"index": 0,
@ -13687,7 +13687,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858679533004761,
"confidence": 0.8858680725097656,
"cells": [
{
"index": 1,
@ -13733,7 +13733,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806435108184814,
"confidence": 0.9806433916091919,
"cells": [
{
"index": 2,
@ -13900,7 +13900,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9850425124168396,
"confidence": 0.98504239320755,
"cells": [
{
"index": 7,
@ -14121,7 +14121,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591907262802124,
"confidence": 0.9591909050941467,
"cells": [
{
"index": 15,
@ -14311,8 +14311,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -19701,7 +19701,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589295387268066,
"confidence": 0.9589294195175171,
"cells": [
{
"index": 91,
@ -19772,7 +19772,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849976301193237,
"confidence": 0.9849975109100342,
"cells": [
{
"index": 93,
@ -20116,7 +20116,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806435108184814,
"confidence": 0.9806433916091919,
"cells": [
{
"index": 2,
@ -20283,7 +20283,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9850425124168396,
"confidence": 0.98504239320755,
"cells": [
{
"index": 7,
@ -20504,7 +20504,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591907262802124,
"confidence": 0.9591909050941467,
"cells": [
{
"index": 15,
@ -20694,8 +20694,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -26084,7 +26084,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589295387268066,
"confidence": 0.9589294195175171,
"cells": [
{
"index": 91,
@ -26155,7 +26155,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849976301193237,
"confidence": 0.9849975109100342,
"cells": [
{
"index": 93,
@ -26499,7 +26499,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373533129692078,
"confidence": 0.9373534917831421,
"cells": [
{
"index": 0,
@ -26545,7 +26545,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858679533004761,
"confidence": 0.8858680725097656,
"cells": [
{
"index": 1,

View File

@ -336,8 +336,8 @@
{
"page_no": 1,
"bbox": {
"l": 139.6674041748047,
"t": 454.4546203613281,
"l": 139.66741943359375,
"t": 454.45458984375,
"r": 475.00927734375,
"b": 322.5054626464844,
"coord_origin": "BOTTOMLEFT"

View File

@ -2705,7 +2705,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373533129692078,
"confidence": 0.9373534917831421,
"cells": [
{
"index": 0,
@ -2745,7 +2745,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858679533004761,
"confidence": 0.8858680725097656,
"cells": [
{
"index": 1,
@ -2785,7 +2785,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806435108184814,
"confidence": 0.9806433916091919,
"cells": [
{
"index": 2,
@ -2940,7 +2940,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9850425124168396,
"confidence": 0.98504239320755,
"cells": [
{
"index": 7,
@ -3155,7 +3155,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591907262802124,
"confidence": 0.9591909050941467,
"cells": [
{
"index": 15,
@ -3339,8 +3339,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -7846,7 +7846,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589295387268066,
"confidence": 0.9589294195175171,
"cells": [
{
"index": 91,
@ -7911,7 +7911,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849976301193237,
"confidence": 0.9849975109100342,
"cells": [
{
"index": 93,
@ -8243,8 +8243,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -13641,7 +13641,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373533129692078,
"confidence": 0.9373534917831421,
"cells": [
{
"index": 0,
@ -13687,7 +13687,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858679533004761,
"confidence": 0.8858680725097656,
"cells": [
{
"index": 1,
@ -13733,7 +13733,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806435108184814,
"confidence": 0.9806433916091919,
"cells": [
{
"index": 2,
@ -13900,7 +13900,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9850425124168396,
"confidence": 0.98504239320755,
"cells": [
{
"index": 7,
@ -14121,7 +14121,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591907262802124,
"confidence": 0.9591909050941467,
"cells": [
{
"index": 15,
@ -14311,8 +14311,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -19701,7 +19701,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589295387268066,
"confidence": 0.9589294195175171,
"cells": [
{
"index": 91,
@ -19772,7 +19772,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849976301193237,
"confidence": 0.9849975109100342,
"cells": [
{
"index": 93,
@ -20116,7 +20116,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806435108184814,
"confidence": 0.9806433916091919,
"cells": [
{
"index": 2,
@ -20283,7 +20283,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9850425124168396,
"confidence": 0.98504239320755,
"cells": [
{
"index": 7,
@ -20504,7 +20504,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591907262802124,
"confidence": 0.9591909050941467,
"cells": [
{
"index": 15,
@ -20694,8 +20694,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.6674041748047,
"t": 337.5453796386719,
"l": 139.66741943359375,
"t": 337.54541015625,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -26084,7 +26084,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589295387268066,
"confidence": 0.9589294195175171,
"cells": [
{
"index": 91,
@ -26155,7 +26155,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849976301193237,
"confidence": 0.9849975109100342,
"cells": [
{
"index": 93,
@ -26499,7 +26499,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373533129692078,
"confidence": 0.9373534917831421,
"cells": [
{
"index": 0,
@ -26545,7 +26545,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858679533004761,
"confidence": 0.8858680725097656,
"cells": [
{
"index": 1,

View File

@ -0,0 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-1
item-2 at level 2: section_header: Pivot table with with 1 row header
item-3 at level 3: table with [6x4]
item-4 at level 2: section_header: Pivot table with 2 row headers
item-5 at level 3: table with [6x5]
item-6 at level 2: section_header: Equivalent pivot table
item-7 at level 3: table with [6x5]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
## Pivot table with with 1 row header
| Year | Month | Revenue | Cost |
|--------|----------|-----------|--------|
| 2025 | January | $134 | $162 |
| 2025 | February | $150 | $155 |
| 2025 | March | $160 | $143 |
| 2025 | April | $210 | $150 |
| 2025 | May | $280 | $120 |
## Pivot table with 2 row headers
| Year | Quarter | Month | Revenue | Cost |
|--------|-----------|----------|-----------|--------|
| 2025 | Q1 | January | $134 | $162 |
| 2025 | Q1 | February | $150 | $155 |
| 2025 | Q1 | March | $160 | $143 |
| 2025 | Q2 | April | $210 | $150 |
| 2025 | Q2 | May | $280 | $120 |
## Equivalent pivot table
| Year | Quarter | Month | Revenue | Cost |
|--------|-----------|----------|-----------|--------|
| 2025 | Q1 | January | $134 | $162 |
| 2025 | Q1 | February | $150 | $155 |
| 2025 | Q1 | March | $160 | $143 |
| 2025 | Q2 | April | $210 | $150 |
| 2025 | Q2 | May | $280 | $120 |

View File

@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group sheet: SalesData
item-2 at level 2: table with [21x4]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,22 @@
| Product | Date | Quantity | Revenue |
|-----------|---------------------|------------|-----------|
| Widget A | 2024-01-01 00:00:00 | 5 | 5000 |
| Widget B | 2024-01-02 00:00:00 | 10 | 12000 |
| Widget C | 2024-01-03 00:00:00 | 3 | 3000 |
| Widget D | 2024-01-04 00:00:00 | 8 | 8000 |
| Widget A | 2024-01-05 00:00:00 | 7 | 7000 |
| Widget B | 2024-01-06 00:00:00 | 6 | 6000 |
| Widget C | 2024-01-07 00:00:00 | 12 | 15000 |
| Widget D | 2024-01-08 00:00:00 | 9 | 9000 |
| Widget A | 2024-01-09 00:00:00 | 4 | 4000 |
| Widget B | 2024-01-10 00:00:00 | 11 | 11000 |
| Widget C | 2024-01-11 00:00:00 | 5 | 5000 |
| Widget D | 2024-01-12 00:00:00 | 8 | 8500 |
| Widget A | 2024-01-13 00:00:00 | 6 | 6200 |
| Widget B | 2024-01-14 00:00:00 | 7 | 7100 |
| Widget C | 2024-01-15 00:00:00 | 10 | 10500 |
| Widget D | 2024-01-16 00:00:00 | 3 | 3200 |
| Widget A | 2024-01-17 00:00:00 | 9 | 9400 |
| Widget B | 2024-01-18 00:00:00 | 12 | 12500 |
| Widget C | 2024-01-19 00:00:00 | 6 | 6100 |
| Widget D | 2024-01-20 00:00:00 | 8 | 8900 |

View File

@ -5,92 +5,89 @@ item-0 at level 0: unspecified: group _root_
item-4 at level 1: section: group textbox
item-5 at level 2: paragraph: Student falls ill
item-6 at level 2: paragraph:
item-7 at level 2: paragraph:
item-8 at level 2: list: group list
item-9 at level 3: list_item: Suggested Reportable Symptoms:
item-7 at level 2: list: group list
item-8 at level 3: list_item: Suggested Reportable Symptoms:
... sh
Blisters
Headache
Sore throat
item-10 at level 1: list_item:
item-9 at level 1: list_item:
item-10 at level 1: paragraph:
item-11 at level 1: paragraph:
item-12 at level 1: paragraph:
item-13 at level 1: section: group textbox
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-12 at level 1: section: group textbox
item-13 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
item-14 at level 1: paragraph:
item-15 at level 1: paragraph:
item-16 at level 1: paragraph:
item-17 at level 1: paragraph:
item-18 at level 1: paragraph:
item-19 at level 1: section: group textbox
item-20 at level 2: paragraph: Yes
item-18 at level 1: section: group textbox
item-19 at level 2: paragraph: Yes
item-20 at level 1: paragraph:
item-21 at level 1: paragraph:
item-22 at level 1: paragraph:
item-23 at level 1: section: group textbox
item-24 at level 2: list: group list
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-27 at level 2: paragraph:
item-28 at level 2: paragraph:
item-29 at level 1: list: group list
item-30 at level 2: list_item:
item-22 at level 1: section: group textbox
item-23 at level 2: list: group list
item-24 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
item-25 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-26 at level 2: paragraph:
item-27 at level 1: list: group list
item-28 at level 2: list_item:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: paragraph:
item-34 at level 1: paragraph:
item-35 at level 1: paragraph:
item-36 at level 1: section: group textbox
item-37 at level 2: paragraph: Health Bureau:
item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-39 at level 2: list: group list
item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-42 at level 2: paragraph:
item-43 at level 2: paragraph:
item-44 at level 1: list: group list
item-45 at level 2: list_item:
item-46 at level 1: paragraph:
item-47 at level 1: section: group textbox
item-48 at level 2: paragraph: Department of Education:
item-34 at level 1: section: group textbox
item-35 at level 2: paragraph: Health Bureau:
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-37 at level 2: list: group list
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-40 at level 2: paragraph:
item-41 at level 1: list: group list
item-42 at level 2: list_item:
item-43 at level 1: paragraph:
item-44 at level 1: section: group textbox
item-45 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-46 at level 1: paragraph:
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: paragraph:
item-53 at level 1: paragraph:
item-54 at level 1: paragraph:
item-55 at level 1: paragraph:
item-56 at level 1: section: group textbox
item-57 at level 2: inline: group group
item-58 at level 3: paragraph: The Health Bureau will handle
item-59 at level 3: paragraph: reporting and specimen collection
item-60 at level 3: paragraph: .
item-61 at level 2: paragraph:
item-62 at level 2: paragraph:
item-63 at level 1: paragraph:
item-64 at level 1: paragraph:
item-53 at level 1: section: group textbox
item-54 at level 2: inline: group group
item-55 at level 3: paragraph: The Health Bureau will handle
item-56 at level 3: paragraph: reporting and specimen collection
item-57 at level 3: paragraph: .
item-58 at level 2: paragraph:
item-59 at level 1: paragraph:
item-60 at level 1: paragraph:
item-61 at level 1: paragraph:
item-62 at level 1: section: group textbox
item-63 at level 2: paragraph: Whether the epidemic has eased.
item-64 at level 2: paragraph:
item-65 at level 1: paragraph:
item-66 at level 1: section: group textbox
item-67 at level 2: paragraph: Whether the epidemic has eased.
item-68 at level 2: paragraph:
item-69 at level 2: paragraph:
item-67 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-68 at level 2: paragraph: No
item-69 at level 1: paragraph:
item-70 at level 1: paragraph:
item-71 at level 1: section: group textbox
item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-73 at level 2: paragraph: No
item-74 at level 1: paragraph:
item-75 at level 1: paragraph:
item-76 at level 1: section: group textbox
item-72 at level 2: paragraph: Yes
item-73 at level 1: paragraph:
item-74 at level 1: section: group textbox
item-75 at level 2: paragraph: Yes
item-76 at level 1: paragraph:
item-77 at level 1: paragraph:
item-78 at level 1: section: group textbox
item-79 at level 1: paragraph:
item-80 at level 1: paragraph:
item-81 at level 1: section: group textbox
item-82 at level 2: paragraph: Case closed.
item-83 at level 2: paragraph:
item-84 at level 2: paragraph:
item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-79 at level 2: paragraph: Case closed.
item-80 at level 2: paragraph:
item-81 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-82 at level 1: paragraph:
item-83 at level 1: section: group textbox
item-84 at level 2: paragraph: No
item-85 at level 1: paragraph:
item-86 at level 1: paragraph:
item-87 at level 1: section: group textbox
item-88 at level 1: paragraph:
item-89 at level 1: paragraph:
item-90 at level 1: paragraph:
item-87 at level 1: paragraph:

File diff suppressed because it is too large Load Diff

View File

@ -40,6 +40,12 @@ The Health Bureau will handle **reporting and specimen collection** .
No
Yes
Yes
**Case closed.**
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.
No

View File

@ -1,2 +1,2 @@
<doctag><text><loc_60><loc_46><loc_424><loc_91>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
<doctag><text><loc_59><loc_46><loc_424><loc_90>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</text>
</doctag>

View File

@ -42,10 +42,10 @@
{
"page_no": 1,
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 2570.0959833241664,
"r": 1696.0985546594009,
"b": 2315.204273887442,
"r": 1696.0985042090742,
"b": 2319.1220927976665,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [

View File

@ -40,14 +40,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@ -65,14 +65,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
@ -90,13 +90,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 258.9040166758338,
"r": 1696.0985546594009,
"b": 513.795726112558,
"r": 1696.0985042090742,
"b": 509.8779072023336,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9721010327339172,
"confidence": 0.9721011519432068,
"cells": [
{
"index": 0,
@ -132,14 +132,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@ -157,14 +157,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
@ -195,13 +195,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 258.9040166758338,
"r": 1696.0985546594009,
"b": 513.795726112558,
"r": 1696.0985042090742,
"b": 509.8779072023336,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9721010327339172,
"confidence": 0.9721011519432068,
"cells": [
{
"index": 0,
@ -237,14 +237,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@ -262,14 +262,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
@ -293,13 +293,13 @@
"id": 0,
"label": "text",
"bbox": {
"l": 238.19302423176944,
"l": 234.08627147881114,
"t": 258.9040166758338,
"r": 1696.0985546594009,
"b": 513.795726112558,
"r": 1696.0985042090742,
"b": 509.8779072023336,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9721010327339172,
"confidence": 0.9721011519432068,
"cells": [
{
"index": 0,
@ -335,14 +335,14 @@
"a": 255
},
"rect": {
"r_x0": 238.19302423176944,
"r_y0": 415.36904822716525,
"r_x1": 1696.0985546594009,
"r_y1": 415.36904822716525,
"r_x2": 1696.0985546594009,
"r_y2": 345.20535775097477,
"r_x3": 238.19302423176944,
"r_y3": 345.20535775097477,
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
@ -360,14 +360,14 @@
"a": 255
},
"rect": {
"r_x0": 245.43122061153045,
"r_y0": 513.795726112558,
"r_x1": 514.3223724413002,
"r_y1": 513.795726112558,
"r_x2": 514.3223724413002,
"r_y2": 436.0574704074058,
"r_x3": 245.43122061153045,
"r_y3": 436.0574704074058,
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",

BIN
tests/data/xlsx/sample_sales_data.xlsm vendored Normal file

Binary file not shown.

View File

@ -16,13 +16,13 @@ _log = logging.getLogger(__name__)
GENERATE = GEN_TEST_DATA
def get_xlsx_paths():
def get_excel_paths():
# Define the directory you want to search
directory = Path("./tests/data/xlsx/")
# List all PDF files in the directory and its subdirectories
pdf_files = sorted(directory.rglob("*.xlsx"))
return pdf_files
# List all Excel files in the directory and its subdirectories
excel_files = sorted(directory.rglob("*.xlsx")) + sorted(directory.rglob("*.xlsm"))
return excel_files
def get_converter():
@ -35,17 +35,17 @@ def get_converter():
def documents() -> list[tuple[Path, DoclingDocument]]:
documents: list[dict[Path, DoclingDocument]] = []
xlsx_paths = get_xlsx_paths()
excel_paths = get_excel_paths()
converter = get_converter()
for xlsx_path in xlsx_paths:
_log.debug(f"converting {xlsx_path}")
for excel_path in excel_paths:
_log.debug(f"converting {excel_path}")
gt_path = (
xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name
excel_path.parent.parent / "groundtruth" / "docling_v2" / excel_path.name
)
conv_result: ConversionResult = converter.convert(xlsx_path)
conv_result: ConversionResult = converter.convert(excel_path)
doc: DoclingDocument = conv_result.document
@ -55,7 +55,7 @@ def documents() -> list[tuple[Path, DoclingDocument]]:
return documents
def test_e2e_xlsx_conversions(documents) -> None:
def test_e2e_excel_conversions(documents) -> None:
for gt_path, doc in documents:
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
@ -79,7 +79,7 @@ def test_pages(documents) -> None:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
path = next(item for item in get_excel_paths() if item.stem == "test-01")
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,