feat: Implement new reading-order model (#916)

* Implement new reading-order model, replacing DS GLM model (WIP)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update reading-order model branch

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile [skip ci]

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add captions, footnotes and merges [skip ci]

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Updates for reading-order implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Updates for reading-order implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests and lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes, update tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add normalization, update tests again

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests with code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Push final lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* sanitize text

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Inlcude furniture, Update tests with furniture

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix content_layer assignment

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* chore: Delete empty file docling/models/ds_glm_model.py

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2025-02-20 17:51:17 +01:00
committed by GitHub
parent c031a7ae47
commit c93e36988f
67 changed files with 757 additions and 871 deletions

View File

@@ -5,16 +5,18 @@ from pathlib import Path
from typing import Iterable
import yaml
from docling_core.types.doc import ImageRefMode
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)
USE_V2 = True
USE_LEGACY = True
USE_LEGACY = False
def export_documents(
@@ -33,26 +35,31 @@ def export_documents(
doc_filename = conv_res.input.file.stem
if USE_V2:
# Export Docling document format to JSON:
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.document.export_to_dict()))
conv_res.document.save_as_json(
output_dir / f"{doc_filename}.json",
image_mode=ImageRefMode.PLACEHOLDER,
)
conv_res.document.save_as_html(
output_dir / f"{doc_filename}.html",
image_mode=ImageRefMode.EMBEDDED,
)
conv_res.document.save_as_document_tokens(
output_dir / f"{doc_filename}.doctags.txt"
)
conv_res.document.save_as_markdown(
output_dir / f"{doc_filename}.md",
image_mode=ImageRefMode.PLACEHOLDER,
)
conv_res.document.save_as_markdown(
output_dir / f"{doc_filename}.txt",
image_mode=ImageRefMode.PLACEHOLDER,
strict_text=True,
)
# Export Docling document format to YAML:
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))
# Export Docling document format to doctags:
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
fp.write(conv_res.document.export_to_document_tokens())
# Export Docling document format to markdown:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.document.export_to_markdown())
# Export Docling document format to text:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.document.export_to_markdown(strict_text=True))
if USE_LEGACY:
# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.legacy.json").open(
@@ -119,13 +126,20 @@ def main():
# settings.debug.visualize_tables = True
# settings.debug.visualize_cells = True
doc_converter = DocumentConverter()
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
start_time = time.time()
conv_results = doc_converter.convert_all(
input_doc_paths,
raises_on_error=False, # to let conversion run through all and examine results at the end
raises_on_error=True, # to let conversion run through all and examine results at the end
)
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("scratch")