mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
* feat: Scaffolding for layout and table model plugin factory Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add missing files Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add base options classes for layout and table Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat(experimental): Add experimental TableCropsLayoutModel Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
41 lines
1.2 KiB
Python
Vendored
41 lines
1.2 KiB
Python
Vendored
"""Run Docling on an image using the experimental TableCrops layout model."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import docling
|
|
from docling.datamodel.document import InputFormat
|
|
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
|
|
from docling.document_converter import DocumentConverter, ImageFormatOption
|
|
from docling.experimental.datamodel.table_crops_layout_options import (
|
|
TableCropsLayoutOptions,
|
|
)
|
|
from docling.experimental.models.table_crops_layout_model import TableCropsLayoutModel
|
|
from docling.models.factories import get_layout_factory
|
|
|
|
|
|
def main() -> None:
    """Convert a sample table-crop image and print its first table as Markdown.

    The pipeline is configured with ``TableCropsLayoutOptions`` as the layout
    options, with table-structure recognition and page-image generation
    enabled. The converter is restricted to image input.

    The sample path is relative, so it is resolved against the current
    working directory (presumably the repository root -- confirm when
    running from elsewhere).

    Raises:
        SystemExit: If the conversion result contains no tables, so the
            script fails with a readable message instead of an
            ``IndexError`` on ``tables[0]``.
    """
    sample_image = "tests/data/2305.03393v1-table_crop.png"

    pipeline_options = ThreadedPdfPipelineOptions(
        layout_options=TableCropsLayoutOptions(),
        do_table_structure=True,
        generate_page_images=True,
    )

    converter = DocumentConverter(
        allowed_formats=[InputFormat.IMAGE],
        format_options={
            InputFormat.IMAGE: ImageFormatOption(pipeline_options=pipeline_options),
        },
    )

    conv_res = converter.convert(sample_image)
    document = conv_res.document

    # Guard the index access: an empty table list would otherwise surface as
    # an opaque IndexError.
    if not document.tables:
        raise SystemExit(f"No tables were detected in {sample_image!r}.")

    # Pass the owning document explicitly; calling export_to_markdown()
    # without `doc` is the deprecated form in docling-core.
    print(document.tables[0].export_to_markdown(doc=document))
|
|
|
|
|
|
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|