mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat!: Docling v2 (#117)
--------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
@@ -1,17 +1,12 @@
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
AssembleOptions,
|
||||
ConversionStatus,
|
||||
FigureElement,
|
||||
PageElement,
|
||||
TableElement,
|
||||
)
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling_core.types.doc import PictureItem, TableItem
|
||||
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -21,64 +16,64 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
]
|
||||
output_dir = Path("./scratch")
|
||||
|
||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
input_doc_path = Path("./tests/data/2206.01062.pdf")
|
||||
output_dir = Path("scratch")
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
|
||||
# scale=1 correspond of a standard 72 DPI image
|
||||
assemble_options = AssembleOptions()
|
||||
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
# The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
|
||||
# with the image field
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
pipeline_options.generate_page_images = True
|
||||
pipeline_options.generate_table_images = True
|
||||
pipeline_options.generate_picture_images = True
|
||||
|
||||
doc_converter = DocumentConverter(assemble_options=assemble_options)
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert(input_files)
|
||||
conv_res = doc_converter.convert(input_doc_path)
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status != ConversionStatus.SUCCESS:
|
||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
continue
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
doc_filename = conv_res.input.file.stem
|
||||
# Save page images
|
||||
for page_no, page in conv_res.document.pages.items():
|
||||
page_no = page.page_no
|
||||
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
|
||||
with page_image_filename.open("wb") as fp:
|
||||
page.image.pil_image.save(fp, format="PNG")
|
||||
|
||||
# Export page images
|
||||
for page in conv_res.pages:
|
||||
page_no = page.page_no + 1
|
||||
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
|
||||
with page_image_filename.open("wb") as fp:
|
||||
page.image.save(fp, format="PNG")
|
||||
|
||||
# Export figures and tables
|
||||
for element, image in conv_res.render_element_images(
|
||||
element_types=(FigureElement, TableElement)
|
||||
):
|
||||
# Save images of figures and tables
|
||||
table_counter = 0
|
||||
picture_counter = 0
|
||||
for element, _level in conv_res.document.iterate_items():
|
||||
if isinstance(element, TableItem):
|
||||
table_counter += 1
|
||||
element_image_filename = (
|
||||
output_dir / f"{doc_filename}-element-{element.id}.png"
|
||||
output_dir / f"{doc_filename}-table-{table_counter}.png"
|
||||
)
|
||||
with element_image_filename.open("wb") as fp:
|
||||
image.save(fp, "PNG")
|
||||
element.image.pil_image.save(fp, "PNG")
|
||||
|
||||
success_count += 1
|
||||
if isinstance(element, PictureItem):
|
||||
picture_counter += 1
|
||||
element_image_filename = (
|
||||
output_dir / f"{doc_filename}-picture-{picture_counter}.png"
|
||||
)
|
||||
with element_image_filename.open("wb") as fp:
|
||||
element.image.pil_image.save(fp, "PNG")
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
_log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user