"""Print the text content found inside each picture of a converted PDF.

The script converts a sample PDF with Docling, then walks every picture in
the resulting document and prints the picture's caption followed by the text
of each ``TextItem`` reached when iterating the document tree from that
picture.
"""

from docling_core.types.doc import TextItem

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

source = "tests/data/amt_handbook_sample.pdf"

# NOTE(review): these image settings look like they exist for the optional
# picture preview in the loop below — confirm before removing them.
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)

result = doc_converter.convert(source)

doc = result.document

for picture in doc.pictures:
    # Uncomment to display the picture:
    # picture.get_image(doc).show()

    # An f-string avoids the doubled space that print()'s default separator
    # would add in front of a literal starting with a space.
    print(f"{picture.caption_text(doc)} contains these elements:")

    # Start the traversal at the picture itself; the nesting level that comes
    # with each item is not needed here.
    for item, _level in doc.iterate_items(root=picture, traverse_pictures=True):
        if isinstance(item, TextItem):
            print(item.text)

    # Blank separator between consecutive pictures.
    print("\n")