diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py index 1c2325d4..84344862 100644 --- a/docling/models/readingorder_model.py +++ b/docling/models/readingorder_model.py @@ -93,6 +93,7 @@ class ReadingOrderModel: RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem for elem in conv_res.assembled.elements } + cid_to_rels = {rel.cid: rel for rel in ro_elements} origin = DocumentOrigin( mimetype="application/pdf", @@ -111,53 +112,39 @@ class ReadingOrderModel: out_doc.add_page(page_no=page_no, size=size) current_list = None + skippable_cids = { + cid + for mapping in ( + el_to_captions_mapping, + el_to_footnotes_mapping, + el_merges_mapping, + ) + for lst in mapping.values() + for cid in lst + } # TODO: handle merges for rel in ro_elements: + if rel.cid in skippable_cids: + continue element = id_to_elem[rel.ref.cref] page_height = conv_res.pages[element.page_no].size.height # type: ignore if isinstance(element, TextElement): - text = element.text - - prov = ProvenanceItem( - page_no=element.page_no + 1, - charspan=(0, len(text)), - bbox=element.cluster.bbox.to_bottom_left_origin(page_height), + new_item, current_list = self._handle_text_element( + element, out_doc, current_list, page_height ) - label = element.label - if label == DocItemLabel.LIST_ITEM: - if current_list is None: - current_list = out_doc.add_group( - label=GroupLabel.LIST, name="list" + if rel.cid in el_merges_mapping.keys(): + for merged_cid in el_merges_mapping[rel.cid]: + merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref] + + self._merge_elements( + element, merged_elem, new_item, page_height ) - # TODO: Infer if this is a numbered or a bullet list item - out_doc.add_list_item( - text=text, enumerated=False, prov=prov, parent=current_list - ) - elif label == DocItemLabel.SECTION_HEADER: - current_list = None - - out_doc.add_heading(text=text, prov=prov) - elif label == DocItemLabel.CODE: - current_list = None - - out_doc.add_code(text=text, 
prov=prov) - elif label == DocItemLabel.FORMULA: - current_list = None - - out_doc.add_text( - label=DocItemLabel.FORMULA, text="", orig=text, prov=prov - ) - else: - current_list = None - - out_doc.add_text(label=element.label, text=text, prov=prov) - elif isinstance(element, Table): tbl_data = TableData( @@ -176,23 +163,54 @@ class ReadingOrderModel: data=tbl_data, prov=prov, label=element.cluster.label ) + if rel.cid in el_to_captions_mapping.keys(): + for caption_cid in el_to_captions_mapping[rel.cid]: + caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref] + new_cap_item = self._add_caption_or_footnote( + caption_elem, out_doc, tbl, page_height + ) + + tbl.captions.append(new_cap_item.get_ref()) + + if rel.cid in el_to_footnotes_mapping.keys(): + for footnote_cid in el_to_footnotes_mapping[rel.cid]: + footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref] + new_footnote_item = self._add_caption_or_footnote( + footnote_elem, out_doc, tbl, page_height + ) + + tbl.footnotes.append(new_footnote_item.get_ref()) + # TODO: handle element.cluster.children. 
- # TODO: handle captions - # tbl.captions.extend(caption_refs) elif isinstance(element, FigureElement): - text = "" + cap_text = "" prov = ProvenanceItem( page_no=element.page_no + 1, - charspan=(0, len(text)), + charspan=(0, len(cap_text)), bbox=element.cluster.bbox.to_bottom_left_origin(page_height), ) - pic = out_doc.add_picture(prov=prov) + if rel.cid in el_to_captions_mapping.keys(): + for caption_cid in el_to_captions_mapping[rel.cid]: + caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref] + new_cap_item = self._add_caption_or_footnote( + caption_elem, out_doc, pic, page_height + ) + + pic.captions.append(new_cap_item.get_ref()) + + if rel.cid in el_to_footnotes_mapping.keys(): + for footnote_cid in el_to_footnotes_mapping[rel.cid]: + footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref] + new_footnote_item = self._add_caption_or_footnote( + footnote_elem, out_doc, pic, page_height + ) + + pic.footnotes.append(new_footnote_item.get_ref()) + # TODO: handle element.cluster.children. 
def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
    """Add a caption or footnote text element to ``out_doc`` under ``parent``.

    Args:
        elem: The caption/footnote element; must be a ``TextElement``.
        out_doc: The document the new text item is added to.
        parent: The item (e.g. a table or picture) the text is attached to.
        page_height: Page height used to convert the element's bounding box
            to a bottom-left origin.

    Returns:
        The item returned by ``out_doc.add_text``.
    """
    assert isinstance(elem, TextElement)
    text = elem.text
    prov = ProvenanceItem(
        page_no=elem.page_no + 1,
        charspan=(0, len(text)),
        bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
    )
    new_item = out_doc.add_text(
        label=elem.label, text=text, prov=prov, parent=parent
    )
    return new_item


def _handle_text_element(self, element, out_doc, current_list, page_height):
    """Add a text element to ``out_doc``, dispatching on its label.

    Consecutive list items are collected under one list group: a group is
    created when ``current_list`` is ``None``, and any non-list label resets
    it to ``None`` so the next list item starts a fresh group.

    Args:
        element: The text element to add.
        out_doc: The document the new item is added to.
        current_list: The list group currently being filled, or ``None``.
        page_height: Page height used to convert the element's bounding box
            to a bottom-left origin.

    Returns:
        A ``(new_item, current_list)`` tuple with the created item and the
        updated list-group state.
    """
    text = element.text
    prov = ProvenanceItem(
        page_no=element.page_no + 1,
        charspan=(0, len(text)),
        bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
    )
    label = element.label
    if label == DocItemLabel.LIST_ITEM:
        if current_list is None:
            current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")

        # TODO: Infer if this is a numbered or a bullet list item
        new_item = out_doc.add_list_item(
            text=text, enumerated=False, prov=prov, parent=current_list
        )
    elif label == DocItemLabel.SECTION_HEADER:
        current_list = None

        new_item = out_doc.add_heading(text=text, prov=prov)
    elif label == DocItemLabel.CODE:
        current_list = None

        new_item = out_doc.add_code(text=text, prov=prov)
    elif label == DocItemLabel.FORMULA:
        current_list = None

        # Formulas keep the raw text only in ``orig``; ``text`` stays empty.
        new_item = out_doc.add_text(
            label=DocItemLabel.FORMULA, text="", orig=text, prov=prov
        )
    else:
        current_list = None

        new_item = out_doc.add_text(label=element.label, text=text, prov=prov)
    return new_item, current_list


def _merge_elements(self, element, merged_elem, new_item, page_height):
    """Append ``merged_elem`` to ``new_item``, the item created for ``element``.

    The merged text is joined with a single space, and a provenance entry
    describing the appended fragment is added to ``new_item.prov``.

    Args:
        element: The element ``new_item`` was created from.
        merged_elem: The element whose text is appended; must be of the same
            type as ``element`` and carry the same label as ``new_item``.
        new_item: The document item being extended in place.
        page_height: Page height used to convert the bounding box to a
            bottom-left origin.

    Raises:
        AssertionError: If the element types or labels do not match.
    """
    assert isinstance(
        merged_elem, type(element)
    ), "Merged element must be of same type as element."
    assert (
        merged_elem.label == new_item.label
    ), "Labels of merged elements must match."
    # The provenance must describe the fragment being appended, so page and
    # bounding box come from ``merged_elem`` rather than from ``element``.
    # NOTE(review): ``page_height`` is supplied by the caller for
    # ``element``'s page; if a merge can span pages of different heights the
    # caller should pass the merged element's page height -- confirm.
    prov = ProvenanceItem(
        page_no=merged_elem.page_no + 1,
        # Computed before the text is extended: the fragment starts right
        # after the separating space.
        charspan=(
            len(new_item.text) + 1,
            len(new_item.text) + 1 + len(merged_elem.text),
        ),
        bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
    )
    new_item.text += f" {merged_elem.text}"
    new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete.
    new_item.prov.append(prov)
conv_res.document.save_as_html( + output_dir / f"{doc_filename}.html", + image_mode=ImageRefMode.EMBEDDED, + ) # Export Docling document format to YAML: with (output_dir / f"{doc_filename}.yaml").open("w") as fp: fp.write(yaml.safe_dump(conv_res.document.export_to_dict())) - # Export Docling document format to doctags: - with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp: - fp.write(conv_res.document.export_to_document_tokens()) - - # Export Docling document format to markdown: - with (output_dir / f"{doc_filename}.md").open("w") as fp: - fp.write(conv_res.document.export_to_markdown()) - - # Export Docling document format to text: - with (output_dir / f"{doc_filename}.txt").open("w") as fp: - fp.write(conv_res.document.export_to_markdown(strict_text=True)) - if USE_LEGACY: # Export Deep Search document JSON format: with (output_dir / f"{doc_filename}.legacy.json").open(