DS4SD/docling (https://github.com/DS4SD/docling.git)

commit a6ee5a4326 (parent 5aebaf58de)
Add captions, footnotes and merges [skip ci]

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Changes to the reading-order model (class ReadingOrderModel), which assembles the final DoclingDocument:

@@ -93,6 +93,7 @@ class ReadingOrderModel:
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
             for elem in conv_res.assembled.elements
         }
+        cid_to_rels = {rel.cid: rel for rel in ro_elements}

         origin = DocumentOrigin(
             mimetype="application/pdf",
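The new cid_to_rels dictionary indexes the reading-order relations by their cid, so the later hunks can resolve a related element (caption, footnote, or merge target) in constant time. A toy illustration of the lookup pattern (SimpleNamespace stands in for the real reading-order objects, which are an assumption here, not the docling data model):

from types import SimpleNamespace

# Two fake reading-order relations; the real objects carry a cid and a RefItem-like ref.
ro_elements = [
    SimpleNamespace(cid=3, ref=SimpleNamespace(cref="#/0/7")),
    SimpleNamespace(cid=4, ref=SimpleNamespace(cref="#/0/8")),
]

cid_to_rels = {rel.cid: rel for rel in ro_elements}

# Resolve a related element by cid, as the caption/footnote/merge code below does.
assert cid_to_rels[4].ref.cref == "#/0/8"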
@@ -111,53 +112,39 @@ class ReadingOrderModel:
             out_doc.add_page(page_no=page_no, size=size)

         current_list = None
+        skippable_cids = {
+            cid
+            for mapping in (
+                el_to_captions_mapping,
+                el_to_footnotes_mapping,
+                el_merges_mapping,
+            )
+            for lst in mapping.values()
+            for cid in lst
+        }

         # TODO: handle merges

         for rel in ro_elements:
+            if rel.cid in skippable_cids:
+                continue
             element = id_to_elem[rel.ref.cref]

             page_height = conv_res.pages[element.page_no].size.height  # type: ignore

             if isinstance(element, TextElement):
-                text = element.text
-                prov = ProvenanceItem(
-                    page_no=element.page_no + 1,
-                    charspan=(0, len(text)),
-                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
-                )
-                label = element.label
-
-                if label == DocItemLabel.LIST_ITEM:
-                    if current_list is None:
-                        current_list = out_doc.add_group(
-                            label=GroupLabel.LIST, name="list"
-                        )
-
-                    # TODO: Infer if this is a numbered or a bullet list item
-                    out_doc.add_list_item(
-                        text=text, enumerated=False, prov=prov, parent=current_list
-                    )
-                elif label == DocItemLabel.SECTION_HEADER:
-                    current_list = None
-
-                    out_doc.add_heading(text=text, prov=prov)
-                elif label == DocItemLabel.CODE:
-                    current_list = None
-
-                    out_doc.add_code(text=text, prov=prov)
-                elif label == DocItemLabel.FORMULA:
-                    current_list = None
-
-                    out_doc.add_text(
-                        label=DocItemLabel.FORMULA, text="", orig=text, prov=prov
-                    )
-                else:
-                    current_list = None
-
-                    out_doc.add_text(label=element.label, text=text, prov=prov)
+                new_item, current_list = self._handle_text_element(
+                    element, out_doc, current_list, page_height
+                )
+
+                if rel.cid in el_merges_mapping.keys():
+                    for merged_cid in el_merges_mapping[rel.cid]:
+                        merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
+
+                        self._merge_elements(
+                            element, merged_elem, new_item, page_height
+                        )

             elif isinstance(element, Table):

                 tbl_data = TableData(
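A minimal standalone sketch of the skippable-cid logic (plain Python with toy cid values, not the docling data model): every cid that appears as a caption, footnote, or merge target is collected up front, so the main loop never emits it as a top-level item of its own.

# Toy mappings: element 10 (a table) owns caption 11 and footnote 12,
# element 20 (a paragraph) absorbs merged elements 21 and 22.
el_to_captions_mapping = {10: [11]}
el_to_footnotes_mapping = {10: [12]}
el_merges_mapping = {20: [21, 22]}

skippable_cids = {
    cid
    for mapping in (el_to_captions_mapping, el_to_footnotes_mapping, el_merges_mapping)
    for lst in mapping.values()
    for cid in lst
}
assert skippable_cids == {11, 12, 21, 22}

# In the main loop these cids are skipped; they are only reachable
# through the element that owns them.
reading_order = [10, 11, 12, 20, 21, 22]
emitted = [cid for cid in reading_order if cid not in skippable_cids]
assert emitted == [10, 20]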
@@ -176,23 +163,54 @@ class ReadingOrderModel:
                     data=tbl_data, prov=prov, label=element.cluster.label
                 )

+                if rel.cid in el_to_captions_mapping.keys():
+                    for caption_cid in el_to_captions_mapping[rel.cid]:
+                        caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
+                        new_cap_item = self._add_caption_or_footnote(
+                            caption_elem, out_doc, tbl, page_height
+                        )
+
+                        tbl.captions.append(new_cap_item.get_ref())
+
+                if rel.cid in el_to_footnotes_mapping.keys():
+                    for footnote_cid in el_to_footnotes_mapping[rel.cid]:
+                        footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
+                        new_footnote_item = self._add_caption_or_footnote(
+                            footnote_elem, out_doc, tbl, page_height
+                        )
+
+                        tbl.footnotes.append(new_footnote_item.get_ref())
+
                 # TODO: handle element.cluster.children.
-                # TODO: handle captions
-                # tbl.captions.extend(caption_refs)

             elif isinstance(element, FigureElement):
-                text = ""
+                cap_text = ""
                 prov = ProvenanceItem(
                     page_no=element.page_no + 1,
-                    charspan=(0, len(text)),
+                    charspan=(0, len(cap_text)),
                     bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                 )

                 pic = out_doc.add_picture(prov=prov)

+                if rel.cid in el_to_captions_mapping.keys():
+                    for caption_cid in el_to_captions_mapping[rel.cid]:
+                        caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
+                        new_cap_item = self._add_caption_or_footnote(
+                            caption_elem, out_doc, pic, page_height
+                        )
+
+                        pic.captions.append(new_cap_item.get_ref())
+
+                if rel.cid in el_to_footnotes_mapping.keys():
+                    for footnote_cid in el_to_footnotes_mapping[rel.cid]:
+                        footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
+                        new_footnote_item = self._add_caption_or_footnote(
+                            footnote_elem, out_doc, pic, page_height
+                        )
+
+                        pic.footnotes.append(new_footnote_item.get_ref())
+
                 # TODO: handle element.cluster.children.
-                # TODO: handle captions
-                # pic.captions.extend(caption_refs)
                 # _add_child_elements(pic, doc, obj, pelem)

             elif isinstance(element, ContainerElement):
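Captions and footnotes become ordinary text items parented under the table or picture, and the parent then keeps a reference to them. A hedged sketch of that attachment pattern against docling_core (a toy document; the document name and the empty TableData are placeholders, not taken from this commit):

from docling_core.types.doc import DocItemLabel, DoclingDocument, TableData

doc = DoclingDocument(name="demo")  # placeholder name

# An empty 0x0 table stands in for the real converted table.
tbl = doc.add_table(data=TableData(num_rows=0, num_cols=0, table_cells=[]))

# The caption is a regular text item parented under the table ...
cap = doc.add_text(
    label=DocItemLabel.CAPTION, text="Table 1: an example caption", parent=tbl
)
# ... and the table stores a reference to it, as in _add_caption_or_footnote's callers.
tbl.captions.append(cap.get_ref())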
@@ -201,6 +219,74 @@ class ReadingOrderModel:

         return out_doc

+    def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
+        assert isinstance(elem, TextElement)
+        text = elem.text
+        prov = ProvenanceItem(
+            page_no=elem.page_no + 1,
+            charspan=(0, len(text)),
+            bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
+        )
+        new_item = out_doc.add_text(
+            label=elem.label, text=text, prov=prov, parent=parent
+        )
+        return new_item
+
+    def _handle_text_element(self, element, out_doc, current_list, page_height):
+        cap_text = element.text
+        prov = ProvenanceItem(
+            page_no=element.page_no + 1,
+            charspan=(0, len(cap_text)),
+            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+        )
+        label = element.label
+        if label == DocItemLabel.LIST_ITEM:
+            if current_list is None:
+                current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
+
+            # TODO: Infer if this is a numbered or a bullet list item
+            new_item = out_doc.add_list_item(
+                text=cap_text, enumerated=False, prov=prov, parent=current_list
+            )
+        elif label == DocItemLabel.SECTION_HEADER:
+            current_list = None
+
+            new_item = out_doc.add_heading(text=cap_text, prov=prov)
+        elif label == DocItemLabel.CODE:
+            current_list = None
+
+            new_item = out_doc.add_code(text=cap_text, prov=prov)
+        elif label == DocItemLabel.FORMULA:
+            current_list = None
+
+            new_item = out_doc.add_text(
+                label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
+            )
+        else:
+            current_list = None
+
+            new_item = out_doc.add_text(label=element.label, text=cap_text, prov=prov)
+        return new_item, current_list
+
+    def _merge_elements(self, element, merged_elem, new_item, page_height):
+        assert isinstance(
+            merged_elem, type(element)
+        ), "Merged element must be of same type as element."
+        assert (
+            merged_elem.label == new_item.label
+        ), "Labels of merged elements must match."
+        prov = ProvenanceItem(
+            page_no=element.page_no + 1,
+            charspan=(
+                len(new_item.text) + 1,
+                len(new_item.text) + 1 + len(merged_elem.text),
+            ),
+            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
+        )
+        new_item.text += f" {merged_elem.text}"
+        new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete.
+        new_item.prov.append(prov)
+
     def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
         with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
             page_elements = self._assembled_to_readingorder_elements(conv_res)
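The provenance bookkeeping in _merge_elements relies on a simple invariant: the merged text is appended after one separating space, so its character span in the combined string starts one past the end of the existing text. A quick check of that arithmetic in plain Python (example strings are made up):

existing = "Results are reported per page"
merged = "and aggregated per document."

start = len(existing) + 1                 # skip the separating space
end = len(existing) + 1 + len(merged)     # exclusive end of the merged span

combined = f"{existing} {merged}"
assert combined[start:end] == merged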
Changes to the batch-conversion example (export_documents), which now uses the document's save_as_* helpers and disables the legacy export by default:

@@ -5,6 +5,7 @@ from pathlib import Path
 from typing import Iterable

 import yaml
+from docling_core.types.doc import ImageRefMode

 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult
@@ -14,7 +15,7 @@ from docling.document_converter import DocumentConverter
 _log = logging.getLogger(__name__)

 USE_V2 = True
-USE_LEGACY = True
+USE_LEGACY = False


 def export_documents(
@@ -33,26 +34,31 @@ def export_documents(
         doc_filename = conv_res.input.file.stem

         if USE_V2:
-            # Export Docling document format to JSON:
-            with (output_dir / f"{doc_filename}.json").open("w") as fp:
-                fp.write(json.dumps(conv_res.document.export_to_dict()))
+            conv_res.document.save_as_json(
+                output_dir / f"{doc_filename}.json",
+                image_mode=ImageRefMode.PLACEHOLDER,
+            )
+            conv_res.document.save_as_document_tokens(
+                output_dir / f"{doc_filename}.doctags.txt"
+            )
+            conv_res.document.save_as_markdown(
+                output_dir / f"{doc_filename}.md",
+                image_mode=ImageRefMode.PLACEHOLDER,
+            )
+            conv_res.document.save_as_markdown(
+                output_dir / f"{doc_filename}.txt",
+                image_mode=ImageRefMode.PLACEHOLDER,
+                strict_text=True,
+            )
+            conv_res.document.save_as_html(
+                output_dir / f"{doc_filename}.html",
+                image_mode=ImageRefMode.EMBEDDED,
+            )

             # Export Docling document format to YAML:
             with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
                 fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))

-            # Export Docling document format to doctags:
-            with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
-                fp.write(conv_res.document.export_to_document_tokens())
-
-            # Export Docling document format to markdown:
-            with (output_dir / f"{doc_filename}.md").open("w") as fp:
-                fp.write(conv_res.document.export_to_markdown())
-
-            # Export Docling document format to text:
-            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
-                fp.write(conv_res.document.export_to_markdown(strict_text=True))
-
         if USE_LEGACY:
             # Export Deep Search document JSON format:
             with (output_dir / f"{doc_filename}.legacy.json").open(
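A minimal usage sketch of the new export calls outside the example script (the input file name and output directory are made up; the save_as_* calls and ImageRefMode values mirror the ones introduced above):

from pathlib import Path

from docling.document_converter import DocumentConverter
from docling_core.types.doc import ImageRefMode

conv_res = DocumentConverter().convert("report.pdf")  # hypothetical input document

out_dir = Path("scratch")
out_dir.mkdir(parents=True, exist_ok=True)
stem = conv_res.input.file.stem

# One call per output format; placeholders keep images out of the text formats,
# while the HTML export embeds them.
conv_res.document.save_as_json(out_dir / f"{stem}.json", image_mode=ImageRefMode.PLACEHOLDER)
conv_res.document.save_as_markdown(out_dir / f"{stem}.md", image_mode=ImageRefMode.PLACEHOLDER)
conv_res.document.save_as_html(out_dir / f"{stem}.html", image_mode=ImageRefMode.EMBEDDED)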