Add normalization, update tests again

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-02-19 16:55:20 +01:00
parent eb67337e51
commit 857d6c4292
8 changed files with 52 additions and 19 deletions

View File

@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines)
# Text normalization
sanitized_text.replace("", "/")
sanitized_text.replace("", "'")
sanitized_text.replace("", "'")
sanitized_text.replace("", '"')
sanitized_text.replace("", '"')
sanitized_text.replace("", "·")
return sanitized_text.strip() # Strip any leading or trailing whitespace
def __call__(

View File

@ -166,17 +166,46 @@ class ReadingOrderModel:
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
if isinstance(element, TextElement):
new_item, current_list = self._handle_text_element(
element, out_doc, current_list, page_height
)
if element.label == DocItemLabel.CODE:
cap_text = element.text
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(0, len(cap_text)),
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
)
code_item = out_doc.add_code(text=cap_text, prov=prov)
if rel.cid in el_merges_mapping.keys():
for merged_cid in el_merges_mapping[rel.cid]:
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
if rel.cid in el_to_captions_mapping.keys():
for caption_cid in el_to_captions_mapping[rel.cid]:
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
new_cap_item = self._add_caption_or_footnote(
caption_elem, out_doc, code_item, page_height
)
self._merge_elements(
element, merged_elem, new_item, page_height
)
code_item.captions.append(new_cap_item.get_ref())
if rel.cid in el_to_footnotes_mapping.keys():
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
footnote_elem = id_to_elem[
cid_to_rels[footnote_cid].ref.cref
]
new_footnote_item = self._add_caption_or_footnote(
footnote_elem, out_doc, code_item, page_height
)
code_item.footnotes.append(new_footnote_item.get_ref())
else:
new_item, current_list = self._handle_text_element(
element, out_doc, current_list, page_height
)
if rel.cid in el_merges_mapping.keys():
for merged_cid in el_merges_mapping[rel.cid]:
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
self._merge_elements(
element, merged_elem, new_item, page_height
)
elif isinstance(element, Table):
@ -292,10 +321,6 @@ class ReadingOrderModel:
current_list = None
new_item = out_doc.add_heading(text=cap_text, prov=prov)
elif label == DocItemLabel.CODE:
current_list = None
new_item = out_doc.add_code(text=cap_text, prov=prov)
elif label == DocItemLabel.FORMULA:
current_list = None

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 506.6666564941406, 767.2550048828125], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}

View File

@ -1 +1 @@
{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550048828125, "r": 506.6666564941406, "b": 688.58837890625, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}