mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Add normalization, update tests again
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
eb67337e51
commit
857d6c4292
@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
|
||||
|
||||
sanitized_text = "".join(lines)
|
||||
|
||||
# Text normalization
|
||||
sanitized_text = sanitized_text.replace("⁄", "/")
|
||||
sanitized_text = sanitized_text.replace("’", "'")
|
||||
sanitized_text = sanitized_text.replace("‘", "'")
|
||||
sanitized_text = sanitized_text.replace("“", '"')
|
||||
sanitized_text = sanitized_text.replace("”", '"')
|
||||
sanitized_text = sanitized_text.replace("•", "·")
|
||||
|
||||
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
||||
|
||||
def __call__(
|
||||
|
@ -166,17 +166,46 @@ class ReadingOrderModel:
|
||||
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
||||
|
||||
if isinstance(element, TextElement):
|
||||
new_item, current_list = self._handle_text_element(
|
||||
element, out_doc, current_list, page_height
|
||||
)
|
||||
if element.label == DocItemLabel.CODE:
|
||||
cap_text = element.text
|
||||
prov = ProvenanceItem(
|
||||
page_no=element.page_no + 1,
|
||||
charspan=(0, len(cap_text)),
|
||||
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
||||
)
|
||||
code_item = out_doc.add_code(text=cap_text, prov=prov)
|
||||
|
||||
if rel.cid in el_merges_mapping.keys():
|
||||
for merged_cid in el_merges_mapping[rel.cid]:
|
||||
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
|
||||
if rel.cid in el_to_captions_mapping.keys():
|
||||
for caption_cid in el_to_captions_mapping[rel.cid]:
|
||||
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
||||
new_cap_item = self._add_caption_or_footnote(
|
||||
caption_elem, out_doc, code_item, page_height
|
||||
)
|
||||
|
||||
self._merge_elements(
|
||||
element, merged_elem, new_item, page_height
|
||||
)
|
||||
code_item.captions.append(new_cap_item.get_ref())
|
||||
|
||||
if rel.cid in el_to_footnotes_mapping.keys():
|
||||
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
||||
footnote_elem = id_to_elem[
|
||||
cid_to_rels[footnote_cid].ref.cref
|
||||
]
|
||||
new_footnote_item = self._add_caption_or_footnote(
|
||||
footnote_elem, out_doc, code_item, page_height
|
||||
)
|
||||
|
||||
code_item.footnotes.append(new_footnote_item.get_ref())
|
||||
else:
|
||||
new_item, current_list = self._handle_text_element(
|
||||
element, out_doc, current_list, page_height
|
||||
)
|
||||
|
||||
if rel.cid in el_merges_mapping.keys():
|
||||
for merged_cid in el_merges_mapping[rel.cid]:
|
||||
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
|
||||
|
||||
self._merge_elements(
|
||||
element, merged_elem, new_item, page_height
|
||||
)
|
||||
|
||||
elif isinstance(element, Table):
|
||||
|
||||
@ -292,10 +321,6 @@ class ReadingOrderModel:
|
||||
current_list = None
|
||||
|
||||
new_item = out_doc.add_heading(text=cap_text, prov=prov)
|
||||
elif label == DocItemLabel.CODE:
|
||||
current_list = None
|
||||
|
||||
new_item = out_doc.add_code(text=cap_text, prov=prov)
|
||||
elif label == DocItemLabel.FORMULA:
|
||||
current_list = None
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 506.6666564941406, 767.2550048828125], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -1 +1 @@
|
||||
{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550048828125, "r": 506.6666564941406, "b": 688.58837890625, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
||||
{"schema_name": "DoclingDocument", "version": "1.1.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
Loading…
Reference in New Issue
Block a user