diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index dfd219fd..efff2546 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -180,8 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[0], label=DocItemLabel.TITLE, text=text ) - return # dont add another heading with the same text! - + return # dont add another heading with the same text! + elif hlevel > self.level: # add invisible group diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index aa085a93..cfc01e01 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,15 +1,18 @@ -import os import json - +import os from pathlib import Path from docling.backend.html_backend import HTMLDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument, SectionHeaderItem -from docling.datamodel.document import ConversionResult +from docling.datamodel.document import ( + ConversionResult, + InputDocument, + SectionHeaderItem, +) from docling.document_converter import DocumentConverter -GENERATE=False +GENERATE = False + def test_heading_levels(): in_path = Path("tests/data/html/wiki_duck.html") @@ -35,6 +38,7 @@ def test_heading_levels(): assert item.level == 3 assert found_lvl_2 and found_lvl_3 + def get_html_paths(): # Define the directory you want to search @@ -44,48 +48,51 @@ def get_html_paths(): html_files = sorted(directory.rglob("*.html")) return html_files + def get_converter(): - converter = DocumentConverter( - allowed_formats = [InputFormat.HTML] - ) + converter = DocumentConverter(allowed_formats=[InputFormat.HTML]) return converter -def verify_export(pred_text:str, gtfile:str): + +def verify_export(pred_text: str, gtfile: str): if not os.path.exists(gtfile) or GENERATE: with open(gtfile, "w") as fw: fw.write(pred_text) return True - + else: with open(gtfile, "r") as fr: true_text = fr.read() - - assert pred_text==true_text, "pred_itxt==true_itxt" - return pred_text==true_text - + + assert pred_text == true_text, "pred_itxt==true_itxt" + return pred_text == true_text + + def test_e2e_html_conversions(): - + html_paths = get_html_paths() converter = get_converter() - + for html_path in html_paths: - #print(f"converting {html_path}") + # print(f"converting {html_path}") conv_result: ConversionResult = converter.convert(html_path) doc: DoclingDocument = conv_result.document - pred_md:str = doc.export_to_markdown() - assert verify_export(pred_md, str(html_path)+".md"), "export to md" - - pred_itxt:str = doc._export_to_indented_text(max_text_len=70, explicit_tables=False) - assert verify_export(pred_itxt, str(html_path)+".itxt"), "export to indented-text" - - pred_json:str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(html_path)+".json"), "export to json" + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(html_path) + ".md"), "export to md" + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export( + pred_itxt, str(html_path) + ".itxt" + ), "export to indented-text" + pred_json: str = json.dumps(doc.export_to_dict(), indent=2) + assert verify_export(pred_json, str(html_path) + ".json"), "export to json" diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 91f92222..6a579b12 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -1,15 +1,18 @@ -import os import json - +import os from pathlib import Path from docling.backend.msword_backend import MsWordDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument, SectionHeaderItem -from docling.datamodel.document import ConversionResult +from docling.datamodel.document import ( + ConversionResult, + InputDocument, + SectionHeaderItem, +) from docling.document_converter import DocumentConverter -GENERATE=False +GENERATE = False + def test_heading_levels(): in_path = Path("tests/data/docx/word_sample.docx") @@ -45,48 +48,51 @@ def get_docx_paths(): pdf_files = sorted(directory.rglob("*.docx")) return pdf_files + def get_converter(): - converter = DocumentConverter( - allowed_formats = [InputFormat.DOCX] - ) + converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) return converter -def verify_export(pred_text:str, gtfile:str): + +def verify_export(pred_text: str, gtfile: str): if not os.path.exists(gtfile) or GENERATE: with open(gtfile, "w") as fw: fw.write(pred_text) return True - + else: with open(gtfile, "r") as fr: true_text = fr.read() - - assert pred_text==true_text, "pred_itxt==true_itxt" - return pred_text==true_text + + assert pred_text == true_text, "pred_itxt==true_itxt" + return pred_text == true_text def test_e2e_docx_conversions(): - + docx_paths = get_docx_paths() converter = get_converter() - + for docx_path in docx_paths: - #print(f"converting {docx_path}") - + # print(f"converting {docx_path}") + conv_result: ConversionResult = converter.convert(docx_path) - + doc: DoclingDocument = conv_result.document - pred_md:str = doc.export_to_markdown() - assert verify_export(pred_md, str(docx_path)+".md"), "export to md" - - pred_itxt:str = doc._export_to_indented_text(max_text_len=70, explicit_tables=False) - assert verify_export(pred_itxt, str(docx_path)+".itxt"), "export to indented-text" - - pred_json:str = json.dumps(doc.export_to_dict(), indent=2) - assert verify_export(pred_json, str(docx_path)+".json"), "export to json" - + pred_md: str = doc.export_to_markdown() + assert verify_export(pred_md, str(docx_path) + ".md"), "export to md" + + pred_itxt: str = doc._export_to_indented_text( + max_text_len=70, explicit_tables=False + ) + assert verify_export( + pred_itxt, str(docx_path) + ".itxt" + ), "export to indented-text" + + pred_json: str = json.dumps(doc.export_to_dict(), indent=2) + assert verify_export(pred_json, str(docx_path) + ".json"), "export to json"