reformatted the text

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
2025-07-30 14:04:27 +00:00 · 2024-10-29 06:02:49 +01:00 · 2024-10-29 06:02:49 +01:00 · 6163409305
commit 6163409305
parent 7cb7da7ce9
3 changed files with 67 additions and 54 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -180,8 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
            )
-            return # dont add another heading with the same text!
+            return  # dont add another heading with the same text!
-        
+
        elif hlevel > self.level:
            # add invisible group
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -1,15 +1,18 @@
 import os
 import json
-
+import os
 from pathlib import Path
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument, SectionHeaderItem
+from docling.datamodel.document import (
-from docling.datamodel.document import ConversionResult
+    ConversionResult,
    InputDocument,
    SectionHeaderItem,
 )
 from docling.document_converter import DocumentConverter
-GENERATE=False
+GENERATE = False
 def test_heading_levels():
    in_path = Path("tests/data/html/wiki_duck.html")
@@ -35,6 +38,7 @@ def test_heading_levels():
                assert item.level == 3
    assert found_lvl_2 and found_lvl_3
 def get_html_paths():
    # Define the directory you want to search
@@ -44,48 +48,51 @@ def get_html_paths():
    html_files = sorted(directory.rglob("*.html"))
    return html_files
 def get_converter():
-    converter = DocumentConverter(
+    converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
        allowed_formats = [InputFormat.HTML]
    )
    return converter
-def verify_export(pred_text:str, gtfile:str):
+
 def verify_export(pred_text: str, gtfile: str):
    if not os.path.exists(gtfile) or GENERATE:
        with open(gtfile, "w") as fw:
            fw.write(pred_text)
        return True
-            
+
    else:
        with open(gtfile, "r") as fr:
            true_text = fr.read()
-            
+
-        assert pred_text==true_text, "pred_itxt==true_itxt"    
+        assert pred_text == true_text, "pred_itxt==true_itxt"
-        return pred_text==true_text
+        return pred_text == true_text
-        
+
 def test_e2e_html_conversions():
-    
+
    html_paths = get_html_paths()
    converter = get_converter()
-    
+
    for html_path in html_paths:
-        #print(f"converting {html_path}")
+        # print(f"converting {html_path}")
        conv_result: ConversionResult = converter.convert(html_path)
        doc: DoclingDocument = conv_result.document
-        pred_md:str = doc.export_to_markdown()
+        pred_md: str = doc.export_to_markdown()
-        assert verify_export(pred_md, str(html_path)+".md"), "export to md"
+        assert verify_export(pred_md, str(html_path) + ".md"), "export to md"
        pred_itxt:str = doc._export_to_indented_text(max_text_len=70, explicit_tables=False)
        assert verify_export(pred_itxt, str(html_path)+".itxt"), "export to indented-text"
        pred_json:str = json.dumps(doc.export_to_dict(), indent=2)
        assert verify_export(pred_json, str(html_path)+".json"), "export to json"
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(html_path) + ".itxt"
        ), "export to indented-text"
        pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
        assert verify_export(pred_json, str(html_path) + ".json"), "export to json"
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,15 +1,18 @@
 import os
 import json
-
+import os
 from pathlib import Path
 from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument, SectionHeaderItem
+from docling.datamodel.document import (
-from docling.datamodel.document import ConversionResult
+    ConversionResult,
    InputDocument,
    SectionHeaderItem,
 )
 from docling.document_converter import DocumentConverter
-GENERATE=False
+GENERATE = False
 def test_heading_levels():
    in_path = Path("tests/data/docx/word_sample.docx")
@@ -45,48 +48,51 @@ def get_docx_paths():
    pdf_files = sorted(directory.rglob("*.docx"))
    return pdf_files
 def get_converter():
-    converter = DocumentConverter(
+    converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
        allowed_formats = [InputFormat.DOCX]
    )
    return converter
-def verify_export(pred_text:str, gtfile:str):
+
 def verify_export(pred_text: str, gtfile: str):
    if not os.path.exists(gtfile) or GENERATE:
        with open(gtfile, "w") as fw:
            fw.write(pred_text)
        return True
-            
+
    else:
        with open(gtfile, "r") as fr:
            true_text = fr.read()
-            
+
-        assert pred_text==true_text, "pred_itxt==true_itxt"    
+        assert pred_text == true_text, "pred_itxt==true_itxt"
-        return pred_text==true_text
+        return pred_text == true_text
 def test_e2e_docx_conversions():
-    
+
    docx_paths = get_docx_paths()
    converter = get_converter()
-    
+
    for docx_path in docx_paths:
-        #print(f"converting {docx_path}")
+        # print(f"converting {docx_path}")
-        
+
        conv_result: ConversionResult = converter.convert(docx_path)
-        
+
        doc: DoclingDocument = conv_result.document
-        pred_md:str = doc.export_to_markdown()
+        pred_md: str = doc.export_to_markdown()
-        assert verify_export(pred_md, str(docx_path)+".md"), "export to md"
+        assert verify_export(pred_md, str(docx_path) + ".md"), "export to md"
-        
+
-        pred_itxt:str = doc._export_to_indented_text(max_text_len=70, explicit_tables=False)
+        pred_itxt: str = doc._export_to_indented_text(
-        assert verify_export(pred_itxt, str(docx_path)+".itxt"), "export to indented-text"
+            max_text_len=70, explicit_tables=False
-        
+        )
-        pred_json:str = json.dumps(doc.export_to_dict(), indent=2)
+        assert verify_export(
-        assert verify_export(pred_json, str(docx_path)+".json"), "export to json"
+            pred_itxt, str(docx_path) + ".itxt"
-        
+        ), "export to indented-text"
        pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
        assert verify_export(pred_json, str(docx_path) + ".json"), "export to json"