diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index dfd219fd..efff2546 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -180,8 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
)
- return # dont add another heading with the same text!
-
+ return # don't add another heading with the same text!
+
elif hlevel > self.level:
# add invisible group
diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py
index aa085a93..cfc01e01 100644
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -1,15 +1,18 @@
-import os
import json
-
+import os
from pathlib import Path
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument, SectionHeaderItem
-from docling.datamodel.document import ConversionResult
+from docling.datamodel.document import (
+ ConversionResult,
+ InputDocument,
+ SectionHeaderItem,
+)
from docling.document_converter import DocumentConverter
-GENERATE=False
+GENERATE = False
+
def test_heading_levels():
in_path = Path("tests/data/html/wiki_duck.html")
@@ -35,6 +38,7 @@ def test_heading_levels():
assert item.level == 3
assert found_lvl_2 and found_lvl_3
+
def get_html_paths():
# Define the directory you want to search
@@ -44,48 +48,51 @@ def get_html_paths():
html_files = sorted(directory.rglob("*.html"))
return html_files
+
def get_converter():
- converter = DocumentConverter(
- allowed_formats = [InputFormat.HTML]
- )
+ converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
return converter
-def verify_export(pred_text:str, gtfile:str):
+
+def verify_export(pred_text: str, gtfile: str):
if not os.path.exists(gtfile) or GENERATE:
with open(gtfile, "w") as fw:
fw.write(pred_text)
return True
-
+
else:
with open(gtfile, "r") as fr:
true_text = fr.read()
-
- assert pred_text==true_text, "pred_itxt==true_itxt"
- return pred_text==true_text
-
+
+ assert pred_text == true_text, "pred_itxt==true_itxt"
+ return pred_text == true_text
+
+
def test_e2e_html_conversions():
-
+
html_paths = get_html_paths()
converter = get_converter()
-
+
for html_path in html_paths:
- #print(f"converting {html_path}")
+ # print(f"converting {html_path}")
conv_result: ConversionResult = converter.convert(html_path)
doc: DoclingDocument = conv_result.document
- pred_md:str = doc.export_to_markdown()
- assert verify_export(pred_md, str(html_path)+".md"), "export to md"
-
- pred_itxt:str = doc._export_to_indented_text(max_text_len=70, explicit_tables=False)
- assert verify_export(pred_itxt, str(html_path)+".itxt"), "export to indented-text"
-
- pred_json:str = json.dumps(doc.export_to_dict(), indent=2)
- assert verify_export(pred_json, str(html_path)+".json"), "export to json"
+ pred_md: str = doc.export_to_markdown()
+ assert verify_export(pred_md, str(html_path) + ".md"), "export to md"
+ pred_itxt: str = doc._export_to_indented_text(
+ max_text_len=70, explicit_tables=False
+ )
+ assert verify_export(
+ pred_itxt, str(html_path) + ".itxt"
+ ), "export to indented-text"
+ pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
+ assert verify_export(pred_json, str(html_path) + ".json"), "export to json"
diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
index 91f92222..6a579b12 100644
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,15 +1,18 @@
-import os
import json
-
+import os
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument, SectionHeaderItem
-from docling.datamodel.document import ConversionResult
+from docling.datamodel.document import (
+ ConversionResult,
+ InputDocument,
+ SectionHeaderItem,
+)
from docling.document_converter import DocumentConverter
-GENERATE=False
+GENERATE = False
+
def test_heading_levels():
in_path = Path("tests/data/docx/word_sample.docx")
@@ -45,48 +48,51 @@ def get_docx_paths():
pdf_files = sorted(directory.rglob("*.docx"))
return pdf_files
+
def get_converter():
- converter = DocumentConverter(
- allowed_formats = [InputFormat.DOCX]
- )
+ converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
return converter
-def verify_export(pred_text:str, gtfile:str):
+
+def verify_export(pred_text: str, gtfile: str):
if not os.path.exists(gtfile) or GENERATE:
with open(gtfile, "w") as fw:
fw.write(pred_text)
return True
-
+
else:
with open(gtfile, "r") as fr:
true_text = fr.read()
-
- assert pred_text==true_text, "pred_itxt==true_itxt"
- return pred_text==true_text
+
+ assert pred_text == true_text, "pred_itxt==true_itxt"
+ return pred_text == true_text
def test_e2e_docx_conversions():
-
+
docx_paths = get_docx_paths()
converter = get_converter()
-
+
for docx_path in docx_paths:
- #print(f"converting {docx_path}")
-
+ # print(f"converting {docx_path}")
+
conv_result: ConversionResult = converter.convert(docx_path)
-
+
doc: DoclingDocument = conv_result.document
- pred_md:str = doc.export_to_markdown()
- assert verify_export(pred_md, str(docx_path)+".md"), "export to md"
-
- pred_itxt:str = doc._export_to_indented_text(max_text_len=70, explicit_tables=False)
- assert verify_export(pred_itxt, str(docx_path)+".itxt"), "export to indented-text"
-
- pred_json:str = json.dumps(doc.export_to_dict(), indent=2)
- assert verify_export(pred_json, str(docx_path)+".json"), "export to json"
-
+ pred_md: str = doc.export_to_markdown()
+ assert verify_export(pred_md, str(docx_path) + ".md"), "export to md"
+
+ pred_itxt: str = doc._export_to_indented_text(
+ max_text_len=70, explicit_tables=False
+ )
+ assert verify_export(
+ pred_itxt, str(docx_path) + ".itxt"
+ ), "export to indented-text"
+
+ pred_json: str = json.dumps(doc.export_to_dict(), indent=2)
+ assert verify_export(pred_json, str(docx_path) + ".json"), "export to json"