fix: parse html with omitted body tag (#818)

* fix: parse HTML files without body tag

  Parse HTML files without the 'body' tag, since it is optional in the HTML5 specification.

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* test: ensure docling converts HTML without body tag

  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-08 12:48:28 +00:00 · 2025-01-27 16:59:00 +01:00
parent 95b293a723
commit a112d7a035
6 changed files with 364 additions and 3 deletions
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -6,6 +6,7 @@ from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import (
    ConversionResult,
+    DoclingDocument,
    InputDocument,
    SectionHeaderItem,
 )
@@ -44,7 +45,7 @@ def get_html_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/html/")

-    # List all PDF files in the directory and its subdirectories
+    # List all HTML files in the directory and its subdirectories
    html_files = sorted(directory.rglob("*.html"))
    return html_files