fix: Skip NavigableString in HTML parsing

Signed-off-by: higuhigu-lb <higuchi@lightblute-tech.com>
This commit is contained in:
higuhigu-lb 2024-12-03 11:44:16 +09:00
parent 8ada0bccc7
commit 624b392e79

View File

@@ -3,7 +3,7 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, NavigableString
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@@ -92,6 +92,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try: try:
# Iterate over elements in the body of the document # Iterate over elements in the body of the document
for idx, element in enumerate(element.children): for idx, element in enumerate(element.children):
if isinstance(element, NavigableString):
continue # Skip over navigable strings
try: try:
self.analyse_element(element, idx, doc) self.analyse_element(element, idx, doc)
except Exception as exc_child: except Exception as exc_child: