mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
fix: Skip NavigableString in HTML parsing
Signed-off-by: higuhigu-lb <higuchi@lightblute-tech.com>
This commit is contained in:
parent
8ada0bccc7
commit
624b392e79
@@ -3,7 +3,7 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup, NavigableString
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@@ -92,6 +92,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
try:
|
try:
|
||||||
# Iterate over elements in the body of the document
|
# Iterate over elements in the body of the document
|
||||||
for idx, element in enumerate(element.children):
|
for idx, element in enumerate(element.children):
|
||||||
|
if isinstance(element, NavigableString):
|
||||||
|
continue # Skip over navigable strings
|
||||||
try:
|
try:
|
||||||
self.analyse_element(element, idx, doc)
|
self.analyse_element(element, idx, doc)
|
||||||
except Exception as exc_child:
|
except Exception as exc_child:
|
||||||
|
Loading…
Reference in New Issue
Block a user