mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
add the skip_furniture parameter
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
ebe0b203c8
commit
473ad9a032
@@ -21,7 +21,12 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], skip_furniture:bool=False):
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: "InputDocument",
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
skip_furniture: bool = False,
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
_log.debug("About to init HTML backend...")
|
||||
self.soup = None
|
||||
@@ -36,7 +41,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.labels = {} # type: ignore
|
||||
|
||||
self.skip_furniture = skip_furniture
|
||||
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
@@ -84,9 +89,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for br in self.soup.body.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
|
||||
self.contains_h1 = bool(soup.find('h1')) and self.skip_furniture
|
||||
self.contains_h1 = bool(self.soup.find("h1")) and self.skip_furniture
|
||||
self.detected_h1 = False
|
||||
|
||||
|
||||
doc = self.walk(self.soup.body, doc)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
@@ -124,7 +129,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
if element.name in ["h1"]:
|
||||
self.detected_h1 = True
|
||||
|
||||
|
||||
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||
self.handle_header(element, idx, doc)
|
||||
|
Loading…
Reference in New Issue
Block a user