mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
add the skip_furniture parameter
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
ebe0b203c8
commit
473ad9a032
@ -21,7 +21,12 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], skip_furniture:bool=False):
|
def __init__(
|
||||||
|
self,
|
||||||
|
in_doc: "InputDocument",
|
||||||
|
path_or_stream: Union[BytesIO, Path],
|
||||||
|
skip_furniture: bool = False,
|
||||||
|
):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
_log.debug("About to init HTML backend...")
|
_log.debug("About to init HTML backend...")
|
||||||
self.soup = None
|
self.soup = None
|
||||||
@ -36,7 +41,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.labels = {} # type: ignore
|
self.labels = {} # type: ignore
|
||||||
|
|
||||||
self.skip_furniture = skip_furniture
|
self.skip_furniture = skip_furniture
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
@ -84,9 +89,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for br in self.soup.body.find_all("br"):
|
for br in self.soup.body.find_all("br"):
|
||||||
br.replace_with("\n")
|
br.replace_with("\n")
|
||||||
|
|
||||||
self.contains_h1 = bool(soup.find('h1')) and self.skip_furniture
|
self.contains_h1 = bool(self.soup.find("h1")) and self.skip_furniture
|
||||||
self.detected_h1 = False
|
self.detected_h1 = False
|
||||||
|
|
||||||
doc = self.walk(self.soup.body, doc)
|
doc = self.walk(self.soup.body, doc)
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@ -124,7 +129,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
if element.name in ["h1"]:
|
if element.name in ["h1"]:
|
||||||
self.detected_h1 = True
|
self.detected_h1 = True
|
||||||
|
|
||||||
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||||
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
|
||||||
self.handle_header(element, idx, doc)
|
self.handle_header(element, idx, doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user