add the skip_furniture parameter

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-11-01 11:32:56 +01:00
parent ebe0b203c8
commit 473ad9a032

View File

@@ -21,7 +21,12 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend): class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], skip_furniture:bool=False): def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
skip_furniture: bool = False,
):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...") _log.debug("About to init HTML backend...")
self.soup = None self.soup = None
@@ -84,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
for br in self.soup.body.find_all("br"): for br in self.soup.body.find_all("br"):
br.replace_with("\n") br.replace_with("\n")
self.contains_h1 = bool(soup.find('h1')) and self.skip_furniture self.contains_h1 = bool(self.soup.find("h1")) and self.skip_furniture
self.detected_h1 = False self.detected_h1 = False
doc = self.walk(self.soup.body, doc) doc = self.walk(self.soup.body, doc)