mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
added the detection of h1 and the skip_furniture parameter
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
c52e68c52b
commit
ebe0b203c8
@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], skip_furniture:bool=False):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
_log.debug("About to init HTML backend...")
|
_log.debug("About to init HTML backend...")
|
||||||
self.soup = None
|
self.soup = None
|
||||||
@ -35,6 +35,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
self.labels = {} # type: ignore
|
self.labels = {} # type: ignore
|
||||||
|
|
||||||
|
self.skip_furniture = skip_furniture
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
@ -82,7 +84,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for br in self.soup.body.find_all("br"):
|
for br in self.soup.body.find_all("br"):
|
||||||
br.replace_with("\n")
|
br.replace_with("\n")
|
||||||
|
|
||||||
self.contains_h1 = True
|
self.contains_h1 = bool(self.soup.find('h1')) and self.skip_furniture
|
||||||
self.detected_h1 = False
|
self.detected_h1 = False
|
||||||
|
|
||||||
doc = self.walk(self.soup.body, doc)
|
doc = self.walk(self.soup.body, doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user