diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 3d3d3ca9..e46ce09a 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -47,16 +47,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): try: if isinstance(self.path_or_stream, BytesIO): text_stream = self.path_or_stream.getvalue().decode("utf-8") - print("BytesIO") + _log.debug("reading from BytesIO") self.soup = BeautifulSoup(text_stream, "html.parser") if isinstance(self.path_or_stream, Path): - print("file") + _log.debug("reading from file") with open(self.path_or_stream, "r", encoding="utf-8") as f: html_content = f.read() - - with open("./scratch/file.html", "w") as fw: - fw.write(html_content) - self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: raise RuntimeError( @@ -115,32 +111,32 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): try: self.analyse_element(child, idx, doc) except Exception as exc: - _log.error(f" -> error treating child: {exc}") + _log.info(f" -> error treating child: {exc}") raise exc elif isinstance(element, Tag): try: self.analyse_element(element, 0, doc) except Exception as exc: - _log.error(f" -> error treating elem: {exc}") + _log.info(f" -> error treating elem: {exc}") raise exc else: - _log.warn(f"ignoring element of type {type(element)}") + _log.debug(f"ignoring element of type {type(element)}") except Exception as exc: - _log.warn(f"error walking element: {type(element)}") + _log.debug(f"error walking element: {type(element)}") pass return doc + def is_body(self): + return (not self.contains_h1) or (self.contains_h1 and self.detected_h1) + def analyse_element(self, element, idx, doc): if element.name!=None: - #_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") - print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") + _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") - #print(element.name) - if element.name in self.labels: self.labels[element.name] += 1 else: @@ -150,53 +146,43 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.detected_h1 = True if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]: - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_header(element, idx, doc) elif element.name in ["p"]: - print(" --> detected ...") - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_paragraph(element, idx, doc) - print(" --> registered ...") - elif element.name in ["ul", "ol"]: - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_list(element, idx, doc) elif element.name in ["li"]: - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_listitem(element, idx, doc) elif element.name == "table": - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_table(element, idx, doc) elif element.name == "figure": - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_figure(element, idx, doc) elif element.name == "img": - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): + if self.is_body(): self.handle_image(element, idx, doc) elif element.name == "svg": - if (not self.contains_h1) or (self.contains_h1 and self.detected_h1): - #self.handle_image(element, idx, doc) - _log.warn("Add `svg` elements") + if self.is_body(): + self.handle_svg(element, idx, doc) - elif True and isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'): + elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'): try: - #print("\n\n\nattempt decoding: ", element['data-content']) - # Decode the data-content attribute #data_content = html.unescape(element['data-content']) - #print(data_content) - data_content = element['data-content'] # Parse the decoded HTML content content_soup = BeautifulSoup(data_content, 'html.parser') - print("\n\n\nsoup: ", content_soup) for jdx, _ in enumerate(content_soup): - print(_) self.analyse_element(_, jdx, doc) except: - _log.warn("could not parse the `data-content` attribute") + _log.debug("could not parse the `data-content` attribute") self.walk(element, doc) @@ -277,17 +263,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" - if element.text is None: - print(" -> text is None ...") + if element.text is None: return - text = element.text.strip() - print("doc is adding paragraph: ", text) - label = DocItemLabel.PARAGRAPH + text = element.text.strip() if len(text) == 0: - print(" -> text is zero length ...") return - print("doc is adding paragraph: ", text) + + label = DocItemLabel.PARAGRAPH doc.add_text(parent=self.parents[self.level], label=label, text=text) def handle_list(self, element, idx, doc): @@ -325,7 +308,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # we need to extract it recursively text = self.extract_text_recursively(element) # Flatten text, remove break lines: - text = text.replace("\n", "").replace("\r", "") + text = text.replace("\n", " ").replace("\r", "") text = " ".join(text.split()).strip() marker = "" @@ -357,12 +340,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if parent_list_label == GroupLabel.ORDERED_LIST: marker = f"{str(index_in_list)}." enumerated = True - doc.add_list_item( - text=text, - enumerated=enumerated, - marker=marker, - parent=self.parents[self.level], - ) + + if len(text) > 0: + doc.add_list_item( + text=text, + enumerated=enumerated, + marker=marker, + parent=self.parents[self.level], + ) else: _log.warn("list-item has no text: ", element) @@ -502,3 +487,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_image(self, element, idx, doc): """Handles image tags (img).""" doc.add_picture(parent=self.parents[self.level], caption=None) + + def handle_svg(self, element, idx, doc): + """Handles svg tags.""" + doc.add_picture(parent=self.parents[self.level], caption=None)