mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
reformatted the code
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
3257034631
commit
ddd1474c8d
@ -1,12 +1,11 @@
|
|||||||
import re
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@ -171,14 +170,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if self.is_body():
|
if self.is_body():
|
||||||
self.handle_svg(element, idx, doc)
|
self.handle_svg(element, idx, doc)
|
||||||
|
|
||||||
elif isinstance(element, Tag) and element.name in ["section"] and element.has_attr('data-content'):
|
elif (
|
||||||
|
isinstance(element, Tag)
|
||||||
|
and element.name in ["section"]
|
||||||
|
and element.has_attr("data-content")
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
# Decode the data-content attribute
|
# Decode the data-content attribute
|
||||||
# data_content = html.unescape(element['data-content'])
|
# data_content = html.unescape(element['data-content'])
|
||||||
data_content = element['data-content']
|
data_content = element["data-content"]
|
||||||
|
|
||||||
# Parse the decoded HTML content
|
# Parse the decoded HTML content
|
||||||
content_soup = BeautifulSoup(data_content, 'html.parser')
|
content_soup = BeautifulSoup(data_content, "html.parser")
|
||||||
|
|
||||||
for jdx, _ in enumerate(content_soup):
|
for jdx, _ in enumerate(content_soup):
|
||||||
self.analyse_element(_, jdx, doc)
|
self.analyse_element(_, jdx, doc)
|
||||||
@ -311,7 +314,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Flatten text, remove break lines:
|
# Flatten text, remove break lines:
|
||||||
text = text.replace("\n", " ").replace("\r", "")
|
text = text.replace("\n", " ").replace("\r", "")
|
||||||
text = " ".join(text.split()).strip()
|
text = " ".join(text.split()).strip()
|
||||||
text = re.sub(r'\s{2,}', ' ', text)
|
text = re.sub(r"\s{2,}", " ", text)
|
||||||
|
|
||||||
marker = ""
|
marker = ""
|
||||||
enumerated = False
|
enumerated = False
|
||||||
@ -337,7 +340,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
elif isinstance(element.text, str):
|
elif isinstance(element.text, str):
|
||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
text = text.replace("\n", " ").replace("\r", "")
|
text = text.replace("\n", " ").replace("\r", "")
|
||||||
text = re.sub(r'\s{2,}', ' ', text)
|
text = re.sub(r"\s{2,}", " ", text)
|
||||||
|
|
||||||
marker = ""
|
marker = ""
|
||||||
enumerated = False
|
enumerated = False
|
||||||
|
Loading…
Reference in New Issue
Block a user