Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Michele Dolfi 2024-10-11 12:59:11 +02:00
commit 786b89efd9
12 changed files with 80 additions and 37 deletions

View File

@@ -203,7 +203,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
if not success:
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
f"docling-parse could not load document with hash {document_hash}."
)
def page_count(self) -> int:
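
Beyond the reworded message, this hunk shows the error-handling convention the branch standardizes on: backends fail fast with a RuntimeError that names the document hash, chaining the original exception with raise ... from. A minimal sketch of that convention (the backend class and its loading logic are illustrative, not the docling API):

from io import BytesIO
from pathlib import Path
from typing import Union

class FailFastBackendSketch:
    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        try:
            # Stand-in for the real parser call (pdfium, docx.Document, ...).
            self.raw = (
                path_or_stream.getvalue()
                if isinstance(path_or_stream, BytesIO)
                else Path(path_or_stream).read_bytes()
            )
        except Exception as e:
            # Chain the cause so the parser's traceback is preserved.
            raise RuntimeError(
                f"backend could not load document with hash {document_hash}"
            ) from e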

View File

@@ -21,7 +21,7 @@ _log = logging.getLogger(__name__)
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
print("About to init HTML backend...")
_log.debug("About to init HTML backend...")
super().__init__(path_or_stream, document_hash)
self.soup = None
# HTML file:
@@ -36,16 +36,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = byte_stream.getvalue().decode("utf-8")
print(text_stream)
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
_log.error("could not parse html: {}".format(e))
return doc
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {document_hash}."
) from e
def is_valid(self) -> bool:
return True
@@ -66,7 +66,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
print("Trying to convert HTML...")
_log.debug("Trying to convert HTML...")
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")
@@ -93,7 +93,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def analyse_element(self, element, idx, doc):
"""
if element.name!=None:
print("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
"""
if element.name in self.labels:
@@ -323,7 +323,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data, parent=self.parents[self.level])
def get_list_text(list_element, level=0):
def get_list_text(self, list_element, level=0):
"""Recursively extract text from <ul> or <ol> with proper indentation."""
result = []
bullet_char = "*" # Default bullet character for unordered lists
@@ -335,7 +335,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
result.extend(get_list_text(nested_list, level + 1))
result.extend(self.get_list_text(nested_list, level + 1))
elif list_element.name == "ul": # For unordered lists, use bullet points
for li in list_element.find_all("li", recursive=False):
# Add bullet points for unordered lists
@@ -345,7 +345,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# Handle nested lists
nested_list = li.find(["ul", "ol"])
if nested_list:
result.extend(get_list_text(nested_list, level + 1))
result.extend(self.get_list_text(nested_list, level + 1))
return result
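
The two edits above exist because get_list_text became a method: inside a class, the function's own name is not in scope within its body, so recursion into nested lists must go through self. A self-contained sketch of the same recursion as a plain function (bullet and indentation choices are illustrative):

from bs4 import BeautifulSoup  # pip install beautifulsoup4

def get_list_text(list_element, level=0):
    """Recursively extract text from <ul>/<ol>, indenting nested lists."""
    result = []
    ordered = list_element.name == "ol"
    for i, li in enumerate(list_element.find_all("li", recursive=False), start=1):
        marker = f"{i}." if ordered else "*"
        # Take only the <li>'s direct text so nested items are not duplicated.
        own_text = "".join(li.find_all(string=True, recursive=False)).strip()
        result.append(f"{'    ' * level}{marker} {own_text}")
        nested_list = li.find(["ul", "ol"])
        if nested_list:
            result.extend(get_list_text(nested_list, level + 1))
    return result

soup = BeautifulSoup("<ul><li>alpha<ol><li>beta</li></ol></li></ul>", "html.parser")
print("\n".join(get_list_text(soup.find("ul"))))
# * alpha
#     1. beta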

View File

@@ -39,12 +39,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
self.path_or_stream = path_or_stream
self.pptx_obj = None
self.valid = True
self.valid = False
try:
self.pptx_obj = Presentation(self.path_or_stream)
self.valid = True
except Exception:
_log.error("could not parse pptx")
self.valid = False
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
return
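
Note the ordering of the validity flag in this hunk: it starts as False and flips to True only after Presentation() succeeds, so the backend can never report valid when the constructor bailed out part-way. The shape of the pattern, with a stand-in parser:

class ValidityFlagSketch:
    def __init__(self, payload: bytes):
        self.valid = False  # pessimistic default
        try:
            self.parsed = payload.decode("utf-8")  # stand-in for Presentation(...)
            self.valid = True  # reached only on success
        except Exception as e:
            raise RuntimeError("could not parse document") from e

    def is_valid(self) -> bool:
        return self.valid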

View File

@@ -34,6 +34,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# self.initialise(path_or_stream)
# Word file:
self.path_or_stream = path_or_stream
self.valid = False
# Initialise the parents for the hierarchy
self.max_levels = 10
self.level_at_new_list = None
@@ -50,6 +51,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
"indents": [None],
}
self.docx_obj = None
try:
self.docx_obj = docx.Document(self.path_or_stream)
self.valid = True
except Exception as e:
raise RuntimeError(
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
) from e
def is_valid(self) -> bool:
return True
@@ -69,15 +79,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
docx_obj = None
try:
docx_obj = docx.Document(self.path_or_stream)
except Exception:
_log.error("could not parse docx")
return doc
# self.initialise()
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
return doc
def update_history(self, name, level, numid, ilevel):
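
The docx change moves parsing from convert() into the constructor, so a broken file is rejected at load time and convert() can rely on self.docx_obj unconditionally. A hedged sketch of the restructuring, assuming python-docx:

import docx  # python-docx

class WordBackendSketch:
    def __init__(self, path_or_stream, document_hash: str):
        self.valid = False
        try:
            # Parse once at construction; convert() no longer needs a fallback path.
            self.docx_obj = docx.Document(path_or_stream)
            self.valid = True
        except Exception as e:
            raise RuntimeError(
                f"MsWordDocumentBackend could not load document with hash {document_hash}"
            ) from e

    def convert(self):
        # Safe: __init__ either set self.docx_obj or raised.
        return self.docx_obj.element.body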

View File

@@ -238,7 +238,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
f"pypdfium could not load document with hash {document_hash}"
) from e
def page_count(self) -> int:

View File

@@ -1,4 +1,5 @@
import logging
import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
@@ -171,7 +172,7 @@ class ConvertedDocument(BaseModel):
pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()
legacy_output: DsDocument = _EMPTY_LEGACY_DOC
legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
output: DoclingDocument = _EMPTY_DOCLING_DOC
def _to_legacy_document(self) -> DsDocument:
@@ -497,19 +498,40 @@ class DocumentConversionInput(BaseModel):
)
def _guess_format(self, obj):
content = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
if mime is None:
# TODO improve this.
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
if obj.suffix == ".html":
mime = "text/html"
elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
if mime is None:
mime = self._detect_html_xhtml(content)
format = MimeTypeToFormat.get(mime)
return format
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
content_str = content_str.lstrip()
if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"
return None
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]
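
HTML is the motivating case for the new sniffing fallback: it has no magic bytes, so filetype.guess_mime returns None and _guess_format now inspects the leading content itself (note the seek(0) calls that leave the stream rewound for the actual parser). A self-contained restatement of the detection helper:

import re

def detect_html_xhtml(content: bytes):
    content_str = content.decode("ascii", errors="ignore").lower()
    content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)  # strip comments
    content_str = content_str.lstrip()
    if re.match(r"<\?xml", content_str):
        if "xhtml" in content_str[:1000]:
            return "application/xhtml+xml"
    if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
        return "text/html"
    return None

assert detect_html_xhtml(b"<!-- note -->\n<!DOCTYPE html><html></html>") == "text/html"
assert detect_html_xhtml(b'<?xml version="1.0"?><html xmlns="http://www.w3.org/1999/xhtml">') == "application/xhtml+xml"
assert detect_html_xhtml(b"%PDF-1.7") is None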

View File

@@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
)
class PipelineOptions(BaseModel): ...
class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True  # This default will be set to False in a future version of docling
)
class PdfPipelineOptions(PipelineOptions):
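
Until the default flips, nothing changes for existing callers; consumers that have already migrated to DoclingDocument can opt out of the legacy build. A usage sketch (the import path is an assumption based on where these options live in this branch):

from docling.datamodel.pipeline_options import PdfPipelineOptions

opts = PdfPipelineOptions(create_legacy_output=False)  # skip DsDocument generation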

View File

@@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
class GlmModel:
def __init__(self, config):
self.config = config
self.create_legacy_output = config.get("create_legacy_output", True)
self.model_names = self.config.get(
"model_names", ""
) # "language;term;reference"
@@ -42,6 +44,9 @@ class GlmModel:
)
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
legacy_doc: Optional[DsLegacyDocument] = None
if self.create_legacy_output:
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code:
@@ -92,4 +97,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
return (legacy_doc, docling_doc)
return (docling_doc, legacy_doc)
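
Two coupled changes land here: legacy conversion now runs only when requested, and the tuple order flips so the primary DoclingDocument comes first. A stand-in illustrating the new call convention (not the real GlmModel internals):

from typing import Optional, Tuple

class GlmModelSketch:
    def __init__(self, config: dict):
        self.create_legacy_output = config.get("create_legacy_output", True)

    def __call__(self, ds_doc_dict: dict) -> Tuple[dict, Optional[dict]]:
        docling_doc = {"format": "docling", **ds_doc_dict}  # stand-in conversion
        legacy_doc = dict(ds_doc_dict) if self.create_legacy_output else None
        return (docling_doc, legacy_doc)  # primary first, optional legacy second

docling_doc, legacy_doc = GlmModelSketch({"create_legacy_output": False})({"name": "doc"})
assert legacy_doc is None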

View File

@@ -41,7 +41,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
artifacts_path = self.download_models_hf()
self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={})
self.glm_model = GlmModel(
config={"create_legacy_output": pipeline_options.create_legacy_output}
)
if (ocr_model := self.get_ocr_model()) is None:
raise RuntimeError(
@@ -140,7 +142,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
return conv_res
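
A subtlety worth flagging in the surrounding context: := binds looser than is, so without parentheses "ocr_model := self.get_ocr_model() is None" would assign the boolean comparison to ocr_model rather than the model itself. The parenthesized form assigns the value first, then tests it:

def get_model():
    return None  # simulate a missing OCR model

# Unparenthesized: the walrus captures the comparison result (a bool).
if flag := get_model() is None:
    assert flag is True

# Parenthesized: the walrus captures the value, which is then tested.
if (model := get_model()) is None:
    assert model is None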

View File

@@ -120,7 +120,7 @@ def main():
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()

View File

@@ -1,3 +1,4 @@
import json
import logging
from pathlib import Path
@@ -38,6 +39,7 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
InputFormat.PDF,
# InputFormat.IMAGE,
InputFormat.DOCX,
InputFormat.HTML,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
@@ -53,12 +55,15 @@
conv_results = doc_converter.convert_batch(input)
for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
out_path = Path("./scratch")
print(
f"Document {res.input.file.name} converted with status {res.status}."
f"\nSaved markdown output to: {str(out_path)}"
)
# print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental):
with out_path.open("w") as fp:
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.output.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.output.export_to_dict()))
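
Since DoclingDocument is a pydantic model, the JSON written above can plausibly be validated back into a document for a round-trip check. A hedged sketch that would sit inside the loop (assumes pydantic v2's model_validate; this is not part of the example in this commit):

# Round-trip the JSON export back into the model (assumption: pydantic v2).
json_file = out_path / f"{res.input.file.name}.json"
reloaded = type(res.output).model_validate(json.loads(json_file.read_text()))
print(f"Round-trip OK: {len(reloaded.export_to_markdown())} chars of markdown")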

Binary file not shown.