diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 5bfc02a2..b47b11cd 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: class AbstractDocumentBackend(ABC): @abstractmethod def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + self.file = in_doc.file self.path_or_stream = path_or_stream self.document_hash = in_doc.document_hash self.input_format = in_doc.format diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 5ece3813..c9d2fc52 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -1,4 +1,5 @@ import logging +import os import re from io import BytesIO from pathlib import Path @@ -67,21 +68,13 @@ class AsciiDocBackend(DeclarativeDocumentBackend): Parses the ASCII into a structured document model. """ - fname = "" - if isinstance(self.path_or_stream, Path): - fname = self.path_or_stream.name - origin = DocumentOrigin( - filename=fname, + filename=self.file.name or "file", mimetype="text/asciidoc", binary_hash=self.document_hash, ) - if len(fname) > 0: - docname = Path(fname).stem - else: - docname = "stream" - doc = DoclingDocument(name=docname, origin=origin) + doc = DoclingDocument(name=self.file.stem or "file", origin=origin) doc = self._parse(doc) @@ -138,9 +131,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend): # Lists elif self._is_list_item(line): - print("line: ", line) + _log.debug(f"line: {line}") item = self._parse_list_item(line) - print("parsed list-item: ", item) + _log.debug(f"parsed list-item: {item}") level = self._get_current_level(parents) @@ -160,9 +153,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend): elif in_list and item["indent"] < indents[level]: - print(item["indent"], " => ", indents[level]) + # print(item["indent"], " => ", indents[level]) while item["indent"] < indents[level]: - print(item["indent"], " => ", indents[level]) + # print(item["indent"], " => ", indents[level]) parents[level] = None indents[level] = None level -= 1 @@ -217,7 +210,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): caption_data = [] item = self._parse_picture(line) - print(item) size = None if "width" in item and "height" in item: @@ -355,7 +347,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): # Fallback if no match return { "type": "list_item", - "marker": item_marker, + "marker": "-", "text": line, "numbered": False, "indent": 0, diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index bd098c95..7bae3463 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup from docling_core.types.doc import ( DocItemLabel, DoclingDocument, + DocumentOrigin, GroupLabel, TableCell, TableData, @@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # access self.path_or_stream to load stuff - doc = DoclingDocument(name="dummy") + origin = DocumentOrigin( + filename=self.file.name or "file", + mimetype="text/html", + binary_hash=self.document_hash, + ) + + doc = DoclingDocument(name=self.file.stem or "file", origin=origin) _log.debug("Trying to convert HTML...") if self.is_valid(): diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 5f326065..0f51b052 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -237,21 +237,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: _log.debug("converting Markdown...") - fname = "" - if isinstance(self.path_or_stream, Path): - fname = self.path_or_stream.name - origin = DocumentOrigin( - filename=fname, + filename=self.file.name or "file", mimetype="text/markdown", binary_hash=self.document_hash, ) - if len(fname) > 0: - docname = Path(fname).stem - else: - docname = "stream" - doc = DoclingDocument(name=docname, origin=origin) + doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): # Parse the markdown into an abstract syntax tree (AST) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 0adebb15..0544cc9c 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB # Parses the PPTX into a structured document model. # origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash) - fname = "" - if isinstance(self.path_or_stream, Path): - fname = self.path_or_stream.name - origin = DocumentOrigin( - filename=fname, + filename=self.file.name or "file", mimetype="application/vnd.ms-powerpoint", binary_hash=self.document_hash, ) - if len(fname) > 0: - docname = Path(fname).stem - else: - docname = "stream" + doc = DoclingDocument( - name=docname, origin=origin + name=self.file.stem or "file", origin=origin ) # must add origin information doc = self.walk_linear(self.pptx_obj, doc) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5b5420f9..5b166d5b 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: # Parses the DOCX into a structured document model. - fname = "" - if isinstance(self.path_or_stream, Path): - fname = self.path_or_stream.name - origin = DocumentOrigin( - filename=fname, + filename=self.file.name or "file", mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document", binary_hash=self.document_hash, ) - if len(fname) > 0: - docname = Path(fname).stem - else: - docname = "stream" - doc = DoclingDocument(name=docname, origin=origin) + + doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index f80056c8..a82d86a5 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -51,27 +51,27 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], } -FormatToMimeType: Dict[InputFormat, Set[str]] = { - InputFormat.DOCX: { +FormatToMimeType: Dict[InputFormat, List[str]] = { + InputFormat.DOCX: [ "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.template", - }, - InputFormat.PPTX: { + ], + InputFormat.PPTX: [ "application/vnd.openxmlformats-officedocument.presentationml.template", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.openxmlformats-officedocument.presentationml.presentation", - }, - InputFormat.HTML: {"text/html", "application/xhtml+xml"}, - InputFormat.IMAGE: { + ], + InputFormat.HTML: ["text/html", "application/xhtml+xml"], + InputFormat.IMAGE: [ "image/png", "image/jpeg", "image/tiff", "image/gif", "image/bmp", - }, - InputFormat.PDF: {"application/pdf"}, - InputFormat.ASCIIDOC: {"text/asciidoc"}, - InputFormat.MD: {"text/markdown", "text/x-markdown"}, + ], + InputFormat.PDF: ["application/pdf"], + InputFormat.ASCIIDOC: ["text/asciidoc"], + InputFormat.MD: ["text/markdown", "text/x-markdown"], } MimeTypeToFormat = { mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index be213bc5..e1ecf17f 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -45,6 +45,8 @@ from docling.datamodel.base_models import ( ConversionStatus, DocumentStream, ErrorItem, + FormatToExtensions, + FormatToMimeType, InputFormat, MimeTypeToFormat, Page, @@ -480,28 +482,48 @@ class _DocumentConversionInput(BaseModel): else: raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}") - def _guess_format(self, obj): - content = None + def _guess_format(self, obj: Union[Path, DocumentStream]): + content = b"" # empty binary blob + format = None + if isinstance(obj, Path): mime = filetype.guess_mime(str(obj)) if mime is None: + ext = obj.suffix[1:] + mime = self._mime_from_extension(ext) + if mime is None: # must guess from with obj.open("rb") as f: content = f.read(1024) # Read first 1KB elif isinstance(obj, DocumentStream): - obj.stream.seek(0) content = obj.stream.read(8192) obj.stream.seek(0) mime = filetype.guess_mime(content) + if mime is None: + ext = ( + obj.name.rsplit(".", 1)[-1] + if ("." in obj.name and not obj.name.startswith(".")) + else "" + ) + mime = self._mime_from_extension(ext) - if mime is None: - mime = self._detect_html_xhtml(content) - if mime is None: - mime = "text/markdown" + mime = mime or self._detect_html_xhtml(content) + mime = mime or "text/plain" format = MimeTypeToFormat.get(mime) return format + def _mime_from_extension(self, ext): + mime = None + if ext in FormatToExtensions[InputFormat.ASCIIDOC]: + mime = FormatToMimeType[InputFormat.ASCIIDOC][0] + elif ext in FormatToExtensions[InputFormat.HTML]: + mime = FormatToMimeType[InputFormat.HTML][0] + elif ext in FormatToExtensions[InputFormat.MD]: + mime = FormatToMimeType[InputFormat.MD][0] + + return mime + def _detect_html_xhtml(self, content): content_str = content.decode("ascii", errors="ignore").lower() # Remove XML comments diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py index bb3d6722..80384f6d 100644 --- a/docs/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -1,11 +1,13 @@ import json import logging +from io import BytesIO from pathlib import Path import yaml +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import InputFormat +from docling.datamodel.base_models import DocumentStream, InputFormat from docling.document_converter import ( DocumentConverter, PdfFormatOption, @@ -19,18 +21,24 @@ _log = logging.getLogger(__name__) def main(): input_paths = [ - Path("README.md"), Path("tests/data/wiki_duck.html"), Path("tests/data/word_sample.docx"), + Path("tests/data/word_nested.docx"), Path("tests/data/lorem_ipsum.docx"), Path("tests/data/powerpoint_sample.pptx"), Path("tests/data/2305.03393v1-pg9-img.png"), Path("tests/data/2206.01062.pdf"), Path("tests/data/test_01.asciidoc"), - Path("tests/data/test_01.asciidoc"), + Path("tests/data/test_02.asciidoc"), Path("README.md"), ] + # To read from bytes instead: + # docs = [ + # DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read())) + # for f in input_paths + # ] + ## for defaults use: # doc_converter = DocumentConverter() @@ -49,7 +57,8 @@ def main(): ], # whitelist formats, non-matching files are ignored. format_options={ InputFormat.PDF: PdfFormatOption( - pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend + pipeline_cls=StandardPdfPipeline, + backend=DoclingParseDocumentBackend, ), InputFormat.DOCX: WordFormatOption( pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend @@ -59,6 +68,7 @@ def main(): ) conv_results = doc_converter.convert_all(input_paths) + # conv_results = doc_converter.convert_all(docs) for res in conv_results: out_path = Path("scratch")