mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Update all backends with proper filename in DocumentOrigin
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
789b29bb24
commit
b1a2af6d39
@ -13,6 +13,7 @@ if TYPE_CHECKING:
|
||||
class AbstractDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
self.file = in_doc.file
|
||||
self.path_or_stream = path_or_stream
|
||||
self.document_hash = in_doc.document_hash
|
||||
self.input_format = in_doc.format
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
@ -67,21 +68,13 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
Parses the AsciiDoc input into a structured document model.
|
||||
"""
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
filename=self.file.name or "file",
|
||||
mimetype="text/asciidoc",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
|
||||
doc = DoclingDocument(name=docname, origin=origin)
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
|
||||
doc = self._parse(doc)
|
||||
|
||||
@ -138,9 +131,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
# Lists
|
||||
elif self._is_list_item(line):
|
||||
|
||||
print("line: ", line)
|
||||
_log.debug(f"line: {line}")
|
||||
item = self._parse_list_item(line)
|
||||
print("parsed list-item: ", item)
|
||||
_log.debug(f"parsed list-item: {item}")
|
||||
|
||||
level = self._get_current_level(parents)
|
||||
|
||||
@ -160,9 +153,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif in_list and item["indent"] < indents[level]:
|
||||
|
||||
print(item["indent"], " => ", indents[level])
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
while item["indent"] < indents[level]:
|
||||
print(item["indent"], " => ", indents[level])
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
parents[level] = None
|
||||
indents[level] = None
|
||||
level -= 1
|
||||
@ -217,7 +210,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
caption_data = []
|
||||
|
||||
item = self._parse_picture(line)
|
||||
print(item)
|
||||
|
||||
size = None
|
||||
if "width" in item and "height" in item:
|
||||
@ -355,7 +347,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
# Fallback if no match
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": item_marker,
|
||||
"marker": "-",
|
||||
"text": line,
|
||||
"numbered": False,
|
||||
"indent": 0,
|
||||
|
@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
# access self.path_or_stream to load stuff
|
||||
doc = DoclingDocument(name="dummy")
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "file",
|
||||
mimetype="text/html",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
_log.debug("Trying to convert HTML...")
|
||||
|
||||
if self.is_valid():
|
||||
|
@ -237,21 +237,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def convert(self) -> DoclingDocument:
|
||||
_log.debug("converting Markdown...")
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
filename=self.file.name or "file",
|
||||
mimetype="text/markdown",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
|
||||
doc = DoclingDocument(name=docname, origin=origin)
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
|
||||
if self.is_valid():
|
||||
# Parse the markdown into an abstract syntax tree (AST)
|
||||
|
@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
# Parses the PPTX into a structured document model.
|
||||
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
filename=self.file.name or "file",
|
||||
mimetype="application/vnd.ms-powerpoint",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
|
||||
doc = DoclingDocument(
|
||||
name=docname, origin=origin
|
||||
name=self.file.stem or "file", origin=origin
|
||||
) # must add origin information
|
||||
doc = self.walk_linear(self.pptx_obj, doc)
|
||||
|
||||
|
@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def convert(self) -> DoclingDocument:
|
||||
# Parses the DOCX into a structured document model.
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
filename=self.file.name or "file",
|
||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
doc = DoclingDocument(name=docname, origin=origin)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
if self.is_valid():
|
||||
assert self.docx_obj is not None
|
||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
|
@ -51,27 +51,27 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||
InputFormat.DOCX: {
|
||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.DOCX: [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
||||
},
|
||||
InputFormat.PPTX: {
|
||||
],
|
||||
InputFormat.PPTX: [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
},
|
||||
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
|
||||
InputFormat.IMAGE: {
|
||||
],
|
||||
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
||||
InputFormat.IMAGE: [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/gif",
|
||||
"image/bmp",
|
||||
},
|
||||
InputFormat.PDF: {"application/pdf"},
|
||||
InputFormat.ASCIIDOC: {"text/asciidoc"},
|
||||
InputFormat.MD: {"text/markdown", "text/x-markdown"},
|
||||
],
|
||||
InputFormat.PDF: ["application/pdf"],
|
||||
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
||||
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
||||
}
|
||||
MimeTypeToFormat = {
|
||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||
|
@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
FormatToExtensions,
|
||||
FormatToMimeType,
|
||||
InputFormat,
|
||||
MimeTypeToFormat,
|
||||
Page,
|
||||
@ -480,28 +482,48 @@ class _DocumentConversionInput(BaseModel):
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||
|
||||
def _guess_format(self, obj):
|
||||
content = None
|
||||
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
||||
content = b"" # empty binary blob
|
||||
format = None
|
||||
|
||||
if isinstance(obj, Path):
|
||||
mime = filetype.guess_mime(str(obj))
|
||||
if mime is None:
|
||||
ext = obj.suffix[1:]
|
||||
mime = self._mime_from_extension(ext)
|
||||
if mime is None:  # must guess from the file content
|
||||
with obj.open("rb") as f:
|
||||
content = f.read(1024) # Read first 1KB
|
||||
|
||||
elif isinstance(obj, DocumentStream):
|
||||
obj.stream.seek(0)
|
||||
content = obj.stream.read(8192)
|
||||
obj.stream.seek(0)
|
||||
mime = filetype.guess_mime(content)
|
||||
if mime is None:
|
||||
ext = (
|
||||
obj.name.rsplit(".", 1)[-1]
|
||||
if ("." in obj.name and not obj.name.startswith("."))
|
||||
else ""
|
||||
)
|
||||
mime = self._mime_from_extension(ext)
|
||||
|
||||
if mime is None:
|
||||
mime = self._detect_html_xhtml(content)
|
||||
if mime is None:
|
||||
mime = "text/markdown"
|
||||
mime = mime or self._detect_html_xhtml(content)
|
||||
mime = mime or "text/plain"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
return format
|
||||
|
||||
def _mime_from_extension(self, ext):
    """Map a file extension to the primary MIME type of a text-based format.

    Only the formats that cannot be recognised from magic bytes are
    checked here: AsciiDoc, HTML and Markdown, in that order.

    Args:
        ext: File extension without the leading dot (e.g. ``"md"``).

    Returns:
        The first MIME type listed in ``FormatToMimeType`` for the matching
        format, or ``None`` when the extension belongs to none of them.
    """
    # The extension table is assumed to hold lower-case entries (the
    # AsciiDoc entry visibly does); normalise so that names such as
    # "README.MD" or "INDEX.HTML" are still recognised.
    ext = ext.lower()

    # Same priority order as the original if/elif chain.
    for fmt in (InputFormat.ASCIIDOC, InputFormat.HTML, InputFormat.MD):
        if ext in FormatToExtensions[fmt]:
            return FormatToMimeType[fmt][0]

    return None
|
||||
|
||||
def _detect_html_xhtml(self, content):
|
||||
content_str = content.decode("ascii", errors="ignore").lower()
|
||||
# Remove XML comments
|
||||
|
@ -1,11 +1,13 @@
|
||||
import json
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
PdfFormatOption,
|
||||
@ -19,18 +21,24 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
def main():
|
||||
input_paths = [
|
||||
Path("README.md"),
|
||||
Path("tests/data/wiki_duck.html"),
|
||||
Path("tests/data/word_sample.docx"),
|
||||
Path("tests/data/word_nested.docx"),
|
||||
Path("tests/data/lorem_ipsum.docx"),
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
Path("tests/data/test_01.asciidoc"),
|
||||
Path("tests/data/test_01.asciidoc"),
|
||||
Path("tests/data/test_02.asciidoc"),
|
||||
Path("README.md"),
|
||||
]
|
||||
|
||||
# To read from bytes instead:
|
||||
# docs = [
|
||||
# DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
|
||||
# for f in input_paths
|
||||
# ]
|
||||
|
||||
## for defaults use:
|
||||
# doc_converter = DocumentConverter()
|
||||
|
||||
@ -49,7 +57,8 @@ def main():
|
||||
], # whitelist formats, non-matching files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
||||
pipeline_cls=StandardPdfPipeline,
|
||||
backend=DoclingParseDocumentBackend,
|
||||
),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||
@ -59,6 +68,7 @@ def main():
|
||||
)
|
||||
|
||||
conv_results = doc_converter.convert_all(input_paths)
|
||||
# conv_results = doc_converter.convert_all(docs)
|
||||
|
||||
for res in conv_results:
|
||||
out_path = Path("scratch")
|
||||
|
Loading…
Reference in New Issue
Block a user