mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
Update all backends with proper filename in DocumentOrigin
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
789b29bb24
commit
b1a2af6d39
@ -13,6 +13,7 @@ if TYPE_CHECKING:
|
|||||||
class AbstractDocumentBackend(ABC):
|
class AbstractDocumentBackend(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
|
self.file = in_doc.file
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
self.document_hash = in_doc.document_hash
|
self.document_hash = in_doc.document_hash
|
||||||
self.input_format = in_doc.format
|
self.input_format = in_doc.format
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -67,21 +68,13 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
Parses the ASCII into a structured document model.
|
Parses the ASCII into a structured document model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
fname = ""
|
|
||||||
if isinstance(self.path_or_stream, Path):
|
|
||||||
fname = self.path_or_stream.name
|
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=fname,
|
filename=self.file.name or "file",
|
||||||
mimetype="text/asciidoc",
|
mimetype="text/asciidoc",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
if len(fname) > 0:
|
|
||||||
docname = Path(fname).stem
|
|
||||||
else:
|
|
||||||
docname = "stream"
|
|
||||||
|
|
||||||
doc = DoclingDocument(name=docname, origin=origin)
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
|
|
||||||
doc = self._parse(doc)
|
doc = self._parse(doc)
|
||||||
|
|
||||||
@ -138,9 +131,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
# Lists
|
# Lists
|
||||||
elif self._is_list_item(line):
|
elif self._is_list_item(line):
|
||||||
|
|
||||||
print("line: ", line)
|
_log.debug(f"line: {line}")
|
||||||
item = self._parse_list_item(line)
|
item = self._parse_list_item(line)
|
||||||
print("parsed list-item: ", item)
|
_log.debug(f"parsed list-item: {item}")
|
||||||
|
|
||||||
level = self._get_current_level(parents)
|
level = self._get_current_level(parents)
|
||||||
|
|
||||||
@ -160,9 +153,9 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
elif in_list and item["indent"] < indents[level]:
|
elif in_list and item["indent"] < indents[level]:
|
||||||
|
|
||||||
print(item["indent"], " => ", indents[level])
|
# print(item["indent"], " => ", indents[level])
|
||||||
while item["indent"] < indents[level]:
|
while item["indent"] < indents[level]:
|
||||||
print(item["indent"], " => ", indents[level])
|
# print(item["indent"], " => ", indents[level])
|
||||||
parents[level] = None
|
parents[level] = None
|
||||||
indents[level] = None
|
indents[level] = None
|
||||||
level -= 1
|
level -= 1
|
||||||
@ -217,7 +210,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
caption_data = []
|
caption_data = []
|
||||||
|
|
||||||
item = self._parse_picture(line)
|
item = self._parse_picture(line)
|
||||||
print(item)
|
|
||||||
|
|
||||||
size = None
|
size = None
|
||||||
if "width" in item and "height" in item:
|
if "width" in item and "height" in item:
|
||||||
@ -355,7 +347,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
|||||||
# Fallback if no match
|
# Fallback if no match
|
||||||
return {
|
return {
|
||||||
"type": "list_item",
|
"type": "list_item",
|
||||||
"marker": item_marker,
|
"marker": "-",
|
||||||
"text": line,
|
"text": line,
|
||||||
"numbered": False,
|
"numbered": False,
|
||||||
"indent": 0,
|
"indent": 0,
|
||||||
|
@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
|
|||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
GroupLabel,
|
GroupLabel,
|
||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# access self.path_or_stream to load stuff
|
# access self.path_or_stream to load stuff
|
||||||
doc = DoclingDocument(name="dummy")
|
origin = DocumentOrigin(
|
||||||
|
filename=self.file.name or "file",
|
||||||
|
mimetype="text/html",
|
||||||
|
binary_hash=self.document_hash,
|
||||||
|
)
|
||||||
|
|
||||||
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
_log.debug("Trying to convert HTML...")
|
_log.debug("Trying to convert HTML...")
|
||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
|
@ -237,21 +237,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
_log.debug("converting Markdown...")
|
_log.debug("converting Markdown...")
|
||||||
|
|
||||||
fname = ""
|
|
||||||
if isinstance(self.path_or_stream, Path):
|
|
||||||
fname = self.path_or_stream.name
|
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=fname,
|
filename=self.file.name or "file",
|
||||||
mimetype="text/markdown",
|
mimetype="text/markdown",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
if len(fname) > 0:
|
|
||||||
docname = Path(fname).stem
|
|
||||||
else:
|
|
||||||
docname = "stream"
|
|
||||||
|
|
||||||
doc = DoclingDocument(name=docname, origin=origin)
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
# Parse the markdown into an abstract syntax tree (AST)
|
# Parse the markdown into an abstract syntax tree (AST)
|
||||||
|
@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
# Parses the PPTX into a structured document model.
|
# Parses the PPTX into a structured document model.
|
||||||
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
||||||
|
|
||||||
fname = ""
|
|
||||||
if isinstance(self.path_or_stream, Path):
|
|
||||||
fname = self.path_or_stream.name
|
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=fname,
|
filename=self.file.name or "file",
|
||||||
mimetype="application/vnd.ms-powerpoint",
|
mimetype="application/vnd.ms-powerpoint",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
if len(fname) > 0:
|
|
||||||
docname = Path(fname).stem
|
|
||||||
else:
|
|
||||||
docname = "stream"
|
|
||||||
doc = DoclingDocument(
|
doc = DoclingDocument(
|
||||||
name=docname, origin=origin
|
name=self.file.stem or "file", origin=origin
|
||||||
) # must add origin information
|
) # must add origin information
|
||||||
doc = self.walk_linear(self.pptx_obj, doc)
|
doc = self.walk_linear(self.pptx_obj, doc)
|
||||||
|
|
||||||
|
@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def convert(self) -> DoclingDocument:
|
def convert(self) -> DoclingDocument:
|
||||||
# Parses the DOCX into a structured document model.
|
# Parses the DOCX into a structured document model.
|
||||||
|
|
||||||
fname = ""
|
|
||||||
if isinstance(self.path_or_stream, Path):
|
|
||||||
fname = self.path_or_stream.name
|
|
||||||
|
|
||||||
origin = DocumentOrigin(
|
origin = DocumentOrigin(
|
||||||
filename=fname,
|
filename=self.file.name or "file",
|
||||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
binary_hash=self.document_hash,
|
binary_hash=self.document_hash,
|
||||||
)
|
)
|
||||||
if len(fname) > 0:
|
|
||||||
docname = Path(fname).stem
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
else:
|
|
||||||
docname = "stream"
|
|
||||||
doc = DoclingDocument(name=docname, origin=origin)
|
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
|
@ -51,27 +51,27 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||||
}
|
}
|
||||||
|
|
||||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||||
InputFormat.DOCX: {
|
InputFormat.DOCX: [
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
||||||
},
|
],
|
||||||
InputFormat.PPTX: {
|
InputFormat.PPTX: [
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
},
|
],
|
||||||
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
|
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
||||||
InputFormat.IMAGE: {
|
InputFormat.IMAGE: [
|
||||||
"image/png",
|
"image/png",
|
||||||
"image/jpeg",
|
"image/jpeg",
|
||||||
"image/tiff",
|
"image/tiff",
|
||||||
"image/gif",
|
"image/gif",
|
||||||
"image/bmp",
|
"image/bmp",
|
||||||
},
|
],
|
||||||
InputFormat.PDF: {"application/pdf"},
|
InputFormat.PDF: ["application/pdf"],
|
||||||
InputFormat.ASCIIDOC: {"text/asciidoc"},
|
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
||||||
InputFormat.MD: {"text/markdown", "text/x-markdown"},
|
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
||||||
}
|
}
|
||||||
MimeTypeToFormat = {
|
MimeTypeToFormat = {
|
||||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||||
|
@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
|
|||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
ErrorItem,
|
ErrorItem,
|
||||||
|
FormatToExtensions,
|
||||||
|
FormatToMimeType,
|
||||||
InputFormat,
|
InputFormat,
|
||||||
MimeTypeToFormat,
|
MimeTypeToFormat,
|
||||||
Page,
|
Page,
|
||||||
@ -480,28 +482,48 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
else:
|
else:
|
||||||
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||||
|
|
||||||
def _guess_format(self, obj):
|
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
||||||
content = None
|
content = b"" # empty binary blob
|
||||||
|
format = None
|
||||||
|
|
||||||
if isinstance(obj, Path):
|
if isinstance(obj, Path):
|
||||||
mime = filetype.guess_mime(str(obj))
|
mime = filetype.guess_mime(str(obj))
|
||||||
if mime is None:
|
if mime is None:
|
||||||
|
ext = obj.suffix[1:]
|
||||||
|
mime = self._mime_from_extension(ext)
|
||||||
|
if mime is None: # must guess from
|
||||||
with obj.open("rb") as f:
|
with obj.open("rb") as f:
|
||||||
content = f.read(1024) # Read first 1KB
|
content = f.read(1024) # Read first 1KB
|
||||||
|
|
||||||
elif isinstance(obj, DocumentStream):
|
elif isinstance(obj, DocumentStream):
|
||||||
obj.stream.seek(0)
|
|
||||||
content = obj.stream.read(8192)
|
content = obj.stream.read(8192)
|
||||||
obj.stream.seek(0)
|
obj.stream.seek(0)
|
||||||
mime = filetype.guess_mime(content)
|
mime = filetype.guess_mime(content)
|
||||||
|
if mime is None:
|
||||||
|
ext = (
|
||||||
|
obj.name.rsplit(".", 1)[-1]
|
||||||
|
if ("." in obj.name and not obj.name.startswith("."))
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
mime = self._mime_from_extension(ext)
|
||||||
|
|
||||||
if mime is None:
|
mime = mime or self._detect_html_xhtml(content)
|
||||||
mime = self._detect_html_xhtml(content)
|
mime = mime or "text/plain"
|
||||||
if mime is None:
|
|
||||||
mime = "text/markdown"
|
|
||||||
|
|
||||||
format = MimeTypeToFormat.get(mime)
|
format = MimeTypeToFormat.get(mime)
|
||||||
return format
|
return format
|
||||||
|
|
||||||
|
def _mime_from_extension(self, ext):
|
||||||
|
mime = None
|
||||||
|
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
||||||
|
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
||||||
|
elif ext in FormatToExtensions[InputFormat.HTML]:
|
||||||
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
||||||
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
||||||
|
mime = FormatToMimeType[InputFormat.MD][0]
|
||||||
|
|
||||||
|
return mime
|
||||||
|
|
||||||
def _detect_html_xhtml(self, content):
|
def _detect_html_xhtml(self, content):
|
||||||
content_str = content.decode("ascii", errors="ignore").lower()
|
content_str = content.decode("ascii", errors="ignore").lower()
|
||||||
# Remove XML comments
|
# Remove XML comments
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.document_converter import (
|
from docling.document_converter import (
|
||||||
DocumentConverter,
|
DocumentConverter,
|
||||||
PdfFormatOption,
|
PdfFormatOption,
|
||||||
@ -19,18 +21,24 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
input_paths = [
|
input_paths = [
|
||||||
Path("README.md"),
|
|
||||||
Path("tests/data/wiki_duck.html"),
|
Path("tests/data/wiki_duck.html"),
|
||||||
Path("tests/data/word_sample.docx"),
|
Path("tests/data/word_sample.docx"),
|
||||||
|
Path("tests/data/word_nested.docx"),
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
Path("tests/data/powerpoint_sample.pptx"),
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
Path("tests/data/test_01.asciidoc"),
|
Path("tests/data/test_01.asciidoc"),
|
||||||
Path("tests/data/test_01.asciidoc"),
|
Path("tests/data/test_02.asciidoc"),
|
||||||
Path("README.md"),
|
Path("README.md"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# To read from bytes instead:
|
||||||
|
# docs = [
|
||||||
|
# DocumentStream(name=f.name, stream=BytesIO(f.open("rb").read()))
|
||||||
|
# for f in input_paths
|
||||||
|
# ]
|
||||||
|
|
||||||
## for defaults use:
|
## for defaults use:
|
||||||
# doc_converter = DocumentConverter()
|
# doc_converter = DocumentConverter()
|
||||||
|
|
||||||
@ -49,7 +57,8 @@ def main():
|
|||||||
], # whitelist formats, non-matching files are ignored.
|
], # whitelist formats, non-matching files are ignored.
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend
|
pipeline_cls=StandardPdfPipeline,
|
||||||
|
backend=DoclingParseDocumentBackend,
|
||||||
),
|
),
|
||||||
InputFormat.DOCX: WordFormatOption(
|
InputFormat.DOCX: WordFormatOption(
|
||||||
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
pipeline_cls=SimplePipeline # , backend=MsWordDocumentBackend
|
||||||
@ -59,6 +68,7 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(input_paths)
|
conv_results = doc_converter.convert_all(input_paths)
|
||||||
|
# conv_results = doc_converter.convert_all(docs)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
|
Loading…
Reference in New Issue
Block a user