Fixed example run_md, added origin info to md_backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-21 16:42:18 +02:00
parent 1456a36618
commit 8c60dfa0e6
2 changed files with 18 additions and 9 deletions

View File

@@ -10,6 +10,7 @@ import marko.inline
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin,
GroupLabel, GroupLabel,
TableCell, TableCell,
TableData, TableData,
@@ -235,7 +236,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
_log.debug("converting Markdown...") _log.debug("converting Markdown...")
doc = DoclingDocument(name="Test")
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
mimetype="text/markdown",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
if self.is_valid(): if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST) # Parse the markdown into an abstract syntax tree (AST)

View File

@@ -13,14 +13,7 @@ _log = logging.getLogger(__name__)
def main(): def main():
input_paths = [ input_paths = [Path("README.md")]
Path("README.md"),
Path("scratch_a/2203.01017v2.md"),
Path("scratch_a/2206.01062.md"),
Path("scratch_a/2305.03393v1.md"),
Path("scratch_a/redp5110.md"),
Path("scratch_a/redp5695.md"),
]
for path in input_paths: for path in input_paths:
in_doc = InputDocument( in_doc = InputDocument(