From 8c60dfa0e66dc279020b96777597f6df9cce5741 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Mon, 21 Oct 2024 16:42:18 +0200 Subject: [PATCH] Fixed example run_md, added origin info to md_backend Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 18 +++++++++++++++++- docs/examples/run_md.py | 9 +-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index e59dd51e..5e5f7496 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -10,6 +10,7 @@ import marko.inline from docling_core.types.doc import ( DocItemLabel, DoclingDocument, + DocumentOrigin, GroupLabel, TableCell, TableData, @@ -235,7 +236,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): def convert(self) -> DoclingDocument: _log.debug("converting Markdown...") - doc = DoclingDocument(name="Test") + + fname = "" + if isinstance(self.path_or_stream, Path): + fname = self.path_or_stream.name + + origin = DocumentOrigin( + filename=fname, + mimetype="text/markdown", + binary_hash=self.document_hash, + ) + if len(fname) > 0: + docname = Path(fname).stem + else: + docname = "stream" + + doc = DoclingDocument(name=docname, origin=origin) if self.is_valid(): # Parse the markdown into an abstract syntax tree (AST) diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py index d6fb9b11..46be97e2 100644 --- a/docs/examples/run_md.py +++ b/docs/examples/run_md.py @@ -13,14 +13,7 @@ _log = logging.getLogger(__name__) def main(): - input_paths = [ - Path("README.md"), - Path("scratch_a/2203.01017v2.md"), - Path("scratch_a/2206.01062.md"), - Path("scratch_a/2305.03393v1.md"), - Path("scratch_a/redp5110.md"), - Path("scratch_a/redp5695.md"), - ] + input_paths = [Path("README.md")] for path in input_paths: in_doc = InputDocument(