feat: Support AsciiDoc and Markdown input format (#168)

* updated the base-model and added the asciidoc_backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the asciidoc backend

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Ensure all models work only on valid pages (#158)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* ci: run ci also on forks (#160)


---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

* fix: fix legacy doc ref (#162)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* docs: typo fix (#155)

* Docs: Typo fix

- Corrected spelling of invidual to individual

Signed-off-by: ABHISHEK FADAKE <31249309+fadkeabhi@users.noreply.github.com>

* add synchronize event for forks

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: ABHISHEK FADAKE <31249309+fadkeabhi@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>

* feat: add coverage_threshold to skip OCR for small images (#161)

* feat: add coverage_threshold to skip OCR for small images

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* filter individual boxes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* chore: bump version to 2.1.0 [skip ci]

* adding tests for asciidocs

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* first working asciidoc parser

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the mypy

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* adding test_02.asciidoc

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Drafting Markdown backend via Marko library

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* work in progress on MD backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* md_backend produces docling document with headers, paragraphs, lists

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Improvements in md parsing

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Detecting and assembling tables in markdown in temporary buffers

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added initial docling table support to md_backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Cleaned code, improved logging for MD

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fixes MyPy requirements, and rest of pre-commit

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fixed example run_md, added origin info to md_backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* working on asciidocs, struggling with ImageRef

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* able to parse the captions and image URIs

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fixed the mypy

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Update all backends with proper filename in DocumentOrigin

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update to docling-core v2.1.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for MD Backend, to avoid duplicated text inserts into docling doc

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fix styling

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Added support for code blocks and fenced code in MD

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* cleaned prints

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added proper processing of in-line textual elements for MD backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fixed issues with duplicated paragraphs and incorrect lists in pptx

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Fixed issue with group ordering in pptx backend, added debug log into run with formats

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Signed-off-by: ABHISHEK FADAKE <31249309+fadkeabhi@users.noreply.github.com>
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Peter Staar <taa@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: ABHISHEK FADAKE <31249309+fadkeabhi@users.noreply.github.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
Commit 3023f18ba0 (parent 3496b4838f)
Authored by Christoph Auer on 2024-10-23 16:14:26 +02:00, committed by GitHub
52 changed files with 3731 additions and 3517 deletions

docling/backend/abstract_backend.py

@@ -13,6 +13,7 @@ if TYPE_CHECKING:
class AbstractDocumentBackend(ABC):
@abstractmethod
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
self.file = in_doc.file
self.path_or_stream = path_or_stream
self.document_hash = in_doc.document_hash
self.input_format = in_doc.format

docling/backend/asciidoc_backend.py (new file)

@@ -0,0 +1,435 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell,
TableData,
)
from pydantic import AnyUrl
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class AsciiDocBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.lines = text_stream.split("\n")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
self.lines = f.readlines()
self.valid = True
except Exception as e:
raise RuntimeError(
f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
) from e
return
def is_valid(self) -> bool:
return self.valid
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
return
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.ASCIIDOC}
def convert(self) -> DoclingDocument:
"""
Parses the AsciiDoc source into a structured document model.
"""
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="text/asciidoc",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
doc = self._parse(doc)
return doc
def _parse(self, doc: DoclingDocument):
"""
Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables.
"""
content = ""
in_list = False
in_table = False
text_data: list[str] = []
table_data: list[str] = []
caption_data: list[str] = []
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
parents: dict[int, Union[GroupItem, None]] = {}
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}
for i in range(0, 10):
parents[i] = None
indents[i] = None
for line in self.lines:
# line = line.strip()
# Title
if self._is_title(line):
item = self._parse_title(line)
level = item["level"]
parents[level] = doc.add_text(
text=item["text"], label=DocItemLabel.TITLE
)
# Section headers
elif self._is_section_header(line):
item = self._parse_section_header(line)
level = item["level"]
parents[level] = doc.add_heading(
text=item["text"], level=item["level"], parent=parents[level - 1]
)
for k, v in parents.items():
if k > level:
parents[k] = None
# Lists
elif self._is_list_item(line):
_log.debug(f"line: {line}")
item = self._parse_list_item(line)
_log.debug(f"parsed list-item: {item}")
level = self._get_current_level(parents)
if not in_list:
in_list = True
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] > indents[level]:
parents[level + 1] = doc.add_group(
parent=parents[level], name="list", label=GroupLabel.LIST
)
indents[level + 1] = item["indent"]
elif in_list and item["indent"] < indents[level]:
# print(item["indent"], " => ", indents[level])
while item["indent"] < indents[level]:
# print(item["indent"], " => ", indents[level])
parents[level] = None
indents[level] = None
level -= 1
doc.add_list_item(
item["text"], parent=self._get_current_parent(parents)
)
elif in_list and not self._is_list_item(line):
in_list = False
level = self._get_current_level(parents)
parents[level] = None
# Tables
elif line.strip() == "|===" and not in_table: # start of table
in_table = True
elif self._is_table_line(line): # within a table
in_table = True
table_data.append(self._parse_table_line(line))
elif in_table and (
(not self._is_table_line(line)) or line.strip() == "|==="
): # end of table
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
data = self._populate_table_as_grid(table_data)
doc.add_table(
data=data, parent=self._get_current_parent(parents), caption=caption
)
in_table = False
table_data = []
# Picture
elif self._is_picture(line):
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
text=" ".join(caption_data), label=DocItemLabel.CAPTION
)
caption_data = []
item = self._parse_picture(line)
size = None
if "width" in item and "height" in item:
size = Size(width=int(item["width"]), height=int(item["height"]))
uri = None
if (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("//")
):
uri = "file:" + item["uri"]
elif (
"uri" in item
and not item["uri"].startswith("http")
and item["uri"].startswith("/")
):
uri = "file:/" + item["uri"]
elif "uri" in item and not item["uri"].startswith("http"):
uri = "file://" + item["uri"]
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
doc.add_picture(image=image, caption=caption)
# Caption
elif self._is_caption(line) and len(caption_data) == 0:
item = self._parse_caption(line)
caption_data.append(item["text"])
elif (
len(line.strip()) > 0 and len(caption_data) > 0
): # allow multiline captions
item = self._parse_text(line)
caption_data.append(item["text"])
# Plain text
elif len(line.strip()) == 0 and len(text_data) > 0:
doc.add_text(
text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self._get_current_parent(parents),
)
text_data = []
elif len(line.strip()) > 0: # allow multiline texts
item = self._parse_text(line)
text_data.append(item["text"])
if len(text_data) > 0:
doc.add_text(
text=" ".join(text_data),
label=DocItemLabel.PARAGRAPH,
parent=self._get_current_parent(parents),
)
text_data = []
if in_table and len(table_data) > 0:
data = self._populate_table_as_grid(table_data)
doc.add_table(data=data, parent=self._get_current_parent(parents))
in_table = False
table_data = []
return doc
def _get_current_level(self, parents):
for k, v in parents.items():
if v is None and k > 0:
return k - 1
return 0
def _get_current_parent(self, parents):
for k, v in parents.items():
if v is None and k > 0:
return parents[k - 1]
return None
# ========= Title
def _is_title(self, line):
return re.match(r"^= ", line)
def _parse_title(self, line):
return {"type": "title", "text": line[2:].strip(), "level": 0}
# ========= Section headers
def _is_section_header(self, line):
return re.match(r"^==+", line)
def _parse_section_header(self, line):
match = re.match(r"^(=+)\s+(.*)", line)
marker = match.group(1)  # The header marker (e.g., "==", "===")
text = match.group(2)  # The actual text of the section header
header_level = marker.count("=") # number of '=' represents level
return {
"type": "header",
"level": header_level - 1,
"text": text.strip(),
}
# ========= Lists
def _is_list_item(self, line):
return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
def _parse_list_item(self, line):
"""Extract the item marker (number or bullet symbol) and the text of the item."""
match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
if match:
indent = match.group(1)
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
text = match.group(3) # The actual text of the list item
if marker == "*" or marker == "-":
return {
"type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
}
else:
return {
"type": "list_item",
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
}
else:
# Fallback if no match
return {
"type": "list_item",
"marker": "-",
"text": line,
"numbered": False,
"indent": 0,
}
# ========= Tables
def _is_table_line(self, line):
return re.match(r"^\|.*\|", line)
def _parse_table_line(self, line):
# Split table cells and trim extra spaces
return [cell.strip() for cell in line.split("|") if cell.strip()]
def _populate_table_as_grid(self, table_data):
num_rows = len(table_data)
# Adjust the table data into a grid format
num_cols = max(len(row) for row in table_data)
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
for row_idx, row in enumerate(table_data):
# Pad rows with empty strings to match column count
# grid.append(row + [''] * (max_cols - len(row)))
for col_idx, text in enumerate(row):
row_span = 1
col_span = 1
cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=False,
row_header=False,
)
data.table_cells.append(cell)
return data
# ========= Pictures
def _is_picture(self, line):
return re.match(r"^image::", line)
def _parse_picture(self, line):
"""
Parse an image macro, extracting its path and attributes.
Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
"""
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
if mtch:
picture_path = mtch.group(1).strip()
attributes = mtch.group(2).split(",")
picture_info = {"type": "picture", "uri": picture_path}
# Extract optional attributes (alt text, width, height, alignment)
if attributes:
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
for attr in attributes[1:]:
key, value = attr.split("=")
picture_info[key.strip()] = value.strip()
return picture_info
return {"type": "picture", "uri": line}
# ========= Captions
def _is_caption(self, line):
return re.match(r"^\.(.+)", line)
def _parse_caption(self, line):
mtch = re.match(r"^\.(.+)", line)
if mtch:
text = mtch.group(1)
return {"type": "caption", "text": text}
return {"type": "caption", "text": ""}
# ========= Plain text
def _parse_text(self, line):
return {"type": "text", "text": line.strip()}
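To exercise the new backend on its own, the tests added in this PR construct an InputDocument and call convert() on its backend. The sketch below follows that pattern, but the InputDocument keyword arguments, the _backend attribute access, and the fixture path are assumptions based on the repo's other backend tests, not part of this diff.

from pathlib import Path

from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

# Hypothetical fixture path (the PR mentions adding test_02.asciidoc to the test data).
in_doc = InputDocument(
    path_or_stream=Path("tests/data/test_02.asciidoc"),
    format=InputFormat.ASCIIDOC,
    backend=AsciiDocBackend,
)

doc = in_doc._backend.convert()  # AsciiDocBackend.convert() returns a DoclingDocument
print(doc.export_to_markdown())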

docling/backend/html_backend.py

@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
@@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(name="dummy")
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="text/html",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
_log.debug("Trying to convert HTML...")
if self.is_valid():

docling/backend/md_backend.py (new file)

@@ -0,0 +1,293 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Set, Union
import marko
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from marko import Markdown
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("MD INIT!!!")
# Markdown file:
self.path_or_stream = path_or_stream
self.valid = True
self.markdown = "" # To store original Markdown string
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_text_buffer = ""
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.markdown = text_stream
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
self.markdown = md_content
self.valid = True
_log.debug(self.markdown)
except Exception as e:
raise RuntimeError(
f"Could not initialize MD backend for file with hash {self.document_hash}."
) from e
return
def close_table(self, doc=None):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
tcells = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
if n == 0:
header = [t.strip() for t in md_table_row.split("|")[1:-1]]
for value in header:
data.append(value)
result_table.append(data)
if n > 1:
values = [t.strip() for t in md_table_row.split("|")[1:-1]]
for value in values:
data.append(value)
result_table.append(data)
for trow_ind, trow in enumerate(result_table):
for tcol_ind, cellval in enumerate(trow):
row_span = (
1 # currently supporting just simple tables (without spans)
)
col_span = (
1 # currently supporting just simple tables (without spans)
)
icell = TableCell(
text=cellval.strip(),
row_span=row_span,
col_span=col_span,
start_row_offset_idx=trow_ind,
end_row_offset_idx=trow_ind + row_span,
start_col_offset_idx=tcol_ind,
end_col_offset_idx=tcol_ind + col_span,
col_header=False,
row_header=False,
)
tcells.append(icell)
num_rows = len(result_table)
num_cols = len(result_table[0])
self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
# Populate
for tcell in tcells:
data.table_cells.append(tcell)
if len(tcells) > 0:
doc.add_table(data=data)
return
def process_inline_text(self, parent_element, doc=None):
# self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip()
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_element,
text=txt,
)
self.inline_text_buffer = ""
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}"
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
else:
doc_label = DocItemLabel.SECTION_HEADER
snippet_text = element.children[0].children.strip()
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.List):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
list_label = GroupLabel.LIST
if element.ordered:
list_label = GroupLabel.ORDERED_LIST
parent_element = doc.add_group(
label=list_label, name=f"list", parent=parent_element
)
elif isinstance(element, marko.block.ListItem):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")
snippet_text = str(element.children[0].children[0].children)
is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST:
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.inline.Image):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)
elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children).strip()
# Detect start of the table:
if "|" in snippet_text:
# most likely part of the markdown table
self.in_table = True
if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
snippet_text
)
else:
self.md_table_buffer.append(snippet_text)
else:
self.close_table(doc)
self.in_table = False
# most likely just inline text
self.inline_text_buffer += str(
element.children
) # do not strip an inline text, as it may contain important spaces
elif isinstance(element, marko.inline.CodeSpan):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc)
if self.in_table:
_log.debug("Line break in a table")
self.md_table_buffer.append("")
elif isinstance(element, marko.block.HTMLBlock):
self.process_inline_text(parent_element, doc)
self.close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.children) > 0
): # If Marko doesn't return any content for HTML block, skip it
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
else:
if not isinstance(element, str):
self.close_table(doc)
_log.debug("Some other element: {}".format(element))
# Iterate through the element's children (if any)
if not isinstance(element, marko.block.ListItem):
if not isinstance(element, marko.block.Heading):
if not isinstance(element, marko.block.FencedCode):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
def is_valid(self) -> bool:
return self.valid
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supports_pagination(cls) -> bool:
return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.MD}
def convert(self) -> DoclingDocument:
_log.debug("converting Markdown...")
origin = DocumentOrigin(
filename=self.file.name or "file",
mimetype="text/markdown",
binary_hash=self.document_hash,
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
# Parse the markdown into an abstract syntax tree (AST)
marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."
)
return doc
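iterate_elements above dispatches purely on marko's block and inline node types. To see the AST shape it walks for a tiny input, here is a standalone sketch using marko's public parse API; it is independent of docling and only illustrative.

import marko

parser = marko.Markdown()
ast = parser.parse("# Title\n\nSome text with `code`.\n\n- one\n- two\n")

def walk(element, depth=0):
    # Print each node's class name, mirroring the isinstance checks in iterate_elements.
    print("  " * depth + type(element).__name__)
    for child in getattr(element, "children", []) or []:
        if not isinstance(child, str):  # leaf nodes store their text as a plain string
            walk(child, depth + 1)

walk(ast)
# Expected shape: Document -> Heading / Paragraph / List -> ListItem -> Paragraph -> RawText, CodeSpan, ...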

docling/backend/mspowerpoint_backend.py

@@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
# Parses the PPTX into a structured document model.
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
filename=self.file.name or "file",
mimetype="application/vnd.ms-powerpoint",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(
name=docname, origin=origin
name=self.file.stem or "file", origin=origin
) # must add origin information
doc = self.walk_linear(self.pptx_obj, doc)
@@ -119,10 +112,16 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0
new_list = None
bullet_type = "None"
list_text = ""
list_label = GroupLabel.LIST
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
# Identify if shape contains lists
for paragraph in shape.text_frame.paragraphs:
enum_list_item_value += 1
bullet_type = "None"
# Check if paragraph is a bullet point using the `element` XML
p = paragraph._element
if (
@@ -143,29 +142,32 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
if paragraph.level > 0:
# Most likely a sub-list
is_a_list = True
list_text = paragraph.text.strip()
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
if is_a_list:
# Determine if this is an unordered list or an ordered list.
# Set GroupLabel.ORDERED_LIST when it fits.
list_label = GroupLabel.LIST
if bullet_type == "Numbered":
list_label = GroupLabel.ORDERED_LIST
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
)
else:
new_list = None
if is_a_list:
_log.debug("LIST DETECTED!")
else:
_log.debug("No List")
# for e in p.iter():
# If there is a list inside of the shape, create a new docling list to assign list items to
# if is_a_list:
# new_list = doc.add_group(
# label=list_label, name=f"list", parent=parent_slide
# )
# Iterate through paragraphs to build up text
for paragraph in shape.text_frame.paragraphs:
# p_text = paragraph.text.strip()
p = paragraph._element
enum_list_item_value += 1
inline_paragraph_text = ""
inline_list_item_text = ""
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
if len(e.text.strip()) > 0:
e_is_a_list_item = False
@@ -187,15 +189,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
e_is_a_list_item = False
if e_is_a_list_item:
if len(inline_paragraph_text) > 0:
# output accumulated inline text:
doc.add_text(
label=doc_label,
parent=parent_slide,
text=inline_paragraph_text,
prov=prov,
)
# Set marker and enumerated arguments if this is an enumeration element.
enum_marker = str(enum_list_item_value) + "."
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_list,
text=list_text,
prov=prov,
)
inline_list_item_text += e.text
# print(e.text)
else:
# Assign proper label to the text, depending if it's a Title or Section Header
# For other types of text, assign - PARAGRAPH
@@ -210,15 +214,34 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
doc_label = DocItemLabel.TITLE
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
DocItemLabel.SECTION_HEADER
enum_list_item_value = 0
inline_paragraph_text += e.text
doc.add_text(
label=doc_label,
parent=parent_slide,
text=list_text,
prov=prov,
)
if len(inline_paragraph_text) > 0:
# output accumulated inline text:
doc.add_text(
label=doc_label,
parent=parent_slide,
text=inline_paragraph_text,
prov=prov,
)
if len(inline_list_item_text) > 0:
enum_marker = ""
if is_numbered:
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_list,
text=inline_list_item_text,
prov=prov,
)
return
def handle_title(self, shape, parent_slide, slide_ind, doc):
@@ -311,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
if len(tcells) > 0:
# If table is not fully empty...
# Create Docling table
doc.add_table(data=data, prov=prov)
doc.add_table(parent=parent_slide, data=data, prov=prov)
return
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:

docling/backend/msword_backend.py

@@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def convert(self) -> DoclingDocument:
# Parses the DOCX into a structured document model.
fname = ""
if isinstance(self.path_or_stream, Path):
fname = self.path_or_stream.name
origin = DocumentOrigin(
filename=fname,
filename=self.file.name or "file",
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
binary_hash=self.document_hash,
)
if len(fname) > 0:
docname = Path(fname).stem
else:
docname = "stream"
doc = DoclingDocument(name=docname, origin=origin)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid():
assert self.docx_obj is not None
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)

docling/datamodel/base_models.py

@@ -30,6 +30,8 @@ class InputFormat(str, Enum):
HTML = "html"
IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc"
MD = "md"
class OutputFormat(str, Enum):
@@ -43,29 +45,33 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
}
FormatToMimeType: Dict[InputFormat, Set[str]] = {
InputFormat.DOCX: {
FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
},
InputFormat.PPTX: {
],
InputFormat.PPTX: [
"application/vnd.openxmlformats-officedocument.presentationml.template",
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {
],
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
InputFormat.IMAGE: [
"image/png",
"image/jpeg",
"image/tiff",
"image/gif",
"image/bmp",
},
InputFormat.PDF: {"application/pdf"},
],
InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
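A quick sanity check of the extended registries, using only the mappings defined above:

from docling.datamodel.base_models import (
    FormatToExtensions,
    InputFormat,
    MimeTypeToFormat,
)

assert "adoc" in FormatToExtensions[InputFormat.ASCIIDOC]
assert "md" in FormatToExtensions[InputFormat.MD]
assert MimeTypeToFormat["text/asciidoc"] is InputFormat.ASCIIDOC
assert MimeTypeToFormat["text/markdown"] is InputFormat.MD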

docling/datamodel/document.py

@@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
ErrorItem,
FormatToExtensions,
FormatToMimeType,
InputFormat,
MimeTypeToFormat,
Page,
@@ -484,26 +486,48 @@ class _DocumentConversionInput(BaseModel):
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
def _guess_format(self, obj):
content = None
def _guess_format(self, obj: Union[Path, DocumentStream]):
content = b"" # empty binary blob
format = None
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
ext = obj.suffix[1:]
mime = self._mime_from_extension(ext)
if mime is None: # must guess from the content
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
elif isinstance(obj, DocumentStream):
obj.stream.seek(0)
content = obj.stream.read(8192)
obj.stream.seek(0)
mime = filetype.guess_mime(content)
if mime is None:
ext = (
obj.name.rsplit(".", 1)[-1]
if ("." in obj.name and not obj.name.startswith("."))
else ""
)
mime = self._mime_from_extension(ext)
if mime is None:
mime = self._detect_html_xhtml(content)
mime = mime or self._detect_html_xhtml(content)
mime = mime or "text/plain"
format = MimeTypeToFormat.get(mime)
return format
def _mime_from_extension(self, ext):
mime = None
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
elif ext in FormatToExtensions[InputFormat.HTML]:
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
return mime
def _detect_html_xhtml(self, content):
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
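filetype cannot sniff plain-text formats, so the extension fallback above is what routes Markdown and AsciiDoc inputs, including streams. A hedged end-to-end sketch; DocumentStream and the v2 DocumentConverter.convert() call are assumed from the surrounding codebase rather than shown in this diff.

from io import BytesIO

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

# The ".md" suffix in the stream name is picked up by _mime_from_extension -> "text/markdown" -> InputFormat.MD.
stream = DocumentStream(name="notes.md", stream=BytesIO(b"# Heading\n\nSome *markdown* text.\n"))

result = DocumentConverter().convert(stream)
print(result.document.export_to_markdown())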

docling/document_converter.py

@@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
@@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
class MarkdownFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
class AsciiDocFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
@@ -74,6 +86,12 @@ _format_to_default_options = {
InputFormat.PPTX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.MD: FormatOption(
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
),
InputFormat.ASCIIDOC: FormatOption(
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
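Putting the new defaults and option classes together, a minimal usage sketch; the DocumentConverter constructor arguments (allowed_formats, format_options) are assumed from the docling v2 API, and the input path is hypothetical.

from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    AsciiDocFormatOption,
    DocumentConverter,
    MarkdownFormatOption,
)

converter = DocumentConverter(
    allowed_formats=[InputFormat.MD, InputFormat.ASCIIDOC],
    format_options={
        InputFormat.MD: MarkdownFormatOption(),
        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
    },
)

result = converter.convert("docs/index.md")  # hypothetical input file
print(result.document.export_to_markdown())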