mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
Fixes MyPy requirements, and rest of pre-commit
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
dae366440c
commit
1456a36618
@ -3,6 +3,10 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
|
import marko
|
||||||
|
import marko.ext
|
||||||
|
import marko.ext.gfm
|
||||||
|
import marko.inline
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@ -10,19 +14,12 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
import marko.ext
|
from marko import Markdown
|
||||||
import marko.ext.gfm
|
|
||||||
import marko.inline
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import (
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
DeclarativeDocumentBackend
|
|
||||||
)
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
import marko
|
|
||||||
from marko import Markdown
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@ -38,7 +35,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.markdown = "" # To store original Markdown string
|
self.markdown = "" # To store original Markdown string
|
||||||
|
|
||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.md_table_buffer = []
|
self.md_table_buffer: list[str] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
@ -69,20 +66,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||||
data = []
|
data = []
|
||||||
if n == 0:
|
if n == 0:
|
||||||
header = [t.strip() for t in md_table_row.split('|')[1:-1]]
|
header = [t.strip() for t in md_table_row.split("|")[1:-1]]
|
||||||
for value in header:
|
for value in header:
|
||||||
data.append(value)
|
data.append(value)
|
||||||
result_table.append(data)
|
result_table.append(data)
|
||||||
if n > 1:
|
if n > 1:
|
||||||
values = [t.strip() for t in md_table_row.split('|')[1:-1]]
|
values = [t.strip() for t in md_table_row.split("|")[1:-1]]
|
||||||
for value in values:
|
for value in values:
|
||||||
data.append(value)
|
data.append(value)
|
||||||
result_table.append(data)
|
result_table.append(data)
|
||||||
|
|
||||||
for trow_ind, trow in enumerate(result_table):
|
for trow_ind, trow in enumerate(result_table):
|
||||||
for tcol_ind, cellval in enumerate(trow):
|
for tcol_ind, cellval in enumerate(trow):
|
||||||
row_span = 1 # currently supporting just simple tables (without spans)
|
row_span = (
|
||||||
col_span = 1 # currently supporting just simple tables (without spans)
|
1 # currently supporting just simple tables (without spans)
|
||||||
|
)
|
||||||
|
col_span = (
|
||||||
|
1 # currently supporting just simple tables (without spans)
|
||||||
|
)
|
||||||
icell = TableCell(
|
icell = TableCell(
|
||||||
text=cellval.strip(),
|
text=cellval.strip(),
|
||||||
row_span=row_span,
|
row_span=row_span,
|
||||||
@ -114,7 +115,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Check for different element types and process relevant details
|
# Check for different element types and process relevant details
|
||||||
if isinstance(element, marko.block.Heading):
|
if isinstance(element, marko.block.Heading):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug(f" - Heading level {element.level}, content: {element.children[0].children}")
|
_log.debug(
|
||||||
|
f" - Heading level {element.level}, content: {element.children[0].children}"
|
||||||
|
)
|
||||||
if element.level == 1:
|
if element.level == 1:
|
||||||
doc_label = DocItemLabel.TITLE
|
doc_label = DocItemLabel.TITLE
|
||||||
else:
|
else:
|
||||||
@ -122,12 +125,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
snippet_text = element.children[0].children
|
snippet_text = element.children[0].children
|
||||||
|
|
||||||
parent_element = doc.add_text(
|
parent_element = doc.add_text(
|
||||||
label=doc_label,
|
label=doc_label, parent=parent_element, text=snippet_text
|
||||||
parent=parent_element,
|
|
||||||
text=snippet_text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
elif isinstance(element, marko.block.List):
|
elif isinstance(element, marko.block.List):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||||
@ -135,9 +135,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if element.ordered:
|
if element.ordered:
|
||||||
list_label = GroupLabel.ORDERED_LIST
|
list_label = GroupLabel.ORDERED_LIST
|
||||||
parent_element = doc.add_group(
|
parent_element = doc.add_group(
|
||||||
label=list_label,
|
label=list_label, name=f"list", parent=parent_element
|
||||||
name=f"list",
|
|
||||||
parent=parent_element
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.ListItem):
|
elif isinstance(element, marko.block.ListItem):
|
||||||
@ -148,9 +146,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if parent_element.label == GroupLabel.ORDERED_LIST:
|
if parent_element.label == GroupLabel.ORDERED_LIST:
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
||||||
parent=parent_element,
|
|
||||||
text=snippet_text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.block.Paragraph):
|
elif isinstance(element, marko.block.Paragraph):
|
||||||
@ -158,18 +154,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug(f" - Paragraph: {element.children[0].children}")
|
_log.debug(f" - Paragraph: {element.children[0].children}")
|
||||||
snippet_text = str(element.children[0].children)
|
snippet_text = str(element.children[0].children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH, parent=parent_element, text=snippet_text
|
||||||
parent=parent_element,
|
|
||||||
text=snippet_text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.Image):
|
elif isinstance(element, marko.inline.Image):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||||
doc.add_picture(
|
doc.add_picture(parent=parent_element, caption=element.title)
|
||||||
parent=parent_element,
|
|
||||||
caption=element.title
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.RawText):
|
elif isinstance(element, marko.inline.RawText):
|
||||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||||
@ -180,7 +171,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# most likely part of the markdown table
|
# most likely part of the markdown table
|
||||||
self.in_table = True
|
self.in_table = True
|
||||||
if len(self.md_table_buffer) > 0:
|
if len(self.md_table_buffer) > 0:
|
||||||
self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text)
|
self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
|
||||||
|
snippet_text
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
self.md_table_buffer.append(snippet_text)
|
self.md_table_buffer.append(snippet_text)
|
||||||
else:
|
else:
|
||||||
@ -190,7 +183,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
parent=parent_element,
|
parent=parent_element,
|
||||||
text=snippet_text
|
text=snippet_text,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.CodeSpan):
|
elif isinstance(element, marko.inline.CodeSpan):
|
||||||
@ -198,9 +191,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug(f" - Paragraph (code): {element.children}")
|
_log.debug(f" - Paragraph (code): {element.children}")
|
||||||
snippet_text = str(element.children)
|
snippet_text = str(element.children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.CODE,
|
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||||
parent=parent_element,
|
|
||||||
text=snippet_text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
elif isinstance(element, marko.inline.LineBreak):
|
elif isinstance(element, marko.inline.LineBreak):
|
||||||
@ -213,9 +204,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug("HTML Block: {}".format(element))
|
_log.debug("HTML Block: {}".format(element))
|
||||||
snippet_text = str(element.children)
|
snippet_text = str(element.children)
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.CODE,
|
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||||
parent=parent_element,
|
|
||||||
text=snippet_text
|
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -224,7 +213,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
_log.debug("Some other element: {}".format(element))
|
_log.debug("Some other element: {}".format(element))
|
||||||
|
|
||||||
# Iterate through the element's children (if any)
|
# Iterate through the element's children (if any)
|
||||||
if hasattr(element, 'children'):
|
if hasattr(element, "children"):
|
||||||
for child in element.children:
|
for child in element.children:
|
||||||
self.iterate_elements(child, depth + 1, doc, parent_element)
|
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||||
|
|
||||||
|
@ -10,9 +10,9 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.html_backend import HTMLDocumentBackend
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
|
||||||
import os
|
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
@ -15,7 +14,12 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
input_paths = [
|
input_paths = [
|
||||||
Path("README.md")
|
Path("README.md"),
|
||||||
|
Path("scratch_a/2203.01017v2.md"),
|
||||||
|
Path("scratch_a/2206.01062.md"),
|
||||||
|
Path("scratch_a/2305.03393v1.md"),
|
||||||
|
Path("scratch_a/redp5110.md"),
|
||||||
|
Path("scratch_a/redp5695.md"),
|
||||||
]
|
]
|
||||||
|
|
||||||
for path in input_paths:
|
for path in input_paths:
|
||||||
@ -29,8 +33,7 @@ def main():
|
|||||||
|
|
||||||
out_path = Path("scratch")
|
out_path = Path("scratch")
|
||||||
print(
|
print(
|
||||||
f"Document {path} converted."
|
f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Export Docling document format to markdowndoc:
|
# Export Docling document format to markdowndoc:
|
||||||
|
Loading…
Reference in New Issue
Block a user