Fixes MyPy requirements and the remaining pre-commit checks

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-21 15:43:39 +02:00
parent dae366440c
commit 1456a36618
3 changed files with 48 additions and 56 deletions

View File

@ -3,6 +3,10 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
import marko
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import ( from docling_core.types.doc import (
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
@ -10,19 +14,12 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
import marko.ext from marko import Markdown
import marko.ext.gfm
import marko.inline
from docling.backend.abstract_backend import ( from docling.backend.abstract_backend import DeclarativeDocumentBackend
DeclarativeDocumentBackend
)
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
import marko
from marko import Markdown
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -38,7 +35,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.markdown = "" # To store original Markdown string self.markdown = "" # To store original Markdown string
self.in_table = False self.in_table = False
self.md_table_buffer = [] self.md_table_buffer: list[str] = []
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
@ -57,7 +54,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e ) from e
return return
def close_table(self, doc = None): def close_table(self, doc=None):
if self.in_table: if self.in_table:
_log.debug("=== TABLE START ===") _log.debug("=== TABLE START ===")
@ -69,20 +66,24 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
for n, md_table_row in enumerate(self.md_table_buffer): for n, md_table_row in enumerate(self.md_table_buffer):
data = [] data = []
if n == 0: if n == 0:
header = [t.strip() for t in md_table_row.split('|')[1:-1]] header = [t.strip() for t in md_table_row.split("|")[1:-1]]
for value in header: for value in header:
data.append(value) data.append(value)
result_table.append(data) result_table.append(data)
if n > 1: if n > 1:
values = [t.strip() for t in md_table_row.split('|')[1:-1]] values = [t.strip() for t in md_table_row.split("|")[1:-1]]
for value in values: for value in values:
data.append(value) data.append(value)
result_table.append(data) result_table.append(data)
for trow_ind, trow in enumerate(result_table): for trow_ind, trow in enumerate(result_table):
for tcol_ind, cellval in enumerate(trow): for tcol_ind, cellval in enumerate(trow):
row_span = 1 # currently supporting just simple tables (without spans) row_span = (
col_span = 1 # currently supporting just simple tables (without spans) 1 # currently supporting just simple tables (without spans)
)
col_span = (
1 # currently supporting just simple tables (without spans)
)
icell = TableCell( icell = TableCell(
text=cellval.strip(), text=cellval.strip(),
row_span=row_span, row_span=row_span,
@ -109,12 +110,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=data) doc.add_table(data=data)
return return
def iterate_elements(self, element, depth=0, doc=None, parent_element = None): def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
# Iterates over all elements in the AST # Iterates over all elements in the AST
# Check for different element types and process relevant details # Check for different element types and process relevant details
if isinstance(element, marko.block.Heading): if isinstance(element, marko.block.Heading):
self.close_table(doc) self.close_table(doc)
_log.debug(f" - Heading level {element.level}, content: {element.children[0].children}") _log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}"
)
if element.level == 1: if element.level == 1:
doc_label = DocItemLabel.TITLE doc_label = DocItemLabel.TITLE
else: else:
@ -122,12 +125,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
snippet_text = element.children[0].children snippet_text = element.children[0].children
parent_element = doc.add_text( parent_element = doc.add_text(
label=doc_label, label=doc_label, parent=parent_element, text=snippet_text
parent=parent_element,
text=snippet_text
) )
elif isinstance(element, marko.block.List): elif isinstance(element, marko.block.List):
self.close_table(doc) self.close_table(doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
@ -135,9 +135,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if element.ordered: if element.ordered:
list_label = GroupLabel.ORDERED_LIST list_label = GroupLabel.ORDERED_LIST
parent_element = doc.add_group( parent_element = doc.add_group(
label=list_label, label=list_label, name=f"list", parent=parent_element
name=f"list",
parent=parent_element
) )
elif isinstance(element, marko.block.ListItem): elif isinstance(element, marko.block.ListItem):
@ -148,9 +146,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if parent_element.label == GroupLabel.ORDERED_LIST: if parent_element.label == GroupLabel.ORDERED_LIST:
is_numbered = True is_numbered = True
doc.add_list_item( doc.add_list_item(
enumerated=is_numbered, enumerated=is_numbered, parent=parent_element, text=snippet_text
parent=parent_element,
text=snippet_text
) )
elif isinstance(element, marko.block.Paragraph): elif isinstance(element, marko.block.Paragraph):
@ -158,18 +154,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f" - Paragraph: {element.children[0].children}") _log.debug(f" - Paragraph: {element.children[0].children}")
snippet_text = str(element.children[0].children) snippet_text = str(element.children[0].children)
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH, parent=parent_element, text=snippet_text
parent=parent_element,
text=snippet_text
) )
elif isinstance(element, marko.inline.Image): elif isinstance(element, marko.inline.Image):
self.close_table(doc) self.close_table(doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}") _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture( doc.add_picture(parent=parent_element, caption=element.title)
parent=parent_element,
caption=element.title
)
elif isinstance(element, marko.inline.RawText): elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}") _log.debug(f" - Paragraph (raw text): {element.children}")
@ -180,7 +171,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# most likely part of the markdown table # most likely part of the markdown table
self.in_table = True self.in_table = True
if len(self.md_table_buffer) > 0: if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer)-1] += str(snippet_text) self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
snippet_text
)
else: else:
self.md_table_buffer.append(snippet_text) self.md_table_buffer.append(snippet_text)
else: else:
@ -190,7 +183,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_text( doc.add_text(
label=DocItemLabel.PARAGRAPH, label=DocItemLabel.PARAGRAPH,
parent=parent_element, parent=parent_element,
text=snippet_text text=snippet_text,
) )
elif isinstance(element, marko.inline.CodeSpan): elif isinstance(element, marko.inline.CodeSpan):
@ -198,9 +191,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f" - Paragraph (code): {element.children}") _log.debug(f" - Paragraph (code): {element.children}")
snippet_text = str(element.children) snippet_text = str(element.children)
doc.add_text( doc.add_text(
label=DocItemLabel.CODE, label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
parent=parent_element,
text=snippet_text
) )
elif isinstance(element, marko.inline.LineBreak): elif isinstance(element, marko.inline.LineBreak):
@ -213,9 +204,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
_log.debug("HTML Block: {}".format(element)) _log.debug("HTML Block: {}".format(element))
snippet_text = str(element.children) snippet_text = str(element.children)
doc.add_text( doc.add_text(
label=DocItemLabel.CODE, label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
parent=parent_element,
text=snippet_text
) )
else: else:
@ -224,7 +213,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
_log.debug("Some other element: {}".format(element)) _log.debug("Some other element: {}".format(element))
# Iterate through the element's children (if any) # Iterate through the element's children (if any)
if hasattr(element, 'children'): if hasattr(element, "children"):
for child in element.children: for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element) self.iterate_elements(child, depth + 1, doc, parent_element)
@ -253,7 +242,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
marko_parser = Markdown() marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown) parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST # Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0 , doc, None) self.iterate_elements(parsed_ast, 0, doc, None)
else: else:
raise RuntimeError( raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init." f"Cannot convert md with {self.document_hash} because the backend failed to init."

View File

@ -10,9 +10,9 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import (
ConversionResult, ConversionResult,

View File

@ -1,11 +1,10 @@
import json import json
import logging import logging
import os
from pathlib import Path from pathlib import Path
import yaml import yaml
from docling.datamodel.base_models import InputFormat
import os
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -15,7 +14,12 @@ _log = logging.getLogger(__name__)
def main(): def main():
input_paths = [ input_paths = [
Path("README.md") Path("README.md"),
Path("scratch_a/2203.01017v2.md"),
Path("scratch_a/2206.01062.md"),
Path("scratch_a/2305.03393v1.md"),
Path("scratch_a/redp5110.md"),
Path("scratch_a/redp5695.md"),
] ]
for path in input_paths: for path in input_paths:
@ -24,13 +28,12 @@ def main():
format=InputFormat.PDF, format=InputFormat.PDF,
backend=MarkdownDocumentBackend, backend=MarkdownDocumentBackend,
) )
mdb = MarkdownDocumentBackend(in_doc = in_doc, path_or_stream = path) mdb = MarkdownDocumentBackend(in_doc=in_doc, path_or_stream=path)
document = mdb.convert() document = mdb.convert()
out_path = Path("scratch") out_path = Path("scratch")
print( print(
f"Document {path} converted." f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
f"\nSaved markdown output to: {str(out_path)}"
) )
# Export Docling document format to markdowndoc: # Export Docling document format to markdowndoc: