diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 187c05c9..a26bc861 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -1,5 +1,6 @@ import logging import re +import warnings from io import BytesIO from pathlib import Path from typing import Set, Union @@ -27,7 +28,7 @@ _log = logging.getLogger(__name__) class MarkdownDocumentBackend(DeclarativeDocumentBackend): - def shorten_underscore_sequences(self, markdown_text, max_length=4): + def shorten_underscore_sequences(self, markdown_text, max_length=10): # This regex will match any sequence of underscores pattern = r"_+" @@ -45,6 +46,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # Use re.sub to replace long underscore sequences shortened_text = re.sub(pattern, replace_match, markdown_text) + if len(shortened_text) != len(markdown_text): + warnings.warn("Detected potentially incorrect Markdown, correcting...") + return shortened_text def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):