From 9b5b14f1a8f49b56d6378eb1f34c61f980488901 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 25 Oct 2024 10:10:24 +0200 Subject: [PATCH] making fix more rare Signed-off-by: Maksym Lysak --- docling/backend/md_backend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 187c05c9..a26bc861 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -1,5 +1,6 @@ import logging import re +import warnings from io import BytesIO from pathlib import Path from typing import Set, Union @@ -27,7 +28,7 @@ _log = logging.getLogger(__name__) class MarkdownDocumentBackend(DeclarativeDocumentBackend): - def shorten_underscore_sequences(self, markdown_text, max_length=4): + def shorten_underscore_sequences(self, markdown_text, max_length=10): # This regex will match any sequence of underscores pattern = r"_+" @@ -45,6 +46,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # Use re.sub to replace long underscore sequences shortened_text = re.sub(pattern, replace_match, markdown_text) + if len(shortened_text) != len(markdown_text): + warnings.warn("Detected potentially incorrect Markdown, correcting...") + return shortened_text def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):