mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 21:44:32 +00:00
making fix more rare
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
d654a292e8
commit
9b5b14f1a8
@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
import warnings
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
@ -27,7 +28,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def shorten_underscore_sequences(self, markdown_text, max_length=4):
|
||||
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
||||
# This regex will match any sequence of underscores
|
||||
pattern = r"_+"
|
||||
|
||||
@ -45,6 +46,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Use re.sub to replace long underscore sequences
|
||||
shortened_text = re.sub(pattern, replace_match, markdown_text)
|
||||
|
||||
if len(shortened_text) != len(markdown_text):
|
||||
warnings.warn("Detected potentially incorrect Markdown, correcting...")
|
||||
|
||||
return shortened_text
|
||||
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
|
Loading…
Reference in New Issue
Block a user