mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
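Make the underscore-sequence fix trigger more rarely (default max_length raised from 4 to 10) and warn when the Markdown is corrected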
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
d654a292e8
commit
9b5b14f1a8
@ -1,5 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
@ -27,7 +28,7 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
|
|
||||||
def shorten_underscore_sequences(self, markdown_text, max_length=4):
|
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
||||||
# This regex will match any sequence of underscores
|
# This regex will match any sequence of underscores
|
||||||
pattern = r"_+"
|
pattern = r"_+"
|
||||||
|
|
||||||
@ -45,6 +46,9 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Use re.sub to replace long underscore sequences
|
# Use re.sub to replace long underscore sequences
|
||||||
shortened_text = re.sub(pattern, replace_match, markdown_text)
|
shortened_text = re.sub(pattern, replace_match, markdown_text)
|
||||||
|
|
||||||
|
if len(shortened_text) != len(markdown_text):
|
||||||
|
warnings.warn("Detected potentially incorrect Markdown, correcting...")
|
||||||
|
|
||||||
return shortened_text
|
return shortened_text
|
||||||
|
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
|
Loading…
Reference in New Issue
Block a user