mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Fix the Markdown backend hanging when it encounters a long sequence of unescaped underscore characters
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
8208c93e3a
commit
1783f137da
@ -25,6 +25,11 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
|
|
||||||
|
def clean_md(self, md_text: str) -> str:
    """Remove long underscore runs from raw Markdown text.

    Every run of four or more consecutive underscores is deleted as a
    whole. Removing the run in one piece (rather than in fixed chunks of
    four) keeps the result independent of the run length, so no stray
    one-to-three underscore remainder is left behind to be re-read as an
    emphasis delimiter or a ``___`` thematic break.

    Shorter sequences (``_``, ``__``, ``___``) are left untouched, so
    identifiers such as ``snake_case`` and ``__dunder__`` survive.

    Args:
        md_text: Markdown source text to sanitize.

    Returns:
        The text with every run of four or more underscores removed.
    """
    # Function-scope import: the file's import block is not visible from
    # this block, so the dependency is brought into scope locally.
    import re

    return re.sub(r"_{4,}", "", md_text)
|
||||||
|
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
@ -42,11 +47,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
self.markdown = text_stream
|
self.markdown = self.clean_md(text_stream) # remove invalid sequences
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||||
md_content = f.read()
|
md_content = f.read()
|
||||||
self.markdown = md_content
|
self.markdown = self.clean_md(
|
||||||
|
md_content
|
||||||
|
) # remove invalid sequences
|
||||||
self.valid = True
|
self.valid = True
|
||||||
|
|
||||||
_log.debug(self.markdown)
|
_log.debug(self.markdown)
|
||||||
|
Loading…
Reference in New Issue
Block a user