From 1783f137da4670634c440c63ce19edcbc1ced50b Mon Sep 17 00:00:00 2001
From: Maksym Lysak
Date: Thu, 24 Oct 2024 13:09:34 +0200
Subject: [PATCH] Fix for md hanging when encountering long sequence of unescaped underscore chars

Signed-off-by: Maksym Lysak
---
 docling/backend/md_backend.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index e2d26754..5b3043b9 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -25,6 +25,11 @@ _log = logging.getLogger(__name__)
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
+
+    def clean_md(self, md_text):
+        res_text = md_text.replace("____", "")
+        return res_text
+
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
@@ -42,11 +47,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         try:
             if isinstance(self.path_or_stream, BytesIO):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
-                self.markdown = text_stream
+                self.markdown = self.clean_md(text_stream)  # remove invalid sequences
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
-                    self.markdown = md_content
+                    self.markdown = self.clean_md(
+                        md_content
+                    )  # remove invalid sequences
             self.valid = True
 
             _log.debug(self.markdown)