fix(xml-jats): replace newline characters with a space

Instead of removing newline characters from the text, replace each one with a space character.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-07-31 14:34:40 +00:00 · 2025-02-05 19:26:20 +01:00 · 2025-02-05 19:26:20 +01:00 · c9ae5f6545
commit c9ae5f6545
parent 21a99fc27d
1 changed files with 7 additions and 7 deletions
--- a/docling/backend/xml/pubmed_backend.py
+++ b/docling/backend/xml/pubmed_backend.py
@@ -132,7 +132,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
    def _parse_title(self) -> str:
        title: str = " ".join(
            [
-                t.replace("\n", "")
+                t.replace("\n", " ")
                for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
            ]
        )
@@ -186,7 +186,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
        texts = []
        for abstract_node in self.tree.xpath(".//abstract"):
            for text in abstract_node.itertext():
-                texts.append(text.replace("\n", ""))
+                texts.append(text.replace("\n", " "))
        abstract: str = "".join(texts)
        return abstract

@@ -201,7 +201,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):

            # Text
            paragraph["text"] = "".join(
-                [t.replace("\n", "") for t in paragraph_node.itertext()]
+                [t.replace("\n", " ") for t in paragraph_node.itertext()]
            )

            # Header
@@ -210,7 +210,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                paragraph["headers"].append(
                    "".join(
                        [
-                            t.replace("\n", "")
+                            t.replace("\n", " ")
                            for t in paragraph_node.xpath(path)[0].itertext()
                        ]
                    )
@@ -245,7 +245,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                caption_node = None
            if caption_node != None:
                table["caption"] = "".join(
-                    [t.replace("\n", "") for t in caption_node.itertext()]
+                    [t.replace("\n", " ") for t in caption_node.itertext()]
                )

            # Label
@@ -271,7 +271,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
            if figure_node.xpath("label"):
                figure_caption["label"] = "".join(
                    [
-                        t.replace("\n", "")
+                        t.replace("\n", " ")
                        for t in figure_node.xpath("label")[0].itertext()
                    ]
                )
@@ -281,7 +281,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                caption = ""
                for caption_node in figure_node.xpath("caption")[0].getchildren():
                    caption += (
-                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
+                        "".join([t.replace("\n", " ") for t in caption_node.itertext()])
                        + "\n"
                    )
                figure_caption["caption"] = caption