From c9ae5f6545691dca88fc54c8f43adde90831b468 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Wed, 5 Feb 2025 19:26:20 +0100
Subject: [PATCH] fix(xml-jats): replace new line character by a space

Instead of removing newline characters from the text, replace each one with a space character.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
 docling/backend/xml/pubmed_backend.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docling/backend/xml/pubmed_backend.py b/docling/backend/xml/pubmed_backend.py
index 260afe92..657d9fb4 100755
--- a/docling/backend/xml/pubmed_backend.py
+++ b/docling/backend/xml/pubmed_backend.py
@@ -132,7 +132,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
     def _parse_title(self) -> str:
         title: str = " ".join(
             [
-                t.replace("\n", "")
+                t.replace("\n", " ")
                 for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
             ]
         )
@@ -186,7 +186,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
         texts = []
         for abstract_node in self.tree.xpath(".//abstract"):
             for text in abstract_node.itertext():
-                texts.append(text.replace("\n", ""))
+                texts.append(text.replace("\n", " "))
         abstract: str = "".join(texts)
         return abstract
 
@@ -201,7 +201,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
 
             # Text
             paragraph["text"] = "".join(
-                [t.replace("\n", "") for t in paragraph_node.itertext()]
+                [t.replace("\n", " ") for t in paragraph_node.itertext()]
             )
 
             # Header
@@ -210,7 +210,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                 paragraph["headers"].append(
                     "".join(
                         [
-                            t.replace("\n", "")
+                            t.replace("\n", " ")
                             for t in paragraph_node.xpath(path)[0].itertext()
                         ]
                     )
@@ -245,7 +245,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                 caption_node = None
             if caption_node != None:
                 table["caption"] = "".join(
-                    [t.replace("\n", "") for t in caption_node.itertext()]
+                    [t.replace("\n", " ") for t in caption_node.itertext()]
                 )
 
             # Label
@@ -271,7 +271,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
             if figure_node.xpath("label"):
                 figure_caption["label"] = "".join(
                     [
-                        t.replace("\n", "")
+                        t.replace("\n", " ")
                         for t in figure_node.xpath("label")[0].itertext()
                     ]
                 )
@@ -281,7 +281,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
                 caption = ""
                 for caption_node in figure_node.xpath("caption")[0].getchildren():
                     caption += (
-                        "".join([t.replace("\n", "") for t in caption_node.itertext()])
+                        "".join([t.replace("\n", " ") for t in caption_node.itertext()])
                         + "\n"
                     )
                 figure_caption["caption"] = caption