fix(xml-jats): replace newline characters with a space

Instead of removing newline characters from the text, replace each one with a space character.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis 2025-02-05 19:26:20 +01:00
parent 21a99fc27d
commit c9ae5f6545

View File

@ -132,7 +132,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
def _parse_title(self) -> str: def _parse_title(self) -> str:
title: str = " ".join( title: str = " ".join(
[ [
t.replace("\n", "") t.replace("\n", " ")
for t in self.tree.xpath(".//title-group/article-title")[0].itertext() for t in self.tree.xpath(".//title-group/article-title")[0].itertext()
] ]
) )
@ -186,7 +186,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
texts = [] texts = []
for abstract_node in self.tree.xpath(".//abstract"): for abstract_node in self.tree.xpath(".//abstract"):
for text in abstract_node.itertext(): for text in abstract_node.itertext():
texts.append(text.replace("\n", "")) texts.append(text.replace("\n", " "))
abstract: str = "".join(texts) abstract: str = "".join(texts)
return abstract return abstract
@ -201,7 +201,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
# Text # Text
paragraph["text"] = "".join( paragraph["text"] = "".join(
[t.replace("\n", "") for t in paragraph_node.itertext()] [t.replace("\n", " ") for t in paragraph_node.itertext()]
) )
# Header # Header
@ -210,7 +210,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
paragraph["headers"].append( paragraph["headers"].append(
"".join( "".join(
[ [
t.replace("\n", "") t.replace("\n", " ")
for t in paragraph_node.xpath(path)[0].itertext() for t in paragraph_node.xpath(path)[0].itertext()
] ]
) )
@ -245,7 +245,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
caption_node = None caption_node = None
if caption_node != None: if caption_node != None:
table["caption"] = "".join( table["caption"] = "".join(
[t.replace("\n", "") for t in caption_node.itertext()] [t.replace("\n", " ") for t in caption_node.itertext()]
) )
# Label # Label
@ -271,7 +271,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
if figure_node.xpath("label"): if figure_node.xpath("label"):
figure_caption["label"] = "".join( figure_caption["label"] = "".join(
[ [
t.replace("\n", "") t.replace("\n", " ")
for t in figure_node.xpath("label")[0].itertext() for t in figure_node.xpath("label")[0].itertext()
] ]
) )
@ -281,7 +281,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend):
caption = "" caption = ""
for caption_node in figure_node.xpath("caption")[0].getchildren(): for caption_node in figure_node.xpath("caption")[0].getchildren():
caption += ( caption += (
"".join([t.replace("\n", "") for t in caption_node.itertext()]) "".join([t.replace("\n", " ") for t in caption_node.itertext()])
+ "\n" + "\n"
) )
figure_caption["caption"] = caption figure_caption["caption"] = caption