From c9ae5f6545691dca88fc54c8f43adde90831b468 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Wed, 5 Feb 2025 19:26:20 +0100 Subject: [PATCH] fix(xml-jats): replace new line character by a space Instead of removing newline characters from the text, replace each of them with a space character. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/xml/pubmed_backend.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docling/backend/xml/pubmed_backend.py b/docling/backend/xml/pubmed_backend.py index 260afe92..657d9fb4 100755 --- a/docling/backend/xml/pubmed_backend.py +++ b/docling/backend/xml/pubmed_backend.py @@ -132,7 +132,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): def _parse_title(self) -> str: title: str = " ".join( [ - t.replace("\n", "") + t.replace("\n", " ") for t in self.tree.xpath(".//title-group/article-title")[0].itertext() ] ) @@ -186,7 +186,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): texts = [] for abstract_node in self.tree.xpath(".//abstract"): for text in abstract_node.itertext(): - texts.append(text.replace("\n", "")) + texts.append(text.replace("\n", " ")) abstract: str = "".join(texts) return abstract @@ -201,7 +201,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): # Text paragraph["text"] = "".join( - [t.replace("\n", "") for t in paragraph_node.itertext()] + [t.replace("\n", " ") for t in paragraph_node.itertext()] ) # Header @@ -210,7 +210,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): paragraph["headers"].append( "".join( [ - t.replace("\n", "") + t.replace("\n", " ") for t in paragraph_node.xpath(path)[0].itertext() ] ) @@ -245,7 +245,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): caption_node = None if caption_node != None: table["caption"] = "".join( - [t.replace("\n", "") for t in caption_node.itertext()] + [t.replace("\n", " ") for t in 
caption_node.itertext()] ) # Label @@ -271,7 +271,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): if figure_node.xpath("label"): figure_caption["label"] = "".join( [ - t.replace("\n", "") + t.replace("\n", " ") for t in figure_node.xpath("label")[0].itertext() ] ) @@ -281,7 +281,7 @@ class PubMedDocumentBackend(DeclarativeDocumentBackend): caption = "" for caption_node in figure_node.xpath("caption")[0].getchildren(): caption += ( - "".join([t.replace("\n", "") for t in caption_node.itertext()]) + "".join([t.replace("\n", " ") for t in caption_node.itertext()]) + "\n" ) figure_caption["caption"] = caption