mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 14:18:30 +00:00
feat(xml-jats): parse XML JATS documents (#967)
* chore(xml-jats): separate authors and affiliations
  In the XML PubMed (JATS) backend, convert authors and affiliations as they are typically rendered in PDFs.
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* fix(xml-jats): replace new line character by a space
  Instead of removing newline characters from the text, replace each one with a space character.
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* feat(xml-jats): improve existing parser and extend features
  Partially support lists, respect reading order, parse more sections, support equations, and improve text formatting.
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
* chore(xml-jats): rename PubMed objects to JATS
  Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---------
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
e1436a8b05
commit
428b656793
@@ -18,7 +18,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
||||
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
@@ -102,9 +102,9 @@ class PatentUsptoFormatOption(FormatOption):
|
||||
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
||||
|
||||
|
||||
class XMLPubMedFormatOption(FormatOption):
|
||||
class XMLJatsFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
|
||||
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
@@ -143,8 +143,8 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.XML_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
),
|
||||
InputFormat.XML_PUBMED: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
|
||||
InputFormat.XML_JATS: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
|
||||
Reference in New Issue
Block a user