feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)

Signed-off-by: lucas-morin <lucas.morin222@gmail.com>
This commit is contained in:
Lucas Morin
2024-12-17 19:27:09 +01:00
committed by GitHub
parent e31f09f71f
commit fd034802b6
24 changed files with 31040 additions and 4 deletions

View File

@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
class XMLPubMedFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.XML_PUBMED: FormatOption(
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
@@ -171,7 +180,6 @@ class DocumentConverter:
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
) -> ConversionResult:
all_res = self.convert_all(
source=[source],
raises_on_error=raises_on_error,