mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 14:18:30 +00:00
feat: Create a backend to transform PubMed XML files to DoclingDocument (#557)
Signed-off-by: lucas-morin <lucas.morin222@gmail.com>
This commit is contained in:
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
|
||||
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
||||
|
||||
|
||||
class XMLPubMedFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
||||
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
InputFormat.XML_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
),
|
||||
InputFormat.XML_PUBMED: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
||||
),
|
||||
@@ -171,7 +180,6 @@ class DocumentConverter:
|
||||
max_num_pages: int = sys.maxsize,
|
||||
max_file_size: int = sys.maxsize,
|
||||
) -> ConversionResult:
|
||||
|
||||
all_res = self.convert_all(
|
||||
source=[source],
|
||||
raises_on_error=raises_on_error,
|
||||
|
||||
Reference in New Issue
Block a user