mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
(feat): Create an XML backend for PubMed documents based on the pubmed_parser library (merge conflicts)
This commit is contained in:
commit
c024275f24
@ -212,6 +212,25 @@ class _DummyBackend(AbstractDocumentBackend):
|
|||||||
return super().unload()
|
return super().unload()
|
||||||
|
|
||||||
|
|
||||||
|
class _DummyBackend(AbstractDocumentBackend):
    """Minimal backend stub.

    It always reports itself as invalid, advertises no supported input
    formats and no pagination, and otherwise defers to the parent class.
    """

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``: this backend never paginates."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return an empty set: no input format is handled."""
        return set()

    def __init__(self, *args, **kwargs):
        """Forward every positional and keyword argument to the parent."""
        super().__init__(*args, **kwargs)

    def is_valid(self) -> bool:
        """Return ``False`` unconditionally."""
        return False

    def unload(self):
        """Delegate to the parent's ``unload`` and return its result."""
        return super().unload()
|
||||||
|
|
||||||
|
|
||||||
class _DocumentConversionInput(BaseModel):
|
class _DocumentConversionInput(BaseModel):
|
||||||
|
|
||||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||||
@ -281,6 +300,11 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
)
|
)
|
||||||
mime = self._mime_from_extension(ext)
|
mime = self._mime_from_extension(ext)
|
||||||
|
|
||||||
|
# Detect PubMed XML documents
|
||||||
|
xml_doctype = re.search(r"<!DOCTYPE [^>]+>", content.decode("utf-8")).group()
|
||||||
|
if "/NLM//DTD JATS" in xml_doctype:
|
||||||
|
return InputFormat.PUBMED
|
||||||
|
|
||||||
mime = mime or self._detect_html_xhtml(content)
|
mime = mime or self._detect_html_xhtml(content)
|
||||||
mime = mime or "text/plain"
|
mime = mime or "text/plain"
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user