mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
use PdfDocumentBackend
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
46b904e059
commit
9da610e95b
@ -22,7 +22,7 @@ from PIL import Image
|
|||||||
from PIL.Image import Image as PILImage
|
from PIL.Image import Image as PILImage
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
@ -194,7 +194,7 @@ def _extract_confidence(title_str) -> float:
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
class MetsGbsDocumentBackend(PaginatedDocumentBackend):
|
class MetsGbsDocumentBackend(PdfDocumentBackend):
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
|
@ -84,9 +84,9 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|||||||
|
|
||||||
buf.seek(0)
|
buf.seek(0)
|
||||||
self.path_or_stream = buf
|
self.path_or_stream = buf
|
||||||
else:
|
elif self.input_format not in self.supported_formats():
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
|
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend. Valid format are {','.join(self.supported_formats())}."
|
||||||
)
|
)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
@ -99,7 +99,7 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
return {InputFormat.PDF}
|
return {InputFormat.PDF, InputFormat.IMAGE}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def supports_pagination(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
|
Loading…
Reference in New Issue
Block a user