Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)
Add image format support to PdfBackend
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Parent: d0fccb9342
Commit: 6efcf0a5a5
@ -3,6 +3,7 @@ from io import BytesIO
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
|
# from docling.datamodel.document import InputDocument
|
||||||
from docling_core.types.experimental import DoclingDocument
|
from docling_core.types.experimental import DoclingDocument
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
@ -10,9 +11,10 @@ from docling.datamodel.base_models import InputFormat
|
|||||||
|
|
||||||
class AbstractDocumentBackend(ABC):
|
class AbstractDocumentBackend(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
self.document_hash = document_hash
|
self.document_hash = in_doc.document_hash
|
||||||
|
self.input_format = in_doc.format
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
|
@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
|
|||||||
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import Cell
|
from docling.datamodel.base_models import Cell
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -187,23 +188,25 @@ class DoclingParsePageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
|
|
||||||
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(path_or_stream, document_hash)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||||
self.parser = pdf_parser()
|
self.parser = pdf_parser()
|
||||||
|
|
||||||
success = False
|
success = False
|
||||||
if isinstance(path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
success = self.parser.load_document_from_bytesio(
|
success = self.parser.load_document_from_bytesio(
|
||||||
document_hash, path_or_stream
|
self.document_hash, self.path_or_stream
|
||||||
|
)
|
||||||
|
elif isinstance(self.path_or_stream, Path):
|
||||||
|
success = self.parser.load_document(
|
||||||
|
self.document_hash, str(self.path_or_stream)
|
||||||
)
|
)
|
||||||
elif isinstance(path_or_stream, Path):
|
|
||||||
success = self.parser.load_document(document_hash, str(path_or_stream))
|
|
||||||
|
|
||||||
if not success:
|
if not success:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"docling-parse could not load document with hash {document_hash}."
|
f"docling-parse could not load document with hash {self.document_hash}."
|
||||||
)
|
)
|
||||||
|
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
|
@ -15,14 +15,15 @@ from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
|
super().__init__(in_doc, path_or_stream)
|
||||||
_log.debug("About to init HTML backend...")
|
_log.debug("About to init HTML backend...")
|
||||||
super().__init__(path_or_stream, document_hash)
|
|
||||||
self.soup = None
|
self.soup = None
|
||||||
# HTML file:
|
# HTML file:
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
@ -44,7 +45,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.soup = BeautifulSoup(html_content, "html.parser")
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Could not initialize HTML backend for file with hash {document_hash}."
|
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
|
@ -23,13 +23,14 @@ from docling.backend.abstract_backend import (
|
|||||||
PaginatedDocumentBackend,
|
PaginatedDocumentBackend,
|
||||||
)
|
)
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(path_or_stream, document_hash)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.namespaces = {
|
self.namespaces = {
|
||||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||||
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
|
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
|
||||||
@ -45,7 +46,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
self.valid = True
|
self.valid = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"MsPowerpointDocumentBackend could not load document with hash {document_hash}"
|
f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
return
|
return
|
||||||
|
@ -17,20 +17,21 @@ from lxml import etree
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
|
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.XML_KEY = (
|
self.XML_KEY = (
|
||||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||||
)
|
)
|
||||||
self.xml_namespaces = {
|
self.xml_namespaces = {
|
||||||
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
||||||
}
|
}
|
||||||
super().__init__(path_or_stream, document_hash)
|
|
||||||
# self.initialise(path_or_stream)
|
# self.initialise(path_or_stream)
|
||||||
# Word file:
|
# Word file:
|
||||||
self.path_or_stream = path_or_stream
|
self.path_or_stream = path_or_stream
|
||||||
|
@ -1,11 +1,14 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Iterable, Optional, Set
|
from io import BytesIO
|
||||||
|
from typing import Iterable, Optional, Set, Union
|
||||||
|
|
||||||
|
from docling_core.types.doc.doc_ocr import Path
|
||||||
from docling_core.types.experimental import BoundingBox, Size
|
from docling_core.types.experimental import BoundingBox, Size
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||||
from docling.datamodel.base_models import Cell, InputFormat
|
from docling.datamodel.base_models import Cell, InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
class PdfPageBackend(ABC):
|
class PdfPageBackend(ABC):
|
||||||
@ -42,6 +45,22 @@ class PdfPageBackend(ABC):
|
|||||||
|
|
||||||
|
|
||||||
class PdfDocumentBackend(PaginatedDocumentBackend):
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||||
|
|
||||||
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
|
if self.input_format is not InputFormat.PDF:
|
||||||
|
if self.input_format is InputFormat.IMAGE:
|
||||||
|
buf = BytesIO()
|
||||||
|
img = Image.open(self.path_or_stream)
|
||||||
|
img.save(buf, "PDF")
|
||||||
|
buf.seek(0)
|
||||||
|
self.path_or_stream = buf
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
|
||||||
|
)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||||
pass
|
pass
|
||||||
|
@ -232,13 +232,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|||||||
|
|
||||||
|
|
||||||
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(path_or_stream, document_hash)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||||
except PdfiumError as e:
|
except PdfiumError as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"pypdfium could not load document with hash {document_hash}"
|
f"pypdfium could not load document with hash {self.document_hash}"
|
||||||
) from e
|
) from e
|
||||||
|
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
|
@ -149,9 +149,7 @@ class InputDocument(BaseModel):
|
|||||||
f"Please check your format configuration on DocumentConverter."
|
f"Please check your format configuration on DocumentConverter."
|
||||||
)
|
)
|
||||||
|
|
||||||
self._backend = backend(
|
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentFormat(str, Enum):
|
class DocumentFormat(str, Enum):
|
||||||
|
@ -61,6 +61,11 @@ class PdfFormatOption(FormatOption):
|
|||||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
|
class ImageFormatOption(FormatOption):
|
||||||
|
pipeline_cls: Type = StandardPdfModelPipeline
|
||||||
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
_format_to_default_options = {
|
_format_to_default_options = {
|
||||||
InputFormat.DOCX: FormatOption(
|
InputFormat.DOCX: FormatOption(
|
||||||
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
||||||
|
@ -21,57 +21,34 @@ input_paths = [
|
|||||||
Path("tests/data/word_sample.docx"),
|
Path("tests/data/word_sample.docx"),
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
Path("tests/data/powerpoint_sample.pptx"),
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
|
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
# Path("tests/data/2305.03393v1-pg9-img.png"),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
## for defaults use:
|
## for defaults use:
|
||||||
# doc_converter = DocumentConverter()
|
# doc_converter = DocumentConverter()
|
||||||
|
|
||||||
## to customize use:
|
## to customize use:
|
||||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
doc_converter = (
|
||||||
formats=[
|
DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
InputFormat.PDF,
|
formats=[
|
||||||
# InputFormat.IMAGE,
|
InputFormat.PDF,
|
||||||
InputFormat.DOCX,
|
InputFormat.IMAGE,
|
||||||
InputFormat.HTML,
|
InputFormat.DOCX,
|
||||||
InputFormat.PPTX,
|
InputFormat.HTML,
|
||||||
], # whitelist formats, other files are ignored.
|
InputFormat.PPTX,
|
||||||
format_options={
|
], # whitelist formats, non-matching files are ignored.
|
||||||
InputFormat.PDF: PdfFormatOption(
|
format_options={
|
||||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
InputFormat.PDF: PdfFormatOption(
|
||||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||||
InputFormat.DOCX: WordFormatOption(
|
),
|
||||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
InputFormat.DOCX: WordFormatOption(
|
||||||
),
|
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||||
# InputFormat.IMAGE: PdfFormatOption(),
|
),
|
||||||
},
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
|
||||||
pdf=None,
|
|
||||||
docx=WordFormatOption(
|
|
||||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
|
||||||
),
|
|
||||||
formats=[
|
|
||||||
InputFormat.PDF,
|
|
||||||
# InputFormat.IMAGE,
|
|
||||||
InputFormat.DOCX,
|
|
||||||
InputFormat.HTML,
|
|
||||||
InputFormat.PPTX,
|
|
||||||
], # whitelist formats, other files are ignored.
|
|
||||||
format_options={
|
|
||||||
InputFormat.PDF: PdfFormatOption(
|
|
||||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
|
||||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
|
||||||
InputFormat.DOCX: WordFormatOption(
|
|
||||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
|
||||||
),
|
|
||||||
# InputFormat.IMAGE: PdfFormatOption(),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
conv_results = doc_converter.convert_all(input_paths)
|
conv_results = doc_converter.convert_all(input_paths)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
|
@ -7,6 +7,8 @@ from docling.backend.docling_parse_backend import (
|
|||||||
DoclingParseDocumentBackend,
|
DoclingParseDocumentBackend,
|
||||||
DoclingParsePageBackend,
|
DoclingParsePageBackend,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -14,10 +16,21 @@ def test_doc_path():
|
|||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend(pdf_doc):
|
||||||
|
in_doc = InputDocument(
|
||||||
|
path_or_stream=pdf_doc,
|
||||||
|
format=InputFormat.PDF,
|
||||||
|
backend=DoclingParseDocumentBackend,
|
||||||
|
)
|
||||||
|
|
||||||
|
doc_backend = in_doc._backend
|
||||||
|
return doc_backend
|
||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/redp5695.pdf")
|
pdf_doc = Path("./tests/data/redp5695.pdf")
|
||||||
|
|
||||||
doc_backend = DoclingParseDocumentBackend(pdf_doc, "123456xyz")
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
for page_index in range(0, doc_backend.page_count()):
|
||||||
last_cell_count = None
|
last_cell_count = None
|
||||||
@ -36,7 +49,7 @@ def test_text_cell_counts():
|
|||||||
|
|
||||||
|
|
||||||
def test_get_text_from_rect(test_doc_path):
|
def test_get_text_from_rect(test_doc_path):
|
||||||
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
|
doc_backend = _get_backend(test_doc_path)
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Get the title text of the DocLayNet paper
|
# Get the title text of the DocLayNet paper
|
||||||
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_crop_page_image(test_doc_path):
|
def test_crop_page_image(test_doc_path):
|
||||||
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
|
doc_backend = _get_backend(test_doc_path)
|
||||||
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
# Crop out "Figure 1" from the DocLayNet paper
|
||||||
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_num_pages(test_doc_path):
|
def test_num_pages(test_doc_path):
|
||||||
doc_backend = DoclingParseDocumentBackend(test_doc_path, "123456xyz")
|
doc_backend = _get_backend(test_doc_path)
|
||||||
doc_backend.page_count() == 9
|
doc_backend.page_count() == 9
|
||||||
|
@ -7,6 +7,8 @@ from docling.backend.pypdfium2_backend import (
|
|||||||
PyPdfiumDocumentBackend,
|
PyPdfiumDocumentBackend,
|
||||||
PyPdfiumPageBackend,
|
PyPdfiumPageBackend,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -14,10 +16,21 @@ def test_doc_path():
|
|||||||
return Path("./tests/data/2206.01062.pdf")
|
return Path("./tests/data/2206.01062.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_backend(pdf_doc):
|
||||||
|
in_doc = InputDocument(
|
||||||
|
path_or_stream=pdf_doc,
|
||||||
|
format=InputFormat.PDF,
|
||||||
|
backend=PyPdfiumDocumentBackend,
|
||||||
|
)
|
||||||
|
|
||||||
|
doc_backend = in_doc._backend
|
||||||
|
return doc_backend
|
||||||
|
|
||||||
|
|
||||||
def test_text_cell_counts():
|
def test_text_cell_counts():
|
||||||
pdf_doc = Path("./tests/data/redp5695.pdf")
|
pdf_doc = Path("./tests/data/redp5695.pdf")
|
||||||
|
|
||||||
doc_backend = PyPdfiumDocumentBackend(pdf_doc, "123456xyz")
|
doc_backend = _get_backend(pdf_doc)
|
||||||
|
|
||||||
for page_index in range(0, doc_backend.page_count()):
|
for page_index in range(0, doc_backend.page_count()):
|
||||||
last_cell_count = None
|
last_cell_count = None
|
||||||
@ -36,7 +49,7 @@ def test_text_cell_counts():
|
|||||||
|
|
||||||
|
|
||||||
def test_get_text_from_rect(test_doc_path):
|
def test_get_text_from_rect(test_doc_path):
|
||||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
|
doc_backend = _get_backend(test_doc_path)
|
||||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Get the title text of the DocLayNet paper
|
# Get the title text of the DocLayNet paper
|
||||||
@ -49,7 +62,7 @@ def test_get_text_from_rect(test_doc_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_crop_page_image(test_doc_path):
|
def test_crop_page_image(test_doc_path):
|
||||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
|
doc_backend = _get_backend(test_doc_path)
|
||||||
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
# Crop out "Figure 1" from the DocLayNet paper
|
||||||
@ -60,5 +73,5 @@ def test_crop_page_image(test_doc_path):
|
|||||||
|
|
||||||
|
|
||||||
def test_num_pages(test_doc_path):
|
def test_num_pages(test_doc_path):
|
||||||
doc_backend = PyPdfiumDocumentBackend(test_doc_path, "123456xyz")
|
doc_backend = _get_backend(test_doc_path)
|
||||||
doc_backend.page_count() == 9
|
doc_backend.page_count() == 9
|
||||||
|
Loading…
Reference in New Issue
Block a user