diff --git a/README.md b/README.md index 2fd199cd..431afe17 100644 --- a/README.md +++ b/README.md @@ -67,11 +67,12 @@ pip install docling ### Convert a single document To convert invidual PDF documents, use `convert_single()`, for example: + ```python -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL -converter = DocumentConverter() +converter = PdfDocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## Docling Technical Report[...]" print(result.render_as_doctags()) # output: "<page_1><loc_20>..." diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index d3c1a7dd..87d586a4 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -1,13 +1,11 @@ from abc import ABC, abstractmethod from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Iterable, Optional, Union +from typing import Set, Union -from docling_core.types.experimental import BoundingBox, Size -from PIL import Image +from docling_core.types.experimental import DoclingDocument -if TYPE_CHECKING: - from docling.datamodel.base_models import Cell +from docling.datamodel.base_models import InputFormat class AbstractDocumentBackend(ABC): @@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC): def is_valid(self) -> bool: pass + @classmethod + @abstractmethod + def is_paginated(cls) -> bool: + pass + @abstractmethod def unload(self): if isinstance(self.path_or_stream, BytesIO): @@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC): self.path_or_stream = None - -class PdfPageBackend(ABC): - + @classmethod @abstractmethod - def get_text_in_rect(self, bbox: "BoundingBox") -> str: - pass - - @abstractmethod - def get_text_cells(self) -> Iterable["Cell"]: - pass - - @abstractmethod - def get_bitmap_rects(self, float: int = 1) -> 
Iterable["BoundingBox"]: - pass - - @abstractmethod - def get_page_image( - self, scale: float = 1, cropbox: Optional["BoundingBox"] = None - ) -> Image.Image: - pass - - @abstractmethod - def get_size(self) -> "Size": - pass - - @abstractmethod - def is_valid(self) -> bool: - pass - - @abstractmethod - def unload(self): + def supported_formats(cls) -> Set[InputFormat]: pass -class PdfDocumentBackend(AbstractDocumentBackend): - @abstractmethod - def load_page(self, page_no: int) -> PdfPageBackend: - pass +class DeclarativeDocumentBackend(AbstractDocumentBackend): + """DeclarativeDocumentBackend. + + A declarative document backend is a backend that can transform to DoclingDocument + straight without a recognition pipeline. + """ @abstractmethod - def page_count(self) -> int: + def convert(self) -> DoclingDocument: pass diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 95b687ad..7d5c3113 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser from PIL import Image, ImageDraw from pypdfium2 import PdfPage -from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend +from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Cell _log = logging.getLogger(__name__) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py new file mode 100644 index 00000000..8ecb9579 --- /dev/null +++ b/docling/backend/html_backend.py @@ -0,0 +1,40 @@ +from io import BytesIO +from pathlib import Path +from typing import Set, Union + +from docling_core.types.experimental import ( + DescriptionItem, + DocItemLabel, + DoclingDocument, +) + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat + + +class HTMLDocumentBackend(DeclarativeDocumentBackend): + 
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) + + def is_valid(self) -> bool: + return True + + def is_paginated(cls) -> bool: + False + + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + + self.path_or_stream = None + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.HTML} + + def convert(self) -> DoclingDocument: + + # access self.path_or_stream to load stuff + doc = DoclingDocument(description=DescriptionItem(), name="dummy") + doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT) + return doc diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py new file mode 100644 index 00000000..7886eb15 --- /dev/null +++ b/docling/backend/mspowerpoint_backend.py @@ -0,0 +1,38 @@ +from io import BytesIO +from pathlib import Path +from typing import Set, Union + +from docling_core.types.experimental import ( + DescriptionItem, + DocItemLabel, + DoclingDocument, +) + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat + + +class MsPowerpointDocumentBackend(DeclarativeDocumentBackend): + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) + + def is_valid(self) -> bool: + return True + + def is_paginated(cls) -> bool: + False + + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + + self.path_or_stream = None + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.PPTX} + + def convert(self) -> DoclingDocument: + doc = DoclingDocument(description=DescriptionItem(), name="dummy") + doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT) + return doc diff --git a/docling/backend/msword_backend.py 
b/docling/backend/msword_backend.py new file mode 100644 index 00000000..a1ec3891 --- /dev/null +++ b/docling/backend/msword_backend.py @@ -0,0 +1,38 @@ +from io import BytesIO +from pathlib import Path +from typing import Set, Union + +from docling_core.types.experimental import ( + DescriptionItem, + DocItemLabel, + DoclingDocument, +) + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat + + +class MsWordDocumentBackend(DeclarativeDocumentBackend): + def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): + super().__init__(path_or_stream, document_hash) + + def is_valid(self) -> bool: + return True + + def is_paginated(cls) -> bool: + False + + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + + self.path_or_stream = None + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.DOCX} + + def convert(self) -> DoclingDocument: + doc = DoclingDocument(description=DescriptionItem(), name="dummy") + doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT) + return doc diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py new file mode 100644 index 00000000..16e249ad --- /dev/null +++ b/docling/backend/pdf_backend.py @@ -0,0 +1,59 @@ +from abc import ABC, abstractmethod +from typing import Iterable, Optional, Set + +from docling_core.types.experimental import BoundingBox, Size +from PIL import Image + +from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.datamodel.base_models import Cell, InputFormat + + +class PdfPageBackend(ABC): + + @abstractmethod + def get_text_in_rect(self, bbox: "BoundingBox") -> str: + pass + + @abstractmethod + def get_text_cells(self) -> Iterable["Cell"]: + pass + + @abstractmethod + def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]: + pass + + @abstractmethod + def get_page_image( + 
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None + ) -> Image.Image: + pass + + @abstractmethod + def get_size(self) -> "Size": + pass + + @abstractmethod + def is_valid(self) -> bool: + pass + + @abstractmethod + def unload(self): + pass + + +class PdfDocumentBackend(AbstractDocumentBackend): + @abstractmethod + def load_page(self, page_no: int) -> PdfPageBackend: + pass + + @abstractmethod + def page_count(self) -> int: + pass + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return {InputFormat.PDF} + + @classmethod + def is_paginated(cls) -> bool: + return True diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index ffd497de..e7d7ae84 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -8,10 +8,10 @@ import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c from docling_core.types.experimental import BoundingBox, CoordOrigin, Size from PIL import Image, ImageDraw -from pypdfium2 import PdfPage, PdfTextPage +from pypdfium2 import PdfTextPage from pypdfium2._helpers.misc import PdfiumError -from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend +from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Cell _log = logging.getLogger(__name__) diff --git a/docling/cli/main.py b/docling/cli/main.py index 894e9ab1..f187cd7b 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import ConversionStatus, PipelineOptions +from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.document_converter 
import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") @@ -190,12 +190,12 @@ def convert( case _: raise RuntimeError(f"Unexpected backend type {backend}") - pipeline_options = PipelineOptions( + pipeline_options = PdfPipelineOptions( do_ocr=ocr, do_table_structure=True, ) pipeline_options.table_structure_options.do_cell_matching = do_cell_matching - doc_converter = DocumentConverter( + doc_converter = PdfDocumentConverter( pipeline_options=pipeline_options, pdf_backend=pdf_backend, ) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index fe7d8a15..d1f7aab9 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,8 +1,8 @@ -import copy import warnings from enum import Enum, auto from io import BytesIO -from typing import Annotated, Any, Dict, List, Optional, Tuple, Union +from pathlib import Path +from typing import Annotated, Dict, List, Optional, Union from docling_core.types.experimental import BoundingBox, Size from docling_core.types.experimental.document import BasePictureData, TableCell @@ -11,8 +11,6 @@ from PIL.Image import Image from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Self -from docling.backend.abstract_backend import PdfPageBackend - class ConversionStatus(str, Enum): PENDING = auto() @@ -30,13 +28,29 @@ class InputFormat(str, Enum): PDF = auto() +FormatToMimeType = { + InputFormat.DOCX: { + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + }, + InputFormat.PPTX: { + "application/vnd.openxmlformats-officedocument.presentationml.presentation" + }, + InputFormat.HTML: {"text/html", "application/xhtml+xml"}, + InputFormat.IMAGE: {"image/png", "image/jpeg"}, + InputFormat.PDF: {"application/pdf"}, +} 
+MimeTypeToFormat = { + mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes +} + + class DocInputType(str, Enum): PATH = auto() STREAM = auto() class DoclingComponentType(str, Enum): - PDF_BACKEND = auto() + DOCUMENT_BACKEND = auto() MODEL = auto() DOC_ASSEMBLER = auto() @@ -128,13 +142,13 @@ class Page(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) page_no: int - page_hash: Optional[str] = None + # page_hash: Optional[str] = None size: Optional[Size] = None cells: List[Cell] = [] predictions: PagePredictions = PagePredictions() assembled: Optional[AssembledUnit] = None - _backend: Optional[PdfPageBackend] = ( + _backend: Optional["PdfPageBackend"] = ( None # Internal PDF backend. By default it is cleared during assembling. ) _default_image_scale: float = 1.0 # Default image scale for external usage. @@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel): ) -class PipelineOptions(BaseModel): +class PipelineOptions(BaseModel): ... + + +class PdfPipelineOptions(PipelineOptions): + artifacts_path: Optional[Union[Path, str]] = None do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - -class AssembleOptions(BaseModel): keep_page_images: Annotated[ bool, Field( diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index fa5fa045..0802602f 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -1,9 +1,10 @@ import logging from io import BytesIO from pathlib import Path, PurePath -from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import Dict, Iterable, List, Optional, Tuple, Type, Union -from docling_core.types import BaseCell, BaseText +import filetype +from docling_core.types import BaseText from docling_core.types import Document as DsDocument from docling_core.types 
import DocumentDescription as DsDocumentDescription from docling_core.types import FileInfoObject as DsFileInfoObject @@ -19,8 +20,11 @@ from docling_core.types.experimental import ( from pydantic import BaseModel from typing_extensions import deprecated -from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend +from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.html_backend import HTMLDocumentBackend +from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend +from docling.backend.msword_backend import MsWordDocumentBackend from docling.datamodel.base_models import ( AssembledUnit, ConversionStatus, @@ -28,13 +32,14 @@ from docling.datamodel.base_models import ( ErrorItem, FigureElement, InputFormat, + MimeTypeToFormat, Page, PageElement, Table, TextElement, ) from docling.datamodel.settings import DocumentLimits -from docling.utils.utils import create_file_hash +from docling.utils.utils import create_file_hash, create_hash _log = logging.getLogger(__name__) @@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument( _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = { InputFormat.PDF: DoclingParseDocumentBackend, - InputFormat.DOCX: None, - InputFormat.PPTX: None, + InputFormat.HTML: HTMLDocumentBackend, + InputFormat.DOCX: MsWordDocumentBackend, + InputFormat.PPTX: MsPowerpointDocumentBackend, InputFormat.IMAGE: None, } @@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] class InputDocument(BaseModel): file: PurePath = None document_hash: Optional[str] = None - valid: bool = False + valid: bool = True limits: DocumentLimits = DocumentLimits() + format: Optional[InputFormat] = None filesize: Optional[int] = None - page_count: Optional[int] = None + page_count: int = 0 - _backend: PdfDocumentBackend = None # Internal PDF 
backend used + _backend: AbstractDocumentBackend = None # Internal PDF backend used def __init__( self, @@ -94,27 +101,31 @@ class InputDocument(BaseModel): filename: Optional[str] = None, limits: Optional[DocumentLimits] = None, backend: Optional[Type[AbstractDocumentBackend]] = None, + format: Optional[InputFormat] = None, ): super().__init__() - if not backend: - backend = _input_format_default_backends[InputFormat.PDF] - self.limits = limits or DocumentLimits() try: if isinstance(path_or_stream, Path): + mime = filetype.guess_mime(str(path_or_stream)) + if mime is None: + if path_or_stream.suffix == ".html": + mime = "text/html" + self.file = path_or_stream self.filesize = path_or_stream.stat().st_size if self.filesize > self.limits.max_file_size: self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = backend( - path_or_stream=path_or_stream, document_hash=self.document_hash - ) + + self._init_doc(backend, mime, path_or_stream) elif isinstance(path_or_stream, BytesIO): + mime = filetype.guess_mime(path_or_stream.read(8192)) + self.file = PurePath(filename) self.filesize = path_or_stream.getbuffer().nbytes @@ -122,15 +133,15 @@ class InputDocument(BaseModel): self.valid = False else: self.document_hash = create_file_hash(path_or_stream) - self._backend = backend( - path_or_stream=path_or_stream, document_hash=self.document_hash - ) - if self.document_hash and self._backend.page_count() > 0: - self.page_count = self._backend.page_count() + self._init_doc(backend, mime, path_or_stream) - if self.page_count <= self.limits.max_num_pages: - self.valid = True + # For paginated backends, check if the maximum page count is exceeded. 
+ if self.valid and self._backend.is_valid(): + if self._backend.is_paginated(): + self.page_count = self._backend.page_count() + if not self.page_count <= self.limits.max_num_pages: + self.valid = False except (FileNotFoundError, OSError) as e: _log.exception( @@ -144,6 +155,27 @@ class InputDocument(BaseModel): ) # raise + def _init_doc( + self, + backend: AbstractDocumentBackend, + mime: str, + path_or_stream: Union[BytesIO, Path], + ) -> None: + self.format = MimeTypeToFormat.get(mime) + if self.format is not None: + backend = backend or _input_format_default_backends.get(self.format) + if backend is None: + raise RuntimeError( + f"Could not find suitable default backend for format: {self.format}" + ) + if self.format is None or self.format not in backend.supported_formats(): + # TODO decide if to raise exception here too. + self.valid = False + else: + self._backend = backend( + path_or_stream=path_or_stream, document_hash=self.document_hash + ) + @deprecated("Use `ConversionResult` instead.") class ConvertedDocument(BaseModel): @@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel): desc = DsDocumentDescription(logs=[]) page_hashes = [ - PageReference(hash=p.page_hash, page=p.page_no + 1, model="default") + PageReference( + hash=create_hash(self.input.document_hash + ":" + str(p.page_no)), + page=p.page_no + 1, + model="default", + ) for p in self.pages ] @@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel): _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None limits: Optional[DocumentLimits] = DocumentLimits() - DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend - def docs( - self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None + self, backend: Optional[Type[AbstractDocumentBackend]] = None ) -> Iterable[InputDocument]: - pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND - for obj in self._path_or_stream_iterator: if isinstance(obj, Path): yield InputDocument( - path_or_stream=obj, 
limits=self.limits, backend=pdf_backend + path_or_stream=obj, limits=self.limits, backend=backend ) elif isinstance(obj, DocumentStream): yield InputDocument( path_or_stream=obj.stream, filename=obj.filename, limits=self.limits, - backend=pdf_backend, + backend=backend, ) @classmethod diff --git a/docling/document_converter.py b/docling/document_converter.py index e12e0e42..3494a6e3 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,81 +1,78 @@ -import functools import logging import tempfile import time -import traceback from pathlib import Path -from typing import Iterable, Optional, Type, Union +from typing import Dict, Iterable, List, Optional, Type import requests -from PIL import ImageDraw -from pydantic import AnyHttpUrl, TypeAdapter, ValidationError +from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError -from docling.backend.abstract_backend import PdfDocumentBackend -from docling.datamodel.base_models import ( - AssembledUnit, - AssembleOptions, - ConversionStatus, - DoclingComponentType, - ErrorItem, - Page, - PipelineOptions, -) +from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions from docling.datamodel.document import ( ConversionResult, DocumentConversionInput, InputDocument, ) from docling.datamodel.settings import settings -from docling.models.ds_glm_model import GlmModel -from docling.models.page_assemble_model import PageAssembleModel from docling.pipeline.base_model_pipeline import BaseModelPipeline -from docling.pipeline.standard_model_pipeline import StandardModelPipeline -from docling.utils.utils import chunkify, create_hash +from docling.pipeline.simple_model_pipeline import SimpleModelPipeline +from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline +from docling.utils.utils import chunkify _log = logging.getLogger(__name__) -class 
DocumentConverter: - _default_download_filename = "file.pdf" +class FormatOption(BaseModel): + pipeline_cls: Type[BaseModelPipeline] + pipeline_options: Optional[PipelineOptions] = None + backend: Optional[Type[AbstractDocumentBackend]] + + model_config = ConfigDict(arbitrary_types_allowed=True) def __init__( self, - artifacts_path: Optional[Union[Path, str]] = None, - pipeline_options: PipelineOptions = PipelineOptions(), - pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND, - pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline, - assemble_options: AssembleOptions = AssembleOptions(), + pipeline_cls: Type[BaseModelPipeline], + pipeline_options: Optional[PipelineOptions] = None, + backend: Optional[Type[AbstractDocumentBackend]] = None, ): - if not artifacts_path: - artifacts_path = self.download_models_hf() + if pipeline_options is None: + pipeline_options = pipeline_cls.get_default_options() - artifacts_path = Path(artifacts_path) - - self.model_pipeline = pipeline_cls( - artifacts_path=artifacts_path, pipeline_options=pipeline_options + super().__init__( + pipeline_cls=pipeline_cls, + pipeline_options=pipeline_options, + backend=backend, ) - self.page_assemble_model = PageAssembleModel(config={}) - self.glm_model = GlmModel(config={}) - self.pdf_backend = pdf_backend - self.assemble_options = assemble_options - @staticmethod - def download_models_hf( - local_dir: Optional[Path] = None, force: bool = False - ) -> Path: - from huggingface_hub import snapshot_download +_format_to_default_options = { + InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline), + InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline), + InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline), + InputFormat.IMAGE: None, + InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline), +} - download_path = snapshot_download( - repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir + +class 
DocumentConverter: + _default_download_filename = "file" + + def __init__( + self, + formats: List[InputFormat] = [e for e in InputFormat], + format_options: Dict[InputFormat, FormatOption] = _format_to_default_options, + ): + self.formats = formats + self.format_to_options = format_options + self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = ( + {} ) - return Path(download_path) - def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]: for input_batch in chunkify( - input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size + input.docs(), settings.perf.doc_batch_size # pass format_options ): _log.info(f"Going to convert document batch...") # parallel processing only within input_batch @@ -84,8 +81,8 @@ class DocumentConverter: # ) as pool: # yield from pool.map(self.process_document, input_batch) - # Note: Pdfium backend is not thread-safe, thread pool usage was disabled. - yield from map(self._process_document, input_batch) + # Note: PDF backends are not thread-safe, thread pool usage was disabled. + yield from map(self.process_document, input_batch) def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult: """Convert a single document. 
@@ -137,156 +134,42 @@ class DocumentConverter: raise RuntimeError(f"Conversion failed with status: {conv_res.status}") return conv_res - def _process_document(self, in_doc: InputDocument) -> ConversionResult: - start_doc_time = time.time() - conv_res = ConversionResult(input=in_doc) + def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]: + pipeline_class = None + fopt = self.format_to_options.get(doc.format) + if fopt is None: + return None + else: + pipeline_class = fopt.pipeline_cls - _log.info(f"Processing document {in_doc.file.name}") - - if not in_doc.valid: - conv_res.status = ConversionStatus.FAILURE - return conv_res - - for i in range(0, in_doc.page_count): - conv_res.pages.append(Page(page_no=i)) - - all_assembled_pages = [] - - try: - # Iterate batches of pages (page_batch_size) in the doc - for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size): - start_pb_time = time.time() - # Pipeline - - # 1. Initialise the page resources - init_pages = map( - functools.partial(self._initialize_page, in_doc), page_batch - ) - - # 2. Populate page image - pages_with_images = map( - functools.partial(self._populate_page_images, in_doc), init_pages - ) - - # 3. Populate programmatic page cells - pages_with_cells = map( - functools.partial(self._parse_page_cells, in_doc), - pages_with_images, - ) - - # 4. Run pipeline stages - pipeline_pages = self.model_pipeline.apply(pages_with_cells) - - # 5. 
Assemble page elements (per page) - assembled_pages = self.page_assemble_model(pipeline_pages) - - # exhaust assembled_pages - for assembled_page in assembled_pages: - # Free up mem resources before moving on with next batch - - # Remove page images (can be disabled) - if self.assemble_options.images_scale is None: - assembled_page._image_cache = {} - - # Unload backend - assembled_page._backend.unload() - - all_assembled_pages.append(assembled_page) - - end_pb_time = time.time() - start_pb_time - _log.info(f"Finished converting page batch time={end_pb_time:.3f}") - - # Free up mem resources of PDF backend - in_doc._backend.unload() - - conv_res.pages = all_assembled_pages - self._assemble_doc(conv_res) - - status = ConversionStatus.SUCCESS - for page in conv_res.pages: - if not page._backend.is_valid(): - conv_res.errors.append( - ErrorItem( - component_type=DoclingComponentType.PDF_BACKEND, - module_name=type(page._backend).__name__, - error_message=f"Page {page.page_no} failed to parse.", - ) - ) - status = ConversionStatus.PARTIAL_SUCCESS - - conv_res.status = status - - except Exception as e: - conv_res.status = ConversionStatus.FAILURE - trace = "\n".join(traceback.format_exception(e)) - _log.info( - f"Encountered an error during conversion of document {in_doc.document_hash}:\n" - f"{trace}" + if pipeline_class not in self.initialized_pipelines: + self.initialized_pipelines[pipeline_class] = pipeline_class( + pipeline_options=pipeline_class.get_default_options() ) + return self.initialized_pipelines[pipeline_class] + + def process_document(self, in_doc: InputDocument) -> ConversionResult: + start_doc_time = time.time() + + conv_res = self._execute_pipeline(in_doc) end_doc_time = time.time() - start_doc_time - _log.info( - f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}" - ) + _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.") return conv_res - # Initialise and load resources for a page, before downstream 
steps (populate images, cells, ...) - def _initialize_page(self, doc: InputDocument, page: Page) -> Page: - page._backend = doc._backend.load_page(page.page_no) - page.size = page._backend.get_size() - page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no)) + def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult: + if in_doc.valid and in_doc.format in self.formats: + pipeline = self._get_pipeline(in_doc) + if pipeline is None: # Can't find a default pipeline. Should this raise? + conv_res = ConversionResult(input=in_doc) + conv_res.status = ConversionStatus.FAILURE + return conv_res - return page + conv_res = pipeline.execute(in_doc) + else: # invalid doc or not of desired format + conv_res = ConversionResult(input=in_doc) + conv_res.status = ConversionStatus.FAILURE + # TODO add error log why it failed. - # Generate the page image and store it in the page object - def _populate_page_images(self, doc: InputDocument, page: Page) -> Page: - # default scale - page.get_image( - scale=1.0 - ) # puts the page image on the image cache at default scale - - # user requested scales - if self.assemble_options.images_scale is not None: - page._default_image_scale = self.assemble_options.images_scale - page.get_image( - scale=self.assemble_options.images_scale - ) # this will trigger storing the image in the internal cache - - return page - - # Extract and populate the page cells and store it in the page object - def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page: - page.cells = page._backend.get_text_cells() - - # DEBUG code: - def draw_text_boxes(image, cells): - draw = ImageDraw.Draw(image) - for c in cells: - x0, y0, x1, y1 = c.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline="red") - image.show() - - # draw_text_boxes(page.get_image(scale=1.0), cells) - - return page - - def _assemble_doc(self, conv_res: ConversionResult): - all_elements = [] - all_headers = [] - all_body = [] - - for p in conv_res.pages: - 
- for el in p.assembled.body: - all_body.append(el) - for el in p.assembled.headers: - all_headers.append(el) - for el in p.assembled.elements: - all_elements.append(el) - - conv_res.assembled = AssembledUnit( - elements=all_elements, headers=all_headers, body=all_body - ) - - conv_res.output, conv_res.experimental = self.glm_model(conv_res) + return conv_res diff --git a/docling/models/abstract_model.py b/docling/models/abstract_model.py new file mode 100644 index 00000000..ba5dc62c --- /dev/null +++ b/docling/models/abstract_model.py @@ -0,0 +1,10 @@ +from abc import ABC, abstractmethod +from typing import Iterable + +from docling.datamodel.base_models import Page + + +class AbstractPageModel(ABC): + @abstractmethod + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + pass diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index a0777363..ea0feb82 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -3,7 +3,6 @@ import logging from abc import abstractmethod from typing import Iterable, List, Tuple -import numpy import numpy as np from docling_core.types.experimental import BoundingBox, CoordOrigin from PIL import Image, ImageDraw @@ -11,11 +10,12 @@ from rtree import index from scipy.ndimage import find_objects, label from docling.datamodel.base_models import OcrCell, Page +from docling.models.abstract_model import AbstractPageModel _log = logging.getLogger(__name__) -class BaseOcrModel: +class BaseOcrModel(AbstractPageModel): def __init__(self, config): self.config = config self.enabled = config["enabled"] diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 5ef39043..362ba88b 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -16,12 +16,13 @@ from docling.datamodel.base_models import ( LayoutPrediction, Page, ) +from docling.models.abstract_model import AbstractPageModel from docling.utils import layout_utils as lu 
_log = logging.getLogger(__name__) -class LayoutModel: +class LayoutModel(AbstractPageModel): TEXT_ELEM_LABELS = [ DocItemLabel.TEXT, diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index b3444d1d..28f93c12 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -10,12 +10,13 @@ from docling.datamodel.base_models import ( Table, TextElement, ) +from docling.models.abstract_model import AbstractPageModel from docling.models.layout_model import LayoutModel _log = logging.getLogger(__name__) -class PageAssembleModel: +class PageAssembleModel(AbstractPageModel): def __init__(self, config): self.config = config @@ -145,4 +146,11 @@ class PageAssembleModel: elements=elements, headers=headers, body=body ) + # Remove page images (can be disabled) + if self.config["images_scale"] is None: + page._image_cache = {} + + # Unload backend + page._backend.unload() + yield page diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py new file mode 100644 index 00000000..3683123c --- /dev/null +++ b/docling/models/page_preprocessing_model.py @@ -0,0 +1,50 @@ +from typing import Iterable + +from PIL import ImageDraw + +from docling.datamodel.base_models import Page +from docling.models.abstract_model import AbstractPageModel + + +class PagePreprocessingModel(AbstractPageModel): + def __init__(self, config): + self.config = config + + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + for page in page_batch: + page = self._populate_page_images(page) + page = self._parse_page_cells(page) + yield page + + # Generate the page image and store it in the page object + def _populate_page_images(self, page: Page) -> Page: + # default scale + page.get_image( + scale=1.0 + ) # puts the page image on the image cache at default scale + + images_scale = self.config["images_scale"] + # user requested scales + if images_scale is not None: + 
page._default_image_scale = images_scale + page.get_image( + scale=images_scale + ) # this will trigger storing the image in the internal cache + + return page + + # Extract and populate the page cells and store it in the page object + def _parse_page_cells(self, page: Page) -> Page: + page.cells = page._backend.get_text_cells() + + # DEBUG code: + def draw_text_boxes(image, cells): + draw = ImageDraw.Draw(image) + for c in cells: + x0, y0, x1, y1 = c.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="red") + image.show() + + # draw_text_boxes(page.get_image(scale=1.0), cells) + + return page diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index b2098004..b2fb6e58 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic from PIL import ImageDraw from docling.datamodel.base_models import Page, Table, TableStructurePrediction +from docling.models.abstract_model import AbstractPageModel -class TableStructureModel: +class TableStructureModel(AbstractPageModel): def __init__(self, config): self.config = config self.do_cell_matching = config["do_cell_matching"] diff --git a/docling/pipeline/base_model_pipeline.py b/docling/pipeline/base_model_pipeline.py index 4fdde951..b4898148 100644 --- a/docling/pipeline/base_model_pipeline.py +++ b/docling/pipeline/base_model_pipeline.py @@ -1,17 +1,117 @@ -from pathlib import Path +import functools +import logging +import time +import traceback +from abc import ABC, abstractmethod from typing import Callable, Iterable, List -from docling.datamodel.base_models import Page, PipelineOptions +from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.datamodel.base_models import ( + ConversionStatus, + DoclingComponentType, + ErrorItem, + Page, + PipelineOptions, +) +from docling.datamodel.document import 
ConversionResult, InputDocument +from docling.datamodel.settings import settings +from docling.utils.utils import chunkify + +_log = logging.getLogger(__name__) -class BaseModelPipeline: - def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): - self.model_pipe: List[Callable] = [] - self.artifacts_path = artifacts_path +class BaseModelPipeline(ABC): + def __init__(self, pipeline_options: PipelineOptions): self.pipeline_options = pipeline_options + self.model_pipe: List[Callable] = [] - def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]: + @abstractmethod + def execute(self, in_doc: InputDocument) -> ConversionResult: + pass + + @abstractmethod + def assemble_document( + self, in_doc: InputDocument, conv_res: ConversionResult + ) -> ConversionResult: + pass + + @classmethod + @abstractmethod + def get_default_options(cls) -> PipelineOptions: + pass + + @classmethod + @abstractmethod + def is_backend_supported(cls, backend: AbstractDocumentBackend): + pass + + +class PaginatedModelPipeline(BaseModelPipeline): + + def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: for model in self.model_pipe: page_batch = model(page_batch) yield from page_batch + + def execute(self, in_doc: InputDocument) -> ConversionResult: + conv_res = ConversionResult(input=in_doc) + + _log.info(f"Processing document {in_doc.file.name}") + + for i in range(0, in_doc.page_count): + conv_res.pages.append(Page(page_no=i)) + + try: + # Iterate batches of pages (page_batch_size) in the doc + for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size): + start_pb_time = time.time() + + # 1. Initialise the page resources + init_pages = map( + functools.partial(self.initialize_page, in_doc), page_batch + ) + + # 2. 
Run pipeline stages + pipeline_pages = self.apply_on_pages(init_pages) + + for p in pipeline_pages: + pass + + end_pb_time = time.time() - start_pb_time + _log.info(f"Finished converting page batch time={end_pb_time:.3f}") + + # Free up mem resources of PDF backend + in_doc._backend.unload() + + conv_res = self.assemble_document(in_doc, conv_res) + + status = ConversionStatus.SUCCESS + for page in conv_res.pages: + if not page._backend.is_valid(): + conv_res.errors.append( + ErrorItem( + component_type=DoclingComponentType.DOCUMENT_BACKEND, + module_name=type(page._backend).__name__, + error_message=f"Page {page.page_no} failed to parse.", + ) + ) + status = ConversionStatus.PARTIAL_SUCCESS + + conv_res.status = status + + except Exception as e: + conv_res.status = ConversionStatus.FAILURE + trace = "\n".join(traceback.format_exception(e)) + _log.info( + f"Encountered an error during conversion of document {in_doc.document_hash}:\n" + f"{trace}" + ) + raise e + + return conv_res + + # Initialise and load resources for a page + @abstractmethod + def initialize_page(self, doc: InputDocument, page: Page) -> Page: + pass diff --git a/docling/pipeline/simple_model_pipeline.py b/docling/pipeline/simple_model_pipeline.py new file mode 100644 index 00000000..efb7439b --- /dev/null +++ b/docling/pipeline/simple_model_pipeline.py @@ -0,0 +1,57 @@ +import logging + +from docling.backend.abstract_backend import ( + AbstractDocumentBackend, + DeclarativeDocumentBackend, +) +from docling.datamodel.base_models import ( + ConversionStatus, + PdfPipelineOptions, + PipelineOptions, +) +from docling.datamodel.document import ConversionResult, InputDocument +from docling.pipeline.base_model_pipeline import BaseModelPipeline + +_log = logging.getLogger(__name__) + + +class SimpleModelPipeline(BaseModelPipeline): + + def __init__(self, pipeline_options: PdfPipelineOptions): + super().__init__(pipeline_options) + + def execute(self, in_doc: InputDocument) -> ConversionResult: + conv_res 
= ConversionResult(input=in_doc) + + _log.info(f"Processing document {in_doc.file.name}") + + if not in_doc.valid: + conv_res.status = ConversionStatus.FAILURE + return conv_res + + if not isinstance(in_doc._backend, DeclarativeDocumentBackend): + conv_res.status = ConversionStatus.FAILURE + return conv_res + + conv_res.experimental = in_doc._backend.convert() + + # Do other stuff with conv_res.experimental + + conv_res = self.assemble_document(in_doc, conv_res) + + conv_res.status = ConversionStatus.SUCCESS + + return conv_res + + def assemble_document( + self, in_doc: InputDocument, conv_res: ConversionResult + ) -> ConversionResult: + return conv_res + + @classmethod + def get_default_options(cls) -> PipelineOptions: + return PipelineOptions() + + @classmethod + def is_backend_supported(cls, backend: AbstractDocumentBackend): + return isinstance(backend, DeclarativeDocumentBackend) diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py deleted file mode 100644 index 1abf59a3..00000000 --- a/docling/pipeline/standard_model_pipeline.py +++ /dev/null @@ -1,38 +0,0 @@ -from pathlib import Path - -from docling.datamodel.base_models import PipelineOptions -from docling.models.easyocr_model import EasyOcrModel -from docling.models.layout_model import LayoutModel -from docling.models.table_structure_model import TableStructureModel -from docling.pipeline.base_model_pipeline import BaseModelPipeline - - -class StandardModelPipeline(BaseModelPipeline): - _layout_model_path = "model_artifacts/layout/beehive_v0.0.5" - _table_model_path = "model_artifacts/tableformer" - - def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): - super().__init__(artifacts_path, pipeline_options) - - self.model_pipe = [ - EasyOcrModel( - config={ - "lang": ["fr", "de", "es", "en"], - "enabled": pipeline_options.do_ocr, - } - ), - LayoutModel( - config={ - "artifacts_path": artifacts_path - / 
StandardModelPipeline._layout_model_path - } - ), - TableStructureModel( - config={ - "artifacts_path": artifacts_path - / StandardModelPipeline._table_model_path, - "enabled": pipeline_options.do_table_structure, - "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, - } - ), - ] diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py new file mode 100644 index 00000000..4001149d --- /dev/null +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -0,0 +1,108 @@ +import logging +from pathlib import Path +from typing import Optional + +from docling.backend.abstract_backend import AbstractDocumentBackend +from docling.backend.pdf_backend import PdfDocumentBackend +from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions +from docling.datamodel.document import ConversionResult, InputDocument +from docling.models.ds_glm_model import GlmModel +from docling.models.easyocr_model import EasyOcrModel +from docling.models.layout_model import LayoutModel +from docling.models.page_assemble_model import PageAssembleModel +from docling.models.page_preprocessing_model import PagePreprocessingModel +from docling.models.table_structure_model import TableStructureModel +from docling.pipeline.base_model_pipeline import PaginatedModelPipeline + +_log = logging.getLogger(__name__) + + +class StandardPdfModelPipeline(PaginatedModelPipeline): + _layout_model_path = "model_artifacts/layout/beehive_v0.0.5" + _table_model_path = "model_artifacts/tableformer" + + def __init__(self, pipeline_options: PdfPipelineOptions): + super().__init__(pipeline_options) + + if not pipeline_options.artifacts_path: + artifacts_path = self.download_models_hf() + + self.artifacts_path = Path(artifacts_path) + self.glm_model = GlmModel(config={}) + + self.model_pipe = [ + PagePreprocessingModel( + config={"images_scale": pipeline_options.images_scale} + ), + EasyOcrModel( + config={ + "lang": ["fr", 
"de", "es", "en"], + "enabled": pipeline_options.do_ocr, + } + ), + LayoutModel( + config={ + "artifacts_path": artifacts_path + / StandardPdfModelPipeline._layout_model_path + } + ), + TableStructureModel( + config={ + "artifacts_path": artifacts_path + / StandardPdfModelPipeline._table_model_path, + "enabled": pipeline_options.do_table_structure, + "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching, + } + ), + PageAssembleModel(config={"images_scale": pipeline_options.images_scale}), + ] + + @staticmethod + def download_models_hf( + local_dir: Optional[Path] = None, force: bool = False + ) -> Path: + from huggingface_hub import snapshot_download + + download_path = snapshot_download( + repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir + ) + + return Path(download_path) + + def initialize_page(self, doc: InputDocument, page: Page) -> Page: + page._backend = doc._backend.load_page(page.page_no) + page.size = page._backend.get_size() + + return page + + def assemble_document( + self, in_doc: InputDocument, conv_res: ConversionResult + ) -> ConversionResult: + all_elements = [] + all_headers = [] + all_body = [] + + for p in conv_res.pages: + + for el in p.assembled.body: + all_body.append(el) + for el in p.assembled.headers: + all_headers.append(el) + for el in p.assembled.elements: + all_elements.append(el) + + conv_res.assembled = AssembledUnit( + elements=all_elements, headers=all_headers, body=all_body + ) + + conv_res.output, conv_res.experimental = self.glm_model(conv_res) + + return conv_res + + @classmethod + def get_default_options(cls) -> PdfPipelineOptions: + return PdfPipelineOptions() + + @classmethod + def is_backend_supported(cls, backend: AbstractDocumentBackend): + return isinstance(backend, PdfDocumentBackend) diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 286623dd..871f6039 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -6,9 +6,9 @@ from typing 
import Iterable import yaml -from docling.datamodel.base_models import ConversionStatus, PipelineOptions +from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter _log = logging.getLogger(__name__) @@ -107,7 +107,11 @@ def main(): # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # input = DocumentConversionInput.from_streams(docs) - doc_converter = DocumentConverter() + doc_converter = PdfDocumentConverter( + pipeline_options=PdfPipelineOptions(), + pdf_backend=DocumentConversionInput.DEFAULT_BACKEND, + pipeline_cls=StandardModelPipeline, + ) input = DocumentConversionInput.from_paths(input_doc_paths) diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 6f0b8f8f..8ee6acc8 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -6,9 +6,9 @@ from typing import Iterable from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import ConversionStatus, PipelineOptions +from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter _log = logging.getLogger(__name__) @@ -93,12 +93,12 @@ def main(): # Docling Parse without OCR # ------------------------- - pipeline_options = PipelineOptions() + pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - doc_converter = DocumentConverter( + doc_converter = PdfDocumentConverter( 
pipeline_options=pipeline_options, pdf_backend=DoclingParseDocumentBackend, ) diff --git a/examples/export_figures.py b/examples/export_figures.py index 60d70156..8d80e7de 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -4,14 +4,14 @@ from pathlib import Path from typing import Tuple from docling.datamodel.base_models import ( - AssembleOptions, ConversionStatus, FigureElement, PageElement, + PdfPipelineOptions, Table, ) from docling.datamodel.document import DocumentConversionInput -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter _log = logging.getLogger(__name__) @@ -30,12 +30,12 @@ def main(): # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. - # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. + # This is done by setting PipelineOptions.images_scale, which also defines the scale of images. 
# scale=1 correspond of a standard 72 DPI image - assemble_options = AssembleOptions() - assemble_options.images_scale = IMAGE_RESOLUTION_SCALE + pipeline_options = PdfPipelineOptions() + pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE - doc_converter = DocumentConverter(assemble_options=assemble_options) + doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options) start_time = time.time() diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py index 7c016b19..2ac33466 100644 --- a/examples/export_multimodal.py +++ b/examples/export_multimodal.py @@ -5,9 +5,9 @@ from pathlib import Path import pandas as pd -from docling.datamodel.base_models import AssembleOptions, ConversionStatus +from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions from docling.datamodel.document import DocumentConversionInput -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter from docling.utils.export import generate_multimodal_pages _log = logging.getLogger(__name__) @@ -27,12 +27,12 @@ def main(): # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them for cleaning up memory. - # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. + # This is done by setting PipelineOptions.images_scale, which also defines the scale of images. 
# scale=1 correspond of a standard 72 DPI image - assemble_options = AssembleOptions() - assemble_options.images_scale = IMAGE_RESOLUTION_SCALE + pipeline_options = PdfPipelineOptions() + pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE - doc_converter = DocumentConverter(assemble_options=assemble_options) + doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options) start_time = time.time() diff --git a/examples/export_tables.py b/examples/export_tables.py index a0c605c1..59fd2723 100644 --- a/examples/export_tables.py +++ b/examples/export_tables.py @@ -7,7 +7,7 @@ import pandas as pd from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import DocumentConversionInput -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter _log = logging.getLogger(__name__) @@ -22,7 +22,7 @@ def main(): input_files = DocumentConversionInput.from_paths(input_doc_paths) - doc_converter = DocumentConverter() + doc_converter = PdfDocumentConverter() start_time = time.time() diff --git a/examples/minimal.py b/examples/minimal.py index 837db718..4102f7e2 100644 --- a/examples/minimal.py +++ b/examples/minimal.py @@ -1,6 +1,6 @@ -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL -converter = DocumentConverter() +converter = PdfDocumentConverter() doc = converter.convert_single(source) print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]" diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py new file mode 100644 index 00000000..df76ebe7 --- /dev/null +++ b/examples/run_with_formats.py @@ -0,0 +1,41 @@ +from pathlib import Path + +from docling.backend.msword_backend import MsWordDocumentBackend +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from 
docling.datamodel.base_models import ( + InputFormat, + PdfPipelineOptions, + PipelineOptions, +) +from docling.datamodel.document import DocumentConversionInput +from docling.document_converter import DocumentConverter, FormatOption +from docling.pipeline.simple_model_pipeline import SimpleModelPipeline +from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline + +input_paths = [ + Path("tests/data/wiki_duck.html"), + Path("tests/data/word_sample.docx"), + Path("tests/data/powerpoint_sample.pptx"), + Path("tests/data/2206.01062.pdf"), +] +input = DocumentConversionInput.from_paths(input_paths) + +# for defaults use: +doc_converter = DocumentConverter() + +# to customize use: +# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. +# formats=[InputFormat.PDF, InputFormat.DOCX], +# format_options={ +# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend), +# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend) +# } +# ) + +conv_results = doc_converter.convert(input) + +for res in conv_results: + print( + f"Document {res.input.file.name} converted with status {res.status}. 
Content:" + ) + print(res.experimental.export_to_markdown()) diff --git a/tests/data/powerpoint_sample.pptx b/tests/data/powerpoint_sample.pptx new file mode 100644 index 00000000..f54963e2 Binary files /dev/null and b/tests/data/powerpoint_sample.pptx differ diff --git a/tests/data/wiki_duck.html b/tests/data/wiki_duck.html new file mode 100644 index 00000000..7d0896e7 --- /dev/null +++ b/tests/data/wiki_duck.html @@ -0,0 +1,1311 @@ +<!DOCTYPE html> +<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr"> +<head> +<meta charset="UTF-8"> +<title>Duck - Wikipedia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Jump to content +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +

Duck

+ +
+ + +
+ +
+ + + +
+ +
+
+
+
+
+
+ +
+
+ + + +
+
+
+
+
+ + +
+
+
+
+
+
Page semi-protected
+
+ +
From Wikipedia, the free encyclopedia
+
+
(Redirected from Duckling)
+ + +
+ + +

+ +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Duck +
+
Bufflehead
(Bucephala albeola) +
Scientific classification Edit this classification +
Domain: +Eukaryota +
Kingdom: +Animalia +
Phylum: +Chordata +
Class: +Aves +
Order: +Anseriformes +
Superfamily: +Anatoidea +
Family: +Anatidae +
Subfamilies +
+

See text +

+
+

Duck is the common name for numerous species of waterfowl in the family Anatidae. Ducks are generally smaller and shorter-necked than swans and geese, which are members of the same family. Divided among several subfamilies, they are a form taxon; they do not represent a monophyletic group (the group of all descendants of a single common ancestral species), since swans and geese are not considered ducks. Ducks are mostly aquatic birds, and may be found in both fresh water and sea water. +

Ducks are sometimes confused with several types of unrelated water birds with similar forms, such as loons or divers, grebes, gallinules and coots. +

+ +

Etymology

+

The word duck comes from Old English dūce 'diver', a derivative of the verb *dūcan 'to duck, bend down low as if to get under something, or dive', because of the way many species in the dabbling duck group feed by upending; compare with Dutch duiken and German tauchen 'to dive'. +

+
Pacific black duck displaying the characteristic upending "duck"
+

This word replaced Old English ened/ænid 'duck', possibly to avoid confusion with other words, such as ende 'end', with similar forms. Other Germanic languages still have similar words for duck, for example, Dutch eend, German Ente and Norwegian and. The word ened/ænid was inherited from Proto-Indo-European; cf. Latin anas "duck", Lithuanian ántis 'duck', Ancient Greek νῆσσα/νῆττα (nēssa/nētta) 'duck', and Sanskrit ātí 'water bird', among others.

A duckling is a young duck in downy plumage[1] or baby duck,[2] but in the food trade a young domestic duck which has just reached adult size and bulk, and whose meat is still fully tender, is sometimes labelled as a duckling. +

A male is called a drake and the female is called a duck, or in ornithology a hen.[3][4] +

+
Male mallard.
+
Wood ducks.
+

Taxonomy

+

All ducks belong to the biological order Anseriformes, a group that contains the ducks, geese and swans, as well as the screamers, and the magpie goose.[5] All except the screamers belong to the biological family Anatidae.[5] Within the family, ducks are split into a variety of subfamilies and 'tribes'. The number and composition of these subfamilies and tribes is the cause of considerable disagreement among taxonomists.[5] Some base their decisions on morphological characteristics, others on shared behaviours or genetic studies.[6][7] The number of suggested subfamilies containing ducks ranges from two to five.[8][9] The significant level of hybridisation that occurs among wild ducks complicates efforts to tease apart the relationships between various species.[9] +

+
Mallard landing in approach
+

In most modern classifications, the so-called 'true ducks' belong to the subfamily Anatinae, which is further split into a varying number of tribes.[10] The largest of these, the Anatini, contains the 'dabbling' or 'river' ducks – named for their method of feeding primarily at the surface of fresh water.[11] The 'diving ducks', also named for their primary feeding method, make up the tribe Aythyini.[12] The 'sea ducks' of the tribe Mergini are diving ducks which specialise on fish and shellfish and spend a majority of their lives in saltwater.[13] The tribe Oxyurini contains the 'stifftails', diving ducks notable for their small size and stiff, upright tails.[14] +

A number of other species called ducks are not considered to be 'true ducks', and are typically placed in other subfamilies or tribes. The whistling ducks are assigned either to a tribe (Dendrocygnini) in the subfamily Anatinae or the subfamily Anserinae,[15] or to their own subfamily (Dendrocygninae) or family (Dendrocygnidae).[9][16] The freckled duck of Australia is either the sole member of the tribe Stictonettini in the subfamily Anserinae,[15] or in its own subfamily, the Stictonettinae.[9] The shelducks make up the tribe Tadornini in the subfamily Anserinae in some classifications,[15] and their own subfamily, Tadorninae, in others,[17] while the steamer ducks are either placed in the subfamily Anserinae in the tribe Tachyerini[15] or lumped with the shelducks in the tribe Tadornini.[9] The perching ducks make up the tribe Cairinini in the subfamily Anserinae in some classifications, while that tribe is eliminated in other classifications and its members assigned to the tribe Anatini.[9] The torrent duck is generally included in the subfamily Anserinae in the monotypic tribe Merganettini,[15] but is sometimes included in the tribe Tadornini.[18] The pink-eared duck is sometimes included as a true duck either in the tribe Anatini[15] or the tribe Malacorhynchini,[19] and other times is included with the shelducks in the tribe Tadornini.[15] +

+

Morphology

+
Male Mandarin duck
+

The overall body plan of ducks is elongated and broad, and they are also relatively long-necked, albeit not as long-necked as the geese and swans. The body shape of diving ducks varies somewhat from this in being more rounded. The bill is usually broad and contains serrated pectens, which are particularly well defined in the filter-feeding species. In the case of some fishing species the bill is long and strongly serrated. The scaled legs are strong and well developed, and generally set far back on the body, more so in the highly aquatic species. The wings are very strong and are generally short and pointed, and the flight of ducks requires fast continuous strokes, requiring in turn strong wing muscles. Three species of steamer duck are almost flightless, however. Many species of duck are temporarily flightless while moulting; they seek out protected habitat with good food supplies during this period. This moult typically precedes migration. +

The drakes of northern species often have extravagant plumage, but that is moulted in summer to give a more female-like appearance, the "eclipse" plumage. Southern resident species typically show less sexual dimorphism, although there are exceptions such as the paradise shelduck of New Zealand, which is strikingly sexually dimorphic and in which the female's plumage is brighter than that of the male. The plumage of juvenile birds generally resembles that of the female. Female ducks have evolved to have a corkscrew-shaped vagina to prevent rape. +

+

Distribution and habitat

+ +
Flying steamer ducks in Ushuaia, Argentina
+

Ducks have a cosmopolitan distribution, and are found on every continent except Antarctica.[5] Several species manage to live on subantarctic islands, including South Georgia and the Auckland Islands.[20] Ducks have reached a number of isolated oceanic islands, including the Hawaiian Islands, Micronesia and the Galápagos Islands, where they are often vagrants and less often residents.[21][22] A handful are endemic to such far-flung islands.[21] +

+
A brown duck in a fast-flowing stream
Female mallard in Cornwall, England
+

Some duck species, mainly those breeding in the temperate and Arctic Northern Hemisphere, are migratory; those in the tropics are generally not. Some ducks, particularly in Australia where rainfall is erratic, are nomadic, seeking out the temporary lakes and pools that form after localised heavy rain.[23] +

+

Behaviour

+

Feeding

+
Pecten along the bill
+
Mallard duckling preening
+

Ducks eat food sources such as grasses, aquatic plants, fish, insects, small amphibians, worms, and small molluscs. +

Dabbling ducks feed on the surface of water or on land, or as deep as they can reach by up-ending without completely submerging.[24] Along the edge of the bill, there is a comb-like structure called a pecten. This strains the water squirting from the side of the bill and traps any food. The pecten is also used to preen feathers and to hold slippery food items. +

Diving ducks and sea ducks forage deep underwater. To be able to submerge more easily, the diving ducks are heavier than dabbling ducks, and therefore have more difficulty taking off to fly. +

A few specialized species such as the mergansers are adapted to catch and swallow large fish. +

The others have the characteristic wide flat bill adapted to dredging-type jobs such as pulling up waterweed, pulling worms and small molluscs out of mud, searching for insect larvae, and bulk jobs such as dredging out, holding, turning head first, and swallowing a squirming frog. To avoid injury when digging into sediment it has no cere, but the nostrils come out through hard horn. +

The Guardian published an article advising that ducks should not be fed with bread because it damages the health of the ducks and pollutes waterways.[25] +

+

Breeding

+
A Muscovy duckling
+

Ducks generally only have one partner at a time, although the partnership usually only lasts one year.[26] Larger species and the more sedentary species (like fast-river specialists) tend to have pair-bonds that last numerous years.[27] Most duck species breed once a year, choosing to do so in favourable conditions (spring/summer or wet seasons). Ducks also tend to make a nest before breeding, and, after hatching, lead their ducklings to water. Mother ducks are very caring and protective of their young, but may abandon some of their ducklings if they are physically stuck in an area they cannot get out of (such as nesting in an enclosed courtyard) or are not prospering due to genetic defects or sickness brought about by hypothermia, starvation, or disease. Ducklings can also be orphaned by inconsistent late hatching where a few eggs hatch after the mother has abandoned the nest and led her ducklings to water.[28] +

+

Communication

+

Female mallard ducks (as well as several other species in the genus Anas, such as the American and Pacific black ducks, spot-billed duck, northern pintail and common teal) make the classic "quack" sound while males make a similar but raspier sound that is sometimes written as "breeeeze",[29][self-published source?] but, despite widespread misconceptions, most species of duck do not "quack".[30] In general, ducks make a range of calls, including whistles, cooing, yodels and grunts. For example, the scaup – which are diving ducks – make a noise like "scaup" (hence their name). Calls may be loud displaying calls or quieter contact calls. +

A common urban legend claims that duck quacks do not echo; however, this has been proven to be false. This myth was first debunked by the Acoustics Research Centre at the University of Salford in 2003 as part of the British Association's Festival of Science.[31] It was also debunked in one of the earlier episodes of the popular Discovery Channel television show MythBusters.[32] +

+

Predators

+
Ringed teal
+

Ducks have many predators. Ducklings are particularly vulnerable, since their inability to fly makes them easy prey not only for predatory birds but also for large fish like pike, crocodilians, predatory testudines such as the alligator snapping turtle, and other aquatic hunters, including fish-eating birds such as herons. Ducks' nests are raided by land-based predators, and brooding females may be caught unaware on the nest by mammals, such as foxes, or large birds, such as hawks or owls. +

Adult ducks are fast fliers, but may be caught on the water by large aquatic predators including big fish such as the North American muskie and the European pike. In flight, ducks are safe from all but a few predators such as humans and the peregrine falcon, which uses its speed and strength to catch ducks. +

+

Relationship with humans

+

Hunting

+ +

Humans have hunted ducks since prehistoric times. Excavations of middens in California dating to 7800 – 6400 BP have turned up bones of ducks, including at least one now-extinct flightless species.[33] Ducks were captured in "significant numbers" by Holocene inhabitants of the lower Ohio River valley, suggesting they took advantage of the seasonal bounty provided by migrating waterfowl.[34] Neolithic hunters in locations as far apart as the Caribbean,[35] Scandinavia,[36] Egypt,[37] Switzerland,[38] and China relied on ducks as a source of protein for some or all of the year.[39] Archeological evidence shows that Māori people in New Zealand hunted the flightless Finsch's duck, possibly to extinction, though rat predation may also have contributed to its fate.[40] A similar end awaited the Chatham duck, a species with reduced flying capabilities which went extinct shortly after its island was colonised by Polynesian settlers.[41] It is probable that duck eggs were gathered by Neolithic hunter-gatherers as well, though hard evidence of this is uncommon.[35][42] +

In many areas, wild ducks (including ducks farmed and released into the wild) are hunted for food or sport,[43] by shooting, or by being trapped using duck decoys. Because an idle floating duck or a duck squatting on land cannot react to fly or move quickly, "a sitting duck" has come to mean "an easy target". These ducks may be contaminated by pollutants such as PCBs.[44] +

+

Domestication

+ +
Indian Runner ducks, a common breed of domestic ducks
+

Ducks have many economic uses, being farmed for their meat, eggs, and feathers (particularly their down). Approximately 3 billion ducks are slaughtered each year for meat worldwide.[45] They are also kept and bred by aviculturists and often displayed in zoos. Almost all the varieties of domestic ducks are descended from the mallard (Anas platyrhynchos), apart from the Muscovy duck (Cairina moschata).[46][47] The Call duck is another example of a domestic duck breed. Its name comes from its original use established by hunters, as a decoy to attract wild mallards from the sky, into traps set for them on the ground. The call duck is the world's smallest domestic duck breed, as it weighs less than 1 kg (2.2 lb).[48] +

+

Heraldry

+
Three black-colored ducks in the coat of arms of Maaninka[49]
+

Ducks appear on several coats of arms, including the coat of arms of Lubāna (Latvia)[50] and the coat of arms of Föglö (Åland).[51] +

+

Cultural references

+

In 2002, psychologist Richard Wiseman and colleagues at the University of Hertfordshire, UK, finished a year-long LaughLab experiment, concluding that of all animals, ducks attract the most humor and silliness; he said, "If you're going to tell a joke involving an animal, make it a duck."[52] The word "duck" may have become an inherently funny word in many languages, possibly because ducks are seen as silly in their looks or behavior. Of the many ducks in fiction, many are cartoon characters, such as Walt Disney's Donald Duck, and Warner Bros.' Daffy Duck. Howard the Duck started as a comic book character in 1973[53][54] and was made into a movie in 1986. +

The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck as the mascot for the fictional youth hockey team who are protagonists of the movie, based on the duck being described as a fierce fighter. This led to the duck becoming the nickname and mascot for the eventual National Hockey League professional team of the Anaheim Ducks, who were founded with the name the Mighty Ducks of Anaheim.[citation needed] The duck is also the nickname of the University of Oregon sports teams as well as the Long Island Ducks minor league baseball team.[55] +

+

See also

+ + +
+

Notes

+

Citations

+
+
    +
  1. ^ "Duckling". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22. +
  2. +
  3. ^ "Duckling". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22. +
  4. +
  5. ^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139. +
  6. +
  7. ^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566. +
  8. +
  9. ^ a b c d Carboneras 1992, p. 536. +
  10. +
  11. ^ Livezey 1986, pp. 737–738. +
  12. +
  13. ^ Madsen, McHugh & de Kloet 1988, p. 452. +
  14. +
  15. ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354. +
  16. +
  17. ^ a b c d e f Carboneras 1992, p. 540. +
  18. +
  19. ^ Elphick, Dunning & Sibley 2001, p. 191. +
  20. +
  21. ^ Kear 2005, p. 448. +
  22. +
  23. ^ Kear 2005, pp. 622–623. +
  24. +
  25. ^ Kear 2005, p. 686. +
  26. +
  27. ^ Elphick, Dunning & Sibley 2001, p. 193. +
  28. +
  29. ^ a b c d e f g Carboneras 1992, p. 537. +
  30. +
  31. ^ American Ornithologists' Union 1998, p. xix. +
  32. +
  33. ^ American Ornithologists' Union 1998. +
  34. +
  35. ^ Carboneras 1992, p. 538. +
  36. +
  37. ^ Christidis & Boles 2008, p. 62. +
  38. +
  39. ^ Shirihai 2008, pp. 239, 245. +
  40. +
  41. ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107. +
  42. +
  43. ^ Fitter, Fitter & Hosking 2000, pp. 52–3. +
  44. +
  45. ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27. +
  46. +
  47. ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02. +
  48. +
  49. ^ Karl Mathiesen (16 March 2015). "Don't feed the ducks bread, say conservationists". The Guardian. Retrieved 13 November 2016. +
  50. +
  51. ^ Rohwer, Frank C.; Anderson, Michael G. (1988). "Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9. +
  52. +
  53. ^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). "Long-Term Pair Bonds in Harlequin Ducks". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797. +
  54. +
  55. ^ "If You Find An Orphaned Duckling - Wildlife Rehabber". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22. +
  56. +
  57. ^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source] +
  58. +
  59. ^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707. +
  60. +
  61. ^ Amos, Jonathan (2003-09-08). "Sound science is quackers". BBC News. Retrieved 2006-11-02. +
  62. +
  63. ^ "Mythbusters Episode 8". 12 December 2003. +
  64. +
  65. ^ Erlandson 1994, p. 171. +
  66. +
  67. ^ Jeffries 2008, pp. 168, 243. +
  68. +
  69. ^ a b Sued-Badillo 2003, p. 65. +
  70. +
  71. ^ Thorpe 1996, p. 68. +
  72. +
  73. ^ Maisels 1999, p. 42. +
  74. +
  75. ^ Rau 1876, p. 133. +
  76. +
  77. ^ Higman 2012, p. 23. +
  78. +
  79. ^ Hume 2012, p. 53. +
  80. +
  81. ^ Hume 2012, p. 52. +
  82. +
  83. ^ Fieldhouse 2002, p. 167. +
  84. +
  85. ^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774. +
  86. +
  87. ^ "Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019. +
  88. +
  89. ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25. +
  90. +
  91. ^ "Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin". Digimorph.org. Retrieved 2012-12-23. +
  92. +
  93. ^ Sy Montgomery. "Mallard; Encyclopædia Britannica". Britannica.com. Retrieved 2012-12-23. +
  94. +
  95. ^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. p. 135. ISBN 978-1-908843-15-9. +
  96. +
  97. ^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3. +
  98. +
  99. ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021. +
  100. +
  101. ^ "Föglö" (in Swedish). Retrieved September 9, 2021. +
  102. +
  103. ^ Young, Emma. "World's funniest joke revealed". New Scientist. Retrieved 7 January 2019. +
  104. +
  105. ^ "Howard the Duck (character)". Grand Comics Database. +
  106. +
  107. ^ Sanderson, Peter; Gilbert, Laura (2008). "1970s". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck. +
  108. +
  109. ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20. +
  110. +
+

Sources

+
+ +
+
+ + + + + + + +
+
+ +
+
+ +
+ +
+
+
+ +
+ + + \ No newline at end of file diff --git a/tests/data/word_sample.docx b/tests/data/word_sample.docx new file mode 100644 index 00000000..62c403fe Binary files /dev/null and b/tests/data/word_sample.docx differ diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index a4ecff16..37ed42a9 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -2,9 +2,9 @@ from pathlib import Path from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import PipelineOptions +from docling.datamodel.base_models import PdfPipelineOptions from docling.datamodel.document import ConversionResult -from docling.document_converter import DocumentConverter +from docling.pdf_document_converter import PdfDocumentConverter from .verify_utils import verify_conversion_result @@ -23,12 +23,12 @@ def get_pdf_paths(): def get_converter(): - pipeline_options = PipelineOptions() + pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - converter = DocumentConverter( + converter = PdfDocumentConverter( pipeline_options=pipeline_options, pdf_backend=DoclingParseDocumentBackend, ) diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 9475bcef..55f8bb3b 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -5,9 +5,9 @@ import pytest from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend -from docling.datamodel.base_models import DocumentStream, PipelineOptions +from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.document_converter import DocumentConverter +from 
docling.pdf_document_converter import PdfDocumentConverter from .verify_utils import verify_conversion_result @@ -21,12 +21,12 @@ def get_pdf_path(): @pytest.fixture def converter(): - pipeline_options = PipelineOptions() + pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - converter = DocumentConverter( + converter = PdfDocumentConverter( pipeline_options=pipeline_options, pdf_backend=DoclingParseDocumentBackend, ) @@ -34,7 +34,7 @@ def converter(): return converter -def test_convert_single(converter: DocumentConverter): +def test_convert_single(converter: PdfDocumentConverter): pdf_path = get_pdf_path() print(f"converting {pdf_path}") @@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter): verify_conversion_result(input_path=pdf_path, doc_result=doc_result) -def test_batch_path(converter: DocumentConverter): +def test_batch_path(converter: PdfDocumentConverter): pdf_path = get_pdf_path() print(f"converting {pdf_path}") @@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter): verify_conversion_result(input_path=pdf_path, doc_result=doc_result) -def test_batch_bytes(converter: DocumentConverter): +def test_batch_bytes(converter: PdfDocumentConverter): pdf_path = get_pdf_path() print(f"converting {pdf_path}")