mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Lots of improvements
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
c0447206af
commit
203cf19b1b
@ -98,21 +98,18 @@ class InputDocument(BaseModel):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
|
format: InputFormat,
|
||||||
|
backend: AbstractDocumentBackend,
|
||||||
filename: Optional[str] = None,
|
filename: Optional[str] = None,
|
||||||
limits: Optional[DocumentLimits] = None,
|
limits: Optional[DocumentLimits] = None,
|
||||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
|
||||||
format: Optional[InputFormat] = None,
|
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.limits = limits or DocumentLimits()
|
self.limits = limits or DocumentLimits()
|
||||||
|
self.format = format
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(path_or_stream, Path):
|
if isinstance(path_or_stream, Path):
|
||||||
mime = filetype.guess_mime(str(path_or_stream))
|
|
||||||
if mime is None:
|
|
||||||
if path_or_stream.suffix == ".html":
|
|
||||||
mime = "text/html"
|
|
||||||
|
|
||||||
self.file = path_or_stream
|
self.file = path_or_stream
|
||||||
self.filesize = path_or_stream.stat().st_size
|
self.filesize = path_or_stream.stat().st_size
|
||||||
@ -121,11 +118,9 @@ class InputDocument(BaseModel):
|
|||||||
else:
|
else:
|
||||||
self.document_hash = create_file_hash(path_or_stream)
|
self.document_hash = create_file_hash(path_or_stream)
|
||||||
|
|
||||||
self._init_doc(backend, mime, path_or_stream)
|
self._init_doc(backend, path_or_stream)
|
||||||
|
|
||||||
elif isinstance(path_or_stream, BytesIO):
|
elif isinstance(path_or_stream, BytesIO):
|
||||||
mime = filetype.guess_mime(path_or_stream.read(8192))
|
|
||||||
|
|
||||||
self.file = PurePath(filename)
|
self.file = PurePath(filename)
|
||||||
self.filesize = path_or_stream.getbuffer().nbytes
|
self.filesize = path_or_stream.getbuffer().nbytes
|
||||||
|
|
||||||
@ -134,7 +129,7 @@ class InputDocument(BaseModel):
|
|||||||
else:
|
else:
|
||||||
self.document_hash = create_file_hash(path_or_stream)
|
self.document_hash = create_file_hash(path_or_stream)
|
||||||
|
|
||||||
self._init_doc(backend, mime, path_or_stream)
|
self._init_doc(backend, path_or_stream)
|
||||||
|
|
||||||
# For paginated backends, check if the maximum page count is exceeded.
|
# For paginated backends, check if the maximum page count is exceeded.
|
||||||
if self.valid and self._backend.is_valid():
|
if self.valid and self._backend.is_valid():
|
||||||
@ -158,23 +153,19 @@ class InputDocument(BaseModel):
|
|||||||
def _init_doc(
|
def _init_doc(
|
||||||
self,
|
self,
|
||||||
backend: AbstractDocumentBackend,
|
backend: AbstractDocumentBackend,
|
||||||
mime: str,
|
|
||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
) -> None:
|
) -> None:
|
||||||
self.format = MimeTypeToFormat.get(mime)
|
if backend is None:
|
||||||
if self.format is not None:
|
backend = _input_format_default_backends.get(self.format)
|
||||||
backend = backend or _input_format_default_backends.get(self.format)
|
|
||||||
if backend is None:
|
if backend is None:
|
||||||
|
self.valid = False
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Could not find suitable default backend for format: {self.format}"
|
f"Could not find suitable backend for file: {self.file}"
|
||||||
)
|
)
|
||||||
if self.format is None or self.format not in backend.supported_formats():
|
|
||||||
# TODO decide if to raise exception here too.
|
self._backend = backend(
|
||||||
self.valid = False
|
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||||
else:
|
)
|
||||||
self._backend = backend(
|
|
||||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@deprecated("Use `ConversionResult` instead.")
|
@deprecated("Use `ConversionResult` instead.")
|
||||||
@ -478,17 +469,46 @@ class DocumentConversionInput(BaseModel):
|
|||||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||||
|
|
||||||
def docs(
|
def docs(
|
||||||
self, backend: Optional[Type[AbstractDocumentBackend]] = None
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
|
|
||||||
for obj in self._path_or_stream_iterator:
|
for obj in self._path_or_stream_iterator:
|
||||||
if isinstance(obj, Path):
|
if isinstance(obj, Path):
|
||||||
|
|
||||||
|
mime = filetype.guess_mime(str(obj))
|
||||||
|
if mime is None:
|
||||||
|
if obj.suffix == ".html":
|
||||||
|
mime = "text/html"
|
||||||
|
|
||||||
|
format = MimeTypeToFormat.get(mime)
|
||||||
|
if format not in format_options.keys():
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
backend = format_options.get(format).backend
|
||||||
|
|
||||||
yield InputDocument(
|
yield InputDocument(
|
||||||
path_or_stream=obj, limits=self.limits, backend=backend
|
path_or_stream=obj,
|
||||||
|
format=format,
|
||||||
|
limits=self.limits,
|
||||||
|
backend=backend,
|
||||||
)
|
)
|
||||||
elif isinstance(obj, DocumentStream):
|
elif isinstance(obj, DocumentStream):
|
||||||
|
mime = filetype.guess_mime(obj.stream.read(8192))
|
||||||
|
obj.stream.seek(0)
|
||||||
|
|
||||||
|
if mime is None:
|
||||||
|
if obj.suffix == ".html":
|
||||||
|
mime = "text/html"
|
||||||
|
|
||||||
|
format = MimeTypeToFormat.get(mime)
|
||||||
|
if format not in format_options.keys():
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
backend = format_options.get(format).backend
|
||||||
|
|
||||||
yield InputDocument(
|
yield InputDocument(
|
||||||
path_or_stream=obj.stream,
|
path_or_stream=obj.stream,
|
||||||
|
format=format,
|
||||||
filename=obj.filename,
|
filename=obj.filename,
|
||||||
limits=self.limits,
|
limits=self.limits,
|
||||||
backend=backend,
|
backend=backend,
|
||||||
|
@ -8,6 +8,7 @@ import requests
|
|||||||
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
@ -27,7 +28,7 @@ _log = logging.getLogger(__name__)
|
|||||||
class FormatOption(BaseModel):
|
class FormatOption(BaseModel):
|
||||||
pipeline_cls: Type[BaseModelPipeline]
|
pipeline_cls: Type[BaseModelPipeline]
|
||||||
pipeline_options: Optional[PipelineOptions] = None
|
pipeline_options: Optional[PipelineOptions] = None
|
||||||
backend: Optional[Type[AbstractDocumentBackend]]
|
backend: Optional[Type[AbstractDocumentBackend]] = None
|
||||||
|
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
@ -47,11 +48,29 @@ class FormatOption(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class PdfFormatOption(FormatOption):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
|
||||||
|
pipeline_options: Optional[PipelineOptions] = None,
|
||||||
|
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||||
|
):
|
||||||
|
if pipeline_cls is None:
|
||||||
|
pipeline_cls = StandardPdfModelPipeline
|
||||||
|
if backend is None:
|
||||||
|
backend = DoclingParseDocumentBackend
|
||||||
|
super().__init__(
|
||||||
|
pipeline_cls=pipeline_cls,
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
backend=backend,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_format_to_default_options = {
|
_format_to_default_options = {
|
||||||
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||||
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||||
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
|
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||||
InputFormat.IMAGE: None,
|
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||||
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -61,11 +80,26 @@ class DocumentConverter:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
formats: List[InputFormat] = [e for e in InputFormat],
|
formats: Optional[List[InputFormat]] = None,
|
||||||
format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||||
):
|
):
|
||||||
self.formats = formats
|
self.formats = formats
|
||||||
self.format_to_options = format_options
|
self.format_to_options = format_options
|
||||||
|
|
||||||
|
if self.formats is None:
|
||||||
|
if self.format_to_options is not None:
|
||||||
|
self.formats = self.format_to_options.keys()
|
||||||
|
else:
|
||||||
|
self.formats = [e for e in InputFormat] # all formats
|
||||||
|
|
||||||
|
if self.format_to_options is None:
|
||||||
|
self.format_to_options = _format_to_default_options
|
||||||
|
|
||||||
|
for f in self.formats:
|
||||||
|
if f not in self.format_to_options.keys():
|
||||||
|
_log.info(f"Requested format {f} will use default options.")
|
||||||
|
self.format_to_options[f] = _format_to_default_options[f]
|
||||||
|
|
||||||
self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
|
self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
|
||||||
{}
|
{}
|
||||||
)
|
)
|
||||||
@ -73,7 +107,8 @@ class DocumentConverter:
|
|||||||
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
||||||
|
|
||||||
for input_batch in chunkify(
|
for input_batch in chunkify(
|
||||||
input.docs(), settings.perf.doc_batch_size # pass format_options
|
input.docs(self.format_to_options),
|
||||||
|
settings.perf.doc_batch_size, # pass format_options
|
||||||
):
|
):
|
||||||
_log.info(f"Going to convert document batch...")
|
_log.info(f"Going to convert document batch...")
|
||||||
# parallel processing only within input_batch
|
# parallel processing only within input_batch
|
||||||
@ -83,7 +118,9 @@ class DocumentConverter:
|
|||||||
# yield from pool.map(self.process_document, input_batch)
|
# yield from pool.map(self.process_document, input_batch)
|
||||||
|
|
||||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||||
yield from map(self.process_document, input_batch)
|
for item in map(self.process_document, input_batch):
|
||||||
|
if item is not None:
|
||||||
|
yield item
|
||||||
|
|
||||||
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
||||||
"""Convert a single document.
|
"""Convert a single document.
|
||||||
@ -136,31 +173,40 @@ class DocumentConverter:
|
|||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
|
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
|
||||||
pipeline_class = None
|
|
||||||
fopt = self.format_to_options.get(doc.format)
|
fopt = self.format_to_options.get(doc.format)
|
||||||
|
|
||||||
if fopt is None:
|
if fopt is None:
|
||||||
return None
|
raise RuntimeError(f"Could not get pipeline for document {doc.file}")
|
||||||
else:
|
else:
|
||||||
pipeline_class = fopt.pipeline_cls
|
pipeline_class = fopt.pipeline_cls
|
||||||
|
pipeline_options = fopt.pipeline_options
|
||||||
|
|
||||||
if pipeline_class not in self.initialized_pipelines:
|
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||||
|
if (
|
||||||
|
pipeline_class not in self.initialized_pipelines
|
||||||
|
or self.initialized_pipelines[pipeline_class].pipeline_options
|
||||||
|
!= pipeline_options
|
||||||
|
):
|
||||||
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
||||||
pipeline_options=pipeline_class.get_default_options()
|
pipeline_options=pipeline_options
|
||||||
)
|
)
|
||||||
return self.initialized_pipelines[pipeline_class]
|
return self.initialized_pipelines[pipeline_class]
|
||||||
|
|
||||||
def process_document(self, in_doc: InputDocument) -> ConversionResult:
|
def process_document(self, in_doc: InputDocument) -> ConversionResult:
|
||||||
start_doc_time = time.time()
|
if in_doc.format not in self.formats:
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
start_doc_time = time.time()
|
||||||
|
|
||||||
conv_res = self._execute_pipeline(in_doc)
|
conv_res = self._execute_pipeline(in_doc)
|
||||||
|
|
||||||
end_doc_time = time.time() - start_doc_time
|
end_doc_time = time.time() - start_doc_time
|
||||||
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
|
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
|
def _execute_pipeline(self, in_doc: InputDocument) -> Optional[ConversionResult]:
|
||||||
if in_doc.valid and in_doc.format in self.formats:
|
if in_doc.valid:
|
||||||
pipeline = self._get_pipeline(in_doc)
|
pipeline = self._get_pipeline(in_doc)
|
||||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(input=in_doc)
|
||||||
@ -168,7 +214,9 @@ class DocumentConverter:
|
|||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
conv_res = pipeline.execute(in_doc)
|
conv_res = pipeline.execute(in_doc)
|
||||||
else: # invalid doc or not of desired format
|
|
||||||
|
else:
|
||||||
|
# invalid doc or not of desired format
|
||||||
conv_res = ConversionResult(input=in_doc)
|
conv_res = ConversionResult(input=in_doc)
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
conv_res.status = ConversionStatus.FAILURE
|
||||||
# TODO add error log why it failed.
|
# TODO add error log why it failed.
|
||||||
|
@ -109,7 +109,7 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
|||||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
||||||
f"{trace}"
|
f"{trace}"
|
||||||
)
|
)
|
||||||
raise e
|
raise e # TODO Debug, should not be here.
|
||||||
finally:
|
finally:
|
||||||
# Always unload the PDF backend, even in case of failure
|
# Always unload the PDF backend, even in case of failure
|
||||||
if in_doc._backend:
|
if in_doc._backend:
|
||||||
|
@ -6,8 +6,9 @@ from typing import Iterable
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -110,11 +111,7 @@ def main():
|
|||||||
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||||
# input = DocumentConversionInput.from_streams(docs)
|
# input = DocumentConversionInput.from_streams(docs)
|
||||||
|
|
||||||
doc_converter = PdfDocumentConverter(
|
doc_converter = DocumentConverter()
|
||||||
pipeline_options=PdfPipelineOptions(),
|
|
||||||
pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
|
|
||||||
pipeline_cls=StandardModelPipeline,
|
|
||||||
)
|
|
||||||
|
|
||||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||||
|
|
||||||
|
@ -4,10 +4,11 @@ import time
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.pdf_document_converter import PdfDocumentConverter
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, FormatOption
|
||||||
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -101,9 +102,12 @@ def main():
|
|||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
|
||||||
doc_converter = PdfDocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
format_options={
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
InputFormat.PDF: FormatOption(
|
||||||
|
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
|
||||||
|
)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Docling Parse with OCR
|
# Docling Parse with OCR
|
||||||
|
@ -5,11 +5,12 @@ from pathlib import Path
|
|||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FigureElement,
|
FigureElement,
|
||||||
PdfPipelineOptions,
|
InputFormat,
|
||||||
Table,
|
Table,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
from docling.datamodel.document import DocumentConversionInput
|
||||||
from docling.pdf_document_converter import PdfDocumentConverter
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -28,12 +29,16 @@ def main():
|
|||||||
|
|
||||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||||
# will destroy them for cleaning up memory.
|
# will destroy them for cleaning up memory.
|
||||||
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
|
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||||
# scale=1 correspond of a standard 72 DPI image
|
# scale=1 correspond of a standard 72 DPI image
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||||
|
|
||||||
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
|
doc_converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
@ -5,9 +5,10 @@ from pathlib import Path
|
|||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
from docling.datamodel.document import DocumentConversionInput
|
||||||
from docling.pdf_document_converter import PdfDocumentConverter
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.utils.export import generate_multimodal_pages
|
from docling.utils.export import generate_multimodal_pages
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -27,12 +28,16 @@ def main():
|
|||||||
|
|
||||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||||
# will destroy them for cleaning up memory.
|
# will destroy them for cleaning up memory.
|
||||||
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
|
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||||
# scale=1 correspond of a standard 72 DPI image
|
# scale=1 correspond of a standard 72 DPI image
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||||
|
|
||||||
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
|
doc_converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import pandas as pd
|
|||||||
|
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
from docling.datamodel.document import DocumentConversionInput
|
||||||
from docling.pdf_document_converter import PdfDocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -21,7 +21,7 @@ def main():
|
|||||||
|
|
||||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||||
|
|
||||||
doc_converter = PdfDocumentConverter()
|
doc_converter = DocumentConverter()
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from docling.pdf_document_converter import PdfDocumentConverter
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||||
converter = PdfDocumentConverter()
|
converter = DocumentConverter()
|
||||||
doc = converter.convert_single(source)
|
doc = converter.convert_single(source)
|
||||||
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"
|
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"
|
||||||
|
@ -1,8 +1,13 @@
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
from docling.datamodel.document import DocumentConversionInput
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
|
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||||
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -13,6 +18,7 @@ input_paths = [
|
|||||||
Path("tests/data/word_sample.docx"),
|
Path("tests/data/word_sample.docx"),
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
Path("tests/data/powerpoint_sample.pptx"),
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
]
|
]
|
||||||
input = DocumentConversionInput.from_paths(input_paths)
|
input = DocumentConversionInput.from_paths(input_paths)
|
||||||
@ -21,22 +27,29 @@ input = DocumentConversionInput.from_paths(input_paths)
|
|||||||
doc_converter = DocumentConverter()
|
doc_converter = DocumentConverter()
|
||||||
|
|
||||||
# to customize use:
|
# to customize use:
|
||||||
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
# formats=[InputFormat.PDF, InputFormat.DOCX],
|
# formats=[
|
||||||
|
# InputFormat.PDF,
|
||||||
|
# InputFormat.DOCX,
|
||||||
|
# ], # whitelist formats, other files are ignored.
|
||||||
# format_options={
|
# format_options={
|
||||||
# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
|
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
|
||||||
# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
|
# InputFormat.DOCX: FormatOption(
|
||||||
# }
|
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
|
||||||
|
# ),
|
||||||
|
# # InputFormat.IMAGE: PdfFormatOption(),
|
||||||
|
# },
|
||||||
# )
|
# )
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert(input)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
print("")
|
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
||||||
print(
|
print(
|
||||||
f"Document {res.input.file.name} converted with status {res.status}. Content:"
|
f"Document {res.input.file.name} converted with status {res.status}."
|
||||||
|
f"\nSaved markdown output to: {str(out_path)}"
|
||||||
)
|
)
|
||||||
print(res.experimental.export_to_markdown())
|
# print(res.experimental.export_to_markdown())
|
||||||
# Export Docling document format to markdown (experimental):
|
# Export Docling document format to markdown (experimental):
|
||||||
with (Path("./scratch") / f"{res.input.file.name}.experimental.md").open("w") as fp:
|
with out_path.open("w") as fp:
|
||||||
fp.write(res.experimental.export_to_markdown())
|
fp.write(res.experimental.export_to_markdown())
|
||||||
|
Loading…
Reference in New Issue
Block a user