Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-01 16:27:22 +02:00
parent cd06d89c2a
commit 1fa7cd9855
34 changed files with 2102 additions and 365 deletions

View File

@@ -6,9 +6,9 @@ from typing import Iterable
import yaml
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@@ -107,7 +107,11 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()
doc_converter = PdfDocumentConverter(
pipeline_options=PdfPipelineOptions(),
pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
pipeline_cls=StandardModelPipeline,
)
input = DocumentConversionInput.from_paths(input_doc_paths)

View File

@@ -6,9 +6,9 @@ from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@@ -93,12 +93,12 @@ def main():
# Docling Parse without OCR
# -------------------------
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)

View File

@@ -4,14 +4,14 @@ from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
PdfPipelineOptions,
Table,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@@ -30,12 +30,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()

View File

@@ -5,9 +5,9 @@ from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
from docling.utils.export import generate_multimodal_pages
_log = logging.getLogger(__name__)
@@ -27,12 +27,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()

View File

@@ -7,7 +7,7 @@ import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@@ -22,7 +22,7 @@ def main():
input_files = DocumentConversionInput.from_paths(input_doc_paths)
doc_converter = DocumentConverter()
doc_converter = PdfDocumentConverter()
start_time = time.time()

View File

@@ -1,6 +1,6 @@
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
converter = PdfDocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown()) # output: "## Docling Technical Report [...]"

View File

@@ -0,0 +1,41 @@
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
# Example: convert a mixed batch of documents with one DocumentConverter and
# print the Markdown export of each result.

# Sample inputs, one per file type used in this example: HTML, DOCX, PPTX, PDF.
input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]
# Named `input_files` (not `input`) so the builtin `input()` is not shadowed.
input_files = DocumentConversionInput.from_paths(input_paths)

# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[InputFormat.PDF, InputFormat.DOCX],
#     format_options={
#         InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
#         InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
#     }
# )

conv_results = doc_converter.convert(input_files)

# For every conversion result, report its status and then dump the Markdown
# produced by the (experimental) export.
for res in conv_results:
    print(
        f"Document {res.input.file.name} converted with status {res.status}. Content:"
    )
    print(res.experimental.export_to_markdown())