mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
Lots of improvements
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -6,8 +6,9 @@ from typing import Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -110,11 +111,7 @@ def main():
|
||||
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||
# input = DocumentConversionInput.from_streams(docs)
|
||||
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=PdfPipelineOptions(),
|
||||
pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
|
||||
pipeline_cls=StandardModelPipeline,
|
||||
)
|
||||
doc_converter = DocumentConverter()
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
|
||||
@@ -4,10 +4,11 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, FormatOption
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -101,9 +102,12 @@ def main():
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
# Docling Parse with OCR
|
||||
|
||||
@@ -5,11 +5,12 @@ from pathlib import Path
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FigureElement,
|
||||
PdfPipelineOptions,
|
||||
InputFormat,
|
||||
Table,
|
||||
)
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -28,12 +29,16 @@ def main():
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
|
||||
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
|
||||
# scale=1 corresponds to a standard 72 DPI image
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
|
||||
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
@@ -5,9 +5,10 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.utils.export import generate_multimodal_pages
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -27,12 +28,16 @@ def main():
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
|
||||
# This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
|
||||
# scale=1 corresponds to a standard 72 DPI image
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
|
||||
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -21,7 +21,7 @@ def main():
|
||||
|
||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
doc_converter = PdfDocumentConverter()
|
||||
doc_converter = DocumentConverter()
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = PdfDocumentConverter()
|
||||
converter = DocumentConverter()
|
||||
doc = converter.convert_single(source)
|
||||
print(doc.render_as_markdown()) # output: "## Docling Technical Report [...]"
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -13,6 +18,7 @@ input_paths = [
|
||||
Path("tests/data/word_sample.docx"),
|
||||
Path("tests/data/lorem_ipsum.docx"),
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
]
|
||||
input = DocumentConversionInput.from_paths(input_paths)
|
||||
@@ -21,22 +27,29 @@ input = DocumentConversionInput.from_paths(input_paths)
|
||||
doc_converter = DocumentConverter()
|
||||
|
||||
# to customize use:
|
||||
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
# formats=[InputFormat.PDF, InputFormat.DOCX],
|
||||
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
# formats=[
|
||||
# InputFormat.PDF,
|
||||
# InputFormat.DOCX,
|
||||
# ], # whitelist formats, other files are ignored.
|
||||
# format_options={
|
||||
# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
|
||||
# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
|
||||
# }
|
||||
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
|
||||
# InputFormat.DOCX: FormatOption(
|
||||
# pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
||||
# ),
|
||||
# # InputFormat.IMAGE: PdfFormatOption(),
|
||||
# },
|
||||
# )
|
||||
|
||||
conv_results = doc_converter.convert(input)
|
||||
|
||||
for res in conv_results:
|
||||
print("")
|
||||
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
||||
print(
|
||||
f"Document {res.input.file.name} converted with status {res.status}. Content:"
|
||||
f"Document {res.input.file.name} converted with status {res.status}."
|
||||
f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
print(res.experimental.export_to_markdown())
|
||||
# print(res.experimental.export_to_markdown())
|
||||
# Export Docling document format to markdown (experimental):
|
||||
with (Path("./scratch") / f"{res.input.file.name}.experimental.md").open("w") as fp:
|
||||
with out_path.open("w") as fp:
|
||||
fp.write(res.experimental.export_to_markdown())
|
||||
|
||||
Reference in New Issue
Block a user