mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 07:08:19 +00:00
@@ -8,7 +8,6 @@ import yaml
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -31,19 +30,23 @@ def export_documents(
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.json").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.doctags").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
|
||||
if USE_EXPERIMENTAL:
|
||||
|
||||
@@ -5,7 +5,6 @@ from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
@@ -28,19 +27,23 @@ def export_documents(
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.json").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
|
||||
with (output_dir / f"{doc_filename}.doctags").open(
|
||||
"w", encoding="utf-8"
|
||||
) as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
|
||||
else:
|
||||
@@ -82,7 +85,7 @@ def main():
|
||||
# PyPdfium with OCR
|
||||
# -----------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=False
|
||||
# pipeline_options.do_ocr=True
|
||||
# pipeline_options.do_table_structure=True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FigureElement,
|
||||
PageElement,
|
||||
PdfPipelineOptions,
|
||||
Table,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
@@ -1,22 +1,8 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
InputFormat,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter, FormatOption
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user