mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-16 08:38:14 +00:00
@@ -8,6 +8,7 @@ from pathlib import Path
|
||||
from typing import Annotated, Iterable, List, Optional
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
@@ -109,11 +110,11 @@ def export_documents(
|
||||
@app.command(no_args_is_help=True)
|
||||
def convert(
|
||||
input_sources: Annotated[
|
||||
List[Path],
|
||||
List[str],
|
||||
typer.Argument(
|
||||
...,
|
||||
metavar="source",
|
||||
help="PDF files to convert. Directories are also accepted.",
|
||||
help="PDF files to convert. Can be local file / directory paths or URL.",
|
||||
),
|
||||
],
|
||||
export_json: Annotated[
|
||||
@@ -167,7 +168,8 @@ def convert(
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths: List[Path] = []
|
||||
for source in input_sources:
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
|
||||
@@ -373,20 +373,30 @@ class ConvertedDocument(BaseModel):
|
||||
"table",
|
||||
"figure",
|
||||
],
|
||||
page_tagging: bool = True,
|
||||
location_tagging: bool = True,
|
||||
location_dimensions: Tuple[int, int] = (100, 100),
|
||||
add_new_line: bool = True,
|
||||
xsize: int = 100,
|
||||
ysize: int = 100,
|
||||
add_location: bool = True,
|
||||
add_content: bool = True,
|
||||
add_page_index: bool = True,
|
||||
# table specific flags
|
||||
add_table_cell_location: bool = False,
|
||||
add_table_cell_label: bool = True,
|
||||
add_table_cell_text: bool = True,
|
||||
) -> str:
|
||||
return self.output.export_to_document_tokens(
|
||||
delim=delim,
|
||||
main_text_start=main_text_start,
|
||||
main_text_stop=main_text_stop,
|
||||
main_text_labels=main_text_labels,
|
||||
page_tagging=page_tagging,
|
||||
location_tagging=location_tagging,
|
||||
location_dimensions=location_dimensions,
|
||||
add_new_line=add_new_line,
|
||||
xsize=xsize,
|
||||
ysize=ysize,
|
||||
add_location=add_location,
|
||||
add_content=add_content,
|
||||
add_page_index=add_page_index,
|
||||
# table specific flags
|
||||
add_table_cell_location=add_table_cell_location,
|
||||
add_table_cell_label=add_table_cell_label,
|
||||
add_table_cell_text=add_table_cell_text,
|
||||
)
|
||||
|
||||
def render_element_images(
|
||||
|
||||
@@ -112,7 +112,7 @@ def generate_multimodal_pages(
|
||||
)
|
||||
# No page-tagging since we only do 1 page at the time
|
||||
content_dt = doc.export_to_document_tokens(
|
||||
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
|
||||
main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
|
||||
)
|
||||
|
||||
return content_text, content_md, content_dt, page_cells, page_segments, page
|
||||
|
||||
Reference in New Issue
Block a user