mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
Merge branch 'docling-project:main' into main
This commit is contained in:
commit
4b65566076
@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type
|
||||
|
||||
import rich.table
|
||||
import typer
|
||||
from docling_core.transforms.serializer.html import (
|
||||
HTMLDocSerializer,
|
||||
HTMLOutputStyle,
|
||||
HTMLParams,
|
||||
)
|
||||
from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
from pydantic import TypeAdapter
|
||||
@ -156,6 +162,7 @@ def export_documents(
|
||||
export_json: bool,
|
||||
export_html: bool,
|
||||
export_html_split_page: bool,
|
||||
show_layout: bool,
|
||||
export_md: bool,
|
||||
export_txt: bool,
|
||||
export_doctags: bool,
|
||||
@ -189,9 +196,27 @@ def export_documents(
|
||||
if export_html_split_page:
|
||||
fname = output_dir / f"{doc_filename}.html"
|
||||
_log.info(f"writing HTML output to {fname}")
|
||||
conv_res.document.save_as_html(
|
||||
filename=fname, image_mode=image_export_mode, split_page_view=True
|
||||
)
|
||||
if show_layout:
|
||||
ser = HTMLDocSerializer(
|
||||
doc=conv_res.document,
|
||||
params=HTMLParams(
|
||||
image_mode=image_export_mode,
|
||||
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
||||
),
|
||||
)
|
||||
visualizer = LayoutVisualizer()
|
||||
visualizer.params.show_label = False
|
||||
ser_res = ser.serialize(
|
||||
visualizer=visualizer,
|
||||
)
|
||||
with open(fname, "w") as fw:
|
||||
fw.write(ser_res.text)
|
||||
else:
|
||||
conv_res.document.save_as_html(
|
||||
filename=fname,
|
||||
image_mode=image_export_mode,
|
||||
split_page_view=True,
|
||||
)
|
||||
|
||||
# Export Text format:
|
||||
if export_txt:
|
||||
@ -250,6 +275,13 @@ def convert( # noqa: C901
|
||||
to_formats: List[OutputFormat] = typer.Option(
|
||||
None, "--to", help="Specify output formats. Defaults to Markdown."
|
||||
),
|
||||
show_layout: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
help="If enabled, the page images will show the bounding-boxes of the items.",
|
||||
),
|
||||
] = False,
|
||||
headers: str = typer.Option(
|
||||
None,
|
||||
"--headers",
|
||||
@ -596,6 +628,7 @@ def convert( # noqa: C901
|
||||
export_json=export_json,
|
||||
export_html=export_html,
|
||||
export_html_split_page=export_html_split_page,
|
||||
show_layout=show_layout,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
|
734
poetry.lock
generated
734
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
|
||||
######################
|
||||
python = "^3.9"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = {version = "^2.29.0", extras = ["chunking"]}
|
||||
docling-core = {version = "^2.31.2", extras = ["chunking"]}
|
||||
docling-ibm-models = "^3.4.0"
|
||||
docling-parse = "^4.0.0"
|
||||
filetype = "^1.2.0"
|
||||
|
8
tests/data/groundtruth/docling_v2/example_08.html.itxt
Normal file
8
tests/data/groundtruth/docling_v2/example_08.html.itxt
Normal file
@ -0,0 +1,8 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group header-1
|
||||
item-2 at level 2: section_header: Pivot table with with 1 row header
|
||||
item-3 at level 3: table with [6x4]
|
||||
item-4 at level 2: section_header: Pivot table with 2 row headers
|
||||
item-5 at level 3: table with [6x5]
|
||||
item-6 at level 2: section_header: Equivalent pivot table
|
||||
item-7 at level 3: table with [6x5]
|
2008
tests/data/groundtruth/docling_v2/example_08.html.json
Normal file
2008
tests/data/groundtruth/docling_v2/example_08.html.json
Normal file
File diff suppressed because it is too large
Load Diff
29
tests/data/groundtruth/docling_v2/example_08.html.md
Normal file
29
tests/data/groundtruth/docling_v2/example_08.html.md
Normal file
@ -0,0 +1,29 @@
|
||||
## Pivot table with with 1 row header
|
||||
|
||||
| Year | Month | Revenue | Cost |
|
||||
|--------|----------|-----------|--------|
|
||||
| 2025 | January | $134 | $162 |
|
||||
| 2025 | February | $150 | $155 |
|
||||
| 2025 | March | $160 | $143 |
|
||||
| 2025 | April | $210 | $150 |
|
||||
| 2025 | May | $280 | $120 |
|
||||
|
||||
## Pivot table with 2 row headers
|
||||
|
||||
| Year | Quarter | Month | Revenue | Cost |
|
||||
|--------|-----------|----------|-----------|--------|
|
||||
| 2025 | Q1 | January | $134 | $162 |
|
||||
| 2025 | Q1 | February | $150 | $155 |
|
||||
| 2025 | Q1 | March | $160 | $143 |
|
||||
| 2025 | Q2 | April | $210 | $150 |
|
||||
| 2025 | Q2 | May | $280 | $120 |
|
||||
|
||||
## Equivalent pivot table
|
||||
|
||||
| Year | Quarter | Month | Revenue | Cost |
|
||||
|--------|-----------|----------|-----------|--------|
|
||||
| 2025 | Q1 | January | $134 | $162 |
|
||||
| 2025 | Q1 | February | $150 | $155 |
|
||||
| 2025 | Q1 | March | $160 | $143 |
|
||||
| 2025 | Q2 | April | $210 | $150 |
|
||||
| 2025 | Q2 | May | $280 | $120 |
|
Loading…
Reference in New Issue
Block a user