mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Properly propagate the image data for each page, together with the predicted tags, through the VLM pipeline. This enables correct figure extraction and correct page numbers in provenances.
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
01c46e24b1
commit
61bb9dbba2
@ -73,7 +73,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
disable_progress_bars()
|
disable_progress_bars()
|
||||||
|
|
||||||
# TODO download the correct model (private repo)
|
# TODO: download the correct model (private repo)
|
||||||
download_path = snapshot_download(
|
download_path = snapshot_download(
|
||||||
repo_id="ds4sd/xxx",
|
repo_id="ds4sd/xxx",
|
||||||
force_download=force,
|
force_download=force,
|
||||||
@ -95,30 +95,17 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
||||||
|
|
||||||
# Read and concatenate the page doctags:
|
# Read and concatenate the page doctags:
|
||||||
document_tags = ""
|
# document_tags = ""
|
||||||
|
page_tags = []
|
||||||
|
page_images = []
|
||||||
|
|
||||||
for page in conv_res.pages:
|
for page in conv_res.pages:
|
||||||
if page.predictions.doctags is not None:
|
if page.predictions.doctags is not None:
|
||||||
document_tags += page.predictions.doctags.tag_string
|
page_tags.append(page.predictions.doctags.tag_string)
|
||||||
|
page_images.append(page.image)
|
||||||
|
|
||||||
conv_res.document = self._turn_tags_into_doc(document_tags, page.image)
|
conv_res.document = self._turn_tags_into_doc(page_tags, page_images)
|
||||||
"""
|
|
||||||
image_bytes = BytesIO()
|
|
||||||
if page.image:
|
|
||||||
page.image.save(image_bytes, format="PNG")
|
|
||||||
# TODO implement this function
|
|
||||||
conv_res.document = self._turn_tags_into_doc(
|
|
||||||
document_tags, image_bytes.getvalue()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate page images in the output
|
|
||||||
if self.pipeline_options.generate_page_images:
|
|
||||||
for page in conv_res.pages:
|
|
||||||
assert page.image is not None
|
|
||||||
page_no = page.page_no + 1
|
|
||||||
conv_res.document.pages[page_no].image = ImageRef.from_pil(
|
|
||||||
page.image, dpi=int(72 * self.pipeline_options.images_scale)
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
# Generate images of the requested element types
|
# Generate images of the requested element types
|
||||||
if (
|
if (
|
||||||
self.pipeline_options.generate_picture_images
|
self.pipeline_options.generate_picture_images
|
||||||
@ -153,9 +140,8 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
# def _turn_tags_into_doc(self, xml_content: str, image_bytes: bytes) -> (DoclingDocument, list):
|
|
||||||
def _turn_tags_into_doc(
|
def _turn_tags_into_doc(
|
||||||
self, xml_content: str, pil_image: Optional[Image] = None
|
self, full_doc_xml_content: list[str], pil_images: list[Image | None]
|
||||||
) -> DoclingDocument:
|
) -> DoclingDocument:
|
||||||
def extract_text(tag_content: str) -> str:
|
def extract_text(tag_content: str) -> str:
|
||||||
return re.sub(r"<.*?>", "", tag_content).strip()
|
return re.sub(r"<.*?>", "", tag_content).strip()
|
||||||
@ -251,7 +237,7 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
cell_text = texts[i + 1]
|
cell_text = texts[i + 1]
|
||||||
right_offset = 2
|
right_offset = 2
|
||||||
|
|
||||||
# TODO: Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
|
# Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
|
||||||
next_right_cell = texts[i + right_offset]
|
next_right_cell = texts[i + right_offset]
|
||||||
|
|
||||||
next_bottom_cell = ""
|
next_bottom_cell = ""
|
||||||
@ -333,212 +319,236 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
doc = DoclingDocument(name="Example Document")
|
doc = DoclingDocument(name="Example Document")
|
||||||
current_group = None
|
current_group = None
|
||||||
lines = xml_content.split("\n")
|
|
||||||
# pil_image = input_image #Image.open(BytesIO(image_bytes))
|
|
||||||
bounding_boxes = []
|
|
||||||
|
|
||||||
for line in lines:
|
for pg_idx, xml_content in enumerate(full_doc_xml_content):
|
||||||
line = line.strip()
|
pil_image = pil_images[pg_idx]
|
||||||
line = line.replace("<doc_tag>", "")
|
page_no = pg_idx + 1
|
||||||
if line.startswith("<paragraph>"):
|
lines = xml_content.split("\n")
|
||||||
content = extract_text(line)
|
# pil_image = input_image #Image.open(BytesIO(image_bytes))
|
||||||
prov_item = extract_bounding_box(line)
|
bounding_boxes = []
|
||||||
if prov_item:
|
|
||||||
bounding_boxes.append((prov_item, "red"))
|
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.PARAGRAPH,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=(
|
|
||||||
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
|
||||||
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
|
|
||||||
if prov_item
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
elif line.startswith("<title>"):
|
|
||||||
content = extract_text(line)
|
|
||||||
prov_item = extract_bounding_box(line)
|
|
||||||
if prov_item:
|
|
||||||
bounding_boxes.append((prov_item, "blue"))
|
|
||||||
current_group = doc.add_group(label=GroupLabel.SECTION, name=content)
|
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.TITLE,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=(
|
|
||||||
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
|
||||||
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
|
|
||||||
if prov_item
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
elif line.startswith("<section-header>"):
|
for line in lines:
|
||||||
content = extract_text(line)
|
line = line.strip()
|
||||||
prov_item = extract_bounding_box(line)
|
line = line.replace("<doc_tag>", "")
|
||||||
if prov_item:
|
if line.startswith("<paragraph>"):
|
||||||
bounding_boxes.append((prov_item, "green"))
|
content = extract_text(line)
|
||||||
current_group = doc.add_group(label=GroupLabel.SECTION, name=content)
|
prov_item = extract_bounding_box(line)
|
||||||
doc.add_text(
|
if prov_item:
|
||||||
label=DocItemLabel.SECTION_HEADER,
|
bounding_boxes.append((prov_item, "red"))
|
||||||
text=content,
|
doc.add_text(
|
||||||
parent=current_group,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
prov=(
|
text=content,
|
||||||
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
parent=current_group,
|
||||||
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
|
prov=(
|
||||||
if prov_item
|
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
||||||
else None
|
ProvenanceItem(
|
||||||
),
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
)
|
)
|
||||||
|
if prov_item
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
elif line.startswith("<title>"):
|
||||||
|
content = extract_text(line)
|
||||||
|
prov_item = extract_bounding_box(line)
|
||||||
|
if prov_item:
|
||||||
|
bounding_boxes.append((prov_item, "blue"))
|
||||||
|
current_group = doc.add_group(
|
||||||
|
label=GroupLabel.SECTION, name=content
|
||||||
|
)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.TITLE,
|
||||||
|
text=content,
|
||||||
|
parent=current_group,
|
||||||
|
prov=(
|
||||||
|
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
||||||
|
ProvenanceItem(
|
||||||
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
|
)
|
||||||
|
if prov_item
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
elif line.startswith("<otsl>"):
|
elif line.startswith("<section-header>"):
|
||||||
prov_item = extract_bounding_box(line)
|
content = extract_text(line)
|
||||||
if prov_item:
|
prov_item = extract_bounding_box(line)
|
||||||
bounding_boxes.append((prov_item, "aquamarine"))
|
if prov_item:
|
||||||
|
bounding_boxes.append((prov_item, "green"))
|
||||||
|
current_group = doc.add_group(
|
||||||
|
label=GroupLabel.SECTION, name=content
|
||||||
|
)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.SECTION_HEADER,
|
||||||
|
text=content,
|
||||||
|
parent=current_group,
|
||||||
|
prov=(
|
||||||
|
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
||||||
|
ProvenanceItem(
|
||||||
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
|
)
|
||||||
|
if prov_item
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
table_data = parse_table_content(line)
|
elif line.startswith("<otsl>"):
|
||||||
doc.add_table(data=table_data, parent=current_group)
|
prov_item = extract_bounding_box(line)
|
||||||
|
if prov_item:
|
||||||
|
bounding_boxes.append((prov_item, "aquamarine"))
|
||||||
|
|
||||||
elif line.startswith("<footnote>"):
|
table_data = parse_table_content(line)
|
||||||
content = extract_text(line)
|
doc.add_table(data=table_data, parent=current_group)
|
||||||
prov_item = extract_bounding_box(line)
|
|
||||||
if prov_item:
|
|
||||||
bounding_boxes.append((prov_item, "orange"))
|
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.FOOTNOTE,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=(
|
|
||||||
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
|
||||||
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
|
|
||||||
if prov_item
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
elif line.startswith("<page-header>"):
|
elif line.startswith("<footnote>"):
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if prov_item:
|
if prov_item:
|
||||||
bounding_boxes.append((prov_item, "purple"))
|
bounding_boxes.append((prov_item, "orange"))
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PAGE_HEADER,
|
label=DocItemLabel.FOOTNOTE,
|
||||||
text=content,
|
text=content,
|
||||||
parent=current_group,
|
parent=current_group,
|
||||||
prov=(
|
prov=(
|
||||||
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
||||||
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
|
ProvenanceItem(
|
||||||
if prov_item
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
else None
|
)
|
||||||
),
|
if prov_item
|
||||||
)
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
elif line.startswith("<page-footer>"):
|
elif line.startswith("<page-header>"):
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if prov_item:
|
if prov_item:
|
||||||
bounding_boxes.append((prov_item, "cyan"))
|
bounding_boxes.append((prov_item, "purple"))
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PAGE_FOOTER,
|
label=DocItemLabel.PAGE_HEADER,
|
||||||
text=content,
|
text=content,
|
||||||
parent=current_group,
|
parent=current_group,
|
||||||
prov=(
|
prov=(
|
||||||
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
||||||
ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)
|
ProvenanceItem(
|
||||||
if prov_item
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
else None
|
)
|
||||||
),
|
if prov_item
|
||||||
)
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
elif line.startswith("<figure>"):
|
elif line.startswith("<page-footer>"):
|
||||||
bbox = extract_bounding_box(line)
|
content = extract_text(line)
|
||||||
if bbox:
|
prov_item = extract_bounding_box(line)
|
||||||
bounding_boxes.append((bbox, "yellow"))
|
if prov_item:
|
||||||
if pil_image:
|
bounding_boxes.append((prov_item, "cyan"))
|
||||||
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping
|
doc.add_text(
|
||||||
width, height = pil_image.size
|
label=DocItemLabel.PAGE_FOOTER,
|
||||||
crop_box = (
|
text=content,
|
||||||
int(bbox.l * width),
|
parent=current_group,
|
||||||
int(bbox.t * height),
|
prov=(
|
||||||
int(bbox.r * width),
|
# [ProvenanceItem(bbox=prov_item, charspan=(0, 0), page_no=1)]
|
||||||
int(bbox.b * height),
|
ProvenanceItem(
|
||||||
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
|
)
|
||||||
|
if prov_item
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
elif line.startswith("<figure>"):
|
||||||
|
bbox = extract_bounding_box(line)
|
||||||
|
if bbox:
|
||||||
|
bounding_boxes.append((bbox, "yellow"))
|
||||||
|
if pil_image:
|
||||||
|
# Convert bounding box normalized to 0-100 into pixel coordinates for cropping
|
||||||
|
width, height = pil_image.size
|
||||||
|
crop_box = (
|
||||||
|
int(bbox.l * width),
|
||||||
|
int(bbox.t * height),
|
||||||
|
int(bbox.r * width),
|
||||||
|
int(bbox.b * height),
|
||||||
|
)
|
||||||
|
|
||||||
|
cropped_image = pil_image.crop(crop_box)
|
||||||
|
doc.add_picture(
|
||||||
|
parent=current_group,
|
||||||
|
image=ImageRef.from_pil(image=cropped_image, dpi=300),
|
||||||
|
prov=ProvenanceItem(
|
||||||
|
bbox=bbox, charspan=(0, 0), page_no=page_no
|
||||||
|
),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
doc.add_picture(
|
||||||
|
parent=current_group,
|
||||||
|
prov=ProvenanceItem(
|
||||||
|
bbox=bbox, charspan=(0, 0), page_no=page_no
|
||||||
|
),
|
||||||
|
)
|
||||||
|
elif line.startswith("<list>"):
|
||||||
|
content = extract_text(line)
|
||||||
|
prov_item_inst = None
|
||||||
|
prov_item = extract_bounding_box(line)
|
||||||
|
if prov_item:
|
||||||
|
bounding_boxes.append((prov_item, "brown"))
|
||||||
|
prov_item_inst = ProvenanceItem(
|
||||||
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
)
|
)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.LIST_ITEM,
|
||||||
|
text=content,
|
||||||
|
parent=current_group,
|
||||||
|
prov=prov_item_inst if prov_item_inst else None,
|
||||||
|
)
|
||||||
|
|
||||||
cropped_image = pil_image.crop(crop_box)
|
elif line.startswith("<caption>"):
|
||||||
doc.add_picture(
|
content = extract_text(line)
|
||||||
parent=current_group,
|
prov_item_inst = None
|
||||||
image=ImageRef.from_pil(image=cropped_image, dpi=300),
|
prov_item = extract_bounding_box(line)
|
||||||
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1),
|
if prov_item:
|
||||||
|
bounding_boxes.append((prov_item, "magenta"))
|
||||||
|
prov_item_inst = ProvenanceItem(
|
||||||
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
)
|
)
|
||||||
else:
|
doc.add_text(
|
||||||
doc.add_picture(
|
label=DocItemLabel.PARAGRAPH,
|
||||||
parent=current_group,
|
text=content,
|
||||||
prov=ProvenanceItem(bbox=bbox, charspan=(0, 0), page_no=1),
|
parent=current_group,
|
||||||
|
prov=prov_item_inst if prov_item_inst else None,
|
||||||
|
)
|
||||||
|
elif line.startswith("<checkbox-unselected>"):
|
||||||
|
content = extract_text(line)
|
||||||
|
prov_item_inst = None
|
||||||
|
prov_item = extract_bounding_box(line)
|
||||||
|
if prov_item:
|
||||||
|
bounding_boxes.append((prov_item, "gray"))
|
||||||
|
prov_item_inst = ProvenanceItem(
|
||||||
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
)
|
)
|
||||||
elif line.startswith("<list>"):
|
doc.add_text(
|
||||||
content = extract_text(line)
|
label=DocItemLabel.CHECKBOX_UNSELECTED,
|
||||||
prov_item_inst = None
|
text=content,
|
||||||
prov_item = extract_bounding_box(line)
|
parent=current_group,
|
||||||
if prov_item:
|
prov=prov_item_inst if prov_item_inst else None,
|
||||||
bounding_boxes.append((prov_item, "brown"))
|
|
||||||
prov_item_inst = ProvenanceItem(
|
|
||||||
bbox=prov_item, charspan=(0, 0), page_no=1
|
|
||||||
)
|
)
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.LIST_ITEM,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=prov_item_inst if prov_item_inst else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif line.startswith("<caption>"):
|
elif line.startswith("<checkbox-selected>"):
|
||||||
content = extract_text(line)
|
content = extract_text(line)
|
||||||
prov_item_inst = None
|
prov_item_inst = None
|
||||||
prov_item = extract_bounding_box(line)
|
prov_item = extract_bounding_box(line)
|
||||||
if prov_item:
|
if prov_item:
|
||||||
bounding_boxes.append((prov_item, "magenta"))
|
bounding_boxes.append((prov_item, "black"))
|
||||||
prov_item_inst = ProvenanceItem(
|
prov_item_inst = ProvenanceItem(
|
||||||
bbox=prov_item, charspan=(0, 0), page_no=1
|
bbox=prov_item, charspan=(0, 0), page_no=page_no
|
||||||
|
)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.CHECKBOX_SELECTED,
|
||||||
|
text=content,
|
||||||
|
parent=current_group,
|
||||||
|
prov=prov_item_inst if prov_item_inst else None,
|
||||||
)
|
)
|
||||||
doc.add_text(
|
# return doc, bounding_boxes
|
||||||
label=DocItemLabel.PARAGRAPH,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=prov_item_inst if prov_item_inst else None,
|
|
||||||
)
|
|
||||||
elif line.startswith("<checkbox-unselected>"):
|
|
||||||
content = extract_text(line)
|
|
||||||
prov_item_inst = None
|
|
||||||
prov_item = extract_bounding_box(line)
|
|
||||||
if prov_item:
|
|
||||||
bounding_boxes.append((prov_item, "gray"))
|
|
||||||
prov_item_inst = ProvenanceItem(
|
|
||||||
bbox=prov_item, charspan=(0, 0), page_no=1
|
|
||||||
)
|
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.CHECKBOX_UNSELECTED,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=prov_item_inst if prov_item_inst else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
elif line.startswith("<checkbox-selected>"):
|
|
||||||
content = extract_text(line)
|
|
||||||
prov_item_inst = None
|
|
||||||
prov_item = extract_bounding_box(line)
|
|
||||||
if prov_item:
|
|
||||||
bounding_boxes.append((prov_item, "black"))
|
|
||||||
prov_item_inst = ProvenanceItem(
|
|
||||||
bbox=prov_item, charspan=(0, 0), page_no=1
|
|
||||||
)
|
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.CHECKBOX_SELECTED,
|
|
||||||
text=content,
|
|
||||||
parent=current_group,
|
|
||||||
prov=prov_item_inst if prov_item_inst else None,
|
|
||||||
)
|
|
||||||
# return doc, bounding_boxes
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -6,10 +6,11 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
# source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
source = "https://arxiv.org/pdf/2408.09869" # document per local path or URL
|
||||||
# source = "tests/data/2305.03393v1-pg9-img.png"
|
# source = "tests/data/2305.03393v1-pg9-img.png"
|
||||||
source = "tests/data/2305.03393v1-pg9.pdf"
|
# source = "tests/data/2305.03393v1-pg9.pdf"
|
||||||
# source = "page.png"
|
# source = "demo_data/page.png"
|
||||||
|
# source = "demo_data/original_tables.pdf"
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.generate_page_images = True
|
pipeline_options.generate_page_images = True
|
||||||
|
Loading…
Reference in New Issue
Block a user