From d74e4075268db6632e98a090496eef50f8b21e74 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Mon, 14 Apr 2025 14:05:12 +0200
Subject: [PATCH] enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi
---
 .pre-commit-config.yaml                        |  61 ++++---
 docling/backend/asciidoc_backend.py            |   6 -
 docling/backend/docling_parse_backend.py       |   1 -
 docling/backend/docling_parse_v2_backend.py    |   1 -
 docling/backend/docling_parse_v4_backend.py    |   1 -
 docling/backend/docx/latex/omml.py             |   2 -
 docling/backend/html_backend.py                |   3 -
 docling/backend/md_backend.py                  |   2 -
 docling/backend/msexcel_backend.py             |   6 -
 docling/backend/pypdfium2_backend.py           |   1 -
 docling/backend/xml/jats_backend.py            |   1 -
 docling/backend/xml/uspto_backend.py           |   1 -
 docling/cli/main.py                            |   1 -
 docling/datamodel/base_models.py               |   6 +-
 docling/datamodel/document.py                  |   7 +-
 docling/datamodel/pipeline_options.py          |   1 -
 docling/models/api_vlm_model.py                |   1 -
 docling/models/base_model.py                   |   3 -
 docling/models/easyocr_model.py                |   2 -
 docling/models/hf_mlx_model.py                 |   2 -
 docling/models/hf_vlm_model.py                 |   1 -
 docling/models/layout_model.py                 |   1 -
 docling/models/ocr_mac_model.py                |   2 -
 docling/models/page_assemble_model.py          |  10 +-
 .../models/picture_description_vlm_model.py    |   2 -
 docling/models/rapid_ocr_model.py              |   2 -
 docling/models/readingorder_model.py           |  18 +--
 docling/models/table_structure_model.py        |   4 -
 docling/models/tesseract_ocr_cli_model.py      |   2 -
 docling/pipeline/base_pipeline.py              |   5 -
 docling/pipeline/simple_pipeline.py            |   1 -
 docling/pipeline/vlm_pipeline.py               |   2 -
 docling/utils/export.py                        |   2 -
 docling/utils/layout_postprocessor.py          |   4 +-
 .../examples/develop_formula_understanding.py  |   1 -
 docs/examples/export_multimodal.py             |   1 -
 docs/examples/export_tables.py                 |   4 +-
 docs/examples/rag_azuresearch.ipynb            |   2 +-
 docs/examples/rag_langchain.ipynb              |   2 +-
 docs/examples/run_md.py                        |   4 +-
 docs/examples/translate.py                     |   1 -
 docs/examples/visual_grounding.ipynb           |   2 +-
 pyproject.toml                                 |  75 +++++++--
 tests/test_backend_asciidoc.py                 |   1 -
 tests/test_backend_csv.py                      |   9 +-
 tests/test_backend_docling_parse.py            |   6 +-
 tests/test_backend_docling_parse_v2.py         |   6 +-
 tests/test_backend_docling_parse_v4.py         |   6 +-
 tests/test_backend_html.py                     |  15 +-
 tests/test_backend_jats.py                     |   6 +-
 tests/test_backend_msexcel.py                  |  14 +-
 tests/test_backend_msword.py                   |  21 ++-
 tests/test_backend_patent_uspto.py             |  18 +--
 tests/test_backend_pdfium.py                   |   6 +-
 tests/test_backend_pptx.py                     |  15 +-
 tests/test_code_formula.py                     |   1 -
 tests/test_document_picture_classifier.py      |  37 +++--
 tests/test_e2e_conversion.py                   |   3 -
 tests/test_input_doc.py                        |   5 -
 tests/test_interfaces.py                       |   4 -
 tests/test_invalid_input.py                    |   1 -
 tests/test_legacy_format_transform.py          |   1 -
 tests/verify_utils.py                          | 152 +++++++++---------
 63 files changed, 268 insertions(+), 316 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 19bb27c3..b0db1a8d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,19 +1,32 @@
 fail_fast: true
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      # Run the Ruff formatter.
+      - id: ruff-format
+        name: "Ruff formatter"
+        args: [--config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      # Run the Ruff linter.
+      # - id: ruff
+      #   name: "Ruff linter"
+      #   args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+      #   files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
    hooks:
-      - id: black
-        name: Black
-        entry: poetry run black docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      - id: isort
-        name: isort
-        entry: poetry run isort docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
+      # - id: black
+      #   name: Black
+      #   entry: poetry run black docling docs/examples tests
+      #   pass_filenames: false
+      #   language: system
+      #   files: '\.py$'
+      # - id: isort
+      #   name: isort
+      #   entry: poetry run isort docling docs/examples tests
+      #   pass_filenames: false
+      #   language: system
+      #   files: '\.py$'
       # - id: flake8
       #   name: flake8
       #   entry: poetry run flake8 docling
@@ -26,18 +39,18 @@ repos:
         pass_filenames: false
         language: system
         files: '\.py$'
-      - id: nbqa_black
-        name: nbQA Black
-        entry: poetry run nbqa black docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
-      - id: nbqa_isort
-        name: nbQA isort
-        entry: poetry run nbqa isort docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
+      # - id: nbqa_black
+      #   name: nbQA Black
+      #   entry: poetry run nbqa black docs/examples
+      #   pass_filenames: false
+      #   language: system
+      #   files: '\.ipynb$'
+      # - id: nbqa_isort
+      #   name: nbQA isort
+      #   entry: poetry run nbqa isort docs/examples
+      #   pass_filenames: false
+      #   language: system
+      #   files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
index 09891eb8..82188e6b 100644
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@@ -125,7 +125,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
 
             # Lists
             elif self._is_list_item(line):
-
                 _log.debug(f"line: {line}")
                 item = self._parse_list_item(line)
 
@@ -147,7 +146,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     indents[level + 1] = item["indent"]
 
                 elif in_list and item["indent"] < indents[level]:
-
                     # print(item["indent"], " => ", indents[level])
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])
@@ -176,7 +174,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -195,7 +192,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
 
             # Picture
             elif self._is_picture(line):
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -250,7 +246,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     text_data = []
 
             elif len(line.strip()) > 0:  # allow multiline texts
-
                 item = self._parse_text(line)
                 text_data.append(item["text"])
 
@@ -357,7 +352,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return [cell.strip() for cell in line.split("|") if cell.strip()]
 
     def _populate_table_as_grid(self, table_data):
-
         num_rows = len(table_data)
 
         # Adjust the table data into a grid format
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 533ed429..081efa6f 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -156,7 +156,6 @@ class DoclingParsePageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) ->
Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index f7475aaf..1e6ea9c1 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -172,7 +172,6 @@ class DoclingParseV2PageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index e1e74301..232081fd 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index b2d5f900..1289ffbd 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -77,7 +77,6 @@ def get_val(key, default=None, store=CHR): class Tag2Method(object): - def call_method(self, elm, stag=None): getmethod = self.tag2meth.get if stag is None: @@ -130,7 +129,6 @@ class Tag2Method(object): class Pr(Tag2Method): - text = "" __val_tags = ("chr", "pos", "begChr", "endChr", "type") diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 5889429a..3534d827 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc def walk(self, tag: Tag, doc: DoclingDocument) -> None: - # Iterate over elements in the body of the document text: str = "" for element in tag.children: @@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) else: if hlevel > self.level: - # add invisible group for i in range(self.level + 1, hlevel): self.parents[i] = doc.add_group( @@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level = hlevel elif hlevel < self.level: - # remove the tail for key in self.parents.keys(): if key > hlevel: diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index f83dd2d9..2abe5bae 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): visited: Set[marko.element.Element], parent_item: Optional[NodeItem] = None, ): - if element in visited: return @@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # if HTML blocks were detected, export to HTML and delegate to HTML backend if self._html_blocks > 0: - # export to HTML html_backend_cls = HTMLDocumentBackend html_str = doc.export_to_html() diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 971b93cd..f962e326 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken """ if self.workbook is not None: - # Iterate over all sheets for sheet_name in self.workbook.sheetnames: _log.info(f"Processing sheet: {sheet_name}") @@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken ) for excel_cell in excel_table.data: - cell = TableCell( 
text=excel_cell.text, row_span=excel_cell.row_span, @@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken # Iterate over all cells in the sheet for ri, row in enumerate(sheet.iter_rows(values_only=False)): for rj, cell in enumerate(row): - # Skip empty or already visited cells if cell.value is None or (ri, rj) in visited: continue @@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken visited_cells: set[tuple[int, int]] = set() for ri in range(start_row, max_row + 1): for rj in range(start_col, max_col + 1): - cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing # Check if the cell belongs to a merged range @@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken col_span = 1 for merged_range in sheet.merged_cells.ranges: - if ( merged_range.min_row <= ri + 1 and ri + 1 <= merged_range.max_row and merged_range.min_col <= rj + 1 and rj + 1 <= merged_range.max_col ): - row_span = merged_range.max_row - merged_range.min_row + 1 col_span = merged_range.max_col - merged_range.min_col + 1 break diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 0fce0f89..d8e9a2ce 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -225,7 +225,6 @@ class PyPdfiumPageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py index 2409961f..06cbb2f4 100755 --- a/docling/backend/xml/jats_backend.py +++ b/docling/backend/xml/jats_backend.py @@ -300,7 +300,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): def _add_abstract( self, doc: DoclingDocument, xml_components: XMLComponents ) -> None: - for abstract in xml_components["abstract"]: text: str = abstract["content"] title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py index f3fb1ca3..a3e04081 100644 --- a/docling/backend/xml/uspto_backend.py +++ b/docling/backend/xml/uspto_backend.py @@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend): @override def convert(self) -> DoclingDocument: - if self.parser is not None: doc = self.parser.parse(self.patent_content) if doc is None: diff --git a/docling/cli/main.py b/docling/cli/main.py index 6830c7f1..3cb521ad 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -160,7 +160,6 @@ def export_documents( export_doctags: bool, image_export_mode: ImageRefMode, ): - success_count = 0 failure_count = 0 diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 8ee53d6c..b1daa482 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -233,9 +233,9 @@ class Page(BaseModel): None # Internal PDF backend. By default it is cleared during assembling. ) _default_image_scale: float = 1.0 # Default image scale for external usage. - _image_cache: Dict[float, Image] = ( - {} - ) # Cache of images in different scales. By default it is cleared during assembling. + _image_cache: Dict[ + float, Image + ] = {} # Cache of images in different scales. By default it is cleared during assembling. 
def get_image( self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 93dfd1a5..b925404c 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -134,9 +134,9 @@ class InputDocument(BaseModel): self._init_doc(backend, path_or_stream) elif isinstance(path_or_stream, BytesIO): - assert ( - filename is not None - ), "Can't construct InputDocument from stream without providing filename arg." + assert filename is not None, ( + "Can't construct InputDocument from stream without providing filename arg." + ) self.file = PurePath(filename) self.filesize = path_or_stream.getbuffer().nbytes @@ -228,7 +228,6 @@ class _DummyBackend(AbstractDocumentBackend): class _DocumentConversionInput(BaseModel): - path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] headers: Optional[Dict[str, str]] = None limits: Optional[DocumentLimits] = DocumentLimits() diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 8e99cd09..a24df89d 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions): class VlmPipelineOptions(PaginatedPipelineOptions): - generate_page_images: bool = True force_backend_text: bool = ( False # (To be used with vlms, or other generative models) diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py index 95201224..4fbefcc4 100644 --- a/docling/models/api_vlm_model.py +++ b/docling/models/api_vlm_model.py @@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder class ApiVlmModel(BasePageModel): - def __init__( self, enabled: bool, diff --git a/docling/models/base_model.py b/docling/models/base_model.py index 712d329d..95760cdf 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -29,7 +29,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem) class GenericEnrichmentModel(ABC, Generic[EnrichElementT]): - elements_batch_size: int = settings.perf.elements_batch_size @abstractmethod @@ -50,7 +49,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]): class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]): - def prepare_element( self, conv_res: ConversionResult, element: NodeItem ) -> Optional[NodeItem]: @@ -62,7 +60,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]): class BaseItemAndImageEnrichmentModel( GenericEnrichmentModel[ItemAndImageEnrichmentElement] ): - images_scale: float expansion_factor: float = 0.0 diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 13eb33c9..5e22e1d3 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -126,13 +126,11 @@ class EasyOcrModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return for page in page_batch: - assert page._backend is not None if not page._backend.is_valid(): yield page diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py index 762a6557..571f85a0 100644 --- a/docling/models/hf_mlx_model.py +++ b/docling/models/hf_mlx_model.py @@ -19,7 +19,6 @@ _log = logging.getLogger(__name__) class HuggingFaceMlxModel(BasePageModel): - def __init__( self, enabled: bool, @@ -32,7 +31,6 @@ class HuggingFaceMlxModel(BasePageModel): self.vlm_options = vlm_options if self.enabled: - try: from mlx_vlm 
import generate, load # type: ignore from mlx_vlm.prompt_utils import apply_chat_template # type: ignore diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py index 2acbe290..7b4771d8 100644 --- a/docling/models/hf_vlm_model.py +++ b/docling/models/hf_vlm_model.py @@ -19,7 +19,6 @@ _log = logging.getLogger(__name__) class HuggingFaceVlmModel(BasePageModel): - def __init__( self, enabled: bool, diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index b3cbd954..a61f7726 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -142,7 +142,6 @@ class LayoutModel(BasePageModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py index 98ca3f19..c9c778f0 100644 --- a/docling/models/ocr_mac_model.py +++ b/docling/models/ocr_mac_model.py @@ -58,7 +58,6 @@ class OcrMacModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return @@ -69,7 +68,6 @@ class OcrMacModel(BaseOcrModel): yield page else: with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4712abdc..4c43560e 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -71,7 +71,6 @@ class PageAssembleModel(BasePageModel): yield page else: with TimeRecorder(conv_res, "page_assemble"): - assert page.predictions.layout is not None # assembles some JSON output page by page. 
@@ -83,7 +82,6 @@ class PageAssembleModel(BasePageModel): for cluster in page.predictions.layout.clusters: # _log.info("Cluster label seen:", cluster.label) if cluster.label in LayoutModel.TEXT_ELEM_LABELS: - textlines = [ cell.text.replace("\x02", "-").strip() for cell in cluster.cells @@ -109,9 +107,7 @@ class PageAssembleModel(BasePageModel): tbl = page.predictions.tablestructure.table_map.get( cluster.id, None ) - if ( - not tbl - ): # fallback: add table without structure, if it isn't present + if not tbl: # fallback: add table without structure, if it isn't present tbl = Table( label=cluster.label, id=cluster.id, @@ -130,9 +126,7 @@ class PageAssembleModel(BasePageModel): fig = page.predictions.figures_classification.figure_map.get( cluster.id, None ) - if ( - not fig - ): # fallback: add figure without classification, if it isn't present + if not fig: # fallback: add figure without classification, if it isn't present fig = FigureElement( label=cluster.label, id=cluster.id, diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py index fc5c51ec..907c1a43 100644 --- a/docling/models/picture_description_vlm_model.py +++ b/docling/models/picture_description_vlm_model.py @@ -13,7 +13,6 @@ from docling.utils.accelerator_utils import decide_device class PictureDescriptionVlmModel(PictureDescriptionBaseModel): - @classmethod def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]: return PictureDescriptionVlmOptions @@ -36,7 +35,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel): self.options: PictureDescriptionVlmOptions if self.enabled: - if artifacts_path is None: artifacts_path = self.download_models(repo_id=self.options.repo_id) else: diff --git a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py index e21974d7..77190cfe 100644 --- a/docling/models/rapid_ocr_model.py +++ b/docling/models/rapid_ocr_model.py @@ -74,13 +74,11 @@ class RapidOcrModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return for page in page_batch: - assert page._backend is not None if not page._backend.is_valid(): yield page diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py index e7bdd1a1..a40bc5a9 100644 --- a/docling/models/readingorder_model.py +++ b/docling/models/readingorder_model.py @@ -53,12 +53,10 @@ class ReadingOrderModel: def _assembled_to_readingorder_elements( self, conv_res: ConversionResult ) -> List[ReadingOrderPageElement]: - elements: List[ReadingOrderPageElement] = [] page_no_to_pages = {p.page_no: p for p in conv_res.pages} for element in conv_res.assembled.elements: - page_height = page_no_to_pages[element.page_no].size.height # type: ignore bbox = element.cluster.bbox.to_bottom_left_origin(page_height) text = element.text or "" @@ -84,7 +82,6 @@ class ReadingOrderModel: def _add_child_elements( self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument ): - child: Cluster for child in element.cluster.children: c_label = child.label @@ -118,7 +115,6 @@ class ReadingOrderModel: el_to_footnotes_mapping: Dict[int, List[int]], el_merges_mapping: Dict[int, List[int]], ) -> DoclingDocument: - id_to_elem = { RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem for elem in conv_res.assembled.elements @@ -192,7 +188,6 @@ class ReadingOrderModel: code_item.footnotes.append(new_footnote_item.get_ref()) else: - new_item, current_list = 
self._handle_text_element( element, out_doc, current_list, page_height ) @@ -206,7 +201,6 @@ class ReadingOrderModel: ) elif isinstance(element, Table): - tbl_data = TableData( num_rows=element.num_rows, num_cols=element.num_cols, @@ -342,12 +336,12 @@ class ReadingOrderModel: return new_item, current_list def _merge_elements(self, element, merged_elem, new_item, page_height): - assert isinstance( - merged_elem, type(element) - ), "Merged element must be of same type as element." - assert ( - merged_elem.label == new_item.label - ), "Labels of merged elements must match." + assert isinstance(merged_elem, type(element)), ( + "Merged element must be of same type as element." + ) + assert merged_elem.label == new_item.label, ( + "Labels of merged elements must match." + ) prov = ProvenanceItem( page_no=element.page_no + 1, charspan=( diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 34a7d9da..3e81e288 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel): self.enabled = enabled if self.enabled: - if artifacts_path is None: artifacts_path = self.download_models() / self._model_path else: @@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return @@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel): yield page else: with TimeRecorder(conv_res, "table_structure"): - assert page.predictions.layout is not None assert page.size is not None @@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel): table_out = tf_output[0] table_cells = [] for element in table_out["tf_responses"]: - if not self.do_cell_matching: the_bbox = BoundingBox.model_validate( element["bbox"] diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 1e7fe039..365fdae7 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -63,7 +63,6 @@ class TesseractOcrCliModel(BaseOcrModel): ) def _get_name_and_version(self) -> Tuple[str, str]: - if self._name != None and self._version != None: return self._name, self._version # type: ignore @@ -197,7 +196,6 @@ class TesseractOcrCliModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 1bf48ef0..b278a0c7 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -64,7 +64,6 @@ class BasePipeline(ABC): return conv_res def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult: - def _prepare_elements( conv_res: ConversionResult, model: GenericEnrichmentModel[Any] ) -> Iterable[NodeItem]: @@ -113,7 +112,6 @@ class BasePipeline(ABC): class PaginatedPipeline(BasePipeline): # TODO this is a bad name. - def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) self.keep_backend = False @@ -127,7 +125,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. 
yield from page_batch def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(conv_res.input._backend, PdfDocumentBackend): raise RuntimeError( f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " @@ -139,7 +136,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. total_elapsed_time = 0.0 with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): - for i in range(0, conv_res.input.page_count): start_page, end_page = conv_res.input.limits.page_range if (start_page - 1) <= i <= (end_page - 1): @@ -161,7 +157,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. pipeline_pages = self._apply_on_pages(conv_res, init_pages) for p in pipeline_pages: # Must exhaust! - # Cleanup cached images if not self.keep_images: p._image_cache = {} diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index fb985231..2e8f0ea0 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline): super().__init__(pipeline_options) def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend): raise RuntimeError( f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. " diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 79279fd0..b8892a49 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -32,7 +32,6 @@ _log = logging.getLogger(__name__) class VlmPipeline(PaginatedPipeline): - def __init__(self, pipeline_options: VlmPipelineOptions): super().__init__(pipeline_options) self.keep_backend = True @@ -114,7 +113,6 @@ class VlmPipeline(PaginatedPipeline): def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): - if ( self.pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS diff --git a/docling/utils/export.py b/docling/utils/export.py index 2c0077e8..4f3ac6fe 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -13,7 +13,6 @@ _log = logging.getLogger(__name__) def generate_multimodal_pages( doc_result: ConversionResult, ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: - label_to_doclaynet = { "title": "title", "table-of-contents": "document_index", @@ -122,7 +121,6 @@ def generate_multimodal_pages( if doc.main_text is None: return for ix, orig_item in enumerate(doc.main_text): - item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None or item.prov is None or len(item.prov) == 0: _log.debug(f"Skipping item {orig_item}") diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py index 17d8f8bd..66ce9aec 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docling/utils/layout_postprocessor.py @@ -484,7 +484,9 @@ class LayoutPostprocessor: spatial_index = ( self.regular_index if cluster_type == "regular" - else self.picture_index if cluster_type == "picture" else self.wrapper_index + else self.picture_index + if cluster_type == "picture" + else self.wrapper_index ) # Map of currently valid clusters diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index 
ca24d95d..1ebfc46c 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel # How the pipeline can be extended. class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline): - def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions): super().__init__(pipeline_options) self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py index e7ea3df2..a49999a2 100644 --- a/docs/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -51,7 +51,6 @@ def main(): page_segments, page, ) in generate_multimodal_pages(conv_res): - dpi = page._default_image_scale * 72 rows.append( diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py index 8f092921..9a911d84 100644 --- a/docs/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -32,12 +32,12 @@ def main(): print(table_df.to_markdown()) # Save the table as csv - element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" + element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv" _log.info(f"Saving CSV table to {element_csv_filename}") table_df.to_csv(element_csv_filename) # Save the table as html - element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html" + element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html" _log.info(f"Saving HTML table to {element_html_filename}") with element_html_filename.open("w") as fp: fp.write(table.export_to_html(doc=conv_res.document)) diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index 9f867b1d..d863313d 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -487,7 +487,7 @@ "\n", " all_succeeded = all(r.succeeded for r in resp)\n", " console.print(\n", - " f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n", + " f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n", " f\"first_doc_status_code: {resp[0].status_code}\"\n", " )\n", "\n", diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb index 37c41709..17fe8e6e 100644 --- a/docs/examples/rag_langchain.ipynb +++ b/docs/examples/rag_langchain.ipynb @@ -341,7 +341,7 @@ "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n", "for i, doc in enumerate(resp_dict[\"context\"]):\n", " print()\n", - " print(f\"Source {i+1}:\")\n", + " print(f\"Source {i + 1}:\")\n", " print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n", " for key in doc.metadata:\n", " if key != \"pk\":\n", diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py index 46be97e2..6d1250be 100644 --- a/docs/examples/run_md.py +++ b/docs/examples/run_md.py @@ -25,9 +25,7 @@ def main(): document = mdb.convert() out_path = Path("scratch") - print( - f"Document {path} converted." 
-        f"\nSaved markdown output to: {str(out_path)}"
-    )
+    print(f"Document {path} converted.\nSaved markdown output to: {str(out_path)}")
 
     # Export Docling document format to markdowndoc:
     fn = os.path.basename(path)
diff --git a/docs/examples/translate.py b/docs/examples/translate.py
index fa39b6d3..450f3905 100644
--- a/docs/examples/translate.py
+++ b/docs/examples/translate.py
@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0
 
 # FIXME: put in your favorite translation code ....
 def translate(text: str, src: str = "en", dest: str = "de"):
-
     _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
 
     # from googletrans import Translator
diff --git a/docs/examples/visual_grounding.ipynb b/docs/examples/visual_grounding.ipynb
index 4d091da9..63200ed6 100644
--- a/docs/examples/visual_grounding.ipynb
+++ b/docs/examples/visual_grounding.ipynb
@@ -432,7 +432,7 @@
     "\n",
     "for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
     "    image_by_page = {}\n",
-    "    print(f\"Source {i+1}:\")\n",
+    "    print(f\"Source {i + 1}:\")\n",
     "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
     "    meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
     "\n",
diff --git a/pyproject.toml b/pyproject.toml
index d53ded12..f96e2683 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -166,15 +166,72 @@ docling-tools = "docling.cli.tools:app"
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
-[tool.black]
+[tool.ruff]
+target-version = "py39"
 line-length = 88
-target-version = ["py39"]
-include = '\.pyi?$'
+respect-gitignore = true
 
-[tool.isort]
-profile = "black"
-line_length = 88
-py_version = 39
+# extend-exclude = [
+#     "tests",
+# ]
+
+[tool.ruff.format]
+skip-magic-trailing-comma = false
+
+[tool.ruff.lint]
+select = [
+    # "B", # flake8-bugbear
+    "C", # flake8-comprehensions
+    "C9", # mccabe
+    # "D", # flake8-docstrings
+    "E", # pycodestyle errors (default)
+    "F", # pyflakes (default)
+    "I", # isort
+    "PD", # pandas-vet
+    "PIE", # pie
+    # "PTH", # pathlib
+    "Q", # flake8-quotes
+    # "RET", # return
+    "RUF", # Enable all ruff-specific checks
+    # "SIM", # simplify
+    "S307", # eval
+    # "T20", # (disallow print statements) keep debugging statements out of the codebase
+    "W", # pycodestyle warnings
+    "ASYNC", # async
+    "UP", # pyupgrade
+]
+
+ignore = [
+    "E501", # Line too long, handled by ruff formatter
+    "D107", # "Missing docstring in __init__",
+    "F811", # "redefinition of the same function"
+    "PL", # Pylint
+    "RUF012", # Mutable Class Attributes
+    "UP007", # Optional and Union
+]
+
+#extend-select = []
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
+
+[tool.ruff.lint.mccabe]
+max-complexity = 15
+
+# [tool.ruff.lint.isort.sections]
+# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+# section-order = [
+#     "future",
+#     "standard-library",
+#     "third-party",
+#     "docling",
+#     "first-party",
+#     "local-folder",
+# ]
 
 [tool.mypy]
 pretty = true
@@ -202,10 +259,6 @@ module = [
 ]
 ignore_missing_imports = true
 
-[tool.flake8]
-max-line-length = 88
-extend-ignore = ["E203", "E501"]
-
 [tool.semantic_release]
 # for default values check:
 # https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py
index 4574a228..bb050638 100644
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@@ -19,7
+19,6 @@ def _get_backend(fname): def test_asciidocs_examples(): - fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc")) for fname in fnames: diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index 2eee27bf..b5c9fd44 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -15,7 +15,6 @@ GENERATE = GEN_TEST_DATA def get_csv_paths(): - # Define the directory you want to search directory = Path(f"./tests/data/csv/") @@ -24,13 +23,11 @@ def get_csv_paths(): def get_csv_path(name: str): - # Return the matching CSV file path return Path(f"./tests/data/csv/{name}.csv") def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.CSV]) return converter @@ -55,9 +52,9 @@ def test_e2e_valid_csv_conversions(): pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt" - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt"), ( + "export to indented-text" + ) assert verify_document( pred_doc=doc, diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index 3c214791..62d9ad9d 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -42,9 +42,9 @@ def test_text_cell_counts(): last_cell_count = len(cells) if len(cells) != last_cell_count: - assert ( - False - ), "Loading page multiple times yielded non-identical text cell counts" + assert False, ( + "Loading page multiple times yielded non-identical text cell counts" + ) last_cell_count = len(cells) diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py index ee0e5c75..c67fb8b7 100644 --- a/tests/test_backend_docling_parse_v2.py +++ b/tests/test_backend_docling_parse_v2.py @@ -41,9 +41,9 @@ def test_text_cell_counts(): last_cell_count = len(cells) if len(cells) != last_cell_count: - assert ( - False - ), "Loading page multiple times yielded non-identical text cell counts" + assert False, ( + "Loading page multiple times yielded non-identical text cell counts" + ) last_cell_count = len(cells) diff --git a/tests/test_backend_docling_parse_v4.py b/tests/test_backend_docling_parse_v4.py index fcb551e9..42b9b40b 100644 --- a/tests/test_backend_docling_parse_v4.py +++ b/tests/test_backend_docling_parse_v4.py @@ -41,9 +41,9 @@ def test_text_cell_counts(): last_cell_count = len(cells) if len(cells) != last_cell_count: - assert ( - False - ), "Loading page multiple times yielded non-identical text cell counts" + assert False, ( + "Loading page multiple times yielded non-identical text cell counts" + ) last_cell_count = len(cells) diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 5f5e7407..18254a78 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -105,7 +105,6 @@ def test_ordered_lists(): def get_html_paths(): - # Define the directory you want to search directory = Path("./tests/data/html/") @@ -115,14 +114,12 @@ def get_html_paths(): def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.HTML]) return converter def test_e2e_html_conversions(): - html_paths = get_html_paths() converter = get_converter() @@ -138,15 +135,15 @@ def test_e2e_html_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export( - pred_md, str(gt_path) + ".md", generate=GENERATE - ), "export to md" + assert verify_export(pred_md, str(gt_path) + ".md", 
generate=GENERATE), ( + "export to md" + ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt", generate=GENERATE - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( + "export to indented-text" + ) assert verify_document(doc, str(gt_path) + ".json", GENERATE) diff --git a/tests/test_backend_jats.py b/tests/test_backend_jats.py index d209431b..462cf72d 100644 --- a/tests/test_backend_jats.py +++ b/tests/test_backend_jats.py @@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False): pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt" - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt"), ( + "export to indented-text" + ) assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json" diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 0604429c..0ce2ec57 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -17,7 +17,6 @@ GENERATE = GEN_TEST_DATA def get_xlsx_paths(): - # Define the directory you want to search directory = Path("./tests/data/xlsx/") @@ -27,7 +26,6 @@ def get_xlsx_paths(): def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.XLSX]) return converter @@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None: pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt" - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt"), ( + "export to indented-text" + ) - assert verify_document( - doc, str(gt_path) + ".json", GENERATE - ), "document document" + assert verify_document(doc, str(gt_path) + ".json", GENERATE), ( + "document document" + ) def test_pages(documents) -> None: diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 5c43ccf4..bd25cd41 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -43,7 +43,6 @@ def test_heading_levels(): def get_docx_paths(): - # Define the directory you want to search directory = Path("./tests/data/docx/") @@ -53,14 +52,12 @@ def get_docx_paths(): def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.DOCX]) return converter def test_e2e_docx_conversions(): - docx_paths = get_docx_paths() converter = get_converter() @@ -76,20 +73,20 @@ def test_e2e_docx_conversions(): doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() - assert verify_export( - pred_md, str(gt_path) + ".md", generate=GENERATE - ), "export to md" + assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( + "export to md" + ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt", generate=GENERATE - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( + "export to indented-text" + ) - assert verify_document( - doc, str(gt_path) + ".json", generate=GENERATE - ), "document document" + assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( + "document document" + ) if docx_path.name == "word_tables.docx": pred_html: str = doc.export_to_html() diff --git 
a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py index aebc01d9..d8ca57d8 100644 --- a/tests/test_backend_patent_uspto.py +++ b/tests/test_backend_patent_uspto.py @@ -109,20 +109,20 @@ def test_patent_groundtruth(patents, groundtruth): md_name = path.stem + ".md" if md_name in gt_names: pred_md = doc.export_to_markdown() - assert ( - pred_md == gt_names[md_name] - ), f"Markdown file mismatch against groundtruth {md_name}" + assert pred_md == gt_names[md_name], ( + f"Markdown file mismatch against groundtruth {md_name}" + ) json_path = path.with_suffix(".json") if json_path.stem in gt_names: - assert verify_document( - doc, str(json_path), GENERATE - ), f"JSON file mismatch against groundtruth {json_path}" + assert verify_document(doc, str(json_path), GENERATE), ( + f"JSON file mismatch against groundtruth {json_path}" + ) itxt_name = path.stem + ".itxt" if itxt_name in gt_names: pred_itxt = doc._export_to_indented_text() - assert ( - pred_itxt == gt_names[itxt_name] - ), f"Indented text file mismatch against groundtruth {itxt_name}" + assert pred_itxt == gt_names[itxt_name], ( + f"Indented text file mismatch against groundtruth {itxt_name}" + ) def test_tables(tables): diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index 10a2b9e7..7d6c9ce1 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -42,9 +42,9 @@ def test_text_cell_counts(): last_cell_count = len(cells) if len(cells) != last_cell_count: - assert ( - False - ), "Loading page multiple times yielded non-identical text cell counts" + assert False, ( + "Loading page multiple times yielded non-identical text cell counts" + ) last_cell_count = len(cells) diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py index 947e9e6b..886470b3 100644 --- a/tests/test_backend_pptx.py +++ b/tests/test_backend_pptx.py @@ -12,7 +12,6 @@ GENERATE = GEN_TEST_DATA def get_pptx_paths(): - # Define the directory you want to search directory = Path("./tests/data/pptx/") @@ -22,14 +21,12 @@ def get_pptx_paths(): def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.PPTX]) return converter def test_e2e_pptx_conversions(): - pptx_paths = get_pptx_paths() converter = get_converter() @@ -50,10 +47,10 @@ def test_e2e_pptx_conversions(): pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt" - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt"), ( + "export to indented-text" + ) - assert verify_document( - doc, str(gt_path) + ".json", GENERATE - ), "document document" + assert verify_document(doc, str(gt_path) + ".json", GENERATE), ( + "document document" + ) diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py index 085e094a..0c128969 100644 --- a/tests/test_code_formula.py +++ b/tests/test_code_formula.py @@ -12,7 +12,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline def get_converter(): - pipeline_options = PdfPipelineOptions() pipeline_options.generate_page_images = True diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py index 2ac1da9f..255e630d 100644 --- a/tests/test_document_picture_classifier.py +++ b/tests/test_document_picture_classifier.py @@ -11,7 +11,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline def get_converter(): - pipeline_options = PdfPipelineOptions() 
pipeline_options.generate_page_images = True @@ -52,29 +51,29 @@ def test_picture_classifier(): assert type(res.annotations[0]) == PictureClassificationData classification_data = res.annotations[0] assert classification_data.provenance == "DocumentPictureClassifier" - assert ( - len(classification_data.predicted_classes) == 16 - ), "Number of predicted classes is not equal to 16" + assert len(classification_data.predicted_classes) == 16, ( + "Number of predicted classes is not equal to 16" + ) confidences = [pred.confidence for pred in classification_data.predicted_classes] - assert confidences == sorted( - confidences, reverse=True - ), "Predictions are not sorted in descending order of confidence" - assert ( - classification_data.predicted_classes[0].class_name == "bar_chart" - ), "The prediction is wrong for the bar chart image." + assert confidences == sorted(confidences, reverse=True), ( + "Predictions are not sorted in descending order of confidence" + ) + assert classification_data.predicted_classes[0].class_name == "bar_chart", ( + "The prediction is wrong for the bar chart image." + ) res = results[1] assert len(res.annotations) == 1 assert type(res.annotations[0]) == PictureClassificationData classification_data = res.annotations[0] assert classification_data.provenance == "DocumentPictureClassifier" - assert ( - len(classification_data.predicted_classes) == 16 - ), "Number of predicted classes is not equal to 16" + assert len(classification_data.predicted_classes) == 16, ( + "Number of predicted classes is not equal to 16" + ) confidences = [pred.confidence for pred in classification_data.predicted_classes] - assert confidences == sorted( - confidences, reverse=True - ), "Predictions are not sorted in descending order of confidence" - assert ( - classification_data.predicted_classes[0].class_name == "map" - ), "The prediction is wrong for the bar chart image." + assert confidences == sorted(confidences, reverse=True), ( + "Predictions are not sorted in descending order of confidence" + ) + assert classification_data.predicted_classes[0].class_name == "map", ( + "The prediction is wrong for the bar chart image." 
+ ) diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 590558f2..4d23fe7a 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -15,7 +15,6 @@ GENERATE_V2 = GEN_TEST_DATA def get_pdf_paths(): - # Define the directory you want to search directory = Path("./tests/data/pdf/") @@ -25,7 +24,6 @@ def get_pdf_paths(): def get_converter(): - pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True @@ -45,7 +43,6 @@ def get_converter(): def test_e2e_pdfs_conversions(): - pdf_paths = get_pdf_paths() converter = get_converter() diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 946ad068..661fe93d 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -12,7 +12,6 @@ from docling.document_converter import PdfFormatOption def test_in_doc_from_valid_path(): - test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") doc = _make_input_doc(test_doc_path) assert doc.valid == True @@ -27,7 +26,6 @@ def test_in_doc_from_invalid_path(): def test_in_doc_from_valid_buf(): - buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read()) stream = DocumentStream(name="my_doc.pdf", stream=buf) @@ -36,7 +34,6 @@ def test_in_doc_from_valid_buf(): def test_in_doc_from_invalid_buf(): - buf = BytesIO(b"") stream = DocumentStream(name="my_doc.pdf", stream=buf) @@ -45,7 +42,6 @@ def test_in_doc_from_invalid_buf(): def test_image_in_pdf_backend(): - in_doc = InputDocument( path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), format=InputFormat.IMAGE, @@ -76,7 +72,6 @@ def test_image_in_pdf_backend(): def test_in_doc_with_page_range(): - test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") limits = DocumentLimits() limits.page_range = (1, 10) diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index 29130c53..3c2a2b4c 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -16,14 +16,12 @@ GENERATE = GEN_TEST_DATA def get_pdf_path(): - pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path @pytest.fixture def converter(): - pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False pipeline_options.do_table_structure = True @@ -42,7 +40,6 @@ def converter(): def test_convert_path(converter: DocumentConverter): - pdf_path = get_pdf_path() print(f"converting {pdf_path}") @@ -56,7 +53,6 @@ def test_convert_path(converter: DocumentConverter): def test_convert_stream(converter: DocumentConverter): - pdf_path = get_pdf_path() print(f"converting {pdf_path}") diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py index 68716cba..3cc7a630 100644 --- a/tests/test_invalid_input.py +++ b/tests/test_invalid_input.py @@ -8,7 +8,6 @@ from docling.document_converter import ConversionError, DocumentConverter def get_pdf_path(): - pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") return pdf_path diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index c46f8990..13ca91cd 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -23,7 +23,6 @@ def test_doc_paths(): def get_converter(): - pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = False diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 02861a8b..fdc83e2d 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -21,7 +21,6 @@ from docling.datamodel.document import ConversionResult def levenshtein(str1: str, str2: str) -> int: - # 
Ensure str1 is the shorter string to optimize memory usage if len(str1) > len(str2): str1, str2 = str2, str1 @@ -46,7 +45,6 @@ def levenshtein(str1: str, str2: str) -> int: def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4): - if len(gt) == 0 or not fuzzy: assert gt == pred, f"{gt}!={pred}" else: @@ -57,22 +55,19 @@ def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4): def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]): - - assert len(doc_pred_pages) == len( - doc_true_pages - ), "pred- and true-doc do not have the same number of pages" + assert len(doc_pred_pages) == len(doc_true_pages), ( + "pred- and true-doc do not have the same number of pages" + ) for pid, page_true_item in enumerate(doc_true_pages): - num_true_cells = len(page_true_item.cells) num_pred_cells = len(doc_pred_pages[pid].cells) - assert ( - num_true_cells == num_pred_cells - ), f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}" + assert num_true_cells == num_pred_cells, ( + f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}" + ) for cid, cell_true_item in enumerate(page_true_item.cells): - cell_pred_item = doc_pred_pages[pid].cells[cid] true_text = cell_true_item.text @@ -81,9 +76,9 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]): true_bbox = cell_true_item.rect.to_bounding_box().as_tuple() pred_bbox = cell_pred_item.rect.to_bounding_box().as_tuple() - assert ( - true_bbox == pred_bbox - ), f"bbox is not the same: {true_bbox} != {pred_bbox}" + assert true_bbox == pred_bbox, ( + f"bbox is not the same: {true_bbox} != {pred_bbox}" + ) return True @@ -123,19 +118,19 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool): # print("Expected number of tables: {}, result: {}".format(len(doc_true.tables), len(doc_pred.tables))) - assert len(doc_true.tables) == len( - doc_pred.tables - ), "document has different count of tables than expected." + assert len(doc_true.tables) == len(doc_pred.tables), ( + "document has different count of tables than expected." 
+ ) for l, true_item in enumerate(doc_true.tables): pred_item = doc_pred.tables[l] - assert ( - true_item.num_rows == pred_item.num_rows - ), "table does not have the same #-rows" - assert ( - true_item.num_cols == pred_item.num_cols - ), "table does not have the same #-cols" + assert true_item.num_rows == pred_item.num_rows, ( + "table does not have the same #-rows" + ) + assert true_item.num_cols == pred_item.num_cols, ( + "table does not have the same #-cols" + ) assert true_item.data is not None, "documents are expected to have table data" assert pred_item.data is not None, "documents are expected to have table data" @@ -145,7 +140,6 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool): for i, row in enumerate(true_item.data): for j, col in enumerate(true_item.data[i]): - # print("true: ", true_item.data[i][j].text) # print("pred: ", pred_item.data[i][j].text) # print("") @@ -154,20 +148,20 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool): true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy=fuzzy ) - assert ( - true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type - ), "table-cell does not have the same type" + assert true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type, ( + "table-cell does not have the same type" + ) return True def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool): - assert ( - true_item.data.num_rows == pred_item.data.num_rows - ), "table does not have the same #-rows" - assert ( - true_item.data.num_cols == pred_item.data.num_cols - ), "table does not have the same #-cols" + assert true_item.data.num_rows == pred_item.data.num_rows, ( + "table does not have the same #-rows" + ) + assert true_item.data.num_cols == pred_item.data.num_cols, ( + "table does not have the same #-cols" + ) assert true_item.data is not None, "documents are expected to have table data" assert pred_item.data is not None, "documents are expected to have table data" @@ -177,7 +171,6 @@ def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool): for i, row in enumerate(true_item.data.grid): for j, col in enumerate(true_item.data.grid[i]): - # print("true: ", true_item.data[i][j].text) # print("pred: ", pred_item.data[i][j].text) # print("") @@ -225,9 +218,9 @@ def verify_picture_image_v2( def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool): assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match." - assert len(doc_true.tables) == len( - doc_pred.tables - ), "document has different count of tables than expected." + assert len(doc_true.tables) == len(doc_pred.tables), ( + "document has different count of tables than expected." 
+ ) for (true_item, _true_level), (pred_item, _pred_level) in zip( doc_true.iterate_items(), doc_pred.iterate_items() @@ -261,25 +254,25 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: # Validate table content if isinstance(true_item, TableItem): - assert isinstance( - pred_item, TableItem - ), "Test item is not a TableItem as the expected one" - assert verify_table_v2( - true_item, pred_item, fuzzy=fuzzy - ), "Tables not matching" + assert isinstance(pred_item, TableItem), ( + "Test item is not a TableItem as the expected one" + ) + assert verify_table_v2(true_item, pred_item, fuzzy=fuzzy), ( + "Tables not matching" + ) # Validate picture content if isinstance(true_item, PictureItem): - assert isinstance( - pred_item, PictureItem - ), "Test item is not a PictureItem as the expected one" + assert isinstance(pred_item, PictureItem), ( + "Test item is not a PictureItem as the expected one" + ) true_image = true_item.get_image(doc=doc_true) pred_image = true_item.get_image(doc=doc_pred) if true_image is not None: - assert verify_picture_image_v2( - true_image, pred_image - ), "Picture image mismatch" + assert verify_picture_image_v2(true_image, pred_image), ( + "Picture image mismatch" + ) # TODO: check picture annotations @@ -303,9 +296,9 @@ def verify_conversion_result_v1( ): PageList = TypeAdapter(List[Page]) - assert ( - doc_result.status == ConversionStatus.SUCCESS - ), f"Doc {input_path} did not convert successfully." + assert doc_result.status == ConversionStatus.SUCCESS, ( + f"Doc {input_path} did not convert successfully." + ) doc_pred_pages: List[Page] = doc_result.pages doc_pred: DsDocument = doc_result.legacy_document @@ -357,25 +350,25 @@ def verify_conversion_result_v1( doc_true_dt = fr.read() if not fuzzy: - assert verify_cells( - doc_pred_pages, doc_true_pages - ), f"Mismatch in PDF cell prediction for {input_path}" + assert verify_cells(doc_pred_pages, doc_true_pages), ( + f"Mismatch in PDF cell prediction for {input_path}" + ) # assert verify_output( # doc_pred, doc_true # ), f"Mismatch in JSON prediction for {input_path}" - assert verify_tables_v1( - doc_pred, doc_true, fuzzy=fuzzy - ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}" + assert verify_tables_v1(doc_pred, doc_true, fuzzy=fuzzy), ( + f"verify_tables(doc_pred, doc_true) mismatch for {input_path}" + ) - assert verify_md( - doc_pred_md, doc_true_md, fuzzy=fuzzy - ), f"Mismatch in Markdown prediction for {input_path}" + assert verify_md(doc_pred_md, doc_true_md, fuzzy=fuzzy), ( + f"Mismatch in Markdown prediction for {input_path}" + ) - assert verify_dt( - doc_pred_dt, doc_true_dt, fuzzy=fuzzy - ), f"Mismatch in DocTags prediction for {input_path}" + assert verify_dt(doc_pred_dt, doc_true_dt, fuzzy=fuzzy), ( + f"Mismatch in DocTags prediction for {input_path}" + ) def verify_conversion_result_v2( @@ -387,9 +380,9 @@ def verify_conversion_result_v2( ): PageList = TypeAdapter(List[Page]) - assert ( - doc_result.status == ConversionStatus.SUCCESS - ), f"Doc {input_path} did not convert successfully." + assert doc_result.status == ConversionStatus.SUCCESS, ( + f"Doc {input_path} did not convert successfully." 
+    )
 
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DoclingDocument = doc_result.document
@@ -439,29 +432,28 @@
             doc_true_dt = fr.read()
 
     if not fuzzy:
-        assert verify_cells(
-            doc_pred_pages, doc_true_pages
-        ), f"Mismatch in PDF cell prediction for {input_path}"
+        assert verify_cells(doc_pred_pages, doc_true_pages), (
+            f"Mismatch in PDF cell prediction for {input_path}"
+        )
 
     # assert verify_output(
     #    doc_pred, doc_true
     # ), f"Mismatch in JSON prediction for {input_path}"
 
-    assert verify_docitems(
-        doc_pred, doc_true, fuzzy=fuzzy
-    ), f"verify_docling_document(doc_pred, doc_true) mismatch for {input_path}"
+    assert verify_docitems(doc_pred, doc_true, fuzzy=fuzzy), (
+        f"verify_docling_document(doc_pred, doc_true) mismatch for {input_path}"
+    )
 
-    assert verify_md(
-        doc_pred_md, doc_true_md, fuzzy=fuzzy
-    ), f"Mismatch in Markdown prediction for {input_path}"
+    assert verify_md(doc_pred_md, doc_true_md, fuzzy=fuzzy), (
+        f"Mismatch in Markdown prediction for {input_path}"
+    )
 
-    assert verify_dt(
-        doc_pred_dt, doc_true_dt, fuzzy=fuzzy
-    ), f"Mismatch in DocTags prediction for {input_path}"
+    assert verify_dt(doc_pred_dt, doc_true_dt, fuzzy=fuzzy), (
+        f"Mismatch in DocTags prediction for {input_path}"
+    )
 
 
 def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
-
     if not os.path.exists(gtfile) or generate:
         with open(gtfile, "w") as fw:
             json.dump(pred_doc.export_to_dict(), fw, indent=2)
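
The net effect of the tooling changes above is that a single ruff-format pre-commit hook plus a [tool.ruff] section replace Black, isort, flake8, and the nbQA wrappers. As a minimal sketch for exercising the new setup locally (assuming Ruff, pinned at v0.11.5 by the hook, and pre-commit are installed in the Poetry environment; the paths mirror the hook's files pattern):

    # Format with the new Ruff configuration
    poetry run ruff format --config=pyproject.toml docling tests docs/examples

    # Preview what the still-commented-out linter hook would report
    poetry run ruff check --config=pyproject.toml docling tests docs/examples

    # Or run all configured hooks, including ruff-format, in one pass
    poetry run pre-commit run --all-files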