Apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-04-14 15:01:05 +02:00
parent 73cec158c6
commit 557efde7dc
30 changed files with 65 additions and 69 deletions

View File

@ -81,8 +81,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
title, section headers, text, lists, and tables.
"""
content = ""
in_list = False
in_table = False
@ -268,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
def _get_current_level(self, parents):
for k, v in parents.items():
if v == None and k > 0:
if v is None and k > 0:
return k - 1
return 0
def _get_current_parent(self, parents):
for k, v in parents.items():
if v == None and k > 0:
if v is None and k > 0:
return parents[k - 1]
return None
@ -323,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
"indent": 0 if indent is None else len(indent),
}
else:
return {
@ -331,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
"indent": 0 if indent is None else len(indent),
}
else:
# Fallback if no match

View File

@ -328,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s:
raise RuntimeError("Not support lim %s" % t_dict["e"])
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
else:
return latex_s.format(lim=t_dict.get("lim"))

View File

@ -146,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
item for item in element.next_siblings if isinstance(item, Tag)
]
if element.next_sibling is None or any(
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
):
text = text.strip()
if text and tag.name in ["div"]:

View File

@ -126,7 +126,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enum_list_item_value = 0
new_list = None
bullet_type = "None"
list_text = ""
list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@ -368,8 +367,6 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
slide_width = pptx_obj.slide_width
slide_height = pptx_obj.slide_height
text_content = [] # type: ignore
max_levels = 10
parents = {} # type: ignore
for i in range(max_levels):
@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
)
slide_size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
doc.add_page(page_no=slide_ind + 1, size=slide_size)
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)

View File

@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _get_level(self) -> int:
"""Return the first None index."""
for k, v in self.parents.items():
if k >= 0 and v == None:
if k >= 0 and v is None:
return k
return 0

View File

@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
doc_info: etree.DocInfo = self.tree.docinfo
if doc_info.system_url and any(
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
kwd in doc_info.system_url for kwd in JATS_DTD_URL
):
self.valid = True
return
for ent in doc_info.internalDTD.iterentities():
if ent.system_url and any(
[kwd in ent.system_url for kwd in JATS_DTD_URL]
kwd in ent.system_url for kwd in JATS_DTD_URL
):
self.valid = True
return
@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
# TODO: once superscript is supported, add label with formatting
aff = aff.removeprefix(f"{label[0].text}, ")
affiliation_names.append(aff)
affiliation_ids_names = {
id: name
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
}
affiliation_ids_names = dict(
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
)
# Get author names and affiliation names
for author_node in meta.xpath(

View File

@ -1472,9 +1472,7 @@ class XmlTable:
if cw == 0:
offset_w0.append(col["offset"][ic])
min_colinfo["offset"] = sorted(
list(set(col["offset"] + min_colinfo["offset"]))
)
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
# add back the 0 width cols to offset list
offset_w0 = list(set(offset_w0))

View File

@ -430,7 +430,7 @@ def convert(
settings.debug.visualize_ocr = debug_visualize_ocr
if from_formats is None:
from_formats = [e for e in InputFormat]
from_formats = list(InputFormat)
parsed_headers: Optional[Dict[str, str]] = None
if headers is not None:

View File

@ -89,14 +89,13 @@ def download(
"Cannot simultaneously set 'all' parameter and specify models to download."
)
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
level=logging.INFO,
format="[blue]%(message)s[/blue]",
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
to_download = models or (list(_AvailableModels) if all else _default_models)
output_dir = download_models(
output_dir=output_dir,
force=force,

View File

@ -172,7 +172,7 @@ class DocumentConverter:
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
):
self.allowed_formats = (
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
allowed_formats if allowed_formats is not None else list(InputFormat)
)
self.format_to_options = {
format: (

View File

@ -59,13 +59,11 @@ class EasyOcrModel(BaseOcrModel):
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
[
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
]
)
else:
warnings.warn(

View File

@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
@property
def registered_kind(self) -> list[str]:
return list(opt.kind for opt in self._classes.keys())
return [opt.kind for opt in self._classes.keys()]
def get_enum(self) -> enum.Enum:
return enum.Enum(

View File

@ -121,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
generation_time = time.time() - start_time
page_tags = output
_log.debug(f"Generation time {generation_time:.2f} seconds.")
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")

View File

@ -166,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
num_tokens = len(generated_ids[0])
page_tags = generated_texts
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")

View File

@ -64,7 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
)
def _get_name_and_version(self) -> Tuple[str, str]:
if self._name != None and self._version != None:
if self._name is not None and self._version is not None:
return self._name, self._version # type: ignore
cmd = [self.options.tesseract_cmd, "--version"]
@ -187,7 +187,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self._tesseract_languages = df[0].tolist()[1:]
# Decide the script prefix
if any([l.startswith("script/") for l in self._tesseract_languages]):
if any(l.startswith("script/") for l in self._tesseract_languages):
script_prefix = "script/"
else:
script_prefix = ""

View File

@ -76,7 +76,7 @@ class TesseractOcrModel(BaseOcrModel):
_log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang)
if any([l.startswith("script/") for l in self._tesserocr_languages]):
if any(l.startswith("script/") for l in self._tesserocr_languages):
self.script_prefix = "script/"
else:
self.script_prefix = ""

View File

@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
if isinstance(iterator, List):
iterator = iter(iterator)
for first in iterator: # Take the first element from the iterator
yield [first] + list(islice(iterator, chunk_size - 1))
yield [first, *list(islice(iterator, chunk_size - 1))]
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:

View File

@ -544,7 +544,7 @@
"source": [
"doc = backend.convert()\n",
"\n",
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
]
},

View File

@ -84,7 +84,7 @@ def main():
)
}
)
result = doc_converter.convert(input_doc_path)
doc_converter.convert(input_doc_path)
if __name__ == "__main__":

View File

@ -807,10 +807,12 @@
}
],
"source": [
"from typing import Optional\n",
"\n",
"from azure.search.documents.models import VectorizableTextQuery\n",
"\n",
"\n",
"def generate_chat_response(prompt: str, system_message: str = None):\n",
"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
" \"\"\"\n",
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
" If you need multi-turn conversation or follow-up queries, you'll have to\n",

View File

@ -51,10 +51,9 @@ def main():
}
)
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path)
conv_doc = conv_res.document
doc_filename = conv_res.input.file
# Save markdown with embedded pictures in original text
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"

View File

@ -38,7 +38,7 @@ def test_asciidocs_examples():
if os.path.exists(gname):
with open(gname) as fr:
true_mddoc = fr.read()
fr.read()
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else:

View File

@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -79,7 +79,7 @@ def test_pages(documents) -> None:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,
@ -90,7 +90,7 @@ def test_pages(documents) -> None:
assert backend.page_count() == 3
# number of pages from the converted document
doc = [item for path, item in documents if path.stem == "test-01"][0]
doc = next(item for path, item in documents if path.stem == "test-01")
assert len(doc.pages) == 3
# page sizes as number of cells

View File

@ -129,7 +129,7 @@ def test_tables(tables):
"""Test the table parser."""
# CHECK table in file tables_20180000016.xml
file_name = "tables_ipa20180000016.xml"
file_table = [item[1] for item in tables if item[0].name == file_name][0]
file_table = next(item[1] for item in tables if item[0].name == file_name)
assert file_table.num_rows == 13
assert file_table.num_cols == 10
assert len(file_table.table_cells) == 130
@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20200022300
file_name = "ipa20200022300.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
file_name = "ipa20180000016.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20110039701 for complex long tables
file_name = "ipa20110039701.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
assert doc.name == file_name
assert len(doc.tables) == 17
@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
# CHECK application doc number 06442728
file_name = "pg06442728.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
# CHECK application doc number 20010031492
file_name = "pa20010031492.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
# CHECK application doc number 057006474
file_name = "pftaps057006474.txt"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)

View File

@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -14,7 +14,7 @@ from docling.document_converter import PdfFormatOption
def test_in_doc_from_valid_path():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path)
assert doc.valid == True
assert doc.valid is True
def test_in_doc_from_invalid_path():
@ -22,7 +22,7 @@ def test_in_doc_from_invalid_path():
doc = _make_input_doc(test_doc_path)
assert doc.valid == False
assert doc.valid is False
def test_in_doc_from_valid_buf():
@ -30,7 +30,7 @@ def test_in_doc_from_valid_buf():
stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream)
assert doc.valid == True
assert doc.valid is True
def test_in_doc_from_invalid_buf():
@ -38,7 +38,7 @@ def test_in_doc_from_invalid_buf():
stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream)
assert doc.valid == False
assert doc.valid is False
def test_image_in_pdf_backend():
@ -82,7 +82,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend,
limits=limits,
)
assert doc.valid == True
assert doc.valid is True
limits.page_range = (9, 9)
@ -92,7 +92,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend,
limits=limits,
)
assert doc.valid == True
assert doc.valid is True
limits.page_range = (11, 12)
@ -102,7 +102,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend,
limits=limits,
)
assert doc.valid == False
assert doc.valid is False
def test_guess_format(tmp_path):
@ -187,17 +187,17 @@ def test_guess_format(tmp_path):
)
doc_path = temp_dir / "docling_test.xml"
doc_path.write_text(xml_content, encoding="utf-8")
assert dci._guess_format(doc_path) == None
assert dci._guess_format(doc_path) is None
buf = BytesIO(Path(doc_path).open("rb").read())
stream = DocumentStream(name="docling_test.xml", stream=buf)
assert dci._guess_format(stream) == None
assert dci._guess_format(stream) is None
# Invalid USPTO patent (as plain text)
stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
assert dci._guess_format(stream) == None
assert dci._guess_format(stream) is None
doc_path = temp_dir / "pftaps_wrong.txt"
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) == None
assert dci._guess_format(doc_path) is None
# Valid Docling JSON
test_str = '{"name": ""}'

View File

@ -291,7 +291,7 @@ def verify_conversion_result_v1(
input_path: Path,
doc_result: ConversionResult,
generate: bool = False,
ocr_engine: str = None,
ocr_engine: Optional[str] = None,
fuzzy: bool = False,
):
PageList = TypeAdapter(List[Page])
@ -375,7 +375,7 @@ def verify_conversion_result_v2(
input_path: Path,
doc_result: ConversionResult,
generate: bool = False,
ocr_engine: str = None,
ocr_engine: Optional[str] = None,
fuzzy: bool = False,
):
PageList = TypeAdapter(List[Page])