mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Merge from main
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
commit
d788bf2a6e
@ -1,9 +1,9 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Optional, Set, Union
|
||||
from typing import Optional, Union, cast
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
@ -12,6 +12,7 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from typing_extensions import override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@ -21,6 +22,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
@override
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
_log.debug("About to init HTML backend...")
|
||||
@ -48,13 +50,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
f"Could not initialize HTML backend for file with hash {self.document_hash}."
|
||||
) from e
|
||||
|
||||
@override
|
||||
def is_valid(self) -> bool:
|
||||
return self.soup is not None
|
||||
|
||||
@classmethod
|
||||
@override
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
@override
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
@ -62,9 +67,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
@override
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.HTML}
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
# access self.path_or_stream to load stuff
|
||||
origin = DocumentOrigin(
|
||||
@ -80,98 +87,78 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
assert self.soup is not None
|
||||
content = self.soup.body or self.soup
|
||||
# Replace <br> tags with newline characters
|
||||
for br in content.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
doc = self.walk(content, doc)
|
||||
for br in content("br"):
|
||||
br.replace_with(NavigableString("\n"))
|
||||
self.walk(content, doc)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
return doc
|
||||
|
||||
def walk(self, element: Tag, doc: DoclingDocument):
|
||||
try:
|
||||
# Iterate over elements in the body of the document
|
||||
for idx, element in enumerate(element.children):
|
||||
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
# Iterate over elements in the body of the document
|
||||
for element in tag.children:
|
||||
if isinstance(element, Tag):
|
||||
try:
|
||||
self.analyse_element(element, idx, doc)
|
||||
self.analyze_tag(cast(Tag, element), doc)
|
||||
except Exception as exc_child:
|
||||
|
||||
_log.error(" -> error treating child: ", exc_child)
|
||||
_log.error(" => element: ", element, "\n")
|
||||
_log.error(
|
||||
f"Error processing child from tag{tag.name}: {exc_child}"
|
||||
)
|
||||
raise exc_child
|
||||
|
||||
except Exception as exc:
|
||||
pass
|
||||
return
|
||||
|
||||
return doc
|
||||
|
||||
def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""
|
||||
if element.name!=None:
|
||||
_log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})")
|
||||
"""
|
||||
|
||||
if element.name in self.labels:
|
||||
self.labels[element.name] += 1
|
||||
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
if tag.name in self.labels:
|
||||
self.labels[tag.name] += 1
|
||||
else:
|
||||
self.labels[element.name] = 1
|
||||
self.labels[tag.name] = 1
|
||||
|
||||
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
self.handle_header(element, idx, doc)
|
||||
elif element.name in ["p"]:
|
||||
self.handle_paragraph(element, idx, doc)
|
||||
elif element.name in ["pre"]:
|
||||
self.handle_code(element, idx, doc)
|
||||
elif element.name in ["ul", "ol"]:
|
||||
self.handle_list(element, idx, doc)
|
||||
elif element.name in ["li"]:
|
||||
self.handle_listitem(element, idx, doc)
|
||||
elif element.name == "table":
|
||||
self.handle_table(element, idx, doc)
|
||||
elif element.name == "figure":
|
||||
self.handle_figure(element, idx, doc)
|
||||
elif element.name == "img":
|
||||
self.handle_image(element, idx, doc)
|
||||
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
self.handle_header(tag, doc)
|
||||
elif tag.name in ["p"]:
|
||||
self.handle_paragraph(tag, doc)
|
||||
elif tag.name in ["pre"]:
|
||||
self.handle_code(tag, doc)
|
||||
elif tag.name in ["ul", "ol"]:
|
||||
self.handle_list(tag, doc)
|
||||
elif tag.name in ["li"]:
|
||||
self.handle_list_item(tag, doc)
|
||||
elif tag.name == "table":
|
||||
self.handle_table(tag, doc)
|
||||
elif tag.name == "figure":
|
||||
self.handle_figure(tag, doc)
|
||||
elif tag.name == "img":
|
||||
self.handle_image(doc)
|
||||
else:
|
||||
self.walk(element, doc)
|
||||
self.walk(tag, doc)
|
||||
|
||||
def get_direct_text(self, item: Tag):
|
||||
"""Get the direct text of the <li> element (ignoring nested lists)."""
|
||||
text = item.find(string=True, recursive=False)
|
||||
if isinstance(text, str):
|
||||
return text.strip()
|
||||
def get_text(self, item: PageElement) -> str:
|
||||
"""Get the text content of a tag."""
|
||||
parts: list[str] = self.extract_text_recursively(item)
|
||||
|
||||
return ""
|
||||
return "".join(parts) + " "
|
||||
|
||||
# Function to recursively extract text from all child nodes
|
||||
def extract_text_recursively(self, item: Tag):
|
||||
result = []
|
||||
def extract_text_recursively(self, item: PageElement) -> list[str]:
|
||||
result: list[str] = []
|
||||
|
||||
if isinstance(item, str):
|
||||
if isinstance(item, NavigableString):
|
||||
return [item]
|
||||
|
||||
if item.name not in ["ul", "ol"]:
|
||||
try:
|
||||
# Iterate over the children (and their text and tails)
|
||||
for child in item:
|
||||
try:
|
||||
# Recursively get the child's text content
|
||||
result.extend(self.extract_text_recursively(child))
|
||||
except:
|
||||
pass
|
||||
except:
|
||||
_log.warn("item has no children")
|
||||
pass
|
||||
tag = cast(Tag, item)
|
||||
if tag.name not in ["ul", "ol"]:
|
||||
for child in tag:
|
||||
# Recursively get the child's text content
|
||||
result.extend(self.extract_text_recursively(child))
|
||||
|
||||
return "".join(result) + " "
|
||||
return ["".join(result) + " "]
|
||||
|
||||
def handle_header(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles header tags (h1, h2, etc.)."""
|
||||
hlevel = int(element.name.replace("h", ""))
|
||||
slevel = hlevel - 1
|
||||
|
||||
label = DocItemLabel.SECTION_HEADER
|
||||
text = element.text.strip()
|
||||
|
||||
if hlevel == 1:
|
||||
@ -197,7 +184,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
elif hlevel < self.level:
|
||||
|
||||
# remove the tail
|
||||
for key, val in self.parents.items():
|
||||
for key in self.parents.keys():
|
||||
if key > hlevel:
|
||||
self.parents[key] = None
|
||||
self.level = hlevel
|
||||
@ -208,27 +195,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
level=hlevel,
|
||||
)
|
||||
|
||||
def handle_code(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles monospace code snippets (pre)."""
|
||||
if element.text is None:
|
||||
return
|
||||
text = element.text.strip()
|
||||
label = DocItemLabel.CODE
|
||||
if len(text) == 0:
|
||||
return
|
||||
doc.add_code(parent=self.parents[self.level], text=text)
|
||||
if text:
|
||||
doc.add_code(parent=self.parents[self.level], text=text)
|
||||
|
||||
def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles paragraph tags (p)."""
|
||||
if element.text is None:
|
||||
return
|
||||
text = element.text.strip()
|
||||
label = DocItemLabel.PARAGRAPH
|
||||
if len(text) == 0:
|
||||
return
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
if text:
|
||||
doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||
|
||||
def handle_list(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles list tags (ul, ol) and their list items."""
|
||||
|
||||
if element.name == "ul":
|
||||
@ -250,18 +234,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
|
||||
def handle_listitem(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles listitem tags (li)."""
|
||||
nested_lists = element.find(["ul", "ol"])
|
||||
nested_list = element.find(["ul", "ol"])
|
||||
|
||||
parent_list_label = self.parents[self.level].label
|
||||
index_in_list = len(self.parents[self.level].children) + 1
|
||||
|
||||
if nested_lists:
|
||||
name = element.name
|
||||
if nested_list:
|
||||
# Text in list item can be hidden within hierarchy, hence
|
||||
# we need to extract it recursively
|
||||
text = self.extract_text_recursively(element)
|
||||
text: str = self.get_text(element)
|
||||
# Flatten text, remove break lines:
|
||||
text = text.replace("\n", "").replace("\r", "")
|
||||
text = " ".join(text.split()).strip()
|
||||
@ -287,7 +270,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.parents[self.level + 1] = None
|
||||
self.level -= 1
|
||||
|
||||
elif isinstance(element.text, str):
|
||||
elif element.text.strip():
|
||||
text = element.text.strip()
|
||||
|
||||
marker = ""
|
||||
@ -302,59 +285,79 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
parent=self.parents[self.level],
|
||||
)
|
||||
else:
|
||||
_log.warn("list-item has no text: ", element)
|
||||
|
||||
def handle_table(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
"""Handles table tags."""
|
||||
_log.warning(f"list-item has no text: {element}")
|
||||
|
||||
@staticmethod
|
||||
def parse_table_data(element: Tag) -> Optional[TableData]:
|
||||
nested_tables = element.find("table")
|
||||
if nested_tables is not None:
|
||||
_log.warn("detected nested tables: skipping for now")
|
||||
return
|
||||
_log.warning("Skipping nested table.")
|
||||
return None
|
||||
|
||||
# Count the number of rows (number of <tr> elements)
|
||||
num_rows = len(element.find_all("tr"))
|
||||
num_rows = len(element("tr"))
|
||||
|
||||
# Find the number of columns (taking into account colspan)
|
||||
num_cols = 0
|
||||
for row in element.find_all("tr"):
|
||||
for row in element("tr"):
|
||||
col_count = 0
|
||||
for cell in row.find_all(["td", "th"]):
|
||||
colspan = int(cell.get("colspan", 1))
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
for cell in row(["td", "th"]):
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
val = cast(Tag, cell).get("colspan", "1")
|
||||
colspan = int(val) if (isinstance(val, str) and val.isnumeric()) else 1
|
||||
col_count += colspan
|
||||
num_cols = max(num_cols, col_count)
|
||||
|
||||
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
|
||||
# Iterate over the rows in the table
|
||||
for row_idx, row in enumerate(element.find_all("tr")):
|
||||
for row_idx, row in enumerate(element("tr")):
|
||||
if not isinstance(row, Tag):
|
||||
continue
|
||||
|
||||
# For each row, find all the column cells (both <td> and <th>)
|
||||
cells = row.find_all(["td", "th"])
|
||||
cells = row(["td", "th"])
|
||||
|
||||
# Check if each cell in the row is a header -> means it is a column header
|
||||
col_header = True
|
||||
for j, html_cell in enumerate(cells):
|
||||
if html_cell.name == "td":
|
||||
for html_cell in cells:
|
||||
if isinstance(html_cell, Tag) and html_cell.name == "td":
|
||||
col_header = False
|
||||
|
||||
# Extract the text content of each cell
|
||||
col_idx = 0
|
||||
# Extract and print the text content of each cell
|
||||
for _, html_cell in enumerate(cells):
|
||||
for html_cell in cells:
|
||||
if not isinstance(html_cell, Tag):
|
||||
continue
|
||||
|
||||
# extract inline formulas
|
||||
for formula in html_cell("inline-formula"):
|
||||
math_parts = formula.text.split("$$")
|
||||
if len(math_parts) == 3:
|
||||
math_formula = f"$${math_parts[1]}$$"
|
||||
formula.replace_with(NavigableString(math_formula))
|
||||
|
||||
# TODO: extract content correctly from table-cells with lists
|
||||
text = html_cell.text
|
||||
try:
|
||||
text = self.extract_table_cell_text(html_cell)
|
||||
except Exception as exc:
|
||||
_log.warn("exception: ", exc)
|
||||
exit(-1)
|
||||
|
||||
# label = html_cell.name
|
||||
|
||||
col_span = int(html_cell.get("colspan", 1))
|
||||
row_span = int(html_cell.get("rowspan", 1))
|
||||
col_val = html_cell.get("colspan", "1")
|
||||
col_span = (
|
||||
int(col_val)
|
||||
if isinstance(col_val, str) and col_val.isnumeric()
|
||||
else 1
|
||||
)
|
||||
row_val = html_cell.get("rowspan", "1")
|
||||
row_span = (
|
||||
int(row_val)
|
||||
if isinstance(row_val, str) and row_val.isnumeric()
|
||||
else 1
|
||||
)
|
||||
|
||||
while grid[row_idx][col_idx] is not None:
|
||||
col_idx += 1
|
||||
@ -362,7 +365,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
for c in range(col_span):
|
||||
grid[row_idx + r][col_idx + c] = text
|
||||
|
||||
cell = TableCell(
|
||||
table_cell = TableCell(
|
||||
text=text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
@ -373,57 +376,57 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
col_header=col_header,
|
||||
row_header=((not col_header) and html_cell.name == "th"),
|
||||
)
|
||||
data.table_cells.append(cell)
|
||||
data.table_cells.append(table_cell)
|
||||
|
||||
doc.add_table(data=data, parent=self.parents[self.level])
|
||||
return data
|
||||
|
||||
def get_list_text(self, list_element: Tag, level=0):
|
||||
def handle_table(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles table tags."""
|
||||
|
||||
table_data = HTMLDocumentBackend.parse_table_data(element)
|
||||
|
||||
if table_data is not None:
|
||||
doc.add_table(data=table_data, parent=self.parents[self.level])
|
||||
|
||||
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
|
||||
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||
result = []
|
||||
bullet_char = "*" # Default bullet character for unordered lists
|
||||
|
||||
if list_element.name == "ol": # For ordered lists, use numbers
|
||||
for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
|
||||
for i, li in enumerate(list_element("li", recursive=False), 1):
|
||||
if not isinstance(li, Tag):
|
||||
continue
|
||||
# Add numbering for ordered lists
|
||||
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
|
||||
# Handle nested lists
|
||||
nested_list = li.find(["ul", "ol"])
|
||||
if nested_list:
|
||||
if isinstance(nested_list, Tag):
|
||||
result.extend(self.get_list_text(nested_list, level + 1))
|
||||
elif list_element.name == "ul": # For unordered lists, use bullet points
|
||||
for li in list_element.find_all("li", recursive=False):
|
||||
for li in list_element("li", recursive=False):
|
||||
if not isinstance(li, Tag):
|
||||
continue
|
||||
# Add bullet points for unordered lists
|
||||
result.append(
|
||||
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
|
||||
)
|
||||
# Handle nested lists
|
||||
nested_list = li.find(["ul", "ol"])
|
||||
if nested_list:
|
||||
if isinstance(nested_list, Tag):
|
||||
result.extend(self.get_list_text(nested_list, level + 1))
|
||||
|
||||
return result
|
||||
|
||||
def extract_table_cell_text(self, cell: Tag):
|
||||
"""Extract text from a table cell, including lists with indents."""
|
||||
contains_lists = cell.find(["ul", "ol"])
|
||||
if contains_lists is None:
|
||||
return cell.text
|
||||
else:
|
||||
_log.debug(
|
||||
"should extract the content correctly for table-cells with lists ..."
|
||||
)
|
||||
return cell.text
|
||||
|
||||
def handle_figure(self, element: Tag, idx: int, doc: DoclingDocument):
|
||||
def handle_figure(self, element: Tag, doc: DoclingDocument) -> None:
|
||||
"""Handles image tags (img)."""
|
||||
|
||||
# Extract the image URI from the <img> tag
|
||||
# image_uri = root.xpath('//figure//img/@src')[0]
|
||||
|
||||
contains_captions = element.find(["figcaption"])
|
||||
if contains_captions is None:
|
||||
if not isinstance(contains_captions, Tag):
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
|
||||
else:
|
||||
texts = []
|
||||
for item in contains_captions:
|
||||
@ -437,6 +440,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
caption=fig_caption,
|
||||
)
|
||||
|
||||
def handle_image(self, element: Tag, idx, doc: DoclingDocument):
|
||||
def handle_image(self, doc: DoclingDocument) -> None:
|
||||
"""Handles image tags (img)."""
|
||||
doc.add_picture(parent=self.parents[self.level], caption=None)
|
||||
|
@ -4,7 +4,7 @@ from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Final, Optional, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
@ -12,14 +12,13 @@ from docling_core.types.doc import (
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
NodeItem,
|
||||
TableCell,
|
||||
TableData,
|
||||
TextItem,
|
||||
)
|
||||
from lxml import etree
|
||||
from typing_extensions import TypedDict, override
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@ -540,71 +539,10 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
) -> None:
|
||||
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
|
||||
table_tag = soup.find("table")
|
||||
|
||||
nested_tables = table_tag.find("table")
|
||||
if nested_tables:
|
||||
_log.warning(f"Skipping nested table in {str(self.file)}")
|
||||
if not isinstance(table_tag, Tag):
|
||||
return
|
||||
|
||||
# Count the number of rows (number of <tr> elements)
|
||||
num_rows = len(table_tag.find_all("tr"))
|
||||
|
||||
# Find the number of columns (taking into account colspan)
|
||||
num_cols = 0
|
||||
for row in table_tag.find_all("tr"):
|
||||
col_count = 0
|
||||
for cell in row.find_all(["td", "th"]):
|
||||
colspan = int(cell.get("colspan", 1))
|
||||
col_count += colspan
|
||||
num_cols = max(num_cols, col_count)
|
||||
|
||||
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
|
||||
# Iterate over the rows in the table
|
||||
for row_idx, row in enumerate(table_tag.find_all("tr")):
|
||||
# For each row, find all the column cells (both <td> and <th>)
|
||||
cells = row.find_all(["td", "th"])
|
||||
|
||||
# Check if each cell in the row is a header -> means it is a column header
|
||||
col_header = True
|
||||
for j, html_cell in enumerate(cells):
|
||||
if html_cell.name == "td":
|
||||
col_header = False
|
||||
|
||||
# Extract and print the text content of each cell
|
||||
col_idx = 0
|
||||
for _, html_cell in enumerate(cells):
|
||||
# extract inline formulas
|
||||
for formula in html_cell.find_all("inline-formula"):
|
||||
math_parts = formula.text.split("$$")
|
||||
if len(math_parts) == 3:
|
||||
math_formula = f"$${math_parts[1]}$$"
|
||||
formula.replaceWith(math_formula)
|
||||
text = html_cell.text
|
||||
|
||||
col_span = int(html_cell.get("colspan", 1))
|
||||
row_span = int(html_cell.get("rowspan", 1))
|
||||
|
||||
while grid[row_idx][col_idx] is not None:
|
||||
col_idx += 1
|
||||
for r in range(row_span):
|
||||
for c in range(col_span):
|
||||
grid[row_idx + r][col_idx + c] = text
|
||||
|
||||
cell = TableCell(
|
||||
text=text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=col_header,
|
||||
row_header=((not col_header) and html_cell.name == "th"),
|
||||
)
|
||||
data.table_cells.append(cell)
|
||||
data = HTMLDocumentBackend.parse_table_data(table_tag)
|
||||
|
||||
# TODO: format label vs caption once styling is supported
|
||||
label = table_xml_component["label"]
|
||||
@ -616,7 +554,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
else None
|
||||
)
|
||||
|
||||
doc.add_table(data=data, parent=parent, caption=table_caption)
|
||||
if data is not None:
|
||||
doc.add_table(data=data, parent=parent, caption=table_caption)
|
||||
|
||||
return
|
||||
|
||||
@ -673,7 +612,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _walk_linear(
|
||||
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
||||
) -> str:
|
||||
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
|
||||
skip_tags = ["term"]
|
||||
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
|
||||
new_parent: NodeItem = parent
|
||||
|
@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
|
||||
from enum import Enum, unique
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any, Final, Optional, Union
|
||||
from typing import Final, Optional, Union
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from docling_core.types.doc import (
|
||||
@ -1406,6 +1406,10 @@ class XmlTable:
|
||||
http://oasis-open.org/specs/soextblx.dtd
|
||||
"""
|
||||
|
||||
class ColInfo(TypedDict):
|
||||
ncols: int
|
||||
colinfo: list[dict]
|
||||
|
||||
class MinColInfoType(TypedDict):
|
||||
offset: list[int]
|
||||
colwidth: list[int]
|
||||
@ -1425,7 +1429,7 @@ class XmlTable:
|
||||
self.empty_text = ""
|
||||
self._soup = BeautifulSoup(input, features="xml")
|
||||
|
||||
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
|
||||
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
|
||||
"""Create a unified range along the table groups.
|
||||
|
||||
Args:
|
||||
@ -1532,19 +1536,26 @@ class XmlTable:
|
||||
Returns:
|
||||
A docling table object.
|
||||
"""
|
||||
tgs_align = []
|
||||
tg_secs = table.find_all("tgroup")
|
||||
tgs_align: list[XmlTable.ColInfo] = []
|
||||
tg_secs = table("tgroup")
|
||||
if tg_secs:
|
||||
for tg_sec in tg_secs:
|
||||
ncols = tg_sec.get("cols", None)
|
||||
if ncols:
|
||||
ncols = int(ncols)
|
||||
tg_align = {"ncols": ncols, "colinfo": []}
|
||||
cs_secs = tg_sec.find_all("colspec")
|
||||
if not isinstance(tg_sec, Tag):
|
||||
continue
|
||||
col_val = tg_sec.get("cols")
|
||||
ncols = (
|
||||
int(col_val)
|
||||
if isinstance(col_val, str) and col_val.isnumeric()
|
||||
else 1
|
||||
)
|
||||
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
|
||||
cs_secs = tg_sec("colspec")
|
||||
if cs_secs:
|
||||
for cs_sec in cs_secs:
|
||||
colname = cs_sec.get("colname", None)
|
||||
colwidth = cs_sec.get("colwidth", None)
|
||||
if not isinstance(cs_sec, Tag):
|
||||
continue
|
||||
colname = cs_sec.get("colname")
|
||||
colwidth = cs_sec.get("colwidth")
|
||||
tg_align["colinfo"].append(
|
||||
{"colname": colname, "colwidth": colwidth}
|
||||
)
|
||||
@ -1565,16 +1576,23 @@ class XmlTable:
|
||||
table_data: list[TableCell] = []
|
||||
i_row_global = 0
|
||||
is_row_empty: bool = True
|
||||
tg_secs = table.find_all("tgroup")
|
||||
tg_secs = table("tgroup")
|
||||
if tg_secs:
|
||||
for itg, tg_sec in enumerate(tg_secs):
|
||||
if not isinstance(tg_sec, Tag):
|
||||
continue
|
||||
tg_range = tgs_range[itg]
|
||||
row_secs = tg_sec.find_all(["row", "tr"])
|
||||
row_secs = tg_sec(["row", "tr"])
|
||||
|
||||
if row_secs:
|
||||
for row_sec in row_secs:
|
||||
entry_secs = row_sec.find_all(["entry", "td"])
|
||||
is_header: bool = row_sec.parent.name in ["thead"]
|
||||
if not isinstance(row_sec, Tag):
|
||||
continue
|
||||
entry_secs = row_sec(["entry", "td"])
|
||||
is_header: bool = (
|
||||
row_sec.parent is not None
|
||||
and row_sec.parent.name == "thead"
|
||||
)
|
||||
|
||||
ncols = 0
|
||||
local_row: list[TableCell] = []
|
||||
@ -1582,23 +1600,26 @@ class XmlTable:
|
||||
if entry_secs:
|
||||
wrong_nbr_cols = False
|
||||
for ientry, entry_sec in enumerate(entry_secs):
|
||||
if not isinstance(entry_sec, Tag):
|
||||
continue
|
||||
text = entry_sec.get_text().strip()
|
||||
|
||||
# start-end
|
||||
namest = entry_sec.attrs.get("namest", None)
|
||||
nameend = entry_sec.attrs.get("nameend", None)
|
||||
if isinstance(namest, str) and namest.isnumeric():
|
||||
namest = int(namest)
|
||||
else:
|
||||
namest = ientry + 1
|
||||
namest = entry_sec.get("namest")
|
||||
nameend = entry_sec.get("nameend")
|
||||
start = (
|
||||
int(namest)
|
||||
if isinstance(namest, str) and namest.isnumeric()
|
||||
else ientry + 1
|
||||
)
|
||||
if isinstance(nameend, str) and nameend.isnumeric():
|
||||
nameend = int(nameend)
|
||||
end = int(nameend)
|
||||
shift = 0
|
||||
else:
|
||||
nameend = ientry + 2
|
||||
end = ientry + 2
|
||||
shift = 1
|
||||
|
||||
if nameend > len(tg_range["cell_offst"]):
|
||||
if end > len(tg_range["cell_offst"]):
|
||||
wrong_nbr_cols = True
|
||||
self.nbr_messages += 1
|
||||
if self.nbr_messages <= self.max_nbr_messages:
|
||||
@ -1608,8 +1629,8 @@ class XmlTable:
|
||||
break
|
||||
|
||||
range_ = [
|
||||
tg_range["cell_offst"][namest - 1],
|
||||
tg_range["cell_offst"][nameend - 1] - shift,
|
||||
tg_range["cell_offst"][start - 1],
|
||||
tg_range["cell_offst"][end - 1] - shift,
|
||||
]
|
||||
|
||||
# add row and replicate cell if needed
|
||||
@ -1668,7 +1689,7 @@ class XmlTable:
|
||||
A docling table data.
|
||||
"""
|
||||
section = self._soup.find("table")
|
||||
if section is not None:
|
||||
if isinstance(section, Tag):
|
||||
table = self._parse_table(section)
|
||||
if table.num_rows == 0 or table.num_cols == 0:
|
||||
_log.warning("The parsed USPTO table is empty")
|
||||
|
39
poetry.lock
generated
39
poetry.lock
generated
@ -282,17 +282,18 @@ testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-ch
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.12.3"
|
||||
version = "4.13.3"
|
||||
description = "Screen-scraping library"
|
||||
optional = false
|
||||
python-versions = ">=3.6.0"
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
{file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
|
||||
{file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
|
||||
{file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"},
|
||||
{file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
soupsieve = ">1.2"
|
||||
typing-extensions = ">=4.0.0"
|
||||
|
||||
[package.extras]
|
||||
cchardet = ["cchardet"]
|
||||
@ -820,13 +821,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "2.19.0"
|
||||
version = "2.19.1"
|
||||
description = "A python library to define and validate data types in Docling."
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "docling_core-2.19.0-py3-none-any.whl", hash = "sha256:caa1e13d98fa9a00608091c386609c75b3560c7291e842c252f0b6f8d5812dbd"},
|
||||
{file = "docling_core-2.19.0.tar.gz", hash = "sha256:ebf3062e31155bb5f0e6132056a2d239a0e6e693a75c5758886909bb9fef461a"},
|
||||
{file = "docling_core-2.19.1-py3-none-any.whl", hash = "sha256:ca7bd4dacd75611c5ea4f205192b71a8f22205e615eff1a16aac7082644d3b2e"},
|
||||
{file = "docling_core-2.19.1.tar.gz", hash = "sha256:e2769b816c669cdf27024dd3b219d3ecaf2161691dd5e8e5e8ce439557ea0928"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -1317,13 +1318,13 @@ colorama = ">=0.4"
|
||||
|
||||
[[package]]
|
||||
name = "griffe-pydantic"
|
||||
version = "1.1.0"
|
||||
version = "1.1.2"
|
||||
description = "Griffe extension for Pydantic."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
|
||||
{file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
|
||||
{file = "griffe_pydantic-1.1.2-py3-none-any.whl", hash = "sha256:8ad53218ca6e9c24ccec83588eb435f562b30355f641fe336e81b1e00ea05f3c"},
|
||||
{file = "griffe_pydantic-1.1.2.tar.gz", hash = "sha256:381eacd8854a85811522b4f6dc9a1ef0fb5931825081379d70ff3a425b0d4ea1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -7021,18 +7022,18 @@ vision = ["Pillow (>=10.0.1,<=15.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "transformers"
|
||||
version = "4.48.3"
|
||||
version = "4.49.0"
|
||||
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
|
||||
optional = false
|
||||
python-versions = ">=3.9.0"
|
||||
files = [
|
||||
{file = "transformers-4.48.3-py3-none-any.whl", hash = "sha256:78697f990f5ef350c23b46bf86d5081ce96b49479ab180b2de7687267de8fd36"},
|
||||
{file = "transformers-4.48.3.tar.gz", hash = "sha256:a5e8f1e9a6430aa78215836be70cecd3f872d99eeda300f41ad6cc841724afdb"},
|
||||
{file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"},
|
||||
{file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
filelock = "*"
|
||||
huggingface-hub = ">=0.24.0,<1.0"
|
||||
huggingface-hub = ">=0.26.0,<1.0"
|
||||
numpy = ">=1.17"
|
||||
packaging = ">=20.0"
|
||||
pyyaml = ">=5.1"
|
||||
@ -7045,13 +7046,13 @@ tqdm = ">=4.27"
|
||||
[package.extras]
|
||||
accelerate = ["accelerate (>=0.26.0)"]
|
||||
agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
|
||||
all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
|
||||
all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
|
||||
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
||||
benchmark = ["optimum-benchmark (>=0.3.0)"]
|
||||
codecarbon = ["codecarbon (>=2.8.1)"]
|
||||
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
|
||||
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
|
||||
dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
|
||||
dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
|
||||
dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
|
||||
dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
|
||||
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
|
||||
@ -7084,8 +7085,8 @@ tokenizers = ["tokenizers (>=0.21,<0.22)"]
|
||||
torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
|
||||
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
|
||||
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
|
||||
torchhub = ["filelock", "huggingface-hub (>=0.24.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
|
||||
video = ["av (==9.2.0)"]
|
||||
torchhub = ["filelock", "huggingface-hub (>=0.26.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
|
||||
video = ["av"]
|
||||
vision = ["Pillow (>=10.0.1,<=15.0)"]
|
||||
|
||||
[[package]]
|
||||
@ -7810,4 +7811,4 @@ vlm = ["transformers", "transformers"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "b19c39233b5c7ca2a4feed4886542395492ed43f4957f9c6f097b03e8d5b6148"
|
||||
content-hash = "3f657e7af78058e75dfb9f32e373f7f70e5e68a42a5b3603189e2251be90f349"
|
||||
|
@ -45,7 +45,7 @@ scipy = [
|
||||
typer = "^0.12.5"
|
||||
python-docx = "^1.1.2"
|
||||
python-pptx = "^1.0.2"
|
||||
beautifulsoup4 = ">=4.12.3,<4.13.0"
|
||||
beautifulsoup4 = "^4.12.3"
|
||||
pandas = "^2.1.4"
|
||||
marko = "^2.1.2"
|
||||
openpyxl = "^3.1.5"
|
||||
@ -164,7 +164,6 @@ module = [
|
||||
"easyocr.*",
|
||||
"ocrmac.*",
|
||||
"lxml.*",
|
||||
"bs4.*",
|
||||
"huggingface_hub.*",
|
||||
"transformers.*",
|
||||
]
|
||||
|
@ -410,68 +410,65 @@ item-0 at level 0: unspecified: group _root_
|
||||
item-396 at level 3: list: group list
|
||||
item-397 at level 4: list_item: list of books (useful looking abstracts)
|
||||
item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
|
||||
item-399 at level 4: list_item:
|
||||
item-400 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
|
||||
item-401 at level 3: table with [3x2]
|
||||
item-402 at level 3: picture
|
||||
item-403 at level 3: list: group list
|
||||
item-404 at level 4: list_item: Ducks
|
||||
item-405 at level 4: list_item: Game birds
|
||||
item-406 at level 4: list_item: Bird common names
|
||||
item-407 at level 3: list: group list
|
||||
item-408 at level 4: list_item: All accuracy disputes
|
||||
item-409 at level 4: list_item: Accuracy disputes from February 2020
|
||||
item-410 at level 4: list_item: CS1 Finnish-language sources (fi)
|
||||
item-411 at level 4: list_item: CS1 Latvian-language sources (lv)
|
||||
item-412 at level 4: list_item: CS1 Swedish-language sources (sv)
|
||||
item-413 at level 4: list_item: Articles with short description
|
||||
item-414 at level 4: list_item: Short description is different from Wikidata
|
||||
item-415 at level 4: list_item: Wikipedia indefinitely move-protected pages
|
||||
item-416 at level 4: list_item: Wikipedia indefinitely semi-protected pages
|
||||
item-417 at level 4: list_item: Articles with 'species' microformats
|
||||
item-418 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
|
||||
item-419 at level 4: list_item: Articles containing Dutch-language text
|
||||
item-420 at level 4: list_item: Articles containing German-language text
|
||||
item-421 at level 4: list_item: Articles containing Norwegian-language text
|
||||
item-422 at level 4: list_item: Articles containing Lithuanian-language text
|
||||
item-423 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
|
||||
item-424 at level 4: list_item: All articles with self-published sources
|
||||
item-425 at level 4: list_item: Articles with self-published sources from February 2020
|
||||
item-426 at level 4: list_item: All articles with unsourced statements
|
||||
item-427 at level 4: list_item: Articles with unsourced statements from January 2022
|
||||
item-428 at level 4: list_item: CS1: long volume value
|
||||
item-429 at level 4: list_item: Pages using Sister project links with wikidata mismatch
|
||||
item-430 at level 4: list_item: Pages using Sister project links with hidden wikidata
|
||||
item-431 at level 4: list_item: Webarchive template wayback links
|
||||
item-432 at level 4: list_item: Articles with Project Gutenberg links
|
||||
item-433 at level 4: list_item: Articles containing video clips
|
||||
item-434 at level 3: list: group list
|
||||
item-435 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
|
||||
item-436 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
|
||||
item-437 at level 3: list: group list
|
||||
item-438 at level 4: list_item: Privacy policy
|
||||
item-439 at level 4: list_item: About Wikipedia
|
||||
item-440 at level 4: list_item: Disclaimers
|
||||
item-441 at level 4: list_item: Contact Wikipedia
|
||||
item-442 at level 4: list_item: Code of Conduct
|
||||
item-443 at level 4: list_item: Developers
|
||||
item-444 at level 4: list_item: Statistics
|
||||
item-445 at level 4: list_item: Cookie statement
|
||||
item-446 at level 4: list_item: Mobile view
|
||||
item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
|
||||
item-400 at level 3: table with [3x2]
|
||||
item-401 at level 3: picture
|
||||
item-402 at level 3: list: group list
|
||||
item-403 at level 4: list_item: Ducks
|
||||
item-404 at level 4: list_item: Game birds
|
||||
item-405 at level 4: list_item: Bird common names
|
||||
item-406 at level 3: list: group list
|
||||
item-407 at level 4: list_item: All accuracy disputes
|
||||
item-408 at level 4: list_item: Accuracy disputes from February 2020
|
||||
item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
|
||||
item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
|
||||
item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
|
||||
item-412 at level 4: list_item: Articles with short description
|
||||
item-413 at level 4: list_item: Short description is different from Wikidata
|
||||
item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
|
||||
item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
|
||||
item-416 at level 4: list_item: Articles with 'species' microformats
|
||||
item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
|
||||
item-418 at level 4: list_item: Articles containing Dutch-language text
|
||||
item-419 at level 4: list_item: Articles containing German-language text
|
||||
item-420 at level 4: list_item: Articles containing Norwegian-language text
|
||||
item-421 at level 4: list_item: Articles containing Lithuanian-language text
|
||||
item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
|
||||
item-423 at level 4: list_item: All articles with self-published sources
|
||||
item-424 at level 4: list_item: Articles with self-published sources from February 2020
|
||||
item-425 at level 4: list_item: All articles with unsourced statements
|
||||
item-426 at level 4: list_item: Articles with unsourced statements from January 2022
|
||||
item-427 at level 4: list_item: CS1: long volume value
|
||||
item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
|
||||
item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
|
||||
item-430 at level 4: list_item: Webarchive template wayback links
|
||||
item-431 at level 4: list_item: Articles with Project Gutenberg links
|
||||
item-432 at level 4: list_item: Articles containing video clips
|
||||
item-433 at level 3: list: group list
|
||||
item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
|
||||
item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
|
||||
item-436 at level 3: list: group list
|
||||
item-437 at level 4: list_item: Privacy policy
|
||||
item-438 at level 4: list_item: About Wikipedia
|
||||
item-439 at level 4: list_item: Disclaimers
|
||||
item-440 at level 4: list_item: Contact Wikipedia
|
||||
item-441 at level 4: list_item: Code of Conduct
|
||||
item-442 at level 4: list_item: Developers
|
||||
item-443 at level 4: list_item: Statistics
|
||||
item-444 at level 4: list_item: Cookie statement
|
||||
item-445 at level 4: list_item: Mobile view
|
||||
item-446 at level 3: list: group list
|
||||
item-447 at level 3: list: group list
|
||||
item-448 at level 4: list_item:
|
||||
item-449 at level 4: list_item:
|
||||
item-450 at level 3: list: group list
|
||||
item-451 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
|
||||
item-452 at level 1: caption: Male mallard.
|
||||
item-453 at level 1: caption: Wood ducks.
|
||||
item-454 at level 1: caption: Mallard landing in approach
|
||||
item-455 at level 1: caption: Male Mandarin duck
|
||||
item-456 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
|
||||
item-457 at level 1: caption: Female mallard in Cornwall, England
|
||||
item-458 at level 1: caption: Pecten along the bill
|
||||
item-459 at level 1: caption: Mallard duckling preening
|
||||
item-460 at level 1: caption: A Muscovy duckling
|
||||
item-461 at level 1: caption: Ringed teal
|
||||
item-462 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
|
||||
item-463 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
|
||||
item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
|
||||
item-449 at level 1: caption: Male mallard.
|
||||
item-450 at level 1: caption: Wood ducks.
|
||||
item-451 at level 1: caption: Mallard landing in approach
|
||||
item-452 at level 1: caption: Male Mandarin duck
|
||||
item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
|
||||
item-454 at level 1: caption: Female mallard in Cornwall, England
|
||||
item-455 at level 1: caption: Pecten along the bill
|
||||
item-456 at level 1: caption: Mallard duckling preening
|
||||
item-457 at level 1: caption: A Muscovy duckling
|
||||
item-458 at level 1: caption: Ringed teal
|
||||
item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
|
||||
item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
|
@ -1413,9 +1413,6 @@
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/350"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/351"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -1428,14 +1425,14 @@
|
||||
"$ref": "#/texts/341"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/351"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/352"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/353"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/354"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -1448,6 +1445,9 @@
|
||||
"$ref": "#/texts/341"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/354"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/355"
|
||||
},
|
||||
@ -1522,9 +1522,6 @@
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/379"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/380"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -1538,10 +1535,10 @@
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/381"
|
||||
"$ref": "#/texts/380"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/382"
|
||||
"$ref": "#/texts/381"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -1554,6 +1551,9 @@
|
||||
"$ref": "#/texts/341"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/382"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/383"
|
||||
},
|
||||
@ -1577,9 +1577,6 @@
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/390"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/391"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
@ -1591,14 +1588,7 @@
|
||||
"parent": {
|
||||
"$ref": "#/texts/341"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/texts/392"
|
||||
},
|
||||
{
|
||||
"$ref": "#/texts/393"
|
||||
}
|
||||
],
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"name": "list",
|
||||
"label": "list"
|
||||
@ -6774,27 +6764,13 @@
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/351",
|
||||
"parent": {
|
||||
"$ref": "#/groups/42"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
|
||||
"text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/352",
|
||||
"self_ref": "#/texts/351",
|
||||
"parent": {
|
||||
"$ref": "#/groups/43"
|
||||
},
|
||||
@ -6808,7 +6784,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/353",
|
||||
"self_ref": "#/texts/352",
|
||||
"parent": {
|
||||
"$ref": "#/groups/43"
|
||||
},
|
||||
@ -6822,7 +6798,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/354",
|
||||
"self_ref": "#/texts/353",
|
||||
"parent": {
|
||||
"$ref": "#/groups/43"
|
||||
},
|
||||
@ -6836,7 +6812,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/355",
|
||||
"self_ref": "#/texts/354",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6850,7 +6826,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/356",
|
||||
"self_ref": "#/texts/355",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6864,7 +6840,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/357",
|
||||
"self_ref": "#/texts/356",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6878,7 +6854,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/358",
|
||||
"self_ref": "#/texts/357",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6892,7 +6868,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/359",
|
||||
"self_ref": "#/texts/358",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6906,7 +6882,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/360",
|
||||
"self_ref": "#/texts/359",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6920,7 +6896,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/361",
|
||||
"self_ref": "#/texts/360",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6934,7 +6910,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/362",
|
||||
"self_ref": "#/texts/361",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6948,7 +6924,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/363",
|
||||
"self_ref": "#/texts/362",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6962,7 +6938,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/364",
|
||||
"self_ref": "#/texts/363",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6976,7 +6952,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/365",
|
||||
"self_ref": "#/texts/364",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -6990,7 +6966,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/366",
|
||||
"self_ref": "#/texts/365",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7004,7 +6980,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/367",
|
||||
"self_ref": "#/texts/366",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7018,7 +6994,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/368",
|
||||
"self_ref": "#/texts/367",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7032,7 +7008,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/369",
|
||||
"self_ref": "#/texts/368",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7046,7 +7022,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/370",
|
||||
"self_ref": "#/texts/369",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7060,7 +7036,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/371",
|
||||
"self_ref": "#/texts/370",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7074,7 +7050,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/372",
|
||||
"self_ref": "#/texts/371",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7088,7 +7064,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/373",
|
||||
"self_ref": "#/texts/372",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7102,7 +7078,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/374",
|
||||
"self_ref": "#/texts/373",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7116,7 +7092,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/375",
|
||||
"self_ref": "#/texts/374",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7130,7 +7106,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/376",
|
||||
"self_ref": "#/texts/375",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7144,7 +7120,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/377",
|
||||
"self_ref": "#/texts/376",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7158,7 +7134,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/378",
|
||||
"self_ref": "#/texts/377",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7172,7 +7148,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/379",
|
||||
"self_ref": "#/texts/378",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7186,7 +7162,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/380",
|
||||
"self_ref": "#/texts/379",
|
||||
"parent": {
|
||||
"$ref": "#/groups/44"
|
||||
},
|
||||
@ -7200,7 +7176,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/381",
|
||||
"self_ref": "#/texts/380",
|
||||
"parent": {
|
||||
"$ref": "#/groups/45"
|
||||
},
|
||||
@ -7214,7 +7190,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/382",
|
||||
"self_ref": "#/texts/381",
|
||||
"parent": {
|
||||
"$ref": "#/groups/45"
|
||||
},
|
||||
@ -7228,7 +7204,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/383",
|
||||
"self_ref": "#/texts/382",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7242,7 +7218,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/384",
|
||||
"self_ref": "#/texts/383",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7256,7 +7232,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/385",
|
||||
"self_ref": "#/texts/384",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7270,7 +7246,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/386",
|
||||
"self_ref": "#/texts/385",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7284,7 +7260,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/387",
|
||||
"self_ref": "#/texts/386",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7298,7 +7274,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/388",
|
||||
"self_ref": "#/texts/387",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7312,7 +7288,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/389",
|
||||
"self_ref": "#/texts/388",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7326,7 +7302,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/390",
|
||||
"self_ref": "#/texts/389",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7340,7 +7316,7 @@
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/391",
|
||||
"self_ref": "#/texts/390",
|
||||
"parent": {
|
||||
"$ref": "#/groups/46"
|
||||
},
|
||||
@ -7352,34 +7328,6 @@
|
||||
"text": "Mobile view",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/392",
|
||||
"parent": {
|
||||
"$ref": "#/groups/47"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
},
|
||||
{
|
||||
"self_ref": "#/texts/393",
|
||||
"parent": {
|
||||
"$ref": "#/groups/47"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "list_item",
|
||||
"prov": [],
|
||||
"orig": "",
|
||||
"text": "",
|
||||
"enumerated": false,
|
||||
"marker": "-"
|
||||
}
|
||||
],
|
||||
"pictures": [
|
||||
|
@ -473,7 +473,6 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
|
||||
|
||||
- list of books (useful looking abstracts)
|
||||
- Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
|
||||
-
|
||||
- Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl
|
||||
|
||||
| Authority control databases | Authority control databases |
|
||||
@ -526,7 +525,4 @@ additional terms may apply. By using this site, you agree to the Terms of Use an
|
||||
- Developers
|
||||
- Statistics
|
||||
- Cookie statement
|
||||
- Mobile view
|
||||
|
||||
-
|
||||
-
|
||||
- Mobile view
|
Loading…
Reference in New Issue
Block a user