mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Added HTML backend implementation, few improvements for other backends
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
f773d8a621
commit
89e58ca730
@ -1,26 +1,49 @@
|
|||||||
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from docling_core.types.experimental import (
|
from docling_core.types.experimental import (
|
||||||
|
BasePictureData,
|
||||||
|
BaseTableData,
|
||||||
DescriptionItem,
|
DescriptionItem,
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
|
DocumentOrigin,
|
||||||
|
ImageRef,
|
||||||
|
PictureItem,
|
||||||
|
SectionHeaderItem,
|
||||||
|
TableCell,
|
||||||
|
TableItem,
|
||||||
)
|
)
|
||||||
|
from docling_core.types.experimental.labels import DocItemLabel, GroupLabel
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
    """Initialise the backend state for one HTML input.

    Args:
        path_or_stream: Path or in-memory stream holding the HTML input.
        document_hash: Forwarded unchanged to the base-class constructor.
    """
    super().__init__(path_or_stream, document_hash)

    # Parsed tree; remains None until convert() assigns it.
    self.soup = None
    # HTML file:
    self.path_or_stream = path_or_stream

    # Initialise the parents for the hierarchy: one empty slot per level.
    self.max_levels = 10
    self.level = 0
    self.parents = dict.fromkeys(range(self.max_levels))
    # Per-tag-name occurrence counters, filled in by analyse_element().
    self.labels = {}
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def is_paginated(cls) -> bool:
|
||||||
False
|
return False
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
@ -33,8 +56,338 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return {InputFormat.HTML}
|
return {InputFormat.HTML}
|
||||||
|
|
||||||
def convert(self) -> DoclingDocument:
    """Parse the HTML input and convert its <body> into a DoclingDocument.

    Returns:
        The converted document. When the input cannot be read or parsed, or
        it contains no <body>, the problem is logged and an empty document
        is returned instead of raising.
    """
    doc = DoclingDocument(description=DescriptionItem(), name="dummy")

    try:
        if isinstance(self.path_or_stream, BytesIO):
            # open() only understands paths; decode in-memory input directly.
            # getvalue() leaves the stream position untouched.
            html_content = self.path_or_stream.getvalue().decode("utf-8")
        else:
            with open(self.path_or_stream, "r", encoding="utf-8") as f:
                html_content = f.read()
        self.soup = BeautifulSoup(html_content, "html.parser")
    except Exception as e:
        _log.error("could not parse html: {}".format(e))
        return doc

    body = self.soup.body
    if body is None:
        # Without this guard the find_all() call below raises AttributeError.
        _log.error("could not parse html: document has no <body>")
        return doc

    # Replace <br> tags with newline characters
    for br in body.find_all("br"):
        br.replace_with("\n")

    return self.walk(body, doc)
|
||||||
|
|
||||||
|
def walk(self, element, doc):
    """Visit each direct child of ``element`` and hand it to ``analyse_element``.

    Args:
        element: Node whose ``children`` are iterated. A node that exposes no
            ``children`` attribute is treated as a leaf and ignored.
        doc: Document being built; passed through to ``analyse_element``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    children = getattr(element, "children", None)
    if children is None:
        # Leaf node (e.g. a bare text string): nothing to descend into.
        return doc

    try:
        # Iterate over elements in the body of the document
        for idx, child in enumerate(children):
            try:
                self.analyse_element(child, idx, doc)
            except Exception as exc_child:
                # Best effort: a failing child must not abort its siblings.
                _log.error(" -> error treating child: %s", exc_child)
                _log.error(" => element: %s\n", child)
    except Exception as exc:
        # Iterating the children failed; keep whatever was converted so far.
        _log.error("error while walking children: %s", exc)

    return doc
|
||||||
|
|
||||||
|
def analyse_element(self, element, idx, doc):
    """Count the element's tag name and route it to the matching handler.

    Headings, paragraphs, lists, list items, tables, figures and images each
    go to their dedicated ``handle_*`` method; every other element is walked
    so that its children are analysed in turn.
    """
    tag = element.name

    # Keep a running tally of how often each tag name has been encountered.
    self.labels[tag] = self.labels.get(tag, 0) + 1

    handler_names = {
        "h1": "handle_header",
        "h2": "handle_header",
        "h3": "handle_header",
        "h4": "handle_header",
        "h5": "handle_header",
        "h6": "handle_header",
        "p": "handle_paragraph",
        "ul": "handle_list",
        "ol": "handle_list",
        "li": "handle_listitem",
        "table": "handle_table",
        "figure": "handle_figure",
        "img": "handle_image",
    }

    handler_name = handler_names.get(tag)
    if handler_name is None:
        # No dedicated handler: descend into the element instead.
        self.walk(element, doc)
    else:
        getattr(self, handler_name)(element, idx, doc)
|
||||||
|
|
||||||
|
def get_direct_text(self, item):
    """Return the stripped direct text of ``item``, ignoring nested tags.

    Only a text node that is an immediate child is looked up (for an <li>
    this skips the text of nested lists). Returns an empty string when no
    such text node is found.
    """
    direct = item.find(string=True, recursive=False)
    return direct.strip() if isinstance(direct, str) else ""
|
||||||
|
|
||||||
|
# Function to recursively extract text from all child nodes
|
||||||
|
def extract_text_recursively(self, item):
    """Collect the text of ``item`` and of all its descendants.

    Args:
        item: Either a text node (``str``) or a node that yields its
            children when iterated.

    Returns:
        For a plain string, a one-element list holding that string (kept
        as-is for backward compatibility). For any other node, one string
        made of the stripped, non-empty text fragments in document order,
        joined by single spaces.
    """
    if isinstance(item, str):
        return [item]

    def collect(node, fragments):
        # Depth-first traversal that appends whole text fragments. Extending
        # a list with a string would split it into single characters.
        try:
            children = iter(node)
        except TypeError:
            _log.warning("item has no children")
            return
        for child in children:
            if isinstance(child, str):
                stripped = child.strip()
                if stripped:
                    fragments.append(stripped)
            else:
                collect(child, fragments)

    result = []
    collect(item, result)
    return " ".join(result)
|
||||||
|
|
||||||
|
def handle_header(self, element, idx, doc):
    """Handles header tags (h1, h2, etc.) and keeps the parent slots in sync.

    An h1 clears every parent slot and is added with the TITLE label. Any
    other heading is added with the SECTION_HEADER label under the parent
    stored one level above it; skipped levels are bridged with section
    groups, and slots deeper than the new heading are cleared.
    """
    hlevel = int(element.name.replace("h", ""))
    text = element.text.strip()

    if hlevel == 1:
        # A top-level heading starts over: forget every recorded parent.
        for key in self.parents:
            self.parents[key] = None

        self.level = 1
        self.parents[self.level] = doc.add_text(
            parent=self.parents[0], label=DocItemLabel.TITLE, text=text
        )
        return

    if hlevel > self.level:
        # add invisible group for every level skipped on the way down
        for i in range(self.level + 1, hlevel):
            self.parents[i] = doc.add_group(
                name=f"header-{i}",
                label=GroupLabel.SECTION,
                parent=self.parents[i - 1],
            )
    elif hlevel < self.level:
        # remove the tail: slots deeper than this heading no longer apply
        for key in self.parents:
            if key > hlevel:
                self.parents[key] = None

    # Common to all remaining cases: attach under the level above.
    self.parents[hlevel] = doc.add_text(
        parent=self.parents[hlevel - 1],
        label=DocItemLabel.SECTION_HEADER,
        text=text,
    )
    self.level = hlevel
|
||||||
|
|
||||||
|
def handle_paragraph(self, element, idx, doc):
    """Handles paragraph tags (p).

    The stripped text is added under the parent of the current level;
    paragraphs whose text is missing or blank are skipped.
    """
    raw_text = element.text
    if raw_text is None:
        return

    text = raw_text.strip()
    label = DocItemLabel.PARAGRAPH
    if not text:
        return

    doc.add_text(parent=self.parents[self.level], label=label, text=text)
|
||||||
|
|
||||||
|
def handle_list(self, element, idx, doc):
    """Handles list tags (ul, ol) and their list items.

    A list group is added under the current parent and stored one level
    deeper; the list's children are then walked at that deeper level, after
    which the level is restored.
    """
    # create a list group
    list_level = self.level + 1
    self.parents[list_level] = doc.add_group(
        parent=self.parents[self.level], name="list", label=GroupLabel.LIST
    )
    self.level = list_level

    try:
        self.walk(element, doc)
    finally:
        # Clear the slot this list was stored in. Indexing with
        # ``self.level + 1`` here would hit the slot below it and leave the
        # list group behind as a stale parent.
        self.parents[list_level] = None
        self.level -= 1
|
||||||
|
|
||||||
|
def handle_listitem(self, element, idx, doc):
    """Handles listitem tags (li).

    An item that contains a nested list is added with only its direct text
    and becomes the parent for the children walked below it. Any other item
    is added with its full stripped text.
    """
    nested_lists = element.find(["ul", "ol"])

    if nested_lists:
        text = self.get_direct_text(element)

        # create a list-item and make it the parent of what is nested in it
        item_level = self.level + 1
        self.parents[item_level] = doc.add_text(
            label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
        )
        self.level = item_level

        try:
            self.walk(element, doc)
        finally:
            # Clear the slot this item was stored in, not the one below it.
            self.parents[item_level] = None
            self.level -= 1

    elif isinstance(element.text, str):
        text = element.text.strip()

        doc.add_text(
            label=DocItemLabel.LIST_ITEM, text=text, parent=self.parents[self.level]
        )
    else:
        # %s placeholder: extra positional args are formatting arguments.
        _log.warning("list-item has no text: %s", element)
|
||||||
|
|
||||||
|
def handle_table(self, element, idx, doc):
    """Handles table tags.

    Every <td>/<th> cell becomes a ``TableCell`` positioned with its
    colspan/rowspan taken into account, and the resulting table is added
    under the parent of the current level. Tables that contain another
    table are skipped with a warning.
    """
    if element.find("table") is not None:
        _log.warning("detected nested tables: skipping for now")
        return

    def span_of(html_cell, attribute):
        # Span values come straight from the markup; anything that is not a
        # positive integer is treated as a span of 1.
        try:
            return max(1, int(html_cell.get(attribute, 1)))
        except (TypeError, ValueError):
            return 1

    rows = element.find_all("tr")
    # Count the number of rows (number of <tr> elements)
    num_rows = len(rows)

    # Grid positions already claimed by a cell, spanned positions included.
    # A set cannot overflow the way a pre-sized grid does when spans reach
    # past the estimated table size.
    occupied = set()
    table_cells = []
    num_cols = 0

    # Iterate over the rows in the table
    for row_idx, row in enumerate(rows):
        # For each row, find all the column cells (both <td> and <th>)
        cells = row.find_all(["td", "th"])

        # Check if each cell in the row is a header -> means it is a column header
        col_header = all(html_cell.name != "td" for html_cell in cells)

        col_idx = 0
        for html_cell in cells:
            try:
                text = self.extract_table_cell_text(html_cell)
            except Exception as exc:
                # Fall back to the raw cell text; a library must not
                # terminate the interpreter over one bad cell.
                _log.warning("exception: %s", exc)
                text = html_cell.text

            col_span = span_of(html_cell, "colspan")
            # A row span cannot reach past the last row of the table.
            row_span = min(span_of(html_cell, "rowspan"), num_rows - row_idx)

            # Skip positions taken by cells spanning down from earlier rows.
            while (row_idx, col_idx) in occupied:
                col_idx += 1
            for r in range(row_span):
                for c in range(col_span):
                    occupied.add((row_idx + r, col_idx + c))

            table_cells.append(
                TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
            )
            # Number of columns = right-most column any cell reaches.
            num_cols = max(num_cols, col_idx + col_span)

    data = BaseTableData(
        num_rows=num_rows, num_cols=num_cols, table_cells=table_cells
    )
    doc.add_table(data=data, parent=self.parents[self.level])
|
||||||
|
|
||||||
|
def get_list_text(list_element, level=0):
|
||||||
|
"""Recursively extract text from <ul> or <ol> with proper indentation."""
|
||||||
|
result = []
|
||||||
|
bullet_char = "*" # Default bullet character for unordered lists
|
||||||
|
|
||||||
|
if list_element.name == "ol": # For ordered lists, use numbers
|
||||||
|
for i, li in enumerate(list_element.find_all("li", recursive=False), 1):
|
||||||
|
# Add numbering for ordered lists
|
||||||
|
result.append(f"{' ' * level}{i}. {li.get_text(strip=True)}")
|
||||||
|
# Handle nested lists
|
||||||
|
nested_list = li.find(["ul", "ol"])
|
||||||
|
if nested_list:
|
||||||
|
result.extend(get_list_text(nested_list, level + 1))
|
||||||
|
elif list_element.name == "ul": # For unordered lists, use bullet points
|
||||||
|
for li in list_element.find_all("li", recursive=False):
|
||||||
|
# Add bullet points for unordered lists
|
||||||
|
result.append(
|
||||||
|
f"{' ' * level}{bullet_char} {li.get_text(strip=True)}"
|
||||||
|
)
|
||||||
|
# Handle nested lists
|
||||||
|
nested_list = li.find(["ul", "ol"])
|
||||||
|
if nested_list:
|
||||||
|
result.extend(get_list_text(nested_list, level + 1))
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def extract_table_cell_text(self, cell):
    """Extract text from a table cell.

    Cells containing lists are not rendered specially yet: a warning is
    logged and the plain cell text is returned, as for any other cell.
    """
    if cell.find(["ul", "ol"]) is not None:
        # Logger.warn is a deprecated alias; use warning().
        _log.warning(
            "should extract the content correctly for table-cells with lists ..."
        )
    return cell.text
|
||||||
|
|
||||||
|
def handle_figure(self, element, idx, doc):
    """Handles figure tags (figure), with an optional <figcaption>.

    A picture is added under the parent of the current level. When the
    figure has a <figcaption>, the text of its children is concatenated,
    stripped and added as a caption item that is attached to the picture.
    The image URI is not extracted; the picture data is left empty.
    """
    figcaption = element.find(["figcaption"])

    caption_item = None
    if figcaption is not None:
        caption_text = "".join([item.text for item in figcaption])
        caption_item = doc.add_text(
            label=DocItemLabel.CAPTION, text=caption_text.strip()
        )

    doc.add_picture(
        data=BasePictureData(),
        parent=self.parents[self.level],
        caption=caption_item,
    )
|
||||||
|
|
||||||
|
def handle_image(self, element, idx, doc):
    """Handles image tags (img).

    Adds a caption-less picture with empty picture data under the parent of
    the current level.
    """
    picture_data = BasePictureData()
    parent = self.parents[self.level]
    doc.add_picture(data=picture_data, parent=parent, caption=None)
|
||||||
|
@ -43,7 +43,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def is_paginated(cls) -> bool:
|
||||||
False
|
return False # True? if so, how to handle pages...
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
|
@ -55,7 +55,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def is_paginated(cls) -> bool:
|
||||||
False
|
return False
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
@ -63,6 +63,24 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
self.path_or_stream = None
|
self.path_or_stream = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def supported_formats(cls) -> Set[InputFormat]:
|
||||||
|
return {InputFormat.DOCX}
|
||||||
|
|
||||||
|
def convert(self) -> DoclingDocument:
    """Parse the DOCX input into a structured document model.

    Returns:
        The converted document, or an empty document when the input cannot
        be opened as DOCX (the failure is logged, not raised).
    """
    doc = DoclingDocument(description=DescriptionItem(), name="dummy")

    try:
        docx_obj = docx.Document(self.path_or_stream)
    except Exception as exc:
        # Include the cause so a failed conversion can be diagnosed.
        _log.error("could not parse docx: %s", exc)
        return doc

    # self.initialise()
    return self.walk_linear(docx_obj.element.body, docx_obj, doc)
|
||||||
|
|
||||||
def update_history(self, name, level, numid, ilevel):
|
def update_history(self, name, level, numid, ilevel):
|
||||||
self.history["names"].append(name)
|
self.history["names"].append(name)
|
||||||
self.history["levels"].append(level)
|
self.history["levels"].append(level)
|
||||||
@ -89,24 +107,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return k
|
return k
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def supported_formats(cls) -> Set[InputFormat]:
|
|
||||||
return {InputFormat.DOCX}
|
|
||||||
|
|
||||||
def convert(self) -> DoclingDocument:
|
|
||||||
# Parses the DOCX into a structured document model.
|
|
||||||
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
|
|
||||||
docx_obj = None
|
|
||||||
try:
|
|
||||||
docx_obj = docx.Document(self.path_or_stream)
|
|
||||||
except Exception:
|
|
||||||
_log.error("could not parse docx")
|
|
||||||
return doc
|
|
||||||
|
|
||||||
# self.initialise()
|
|
||||||
doc = self.walk_linear(docx_obj.element.body, docx_obj, doc)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
|
@ -23,11 +23,11 @@ _log = logging.getLogger(__name__)
|
|||||||
USE_EXPERIMENTAL = False
|
USE_EXPERIMENTAL = False
|
||||||
|
|
||||||
input_paths = [
|
input_paths = [
|
||||||
# Path("tests/data/wiki_duck.html"),
|
Path("tests/data/wiki_duck.html"),
|
||||||
Path("tests/data/word_sample.docx"),
|
Path("tests/data/word_sample.docx"),
|
||||||
Path("tests/data/lorem_ipsum.docx"),
|
Path("tests/data/lorem_ipsum.docx"),
|
||||||
Path("tests/data/powerpoint_sample.pptx"),
|
Path("tests/data/powerpoint_sample.pptx"),
|
||||||
# Path("tests/data/2206.01062.pdf"),
|
Path("tests/data/2206.01062.pdf"),
|
||||||
]
|
]
|
||||||
input = DocumentConversionInput.from_paths(input_paths)
|
input = DocumentConversionInput.from_paths(input_paths)
|
||||||
|
|
||||||
|
50
poetry.lock
generated
50
poetry.lock
generated
@ -196,8 +196,8 @@ files = [
|
|||||||
lazy-object-proxy = ">=1.4.0"
|
lazy-object-proxy = ">=1.4.0"
|
||||||
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
|
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
|
||||||
wrapt = [
|
wrapt = [
|
||||||
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
|
||||||
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
|
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
|
||||||
|
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -278,6 +278,27 @@ files = [
|
|||||||
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
|
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
|
||||||
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
|
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "beautifulsoup4"
|
||||||
|
version = "4.12.3"
|
||||||
|
description = "Screen-scraping library"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6.0"
|
||||||
|
files = [
|
||||||
|
{file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
|
||||||
|
{file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
soupsieve = ">1.2"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
cchardet = ["cchardet"]
|
||||||
|
chardet = ["chardet"]
|
||||||
|
charset-normalizer = ["charset-normalizer"]
|
||||||
|
html5lib = ["html5lib"]
|
||||||
|
lxml = ["lxml"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "black"
|
name = "black"
|
||||||
version = "24.8.0"
|
version = "24.8.0"
|
||||||
@ -2369,8 +2390,8 @@ jsonpatch = ">=1.33,<2.0"
|
|||||||
langsmith = ">=0.1.112,<0.2.0"
|
langsmith = ">=0.1.112,<0.2.0"
|
||||||
packaging = ">=23.2,<25"
|
packaging = ">=23.2,<25"
|
||||||
pydantic = [
|
pydantic = [
|
||||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
|
||||||
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
||||||
|
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||||
]
|
]
|
||||||
PyYAML = ">=5.3"
|
PyYAML = ">=5.3"
|
||||||
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
|
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
|
||||||
@ -2409,8 +2430,8 @@ files = [
|
|||||||
langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""}
|
langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""}
|
||||||
pymilvus = ">=2.4.3,<3.0.0"
|
pymilvus = ">=2.4.3,<3.0.0"
|
||||||
scipy = [
|
scipy = [
|
||||||
{version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
|
{version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
|
||||||
|
{version = ">=1.9,<2.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2442,8 +2463,8 @@ files = [
|
|||||||
httpx = ">=0.23.0,<1"
|
httpx = ">=0.23.0,<1"
|
||||||
orjson = ">=3.9.14,<4.0.0"
|
orjson = ">=3.9.14,<4.0.0"
|
||||||
pydantic = [
|
pydantic = [
|
||||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
|
||||||
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
||||||
|
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||||
]
|
]
|
||||||
requests = ">=2,<3"
|
requests = ">=2,<3"
|
||||||
requests-toolbelt = ">=1.0.0,<2.0.0"
|
requests-toolbelt = ">=1.0.0,<2.0.0"
|
||||||
@ -3640,10 +3661,10 @@ files = [
|
|||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -3776,9 +3797,9 @@ files = [
|
|||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
python-dateutil = ">=2.8.2"
|
python-dateutil = ">=2.8.2"
|
||||||
pytz = ">=2020.1"
|
pytz = ">=2020.1"
|
||||||
@ -4250,8 +4271,8 @@ files = [
|
|||||||
annotated-types = ">=0.6.0"
|
annotated-types = ">=0.6.0"
|
||||||
pydantic-core = "2.23.4"
|
pydantic-core = "2.23.4"
|
||||||
typing-extensions = [
|
typing-extensions = [
|
||||||
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
|
|
||||||
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
|
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
|
||||||
|
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
@ -4419,8 +4440,8 @@ files = [
|
|||||||
astroid = ">=2.15.8,<=2.17.0-dev0"
|
astroid = ">=2.15.8,<=2.17.0-dev0"
|
||||||
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
||||||
dill = [
|
dill = [
|
||||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
|
||||||
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
||||||
|
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||||
]
|
]
|
||||||
isort = ">=4.2.5,<6"
|
isort = ">=4.2.5,<6"
|
||||||
mccabe = ">=0.6,<0.8"
|
mccabe = ">=0.6,<0.8"
|
||||||
@ -5933,6 +5954,17 @@ files = [
|
|||||||
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
|
{file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "soupsieve"
|
||||||
|
version = "2.6"
|
||||||
|
description = "A modern CSS selector implementation for Beautiful Soup."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8"
|
||||||
|
files = [
|
||||||
|
{file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
|
||||||
|
{file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sqlalchemy"
|
name = "sqlalchemy"
|
||||||
version = "2.0.35"
|
version = "2.0.35"
|
||||||
@ -7287,4 +7319,4 @@ type = ["pytest-mypy"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "a7c265a72158d56174202bdf33259ec2a9a60c54e74e4a8d61647d463f906880"
|
content-hash = "34dcde27e9214be7ebae7b2bc99e407e50c40cc44731552af7e8c97e53b2edf2"
|
||||||
|
@ -55,6 +55,7 @@ pyarrow = "^16.1.0"
|
|||||||
typer = "^0.12.5"
|
typer = "^0.12.5"
|
||||||
python-docx = "^1.1.2"
|
python-docx = "^1.1.2"
|
||||||
python-pptx = "^1.0.2"
|
python-pptx = "^1.0.2"
|
||||||
|
beautifulsoup4 = "^4.12.3"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||||
|
Loading…
Reference in New Issue
Block a user