Merge branch 'main' of github.com:DS4SD/docling into cau/pdfium-bitmap-fix

This commit is contained in:
Christoph Auer
2026-03-24 13:15:32 +01:00
30 changed files with 1690 additions and 237 deletions
+16
View File
@@ -1,3 +1,19 @@
## [v2.81.0](https://github.com/docling-project/docling/releases/tag/v2.81.0) - 2026-03-20
### Feature
* Route plain-text and Quarto/R Markdown files to the Markdown backend ([#3161](https://github.com/docling-project/docling/issues/3161)) ([`96d7c7e`](https://github.com/docling-project/docling/commit/96d7c7ec79992d8dddedfafaaedb7f9bf6e14f40))
### Fix
* **docx:** Missing list items after numbered header (#2665) ([#2678](https://github.com/docling-project/docling/issues/2678)) ([`2f7c09e`](https://github.com/docling-project/docling/commit/2f7c09e0d8f07a5fa0aaf4f33bdfb1f71d3f3063))
* Avoid thread-unsafe close of pypdfium backend ([#3160](https://github.com/docling-project/docling/issues/3160)) ([`afb4bb6`](https://github.com/docling-project/docling/commit/afb4bb68023c5d8fb8dc5e39413a27678e642293))
* Handle external image relationships in MsWordDocumentBackend ([#3114](https://github.com/docling-project/docling/issues/3114)) ([`8ae0974`](https://github.com/docling-project/docling/commit/8ae0974a9d86a447f78e4950bc0a45d5eba31e98))
* Handle PermissionError for directory input on Windows CLI ([#3149](https://github.com/docling-project/docling/issues/3149)) ([`a39317a`](https://github.com/docling-project/docling/commit/a39317a147859c68bf8aef635276a23585725529))
* Avoid in-place mutation of pipeline options breaking cache key ([#3115](https://github.com/docling-project/docling/issues/3115)) ([`412af62`](https://github.com/docling-project/docling/commit/412af62135869978b7d22e1dd4ee2725623fad44))
* Preserve torch_dtype in get_engine_config and add it to CodeFormulaV2 ([#3117](https://github.com/docling-project/docling/issues/3117)) ([`53a5f80`](https://github.com/docling-project/docling/commit/53a5f80a43849d853d4e0598d3875e6aac2f88e0))
* Release image backend resources after frame extraction ([#3134](https://github.com/docling-project/docling/issues/3134)) ([`1e841eb`](https://github.com/docling-project/docling/commit/1e841ebcbd048fbfc11d63b4086539b7cd88bb77))
## [v2.80.0](https://github.com/docling-project/docling/releases/tag/v2.80.0) - 2026-03-14
### Feature
+2 -1
View File
@@ -30,7 +30,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, WebVTT, images (PNG, TIFF, JPEG, ...), LaTeX, and more
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, WebVTT, images (PNG, TIFF, JPEG, ...), LaTeX, plain text, and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, WebVTT, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -50,6 +50,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 💼 Parsing of XBRL (eXtensible Business Reporting Language) documents for financial reports
* 💬 Parsing of WebVTT (Web Video Text Tracks) files and export to WebVTT format
* 💬 Parsing of LaTeX files
* 📝 Parsing of plain-text files (`.txt`, `.text`) and Markdown supersets (`.qmd`, `.Rmd`)
### Coming soon
+189 -90
View File
@@ -405,12 +405,78 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return None, None # If the paragraph is not part of a list
def _get_level_element(self, numid: int, ilvl: int) -> Optional[BaseOxmlElement]:
    """Return the ``w:lvl`` element that defines one level of a numbering.

    The lookup follows the numbering XML indirection: the ``w:num`` entry
    whose ``w:numId`` equals ``numid`` names an abstract numbering through
    its ``w:abstractNumId`` child, and the matching ``w:abstractNum``
    holds one ``w:lvl`` element per indentation level.

    Args:
        numid: Value of the ``w:numId`` attribute to resolve.
        ilvl: Value of the ``w:ilvl`` attribute of the wanted level.

    Returns:
        The matching ``w:lvl`` element, or ``None`` when the document
        exposes no package, has no part whose name contains
        ``"numbering"``, any link of the chain is missing, or any
        exception is raised during the lookup (logged at debug level).
    """
    try:
        docx = self.docx_obj
        if not (hasattr(docx, "part") and hasattr(docx.part, "package")):
            return None

        # First package part whose name mentions "numbering", if any.
        numbering = next(
            (p for p in docx.part.package.parts if "numbering" in p.partname),
            None,
        )
        if numbering is None:
            return None

        root = numbering.element
        ns = {"w": self._W_NS}

        num = root.find(f".//w:num[@w:numId='{numid}']", namespaces=ns)
        if num is None:
            return None

        ref = num.find(".//w:abstractNumId", namespaces=ns)
        if ref is None:
            return None

        abstract_id = ref.get(self.XML_KEY)
        if abstract_id is None:
            return None

        abstract = root.find(
            f".//w:abstractNum[@w:abstractNumId='{abstract_id}']",
            namespaces=ns,
        )
        if abstract is None:
            return None

        return abstract.find(f".//w:lvl[@w:ilvl='{ilvl}']", namespaces=ns)
    except Exception as err:
        # Best effort: a failed lookup is reported as "no level found".
        _log.debug(f"Error finding level element: {err}")
        return None
def _get_start_value(self, numid: int, ilvl: int) -> int:
    """Return the number at which a list level starts counting.

    The value is read from the ``w:start`` element inside the level
    definition returned by ``_get_level_element``.

    Args:
        numid: Numbering id of the list.
        ilvl: Indentation level within that numbering.

    Returns:
        The integer held by the ``w:start`` value attribute, or ``1`` when
        the level definition, the ``w:start`` element or its value
        attribute is missing, or when the value is not a valid integer.
    """
    default_start = 1

    lvl_element = self._get_level_element(numid, ilvl)
    if lvl_element is None:
        return default_start

    start_element = lvl_element.find(".//w:start", namespaces={"w": self._W_NS})
    if start_element is None:
        return default_start

    val = start_element.get(self.XML_KEY)
    if val is None:
        return default_start

    try:
        return int(val)
    except ValueError:
        # A malformed start value should not abort the whole conversion;
        # treat it like an absent one.
        return default_start
def _get_list_counter(self, numid: int, ilvl: int) -> int:
    """Advance and return the counter for one (numId, ilvl) list level.

    A level seen for the first time begins at its configured start value
    (see ``_get_start_value``). Each call then increments the counter by
    one. Advancing a level restarts every deeper level already tracked
    for the same ``numid``, so that their next item begins again at that
    level's own start value rather than unconditionally at 1.

    Args:
        numid: Numbering id of the list.
        ilvl: Indentation level whose counter is advanced.

    Returns:
        The counter value for this level after the increment.
    """
    counters = self.list_counters
    key = (numid, ilvl)

    if key not in counters:
        # Stored value is "last number used", hence start - 1.
        counters[key] = self._get_start_value(numid, ilvl) - 1
    counters[key] += 1

    # Restart sub-levels since the parent level advanced. Keys are
    # collected first so the dict is not mutated while being iterated.
    deeper = [k for k in counters if k[0] == numid and k[1] > ilvl]
    for sub_key in deeper:
        counters[sub_key] = self._get_start_value(numid, sub_key[1]) - 1

    return counters[key]
def _reset_list_counters_for_new_sequence(self, numid: int):
@@ -420,74 +486,30 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
for key in keys_to_reset:
self.list_counters[key] = 0
def _build_enum_marker(self, numid: int, ilvl: int) -> str:
    """Compose the dotted hierarchical marker for a list item, e.g. '1.2.3.'.

    One number is emitted for every level from 0 up to and including
    ``ilvl``. A level with a tracked counter contributes that counter; a
    level without one contributes its start value instead.

    Args:
        numid: Numbering id of the list.
        ilvl: Deepest indentation level to include.

    Returns:
        The level numbers joined by ``"."`` with a trailing ``"."``.
    """

    def _number_at(depth: int) -> int:
        tracked = self.list_counters.get((numid, depth))
        if tracked is not None:
            return tracked
        return self._get_start_value(numid, depth)

    numbers = [str(_number_at(depth)) for depth in range(ilvl + 1)]
    return ".".join(numbers) + "."
def _is_numbered_list(self, numId: int, ilvl: int) -> bool:
"""Check if a list is numbered based on its numFmt value."""
try:
# Access the numbering part of the document
if not hasattr(self.docx_obj, "part") or not hasattr(
self.docx_obj.part, "package"
):
return False
numbering_part = None
# Find the numbering part
for part in self.docx_obj.part.package.parts:
if "numbering" in part.partname:
numbering_part = part
break
if numbering_part is None:
return False
# Parse the numbering XML
numbering_root = numbering_part.element
namespaces = {"w": self._W_NS}
# Find the numbering definition with the given numId
num_xpath = f".//w:num[@w:numId='{numId}']"
num_element = numbering_root.find(num_xpath, namespaces=namespaces)
if num_element is None:
return False
# Get the abstractNumId from the num element
abstract_num_id_elem = num_element.find(
".//w:abstractNumId", namespaces=namespaces
)
if abstract_num_id_elem is None:
return False
abstract_num_id = abstract_num_id_elem.get(f"{self._W_NS_CLARK}val")
if abstract_num_id is None:
return False
# Find the abstract numbering definition
abstract_num_xpath = (
f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
)
abstract_num_element = numbering_root.find(
abstract_num_xpath, namespaces=namespaces
)
if abstract_num_element is None:
return False
# Find the level definition for the given ilvl
lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
lvl_element = self._get_level_element(numId, ilvl)
if lvl_element is None:
return False
# Get the numFmt element
namespaces = {"w": self._W_NS}
num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
if num_fmt_element is None:
return False
num_fmt = num_fmt_element.get(f"{self._W_NS_CLARK}val")
num_fmt = num_fmt_element.get(self.XML_KEY)
# Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
# Bullet formats include: bullet
numbered_formats = {
"decimal",
"lowerRoman",
@@ -884,21 +906,55 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
only_texts = []
only_equations = []
texts_and_equations = []
for subt in element.iter():
tag_name = etree.QName(subt).localname
if tag_name == "t" and "math" not in subt.tag:
if isinstance(subt.text, str):
only_texts.append(subt.text)
texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
latex_equation = str(oMath2Latex(subt)).strip()
if len(latex_equation) > 0:
only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
# Collect oMath elements and text runs from the paragraph.
# Use direct children iteration first; fall back to deep iteration
# only if no oMath elements are found at the direct level.
direct_omaths = [
child
for child in element
if "oMath" in child.tag and "oMathPara" not in child.tag
]
if direct_omaths:
# Iterate direct children to preserve sibling order and avoid
# processing nested oMath descendants of an already-converted node.
for child in element:
if "oMath" in child.tag and "oMathPara" not in child.tag:
latex_equation = str(oMath2Latex(child)).strip()
if len(latex_equation) > 0:
only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
else:
# Collect text from non-math children (e.g. <w:r> runs)
for t_elem in child.iter():
t_tag = etree.QName(t_elem).localname
if t_tag == "t" and "math" not in t_elem.tag:
if isinstance(t_elem.text, str):
only_texts.append(t_elem.text)
texts_and_equations.append(t_elem.text)
else:
# Original deep-iteration fallback for nested oMath (e.g.
# inside oMathPara or other wrapper elements).
for subt in element.iter():
tag_name = etree.QName(subt).localname
if tag_name == "t" and "math" not in subt.tag:
if isinstance(subt.text, str):
only_texts.append(subt.text)
texts_and_equations.append(subt.text)
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
latex_equation = str(oMath2Latex(subt)).strip()
if len(latex_equation) > 0:
only_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
texts_and_equations.append(
self.equation_bookends.format(EQ=latex_equation)
)
if len(only_equations) < 1:
return text, []
@@ -1033,15 +1089,28 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
text
) > 0:
# Standalone equation
# Standalone equation(s) — emit each as a separate formula
level = self._get_level()
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=self.parents[level - 1],
text=text.replace("<eq>", "").replace("</eq>", ""),
content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
parent = self.parents[level - 1]
if len(equations) > 1:
for eq in equations:
eq_text = eq.replace("<eq>", "").replace("</eq>", "").strip()
if len(eq_text) > 0:
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=parent,
text=eq_text,
content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
else:
t1 = doc.add_text(
label=DocItemLabel.FORMULA,
parent=parent,
text=text.replace("<eq>", "").replace("</eq>", ""),
content_layer=self.content_layer,
)
elem_ref.append(t1.get_ref())
else:
# Inline equation
level = self._get_level()
@@ -1272,8 +1341,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) -> None:
"""Resolve enumeration marker and add a formatted list item."""
if is_numbered:
counter = self._get_list_counter(numid, ilevel)
enum_marker = str(counter) + "."
self._get_list_counter(numid, ilevel)
enum_marker = self._build_enum_marker(numid, ilevel)
else:
enum_marker = ""
self._add_formatted_list_item(doc, elements, enum_marker, is_numbered, level)
@@ -1294,10 +1363,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level = self._get_level()
prev_indent = self._prev_indent()
if (
self._prev_numid() is None
or self._prev_numid() != numid
or (self._prev_numid() == numid and self.level_at_new_list is None)
if self._prev_numid() is None or (
self._prev_numid() == numid and self.level_at_new_list is None
): # Open new list
self.level_at_new_list = level
@@ -1360,12 +1427,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + ilevel,
)
elif self._prev_numid() == numid or prev_indent == ilevel:
elif self._prev_numid() == numid and isinstance(
self.parents.get(level - 1), ListGroup
):
# Continue existing list - only if parent is actually a ListGroup
self._add_list_item_with_marker(
doc, elements, numid, ilevel, is_numbered, level - 1
)
else:
_log.warning("List item not matching any insert condition.")
elif self._prev_numid() != numid or not isinstance(
self.parents.get(level - 1), ListGroup
):
# New list sequence: Different numid OR parent is not a ListGroup
# Use anchor-based level to place new list at the correct document position
if self.level_at_new_list is not None:
use_level = self.level_at_new_list + ilevel
for k in list(self.parents.keys()):
if k > use_level:
self.parents[k] = None
else:
use_level = level
self.level_at_new_list = use_level
list_gr = doc.add_list_group(
name="list",
parent=self.parents[use_level - 1],
content_layer=self.content_layer,
)
self.parents[use_level] = list_gr
elem_ref.append(list_gr.get_ref())
# Set marker and enumerated arguments if this is an enumeration element.
if is_numbered:
self._get_list_counter(numid, ilevel)
enum_marker = self._build_enum_marker(numid, ilevel)
else:
enum_marker = ""
self._add_formatted_list_item(
doc, elements, enum_marker, is_numbered, use_level
)
return elem_ref
@staticmethod
+3 -2
View File
@@ -375,8 +375,9 @@ class PyPdfiumPageBackend(PdfPageBackend):
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
    """Drop this page backend's references to its page and text page.

    Both attributes are cleared only while ``pypdfium2_lock`` is held.
    Clearing them before taking the lock would make the lock pointless.
    """
    # NOTE(review): presumably dropping the last reference can trigger
    # pdfium cleanup that must not run concurrently with other pdfium
    # calls — confirm against the pypdfium2 thread-safety notes.
    with pypdfium2_lock:
        self._ppage = None
        self.text_page = None
class PyPdfiumDocumentBackend(PdfDocumentBackend):
+24 -2
View File
@@ -370,6 +370,25 @@ def _split_list(raw: str | None) -> list[str] | None:
return re.split(r"[;,]", raw)
_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING = frozenset(
{
OutputFormat.TEXT,
OutputFormat.DOCTAGS,
OutputFormat.VTT,
}
)
def _should_generate_export_images(
    image_export_mode: ImageRefMode,
    to_formats: list[OutputFormat],
) -> bool:
    """Tell whether images need to be generated for the requested export.

    Args:
        image_export_mode: Requested image reference mode.
        to_formats: Output formats selected for the conversion.

    Returns:
        ``True`` when the mode is not ``ImageRefMode.PLACEHOLDER`` and at
        least one requested format is absent from
        ``_OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING``; ``False``
        otherwise, including when ``to_formats`` is empty.
    """
    if image_export_mode != ImageRefMode.PLACEHOLDER:
        for requested in to_formats:
            if requested not in _OUTPUT_FORMATS_NOT_SUPPORTING_IMAGE_EMBEDDING:
                return True
    return False
@app.command(no_args_is_help=True)
def convert( # noqa: C901
input_sources: Annotated[
@@ -404,7 +423,7 @@ def convert( # noqa: C901
ImageRefMode,
typer.Option(
...,
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
help="Image export mode for image-capable document outputs (JSON, YAML, HTML, HTML split-page, and Markdown). Text, DocTags, and WebVTT outputs do not export images. With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
),
] = ImageRefMode.EMBEDDED,
pipeline: Annotated[
@@ -750,7 +769,10 @@ def convert( # noqa: C901
)
pipeline_options.table_structure_options.mode = table_mode
if image_export_mode != ImageRefMode.PLACEHOLDER:
if _should_generate_export_images(
image_export_mode,
to_formats,
):
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = (
True # FIXME: to be deprecated in version 3
+26 -25
View File
@@ -37,7 +37,7 @@ from docling.datamodel.pipeline_options import PipelineOptions
class BaseFormatOption(BaseModel):
"""Base class for format options used by _DocumentConversionInput."""
pipeline_options: Optional[PipelineOptions] = None
pipeline_options: PipelineOptions | None = None
backend: Type[AbstractDocumentBackend]
model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -89,7 +89,7 @@ FormatToExtensions: dict[InputFormat, list[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
InputFormat.MD: ["md"],
InputFormat.MD: ["md", "txt", "text", "qmd", "rmd", "Rmd"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.XML_JATS: ["xml", "nxml"],
InputFormat.XML_XBRL: ["xml", "xbrl"],
@@ -128,7 +128,7 @@ FormatToMimeType: dict[InputFormat, list[str]] = {
],
InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
InputFormat.MD: ["text/markdown", "text/x-markdown", "text/plain"],
InputFormat.CSV: ["text/csv"],
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
@@ -180,6 +180,7 @@ class VlmStopReason(str, Enum):
LENGTH = "length" # max tokens reached
STOP_SEQUENCE = "stop_sequence" # Custom stopping criteria met
END_OF_SEQUENCE = "end_of_sequence" # Model generated end-of-text token
CONTENT_FILTERED = "content_filter" # Content filtered by API provider
UNSPECIFIED = "unspecified" # Default none value
@@ -207,7 +208,7 @@ class BasePageElement(BaseModel):
id: int
page_no: int
cluster: Cluster
text: Optional[str] = None
text: str | None = None
class LayoutPrediction(BaseModel):
@@ -224,9 +225,9 @@ class VlmPrediction(BaseModel):
text: str = ""
generated_tokens: list[VlmPredictionToken] = []
generation_time: float = -1
num_tokens: Optional[int] = None
num_tokens: int | None = None
stop_reason: VlmStopReason = VlmStopReason.UNSPECIFIED
input_prompt: Optional[str] = None
input_prompt: str | None = None
class ContainerElement(
@@ -252,14 +253,14 @@ class TextElement(BasePageElement):
class FigureElement(BasePageElement):
annotations: list[PictureDataType] = []
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
provenance: str | None = None
predicted_class: str | None = None
confidence: float | None = None
@field_serializer("confidence")
def _serialize(
self, value: Optional[float], info: FieldSerializationInfo
) -> Optional[float]:
self, value: float | None, info: FieldSerializationInfo
) -> float | None:
return (
round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
if value is not None
@@ -278,11 +279,11 @@ class EquationPrediction(BaseModel):
class PagePredictions(BaseModel):
layout: Optional[LayoutPrediction] = None
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
vlm_response: Optional[VlmPrediction] = None
layout: LayoutPrediction | None = None
tablestructure: TableStructurePrediction | None = None
figures_classification: FigureClassificationPrediction | None = None
equations_prediction: EquationPrediction | None = None
vlm_response: VlmPrediction | None = None
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
@@ -306,10 +307,10 @@ class Page(BaseModel):
page_no: int
# page_hash: Optional[str] = None
size: Optional[Size] = None
parsed_page: Optional[SegmentedPdfPage] = None
size: Size | None = None
parsed_page: SegmentedPdfPage | None = None
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
assembled: AssembledUnit | None = None
_backend: Optional["PdfPageBackend"] = (
None # Internal PDF backend. By default it is cleared during assembling.
@@ -330,9 +331,9 @@ class Page(BaseModel):
def get_image(
self,
scale: float = 1.0,
max_size: Optional[int] = None,
cropbox: Optional[BoundingBox] = None,
) -> Optional[Image]:
max_size: int | None = None,
cropbox: BoundingBox | None = None,
) -> Image | None:
if self._backend is None:
return self._image_cache.get(scale, None)
@@ -358,7 +359,7 @@ class Page(BaseModel):
)
@property
def image(self) -> Optional[Image]:
def image(self) -> Image | None:
return self.get_image(scale=self._default_image_scale)
@@ -373,7 +374,7 @@ class OpenAiChatMessage(BaseModel):
class OpenAiResponseChoice(BaseModel):
index: int
message: OpenAiChatMessage
finish_reason: Optional[str]
finish_reason: str | None
class OpenAiResponseUsage(BaseModel):
@@ -388,7 +389,7 @@ class OpenAiApiResponse(BaseModel):
)
id: str
model: Optional[str] = None # returned by openai
model: str | None = None # returned by openai
choices: list[OpenAiResponseChoice]
created: int
usage: OpenAiResponseUsage
+12 -2
View File
@@ -494,7 +494,7 @@ class _DocumentConversionInput(BaseModel):
if mime is None:
ext = obj.suffix[1:]
mime = _DocumentConversionInput._mime_from_extension(ext)
if mime is None: # must guess from
if mime is None: # must guess from content
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
if mime is not None and mime.lower() == "application/zip":
@@ -624,9 +624,11 @@ class _DocumentConversionInput(BaseModel):
input_format = InputFormat.XML_JATS
elif mime == "text/plain":
content_str = content.decode("utf-8")
content_str = content.decode("utf-8", errors="replace")
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO
# No MD fallback: unrecognised text/plain content returns None.
# MD is detected via text/markdown mime (from .md/.text/.qmd/… extensions).
return input_format
@@ -637,6 +639,14 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
elif ext in FormatToExtensions[InputFormat.HTML]:
mime = FormatToMimeType[InputFormat.HTML][0]
elif (
ext in FormatToExtensions[InputFormat.XML_USPTO]
and ext in FormatToExtensions[InputFormat.MD]
):
# "txt" appears in both XML_USPTO and MD extension lists. Leave mime=None
# so the content-probing chain (_detect_html_xhtml, _detect_csv, then the
# "text/plain" fallback + _guess_from_content) can pick the right format.
pass
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.CSV]:
+13 -2
View File
@@ -536,19 +536,21 @@ class PictureDescriptionBaseOptions(BaseOptions):
batch_size: Annotated[
int,
Field(
ge=1,
description=(
"Number of images to process in a single batch during picture description. Higher values improve "
"throughput but increase memory usage. Adjust based on available GPU/CPU memory."
)
),
),
] = 8
scale: Annotated[
float,
Field(
gt=0,
description=(
"Scaling factor for image resolution before processing. Higher values (e.g., 2.0) provide more detail "
"for the vision model but increase processing time and memory. Range: 0.5-4.0 typical."
)
),
),
] = 2.0
picture_area_threshold: Annotated[
@@ -715,6 +717,15 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
)
),
] = {"max_new_tokens": 200, "do_sample": False}
padding_side: Annotated[
Literal["left", "right"],
Field(
description=(
"Tokenizer padding side used for batched generation. Defaults to left to preserve the legacy "
"behavior, but can be overridden for models that require right padding."
)
),
] = "left"
@property
def repo_cache_folder(self) -> str:
@@ -122,9 +122,10 @@ class ApiVlmEngine(BaseVlmEngine):
images = preprocess_image_batch([input_data.image])
image = images[0]
# Prepare API parameters (use merged params which include model spec params)
api_params = {
**self.merged_params,
# Prepare API parameters: engine defaults first, then user/model
# params override. This allows users to set Azure-specific params
# like max_completion_tokens or override temperature (#3112).
api_params: dict[str, object] = {
"temperature": input_data.temperature,
}
@@ -132,6 +133,14 @@ class ApiVlmEngine(BaseVlmEngine):
if input_data.max_new_tokens:
api_params["max_tokens"] = input_data.max_new_tokens
# User/model spec params take precedence over engine defaults
api_params.update(self.merged_params)
# If user specified max_completion_tokens, remove conflicting
# max_tokens (required for Azure OpenAI compatibility)
if "max_completion_tokens" in api_params:
api_params.pop("max_tokens", None)
# Add stop strings if specified
if input_data.stop_strings:
api_params["stop"] = input_data.stop_strings
@@ -39,9 +39,16 @@ class PictureDescriptionBaseModel(
options: PictureDescriptionBaseOptions,
accelerator_options: AcceleratorOptions,
):
if options.batch_size < 1:
raise ValueError("Picture description batch_size must be >= 1")
if options.scale <= 0:
raise ValueError("Picture description scale must be > 0")
self.enabled = enabled
self.options = options
self.provenance = "not-implemented"
self.elements_batch_size = options.batch_size
self.images_scale = options.scale
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
return self.enabled and isinstance(element, PictureItem)
@@ -57,7 +57,6 @@ class PictureDescriptionVlmModel(
import torch
from transformers import (
AutoModelForImageTextToText,
AutoModelForVision2Seq,
AutoProcessor,
)
except ImportError:
@@ -68,6 +67,9 @@ class PictureDescriptionVlmModel(
# Initialize processor and model
with _model_init_lock:
self.processor = AutoProcessor.from_pretrained(artifacts_path)
tokenizer = getattr(self.processor, "tokenizer", None)
if tokenizer is not None:
tokenizer.padding_side = self.options.padding_side
self.model = AutoModelForImageTextToText.from_pretrained(
artifacts_path,
device_map=self.device,
@@ -89,6 +91,10 @@ class PictureDescriptionVlmModel(
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
from transformers import GenerationConfig
image_batch = list(images)
if not image_batch:
return
# Create input messages
messages = [
{
@@ -100,24 +106,25 @@ class PictureDescriptionVlmModel(
},
]
# TODO: do batch generation
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(
text=[prompt] * len(image_batch),
images=image_batch,
return_tensors="pt",
padding=True,
)
inputs = inputs.to(self.device)
for image in images:
# Prepare inputs
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True
)
inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
inputs = inputs.to(self.device)
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
# Generate outputs
generated_ids = self.model.generate(
**inputs,
generation_config=GenerationConfig(**self.options.generation_config),
)
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=True,
)
yield generated_texts[0].strip()
for text in generated_texts:
yield text.strip()
@@ -4,7 +4,7 @@ import threading
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Optional, Union
from typing import Union
import numpy as np
from PIL.Image import Image
@@ -39,7 +39,7 @@ class HuggingFaceMlxModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
artifacts_path: Path | None,
accelerator_options: AcceleratorOptions,
vlm_options: InlineVlmOptions,
):
+12 -9
View File
@@ -19,9 +19,9 @@ def api_image_request(
prompt: str,
url: AnyUrl,
timeout: float = 20,
headers: Optional[dict[str, str]] = None,
headers: dict[str, str] | None = None,
**params,
) -> Tuple[str, Optional[int], VlmStopReason]:
) -> Tuple[str, int | None, VlmStopReason]:
img_io = BytesIO()
image = (
image.copy()
@@ -77,11 +77,14 @@ def api_image_request(
api_resp = OpenAiApiResponse.model_validate_json(r.text)
generated_text = api_resp.choices[0].message.content.strip()
num_tokens = api_resp.usage.total_tokens
stop_reason = (
VlmStopReason.LENGTH
if api_resp.choices[0].finish_reason == "length"
else VlmStopReason.END_OF_SEQUENCE
)
finish_reason = api_resp.choices[0].finish_reason
if finish_reason == "content_filter":
_log.warning("API response was filtered due to content safety policy.")
stop_reason = VlmStopReason.CONTENT_FILTERED
elif finish_reason == "length":
stop_reason = VlmStopReason.LENGTH
else:
stop_reason = VlmStopReason.END_OF_SEQUENCE
return generated_text, num_tokens, stop_reason
except Exception as e:
@@ -97,10 +100,10 @@ def api_image_request_streaming(
url: AnyUrl,
*,
timeout: float = 20,
headers: Optional[dict[str, str]] = None,
headers: dict[str, str] | None = None,
generation_stoppers: list[GenerationStopper] = [],
**params,
) -> Tuple[str, Optional[int]]:
) -> Tuple[str, int | None]:
"""
Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
Parses SSE lines: 'data: {json}\\n\\n', terminated by 'data: [DONE]'.
+3 -2
View File
@@ -318,8 +318,9 @@ def run_vllm_example(input_doc_path: Path) -> bool:
url="http://localhost:8000/v1/chat/completions",
params={
"model": "ibm-granite/granite-docling-258M",
"max_tokens": 4096,
"skip_special_tokens": True,
"temperature": 0.0,
"max_tokens": 8192,
"skip_special_tokens": False,
},
timeout=90,
),
+2 -1
View File
@@ -35,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, WebVTT, images (PNG, TIFF, JPEG, ...), LaTeX, and more
* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, WebVTT, images (PNG, TIFF, JPEG, ...), LaTeX, plain text, and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, WebVTT, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -55,6 +55,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
* 💼 Parsing of XBRL (eXtensible Business Reporting Language) documents for financial reports
* 💬 Parsing of WebVTT (Web Video Text Tracks) files
* 💬 Parsing of LaTeX files
* 📝 Parsing of plain-text files (`.txt`, `.text`) and Markdown supersets (`.qmd`, `.Rmd`)
### Coming soon
+1 -1
View File
@@ -1,6 +1,6 @@
[project]
name = "docling"
version = "2.80.0" # DO NOT EDIT, updated automatically
version = "2.81.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group header-0
item-2 at level 2: section_header: Issue 3: Concatenated equation blocks
item-3 at level 3: text: The paragraph below contains thr ... ts are siblings inside a single <w:p>.
item-4 at level 3: formula: a=b
item-5 at level 3: formula: c=d
item-6 at level 3: formula: e=f
@@ -0,0 +1,132 @@
{
"schema_name": "DoclingDocument",
"version": "1.9.0",
"name": "omml_multi_equation_paragraph",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 17520448227351822398,
"filename": "omml_multi_equation_paragraph.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "header-0",
"label": "section"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"label": "section_header",
"prov": [],
"orig": "Issue 3: Concatenated equation blocks",
"text": "Issue 3: Concatenated equation blocks",
"level": 1
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "The paragraph below contains three separate <m:oMath> elements.\nExpected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)\nDocling produces: one $$ block with all equations concatenated.\n\nAll three <m:oMath> elements are siblings inside a single <w:p>.",
"text": "The paragraph below contains three separate <m:oMath> elements.\nExpected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)\nDocling produces: one $$ block with all equations concatenated.\n\nAll three <m:oMath> elements are siblings inside a single <w:p>.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "a=b",
"text": "a=b"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "c=d",
"text": "c=d"
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/texts/0"
},
"children": [],
"content_layer": "body",
"label": "formula",
"prov": [],
"orig": "e=f",
"text": "e=f"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}
@@ -0,0 +1,13 @@
## Issue 3: Concatenated equation blocks
The paragraph below contains three separate &lt;m:oMath&gt; elements.
Expected: three separate $$ blocks ($$a = b$$, $$c = d$$, $$e = f$$)
Docling produces: one $$ block with all equations concatenated.
All three &lt;m:oMath&gt; elements are siblings inside a single &lt;w:p&gt;.
$$a=b$$
$$c=d$$
$$e=f$$
@@ -49,4 +49,25 @@ item-0 at level 0: unspecified: group _root_
item-48 at level 4: text:
item-49 at level 4: text: Paragraph 2.1.2
item-50 at level 4: text:
item-51 at level 4: text:
item-51 at level 4: text:
item-52 at level 4: text:
item-53 at level 4: list: group list
item-54 at level 5: list_item: Appendix A: Glossary
item-55 at level 5: list: group list
item-56 at level 6: list_item: Section A.1
item-57 at level 6: list: group list
item-58 at level 7: list_item: Detail A.1.1
item-59 at level 6: list: group list
item-60 at level 7: list_item: Hardware Constraints Egde Case
item-61 at level 7: list_item: Software Constraints
item-62 at level 7: list_item: Network Constraints
item-63 at level 7: list_item: Environmental Constraints
item-64 at level 7: list_item: Regulatory Constraints
item-65 at level 7: list_item: Budget Constraints
item-66 at level 7: list_item: Timeline Constraints
item-67 at level 7: list_item: Resource Constraints
item-68 at level 7: list: group list
item-69 at level 8: list_item: First sub-item at this level
item-70 at level 8: list_item: Second sub-item at this level
item-71 at level 8: list_item: Third sub-item at this level
item-72 at level 4: text:
@@ -4,7 +4,7 @@
"name": "unit_test_headers_numbered",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 7684538628968220703,
"binary_hash": 5429064773624687111,
"filename": "unit_test_headers_numbered.docx"
},
"furniture": {
@@ -140,6 +140,115 @@
"content_layer": "body",
"name": "header-2",
"label": "section"
},
{
"self_ref": "#/groups/5",
"parent": {
"$ref": "#/texts/39"
},
"children": [
{
"$ref": "#/texts/47"
},
{
"$ref": "#/groups/6"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/6",
"parent": {
"$ref": "#/groups/5"
},
"children": [
{
"$ref": "#/texts/48"
},
{
"$ref": "#/groups/7"
},
{
"$ref": "#/groups/8"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/7",
"parent": {
"$ref": "#/groups/6"
},
"children": [
{
"$ref": "#/texts/49"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/8",
"parent": {
"$ref": "#/groups/6"
},
"children": [
{
"$ref": "#/texts/50"
},
{
"$ref": "#/texts/51"
},
{
"$ref": "#/texts/52"
},
{
"$ref": "#/texts/53"
},
{
"$ref": "#/texts/54"
},
{
"$ref": "#/texts/55"
},
{
"$ref": "#/texts/56"
},
{
"$ref": "#/texts/57"
},
{
"$ref": "#/groups/9"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
},
{
"self_ref": "#/groups/9",
"parent": {
"$ref": "#/groups/8"
},
"children": [
{
"$ref": "#/texts/58"
},
{
"$ref": "#/texts/59"
},
{
"$ref": "#/texts/60"
}
],
"content_layer": "body",
"name": "list",
"label": "list"
}
],
"texts": [
@@ -801,6 +910,15 @@
},
{
"$ref": "#/texts/45"
},
{
"$ref": "#/texts/46"
},
{
"$ref": "#/groups/5"
},
{
"$ref": "#/texts/61"
}
],
"content_layer": "body",
@@ -895,6 +1013,324 @@
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/46",
"parent": {
"$ref": "#/texts/39"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/groups/5"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Appendix A: Glossary",
"text": "Appendix A: Glossary",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "1."
},
{
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/6"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Section A.1",
"text": "Section A.1",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "1.1."
},
{
"self_ref": "#/texts/49",
"parent": {
"$ref": "#/groups/7"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Detail A.1.1",
"text": "Detail A.1.1",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "1.1.1."
},
{
"self_ref": "#/texts/50",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Hardware Constraints Egde Case",
"text": "Hardware Constraints Egde Case",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.1."
},
{
"self_ref": "#/texts/51",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Software Constraints",
"text": "Software Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.2."
},
{
"self_ref": "#/texts/52",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Network Constraints",
"text": "Network Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.3."
},
{
"self_ref": "#/texts/53",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Environmental Constraints",
"text": "Environmental Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.4."
},
{
"self_ref": "#/texts/54",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Regulatory Constraints",
"text": "Regulatory Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.5."
},
{
"self_ref": "#/texts/55",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Budget Constraints",
"text": "Budget Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.6."
},
{
"self_ref": "#/texts/56",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Timeline Constraints",
"text": "Timeline Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.7."
},
{
"self_ref": "#/texts/57",
"parent": {
"$ref": "#/groups/8"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Resource Constraints",
"text": "Resource Constraints",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.8."
},
{
"self_ref": "#/texts/58",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "First sub-item at this level",
"text": "First sub-item at this level",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.8.1."
},
{
"self_ref": "#/texts/59",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Second sub-item at this level",
"text": "Second sub-item at this level",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.8.2."
},
{
"self_ref": "#/texts/60",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Third sub-item at this level",
"text": "Third sub-item at this level",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
},
"enumerated": true,
"marker": "2.3.8.3."
},
{
"self_ref": "#/texts/61",
"parent": {
"$ref": "#/texts/39"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],
@@ -40,4 +40,19 @@ Paragraph 2.1.1.1
Paragraph 2.1.1
Paragraph 2.1.2
Paragraph 2.1.2
1. Appendix A: Glossary
- 1.1. Section A.1
- 1.1.1. Detail A.1.1
- 2.3.1. Hardware Constraints Egde Case
- 2.3.2. Software Constraints
- 2.3.3. Network Constraints
- 2.3.4. Environmental Constraints
- 2.3.5. Regulatory Constraints
- 2.3.6. Budget Constraints
- 2.3.7. Timeline Constraints
- 2.3.8. Resource Constraints
- 2.3.8.1. First sub-item at this level
- 2.3.8.2. Second sub-item at this level
- 2.3.8.3. Third sub-item at this level
+94
View File
@@ -0,0 +1,94 @@
"""Tests for api_image_request module."""
from unittest.mock import MagicMock, patch
import pytest
from PIL import Image
from docling.datamodel.base_models import VlmStopReason
from docling.utils.api_image_request import api_image_request
class TestApiImageRequest:
    """Test cases for the ``api_image_request`` function.

    Each test patches ``docling.utils.api_image_request.requests.post`` and
    feeds it a canned chat-completion response, then checks the returned
    text and the stop reason derived from ``finish_reason``.
    """

    @pytest.fixture
    def sample_image(self):
        """Create a simple test image."""
        return Image.new("RGB", (100, 100), color="red")

    @pytest.fixture
    def mock_response_factory(self):
        """Return a factory that builds mock API responses.

        The factory accepts ``content``, ``finish_reason``, ``total_tokens``
        and ``status_ok`` and returns a ``MagicMock`` whose ``ok`` and
        ``text`` attributes are populated accordingly.
        """
        import json  # local import keeps this block self-contained

        def _create_mock_response(
            content="Test response",
            finish_reason="stop",
            total_tokens=100,
            status_ok=True,
        ):
            mock_resp = MagicMock()
            mock_resp.ok = status_ok
            # Serialize with json.dumps instead of interpolating into a JSON
            # template: content containing quotes, backslashes or newlines
            # would otherwise produce an invalid payload.
            mock_resp.text = json.dumps(
                {
                    "id": "test-id",
                    "created": 1234567890,
                    "choices": [
                        {
                            "index": 0,
                            "message": {"role": "assistant", "content": content},
                            "finish_reason": finish_reason,
                        }
                    ],
                    "usage": {
                        "prompt_tokens": 50,
                        "completion_tokens": 50,
                        "total_tokens": total_tokens,
                    },
                }
            )
            return mock_resp

        return _create_mock_response

    @patch("docling.utils.api_image_request.requests.post")
    def test_content_filter_finish_reason(
        self, mock_post, sample_image, mock_response_factory
    ):
        """Test that content_filter finish reason returns CONTENT_FILTERED."""
        mock_post.return_value = mock_response_factory(
            content="Filtered content", finish_reason="content_filter"
        )

        result_text, _tokens, stop_reason = api_image_request(
            image=sample_image,
            prompt="Test prompt",
            url="http://test.api/v1/chat/completions",
        )

        assert result_text == "Filtered content"
        assert stop_reason == VlmStopReason.CONTENT_FILTERED

    @patch("docling.utils.api_image_request.requests.post")
    def test_length_finish_reason(self, mock_post, sample_image, mock_response_factory):
        """Test that length finish reason returns LENGTH."""
        mock_post.return_value = mock_response_factory(
            content="Truncated content", finish_reason="length"
        )

        result_text, _tokens, stop_reason = api_image_request(
            image=sample_image,
            prompt="Test prompt",
            url="http://test.api/v1/chat/completions",
        )

        assert result_text == "Truncated content"
        assert stop_reason == VlmStopReason.LENGTH

    @patch("docling.utils.api_image_request.requests.post")
    def test_stop_finish_reason(self, mock_post, sample_image, mock_response_factory):
        """Test that stop finish reason returns END_OF_SEQUENCE."""
        mock_post.return_value = mock_response_factory(
            content="Normal completion", finish_reason="stop"
        )

        result_text, _tokens, stop_reason = api_image_request(
            image=sample_image,
            prompt="Test prompt",
            url="http://test.api/v1/chat/completions",
        )

        assert result_text == "Normal completion"
        assert stop_reason == VlmStopReason.END_OF_SEQUENCE
+171
View File
@@ -2,10 +2,13 @@ import logging
import os
import warnings
from pathlib import Path
from types import SimpleNamespace
import pytest
from docling_core.types.doc import GroupItem
from lxml import etree
import docling.backend.msword_backend as msword_backend_module
from docling.backend.docx.drawingml.utils import get_libreoffice_cmd
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import InputFormat
@@ -44,6 +47,17 @@ def get_converter():
return converter
@pytest.fixture(scope="module")
def backend(docx_paths) -> MsWordDocumentBackend:
    """Return the Word backend created for the first path in ``docx_paths``.

    The fixture is module-scoped, so every test in this module that requests
    it receives the same backend instance.
    """
    return InputDocument(
        path_or_stream=docx_paths[0],
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend
@pytest.fixture(scope="module")
def documents(docx_paths) -> list[tuple[Path, DoclingDocument]]:
documents: list[dict[Path, DoclingDocument]] = []
@@ -423,3 +437,160 @@ def test_external_image_references():
assert "Test Document with External Image" in md
assert "text before the image" in md
assert "after the external image" in md
def test_list_counter_and_enum_marker(docx_paths):
    """Check list counters and enum markers on a freshly built backend.

    Covers, in order: counter increments per level, reset of deeper levels
    when a shallower level advances, hierarchical marker strings, and
    resetting the counters of one numId without touching another.
    """
    word_backend = InputDocument(
        path_or_stream=docx_paths[0],
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )._backend

    # Basic increment: each (numId, level) pair counts up on its own.
    for level, expected_values in ((0, (1, 2)), (1, (1, 2, 3))):
        for expected in expected_values:
            assert word_backend._get_list_counter(1, level) == expected

    # Advancing the parent level resets the sub-levels.
    word_backend._get_list_counter(1, 2)  # (1,2) = 1
    word_backend._get_list_counter(1, 0)  # (1,0) = 3, resets lvl 1 and 2
    for sub_level in (1, 2):
        assert word_backend.list_counters[(1, sub_level)] == 0
    assert word_backend._get_list_counter(1, 1) == 1  # restarts from 1

    # Hierarchical enum markers are built from the counters of levels 0..n.
    for level, value in enumerate((2, 3, 1)):
        word_backend.list_counters[(1, level)] = value
    for level, marker in enumerate(("2.", "2.3.", "2.3.1.")):
        assert word_backend._build_enum_marker(1, level) == marker
    # missing counter defaults to 1
    assert word_backend._build_enum_marker(99, 0) == "1."

    # Resetting the sequence for numId 1 must leave numId 2 alone.
    word_backend._get_list_counter(2, 0)  # (2,0) = 1
    word_backend._reset_list_counters_for_new_sequence(1)
    for level in (0, 1):
        assert word_backend.list_counters[(1, level)] == 0
    assert word_backend.list_counters[(2, 0)] == 1  # unaffected
def test_handle_equations_in_text_returns_original_text_on_mismatch(
    backend, monkeypatch
):
    """Passing text that differs from the run text yields the text unchanged.

    The paragraph element holds a run with "alpha" plus one ``oMath`` child,
    while the ``text`` argument is "beta"; the call is expected to return
    "beta" and an empty equation list.
    """

    def _fake_latex(_):
        return "x"

    paragraph = etree.Element("p")
    etree.SubElement(etree.SubElement(paragraph, "r"), "t").text = "alpha"
    etree.SubElement(paragraph, "oMath")
    monkeypatch.setattr(msword_backend_module, "oMath2Latex", _fake_latex)

    returned_text, found_equations = backend._handle_equations_in_text(
        element=paragraph, text="beta"
    )

    assert returned_text == "beta"
    assert found_equations == []
def test_handle_equations_in_text_skips_empty_substrings(backend, monkeypatch):
    """An empty run before an equation must not disturb the rebuilt text.

    Element layout: empty run, ``oMath``, run with "tail". The expected
    result is the formatted equation immediately followed by "tail".
    """

    def _fake_latex(_):
        return "x"

    expected_equation = backend.equation_bookends.format(EQ="x")

    paragraph = etree.Element("p")
    etree.SubElement(etree.SubElement(paragraph, "r"), "t").text = ""
    etree.SubElement(paragraph, "oMath")
    etree.SubElement(etree.SubElement(paragraph, "r"), "t").text = "tail"
    monkeypatch.setattr(msword_backend_module, "oMath2Latex", _fake_latex)

    returned_text, found_equations = backend._handle_equations_in_text(
        element=paragraph, text="tail"
    )

    assert found_equations == [expected_equation]
    assert returned_text == f"{expected_equation}tail"
def test_handle_text_elements_returns_empty_refs_when_text_is_none(
    backend, monkeypatch
):
    """No references are produced when equation handling returns ``None`` text."""

    def _no_text(element, text):
        return (None, [])

    monkeypatch.setattr(backend, "_handle_equations_in_text", _no_text)
    first_paragraph = backend.docx_obj.paragraphs[0]._element
    target_doc = DoclingDocument(name="test")

    produced_refs = backend._handle_text_elements(first_paragraph, target_doc)

    assert produced_refs == []
def test_handle_text_elements_heading_defaults_to_non_numbered_when_style_missing(
    backend, monkeypatch
):
    """A heading whose style object has no attributes is added as non-numbered.

    All collaborators of ``_handle_text_elements`` are stubbed; the stub for
    ``_add_heading`` records its arguments so the ``is_numbered_style`` flag
    can be inspected.
    """
    recorded: dict[str, tuple[int, str, bool]] = {}

    class FakeParagraph:
        """Paragraph replacement exposing fixed text and an empty style."""

        def __init__(self, element, docx_obj):
            self.text = "Heading text"
            self.style = SimpleNamespace()

    def fake_add_heading(doc, level, text, is_numbered_style):
        recorded["heading"] = (level, text, is_numbered_style)
        return []

    monkeypatch.setattr(msword_backend_module, "Paragraph", FakeParagraph)
    backend_stubs = (
        ("_get_paragraph_elements", lambda paragraph: []),
        ("_handle_equations_in_text", lambda element, text: (text, [])),
        ("_get_comment_ids_for_element", lambda element: []),
        ("_get_label_and_level", lambda paragraph: ("Heading", 1)),
        ("_get_numId_and_ilvl", lambda paragraph: (None, None)),
        ("_add_heading", fake_add_heading),
    )
    for attribute_name, stub in backend_stubs:
        monkeypatch.setattr(backend, attribute_name, stub)

    produced_refs = backend._handle_text_elements(
        object(), DoclingDocument(name="test")
    )

    assert produced_refs == []
    assert recorded["heading"] == (1, "Heading text", False)
def test_handle_text_elements_inline_equations_stop_when_text_is_consumed(
    backend, monkeypatch
):
    """Two reported equations with text equal to the first one yield two refs.

    The stubbed ``_handle_equations_in_text`` returns the first equation as
    the whole text together with a list of two equations.
    """
    first_equation = backend.equation_bookends.format(EQ="a")
    second_equation = backend.equation_bookends.format(EQ="b")

    class FakeParagraph:
        """Paragraph replacement exposing fixed text and an empty style."""

        def __init__(self, element, docx_obj):
            self.text = "inline eq"
            self.style = SimpleNamespace()

    def fake_equations(element, text):
        return (first_equation, [first_equation, second_equation])

    monkeypatch.setattr(msword_backend_module, "Paragraph", FakeParagraph)
    backend_stubs = (
        ("_get_paragraph_elements", lambda paragraph: []),
        ("_handle_equations_in_text", fake_equations),
        ("_get_comment_ids_for_element", lambda element: []),
        ("_get_label_and_level", lambda paragraph: ("Normal", None)),
        ("_get_numId_and_ilvl", lambda paragraph: (None, None)),
        ("_prev_numid", lambda: None),
        ("_get_level", lambda: 1),
    )
    for attribute_name, stub in backend_stubs:
        monkeypatch.setattr(backend, attribute_name, stub)
    # NOTE(review): this assignment is not undone by monkeypatch, and the
    # ``backend`` fixture is module-scoped.
    backend.parents[0] = None

    produced_refs = backend._handle_text_elements(
        object(), DoclingDocument(name="test")
    )

    assert len(produced_refs) == 2
+33 -1
View File
@@ -1,8 +1,11 @@
from pathlib import Path
import pytest
from docling_core.types.doc import ImageRefMode
from typer.testing import CliRunner
from docling.cli.main import app
from docling.cli.main import _should_generate_export_images, app
from docling.datamodel.base_models import OutputFormat
runner = CliRunner()
@@ -27,6 +30,35 @@ def test_cli_convert(tmp_path):
assert converted.exists()
@pytest.mark.parametrize(
    "image_export_mode,to_formats,expected",
    [
        (ImageRefMode.PLACEHOLDER, [OutputFormat.JSON], False),
        (ImageRefMode.EMBEDDED, [OutputFormat.TEXT, OutputFormat.DOCTAGS], False),
        (ImageRefMode.EMBEDDED, [OutputFormat.MARKDOWN], True),
        (
            ImageRefMode.EMBEDDED,
            [OutputFormat.TEXT, OutputFormat.MARKDOWN],
            True,
        ),
    ],
)
def test_should_generate_export_images(image_export_mode, to_formats, expected):
    """Check the image-generation decision for each mode/format combination."""
    decision = _should_generate_export_images(image_export_mode, to_formats)
    assert decision is expected
def test_image_export_policy_covers_all_output_formats():
    """Partition ``OutputFormat`` into image-exporting and non-image formats.

    NOTE(review): the image-exporting set is computed as the complement of
    the hard-coded non-image set, so both assertions hold by construction;
    the test does not consult the CLI's own policy. Consider checking it
    against ``_should_generate_export_images``.
    """
    every_format = set(OutputFormat)
    without_images = {
        OutputFormat.TEXT,
        OutputFormat.DOCTAGS,
        OutputFormat.VTT,
    }
    with_images = every_format - without_images

    assert not (with_images & without_images)
    assert (with_images | without_images) == every_format
def test_cli_audio_auto_detection(tmp_path):
"""Test that CLI automatically detects audio files and sets ASR pipeline."""
from docling.datamodel.base_models import FormatToExtensions, InputFormat
@@ -0,0 +1,179 @@
from collections.abc import Iterable
from types import SimpleNamespace
from typing import ClassVar, List, Type
import pytest
from docling_core.types.doc import (
DoclingDocument,
ImageRef,
PictureItem,
ProvenanceItem,
)
from docling_core.types.doc.base import BoundingBox, Size
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.datamodel.pipeline_options import (
PictureDescriptionBaseOptions,
PictureDescriptionVlmEngineOptions,
PipelineOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.pipeline.base_pipeline import BasePipeline
# Options subclass used only by the tests in this module; it adds nothing
# beyond the "test" kind discriminator.
class _TestOptions(PictureDescriptionBaseOptions):
    kind: ClassVar[str] = "test"
class _ConfiguredPictureDescriptionModel(PictureDescriptionBaseModel):
    """Minimal concrete picture-description model for tests.

    It is always enabled, uses no remote services and no artifacts path,
    and describes every image with the same fixed string.
    """

    def __init__(self, options: PictureDescriptionBaseOptions) -> None:
        base_kwargs = dict(
            enabled=True,
            enable_remote_services=False,
            artifacts_path=None,
            options=options,
            accelerator_options=AcceleratorOptions(),
        )
        super().__init__(**base_kwargs)

    @classmethod
    def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
        """Return the options class this model is configured with."""
        return _TestOptions

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        """Lazily yield one fixed description per input image."""
        yield from ("test description" for _ in images)
class _BatchRecordingPictureDescriptionModel(_ConfiguredPictureDescriptionModel):
    """Model variant that records the size of every batch it is called with."""

    def __init__(self, options: PictureDescriptionBaseOptions) -> None:
        super().__init__(options)
        self.batch_sizes: List[int] = []

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[ItemAndImageEnrichmentElement],
    ) -> Iterable[PictureItem]:
        """Record the batch length, then yield each picture item unchanged."""
        received = list(element_batch)
        self.batch_sizes.append(len(received))
        for entry in received:
            picture = entry.item
            assert isinstance(picture, PictureItem)
            yield picture
class _PictureDescriptionPipeline(BasePipeline):
    """Pass-through pipeline used to exercise document enrichment only."""

    def _build_document(self, conv_res):
        """Return the conversion result untouched."""
        return conv_res

    def _determine_status(self, conv_res):
        """Report whatever status the conversion result already carries."""
        return conv_res.status

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a default-constructed ``PipelineOptions``."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend) -> bool:
        """Accept every backend."""
        return True
def _make_picture_doc(*, count: int, embed_images: bool = True) -> DoclingDocument:
    """Build a document named "test" containing ``count`` pictures.

    With ``embed_images`` set, each picture carries an image reference made
    from a 20x20 red RGB image at 72 dpi; otherwise pictures are added with
    ``image=None``.
    """
    document = DoclingDocument(name="test")
    for _index in range(count):
        if embed_images:
            picture_image = ImageRef.from_pil(Image.new("RGB", (20, 20), "red"), dpi=72)
        else:
            picture_image = None
        document.add_picture(image=picture_image)
    return document
def test_picture_description_options_control_batch_size_and_scale() -> None:
    """Options ``batch_size`` and ``scale`` surface as model attributes."""
    configured_options = _TestOptions(batch_size=3, scale=1.5)
    described = _ConfiguredPictureDescriptionModel(configured_options)

    assert described.elements_batch_size == 3
    assert described.images_scale == 1.5
def test_picture_description_batch_size_controls_pipeline_chunking() -> None:
    """Five pictures with ``batch_size=2`` reach the model as batches 2, 2, 1."""
    test_pipeline = _PictureDescriptionPipeline(PipelineOptions())
    recorder = _BatchRecordingPictureDescriptionModel(_TestOptions(batch_size=2))
    test_pipeline.enrichment_pipe = [recorder]

    five_picture_doc = _make_picture_doc(count=5)
    fake_result = SimpleNamespace(
        document=five_picture_doc,
        timings={},
        status="success",
    )
    test_pipeline._enrich_document(fake_result)

    assert recorder.batch_sizes == [2, 2, 1]
def test_picture_description_scale_is_used_for_cropping() -> None:
    """The configured scale is passed to the page when cropping a picture."""

    class _PageSpy:
        """Page stand-in that records every ``get_image`` request."""

        def __init__(self):
            self.page_no = 1
            self.calls = []

        def get_image(self, *, scale, cropbox):
            self.calls.append(dict(scale=scale, cropbox=cropbox))
            return Image.new("RGB", (5, 5), "blue")

    described = _ConfiguredPictureDescriptionModel(_TestOptions(scale=1.5))

    document = DoclingDocument(name="test")
    document.add_page(page_no=1, size=Size(width=100, height=100))
    provenance = ProvenanceItem(
        page_no=1,
        bbox=BoundingBox(l=10, t=10, r=30, b=30),
        charspan=(0, 0),
    )
    picture_item = document.add_picture(prov=provenance)

    spy_page = _PageSpy()
    fake_result = SimpleNamespace(document=document, pages=[spy_page])

    prepared_element = described.prepare_element(
        conv_res=fake_result, element=picture_item
    )

    assert prepared_element is not None
    first_call = spy_page.calls[0]
    assert first_call["scale"] == 1.5
def test_picture_description_embedded_images_keep_original_size() -> None:
    """An embedded 20x20 image is prepared at its original size despite scale=1.5."""
    described = _ConfiguredPictureDescriptionModel(_TestOptions(scale=1.5))
    document = _make_picture_doc(count=1, embed_images=True)
    fake_result = SimpleNamespace(document=document, pages=[])

    prepared_element = described.prepare_element(
        conv_res=fake_result, element=document.pictures[0]
    )

    assert prepared_element is not None
    assert prepared_element.image.size == (20, 20)
def test_picture_description_batch_size_must_be_positive() -> None:
    """Constructing options with ``batch_size=0`` raises ``ValueError``."""
    invalid_kwargs = {"batch_size": 0}
    with pytest.raises(ValueError):
        _TestOptions(**invalid_kwargs)
def test_picture_description_scale_must_be_positive() -> None:
    """Constructing options with ``scale=0`` raises ``ValueError``."""
    invalid_kwargs = {"scale": 0}
    with pytest.raises(ValueError):
        _TestOptions(**invalid_kwargs)
def test_picture_description_preset_batch_size_must_be_positive() -> None:
    """A preset built with ``batch_size=0`` is rejected when the model is created.

    The preset itself is constructed outside the ``raises`` block, so only
    the model constructor is expected to raise.
    """
    preset_options = PictureDescriptionVlmEngineOptions.from_preset(
        "smolvlm", batch_size=0
    )

    with pytest.raises(ValueError, match="batch_size"):
        _ConfiguredPictureDescriptionModel(preset_options)
def test_picture_description_preset_scale_must_be_positive() -> None:
    """A preset built with ``scale=0`` is rejected when the model is created.

    The preset itself is constructed outside the ``raises`` block, so only
    the model constructor is expected to raise.
    """
    preset_options = PictureDescriptionVlmEngineOptions.from_preset(
        "smolvlm", scale=0
    )

    with pytest.raises(ValueError, match="scale"):
        _ConfiguredPictureDescriptionModel(preset_options)
+164
View File
@@ -0,0 +1,164 @@
from pathlib import Path
import pytest
import torch
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions
from docling.models.stages.picture_description.picture_description_vlm_model import (
PictureDescriptionVlmModel,
)
class _DummyBatch(dict):
def to(self, device):
self["device"] = device
return self
class _DummyProcessor:
def __init__(self) -> None:
self.template_calls = 0
self.process_calls = []
self.decode_calls = 0
def apply_chat_template(self, messages, add_generation_prompt=True):
self.template_calls += 1
self.messages = messages
self.add_generation_prompt = add_generation_prompt
return "formatted prompt"
def __call__(self, *, text, images, return_tensors, padding):
self.process_calls.append(
{
"text": text,
"images": images,
"return_tensors": return_tensors,
"padding": padding,
}
)
return _DummyBatch(
{
"input_ids": torch.tensor([[1, 2, 3], [1, 2, 3]]),
"attention_mask": torch.tensor([[1, 1, 1], [1, 1, 1]]),
}
)
def batch_decode(self, token_ids, *, skip_special_tokens):
self.decode_calls += 1
self.token_ids = token_ids
self.skip_special_tokens = skip_special_tokens
return ["first description", "second description"]
class _DummyModel:
def __init__(self) -> None:
self.generate_calls = []
def generate(self, **kwargs):
self.generate_calls.append(kwargs)
return torch.tensor(
[
[1, 2, 3, 10, 11],
[1, 2, 3, 20, 21],
]
)
class _DummyTokenizer:
def __init__(self) -> None:
self.padding_side = "left"
class _InitDummyProcessor:
    """Processor stand-in for the constructor test; it only owns a tokenizer."""

    def __init__(self) -> None:
        fresh_tokenizer = _DummyTokenizer()
        self.tokenizer = fresh_tokenizer
class _InitDummyModel:
def eval(self):
return self
def test_legacy_picture_description_vlm_batches_generation() -> None:
    """Two images are described with one template, one processor and one generate call.

    The model is created with ``__new__`` so its constructor is bypassed,
    and the dummy processor/model are attached by hand.
    """
    dummy_processor = _DummyProcessor()
    dummy_model = _DummyModel()

    vlm = PictureDescriptionVlmModel.__new__(PictureDescriptionVlmModel)
    vlm.processor = dummy_processor
    vlm.model = dummy_model
    vlm.device = "cpu"
    vlm.options = PictureDescriptionVlmOptions(
        repo_id="org/model",
        prompt="Describe this image in a few sentences.",
        generation_config={"max_new_tokens": 17, "do_sample": False},
    )

    images = [
        Image.new("RGB", (8, 8), "white"),
        Image.new("RGB", (10, 10), "black"),
    ]

    descriptions = list(vlm._annotate_images(images))

    assert descriptions == ["first description", "second description"]

    # The prompt is formatted once and reused for every image in the batch.
    assert vlm.processor.template_calls == 1
    assert len(vlm.processor.process_calls) == 1
    processor_call = vlm.processor.process_calls[0]
    assert processor_call["text"] == [
        "formatted prompt",
        "formatted prompt",
    ]
    assert processor_call["images"] == images
    assert processor_call["return_tensors"] == "pt"
    assert processor_call["padding"] is True

    assert vlm.processor.decode_calls == 1
    assert vlm.processor.skip_special_tokens is True

    assert len(vlm.model.generate_calls) == 1
    generate_kwargs = vlm.model.generate_calls[0]
    assert generate_kwargs["generation_config"].max_new_tokens == 17
def test_legacy_picture_description_vlm_skips_empty_batch() -> None:
    """An empty image list yields nothing and touches neither processor nor model."""
    vlm = PictureDescriptionVlmModel.__new__(PictureDescriptionVlmModel)
    vlm.processor = _DummyProcessor()
    vlm.model = _DummyModel()
    vlm.device = "cpu"
    vlm.options = PictureDescriptionVlmOptions(repo_id="org/model")

    descriptions = list(vlm._annotate_images([]))

    assert descriptions == []
    assert vlm.processor.template_calls == 0
    assert vlm.processor.process_calls == []
    assert vlm.processor.decode_calls == 0
    assert vlm.model.generate_calls == []
def test_legacy_picture_description_vlm_init_uses_configured_padding_side(
    monkeypatch,
) -> None:
    """The constructor applies ``padding_side`` from the options to the tokenizer.

    ``from_pretrained`` on both transformers loaders and ``torch.compile``
    are replaced with stubs; the test is skipped if ``transformers`` cannot
    be imported.
    """
    transformers = pytest.importorskip("transformers")

    stub_processor = _InitDummyProcessor()
    stub_model = _InitDummyModel()

    def _load_processor(*args, **kwargs):
        return stub_processor

    def _load_model(*args, **kwargs):
        return stub_model

    def _skip_compile(compiled_model):
        return compiled_model

    monkeypatch.setattr(transformers.AutoProcessor, "from_pretrained", _load_processor)
    monkeypatch.setattr(
        transformers.AutoModelForImageTextToText, "from_pretrained", _load_model
    )
    monkeypatch.setattr(torch, "compile", _skip_compile)

    vlm_options = PictureDescriptionVlmOptions(
        repo_id="org/model",
        padding_side="right",
    )
    picture_description_model = PictureDescriptionVlmModel(
        enabled=True,
        enable_remote_services=False,
        artifacts_path=Path("/tmp"),
        options=vlm_options,
        accelerator_options=AcceleratorOptions(device="cpu"),
    )

    assert stub_processor.tokenizer.padding_side == "right"
    assert picture_description_model.processor is stub_processor
    assert picture_description_model.model is stub_model
Generated
+71 -71
View File
@@ -669,7 +669,7 @@ name = "coloredlogs"
version = "15.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "humanfriendly" },
{ name = "humanfriendly", marker = "python_full_version < '3.14'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
wheels = [
@@ -1113,7 +1113,7 @@ wheels = [
[[package]]
name = "docling"
version = "2.80.0"
version = "2.81.0"
source = { editable = "." }
dependencies = [
{ name = "accelerate" },
@@ -1526,11 +1526,11 @@ name = "fastapi"
version = "0.135.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "annotated-doc" },
{ name = "pydantic" },
{ name = "starlette" },
{ name = "typing-extensions" },
{ name = "typing-inspection" },
{ name = "annotated-doc", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pydantic", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "starlette", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "typing-inspection", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c4/73/5903c4b13beae98618d64eb9870c3fac4f605523dd0312ca5c80dadbd5b9/fastapi-0.135.2.tar.gz", hash = "sha256:88a832095359755527b7f63bb4c6bc9edb8329a026189eed83d6c1afcf419d56", size = 395833, upload-time = "2026-03-23T14:12:41.697Z" }
wheels = [
@@ -1748,12 +1748,12 @@ name = "gliner"
version = "0.2.26"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub" },
{ name = "onnxruntime" },
{ name = "sentencepiece" },
{ name = "torch" },
{ name = "tqdm" },
{ name = "transformers" },
{ name = "huggingface-hub", marker = "python_full_version < '3.14'" },
{ name = "onnxruntime", marker = "python_full_version < '3.14'" },
{ name = "sentencepiece", marker = "python_full_version < '3.14'" },
{ name = "torch", marker = "python_full_version < '3.14'" },
{ name = "tqdm", marker = "python_full_version < '3.14'" },
{ name = "transformers", marker = "python_full_version < '3.14'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/49/18/e199cb97147c4a9260c75e4caf51e17be6ff969b0604a029c9c62810cbe0/gliner-0.2.26.tar.gz", hash = "sha256:6783be92b4b81caa878dcc4269ba37800207c37118d8ff9be028b93bddd6813d", size = 181224, upload-time = "2026-03-19T15:07:22.707Z" }
wheels = [
@@ -1918,7 +1918,7 @@ name = "humanfriendly"
version = "10.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyreadline3", marker = "sys_platform == 'win32'" },
{ name = "pyreadline3", marker = "python_full_version < '3.14' and sys_platform == 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
wheels = [
@@ -3059,14 +3059,14 @@ name = "mlx-lm"
version = "0.29.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "jinja2" },
{ name = "jinja2", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "mlx", marker = "sys_platform == 'darwin'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "protobuf" },
{ name = "pyyaml" },
{ name = "sentencepiece" },
{ name = "transformers" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" },
{ name = "protobuf", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyyaml", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "sentencepiece", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "transformers", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e3/62/f46e1355256a114808517947f8e83ad6be310c7288c551db0fa678f47923/mlx_lm-0.29.1.tar.gz", hash = "sha256:b99180d8f33d33a077b814e550bfb2d8a59ae003d668fd1f4b3fff62a381d34b", size = 232302, upload-time = "2025-12-16T16:58:27.959Z" }
wheels = [
@@ -3088,19 +3088,19 @@ name = "mlx-vlm"
version = "0.3.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "datasets" },
{ name = "fastapi" },
{ name = "mlx" },
{ name = "mlx-lm" },
{ name = "datasets", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "fastapi", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "mlx", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "mlx-lm", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "opencv-python" },
{ name = "pillow" },
{ name = "requests" },
{ name = "soundfile" },
{ name = "tqdm" },
{ name = "transformers" },
{ name = "uvicorn" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" },
{ name = "opencv-python", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pillow", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "requests", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "soundfile", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "tqdm", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "transformers", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "uvicorn", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/1d/98/6b3c2d1317a317d0df544fe9ab0ef4f233ea85c1e4ac2fe6af7289ea1ee5/mlx_vlm-0.3.9.tar.gz", hash = "sha256:ae5050d0b1a051a29099c3a65efdbf6874bb497e8465734ac1992b6b179135b4", size = 303350, upload-time = "2025-12-03T21:48:24.199Z" }
wheels = [
@@ -3112,17 +3112,17 @@ name = "mlx-whisper"
version = "0.4.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub" },
{ name = "mlx" },
{ name = "more-itertools" },
{ name = "numba" },
{ name = "huggingface-hub", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "mlx", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "more-itertools", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "numba", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" },
{ name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "tiktoken" },
{ name = "torch" },
{ name = "tqdm" },
{ name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" },
{ name = "tiktoken", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "torch", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "tqdm", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/22/b7/a35232812a2ccfffcb7614ba96a91338551a660a0e9815cee668bf5743f0/mlx_whisper-0.4.3-py3-none-any.whl", hash = "sha256:6b82b6597a994643a3e5496c7bc229a672e5ca308458455bfe276e76ae024489", size = 890544, upload-time = "2025-08-29T14:56:13.815Z" },
@@ -3917,9 +3917,9 @@ name = "ocrmac"
version = "1.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "pillow" },
{ name = "pyobjc-framework-vision" },
{ name = "click", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pillow", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-vision", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5e/07/3e15ab404f75875c5e48c47163300eb90b7409044d8711fc3aaf52503f2e/ocrmac-1.0.1.tar.gz", hash = "sha256:507fe5e4cbd67b2d03f6729a52bbc11f9d0b58241134eb958a5daafd4b9d93d9", size = 1454317, upload-time = "2026-01-08T16:44:26.412Z" }
wheels = [
@@ -3944,13 +3944,13 @@ name = "onnxruntime"
version = "1.23.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "coloredlogs" },
{ name = "flatbuffers" },
{ name = "coloredlogs", marker = "python_full_version < '3.14'" },
{ name = "flatbuffers", marker = "python_full_version < '3.14'" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "packaging" },
{ name = "protobuf" },
{ name = "sympy" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14'" },
{ name = "packaging", marker = "python_full_version < '3.14'" },
{ name = "protobuf", marker = "python_full_version < '3.14'" },
{ name = "sympy", marker = "python_full_version < '3.14'" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/35/d6/311b1afea060015b56c742f3531168c1644650767f27ef40062569960587/onnxruntime-1.23.2-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:a7730122afe186a784660f6ec5807138bf9d792fa1df76556b27307ea9ebcbe3", size = 17195934, upload-time = "2025-10-27T23:06:14.143Z" },
@@ -3982,13 +3982,13 @@ name = "onnxruntime-gpu"
version = "1.23.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "coloredlogs" },
{ name = "flatbuffers" },
{ name = "coloredlogs", marker = "(python_full_version < '3.14' and sys_platform != 'emscripten') or (python_full_version < '3.11' and sys_platform == 'emscripten')" },
{ name = "flatbuffers", marker = "(python_full_version < '3.14' and sys_platform != 'emscripten') or (python_full_version < '3.11' and sys_platform == 'emscripten')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "packaging" },
{ name = "protobuf" },
{ name = "sympy" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten'" },
{ name = "packaging", marker = "(python_full_version < '3.14' and sys_platform != 'emscripten') or (python_full_version < '3.11' and sys_platform == 'emscripten')" },
{ name = "protobuf", marker = "(python_full_version < '3.14' and sys_platform != 'emscripten') or (python_full_version < '3.11' and sys_platform == 'emscripten')" },
{ name = "sympy", marker = "(python_full_version < '3.14' and sys_platform != 'emscripten') or (python_full_version < '3.11' and sys_platform == 'emscripten')" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/cb/ae/39283748c68a96be4f5f8a9561e0e3ca92af1eae6c2b1c07fb1da5f65cd1/onnxruntime_gpu-1.23.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18de50c6c8eea50acc405ea13d299aec593e46478d7a22cd32cdbbdf7c42899d", size = 300525411, upload-time = "2025-10-22T16:56:08.415Z" },
@@ -5028,7 +5028,7 @@ name = "pyobjc-framework-cocoa"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" }
wheels = [
@@ -5046,8 +5046,8 @@ name = "pyobjc-framework-coreml"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/30/2d/baa9ea02cbb1c200683cb7273b69b4bee5070e86f2060b77e6a27c2a9d7e/pyobjc_framework_coreml-12.1.tar.gz", hash = "sha256:0d1a4216891a18775c9e0170d908714c18e4f53f9dc79fb0f5263b2aa81609ba", size = 40465, upload-time = "2025-11-14T10:14:02.265Z" }
wheels = [
@@ -5065,8 +5065,8 @@ name = "pyobjc-framework-quartz"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" }
wheels = [
@@ -5084,10 +5084,10 @@ name = "pyobjc-framework-vision"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-framework-coreml" },
{ name = "pyobjc-framework-quartz" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c2/5a/08bb3e278f870443d226c141af14205ff41c0274da1e053b72b11dfc9fb2/pyobjc_framework_vision-12.1.tar.gz", hash = "sha256:a30959100e85dcede3a786c544e621ad6eb65ff6abf85721f805822b8c5fe9b0", size = 59538, upload-time = "2025-11-14T10:23:21.979Z" }
wheels = [
@@ -6528,9 +6528,9 @@ name = "soundfile"
version = "0.13.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cffi" },
{ name = "cffi", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "numpy", version = "2.4.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
wheels = [
@@ -6566,8 +6566,8 @@ name = "starlette"
version = "1.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio" },
{ name = "typing-extensions", marker = "python_full_version < '3.13'" },
{ name = "anyio", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/81/69/17425771797c36cded50b7fe44e850315d039f28b15901ab44839e70b593/starlette-1.0.0.tar.gz", hash = "sha256:6a4beaf1f81bb472fd19ea9b918b50dc3a77a6f2e190a12954b25e6ed5eea149", size = 2655289, upload-time = "2026-03-22T18:29:46.779Z" }
wheels = [
@@ -7369,8 +7369,8 @@ name = "uvicorn"
version = "0.42.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "h11" },
{ name = "click", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "h11", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "typing-extensions", marker = "python_full_version < '3.11'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e3/ad/4a96c425be6fb67e0621e62d86c402b4a17ab2be7f7c055d9bd2f638b9e2/uvicorn-0.42.0.tar.gz", hash = "sha256:9b1f190ce15a2dd22e7758651d9b6d12df09a13d51ba5bf4fc33c383a48e1775", size = 85393, upload-time = "2026-03-16T06:19:50.077Z" }