mirror of
https://github.com/docling-project/docling-core.git
synced 2026-05-17 13:10:44 +00:00
c8f3c01a61
* refactor: move WebVTT data model from docling Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(webvtt): deal with HTML entities in cue text spans Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): support more WebVTT models Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(DoclingDocument): create a new provenance model for media file types Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): make WebVTTTimestamp public Since WebVTTTimestamp is used in DoclingDocument, the class should be public. Strengthen validation of cue language start tag annotation. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): set languages to a list of strings in ProvenanceTrack Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(webvtt): add test for ProvenanceTrack Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): make all WebVTT classes public for reuse Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(webvtt): preserve newlines as WebVTTLineTerminator Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): set ProvenanceTrack time fields as float Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(webvtt): ensure start time offsets are in sequence Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(webvtt): improve regex to remove note,region,style blocks Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(webvtt): parse the WebVTT file title Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(webvtt): rebase to latest changes in idoctags Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * feat(webvtt): add WebVTT serializer Add a DoclingDocument serializer to WebVTT format. Improve WebVTT data model. 
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * fix(webvtt): add 'text/vtt' as extra mimetype Add 'text/vtt' as extra MIME type to support WebVTT serialization, since it is not supported by 'mimetypes' with python < 3.11 Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): roll back DocItem.prov as list of ProvenanceItem Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * tests(webvtt): fix test with STYLE and NOTE blocks Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * style(webvtt): apply X | Y annotation instead of Optional, Union Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): simplify TrackProvenance model with tags Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): align class and field names to new 'source' type Classes and fields that are related to the new source type should align with their names. The term 'provenance' will identify the legacy implementation. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(DoclingDocument): drop the validation on field assignment Drop the validation on field assignment in NodeItem objects. Add the 'source' argument in the convenience function 'add_text' to create TextItem with track source data. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * refactor(webvtt): drop cue span classes, 'lang' and 'c' tags Drop WebVTT formatting features not covered by Docling across formats. Only 'u', 'b', 'i', and 'v' are supported and without classes. Make 'v' tag explicit as 'voice' feature in SourceTrack class. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
80 lines
2.1 KiB
Python
80 lines
2.1 KiB
Python
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
from docling_core.types.doc import DocItemLabel, DoclingDocument, TrackSource
|
|
from docling_core.types.legacy_doc.base import Prov, S3Reference
|
|
|
|
|
|
def test_s3_reference():
    """Check S3Reference construction, dumping, and its required field.

    The model is built from the ``__ref_s3_data`` keyword and must dump to
    the same mapping both with and without ``by_alias``. Constructing it
    with no arguments must raise a ``ValidationError`` mentioning
    "required".
    """
    expected = {"__ref_s3_data": "#/s3_data/figures/0"}
    reference = S3Reference(**expected)

    # Both dump modes are expected to produce the identical mapping.
    plain_dump = reference.model_dump()
    assert plain_dump == expected
    aliased_dump = reference.model_dump(by_alias=True)
    assert aliased_dump == expected

    with pytest.raises(ValidationError, match="required"):
        S3Reference()
|
|
|
|
|
|
def test_prov():
    """Check that Prov accepts a well-formed payload and rejects bad spans.

    A payload with a four-value ``bbox``, a ``page`` and a two-item ``span``
    must validate. Replacing ``span`` with a non-integer entry or with a
    single-item list must raise a ``ValidationError`` with the matching
    message.
    """
    payload = {
        "bbox": [
            48.19645328521729,
            644.2883926391602,
            563.6185592651367,
            737.4546043395997,
        ],
        "page": 2,
        "span": [0, 0],
    }

    assert Prov(**payload)

    # Each entry pairs a malformed span with the error text it should trigger.
    malformed_spans = (
        (["foo", 0], "valid integer"),
        ([0], "at least 2 items"),
    )
    for bad_span, message in malformed_spans:
        # Set up the bad value first so only the constructor call is guarded.
        payload["span"] = bad_span
        with pytest.raises(ValidationError, match=message):
            Prov(**payload)
|
|
|
|
|
|
def test_track_source():
    """Check TrackSource fields, validation errors, and use on a text item.

    Covers three things: a valid instance exposes the values it was built
    with; incomplete or ill-typed arguments raise ``ValidationError`` with
    the expected message; and a track passed as ``source`` to
    ``DoclingDocument.add_text`` ends up as the single entry of the item's
    ``source``.
    """
    field_values = {
        "start_time": 11.0,
        "end_time": 12.0,
        "identifier": "test",
        "voice": "Mary",
    }
    track = TrackSource(**field_values)

    assert track
    for field_name, field_value in field_values.items():
        assert getattr(track, field_name) == field_value

    # Each entry pairs constructor arguments with the expected error text.
    invalid_cases = (
        ({"start_time": 11.0}, "end_time"),
        (
            {"start_time": 11.0, "end_time": 12.0, "voice": ["Mary"]},
            "should be a valid string",
        ),
        (
            {"start_time": 11.0, "end_time": 11.0},
            "must be greater than start",
        ),
    )
    for bad_kwargs, message in invalid_cases:
        with pytest.raises(ValidationError, match=message):
            TrackSource(**bad_kwargs)

    # Attach the valid track to a text item and read it back.
    document = DoclingDocument(name="Unknown")
    text_item = document.add_text(
        text="Hello world",
        label=DocItemLabel.TEXT,
        source=track,
    )
    attached = text_item.source
    assert attached
    assert len(attached) == 1
    assert attached[0] == track
|