Files
docling-core/docs/Record.json
Cesar Berrospi Ramis 266e7fc603 build: set pydantic version to 2.10.3 (#93)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2024-12-04 14:26:22 +01:00

970 lines
26 KiB
JSON

{
"$defs": {
"Acquisition": {
"additionalProperties": false,
"description": "Information on how the data was obtained.",
"properties": {
"type": {
"description": "The method to obtain the data.",
"enum": [
"API",
"FTP",
"Download",
"Link",
"Web scraping/Crawling",
"Other"
],
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"date": {
"anyOf": [
{
"format": "date-time",
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "A string representation of the acquisition datetime in ISO 8601 format.",
"title": "Date"
},
"link": {
"anyOf": [
{
"format": "uri",
"minLength": 1,
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Link to the data source of this document.",
"title": "Link",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"size": {
"anyOf": [
{
"minimum": 0,
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size in bytes of the raw document from the data source.",
"title": "Size",
"x-es-type": "long"
}
},
"required": [
"type"
],
"title": "Acquisition",
"type": "object"
},
"Attribute": {
"additionalProperties": false,
"description": "Attribute model that describes a list of characteristics.",
"properties": {
"conf": {
"description": "The confidence level of this attribute's characteristics.",
"maximum": 1.0,
"minimum": 0.0,
"title": "Confidence",
"type": "number",
"x-es-type": "float"
},
"prov": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/ProvenanceItem"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "The sources of this attribute's characteristics.",
"title": "Provenance"
},
"predicates": {
"description": "A list of characteristics (type, value, and name).",
"items": {
"$ref": "#/$defs/Predicate"
},
"title": "Predicates",
"type": "array"
}
},
"required": [
"conf",
"predicates"
],
"title": "Attribute",
"type": "object"
},
"BooleanValue": {
"additionalProperties": false,
"description": "Model for boolean values.",
"properties": {
"value": {
"title": "Value",
"type": "boolean",
"x-es-type": "boolean"
}
},
"required": [
"value"
],
"title": "BooleanValue",
"type": "object"
},
"CollectionRecordInfo": {
"additionalProperties": false,
"description": "Information of a collection of type Record.",
"properties": {
"name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the collection.",
"title": "Name",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"type": {
"const": "Record",
"description": "The collection type.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"version": {
"anyOf": [
{
"pattern": "^(?P<major>0|[1-9]\\d*)\\.(?P<minor>0|[1-9]\\d*)\\.(?P<patch>0|[1-9]\\d*)(?:-(?P<prerelease>(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "The version of this collection model.",
"title": "Version",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"alias": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "A list of tags (aliases) for the collection.",
"title": "Alias",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"type"
],
"title": "CollectionRecordInfo",
"type": "object"
},
"DatetimeValue": {
"additionalProperties": false,
"description": "Model for datetime values.",
"properties": {
"value": {
"format": "date-time",
"title": "Value",
"type": "string"
}
},
"required": [
"value"
],
"title": "DatetimeValue",
"type": "object"
},
"FileInfoObject": {
"description": "Filing information for any data object to be stored in a Docling database.",
"properties": {
"filename": {
"description": "The name of a persistent object that created this data object",
"title": "Filename",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"filename-prov": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "The provenance of this data object, e.g. an archive file, a URL, or any other repository.",
"title": "Filename-Prov",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"document-hash": {
"description": "A unique identifier of this data object within a collection of a Docling database",
"title": "Document-Hash",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"filename",
"document-hash"
],
"title": "FileInfoObject",
"type": "object"
},
"GeopointValue": {
"additionalProperties": false,
"description": "A representation of a geopoint (longitude and latitude coordinates).",
"properties": {
"value": {
"items": {
"type": "number"
},
"maxItems": 2,
"minItems": 2,
"title": "Value",
"type": "array",
"x-es-type": "geo_point"
},
"conf": {
"anyOf": [
{
"maximum": 1.0,
"minimum": 0.0,
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"title": "Conf",
"x-es-type": "float"
}
},
"required": [
"value"
],
"title": "GeopointValue",
"type": "object"
},
"Identifier": {
"additionalProperties": false,
"description": "Unique identifier of a Docling data object.",
"properties": {
"type": {
"description": "A string representing a collection or database that contains this data object.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"value": {
"description": "The identifier value of the data object within a collection or database.",
"title": "Value",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"_name": {
"description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).",
"pattern": "^.+#.+$",
"title": "_Name",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"type",
"value",
"_name"
],
"title": "Identifier",
"type": "object"
},
"Log": {
"additionalProperties": false,
"description": "Log entry to describe an ETL task on a document.",
"properties": {
"task": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "An identifier of this task. It may be used to distinguish this task from other tasks of the same agent and type.",
"title": "Task",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"agent": {
"description": "The Docling agent that performed the task, e.g., CCS or CXS.",
"title": "Agent",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"type": {
"description": "A task category.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"comment": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "A description of the task or any comments in natural language.",
"title": "Comment"
},
"date": {
"description": "A string representation of the task execution datetime in ISO 8601 format.",
"format": "date-time",
"title": "Date",
"type": "string"
}
},
"required": [
"agent",
"type",
"date"
],
"title": "Log",
"type": "object"
},
"NominalValue": {
"additionalProperties": false,
"description": "Model for nominal (categorical) values.",
"properties": {
"value": {
"title": "Value",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"value"
],
"title": "NominalValue",
"type": "object"
},
"NumericalValue": {
"additionalProperties": false,
"description": "Model for numerical values.",
"properties": {
"min": {
"title": "Min",
"type": "number",
"x-es-type": "float"
},
"max": {
"title": "Max",
"type": "number",
"x-es-type": "float"
},
"val": {
"title": "Val",
"type": "number",
"x-es-type": "float"
},
"err": {
"title": "Err",
"type": "number",
"x-es-type": "float"
},
"unit": {
"title": "Unit",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"min",
"max",
"val",
"err",
"unit"
],
"title": "NumericalValue",
"type": "object"
},
"Predicate": {
"additionalProperties": false,
"description": "Model for a predicate.",
"properties": {
"key": {
"$ref": "#/$defs/PredicateKey"
},
"value": {
"$ref": "#/$defs/PredicateValue"
},
"numerical_value": {
"anyOf": [
{
"$ref": "#/$defs/NumericalValue"
},
{
"type": "null"
}
],
"default": null
},
"numerical_value_si": {
"anyOf": [
{
"$ref": "#/$defs/NumericalValue"
},
{
"type": "null"
}
],
"default": null
},
"nominal_value": {
"anyOf": [
{
"$ref": "#/$defs/NominalValue"
},
{
"type": "null"
}
],
"default": null
},
"text_value": {
"anyOf": [
{
"$ref": "#/$defs/TextValue"
},
{
"type": "null"
}
],
"default": null
},
"boolean_value": {
"anyOf": [
{
"$ref": "#/$defs/BooleanValue"
},
{
"type": "null"
}
],
"default": null
},
"datetime_value": {
"anyOf": [
{
"$ref": "#/$defs/DatetimeValue"
},
{
"type": "null"
}
],
"default": null
},
"geopoint_value": {
"anyOf": [
{
"$ref": "#/$defs/GeopointValue"
},
{
"type": "null"
}
],
"default": null
}
},
"required": [
"key",
"value"
],
"title": "Predicate",
"type": "object"
},
"PredicateKey": {
"additionalProperties": false,
"description": "Model for the key (unique identifier) of a predicate.",
"properties": {
"name": {
"description": "Name of the predicate key.",
"title": "Name",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"type": {
"description": "Type of predicate key.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"name",
"type"
],
"title": "PredicateKey",
"type": "object"
},
"PredicateValue": {
"additionalProperties": false,
"description": "Model for the value of a predicate.",
"properties": {
"name": {
"description": "Name of the predicate value (actual value).",
"title": "Name",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"type": {
"description": "Type of predicate value.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"name",
"type"
],
"title": "PredicateValue",
"type": "object"
},
"ProvenanceItem": {
"additionalProperties": false,
"description": "A representation of an object provenance.",
"properties": {
"type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Any string representing the type of provenance, e.g. `sentence`, `table`, or `doi`.",
"title": "The provenance type",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"text": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "A text representing the evidence of the provenance, e.g. the sentence text or the content of a table cell",
"title": "Evidence of the provenance",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"reference": {
"anyOf": [
{
"$ref": "#/$defs/Identifier"
},
{
"type": "null"
}
],
"default": null,
"description": "Reference to another object, e.g. record, statement, URL, or any other object that identifies the provenance",
"title": "Reference to the provenance object"
},
"path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "A path that locates the evidence within the provenance object identified by the `reference` field using a JSON pointer notation, e.g., `#/main-text/5` to locate the `main-text` paragraph at index 5",
"title": "The location of the provenance within the referenced object",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"span": {
"anyOf": [
{
"items": {
"type": "integer"
},
"maxItems": 2,
"minItems": 2,
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Location of the item in the text/table referenced by the `path`, e.g., `[34, 67]`",
"title": "The location of the item in the text/table"
}
},
"title": "ProvenanceItem",
"type": "object"
},
"RecordDescription": {
"description": "Additional record metadata, including optional collection-specific fields.",
"properties": {
"logs": {
"description": "Logs that describe the ETL tasks applied to this record.",
"items": {
"$ref": "#/$defs/Log"
},
"title": "Logs",
"type": "array"
},
"publication_date": {
"anyOf": [
{
"format": "date-time",
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "The date that best represents the last publication time of a record.",
"title": "Publication date"
},
"collection": {
"anyOf": [
{
"$ref": "#/$defs/CollectionRecordInfo"
},
{
"type": "null"
}
],
"default": null,
"description": "The collection information of this record."
},
"acquisition": {
"anyOf": [
{
"$ref": "#/$defs/Acquisition"
},
{
"type": "null"
}
],
"default": null,
"description": "Information on how the document was obtained, for data governance purposes."
}
},
"required": [
"logs"
],
"title": "RecordDescription",
"type": "object"
},
"S3Reference": {
"description": "References an s3 resource.",
"properties": {
"__ref_s3_data": {
"examples": [
"#/_s3_data/figures/0"
],
"title": "Ref S3 Data",
"type": "string"
}
},
"required": [
"__ref_s3_data"
],
"title": "S3Reference",
"type": "object"
},
"Subject": {
"additionalProperties": false,
"description": "A representation of a subject.",
"properties": {
"display_name": {
"description": "Name of the subject in natural language. It can be used for end-user applications to display a human-readable name. For instance, `B(2) Mg(1)` for `MgB2` or `International Business Machines` for `IBM`",
"title": "Display Name",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"display_image": {
"anyOf": [
{
"$ref": "#/$defs/S3Reference"
},
{
"type": "null"
}
],
"default": null,
"description": "Image representing the subject. It can be used for end-user applications. For example, the chemical structure drawing of a compound or the eight bar IBM logo for IBM.",
"title": "Display Image",
"x-es-suppress": true
},
"type": {
"description": "Main subject type. For instance, `material`, `material-class`, `material-device`, `company`, or `person`.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"names": {
"description": "List of given names for this subject. They may not be unique across different subjects.",
"items": {
"$ref": "#/$defs/SubjectNameIdentifier"
},
"title": "Names",
"type": "array"
},
"identifiers": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Identifier"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of unique identifiers in database. For instance, the `PubChem ID` of a record in the PubChem database.",
"title": "Identifiers"
},
"labels": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of labels or categories for this subject.",
"title": "Labels",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"display_name",
"type",
"names"
],
"title": "Subject",
"type": "object"
},
"SubjectNameIdentifier": {
"additionalProperties": false,
"description": "Identifier of subject names.",
"properties": {
"type": {
"description": "A string representing a collection or database that contains this data object.",
"title": "Type",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"value": {
"description": "The identifier value of the data object within a collection or database.",
"title": "Value",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
},
"_name": {
"description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).",
"pattern": "^.+#.+$",
"title": "_Name",
"type": "string",
"x-es-ignore_above": 8191,
"x-es-type": "keyword"
}
},
"required": [
"type",
"value",
"_name"
],
"title": "SubjectNameIdentifier",
"type": "object"
},
"TextValue": {
"additionalProperties": false,
"description": "Model for textual values.",
"properties": {
"value": {
"title": "Value",
"type": "string",
"x-es-type": "text"
}
},
"required": [
"value"
],
"title": "TextValue",
"type": "object"
}
},
"description": "A representation of a structured record in a database.",
"properties": {
"conf": {
"description": "This value represents a score to the data item. Items originating from databases will typically have a score 1.0, while items resulting from an NLP model may have a value between 0.0 and 1.0.",
"maximum": 1.0,
"minimum": 0.0,
"title": "The confidence of the evidence",
"type": "number",
"x-es-type": "float"
},
"prov": {
"description": "A list of provenance items.",
"items": {
"$ref": "#/$defs/ProvenanceItem"
},
"title": "Provenance",
"type": "array"
},
"file-info": {
"$ref": "#/$defs/FileInfoObject"
},
"description": {
"$ref": "#/$defs/RecordDescription"
},
"subject": {
"$ref": "#/$defs/Subject"
},
"attributes": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Attribute"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"title": "Attributes"
},
"_name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "A short description or summary of the record.",
"title": "Name",
"x-es-type": "text"
},
"identifiers": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Identifier"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "A list of unique identifiers of this record in a database.",
"title": "Identifiers"
}
},
"required": [
"conf",
"prov",
"file-info",
"description",
"subject"
],
"title": "Record",
"type": "object"
}