Files
Christoph Auer 8fb3a169f6 fix: consistenty and perf improvements (#171)
* Add validation-only mode to cvat_evaluation_pipeline and make output consistent

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Revise CVAT XML parsing for efficiency

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Revise CVAT evaluation pipeline for efficiency

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: use LiveText OCR and refine picture validation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: keep table containers in level‑1 order after conflict resolution

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: respect inline reading order for embedded headers

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: skip cvat samples with oversized ocr pages

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: tighten reading-order validation resilience

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: convert grouped elements skipped by reading order

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: keep conflicted reading-order containers independent

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: skip container merges during cvat conversion

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: guard merge paths that bridge separate containers

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: reconcile doc tree after CVAT rich cell rewiring

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* feat: track OCR usage during CVAT conversion

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* feat: Add tool to upload CVAT-generated datasets to HF

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-11-10 11:48:32 +01:00

459 lines
12 KiB
Python

HTML_DEFAULT_HEAD: str = r"""<head>
<link rel="icon" type="image/png"
href="https://ds4sd.github.io/docling/assets/logo.png"/>
<meta charset="UTF-8">
<title>
Powered by Docling
</title>
<style>
html {
background-color: LightGray;
}
body {
margin: 0 auto;
width:800px;
padding: 30px;
background-color: White;
font-family: Arial, sans-serif;
box-shadow: 10px 10px 10px grey;
}
figure{
display: block;
width: 100%;
margin: 0px;
margin-top: 10px;
margin-bottom: 10px;
}
img {
display: block;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
max-width: 640px;
max-height: 640px;
}
table {
min-width:500px;
background-color: White;
border-collapse: collapse;
cell-padding: 5px;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
}
th, td {
border: 1px solid black;
padding: 8px;
}
th {
font-weight: bold;
}
table tr:nth-child(even) td{
background-color: LightGray;
}
</style>
</head>"""
HTML_DEFAULT_HEAD_FOR_COMP: str = r"""<head>
<link rel="icon" type="image/png"
href="https://ds4sd.github.io/docling/assets/logo.png"/>
<meta charset="UTF-8">
<title>
Powered by Docling
</title>
<style>
html {
background-color: LightGray;
}
body {
margin: 0 auto;
padding: 10px;
background-color: White;
font-family: Arial, sans-serif;
box-shadow: 10px 10px 10px grey;
}
figure{
display: block;
width: 100%;
margin: 0px;
margin-top: 10px;
margin-bottom: 10px;
}
img {
display: block;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
max-width: 640px;
max-height: 640px;
}
table {
min-width:500px;
background-color: White;
border-collapse: collapse;
cell-padding: 5px;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
}
th, td {
border: 1px solid black;
padding: 8px;
}
th {
font-weight: bold;
}
table tr:nth-child(even) td{
background-color: LightGray;
}
</style>
</head>"""
HTML_DEFAULT_HEAD_FOR_COMP_v2: str = r"""<head>
<link rel="icon" type="image/png"
href="https://ds4sd.github.io/docling/assets/logo.png"/>
<meta charset="UTF-8">
<title>
Powered by Docling
</title>
<style>
html {
background-color: LightGray;
}
body {
margin: 0 auto;
padding: 10px;
background-color: White;
font-family: Arial, sans-serif;
box-shadow: 10px 10px 10px grey;
font-size: 0.9em; /* Smaller text */
max-width: 100%;
}
td {
width: 25%;
}
.page td {
width:auto;
}
/* Create a flex container for columns */
.container {
display: flex;
flex-wrap: nowrap;
width: 100%;
gap: 10px;
}
/* Each column takes exactly 25% width */
.column {
flex: 0 0 25%;
padding: 10px;
box-sizing: border-box;
overflow-x: auto; /* Enable horizontal scrolling within each column */
}
figure {
display: block;
width: 100%;
margin: 0;
margin-top: 10px;
margin-bottom: 10px;
overflow-x: auto; /* Horizontal scrolling for figures */
}
img {
display: block;
margin: auto;
margin-top: 10px;
margin-bottom: 10px;
max-width: 100%; /* Images will be responsive within their container */
min-width: 300px;
height: auto;
}
.page img {
min-width: auto;
}
.table-container {
width: 100%;
overflow-x: auto; /* Horizontal scrolling for tables */
}
table {
min-width: 500px;
background-color: White;
border-collapse: collapse;
margin: 10px 0;
width: 100%;
}
th, td {
border: 1px solid black;
padding: 8px;
text-align: left;
}
th {
font-weight: bold;
background-color: #f2f2f2;
}
table tr:nth-child(even) td {
background-color: LightGray;
}
/* Media query for responsive behavior */
@media (max-width: 768px) {
.container {
flex-direction: column;
}
.column {
flex: 0 0 100%;
margin-bottom: 15px;
}
}
</style>
</head>"""
HTML_COMPARISON_PAGE_v1 = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Side by Side Layout</title>
<style>
body {
display: flex;
justify-content: space-around; /* Adjust spacing between items */
align-items: flex-start; /* Align items at the top */
margin: 0;
padding: 0;
font-family: Arial, sans-serif;
}
iframe, img {
width: 30%; /* Adjust the width of each item */
height: 1024; /* Set a fixed height */
border: 1px solid #ccc; /* Optional: Add borders */
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
}
iframe {
overflow: auto; /* Add scrolling if the content is larger */
}
</style>
</head>
<body>
<!-- Image -->
<img src="data:image/png;base64,BASE64PAGE" alt="Page Image">
<!-- First HTML page -->
<iframe srcdoc='TRUEDOC' title="Ground Truth"></iframe>
<!-- Second HTML page -->
<iframe srcdoc='PREDDOC' title="Docling Result"></iframe>
</body>
</html>
"""
HTML_COMPARISON_PAGE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Self-Contained Page with Titles</title>
<style>
body {
display: flex;
justify-content: space-around; /* Adjust spacing between items */
align-items: flex-start; /* Align items at the top */
margin: 0;
padding: 20px;
font-family: Arial, sans-serif;
background-color: #f9f9f9;
height: 100vh; /* Full viewport height */
overflow: hidden; /* Prevent scrollbars from appearing unnecessarily */
}
.container {
display: flex;
flex-direction: column;
width: 30%; /* Adjust the width of each item */
height: 100%; /* Adjust height to fill parent container */
border: 1px solid #ccc; /* Optional: Add borders */
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
background-color: #fff; /* Optional: Add background */
overflow: hidden; /* Prevent content overflow */
}
.title {
text-align: center;
font-weight: bold;
padding: 10px;
background-color: #eee;
border-bottom: 1px solid #ccc;
}
iframe {
flex-grow: 1; /* Fill the remaining space */
border: none; /* Remove iframe border */
}
img {
width: 100%;
height: auto; /* Keep the aspect ratio of the image */
}
</style>
</head>
<body>
<!-- Embedded Image -->
<div class="container">
<div class="title">Page Image</div>
<img src="data:image/png;base64,BASE64PAGE" alt="Example Image">
</div>
<!-- First HTML page -->
<div class="container">
<div class="title">GroundTruth</div>
<iframe srcdoc='TRUEDOC' title="Page 1"></iframe>
</div>
<!-- Second HTML page -->
<div class="container">
<div class="title">Prediction</div>
<iframe srcdoc='PREDDOC' title="Page 2"></iframe>
</div>
</body>
</html>
"""
HTML_INSPECTION = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Self-Contained Page with Titles</title>
<style>
body {
display: flex;
justify-content: space-around; /* Adjust spacing between items */
align-items: flex-start; /* Align items at the top */
margin: 0;
padding: 20px;
font-family: Arial, sans-serif;
background-color: #f9f9f9;
height: 100vh; /* Full viewport height */
overflow: hidden; /* Prevent scrollbars from appearing unnecessarily */
}
.container {
display: flex;
flex-direction: column;
width: 50%; /* Adjust the width of each item */
height: 100%; /* Adjust height to fill parent container */
border: 1px solid #ccc; /* Optional: Add borders */
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
background-color: #fff; /* Optional: Add background */
overflow: auto; /* Prevent content overflow */
}
.title {
text-align: center;
font-weight: bold;
padding: 10px;
background-color: #eee;
border-bottom: 1px solid #ccc;
}
.image-wrapper {
padding: 10px; /* Add some space around each image */
text-align: center;
}
.image-wrapper img {
width: 100%; /* Scale images to container width */
height: auto; /* Maintain aspect ratio */
border: 1px solid #ccc;
box-shadow: 1px 1px 3px rgba(0, 0, 0, 0.1);
}
iframe {
flex-grow: 1; /* Fill the remaining space */
border: none; /* Remove iframe border */
}
img {
width: 100%;
height: auto; /* Keep the aspect ratio of the image */
}
</style>
</head>
<body>
<!-- Embedded Image -->
<div class="container">
<div class="title">Page Images</div>
PAGE_IMAGES
</div>
<!-- Second HTML page -->
<div class="container">
<div class="title">Prediction</div>
<iframe srcdoc='PREDDOC' title="Page 2"></iframe>
</div>
</body>
</html>
"""
HTML_COMPARISON_PAGE_WITH_CLUSTERS = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Self-Contained Page with Titles</title>
<style>
body {
display: flex;
justify-content: space-around; /* Adjust spacing between items */
align-items: flex-start; /* Align items at the top */
margin: 0;
padding: 20px;
font-family: Arial, sans-serif;
background-color: #f9f9f9;
height: 100vh; /* Full viewport height */
overflow: hidden; /* Prevent scrollbars from appearing unnecessarily */
}
.container {
display: flex;
flex-direction: column;
width: 25%; /* Adjust the width of each item */
height: 50%; /* Adjust height to fill parent container */
border: 1px solid #ccc; /* Optional: Add borders */
box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.1); /* Optional: Add shadow */
background-color: #fff; /* Optional: Add background */
overflow: hidden; /* Prevent content overflow */
}
.title {
text-align: center;
font-weight: bold;
padding: 10px;
background-color: #eee;
border-bottom: 1px solid #ccc;
}
iframe {
flex-grow: 1; /* Fill the remaining space */
border: none; /* Remove iframe border */
}
img {
width: 100%;
height: auto; /* Keep the aspect ratio of the image */
}
</style>
</head>
<body>
<!-- Embedded Image -->
<div class="container">
<div class="title">Page Image Groundtruth</div>
<img src="data:image/png;base64,BASE64TRUEPAGE" alt="Example Image">
</div>
<!-- First HTML page -->
<div class="container">
<div class="title">GroundTruth</div>
<iframe srcdoc='TRUEDOC' title="Page 1"></iframe>
</div>
<!-- Embedded Image -->
<div class="container">
<div class="title">Page Image Prediction</div>
<img src="data:image/png;base64,BASE64PREDPAGE" alt="Example Image">
</div>
<!-- Second HTML page -->
<div class="container">
<div class="title">Prediction</div>
<iframe srcdoc='PREDDOC' title="Page 2"></iframe>
</div>
</body>
</html>
"""