fix: locale-independent float parsing (fixes docling#1455) (#243)

Signed-off-by: Eric Van Boxsom <14831976+evb87-tech@users.noreply.github.com>
2026-05-17 13:10:49 +00:00 · 2026-05-06 19:11:31 +02:00
parent 1ef8e22aca
commit e56632d962
12 changed files with 773 additions and 17 deletions
@@ -0,0 +1,42 @@
+"""docling-parse: Extract text with coordinates from programmatic PDFs."""
+
+import locale as _locale
+import logging as _logging
+
+_log = _logging.getLogger(__name__)
+
+
+def _ensure_safe_numeric_locale() -> None:
+    """Ensure LC_NUMERIC uses a period as decimal separator.
+
+    PDF coordinate parsing (both in QPDF's C library and in this
+    package's C++ layer) relies on '.' as the decimal separator.
+    Locales that use ',' (French, German, Portuguese, etc.) silently
+    corrupt every floating-point value extracted from a PDF.
+
+    This function is called at import time as a defence-in-depth
+    measure.  The primary fix is in the C++ layer (from_chars), but
+    this protects against any locale-sensitive path we may have missed
+    — including QPDF's own atof() calls that we cannot patch.
+
+    See: https://github.com/docling-project/docling/issues/1455
+    """
+    try:
+        current = _locale.getlocale(_locale.LC_NUMERIC)
+        # setlocale returns the *actual* locale string; checking the
+        # decimal point via localeconv is the most reliable test.
+        conv = _locale.localeconv()
+        if conv.get("decimal_point", ".") != ".":
+            _locale.setlocale(_locale.LC_NUMERIC, "C")
+            _log.info(
+                "docling-parse: overrode LC_NUMERIC from %s to 'C' "
+                "to prevent PDF coordinate corruption",
+                current,
+            )
+    except (_locale.Error, ValueError):
+        # If we can't query or set the locale, the C++ from_chars
+        # layer will still protect us.
+        pass
+
+
+_ensure_safe_numeric_locale()
@@ -282,7 +282,7 @@ namespace pdflib
 	QPDFObjectHandle item = qpdf_arr.getArrayItem(d);
 	if(item.isNumber())
 	  {
-	    result[d] = item.getNumericValue();
+	    result[d] = utils::numeric::locale_safe_numeric_value(item);
 	  }
 	else
 	  {
@@ -90,6 +90,7 @@ namespace pdflib
    void decode_annots_from_qpdf();
    void extract_page_items_from_annots(QPDFObjectHandle annots);

+    void add_page_cell_from_annot(QPDFObjectHandle annot);
    void add_page_hyperlink_from_annot(QPDFObjectHandle annot);
    void add_page_widget_from_annot(QPDFObjectHandle annot);

@@ -785,6 +786,73 @@ namespace pdflib
      }
  }

+  void pdf_decoder<PAGE>::add_page_cell_from_annot(QPDFObjectHandle annot)
+  {
+    // Bounding box read from the annotation's /Rect entry; components
+    // that are missing or not numeric keep their initial value of 0.
+    std::array<double, 4> bbox = {0., 0., 0., 0.};
+
+    auto rect = annot.getKey("/Rect");
+    for(int ind=0; ind<rect.getArrayNItems() and ind<bbox.size(); ind++)
+      {
+        QPDFObjectHandle entry = rect.getArrayItem(ind);
+        if(not entry.isNumber())
+          {
+            continue;
+          }
+
+        bbox[ind] = utils::numeric::locale_safe_numeric_value(entry);
+      }
+
+    // Cell text comes from the /V entry, with a placeholder when the
+    // lookup reports no value.
+    auto [found, value] = to_string(annot, "/V");
+    if(not found)
+      {
+        value = "<unknown>";
+      }
+
+    const double x0 = bbox[0];
+    const double y0 = bbox[1];
+    const double x1 = bbox[2];
+    const double y1 = bbox[3];
+
+    page_item<PAGE_CELL> cell;
+
+    cell.widget = true;
+
+    // Axis-aligned box.
+    cell.x0 = x0;
+    cell.y0 = y0;
+    cell.x1 = x1;
+    cell.y1 = y1;
+
+    // The four corner points r_0..r_3 of the same box:
+    // (x0,y0), (x1,y0), (x1,y1), (x0,y1).
+    cell.r_x0 = x0;
+    cell.r_y0 = y0;
+    cell.r_x1 = x1;
+    cell.r_y1 = y0;
+    cell.r_x2 = x1;
+    cell.r_y2 = y1;
+    cell.r_x3 = x0;
+    cell.r_y3 = y1;
+
+    cell.text = value;
+    cell.rendering_mode = 0;
+
+    cell.space_width = 0;
+
+    // No font object is consulted here: every font-related field gets
+    // the fixed placeholder "Form-font" and a size of 0.
+    cell.enc_name = "Form-font";
+
+    cell.font_enc = "Form-font";
+    cell.font_key = "Form-font";
+
+    cell.font_name = "Form-font";
+    cell.font_size = 0;
+
+    cell.italic = false;
+    cell.bold   = false;
+
+    cell.ocr        = false;
+    cell.confidence = -1.0;
+
+    cell.stack_size  = -1;
+    cell.block_count = -1;
+    cell.instr_count = -1;
+
+    page_cells.push_back(cell);
+
+  }
+
  void pdf_decoder<PAGE>::add_page_hyperlink_from_annot(QPDFObjectHandle annot)
  {
    LOG_S(INFO) << __FUNCTION__;
@@ -797,7 +865,7 @@ namespace pdflib
        QPDFObjectHandle num = rect.getArrayItem(l);
        if(num.isNumber())
          {
-            bbox[l] = num.getNumericValue();
+            bbox[l] = utils::numeric::locale_safe_numeric_value(num);
          }
      }

@@ -840,7 +908,7 @@ namespace pdflib
        QPDFObjectHandle num = rect.getArrayItem(l);
        if(num.isNumber())
          {
-            bbox[l] = num.getNumericValue();
+            bbox[l] = utils::numeric::locale_safe_numeric_value(num);
          }
      }

@@ -281,7 +281,7 @@ namespace pdflib
        else if(std::regex_match(line, match, expr))
          {
            int         numb = std::stoi(match[1]);
-            double      wval = std::stod(match[2]);
+            double      wval = utils::numeric::locale_safe_stod(match[2]);
            std::string name =           match[3] ;

            if(numb>=0 and name!="")
@@ -306,7 +306,7 @@ namespace pdflib

            if(elems.size()==2 and utils::string::is_number(elems[1]))
              {
-                properties[elems[0]] = std::stod(elems[1]);
+                properties[elems[0]] = utils::numeric::locale_safe_stod(elems[1]);
              }
            else if(elems.size()==2)
              {
@@ -314,11 +314,11 @@ namespace pdflib
              }
            else if(elems.size()>0 and elems[0]=="FontBBox")
              {
-                std::array<double, 4> bbox = { 
-                  std::stod(elems[1]),
-                  std::stod(elems[2]),
-                  std::stod(elems[3]),
-                  std::stod(elems[4])};
+                std::array<double, 4> bbox = {
+                  utils::numeric::locale_safe_stod(elems[1]),
+                  utils::numeric::locale_safe_stod(elems[2]),
+                  utils::numeric::locale_safe_stod(elems[3]),
+                  utils::numeric::locale_safe_stod(elems[4])};

                properties[elems[0]] = bbox;
              }
@@ -342,7 +342,7 @@ namespace pdflib
 	    //assert(item.isNumber());
 	    if(item.isNumber())
 	      {
-		double val = item.getNumericValue();
+		double val = utils::numeric::locale_safe_numeric_value(item);
 		dash_array.push_back(val);
 	      }
 	    else
@@ -344,7 +344,7 @@ namespace pdflib
          }
        else if(item.isNumber())
          {
-            double value = item.getNumericValue();
+            double value = utils::numeric::locale_safe_numeric_value(item);

            double tx = - value / 1000.0 * font_size * h_scaling;
            double ty = 0;
@@ -134,7 +134,7 @@ namespace pdflib
 	    std::string mvalue = match[1].str();
 	    LOG_S(WARNING) << "match-1: " << std::setw(12) << row.key << " | " << row.val << " => new matched value: " << mvalue;

-	    double value = std::stod(mvalue);
+	    double value = utils::numeric::locale_safe_stod(mvalue);

 	    // Creating a real (floating-point) QPDFObjectHandle
 	    QPDFObjectHandle new_obj = QPDFObjectHandle::newReal(value);
@@ -148,7 +148,7 @@ namespace pdflib
 	    std::string mvalue = match[1].str() + match[4].str();
 	    LOG_S(WARNING) << "match-2: " << std::setw(12) << row.key << " | " << row.val << " => new matched value: " << mvalue;

-	    double value = std::stod(mvalue);
+	    double value = utils::numeric::locale_safe_stod(mvalue);

 	    // Creating a real (floating-point) QPDFObjectHandle
 	    QPDFObjectHandle new_obj = QPDFObjectHandle::newReal(value);
@@ -162,7 +162,7 @@ namespace pdflib
 	    std::string mvalue = match[3].str() + match[7].str();
 	    LOG_S(WARNING) << "match-3: " << std::setw(12) << row.key << " | " << row.val << " => new matched value: " << mvalue;

-	    double value = std::stod(mvalue);
+	    double value = utils::numeric::locale_safe_stod(mvalue);

 	    // Creating a real (floating-point) QPDFObjectHandle
 	    QPDFObjectHandle new_obj = QPDFObjectHandle::newReal(value);
@@ -108,7 +108,7 @@ namespace pdflib
 	throw std::logic_error(message);
      }
    
-    return obj.getNumericValue(); 
+    return utils::numeric::locale_safe_numeric_value(obj);
  } 

  std::string qpdf_stream_instruction::to_char_string() 
@@ -178,7 +178,7 @@ namespace pdflib
          }
        else if(obj.isReal())
          {
-            double val = obj.getNumericValue();
+            double val = utils::numeric::locale_safe_numeric_value(obj);
            result = val;
          }
        else if(obj.isBool())
@@ -8,5 +8,6 @@
 #include "utils/timer.h"
 #include "utils/files.h"
 #include "utils/values.h"
+#include "utils/numeric.h"

 #endif
@@ -0,0 +1,90 @@
+//-*-C++-*-
+
+// Locale-independent numeric parsing utilities.
+//
+// Problem: std::stod() and std::atof() honour LC_NUMERIC, so under
+// locales that use ',' as the decimal separator (e.g. fr_FR, de_DE,
+// pt_BR) every floating-point value read from a PDF is silently
+// corrupted — "72.5" parses as 72.0 because the '.' is not recognised
+// as a decimal point.
+//
+// Solution: parse using the classic C locale instead of the process locale.
+// We provide two helpers:
+//
+//   1. locale_safe_stod(str)         — drop-in replacement for std::stod
+//   2. locale_safe_numeric_value(obj) — safe wrapper around QPDF's
+//      QPDFObjectHandle::getNumericValue(), which internally calls
+//      the locale-sensitive atof().
+//
+// See: https://github.com/docling-project/docling/issues/1455
+
+#ifndef PDF_UTILS_NUMERIC_H
+#define PDF_UTILS_NUMERIC_H
+
+#include <locale>
+#include <sstream>
+#include <string>
+#include <stdexcept>
+
+namespace utils
+{
+  namespace numeric
+  {
+
+    // Locale-independent replacement for std::stod().
+    //
+    // Parses `str` through a stream imbued with std::locale::classic(),
+    // so '.' is always the decimal separator regardless of the current
+    // LC_NUMERIC setting.  A stream is used to remain portable across
+    // the older standard libraries used by the wheel build matrix.
+    //
+    // Leading whitespace is skipped, as with std::stod().
+    //
+    // Throws, mirroring the contract of std::stod():
+    //   - std::invalid_argument if no number could be converted;
+    //   - std::out_of_range if the value is too large for a double.
+    inline double locale_safe_stod(const std::string& str)
+    {
+      std::istringstream stream(str);
+      stream.imbue(std::locale::classic());
+
+      double value = 0.0;
+      stream >> value;
+
+      if (stream.fail())
+        {
+          // A failed extraction leaves 0 behind when nothing could be
+          // converted, and a non-zero saturated value on overflow.
+          // Use that to report the exception type std::stod() would.
+          if (value != 0.0)
+            {
+              throw std::out_of_range(
+                "locale_safe_stod: value out of range for \"" + str + "\"");
+            }
+
+          throw std::invalid_argument(
+            "locale_safe_stod: no valid conversion for \"" + str + "\"");
+        }
+
+      return value;
+    }
+
+    // Locale-independent stand-in for QPDFObjectHandle::getNumericValue().
+    //
+    // QPDF's getNumericValue() calls atof() internally for real
+    // numbers, which is locale-sensitive.  To avoid that:
+    //   - integers are read with getIntValue() and widened to double
+    //     (no decimal point involved);
+    //   - reals are re-parsed from the text returned by getRealValue()
+    //     using locale_safe_stod().
+    //
+    // Throws std::invalid_argument when `obj` is neither an integer nor
+    // a real, so callers should only pass objects known to be numbers.
+    inline double locale_safe_numeric_value(QPDFObjectHandle& obj)
+    {
+      if (obj.isInteger())
+        {
+          return static_cast<double>(obj.getIntValue());
+        }
+
+      if (not obj.isReal())
+        {
+          throw std::invalid_argument(
+            "locale_safe_numeric_value: QPDF object is neither integer nor real");
+        }
+
+      // Parse the textual form ourselves rather than going through
+      // QPDF's atof()-based getNumericValue().
+      return locale_safe_stod(obj.getRealValue());
+    }
+
+  }
+}
+
+#endif
@@ -0,0 +1,555 @@
+#!/usr/bin/env python
+"""Tests for locale-independent PDF coordinate parsing.
+
+Validates that docling-parse produces correct results regardless of
+the system's LC_NUMERIC setting — specifically for locales that use
+',' as the decimal separator (French, German, Portuguese, etc.).
+
+This addresses: https://github.com/docling-project/docling/issues/1455
+
+Test structure:
+  - Unit tests:        locale_safe_stod via Python-accessible paths
+  - Integration tests: full page parsing under hostile locale
+  - Edge-case tests:   BytesIO loading, multi-page documents, widget/hyperlink boxes
+  - Regression tests:  coordinate stability across locale switches
+"""
+
+import glob
+import locale
+import os
+import platform
+import sys
+from contextlib import contextmanager
+from typing import Generator, List, Optional, Tuple
+
+import pytest
+
+from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+REGRESSION_FOLDER = "tests/data/regression/*.pdf"
+
+# Locales that use ',' as decimal separator.  We try several because
+# availability varies across OS / Docker images.
+COMMA_LOCALES = [
+    "fr_FR.UTF-8",
+    "fr_FR.utf8",
+    "de_DE.UTF-8",
+    "de_DE.utf8",
+    "pt_BR.UTF-8",
+    "pt_BR.utf8",
+    "es_ES.UTF-8",
+    "es_ES.utf8",
+    "it_IT.UTF-8",
+    "it_IT.utf8",
+]
+
+
+def _find_comma_locale() -> str | None:
+    """Return the first available locale that uses ',' as decimal separator."""
+    for loc in COMMA_LOCALES:
+        try:
+            locale.setlocale(locale.LC_NUMERIC, loc)
+            conv = locale.localeconv()
+            if conv.get("decimal_point") == ",":
+                # Reset before returning
+                locale.setlocale(locale.LC_NUMERIC, "C")
+                return loc
+        except locale.Error:
+            continue
+    # Reset in case we changed it
+    try:
+        locale.setlocale(locale.LC_NUMERIC, "C")
+    except locale.Error:
+        pass
+    return None
+
+
+COMMA_LOCALE = _find_comma_locale()
+
+# Marker that skips an individual test when no comma-decimal locale is available
+requires_comma_locale = pytest.mark.skipif(
+    COMMA_LOCALE is None,
+    reason="No comma-decimal locale available (tried: fr_FR, de_DE, pt_BR, es_ES, it_IT)",
+)
+
+
+@contextmanager
+def hostile_locale() -> Generator[str, None, None]:
+    """Context manager that sets LC_NUMERIC to a comma-decimal locale.
+
+    Restores the original locale on exit regardless of exceptions.
+    """
+    assert COMMA_LOCALE is not None, "No comma-decimal locale available"
+
+    saved = locale.getlocale(locale.LC_NUMERIC)
+    locale.setlocale(locale.LC_NUMERIC, COMMA_LOCALE)
+
+    # Verify the locale is actually active
+    conv = locale.localeconv()
+    assert conv["decimal_point"] == ",", (
+        f"Expected ',' decimal separator under {COMMA_LOCALE}, "
+        f"got '{conv['decimal_point']}'"
+    )
+
+    try:
+        yield COMMA_LOCALE
+    finally:
+        try:
+            locale.setlocale(locale.LC_NUMERIC, saved)
+        except locale.Error:
+            locale.setlocale(locale.LC_NUMERIC, "C")
+
+
+def _get_regression_pdfs() -> List[str]:
+    """Return sorted list of regression PDF paths."""
+    return sorted(glob.glob(REGRESSION_FOLDER))
+
+
+def _parse_first_page(pdf_path: str) -> Tuple:
+    """Parse page 1 of a PDF and return (page, dimension, cells)."""
+    parser = DoclingPdfParser(loglevel="fatal")
+    doc = parser.load(pdf_path)
+    page = doc.get_page(1)
+    return page, page.dimension, page.char_cells
+
+
+# ---------------------------------------------------------------------------
+# Unit tests — locale_safe_stod behaviour via coordinate parsing
+# ---------------------------------------------------------------------------
+
+
+class TestLocaleIndependentParsing:
+    """Verify that float parsing in the C++ layer is locale-independent.
+
+    Each test parses a regression PDF once with LC_NUMERIC set to "C"
+    (baseline) and once inside hostile_locale(), then compares the two
+    results within a small tolerance.
+    """
+
+    @requires_comma_locale
+    def test_page_dimensions_under_comma_locale(self):
+        """Page dimensions must not be truncated under a comma-decimal locale.
+
+        This is the core bug: a page width of 612.0 points would be
+        parsed as 612 (or 0) when LC_NUMERIC uses comma.
+        """
+        pdfs = _get_regression_pdfs()
+        assert len(pdfs) > 0, "No regression PDFs found"
+
+        # Parse under C locale first (baseline).  setlocale() is
+        # process-wide and the "C" setting is left in place afterwards.
+        locale.setlocale(locale.LC_NUMERIC, "C")
+        baseline_pdf = pdfs[0]
+        _, baseline_dim, _ = _parse_first_page(baseline_pdf)
+
+        # Now parse under hostile locale
+        with hostile_locale():
+            _, hostile_dim, _ = _parse_first_page(baseline_pdf)
+
+        # Dimensions must match to within 0.001 on every vertex
+        baseline_rect = baseline_dim.rect.to_polygon()
+        hostile_rect = hostile_dim.rect.to_polygon()
+
+        for i in range(4):
+            assert abs(baseline_rect[i][0] - hostile_rect[i][0]) < 0.001, (
+                f"X coordinate mismatch at vertex {i}: "
+                f"baseline={baseline_rect[i][0]}, hostile={hostile_rect[i][0]}"
+            )
+            assert abs(baseline_rect[i][1] - hostile_rect[i][1]) < 0.001, (
+                f"Y coordinate mismatch at vertex {i}: "
+                f"baseline={baseline_rect[i][1]}, hostile={hostile_rect[i][1]}"
+            )
+
+    @requires_comma_locale
+    def test_nonzero_fractional_dimensions(self):
+        """Page dimensions must retain their fractional part.
+
+        The pre-fix bug would truncate 612.0 → 612 (benign) but
+        595.276 → 595 (wrong by 0.276 points, visible at high DPI).
+        Compare parsed dimensions under C vs comma locale — they must
+        agree to within 1e-6. (Absolute thresholds break on PDFs with
+        unusual coordinate origins, e.g. broken_media_box_v01.pdf.)
+        """
+        pdfs = _get_regression_pdfs()
+        assert len(pdfs) > 0
+
+        for pdf_path in pdfs[:5]:  # test first 5 for speed
+            # Re-pin the baseline locale on every iteration.
+            locale.setlocale(locale.LC_NUMERIC, "C")
+            _, baseline_dim, _ = _parse_first_page(pdf_path)
+            baseline = baseline_dim.crop_bbox
+
+            with hostile_locale():
+                _, hostile_dim, _ = _parse_first_page(pdf_path)
+                hostile = hostile_dim.crop_bbox
+
+            for attr in ("l", "t", "r", "b"):
+                assert abs(getattr(baseline, attr) - getattr(hostile, attr)) < 1e-6, (
+                    f"crop_bbox.{attr} mismatch in {pdf_path}: "
+                    f"baseline={getattr(baseline, attr)} hostile={getattr(hostile, attr)}"
+                )
+
+    @requires_comma_locale
+    def test_text_cell_coordinates_under_comma_locale(self):
+        """Text cell bounding boxes must be correct under comma locale.
+
+        Vertices are compared with a 0.01 tolerance; cell text must be
+        equal.
+        """
+        pdfs = _get_regression_pdfs()
+        assert len(pdfs) > 0
+
+        # Find a PDF with text cells (this search runs under whatever
+        # LC_NUMERIC setting is active when the test starts)
+        test_pdf = None
+        for pdf_path in pdfs:
+            _, _, cells = _parse_first_page(pdf_path)
+            if len(cells) > 0:
+                test_pdf = pdf_path
+                break
+
+        if test_pdf is None:
+            pytest.skip("No regression PDF with text cells found")
+
+        # Baseline
+        locale.setlocale(locale.LC_NUMERIC, "C")
+        _, _, baseline_cells = _parse_first_page(test_pdf)
+
+        # Hostile
+        with hostile_locale():
+            _, _, hostile_cells = _parse_first_page(test_pdf)
+
+        assert len(baseline_cells) == len(hostile_cells), (
+            f"Cell count differs: {len(baseline_cells)} vs {len(hostile_cells)}"
+        )
+
+        for i, (bc, hc) in enumerate(zip(baseline_cells, hostile_cells)):
+            b_rect = bc.rect.to_polygon()
+            h_rect = hc.rect.to_polygon()
+
+            for v in range(4):
+                assert abs(b_rect[v][0] - h_rect[v][0]) < 0.01, (
+                    f"Cell {i} vertex {v} X: {b_rect[v][0]} vs {h_rect[v][0]}"
+                )
+                assert abs(b_rect[v][1] - h_rect[v][1]) < 0.01, (
+                    f"Cell {i} vertex {v} Y: {b_rect[v][1]} vs {h_rect[v][1]}"
+                )
+
+            assert bc.text == hc.text, f"Cell {i} text: '{bc.text}' vs '{hc.text}'"
+
+
+# ---------------------------------------------------------------------------
+# Integration tests — full parsing pipeline under hostile locale
+# ---------------------------------------------------------------------------
+
+
+class TestFullPipelineLocaleResilience:
+    """End-to-end parsing of real PDFs under comma-decimal locale.
+
+    "Comma-decimal locale" means whichever entry of COMMA_LOCALES was
+    found on this system (COMMA_LOCALE), not necessarily French.
+    """
+
+    @requires_comma_locale
+    def test_all_regression_pdfs_parse_without_error(self):
+        """Every regression PDF must parse without exception under the comma locale."""
+        pdfs = _get_regression_pdfs()
+        assert len(pdfs) > 0
+
+        # Failures are collected so the final assert reports all of
+        # them at once instead of stopping at the first bad PDF.
+        failures = []
+        with hostile_locale():
+            for pdf_path in pdfs:
+                try:
+                    parser = DoclingPdfParser(loglevel="fatal")
+                    doc = parser.load(pdf_path)
+                    n_pages = doc.number_of_pages()
+
+                    for page_no in range(1, n_pages + 1):
+                        page = doc.get_page(page_no)
+                        # Accessing dimension forces coordinate parsing
+                        _ = page.dimension
+                        _ = page.char_cells
+
+                except Exception as e:
+                    failures.append((os.path.basename(pdf_path), str(e)))
+
+        assert len(failures) == 0, (
+            f"{len(failures)} PDFs failed under {COMMA_LOCALE}:\n"
+            + "\n".join(f"  {name}: {err}" for name, err in failures)
+        )
+
+    @requires_comma_locale
+    def test_page_count_consistent_across_locales(self):
+        """Page count must not depend on locale (checked on the first 10 PDFs)."""
+        pdfs = _get_regression_pdfs()[:10]
+
+        # Baseline page counts under "C"; the setting is process-wide
+        # and stays in place after this test.
+        locale.setlocale(locale.LC_NUMERIC, "C")
+        c_counts = {}
+        for pdf_path in pdfs:
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(pdf_path)
+            c_counts[pdf_path] = doc.number_of_pages()
+
+        with hostile_locale():
+            for pdf_path in pdfs:
+                parser = DoclingPdfParser(loglevel="fatal")
+                doc = parser.load(pdf_path)
+                hostile_count = doc.number_of_pages()
+                assert hostile_count == c_counts[pdf_path], (
+                    f"{os.path.basename(pdf_path)}: "
+                    f"C locale={c_counts[pdf_path]}, hostile={hostile_count}"
+                )
+
+    @requires_comma_locale
+    def test_shapes_consistent_across_locales(self):
+        """Shape coordinates (graphics state) must be locale-independent.
+
+        Line widths are compared with a 0.001 tolerance and point
+        coordinates with a 0.01 tolerance.
+        """
+        pdfs = _get_regression_pdfs()
+
+        # Find a PDF with shapes (this search runs under whatever
+        # LC_NUMERIC setting is active when the test starts)
+        test_pdf = None
+        for pdf_path in pdfs:
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(pdf_path)
+            page = doc.get_page(1)
+            if len(page.shapes) > 0:
+                test_pdf = pdf_path
+                break
+
+        if test_pdf is None:
+            pytest.skip("No regression PDF with shapes found")
+
+        locale.setlocale(locale.LC_NUMERIC, "C")
+        parser = DoclingPdfParser(loglevel="fatal")
+        doc = parser.load(test_pdf)
+        baseline_shapes = doc.get_page(1).shapes
+
+        with hostile_locale():
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(test_pdf)
+            hostile_shapes = doc.get_page(1).shapes
+
+        assert len(baseline_shapes) == len(hostile_shapes), (
+            f"Shape count: {len(baseline_shapes)} vs {len(hostile_shapes)}"
+        )
+
+        for i, (bs, hs) in enumerate(zip(baseline_shapes, hostile_shapes)):
+            assert abs(bs.line_width - hs.line_width) < 0.001, (
+                f"Shape {i} line_width: {bs.line_width} vs {hs.line_width}"
+            )
+            assert len(bs.points) == len(hs.points), (
+                f"Shape {i} point count: {len(bs.points)} vs {len(hs.points)}"
+            )
+            for j, (bp, hp) in enumerate(zip(bs.points, hs.points)):
+                assert abs(bp.x - hp.x) < 0.01, (
+                    f"Shape {i} point {j} X: {bp.x} vs {hp.x}"
+                )
+                assert abs(bp.y - hp.y) < 0.01, (
+                    f"Shape {i} point {j} Y: {bp.y} vs {hp.y}"
+                )
+
+
+# ---------------------------------------------------------------------------
+# Edge-case tests — boundary values in numeric parsing
+# ---------------------------------------------------------------------------
+
+
+class TestNumericEdgeCases:
+    """Additional parsing scenarios exercised under a comma-decimal locale.
+
+    Covers loading from BytesIO, multi-page documents, and widget /
+    hyperlink bounding boxes.  These tests only check that values are
+    positive or within a sane range; they do not compare against a
+    "C"-locale baseline.
+    """
+
+    @requires_comma_locale
+    def test_bytesio_loading_under_comma_locale(self):
+        """Loading from BytesIO must work under comma locale."""
+        from io import BytesIO
+
+        pdfs = _get_regression_pdfs()
+        assert len(pdfs) > 0
+
+        with hostile_locale():
+            with open(pdfs[0], "rb") as f:
+                data = BytesIO(f.read())
+
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(data)
+            page = doc.get_page(1)
+
+            # Must not raise, and dimension must be valid
+            assert page.dimension.crop_bbox.r > 0
+
+    @requires_comma_locale
+    def test_multi_page_document_all_pages_valid(self):
+        """All pages of a multi-page PDF must parse correctly."""
+        pdfs = _get_regression_pdfs()
+
+        # Find multi-page PDF (the first one with more than one page)
+        multi_page_pdf = None
+        for pdf_path in pdfs:
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(pdf_path)
+            if doc.number_of_pages() > 1:
+                multi_page_pdf = pdf_path
+                break
+
+        if multi_page_pdf is None:
+            pytest.skip("No multi-page regression PDF found")
+
+        with hostile_locale():
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(multi_page_pdf)
+
+            for page_no in range(1, doc.number_of_pages() + 1):
+                page = doc.get_page(page_no)
+                dim = page.dimension
+
+                # Every page must have positive dimensions
+                assert dim.crop_bbox.r > 0, (
+                    f"Page {page_no}: crop_bbox.r = {dim.crop_bbox.r}"
+                )
+                assert dim.crop_bbox.t > 0, (
+                    f"Page {page_no}: crop_bbox.t = {dim.crop_bbox.t}"
+                )
+
+    @requires_comma_locale
+    def test_widgets_and_hyperlinks_under_comma_locale(self):
+        """Widget and hyperlink bounding boxes must be locale-independent.
+
+        Only the first regression PDF whose page 1 has widgets or
+        hyperlinks is checked, and only for coordinates lying strictly
+        between -10000 and 10000.
+        """
+        pdfs = _get_regression_pdfs()
+
+        # Find PDFs with widgets or hyperlinks (this first parse runs
+        # under whatever LC_NUMERIC setting is active at this point)
+        for pdf_path in pdfs:
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(pdf_path)
+            page = doc.get_page(1)
+
+            if len(page.widgets) > 0 or len(page.hyperlinks) > 0:
+                # Re-parse under hostile locale
+                with hostile_locale():
+                    parser2 = DoclingPdfParser(loglevel="fatal")
+                    doc2 = parser2.load(pdf_path)
+                    page2 = doc2.get_page(1)
+
+                    for w in page2.widgets:
+                        poly = w.rect.to_polygon()
+                        for v in range(4):
+                            # Coordinates should be finite and reasonable
+                            assert -10000 < poly[v][0] < 10000
+                            assert -10000 < poly[v][1] < 10000
+
+                    for h in page2.hyperlinks:
+                        poly = h.rect.to_polygon()
+                        for v in range(4):
+                            assert -10000 < poly[v][0] < 10000
+                            assert -10000 < poly[v][1] < 10000
+
+                return  # Found and tested at least one
+
+        pytest.skip("No regression PDF with widgets/hyperlinks found")
+
+
+# ---------------------------------------------------------------------------
+# Regression tests — exact coordinate reproducibility
+# ---------------------------------------------------------------------------
+
+
+class TestCoordinateReproducibility:
+    """Verify coordinates are bit-for-bit identical across locale switches.
+
+    This catches subtle bugs where values are "close" but not exact,
+    which would cause downstream layout analysis to drift.
+    """
+
+    @requires_comma_locale
+    def test_coordinate_stability_across_repeated_locale_switches(self):
+        """Parse same PDF under alternating locales — results must be identical."""
+        pdfs = _get_regression_pdfs()
+        assert len(pdfs) > 0
+        test_pdf = pdfs[0]
+
+        results = []
+        for i in range(4):
+            if i % 2 == 0:
+                locale.setlocale(locale.LC_NUMERIC, "C")
+            else:
+                locale.setlocale(locale.LC_NUMERIC, COMMA_LOCALE)
+
+            parser = DoclingPdfParser(loglevel="fatal")
+            doc = parser.load(test_pdf)
+            page = doc.get_page(1)
+            dim = page.dimension
+
+            results.append(
+                {
+                    "crop_r": dim.crop_bbox.r,
+                    "crop_t": dim.crop_bbox.t,
+                    "crop_l": dim.crop_bbox.l,
+                    "crop_b": dim.crop_bbox.b,
+                    "n_cells": len(page.char_cells),
+                }
+            )
+
+        # Restore
+        locale.setlocale(locale.LC_NUMERIC, "C")
+
+        # All 4 results must be identical
+        for i in range(1, 4):
+            for key in results[0]:
+                assert results[i][key] == results[0][key], (
+                    f"Iteration {i} diverged on {key}: "
+                    f"{results[i][key]} != {results[0][key]}"
+                )
+
+    @requires_comma_locale
+    def test_cell_text_content_unaffected_by_locale(self):
+        """Text content extraction must not depend on locale.
+
+        While the primary bug is coordinate corruption, we verify that
+        the text itself is also identical.
+        """
+        pdfs = _get_regression_pdfs()
+
+        test_pdf = None
+        for pdf_path in pdfs:
+            _, _, cells = _parse_first_page(pdf_path)
+            if len(cells) > 5:
+                test_pdf = pdf_path
+                break
+
+        if test_pdf is None:
+            pytest.skip("No PDF with enough text cells")
+
+        locale.setlocale(locale.LC_NUMERIC, "C")
+        _, _, c_cells = _parse_first_page(test_pdf)
+
+        with hostile_locale():
+            _, _, h_cells = _parse_first_page(test_pdf)
+
+        c_text = "".join(c.text for c in c_cells)
+        h_text = "".join(c.text for c in h_cells)
+
+        assert c_text == h_text, "Text content differs between locales"
+
+
+# ---------------------------------------------------------------------------
+# Python-level defence test
+# ---------------------------------------------------------------------------
+
+
+class TestPythonLocaleGuard:
+    """Verify the Python __init__.py locale guard works."""
+
+    @requires_comma_locale
+    def test_import_resets_numeric_locale(self):
+        """Importing docling_parse must ensure LC_NUMERIC uses '.' separator."""
+        # Set hostile locale first
+        locale.setlocale(locale.LC_NUMERIC, COMMA_LOCALE)
+
+        # Re-trigger the guard (it runs at import time, but we can call it)
+        import docling_parse
+
+        docling_parse._ensure_safe_numeric_locale()
+
+        conv = locale.localeconv()
+        assert conv["decimal_point"] == ".", (
+            f"LC_NUMERIC still uses '{conv['decimal_point']}' after import"
+        )
+
+        # Restore
+        locale.setlocale(locale.LC_NUMERIC, "C")
+
+    def test_guard_is_noop_under_c_locale(self):
+        """The guard must not change anything when LC_NUMERIC is already safe."""
+        locale.setlocale(locale.LC_NUMERIC, "C")
+
+        import docling_parse
+
+        docling_parse._ensure_safe_numeric_locale()
+
+        # Should still be C or equivalent
+        conv = locale.localeconv()
+        assert conv["decimal_point"] == "."