From 33f24ed0fc350ec4702d996df86cb45e70c37e1e Mon Sep 17 00:00:00 2001 From: yusufcanislek Date: Fri, 6 Mar 2026 00:47:45 +0300 Subject: [PATCH] Fix InstructionCache DenseMap corruption: empty/tombstone keys were identical MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The InstructionKey::InstructionKeyInfo had getEmptyKey() and getTombstoneKey() both returning InstructionKey(nullptr, nullptr). LLVM DenseMap requires these to be distinct sentinel values. This violated the DenseMap contract, causing bucket corruption during copy/iteration (the old FIXME about 'last item corrupted'). Fix: use reinterpret_cast sentinel pointers -1 and -2, matching LLVM convention. Also cleaned up the non-const copy constructor (removed dead local copy and stale FIXME comment). Also adds: - switch_sparse.asm test (non-consecutive case values: 10, 50, 200, 1000) - calc_cout.cpp test (skipped - documents inline policy limitation with STL) - C++ compilation support in build_samples.cmd - Skip mechanism for manifest entries (skip: true + skip_reason) - Fix test.py update-golden to not run determinism check before updating 68 pattern checks, 40 golden hashes, 108 handler microtests — all green. 
--- lifter/lifterClass.hpp | 20 ++++---- lifter/test_vectors/golden_ir_hashes.json | 4 +- .../oracle_vectors_full_handlers.json | 2 +- scripts/rewrite/build_samples.cmd | 9 ++++ scripts/rewrite/instruction_microtests.json | 14 +++++- scripts/rewrite/run.ps1 | 8 +++- scripts/rewrite/verify.ps1 | 4 ++ test.py | 2 +- testcases/rewrite_smoke/calc_cout.cpp | 16 +++++++ testcases/rewrite_smoke/switch_sparse.asm | 47 +++++++++++++++++++ 10 files changed, 109 insertions(+), 17 deletions(-) create mode 100644 testcases/rewrite_smoke/calc_cout.cpp create mode 100644 testcases/rewrite_smoke/switch_sparse.asm diff --git a/lifter/lifterClass.hpp b/lifter/lifterClass.hpp index 136224d..75c3b16 100644 --- a/lifter/lifterClass.hpp +++ b/lifter/lifterClass.hpp @@ -79,13 +79,17 @@ struct InstructionKey { return lhs == rhs; } - // Define empty and tombstone keys + // Define empty and tombstone keys — MUST be distinct for DenseMap. static inline InstructionKey getEmptyKey() { - return InstructionKey(nullptr, static_cast(nullptr)); + return InstructionKey( + reinterpret_cast(static_cast(-1)), + reinterpret_cast(static_cast(-1))); } static inline InstructionKey getTombstoneKey() { - return InstructionKey(nullptr, static_cast(nullptr)); + return InstructionKey( + reinterpret_cast(static_cast(-2)), + reinterpret_cast(static_cast(-2))); } }; }; @@ -110,16 +114,8 @@ public: } InstructionCache() = default; InstructionCache(InstructionCache& other) { - // we want to copy each SmallDenseMap individually - // crash on last item, why? - // FIXME: last item on array is corrupted. 
for (size_t i = 0; i < opcodeCaches.size(); ++i) { - - // reserve because its faster - - auto src = other.opcodeCaches[i]; - opcodeCaches[i].reserve(src.size()); - + opcodeCaches[i].reserve(other.opcodeCaches[i].size()); for (auto& kv : other.opcodeCaches[i]) { opcodeCaches[i].try_emplace(kv.first, kv.second); } diff --git a/lifter/test_vectors/golden_ir_hashes.json b/lifter/test_vectors/golden_ir_hashes.json index e80a7c7..e4b2d5a 100644 --- a/lifter/test_vectors/golden_ir_hashes.json +++ b/lifter/test_vectors/golden_ir_hashes.json @@ -36,5 +36,7 @@ "stack.ll": "41199a809916ab3045d1de076b3d4128fb40a45f950764b38b851f67b310c4fe", "stack_no_opts.ll": "94059a01b8a78951c9448ba94b5dadf445610df76315b8cab8eecd153843472b", "switch_3way.ll": "e706ce0da37dbe02fd52fae223c39f74a8b84c4946b971d8425fe868a4e73256", - "switch_3way_no_opts.ll": "5527b1a564babe40dd9ea1ff7d1ea3796e814c22823be7841ee1be0dbd1c7524" + "switch_3way_no_opts.ll": "5527b1a564babe40dd9ea1ff7d1ea3796e814c22823be7841ee1be0dbd1c7524", + "switch_sparse.ll": "06b9ec694dcf18ffb7041a437fa3e4f2e50c061569cc98bcb239b5f77c3a15f4", + "switch_sparse_no_opts.ll": "b5fea064fea49272e541476f1e087d6f34f10e9f2d90d0eb0b6ebdda9ca7ea6c" } diff --git a/lifter/test_vectors/oracle_vectors_full_handlers.json b/lifter/test_vectors/oracle_vectors_full_handlers.json index b18ec1f..d2dbc04 100644 --- a/lifter/test_vectors/oracle_vectors_full_handlers.json +++ b/lifter/test_vectors/oracle_vectors_full_handlers.json @@ -1,6 +1,6 @@ { "schema": "mergen-oracle-v1", - "generated_at_utc": "2026-03-05T17:43:33.532361+00:00", + "generated_at_utc": "2026-03-05T19:49:19.307536+00:00", "source_seed_schema": "mergen-oracle-seed-v1", "providers": [ "unicorn" diff --git a/scripts/rewrite/build_samples.cmd b/scripts/rewrite/build_samples.cmd index 336d57f..3290c90 100644 --- a/scripts/rewrite/build_samples.cmd +++ b/scripts/rewrite/build_samples.cmd @@ -62,5 +62,14 @@ for %%F in ("%~dp0..\..\testcases\rewrite_smoke\*.c") do ( if errorlevel 1 exit /b 1 ) 
+rem --- Compile C++ test programs (real binaries with CRT + STL) --- +for %%F in ("%~dp0..\..\testcases\rewrite_smoke\*.cpp") do ( + cl.exe /nologo /Od /GS- /EHsc /c /Fo"%WORKDIR%\%%~nF.obj" "%%~fF" + if errorlevel 1 exit /b 1 + + link.exe /nologo /subsystem:console /out:"%WORKDIR%\%%~nF.exe" /map:"%WORKDIR%\%%~nF.map" "%WORKDIR%\%%~nF.obj" + if errorlevel 1 exit /b 1 +) + echo Built rewrite regression samples in "%WORKDIR%" exit /b 0 \ No newline at end of file diff --git a/scripts/rewrite/instruction_microtests.json b/scripts/rewrite/instruction_microtests.json index ecae40f..56e048a 100644 --- a/scripts/rewrite/instruction_microtests.json +++ b/scripts/rewrite/instruction_microtests.json @@ -94,6 +94,18 @@ "name": "calc_switch", "symbol": "calc_switch", "patterns": ["switch i32 %0", "i32 1, label", "i32 2, label", "i32 3, label", "i32 4, label", "i32 5, label", "phi i64"] + }, + { + "name": "switch_sparse", + "symbol": "switch_sparse_target", + "patterns": ["switch i32 %0", "i32 10, label", "i32 50, label", "i32 200, label", "i32 1000, label", "phi i64", "[ 11,", "[ 55,", "[ 222,", "[ 1337,", "[ 4294967295,"] + }, + { + "name": "calc_cout", + "symbol": "calc_cout", + "skip": true, + "skip_reason": "Statically-linked STL (cout) inlined by lifter; GEPTracker UNREACHABLE on complex library code. 
Blocked on inline policy improvements (Phase 2).", + "patterns": [] } ] -} +} \ No newline at end of file diff --git a/scripts/rewrite/run.ps1 b/scripts/rewrite/run.ps1 index 5113733..d350c69 100644 --- a/scripts/rewrite/run.ps1 +++ b/scripts/rewrite/run.ps1 @@ -32,7 +32,8 @@ if ($samples.Count -eq 0) { $srcDir = Join-Path $repoRoot 'testcases/rewrite_smoke' $srcNames = @( (Get-ChildItem -Path $srcDir -Filter '*.asm' | ForEach-Object { $_.BaseName }) + - (Get-ChildItem -Path $srcDir -Filter '*.c' | ForEach-Object { $_.BaseName }) + (Get-ChildItem -Path $srcDir -Filter '*.c' | ForEach-Object { $_.BaseName }) + + (Get-ChildItem -Path $srcDir -Filter '*.cpp' | ForEach-Object { $_.BaseName }) ) $sampleNames = @($samples | ForEach-Object { $_.name }) @@ -52,6 +53,11 @@ New-Item -ItemType Directory -Path $irDir -Force | Out-Null Push-Location $repoRoot try { foreach ($sample in $samples) { + if ($sample.PSObject.Properties['skip'] -and $sample.skip) { + Write-Host "SKIP: $($sample.name) (known limitation)" + continue + } + $mapPath = Join-Path $WorkDir "$($sample.name).map" if (-not (Test-Path $mapPath)) { throw "Map file not found: $mapPath" diff --git a/scripts/rewrite/verify.ps1 b/scripts/rewrite/verify.ps1 index 6dc9630..ec2de28 100644 --- a/scripts/rewrite/verify.ps1 +++ b/scripts/rewrite/verify.ps1 @@ -19,6 +19,10 @@ if ($checks.Count -eq 0) { $failed = $false foreach ($check in $checks) { + if ($check.PSObject.Properties['skip'] -and $check.skip) { + Write-Host "SKIP: $($check.name) (known limitation)" + continue + } $file = Join-Path $irDir "$($check.name).ll" if (-not (Test-Path $file)) { Write-Host "FAIL: missing $file" diff --git a/test.py b/test.py index 4bccbe3..504439c 100644 --- a/test.py +++ b/test.py @@ -183,7 +183,7 @@ def main() -> None: return if command == "update-golden": - run_baseline() + _run_cmd(REWRITE_DIR / "run.cmd") update_golden(IR_OUTPUT_DIR, GOLDEN_HASHES_FILE) return diff --git a/testcases/rewrite_smoke/calc_cout.cpp 
b/testcases/rewrite_smoke/calc_cout.cpp new file mode 100644 index 0000000..b4349d1 --- /dev/null +++ b/testcases/rewrite_smoke/calc_cout.cpp @@ -0,0 +1,16 @@ +/* Test: function with cout call. + * Lift target: calc_cout — external call handling. + * The computation is pure, but it calls cout before returning. */ +#include <iostream> + +__declspec(noinline) +int calc_cout(int x) { + int result = x * 3 + 7; + std::cout << result; + return result; +} + +int main() { + int r = calc_cout(10); + return r; +} diff --git a/testcases/rewrite_smoke/switch_sparse.asm b/testcases/rewrite_smoke/switch_sparse.asm new file mode 100644 index 0000000..890c615 --- /dev/null +++ b/testcases/rewrite_smoke/switch_sparse.asm @@ -0,0 +1,47 @@ +default rel +bits 64 + +global start +global switch_sparse_target +extern ExitProcess + +section .text +; Sparse switch on symbolic ECX input. +; Case values are NOT consecutive: 10, 50, 200, 1000. +; Tests multi-target branch resolution with large gaps between cases. +switch_sparse_target: + push rbp + mov rbp, rsp + mov eax, ecx + cmp eax, 10 + je .case10 + cmp eax, 50 + je .case50 + cmp eax, 200 + je .case200 + cmp eax, 1000 + je .case1000 + ; default + mov eax, -1 + jmp .done +.case10: + mov eax, 11 + jmp .done +.case50: + mov eax, 55 + jmp .done +.case200: + mov eax, 222 + jmp .done +.case1000: + mov eax, 1337 +.done: + pop rbp + ret + +start: + sub rsp, 40 + mov ecx, 200 + call switch_sparse_target + mov ecx, eax + call ExitProcess