diff --git a/VMP_TESTING_NOTES.md b/VMP_TESTING_NOTES.md index 7623d66..f3bd4a7 100644 --- a/VMP_TESTING_NOTES.md +++ b/VMP_TESTING_NOTES.md @@ -65,7 +65,7 @@ Use `simple/protected381/*`. - These are the best local VMP-style performance targets. - Continue profiling semantics/memory/folder helpers there. - For large control-flow/semantics/inlining changes, run `python test.py vmp` from repo root. That command now fails required targets on diagnostics errors **or** `blocks_completed == 0`, while still reporting the older VMP 3.6 sample as best-effort only. -- The current safe configuration keeps loop-header generalization disabled and relies on a higher basic-block budget for the stable 3.8.x samples. +- The current safe configuration allows structured loop-header reuse only for reducible conditional/direct-jump headers (including short trampoline chains into a conditional header); indirect-jump and `ret` dispatcher-style loops remain blocked, and required 3.8.x samples must still validate via `python test.py vmp`. ### If the goal is older protected/VMP 3.6 support Use `simple/protected/simple_target_protected.vmp.exe`, but do not treat it as a normal instruction-semantics-only problem. diff --git a/docs/REWRITE_BASELINE.md b/docs/REWRITE_BASELINE.md index fb7e2e2..637dfb4 100644 --- a/docs/REWRITE_BASELINE.md +++ b/docs/REWRITE_BASELINE.md @@ -131,11 +131,11 @@ Samples without a `semantic` field are not tested. The `semantic` field is optio ### Coverage summary -Current active quick-gate semantic coverage is **30 samples / 165 cases**. +Current active quick-gate semantic coverage is **33 samples / 177 cases**. Notable current state: -- `dummy_vm_loop` and `bytecode_vm_loop` remain active VM-shaped control-flow samples. -- `stack_vm_loop` and `calc_sum_to_n` are currently marked `skip` because the safe VMP configuration disables loop-header generalization and these samples still exceed the block budget without it. 
+- `dummy_vm_loop`, `bytecode_vm_loop`, and `stack_vm_loop` are active VM-shaped control-flow samples. +- `calc_sum_to_n` is active again under the safe structured-loop recovery path. - `calc_cout` remains `ci_skip` because its C++ codegen is toolchain-dependent on CI. ## Call-boundary ABI framework diff --git a/docs/SCOPE.md b/docs/SCOPE.md index 8042ef2..aa76860 100644 --- a/docs/SCOPE.md +++ b/docs/SCOPE.md @@ -33,7 +33,7 @@ Mergen is a function-level LLVM IR lifting engine for deobfuscation and devirtua ## Quality Contract - Handler coverage: 112/115 handlers with oracle-backed verification -- Active regression corpus: 30 semantic samples / 165 runtime semantic cases; `stack_vm_loop` and `calc_sum_to_n` are tracked known limitations, and `calc_cout` remains CI-skipped because its C++ codegen is toolchain-dependent +- Active regression corpus: 33 semantic samples / 177 runtime semantic cases; structured loop recovery now keeps `calc_sum_to_n` and `stack_vm_loop` active, and `calc_cout` remains CI-skipped because its C++ codegen is toolchain-dependent - Determinism: golden IR hashes are enforced for tracked outputs - CI gates: register/flag correctness, rewrite baseline, semantic regression, and Windows build lanes - Targeted VMP gate: `python test.py vmp` must keep required 3.8.x targets at `blocks_completed > 0`; VMP 3.6 remains best-effort only diff --git a/lifter/analysis/PathSolver.ipp b/lifter/analysis/PathSolver.ipp index c7f8677..ef586e1 100644 --- a/lifter/analysis/PathSolver.ipp +++ b/lifter/analysis/PathSolver.ipp @@ -59,24 +59,24 @@ MERGEN_LIFTER_DEFINITION_TEMPLATES(PATH_info)::solvePath( visitedAddresses.contains(target) && target <= blockInfo.block_address; auto it = addrToBB.find(target); - const bool generalizedHeaderLooksSimple = - it == addrToBB.end() || !it->second || llvm::pred_size(it->second) <= 1; + const bool pendingGeneralization = + pendingLoopGeneralizationAddresses.contains(target); const bool wantsGeneralization = - 
currentPathSolveAllowsLoopGeneralization() && - generalizedHeaderLooksSimple && - !generalizedLoopAddresses.contains(target) && - backwardVisitedTarget; + pendingGeneralization || + (backwardVisitedTarget && canGeneralizeStructuredLoopHeader(target)); if (wantsGeneralization) { if (currentPathSolveContext == PathSolveContext::DirectJump) { stackBypassGeneralizedLoopAddresses.insert(target); } const bool generalizedBackup = stackBypassGeneralizedLoopAddresses.contains(target); - if (pendingLoopGeneralizationAddresses.contains(target) && - it != addrToBB.end() && it->second && it->second->empty()) { + if (pendingGeneralization && it != addrToBB.end() && it->second && + it->second->empty()) { return {it->second, false, generalizedBackup}; } - pendingLoopGeneralizationAddresses.insert(target); + if (!pendingGeneralization) { + pendingLoopGeneralizationAddresses.insert(target); + } if (it != addrToBB.end() && it->second && !it->second->empty()) { return {replaceWithGeneralizedLoopBlock(target, name), false, generalizedBackup}; diff --git a/lifter/core/LifterClass.hpp b/lifter/core/LifterClass.hpp index 4181f92..c05aa0a 100644 --- a/lifter/core/LifterClass.hpp +++ b/lifter/core/LifterClass.hpp @@ -503,11 +503,49 @@ public: bool currentBlockUsesGeneralizedLoopState() const { return currentBlockRestoreMode == BlockRestoreMode::GeneralizedLoop; } - bool currentPathSolveAllowsLoopGeneralization() const { - // Disabled until the generalized loop heuristics can preserve real VMP exit - // paths without collapsing required 3.8.x targets into non-terminating IR. 
+ bool currentPathSolveAllowsStructuredLoopGeneralization() const { + return currentPathSolveContext == PathSolveContext::ConditionalBranch || + currentPathSolveContext == PathSolveContext::DirectJump; + } + bool isStructuredLoopHeaderShape(BasicBlock* block) const { // True when block, or a chain of at most 7 unconditional jumps from it, ends in a conditional branch. + std::set<BasicBlock*> seenBlocks; // a repeated block means the jump chain cycles + auto* current = block; + for (unsigned depth = 0; current && depth < 8; ++depth) { + if (!seenBlocks.insert(current).second || current->empty()) { + return false; + } + const size_t maxPreds = depth == 0 ? 2 : 1; // the header may have two predecessors, chained blocks only one + if (llvm::pred_size(current) > maxPreds) { + return false; + } + auto* branch = llvm::dyn_cast_or_null<llvm::BranchInst>(current->getTerminator()); // getTerminator() is null while a block has no terminator yet + if (!branch) { + return false; + } + if (branch->isConditional()) { + return true; + } + if (branch->getNumSuccessors() != 1) { + return false; + } + current = branch->getSuccessor(0); + } return false; } + bool canGeneralizeStructuredLoopHeader(uint64_t addr) { + if (getControlFlow() != ControlFlow::Unflatten || + !currentPathSolveAllowsStructuredLoopGeneralization() || + addr > blockInfo.block_address || !visitedAddresses.contains(addr) || + pendingLoopGeneralizationAddresses.contains(addr) || + generalizedLoopAddresses.contains(addr)) { + return false; + } + auto it = addrToBB.find(addr); + if (it == addrToBB.end() || !it->second || it->second->empty()) { + return false; + } + return isStructuredLoopHeaderShape(it->second); + } void liftBasicBlockFromAddress(uint64_t addr) { ++liftStats.blocks_attempted; @@ -757,7 +795,7 @@ public: BasicBlock* getLiftedBackedgeBB(uint64_t addr) { if (getControlFlow() != ControlFlow::Unflatten || - !currentPathSolveAllowsLoopGeneralization()) { + !currentPathSolveAllowsStructuredLoopGeneralization()) { return nullptr; } if (addr > blockInfo.block_address || diff --git a/lifter/test/Tester.hpp b/lifter/test/Tester.hpp index 1b4c79e..4516265 100644 --- a/lifter/test/Tester.hpp +++ b/lifter/test/Tester.hpp @@ -412,11 +412,24 @@ private: } - bool runLoopGeneralizationDirectJumpBlocked(std::string& details) { + 
bool runLoopGeneralizationConditionalBranchAllowed(std::string& details) { + LifterUnderTest lifter; + lifter.currentPathSolveContext = + LifterUnderTest::PathSolveContext::ConditionalBranch; + if (!lifter.currentPathSolveAllowsStructuredLoopGeneralization()) { + details = + " conditional-branch loop context should allow structured loop-header reuse\n"; + return false; + } + return true; + } + + bool runLoopGeneralizationDirectJumpAllowed(std::string& details) { LifterUnderTest lifter; lifter.currentPathSolveContext = LifterUnderTest::PathSolveContext::DirectJump; - if (lifter.currentPathSolveAllowsLoopGeneralization()) { - details = " direct-jump loop context must stay disabled until VMP-safe generalization exists\n"; + if (!lifter.currentPathSolveAllowsStructuredLoopGeneralization()) { + details = + " direct-jump latch context should allow structured loop-header reuse\n"; return false; } return true; @@ -424,9 +437,170 @@ private: bool runLoopGeneralizationIndirectJumpBlocked(std::string& details) { LifterUnderTest lifter; - lifter.currentPathSolveContext = LifterUnderTest::PathSolveContext::IndirectJump; - if (lifter.currentPathSolveAllowsLoopGeneralization()) { - details = " indirect-jump dispatcher context must not generalize loop state\n"; + lifter.currentPathSolveContext = + LifterUnderTest::PathSolveContext::IndirectJump; + if (lifter.currentPathSolveAllowsStructuredLoopGeneralization()) { + details = + " indirect-jump dispatcher context must not generalize loop state\n"; + return false; + } + return true; + } + + bool runLoopGeneralizationRetBlocked(std::string& details) { + LifterUnderTest lifter; + lifter.currentPathSolveContext = LifterUnderTest::PathSolveContext::Ret; + if (lifter.currentPathSolveAllowsStructuredLoopGeneralization()) { + details = " return-path loop context must not generalize loop state\n"; + return false; + } + return true; + } + + bool runStructuredLoopHeaderAllowsConditionalBackedge(std::string& details) { + LifterUnderTest lifter; 
+ lifter.currentPathSolveContext = + LifterUnderTest::PathSolveContext::ConditionalBranch; + + auto* current = llvm::BasicBlock::Create(lifter.context, "current", lifter.fnc); + auto* header = llvm::BasicBlock::Create(lifter.context, "loop_header", lifter.fnc); + auto* body = llvm::BasicBlock::Create(lifter.context, "loop_body", lifter.fnc); + auto* exit = llvm::BasicBlock::Create(lifter.context, "loop_exit", lifter.fnc); + + llvm::IRBuilder<> currentBuilder(current); + currentBuilder.CreateBr(header); + + llvm::IRBuilder<> headerBuilder(header); + headerBuilder.CreateCondBr(llvm::ConstantInt::getTrue(lifter.context), body, exit); + + llvm::IRBuilder<> bodyBuilder(body); + bodyBuilder.CreateRet(llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 0)); + + llvm::IRBuilder<> exitBuilder(exit); + exitBuilder.CreateRet(llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 1)); + + lifter.blockInfo = BBInfo(0x2000, current); + lifter.visitedAddresses.insert(0x1000); + lifter.addrToBB[0x1000] = header; + + if (!lifter.canGeneralizeStructuredLoopHeader(0x1000)) { + details = " visited conditional loop header should be eligible for structured reuse\n"; + return false; + } + return true; + } + + bool runStructuredLoopHeaderAllowsJumpChain(std::string& details) { + LifterUnderTest lifter; + lifter.currentPathSolveContext = + LifterUnderTest::PathSolveContext::DirectJump; + + auto* current = llvm::BasicBlock::Create(lifter.context, "current", lifter.fnc); + auto* trampoline = + llvm::BasicBlock::Create(lifter.context, "loop_trampoline", lifter.fnc); + auto* header = llvm::BasicBlock::Create(lifter.context, "loop_header", lifter.fnc); + auto* body = llvm::BasicBlock::Create(lifter.context, "loop_body", lifter.fnc); + auto* exit = llvm::BasicBlock::Create(lifter.context, "loop_exit", lifter.fnc); + + llvm::IRBuilder<> currentBuilder(current); + currentBuilder.CreateBr(trampoline); + + llvm::IRBuilder<> trampolineBuilder(trampoline); + 
trampolineBuilder.CreateBr(header); + + llvm::IRBuilder<> headerBuilder(header); + headerBuilder.CreateCondBr(llvm::ConstantInt::getTrue(lifter.context), body, exit); + + llvm::IRBuilder<> bodyBuilder(body); + bodyBuilder.CreateRet( + llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 0)); + + llvm::IRBuilder<> exitBuilder(exit); + exitBuilder.CreateRet( + llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 1)); + + lifter.blockInfo = BBInfo(0x2000, current); + lifter.visitedAddresses.insert(0x1000); + lifter.addrToBB[0x1000] = trampoline; + + if (!lifter.canGeneralizeStructuredLoopHeader(0x1000)) { + details = + " direct-jump trampoline chain into a conditional header should be eligible for structured reuse\n"; + return false; + } + return true; + } + + + bool runStructuredLoopHeaderRejectsNonConditionalTerminator( + std::string& details) { + LifterUnderTest lifter; + lifter.currentPathSolveContext = + LifterUnderTest::PathSolveContext::ConditionalBranch; + + auto* current = llvm::BasicBlock::Create(lifter.context, "current", lifter.fnc); + auto* header = llvm::BasicBlock::Create(lifter.context, "loop_header", lifter.fnc); + auto* exit = llvm::BasicBlock::Create(lifter.context, "loop_exit", lifter.fnc); + + llvm::IRBuilder<> currentBuilder(current); + currentBuilder.CreateBr(header); + + llvm::IRBuilder<> headerBuilder(header); + headerBuilder.CreateBr(exit); + + llvm::IRBuilder<> exitBuilder(exit); + exitBuilder.CreateRet(llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 0)); + + lifter.blockInfo = BBInfo(0x2000, current); + lifter.visitedAddresses.insert(0x1000); + lifter.addrToBB[0x1000] = header; + + if (lifter.canGeneralizeStructuredLoopHeader(0x1000)) { + details = " non-conditional header must not be treated as a structured loop header\n"; + return false; + } + return true; + } + + bool runStructuredLoopHeaderRejectsMultiplePredecessors(std::string& details) { + LifterUnderTest lifter; + lifter.currentPathSolveContext = + 
LifterUnderTest::PathSolveContext::ConditionalBranch; + + auto* current = llvm::BasicBlock::Create(lifter.context, "current", lifter.fnc); + auto* alternate = llvm::BasicBlock::Create(lifter.context, "alternate", lifter.fnc); + auto* third = llvm::BasicBlock::Create(lifter.context, "third", lifter.fnc); + auto* header = llvm::BasicBlock::Create(lifter.context, "loop_header", lifter.fnc); + auto* body = llvm::BasicBlock::Create(lifter.context, "loop_body", lifter.fnc); + auto* exit = llvm::BasicBlock::Create(lifter.context, "loop_exit", lifter.fnc); + + llvm::IRBuilder<> currentBuilder(current); + currentBuilder.CreateBr(header); + + llvm::IRBuilder<> alternateBuilder(alternate); + alternateBuilder.CreateBr(header); + + llvm::IRBuilder<> thirdBuilder(third); + thirdBuilder.CreateBr(header); + + llvm::IRBuilder<> headerBuilder(header); + headerBuilder.CreateCondBr(llvm::ConstantInt::getTrue(lifter.context), body, exit); + + llvm::IRBuilder<> bodyBuilder(body); + bodyBuilder.CreateRet( + llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 0)); + + llvm::IRBuilder<> exitBuilder(exit); + exitBuilder.CreateRet( + llvm::ConstantInt::get(llvm::Type::getInt64Ty(lifter.context), 1)); + + lifter.blockInfo = BBInfo(0x2000, current); + lifter.visitedAddresses.insert(0x1000); + lifter.addrToBB[0x1000] = header; + + if (lifter.canGeneralizeStructuredLoopHeader(0x1000)) { + details = + " header with more than two predecessors must not be generalized as a structured loop\n"; return false; } return true; @@ -537,10 +711,23 @@ private: &InstructionTester::runScasRepeatPrefixesRejected); runCustom("loop_addrsize_override_rejected", &InstructionTester::runLoopAddressSizeOverrideRejected); - runCustom("loop_generalization_direct_jump_blocked", - &InstructionTester::runLoopGeneralizationDirectJumpBlocked); + runCustom("loop_generalization_conditional_branch_allowed", + &InstructionTester::runLoopGeneralizationConditionalBranchAllowed); + 
runCustom("loop_generalization_direct_jump_allowed", + &InstructionTester::runLoopGeneralizationDirectJumpAllowed); runCustom("loop_generalization_indirect_jump_blocked", &InstructionTester::runLoopGeneralizationIndirectJumpBlocked); + runCustom("loop_generalization_ret_blocked", + &InstructionTester::runLoopGeneralizationRetBlocked); + runCustom("structured_loop_header_allows_conditional_backedge", + &InstructionTester::runStructuredLoopHeaderAllowsConditionalBackedge); + runCustom("structured_loop_header_allows_jump_chain", + &InstructionTester::runStructuredLoopHeaderAllowsJumpChain); + + runCustom("structured_loop_header_rejects_non_conditional_terminator", + &InstructionTester::runStructuredLoopHeaderRejectsNonConditionalTerminator); + runCustom("structured_loop_header_rejects_multiple_predecessors", + &InstructionTester::runStructuredLoopHeaderRejectsMultiplePredecessors); runCustom("generalized_loop_without_bypass_tag_keeps_normal_restore", &InstructionTester::runGeneralizedLoopWithoutBypassTagKeepsNormalRestore); runCustom("generalized_loop_with_bypass_tag_uses_generalized_restore", diff --git a/scripts/rewrite/instruction_microtests.json b/scripts/rewrite/instruction_microtests.json index b7b7a09..03e2796 100644 --- a/scripts/rewrite/instruction_microtests.json +++ b/scripts/rewrite/instruction_microtests.json @@ -100,9 +100,9 @@ "patterns": [ { "line_all": ["and i32", ", 1"] }, { "line_all": ["and i32", ", 7"] }, - "phi i64", - "select i1", - "icmp ugt i32", + "phi i32", + "add i32", + "sub i32", "br i1" ], "semantic": [ @@ -120,10 +120,10 @@ "patterns": [ { "line_all": ["and i32", ", 1"] }, { "line_all": ["and i32", ", 7"] }, - "select i1", - "phi i64", - "icmp ugt i32", - "br i1" + "phi i32", + "add i32", + "sub i32", + "ret i64 %common.ret.op" ], "semantic": [ { "inputs": { "RCX": 0 }, "expected": 40, "label": "even program returns constant handler" }, @@ -137,17 +137,14 @@ { "name": "stack_vm_loop", "symbol": "stack_vm_loop_target", - "skip": true, - 
"skip_reason": "Safe VMP-mode lifting currently disables loop-header generalization; this stack-based VM loop still exceeds the block budget without it.", "patterns": [ { "line_all": ["and i32", ", 1"] }, { "line_all": ["and i32", ", 7"] }, - "switch i32", "phi i32", - "sub i32", - "select i1", - "add i32", - "ret i64" + "add nsw i32", + "mul i33", + "lshr exact i33", + "ret i64 %common.ret.op" ], "semantic": [ { "inputs": { "RCX": 0 }, "expected": 40, "label": "even program returns constant handler" }, @@ -265,8 +262,6 @@ { "name": "calc_sum_to_n", "symbol": "calc_sum_to_n", - "skip": true, - "skip_reason": "Safe VMP-mode lifting currently disables loop-header generalization; this counted-loop sample still explodes the block budget without it.", "patterns": ["phi i32", "icmp slt i32", "add i32", "br i1"], "semantic": [ { "inputs": { "RCX": 0 }, "expected": 0, "label": "n=0" },