Files
Mergen/lifter/analysis/PathSolver.ipp
T
naci 20f80b672e lifter: prefer file-backed queued targets (#105)
The next Themida follow-up after PR #104 exposed another low-target alias problem: queued/control-flow destinations like 0x52532 were treated as already paged because the synthetic stack range covered them, so they never widened to their file-backed image RVA forms.

This patch splits target normalization into two policies: PathSolver keeps the broad paged normalization it needs for resolved loop/control-flow work, while getOrCreateBB/getUnvisitedAddr use a stricter file-backed normalization that prefers image-backed addresses over low stack aliases. It also queues the fake indirect-call return targets in Semantics.ipp so those destinations actually enter the worklist.

Observed effect on example2-virt.bin @ 0x140001000: 24 attempted / 1086 instructions -> 35 attempted / 1565 instructions, with new reached addresses in the 0x14001xxxx and 0x14002xxxx ranges.

Verification:

- build_iced lifter rewrite_microtests

- rewrite_microtests.exe solve_path_widens_mapped_rva_target normalize_runtime_target_widens_mapped_rva_target solve_load_infers_concrete_base_from_tracked_load generalized_loop_restore_merges_backedge_register_state

- python test.py quick

- python test.py vmp

- build_iced\lifter.exe ..\testthemida\example2-virt.bin 0x140001000

Co-authored-by: yusufcanislek <yusuf.canislek@meetdandy.com>
2026-04-19 19:31:41 +03:00

370 lines
14 KiB
C++

#pragma once
#include "CommonDisassembler.hpp"
#include "PathSolver.h"
#include "LifterClass.hpp"
#include "Utils.h"
#include <llvm/Analysis/AliasAnalysis.h>
#include <llvm/Analysis/MemorySSA.h>
#include <llvm/Analysis/MemorySSAUpdater.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstrTypes.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Value.h>
#include <llvm/IR/CFG.h>
#include <llvm/Support/Casting.h>
#include <limits>
// Resolves where control flow continues after the current block and emits the
// matching terminator through `builder`.
//
// Resolution is attempted in this order:
//   1. `simplifyValue` is a ConstantInt         -> unconditional branch.
//   2. getConstraintVal() reports PATH_solved   -> unconditional branch.
//   3. computePossibleValues(simplifyValue) enumerates the candidates:
//        exactly 1 -> unconditional branch,
//        exactly 2 -> conditional branch, both successors queued,
//        more      -> switch whose default block stays unresolved,
//        none      -> `ret undef` sentinel.
//
// @param function       Function being lifted; supplies the return type for
//                       the sentinel returns and the module for debug dumps.
// @param dest           Out-parameter. Receives the normalized destination for
//                       single-target results and is reset to 0 for the
//                       multi-target switch.
// @param simplifyValue  IR value holding the jump destination. Must be of
//                       integer type on the enumeration paths (its bit width
//                       is queried there).
// @return PATH_solved when a single destination was branched to,
//         PATH_multi_solved when a switch was emitted, PATH_unsolved otherwise
//         (this includes the two-way conditional branch), or whatever
//         getConstraintVal() returned when it reported PATH_solved.
MERGEN_LIFTER_DEFINITION_TEMPLATES(PATH_info)::solvePath(
    llvm::Function* function, uint64_t& dest, Value* simplifyValue) {
  auto pathSolveSample = profiler.sample("lift_path_solve");

  // Clear memoization cache for value enumeration.
  // Each solvePath invocation may have different assumptions
  // (from different branch paths), so cached results don't carry over.
  pv_cache.clear();

  auto normalizeTargetAddress = [&](uint64_t target) -> uint64_t {
    return normalizeRuntimeTargetAddress(target);
  };

  // Outcome of mapping a concrete target address to a basic block.
  struct ResolvedTargetBlock {
    BasicBlock* block;      // block to branch to
    bool reusedBackedge;    // already-lifted back-edge block: do not re-queue
    bool generalizedBackup; // forwarded to branch_backup() for this target
  };

  // Picks the block for a concrete `target`: an already lifted back-edge
  // block, a generalized loop header, or a plain getOrCreateBB() block.
  auto resolveTargetBlock = [&](uint64_t target, const std::string& name)
      -> ResolvedTargetBlock {
    if (auto* reused = getLiftedBackedgeBB(target)) {
      record_generalized_loop_backedge(reused);
      return {reused, true, false};
    }

    const bool backwardVisitedTarget =
        visitedAddresses.contains(target) &&
        target <= blockInfo.block_address;
    auto it = addrToBB.find(target);
    const bool hasPendingGeneralization =
        pendingLoopGeneralizationAddresses.contains(target);
    // `resolveTargetBlock` is only reached with a concrete destination, so an
    // indirect jump whose target has just resolved participates in the same
    // structured-loop generalization path that direct and conditional jumps
    // already take.
    const bool canUseStructuredLoopGeneralization =
        currentPathSolveAllowsStructuredLoopGeneralizationForResolvedTarget();
    const bool canReusePendingGeneralization =
        hasPendingGeneralization && canUseStructuredLoopGeneralization;
    const bool wantsGeneralization =
        canReusePendingGeneralization ||
        (backwardVisitedTarget &&
         canGeneralizeStructuredLoopHeader(target,
                                           /*targetResolvedConcretely=*/true));
    if (!wantsGeneralization) {
      return {getOrCreateBB(target, name), false, false};
    }

    // A resolved backward target participates in the same stack-concolic
    // bypass regime regardless of whether the source jump is direct or
    // indirect — both represent a confirmed loop back-edge.
    if (currentPathSolveContext == PathSolveContext::DirectJump ||
        currentPathSolveContext == PathSolveContext::IndirectJump) {
      stackBypassGeneralizedLoopAddresses.insert(target);
    }
    const bool generalizedBackup =
        canUseStructuredLoopGeneralization &&
        stackBypassGeneralizedLoopAddresses.contains(target);

    const bool hasMappedBlock = it != addrToBB.end() && it->second;
    // A pending generalization whose block is still empty is reused as-is.
    if (canReusePendingGeneralization && hasMappedBlock &&
        it->second->empty()) {
      return {it->second, false, generalizedBackup};
    }
    if (!hasPendingGeneralization) {
      pendingLoopGeneralizationAddresses.insert(target);
    }
    // A block that already has instructions is swapped for a generalized one.
    if (hasMappedBlock && !it->second->empty()) {
      return {replaceWithGeneralizedLoopBlock(target, name), false,
              generalizedBackup};
    }
    return {getOrCreateBB(target, name), false, generalizedBackup};
  };

  // Snapshots state for a queued successor; the abort block is never backed up.
  auto backupQueuedTarget = [&](BasicBlock* targetBlock,
                                bool generalizedBackup) {
    if (targetBlock == liftAbortBlock) {
      return;
    }
    branch_backup(targetBlock, generalizedBackup);
  };

  // Shared tail of every single-destination outcome: branch to the block for
  // the (already normalized) `dest` and, unless an existing back-edge block
  // was reused, make it the current block and queue it for lifting.
  auto branchToResolvedDest = [&](const std::string& name) {
    auto resolved = resolveTargetBlock(dest, name);
    builder->CreateBr(resolved.block);
    if (resolved.reusedBackedge) {
      return;
    }
    blockInfo = BBInfo(dest, resolved.block);
    printvalue2("pushing block");
    backupQueuedTarget(blockInfo.block, resolved.generalizedBackup);
    unvisitedBlocks.push_back(blockInfo);
  };

  // Best-effort debug dump of the whole module. A failed open leaves the
  // stream without a usable descriptor, so the dump is skipped with a
  // diagnostic instead of writing to it and turning a debug aid into a hard
  // failure.
  auto dumpModuleForDebug = [&](const std::string& Filename) {
    std::error_code EC;
    llvm::raw_fd_ostream OS(Filename, EC);
    if (EC) {
      std::cout << "[diag] could not open " << Filename << ": "
                << EC.message() << "\n"
                << std::flush;
      return;
    }
    function->getParent()->print(OS, nullptr);
  };

  // do static polymorphism here
  PATH_info result = PATH_unsolved;

  // 1) The destination is already a constant.
  if (llvm::ConstantInt* constInt =
          dyn_cast<llvm::ConstantInt>(simplifyValue)) {
    dest = normalizeTargetAddress(constInt->getZExtValue());
    run = 0;
    branchToResolvedDest("bb_solved_const");
    return PATH_solved;
  }

  // 2) The constraint solver pins the destination. Any other non-zero result
  //    only normalizes `dest` and falls through to value enumeration.
  if (PATH_info solved = getConstraintVal(function, simplifyValue, dest)) {
    dest = normalizeTargetAddress(dest);
    if (solved == PATH_solved) {
      run = 0;
      std::cout << "Solved the constraint and moving to next path\n"
                << std::flush;
      branchToResolvedDest("bb_solved");
      return solved;
    }
  }

  // 3) Unsolved: enumerate the values the destination may take.
  printvalue(simplifyValue);
  run = 0;
  auto pvset = computePossibleValues(simplifyValue);
  std::vector<APInt> pv(pvset.begin(), pvset.end());

  if (pv.size() == 1) {
    printvalue2(pv[0]);
    dest = normalizeTargetAddress(pv[0].getZExtValue());
    branchToResolvedDest("bb_single");
    return PATH_solved;
  }

  if (pv.size() == 2) {
    auto firstcase = pv[0];
    auto secondcase = pv[1];

    // If `simplifyv` is a select whose true operand equals `c1`, the select's
    // condition can be used directly as the branch condition.
    auto try_simplify = [&](APInt c1,
                            Value* simplifyv) -> std::optional<Value*> {
      if (auto si = dyn_cast<SelectInst>(simplifyv)) {
        auto firstcase_v = builder->getIntN(
            simplifyv->getType()->getIntegerBitWidth(), c1.getZExtValue());
        if (si->getTrueValue() == firstcase_v) {
          printvalue(si);
          printvalue(firstcase_v);
          return si->getCondition();
        }
      }
      return std::nullopt;
    };

    Value* condition = nullptr;
    // condition value is a kind of hack
    // 1- if its a select, we can extract the condition
    // 1a- if firstcase is in the select, extract the condition
    // 1b- if secondcase is in the select, extract the condition and reverse
    // values
    // 2- create a hacky compare for condition == potentialvalue
    printvalue2(firstcase);
    printvalue2(secondcase);
    if (auto can_simplify = try_simplify(firstcase, simplifyValue)) {
      printvalue2("b");
      condition = can_simplify.value();
    } else if (auto can_simplify2 = try_simplify(secondcase, simplifyValue)) {
      // TODO: fix?
      printvalue2("c");
      // The select's true operand is `secondcase`, so it becomes the
      // condition-true target.
      std::swap(firstcase, secondcase);
      condition = can_simplify2.value();
    } else {
      printvalue2("a");
      condition = createICMPFolder(
          llvm::CmpInst::ICMP_EQ, simplifyValue,
          builder->getIntN(simplifyValue->getType()->getIntegerBitWidth(),
                           firstcase.getZExtValue()));
    }
    printvalue2(firstcase);
    printvalue2(secondcase);

    const uint64_t firstTarget =
        normalizeTargetAddress(firstcase.getZExtValue());
    const uint64_t secondTarget =
        normalizeTargetAddress(secondcase.getZExtValue());
    auto trueTarget = resolveTargetBlock(firstTarget, "bb_true");
    auto falseTarget = resolveTargetBlock(secondTarget, "bb_false");
    auto* bb_true = trueTarget.block;
    auto* bb_false = falseTarget.block;

    printvalue(condition);
    auto BR = builder->CreateCondBr(condition, bb_true, bb_false);
    RegisterBranch(BR);
    printvalue2(firstcase);
    printvalue2(secondcase);

    // The false successor becomes the current block; the true successor is
    // queued as a separate path.
    blockInfo = BBInfo(secondTarget, bb_false);
    auto newblock = BBInfo(firstTarget, bb_true);
    printvalue(condition);

    // Constant conditions are already resolved — only track assumptions
    // for instruction-produced conditions that need runtime disambiguation.
    // The assumption is written right before each backup, so the false
    // successor is backed up with 0 recorded and the true successor with 1.
    auto* condInst = dyn_cast<Instruction>(condition);
    if (!falseTarget.reusedBackedge && blockInfo.block != liftAbortBlock) {
      if (condInst) {
        assumptions[condInst] = 0;
      }
      backupQueuedTarget(blockInfo.block, falseTarget.generalizedBackup);
      addUnvisitedAddr(blockInfo);
    }
    if (!trueTarget.reusedBackedge && newblock.block != liftAbortBlock) {
      if (condInst) {
        this->assumptions[condInst] = 1;
      }
      backupQueuedTarget(newblock.block, trueTarget.generalizedBackup);
      addUnvisitedAddr(newblock);
    }

    debugging::doIfDebug([&]() {
      dumpModuleForDebug("output_newpath.ll");
      std::cout << "created a new path\n" << std::flush;
    });
  }

  if (pv.size() > 2) {
    // N-way branch: emit SwitchInst for multi-target resolution.
    // Default must stay unresolved because computePossibleValues() is
    // heuristic.
    unsigned bitWidth = simplifyValue->getType()->getIntegerBitWidth();
    auto* bb_default_unresolved = createBudgetedBasicBlock(
        "bb_switch_default_unresolved", current_address);
    if (bb_default_unresolved == liftAbortBlock) {
      // No block could be obtained for the default: end the path here.
      builder->CreateRet(UndefValue::get(function->getReturnType()));
      return PATH_unsolved;
    }
    DTU->applyUpdates({{DominatorTree::Insert, this->blockInfo.block,
                        bb_default_unresolved}});
    auto* SI = builder->CreateSwitch(simplifyValue, bb_default_unresolved,
                                     static_cast<unsigned>(pv.size()));

    // Add every discovered target as an explicit case. `emittedTargets`
    // records every distinct normalized target, including ones skipped below
    // as unmapped.
    std::set<uint64_t> emittedTargets;
    size_t switchCaseIndex = 0;
    for (const auto& caseVal : pv) {
      const uint64_t normalizedTarget =
          normalizeTargetAddress(caseVal.getZExtValue());
      if (!emittedTargets.insert(normalizedTarget).second) {
        continue;
      }
      // computePossibleValues cross-products uncorrelated select branches,
      // which can produce spurious targets outside mapped memory. Skip them
      // rather than crashing when the lifter tries to decode bytes there.
      if (!isMemPaged(normalizedTarget)) {
        std::cout << "[diag] skipping unmapped switch target 0x"
                  << std::hex << normalizedTarget << std::dec << "\n"
                  << std::flush;
        continue;
      }
      auto bb_case = getOrCreateBB(
          normalizedTarget, "bb_switch_" + std::to_string(switchCaseIndex++));
      // NOTE(review): the case constant is built from the normalized target,
      // while the switch operand is the raw `simplifyValue` and the two-way
      // path above compares against the raw value. Confirm this is intended
      // for targets whose address is changed by normalization.
      SI->addCase(
          cast<ConstantInt>(builder->getIntN(bitWidth, normalizedTarget)),
          bb_case);
      auto caseBlock = BBInfo(normalizedTarget, bb_case);
      addUnvisitedAddr(caseBlock);
      branch_backup(caseBlock.block);
    }

    // Conservative fallback for values not enumerated in pv:
    // keep default path data-dependent instead of returning undef, which can
    // let later optimizations fold valid cases into arbitrary constants.
    llvm::IRBuilder<> defaultBuilder(bb_default_unresolved);
    Value* unresolvedRet = simplifyValue;
    if (unresolvedRet->getType() != function->getReturnType()) {
      if (unresolvedRet->getType()->isIntegerTy() &&
          function->getReturnType()->isIntegerTy()) {
        unresolvedRet = defaultBuilder.CreateZExtOrTrunc(
            unresolvedRet, function->getReturnType(),
            "switch_default_unresolved");
      } else {
        // No integer conversion available: fall back to undef.
        unresolvedRet = UndefValue::get(function->getReturnType());
      }
    }
    defaultBuilder.CreateRet(unresolvedRet);

    // Destination remains unknown for multi-target switches.
    dest = 0;
    result = PATH_multi_solved;

    debugging::doIfDebug([&]() {
      dumpModuleForDebug("output_switch.ll");
      std::cout << "created multi-target switch with " << emittedTargets.size()
                << " targets\n"
                << std::flush;
    });
  }

  if (pv.empty()) {
    // computePossibleValues exhausted its budget without resolving any
    // concrete targets. Emit an undef-return sentinel so the block has a
    // valid terminator; do NOT use unreachable, which would let LLVM erase
    // this reachable-but-unresolved path.
    builder->CreateRet(UndefValue::get(function->getReturnType()));
  }
  return result;
}