Files
Mergen/scripts/rewrite/build_full_handler_seed.py
naci b2699fb3a8 lifter: model XGETBV deterministically (#107)
* lifter: model XGETBV deterministically

Add XGETBV opcode support and model selector 0 as a deterministic XCR0 value (0x7: x87+SSE+AVX enabled), with zero returned for other selectors. This follows the existing CPUID deterministic-model approach for static lifting/deobfuscation.

Verification:

- build_iced lifter rewrite_microtests

- rewrite_microtests.exe xgetbv_returns_deterministic_xcr0 int29_fastfail_lowered_to_noreturn_call solve_path_widens_mapped_rva_target normalize_runtime_target_widens_mapped_rva_target

- python test.py quick

- python test.py vmp

* rewrite: seed deterministic XGETBV handler

The XGETBV semantics patch is deterministic by design, so the full-handler oracle pipeline must not use Unicorn's host-specific result. Add a manual handler seed entry for xgetbv bytes and computed expected outputs, then regenerate the enriched seed and oracle vectors to match the lifter model (selector 0 -> EAX=0x7, EDX=0).

Verification:

- scripts\rewrite\run_all_handlers.cmd

- python test.py quick

- python test.py vmp

---------

Co-authored-by: yusufcanislek <yusuf.canislek@meetdandy.com>
2026-04-20 19:29:01 +03:00

597 lines
21 KiB
Python

#!/usr/bin/env python3
import argparse
import json
import random
import re
from pathlib import Path
from typing import Dict, List, Optional, Set
from capstone import Cs, CS_ARCH_X86, CS_MODE_64
# Alternate spellings of condition-code mnemonics mapped onto the single
# spelling used as the lookup key throughout this script. Mnemonics that are
# not listed here are left unchanged by normalize_mnemonic().
MNEMONIC_ALIAS = {
    # Conditional jumps
    "jae": "jnb",
    "jnae": "jb",
    "jna": "jbe",
    "ja": "jnbe",
    "jnge": "jl",
    "jge": "jnl",
    "jg": "jnle",
    "jng": "jle",
    "jpe": "jp",
    "jpo": "jnp",
    "je": "jz",
    "jne": "jnz",
    # Conditional byte sets
    "sete": "setz",
    "setne": "setnz",
    "setae": "setnb",
    "setna": "setbe",
    "seta": "setnbe",
    "setge": "setnl",
    "setg": "setnle",
    "setpe": "setp",
    "setpo": "setnp",
    # Conditional moves
    "cmovae": "cmovnb",
    "cmovna": "cmovbe",
    "cmova": "cmovnbe",
    "cmovge": "cmovnl",
    "cmovg": "cmovnle",
    "cmovpe": "cmovp",
    "cmovpo": "cmovnp",
}
# Hand-written smoke cases, keyed by handler name. A handler listed here is
# not sent through Capstone auto-discovery; main() emits the case from this
# table instead. An optional "initial" entry is merged over DEFAULT_INITIAL,
# and entries without one run with the defaults only.
MANUAL_HANDLER_CASES = {
    # ---- Multiply / divide ----
    "imul2": {
        "mnemonic": "imul",
        "instruction_bytes": [0x48, 0xF7, 0xE9],  # imul rcx
        "initial": {
            "registers": {"RAX": "0x7", "RDX": "0x0", "RCX": "0x3"},
            "flags": {},
        },
    },
    "mul2": {
        "mnemonic": "mul",
        "instruction_bytes": [0x48, 0xF7, 0xE1],  # mul rcx
        "initial": {
            "registers": {"RAX": "0x7", "RDX": "0x0", "RCX": "0x3"},
            "flags": {},
        },
    },
    "div2": {
        "mnemonic": "div",
        "instruction_bytes": [0x48, 0xF7, 0xF1],  # div rcx
        "initial": {
            "registers": {"RAX": "0x10", "RDX": "0x0", "RCX": "0x2"},
            "flags": {},
        },
    },
    "idiv2": {
        "mnemonic": "idiv",
        "instruction_bytes": [0x48, 0xF7, 0xF9],  # idiv rcx
        "initial": {
            "registers": {"RAX": "0x10", "RDX": "0x0", "RCX": "0x2"},
            "flags": {},
        },
    },
    "idiv": {
        "mnemonic": "idiv",
        "instruction_bytes": [0xF7, 0xF9],  # idiv ecx (32-bit)
        "initial": {
            "registers": {"RAX": "0x10", "RDX": "0x0", "RCX": "0x3"},
            "flags": {},
        },
    },
    # ---- BMI1 lowest-set-bit ops (VEX-encoded) ----
    "blsi": {
        "mnemonic": "blsi",
        "instruction_bytes": [0xC4, 0xE2, 0x78, 0xF3, 0xD9],  # blsi eax, ecx
    },
    "blsmsk": {
        "mnemonic": "blsmsk",
        "instruction_bytes": [0xC4, 0xE2, 0x78, 0xF3, 0xD1],  # blsmsk eax, ecx
    },
    "blsr": {
        "mnemonic": "blsr",
        "instruction_bytes": [0xC4, 0xE2, 0x78, 0xF3, 0xC9],  # blsr eax, ecx
    },
    # ---- Conditional jumps (short-form jcc rel8, offset=+0x10) ----
    # Each uses initial flags designed to make the branch TAKEN.
    "jz": {
        "mnemonic": "je",
        "instruction_bytes": [0x74, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_ZF": 1}},
    },
    "jnz": {
        "mnemonic": "jne",
        "instruction_bytes": [0x75, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_ZF": 0}},
    },
    "jb": {
        "mnemonic": "jb",
        "instruction_bytes": [0x72, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_CF": 1}},
    },
    "jnb": {
        "mnemonic": "jae",
        "instruction_bytes": [0x73, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_CF": 0}},
    },
    "jbe": {
        "mnemonic": "jbe",
        "instruction_bytes": [0x76, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_CF": 1, "FLAG_ZF": 0}},
    },
    "jnbe": {
        "mnemonic": "ja",
        "instruction_bytes": [0x77, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_CF": 0, "FLAG_ZF": 0}},
    },
    "jl": {
        "mnemonic": "jl",
        "instruction_bytes": [0x7C, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_SF": 1, "FLAG_OF": 0}},
    },
    "jnl": {
        "mnemonic": "jge",
        "instruction_bytes": [0x7D, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_SF": 0, "FLAG_OF": 0}},
    },
    "jle": {
        "mnemonic": "jle",
        "instruction_bytes": [0x7E, 0x10],
        "initial": {
            "registers": {},
            "flags": {"FLAG_SF": 1, "FLAG_OF": 0, "FLAG_ZF": 0},
        },
    },
    "jnle": {
        "mnemonic": "jg",
        "instruction_bytes": [0x7F, 0x10],
        "initial": {
            "registers": {},
            "flags": {"FLAG_SF": 0, "FLAG_OF": 0, "FLAG_ZF": 0},
        },
    },
    "js": {
        "mnemonic": "js",
        "instruction_bytes": [0x78, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_SF": 1}},
    },
    "jns": {
        "mnemonic": "jns",
        "instruction_bytes": [0x79, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_SF": 0}},
    },
    "jo": {
        "mnemonic": "jo",
        "instruction_bytes": [0x70, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_OF": 1}},
    },
    "jno": {
        "mnemonic": "jno",
        "instruction_bytes": [0x71, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_OF": 0}},
    },
    "jp": {
        "mnemonic": "jp",
        "instruction_bytes": [0x7A, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_PF": 1}},
    },
    "jnp": {
        "mnemonic": "jnp",
        "instruction_bytes": [0x7B, 0x10],
        "initial": {"registers": {}, "flags": {"FLAG_PF": 0}},
    },
    "movs_x": {
        "mnemonic": "movsq",
        "instruction_bytes": [0x48, 0xA5],  # movsq
        "initial": {
            "registers": {
                "RSI": "0x2000",
                "RDI": "0x3000",
            },
            "flags": {"FLAG_DF": 0},
        },
    },
    # ---- Stack ops ----
    "push": {
        "mnemonic": "push",
        "instruction_bytes": [0x50],  # push rax
    },
    "pop": {
        "mnemonic": "pop",
        "instruction_bytes": [0x58],  # pop rax
    },
    "pushfq": {
        "mnemonic": "pushfq",
        "instruction_bytes": [0x9C],
    },
    "popfq": {
        "mnemonic": "popfq",
        "instruction_bytes": [0x9D],
    },
    "leave": {
        "mnemonic": "leave",
        "instruction_bytes": [0xC9],
        "initial": {
            "registers": {"RBP": "0x200000"},
            "flags": {},
        },
    },
    # Selector 0 (RCX = 0) is the case seeded for XGETBV.
    "xgetbv": {
        "mnemonic": "xgetbv",
        "instruction_bytes": [0x0F, 0x01, 0xD0],
        "initial": {
            "registers": {"RCX": "0x0"},
            "flags": {},
        },
    },
    # ---- Control flow ----
    "call": {
        "mnemonic": "call",
        "instruction_bytes": [0xE8, 0x10, 0x00, 0x00, 0x00],  # call +0x10
    },
    "ret": {
        "mnemonic": "ret",
        "instruction_bytes": [0xC3],
        # RSP must NOT equal STACKP_VALUE (0x14FEA0) to avoid real-return path
        "initial": {
            "registers": {"RSP": "0x14FF00"},
            "flags": {},
        },
    },
    "jmp": {
        "mnemonic": "jmp",
        "instruction_bytes": [0xEB, 0x10],  # jmp +0x10 (short)
    },
    # ---- String ops ----
    "stosx": {
        "mnemonic": "stosq",
        "instruction_bytes": [0x48, 0xAB],  # stosq
        "initial": {
            "registers": {
                "RDI": "0x3000",
                "RAX": "0x1122334455667788",
            },
            "flags": {"FLAG_DF": 0},
        },
    },
    # ---- System flag ops ----
    "cli": {
        "mnemonic": "cli",
        "instruction_bytes": [0xFA],
    },
}
# Additional test-case variants for handlers that need both-direction testing.
# Each maps handler_name -> list of {suffix, mnemonic, instruction_bytes, initial}.
# The suffix is appended to the case name, e.g. smoke_jz_je_notaken.
# Variants are only emitted for handlers that also have a MANUAL_HANDLER_CASES
# entry (main() checks this table inside the manual-case branch).
VARIANT_HANDLER_CASES: Dict[str, list] = {
    # ---- jcc "not-taken" variants: flags designed so branch is NOT taken ----
    "jz": [
        {"suffix": "notaken", "mnemonic": "je", "instruction_bytes": [0x74, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_ZF": 0}}},
    ],
    "jnz": [
        {"suffix": "notaken", "mnemonic": "jne", "instruction_bytes": [0x75, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_ZF": 1}}},
    ],
    "jb": [
        {"suffix": "notaken", "mnemonic": "jb", "instruction_bytes": [0x72, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_CF": 0}}},
    ],
    "jnb": [
        {"suffix": "notaken", "mnemonic": "jae", "instruction_bytes": [0x73, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_CF": 1}}},
    ],
    "jbe": [
        {"suffix": "notaken", "mnemonic": "jbe", "instruction_bytes": [0x76, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_CF": 0, "FLAG_ZF": 0}}},
    ],
    "jnbe": [
        {"suffix": "notaken", "mnemonic": "ja", "instruction_bytes": [0x77, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_CF": 1, "FLAG_ZF": 0}}},
    ],
    "jl": [
        {"suffix": "notaken", "mnemonic": "jl", "instruction_bytes": [0x7C, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_SF": 0, "FLAG_OF": 0}}},
    ],
    "jnl": [
        {"suffix": "notaken", "mnemonic": "jge", "instruction_bytes": [0x7D, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_SF": 1, "FLAG_OF": 0}}},
    ],
    "jle": [
        {"suffix": "notaken", "mnemonic": "jle", "instruction_bytes": [0x7E, 0x10],
         "initial": {"registers": {},
                     "flags": {"FLAG_SF": 0, "FLAG_OF": 0, "FLAG_ZF": 0}}},
    ],
    "jnle": [
        {"suffix": "notaken", "mnemonic": "jg", "instruction_bytes": [0x7F, 0x10],
         "initial": {"registers": {},
                     "flags": {"FLAG_SF": 1, "FLAG_OF": 0, "FLAG_ZF": 0}}},
    ],
    "js": [
        {"suffix": "notaken", "mnemonic": "js", "instruction_bytes": [0x78, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_SF": 0}}},
    ],
    "jns": [
        {"suffix": "notaken", "mnemonic": "jns", "instruction_bytes": [0x79, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_SF": 1}}},
    ],
    "jo": [
        {"suffix": "notaken", "mnemonic": "jo", "instruction_bytes": [0x70, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_OF": 0}}},
    ],
    "jno": [
        {"suffix": "notaken", "mnemonic": "jno", "instruction_bytes": [0x71, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_OF": 1}}},
    ],
    "jp": [
        {"suffix": "notaken", "mnemonic": "jp", "instruction_bytes": [0x7A, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_PF": 0}}},
    ],
    "jnp": [
        {"suffix": "notaken", "mnemonic": "jnp", "instruction_bytes": [0x7B, 0x10],
         "initial": {"registers": {}, "flags": {"FLAG_PF": 1}}},
    ],
}
# Instruction byte overrides for handlers whose auto-discovered encodings
# use registers outside the default initial set (RAX, RBX, RCX, RDX).
# Each maps handler_name -> [byte, ...]. The default initial state is used.
INSTRUCTION_OVERRIDES: Dict[str, list] = {
    # Increment/decrement and bit-scan/bit-test ops
    "dec": [0xFF, 0xC9],  # dec ecx
    "bsr": [0x0F, 0xBD, 0xC3],  # bsr eax, ebx
    "btc": [0x0F, 0xBB, 0xC8],  # btc eax, ecx
    "btr": [0x0F, 0xB3, 0xC8],  # btr eax, ecx
    "bts": [0x0F, 0xAB, 0xC8],  # bts eax, ecx
    # Shifts by an immediate count of 1
    "sar": [0xC0, 0xF8, 0x01],  # sar al, 1 (stable OF semantics)
    "shl": [0xC0, 0xE0, 0x01],  # shl al, 1 (stable OF semantics)
    "shr": [0xC0, 0xE8, 0x01],  # shr al, 1 (stable OF semantics)
    # VEX-encoded BMI ops
    "andn": [0xC4, 0xE2, 0x70, 0xF2, 0xC2],  # andn eax, ecx, edx
    "bextr": [0xC4, 0xE2, 0x70, 0xF7, 0xC2],  # bextr eax, edx, ecx
    "bzhi": [0xC4, 0xE2, 0x70, 0xF5, 0xC2],  # bzhi eax, edx, ecx
    "pext": [0xC4, 0xE2, 0x72, 0xF5, 0xC2],  # pext eax, ecx, edx
    # Address computation
    "lea": [0x8D, 0x04, 0x11],  # lea eax, [rcx+rdx]
}
# Handlers whose generated cases are marked "skip" instead of being run
# (build_smoke_case records them as a known-crashing handler path).
# Currently empty.
SKIP_RUN_HANDLERS: Set[str] = set()

# Baseline machine state for every generated case; per-case "initial"
# overrides are layered on top of a copy of this by _merge_initial().
DEFAULT_INITIAL = {
    "registers": {
        "RAX": "0x1122334455667788",
        "RBX": "0x8877665544332211",
        "RCX": "0x10",
        "RDX": "0x2",
    },
    "flags": {
        # Arithmetic/status flags start cleared.
        "FLAG_CF": 0,
        "FLAG_PF": 0,
        "FLAG_AF": 0,
        "FLAG_ZF": 0,
        "FLAG_SF": 0,
        "FLAG_OF": 0,
        # Direction flag cleared, interrupt flag set.
        "FLAG_DF": 0,
        "FLAG_IF": 1,
    },
}
def strip_comments(text: str) -> str:
    """Return *text* with C-style comments removed.

    Block comments (``/* ... */``, which may span several lines) are removed
    first; afterwards everything from ``//`` to the end of its line is
    removed. Newlines that terminate line comments are kept.
    """
    passes = (
        (r"/\*.*?\*/", re.DOTALL),  # block comments, non-greedy across lines
        (r"//.*", 0),  # line comments up to (not including) the newline
    )
    for pattern, flags in passes:
        text = re.sub(pattern, "", text, flags=flags)
    return text
def parse_handlers(opcode_file: Path) -> Dict[str, List[str]]:
    """Parse ``OPCODE(handler, mnemonic, ...)`` entries from *opcode_file*.

    The file is read as UTF-8 and comments are stripped before matching.
    Every comma-separated field is trimmed and lower-cased; empty fields are
    dropped. The first field is the handler name, the rest are its mnemonics.

    Returns:
        Mapping of handler name to its mnemonic list. If a handler name
        occurs more than once, the last occurrence wins.
    """
    source = strip_comments(opcode_file.read_text(encoding="utf-8"))
    table: Dict[str, List[str]] = {}
    for entry in re.finditer(r"OPCODE\((.*?)\)", source, flags=re.DOTALL):
        fields = [part.strip().lower() for part in entry.group(1).split(",")]
        fields = [field for field in fields if field]
        if fields:
            name, *mnemonics = fields
            table[name] = mnemonics
    return table
def normalize_mnemonic(raw: str) -> str:
    """Return the canonical spelling of mnemonic *raw*.

    The input is trimmed and lower-cased; if MNEMONIC_ALIAS has an entry for
    the result, the aliased spelling is returned, otherwise the cleaned
    input itself.
    """
    cleaned = raw.strip().lower()
    if cleaned in MNEMONIC_ALIAS:
        return MNEMONIC_ALIAS[cleaned]
    return cleaned
def load_base_seed(seed_path: Path) -> dict:
    """Load the base oracle seed from *seed_path* and validate its shape.

    Args:
        seed_path: Path of a UTF-8 JSON file.

    Returns:
        The decoded JSON object, unchanged.

    Raises:
        RuntimeError: if the top-level JSON value is not an object, if its
            "schema" is not "mergen-oracle-seed-v1", or if "cases" is not a
            list.
    """
    payload = json.loads(seed_path.read_text(encoding="utf-8"))
    # A top-level array/string/number has no .get(); report it as a
    # validation failure rather than letting an AttributeError escape.
    if not isinstance(payload, dict):
        raise RuntimeError("Base seed is not a JSON object")
    if payload.get("schema") != "mergen-oracle-seed-v1":
        raise RuntimeError("Base seed schema mismatch")
    if not isinstance(payload.get("cases"), list):
        raise RuntimeError("Base seed has invalid cases array")
    return payload
def decode_first_insn(md: Cs, blob: bytes):
    """Decode the first instruction of *blob* at address 0.

    Args:
        md: Disassembler whose ``disasm(code, offset, count)`` yields decoded
            instructions.
        blob: Raw bytes to decode.

    Returns:
        The first decoded instruction, or ``None`` when nothing decodes.
    """
    # count=1 stops decoding after the first instruction; the remaining
    # bytes of the window were previously decoded and then thrown away,
    # which matters because this runs once per probed offset.
    return next(iter(md.disasm(blob, 0, count=1)), None)
def register_sample(samples: Dict[str, Dict[str, object]], mnemonic: str, insn) -> None:
    """Record *insn* as the sample encoding for *mnemonic* if it is preferable.

    An instruction counts as having a memory operand when its operand string
    contains ``[``. The sample is stored when *mnemonic* has no sample yet,
    or when the stored sample has a memory operand and *insn* does not.
    Otherwise *samples* is left untouched.

    Args:
        samples: Mapping updated in place; values hold "instruction_bytes"
            (list of ints) and "has_memory_operand" (bool).
        mnemonic: Key under which the sample is stored.
        insn: Decoded instruction exposing ``op_str`` and ``bytes``.
    """
    uses_memory = "[" in insn.op_str
    encoding = list(insn.bytes)
    existing = samples.get(mnemonic)
    if existing is not None:
        # Keep the stored sample unless this one upgrades memory -> register.
        if uses_memory or not existing.get("has_memory_operand", True):
            return
    samples[mnemonic] = {
        "instruction_bytes": encoding,
        "has_memory_operand": uses_memory,
    }
def discover_mnemonic_samples(
    targets: Set[str], iterations: int, bytes_per_iteration: int, seed: int
) -> Dict[str, Dict[str, object]]:
    """Find sample encodings for *targets* by disassembling random bytes.

    Each round generates a pseudo-random blob (seeded, so runs are
    repeatable) and sweeps it with a 64-bit x86 Capstone disassembler,
    registering any decoded instruction whose normalized mnemonic was still
    unresolved at the start of that round.

    Args:
        targets: Normalized mnemonics to look for.
        iterations: Number of rounds; at least one round is always run.
        bytes_per_iteration: Blob size per round; never less than 1024.
        seed: Seed for the pseudo-random generator.

    Returns:
        Mapping of mnemonic to its sample record, as built by
        register_sample(). Mnemonics that were never seen are absent.
    """
    disassembler = Cs(CS_ARCH_X86, CS_MODE_64)
    disassembler.detail = False
    rng = random.Random(seed)
    found: Dict[str, Dict[str, object]] = {}
    rounds = max(1, iterations)
    blob_size = max(1024, bytes_per_iteration)
    for _ in range(rounds):
        # Snapshot taken once per round: a mnemonic first seen this round
        # can still be upgraded later in the same round.
        pending = targets.difference(found)
        if not pending:
            break
        blob = bytes(rng.getrandbits(8) for _ in range(blob_size))
        cursor = 0
        while cursor < len(blob):
            # 15-byte window; presumably the x86 maximum instruction length.
            insn = decode_first_insn(disassembler, blob[cursor : cursor + 15])
            step = 1  # undecodable byte: slide forward by one
            if insn is not None:
                canonical = normalize_mnemonic(insn.mnemonic)
                if canonical in pending:
                    register_sample(found, canonical, insn)
                step = max(1, insn.size)
            cursor += step
    return found
def targeted_prefix_discovery(
    samples: Dict[str, Dict[str, object]],
    targets: Set[str],
    seed: int,
) -> None:
    """Probe hand-picked opcode families for targets that are still missing.

    Returns immediately when every target already has a sample. Otherwise a
    series of candidate byte strings is decoded and every decoded instruction
    whose normalized mnemonic is in *targets* is offered to register_sample()
    (so existing samples may also be upgraded). *samples* is updated in
    place.

    Args:
        samples: Sample mapping to extend.
        targets: Normalized mnemonics of interest.
        seed: Base seed; XOR-ed with a constant to seed the random VEX probes.
    """
    if all(mn in samples for mn in targets):
        return
    disassembler = Cs(CS_ARCH_X86, CS_MODE_64)
    disassembler.detail = False
    rng = random.Random(seed ^ 0xA5A5A5A5)

    def consider(candidate: bytes) -> None:
        """Decode *candidate* and register it when it is a wanted mnemonic."""
        insn = decode_first_insn(disassembler, candidate)
        if insn is not None:
            canonical = normalize_mnemonic(insn.mnemonic)
            if canonical in targets:
                register_sample(samples, canonical, insn)

    # POPCNT/TZCNT/LZCNT families
    for opcode in (0xB8, 0xBC, 0xBD):
        for modrm in range(256):
            consider(bytes([0xF3, 0x0F, opcode, modrm]))

    # BLSI/BLSMSK/BLSR family
    for modrm in range(256):
        consider(bytes([0xF3, 0x0F, 0x38, 0xF3, modrm]))

    # BEXTR legacy-ish forms
    for prefix in (0xF2, 0xF3, 0x66):
        for modrm in range(256):
            for imm in (0x00, 0x01, 0x10, 0x20, 0x7F):
                consider(bytes([prefix, 0x0F, 0x38, 0xF7, modrm, imm]))

    # VEX-encoded probing for ANDN/BEXTR/BZHI/PEXT/PDEP family
    for opcode in (0xF2, 0xF5, 0xF7):
        for _ in range(300000):
            # Draw order (b1, b2, modrm, imm) fixes the probe sequence for a
            # given seed.
            b1, b2, modrm, imm = [rng.getrandbits(8) for _ in range(4)]
            vex_form = [0xC4, b1, b2, opcode, modrm]
            consider(bytes(vex_form))
            consider(bytes(vex_form + [imm]))
            # Stop as soon as every target has some sample.
            if all(mn in samples for mn in targets):
                return
def _merge_initial(initial: Optional[dict]) -> dict:
    """Return DEFAULT_INITIAL with the overrides from *initial* applied.

    The "registers" and "flags" sections are copied from DEFAULT_INITIAL, so
    the defaults themselves are never mutated. When *initial* is given, its
    non-empty "registers" / "flags" mappings are layered on top; keys not
    mentioned keep their default values.
    """
    sections = ("registers", "flags")
    state = {section: dict(DEFAULT_INITIAL[section]) for section in sections}
    if initial is not None:
        for section in sections:
            overrides = initial.get(section, {})
            if overrides:
                state[section].update(overrides)
    return state
def build_smoke_case(
    handler: str,
    mnemonic: str,
    instruction_bytes: List[int],
    initial: Optional[dict] = None,
    run_enabled: bool = True,
) -> dict:
    """Build one seed case dict named ``smoke_<handler>_<mnemonic>``.

    Args:
        handler: Handler name stored in the case.
        mnemonic: Mnemonic used in the case name.
        instruction_bytes: Encoded instruction, stored as given.
        initial: Optional state overrides merged over the defaults.
        run_enabled: When false, the case is flagged with "skip" and a
            "skip_reason".

    Returns:
        The case dict, with empty expected registers/flags and oracle
        "none".
    """
    smoke = {
        "name": f"smoke_{handler}_{mnemonic}",
        "handler": handler,
        "instruction_bytes": instruction_bytes,
        "initial": _merge_initial(initial),
        "expected": {"registers": {}, "flags": {}},
        "oracle": "none",
        "source": "capstone-auto-discovery",
    }
    if run_enabled:
        return smoke
    smoke["skip"] = True
    smoke["skip_reason"] = "known-crashing handler path"
    return smoke
def _handler_names(cases: List[dict]) -> Set[str]:
    """Return the trimmed, lower-cased, non-empty handler names in *cases*."""
    names = (str(case.get("handler", "")).strip().lower() for case in cases)
    return {name for name in names if name}


def main() -> None:
    """Generate the full-handler oracle seed file.

    Steps:
      1. Parse the handler -> mnemonics table from the opcode file.
      2. Load the base seed and determine which handlers it does not cover.
      3. Discover sample encodings for the mnemonics of those handlers,
         skipping handlers that have a MANUAL_HANDLER_CASES entry.
      4. Emit one smoke case per missing handler (plus any variants for
         manual handlers), append them to the base cases and write the
         result to the output seed path.

    Raises:
        RuntimeError: if a missing handler has neither a discovered sample
            nor an INSTRUCTION_OVERRIDES entry to fall back on.
    """
    parser = argparse.ArgumentParser(
        description="Build full handler seed using Capstone auto-discovery"
    )
    parser.add_argument("--opcode-file", default="lifter/semantics/x86_64_opcodes.x")
    parser.add_argument("--base-seed", default="scripts/rewrite/oracle_seed_vectors.json")
    parser.add_argument("--out-seed", default="scripts/rewrite/oracle_seed_full_handlers.json")
    parser.add_argument("--iterations", type=int, default=12)
    parser.add_argument("--bytes-per-iteration", type=int, default=2 * 1024 * 1024)
    parser.add_argument("--seed", type=int, default=1337)
    args = parser.parse_args()

    opcode_file = Path(args.opcode_file)
    base_seed_path = Path(args.base_seed)
    out_seed_path = Path(args.out_seed)

    handlers = parse_handlers(opcode_file)
    base_seed = load_base_seed(base_seed_path)
    covered_handlers = _handler_names(base_seed["cases"])
    # Sorted so the generated case order is stable between runs.
    missing_handlers = sorted(set(handlers.keys()) - covered_handlers)

    # Manual handlers carry their own bytes, so only the remaining handlers
    # contribute mnemonics to the discovery target set.
    target_mnemonics: Set[str] = set()
    for handler in missing_handlers:
        if handler in MANUAL_HANDLER_CASES:
            continue
        for mnemonic in handlers.get(handler, []):
            target_mnemonics.add(normalize_mnemonic(mnemonic))

    samples = discover_mnemonic_samples(
        targets=target_mnemonics,
        iterations=args.iterations,
        bytes_per_iteration=args.bytes_per_iteration,
        seed=args.seed,
    )
    targeted_prefix_discovery(samples, target_mnemonics, args.seed)

    auto_cases = []
    unresolved_handlers = []
    override_only_handlers = []
    for handler in missing_handlers:
        run_enabled = handler not in SKIP_RUN_HANDLERS

        if handler in MANUAL_HANDLER_CASES:
            manual = MANUAL_HANDLER_CASES[handler]
            auto_cases.append(
                build_smoke_case(
                    handler=handler,
                    mnemonic=manual["mnemonic"],
                    instruction_bytes=manual["instruction_bytes"],
                    initial=manual.get("initial"),
                    run_enabled=run_enabled,
                )
            )
            # Emit variant cases (e.g. not-taken jcc)
            for variant in VARIANT_HANDLER_CASES.get(handler, []):
                vcase = build_smoke_case(
                    handler=handler,
                    mnemonic=variant["mnemonic"],
                    instruction_bytes=variant["instruction_bytes"],
                    initial=variant.get("initial"),
                    run_enabled=run_enabled,
                )
                vcase["name"] += f"_{variant['suffix']}"
                auto_cases.append(vcase)
            continue

        # Walk the handler's mnemonics in order: stop at the first one with
        # a register-only sample, otherwise keep the last one that has any
        # sample at all.
        mnemonics = [normalize_mnemonic(m) for m in handlers.get(handler, [])]
        selected = None
        selected_mnemonic = None
        for normalized in mnemonics:
            sample = samples.get(normalized)
            if sample is None:
                continue
            selected = sample
            selected_mnemonic = normalized
            if not sample.get("has_memory_operand", True):
                break

        override = INSTRUCTION_OVERRIDES.get(handler)
        if selected is None or selected_mnemonic is None:
            # Discovery found nothing. An override already supplies the
            # bytes that would be used anyway, so fall back to it instead of
            # failing the whole run; the first listed mnemonic names the case.
            if override is None or not mnemonics:
                unresolved_handlers.append(handler)
                continue
            selected_mnemonic = mnemonics[0]
            insn_bytes = list(override)
            override_only_handlers.append(handler)
        elif override is not None:
            # Apply instruction byte override if available
            insn_bytes = list(override)
        else:
            insn_bytes = list(selected["instruction_bytes"])

        auto_cases.append(
            build_smoke_case(
                handler=handler,
                mnemonic=selected_mnemonic,
                instruction_bytes=insn_bytes,
                run_enabled=run_enabled,
            )
        )

    if unresolved_handlers:
        unresolved = ", ".join(unresolved_handlers)
        raise RuntimeError(
            f"Failed to discover instruction bytes for handlers: {unresolved}"
        )

    merged_cases = list(base_seed["cases"]) + auto_cases
    output = {
        "schema": "mergen-oracle-seed-v1",
        "base_seed": str(base_seed_path),
        "generator": "build_full_handler_seed.py",
        "cases": merged_cases,
    }
    out_seed_path.parent.mkdir(parents=True, exist_ok=True)
    out_seed_path.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")

    final_handlers = _handler_names(merged_cases)
    print(f"Generated full-handler seed: {out_seed_path}")
    print(f"Base cases: {len(base_seed['cases'])} | Auto smoke cases: {len(auto_cases)}")
    print(f"Handler coverage in seed: {len(final_handlers)} / {len(handlers)}")
    if override_only_handlers:
        fallback = ", ".join(override_only_handlers)
        print(f"Handlers seeded from overrides without a discovered sample: {fallback}")
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()