From 0add6b8c1f3a7ffe0bbb380dfaa2ead3037d879f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcin=20Miko=C5=82ajczyk?=
Date: Thu, 14 May 2026 05:33:50 +0200
Subject: [PATCH] Neo: packed math for unsigned/signed integers (#4407)

---
Review notes (placed after the "---" separator, so they are not part of the
commit message and do not affect any hunk):

* Summary of the patch: GetSrcPk gains an is_signed parameter and, for the
  integer case, extracts a 16-bit field from each half; a new SetDstPk helper
  applies the optional clamp output modifier, packs both halves with
  ir.Pack2x16 and stores the result through SetDst; ten packed 16-bit integer
  VOP3P opcodes are dispatched and translated; the existing V_PK_*_F16
  translators are switched to SetDstPk.

* Fix: V_PK_MUL_LO_U16 now builds result_lo / result_hi with ir.IMul. The
  previous revision of this patch used ir.IAdd for both halves, i.e. the same
  arithmetic as V_PK_ADD_U16, so a packed multiply was translated as a packed
  add. Only the content of two added lines changed; hunk line counts and the
  diffstat below are unchanged.

* NOTE(review): angle-bracketed text (template parameter / argument lists,
  the author address) is absent throughout this copy of the patch, and line
  breaks / indentation were reconstructed from the hunk headers. Presumably
  the text was stripped in transit; restore it from the original commit
  before applying.

* NOTE(review): the unsigned clamp path in SetDstPk only applies
  UMin(x, 0xFFFF). Please confirm this is the intended saturation for
  V_PK_SUB_U16 when the subtraction underflows.

 .../frontend/translate/translate.cpp          |  50 ++++++-
 .../frontend/translate/translate.h            |  14 +-
 .../frontend/translate/vector_alu.cpp         | 136 ++++++++++++++++--
 3 files changed, 185 insertions(+), 15 deletions(-)

diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index e29e43115..043c973c8 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -766,7 +766,7 @@ T Translator::GetSrc64(const InstOperand& operand) {
 template IR::U64 Translator::GetSrc64(const InstOperand&);
 template IR::F64 Translator::GetSrc64(const InstOperand&);
 
-template
+template
 pk_type Translator::GetSrcPk(const InstOperand& operand) {
     constexpr bool is_float = std::is_same_v;
 
@@ -794,7 +794,8 @@ pk_type Translator::GetSrcPk(const InstOperand& operand) {
         if constexpr (is_float) {
             return value;
         } else {
-            return ir.BitCast(value);
+            return ir.BitFieldExtract(ir.BitCast(value), ir.Imm32(0), ir.Imm32(16),
+                                      is_signed);
         }
     };
 
@@ -891,8 +892,9 @@ pk_type Translator::GetSrcPk(const InstOperand& operand) {
     return value;
 }
 
-template pk_type Translator::GetSrcPk(const InstOperand&);
-template pk_type Translator::GetSrcPk(const InstOperand&);
+template pk_type Translator::GetSrcPk(const InstOperand&);
+template pk_type Translator::GetSrcPk(const InstOperand&);
+template pk_type Translator::GetSrcPk(const InstOperand&);
 
 void Translator::SetDst1(const InstOperand& operand, const IR::U1& value) {
     switch (operand.field) {
@@ -1035,6 +1037,46 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
     }
 }
 
+template
+void Translator::SetDstPk(const InstOperand& operand, const pk_type& value) {
+    pk_type v = value;
+
+    if constexpr (std::is_same_v) {
+        if (operand.output_modifier.clamp) {
+            v = {ir.FPSaturate(v.first), ir.FPSaturate(v.second)};
+        }
+    } else {
+        if (operand.output_modifier.clamp) {
+            if constexpr (is_signed) {
+                auto lower = ir.Imm32(-32768);
+                auto upper = ir.Imm32(32767);
+                v = {ir.SClamp(v.first, lower, upper), ir.SClamp(v.second, lower, upper)};
+            } else {
+                auto imm = ir.Imm32(0xFFFF);
+                v = {ir.UMin(v.first, imm), ir.UMin(v.second, imm)};
+            }
+        }
+    }
+
+    IR::U32 value_raw{};
+    if constexpr (std::is_same_v) {
+        value_raw =
+            ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(v.first, v.second));
+    } else {
+        value_raw = ir.Pack2x16(AmdGpu::NumberFormat::Uint,
+                                ir.CompositeConstruct(ir.BitCast(v.first),
+                                                      ir.BitCast(v.second)));
+    }
+    SetDst(operand, value_raw);
+}
+
+template void Translator::SetDstPk(const InstOperand& operand,
+                                   const pk_type& value);
+template void Translator::SetDstPk(const InstOperand& operand,
+                                   const pk_type& value);
+template void Translator::SetDstPk(const InstOperand& operand,
+                                   const pk_type& value);
+
 void Translator::EmitFetch(const GcnInst& inst) {
     const auto code_sgpr_base = inst.src[0].code;
 
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index e84598287..1eea047ef 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -292,6 +292,16 @@ public:
     void V_OR3_B32(const GcnInst& inst);
 
     // VOP3P
+    void V_PK_MUL_LO_U16(const GcnInst& inst);
+    void V_PK_ADD_I16(const GcnInst& inst);
+    void V_PK_SUB_I16(const GcnInst& inst);
+    void V_PK_LSHLREV_B16(const GcnInst& inst);
+    void V_PK_LSHRREV_B16(const GcnInst& inst);
+    void V_PK_MAD_U16(const GcnInst& inst);
+    void V_PK_ADD_U16(const GcnInst& inst);
+    void V_PK_SUB_U16(const GcnInst& inst);
+    void V_PK_MAX_U16(const GcnInst& inst);
+    void V_PK_MIN_U16(const GcnInst& inst);
     void V_PK_FMA_F16(const GcnInst& inst);
     void V_PK_ADD_F16(const GcnInst& inst);
     void V_PK_MUL_F16(const GcnInst& inst);
@@ -345,13 +355,15 @@ private:
     template
     [[nodiscard]] T GetSrc64(const InstOperand& operand);
     [[nodiscard]] IR::F32 GetSrcMix(const InstOperand& operand);
-    template
+    template
     [[nodiscard]] pk_type GetSrcPk(const InstOperand& operand);
     void SetDst1(const InstOperand& operand, const IR::U1& value);
     void SetDst(const InstOperand& operand, const IR::U32F32& value);
     template
     void SetDst16(const InstOperand& operand, const IR::U32F32& value);
     void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw);
+    template
+    void SetDstPk(const InstOperand& operand, const pk_type& value);
 
     // Vector ALU Helpers
     IR::U32 GetCarryIn(const GcnInst& inst);
diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp
index fb199e5d5..6227a1814 100644
--- a/src/shader_recompiler/frontend/translate/vector_alu.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp
@@ -484,6 +484,26 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
         return;
 
         // VOP3P
+    case Opcode::V_PK_MUL_LO_U16:
+        return V_PK_MUL_LO_U16(inst);
+    case Opcode::V_PK_ADD_I16:
+        return V_PK_ADD_I16(inst);
+    case Opcode::V_PK_SUB_I16:
+        return V_PK_SUB_I16(inst);
+    case Opcode::V_PK_LSHRREV_B16:
+        return V_PK_LSHRREV_B16(inst);
+    case Opcode::V_PK_LSHLREV_B16:
+        return V_PK_LSHLREV_B16(inst);
+    case Opcode::V_PK_MAD_U16:
+        return V_PK_MAD_U16(inst);
+    case Opcode::V_PK_ADD_U16:
+        return V_PK_ADD_U16(inst);
+    case Opcode::V_PK_SUB_U16:
+        return V_PK_SUB_U16(inst);
+    case Opcode::V_PK_MAX_U16:
+        return V_PK_MAX_U16(inst);
+    case Opcode::V_PK_MIN_U16:
+        return V_PK_MIN_U16(inst);
     case Opcode::V_PK_FMA_F16:
         return V_PK_FMA_F16(inst);
     case Opcode::V_PK_ADD_F16:
@@ -1680,6 +1700,107 @@ void Translator::V_ADD3_U32(const GcnInst& inst) {
     SetDst(inst.dst[0], ir.IAdd(src0, ir.IAdd(src1, src2)));
 }
 
+void Translator::V_PK_MUL_LO_U16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.IMul(src0.first, src1.first);
+    const auto result_hi = ir.IMul(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_ADD_I16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.IAdd(src0.first, src1.first);
+    const auto result_hi = ir.IAdd(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_SUB_I16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.ISub(src0.first, src1.first);
+    const auto result_hi = ir.ISub(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_LSHLREV_B16(const GcnInst& inst) {
+    const auto shift = GetSrcPk(inst.src[0]);
+    const auto src = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.ShiftLeftLogical(src.first, shift.first);
+    const auto result_hi = ir.ShiftLeftLogical(src.second, shift.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_LSHRREV_B16(const GcnInst& inst) {
+    const auto shift = GetSrcPk(inst.src[0]);
+    const auto src = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.ShiftRightLogical(src.first, shift.first);
+    const auto result_hi = ir.ShiftRightLogical(src.second, shift.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_MAD_U16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+    const auto src2 = GetSrcPk(inst.src[2]);
+
+    const auto result_lo = ir.IAdd(ir.IMul(src0.first, src1.first), src2.first);
+    const auto result_hi = ir.IAdd(ir.IMul(src0.second, src1.second), src2.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_ADD_U16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.IAdd(src0.first, src1.first);
+    const auto result_hi = ir.IAdd(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_SUB_U16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.ISub(src0.first, src1.first);
+    const auto result_hi = ir.ISub(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_MAX_U16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.UMax(src0.first, src1.first);
+    const auto result_hi = ir.UMax(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
+void Translator::V_PK_MIN_U16(const GcnInst& inst) {
+    const auto src0 = GetSrcPk(inst.src[0]);
+    const auto src1 = GetSrcPk(inst.src[1]);
+
+    const auto result_lo = ir.UMin(src0.first, src1.first);
+    const auto result_hi = ir.UMin(src0.second, src1.second);
+
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
+}
+
 void Translator::V_PK_FMA_F16(const GcnInst& inst) {
     const auto src0 = GetSrcPk(inst.src[0]);
     const auto src1 = GetSrcPk(inst.src[1]);
@@ -1688,8 +1809,7 @@ void Translator::V_PK_FMA_F16(const GcnInst& inst) {
     const auto result_lo = ir.FPFma(src0.first, src1.first, src2.first);
     const auto result_hi = ir.FPFma(src0.second, src1.second, src2.second);
 
-    SetDst(inst.dst[0],
-           ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
 }
 
 void Translator::V_PK_ADD_F16(const GcnInst& inst) {
@@ -1699,8 +1819,7 @@ void Translator::V_PK_ADD_F16(const GcnInst& inst) {
     const auto result_lo = ir.FPAdd(src0.first, src1.first);
     const auto result_hi = ir.FPAdd(src0.second, src1.second);
 
-    SetDst(inst.dst[0],
-           ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
 }
 
 void Translator::V_PK_MUL_F16(const GcnInst& inst) {
@@ -1710,8 +1829,7 @@ void Translator::V_PK_MUL_F16(const GcnInst& inst) {
     const auto result_lo = ir.FPMul(src0.first, src1.first);
     const auto result_hi = ir.FPMul(src0.second, src1.second);
 
-    SetDst(inst.dst[0],
-           ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
 }
 
 void Translator::V_PK_MIN_F16(const GcnInst& inst) {
@@ -1721,8 +1839,7 @@ void Translator::V_PK_MIN_F16(const GcnInst& inst) {
     const auto result_lo = ir.FPMin(src0.first, src1.first);
     const auto result_hi = ir.FPMin(src0.second, src1.second);
 
-    SetDst(inst.dst[0],
-           ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
 }
 
 void Translator::V_PK_MAX_F16(const GcnInst& inst) {
@@ -1732,8 +1849,7 @@ void Translator::V_PK_MAX_F16(const GcnInst& inst) {
     const auto result_lo = ir.FPMax(src0.first, src1.first);
     const auto result_hi = ir.FPMax(src0.second, src1.second);
 
-    SetDst(inst.dst[0],
-           ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
+    SetDstPk(inst.dst[0], {result_lo, result_hi});
 }
 
 void Translator::V_LSHL_OR_B32(const GcnInst& inst) {