Neo: packed math for unsigned/signed integers (#4407)

This commit is contained in:
Marcin Mikołajczyk
2026-05-14 05:33:50 +02:00
committed by GitHub
parent fb6502b8f1
commit 0add6b8c1f
3 changed files with 185 additions and 15 deletions
@@ -766,7 +766,7 @@ T Translator::GetSrc64(const InstOperand& operand) {
template IR::U64 Translator::GetSrc64<IR::U64>(const InstOperand&);
template IR::F64 Translator::GetSrc64<IR::F64>(const InstOperand&);
template <typename T>
template <typename T, bool is_signed>
pk_type<T> Translator::GetSrcPk(const InstOperand& operand) {
constexpr bool is_float = std::is_same_v<T, IR::F32>;
@@ -794,7 +794,8 @@ pk_type<T> Translator::GetSrcPk(const InstOperand& operand) {
if constexpr (is_float) {
return value;
} else {
return ir.BitCast<IR::U32>(value);
return ir.BitFieldExtract(ir.BitCast<IR::U32>(value), ir.Imm32(0), ir.Imm32(16),
is_signed);
}
};
@@ -891,8 +892,9 @@ pk_type<T> Translator::GetSrcPk(const InstOperand& operand) {
return value;
}
template pk_type<IR::U32> Translator::GetSrcPk<IR::U32>(const InstOperand&);
template pk_type<IR::F32> Translator::GetSrcPk<IR::F32>(const InstOperand&);
template pk_type<IR::U32> Translator::GetSrcPk<IR::U32, true>(const InstOperand&);
template pk_type<IR::U32> Translator::GetSrcPk<IR::U32, false>(const InstOperand&);
template pk_type<IR::F32> Translator::GetSrcPk<IR::F32, false>(const InstOperand&);
void Translator::SetDst1(const InstOperand& operand, const IR::U1& value) {
switch (operand.field) {
@@ -1035,6 +1037,46 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
}
}
template <typename T, bool is_signed>
void Translator::SetDstPk(const InstOperand& operand, const pk_type<T>& value) {
pk_type<T> v = value;
if constexpr (std::is_same_v<T, IR::F32>) {
if (operand.output_modifier.clamp) {
v = {ir.FPSaturate(v.first), ir.FPSaturate(v.second)};
}
} else {
if (operand.output_modifier.clamp) {
if constexpr (is_signed) {
auto lower = ir.Imm32(-32768);
auto upper = ir.Imm32(32767);
v = {ir.SClamp(v.first, lower, upper), ir.SClamp(v.second, lower, upper)};
} else {
auto imm = ir.Imm32(0xFFFF);
v = {ir.UMin(v.first, imm), ir.UMin(v.second, imm)};
}
}
}
IR::U32 value_raw{};
if constexpr (std::is_same_v<T, IR::F32>) {
value_raw =
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(v.first, v.second));
} else {
value_raw = ir.Pack2x16(AmdGpu::NumberFormat::Uint,
ir.CompositeConstruct(ir.BitCast<IR::F32, IR::U32>(v.first),
ir.BitCast<IR::F32, IR::U32>(v.second)));
}
SetDst(operand, value_raw);
}
// Explicit instantiations of SetDstPk: unsigned integer, signed integer and float lanes.
template void Translator::SetDstPk<IR::U32, false>(const InstOperand&, const pk_type<IR::U32>&);
template void Translator::SetDstPk<IR::U32, true>(const InstOperand&, const pk_type<IR::U32>&);
template void Translator::SetDstPk<IR::F32, false>(const InstOperand&, const pk_type<IR::F32>&);
void Translator::EmitFetch(const GcnInst& inst) {
const auto code_sgpr_base = inst.src[0].code;
@@ -292,6 +292,16 @@ public:
void V_OR3_B32(const GcnInst& inst);
// VOP3P
// Packed 16-bit integer handlers; EmitVectorAlu routes the opcode of the same name to each.
void V_PK_MUL_LO_U16(const GcnInst& inst);
void V_PK_ADD_I16(const GcnInst& inst);
void V_PK_SUB_I16(const GcnInst& inst);
void V_PK_LSHLREV_B16(const GcnInst& inst);
void V_PK_LSHRREV_B16(const GcnInst& inst);
void V_PK_MAD_U16(const GcnInst& inst);
void V_PK_ADD_U16(const GcnInst& inst);
void V_PK_SUB_U16(const GcnInst& inst);
void V_PK_MAX_U16(const GcnInst& inst);
void V_PK_MIN_U16(const GcnInst& inst);
void V_PK_FMA_F16(const GcnInst& inst);
void V_PK_ADD_F16(const GcnInst& inst);
void V_PK_MUL_F16(const GcnInst& inst);
@@ -345,13 +355,15 @@ private:
template <typename T = IR::U64>
[[nodiscard]] T GetSrc64(const InstOperand& operand);
[[nodiscard]] IR::F32 GetSrcMix(const InstOperand& operand);
template <typename T = IR::U32>
template <typename T = IR::U32, bool is_signed = false>
[[nodiscard]] pk_type<T> GetSrcPk(const InstOperand& operand);
void SetDst1(const InstOperand& operand, const IR::U1& value);
void SetDst(const InstOperand& operand, const IR::U32F32& value);
template <bool is_signed = false>
void SetDst16(const InstOperand& operand, const IR::U32F32& value);
void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw);
// Writes a pair of lanes to `operand` as one packed 32-bit value. If the operand's clamp
// output modifier is set, each lane is saturated first (float saturate for F32; signed or
// unsigned 16-bit range for U32, selected by `is_signed`).
template <typename T = IR::U32, bool is_signed = false>
void SetDstPk(const InstOperand& operand, const pk_type<T>& value);
// Vector ALU Helpers
IR::U32 GetCarryIn(const GcnInst& inst);
@@ -484,6 +484,26 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
return;
// VOP3P
case Opcode::V_PK_MUL_LO_U16:
return V_PK_MUL_LO_U16(inst);
case Opcode::V_PK_ADD_I16:
return V_PK_ADD_I16(inst);
case Opcode::V_PK_SUB_I16:
return V_PK_SUB_I16(inst);
case Opcode::V_PK_LSHRREV_B16:
return V_PK_LSHRREV_B16(inst);
case Opcode::V_PK_LSHLREV_B16:
return V_PK_LSHLREV_B16(inst);
case Opcode::V_PK_MAD_U16:
return V_PK_MAD_U16(inst);
case Opcode::V_PK_ADD_U16:
return V_PK_ADD_U16(inst);
case Opcode::V_PK_SUB_U16:
return V_PK_SUB_U16(inst);
case Opcode::V_PK_MAX_U16:
return V_PK_MAX_U16(inst);
case Opcode::V_PK_MIN_U16:
return V_PK_MIN_U16(inst);
case Opcode::V_PK_FMA_F16:
return V_PK_FMA_F16(inst);
case Opcode::V_PK_ADD_F16:
@@ -1680,6 +1700,107 @@ void Translator::V_ADD3_U32(const GcnInst& inst) {
SetDst(inst.dst[0], ir.IAdd(src0, ir.IAdd(src1, src2)));
}
// V_PK_MUL_LO_U16: multiplies the unsigned 16-bit lanes of src0 and src1 pairwise and
// writes the packed pair to dst via SetDstPk.
void Translator::V_PK_MUL_LO_U16(const GcnInst& inst) {
    const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
    const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
    // This must be a multiply; emitting IAdd here made the opcode behave like V_PK_ADD_U16.
    const auto result_lo = ir.IMul(src0.first, src1.first);
    const auto result_hi = ir.IMul(src0.second, src1.second);
    SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_PK_ADD_I16(const GcnInst& inst) {
const auto src0 = GetSrcPk<IR::U32, true>(inst.src[0]);
const auto src1 = GetSrcPk<IR::U32, true>(inst.src[1]);
const auto result_lo = ir.IAdd(src0.first, src1.first);
const auto result_hi = ir.IAdd(src0.second, src1.second);
SetDstPk<IR::U32, true>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_PK_SUB_I16(const GcnInst& inst) {
const auto src0 = GetSrcPk<IR::U32, true>(inst.src[0]);
const auto src1 = GetSrcPk<IR::U32, true>(inst.src[1]);
const auto result_lo = ir.ISub(src0.first, src1.first);
const auto result_hi = ir.ISub(src0.second, src1.second);
SetDstPk<IR::U32, true>(inst.dst[0], {result_lo, result_hi});
}
// V_PK_LSHLREV_B16: shifts each 16-bit lane of src1 left by the count held in the matching
// lane of src0 ("REV": the shift count is the first operand).
void Translator::V_PK_LSHLREV_B16(const GcnInst& inst) {
    const auto shift = GetSrcPk<IR::U32>(inst.src[0]);
    const auto src = GetSrcPk<IR::U32>(inst.src[1]);
    // Only the low 4 bits of each count are significant for a 16-bit shift. Masking also keeps
    // the count below the 32-bit width of the IR shift, where larger counts are undefined.
    const auto result_lo =
        ir.ShiftLeftLogical(src.first, ir.BitwiseAnd(shift.first, ir.Imm32(0xF)));
    const auto result_hi =
        ir.ShiftLeftLogical(src.second, ir.BitwiseAnd(shift.second, ir.Imm32(0xF)));
    SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
}
// V_PK_LSHRREV_B16: logically shifts each 16-bit lane of src1 right by the count held in the
// matching lane of src0 ("REV": the shift count is the first operand).
void Translator::V_PK_LSHRREV_B16(const GcnInst& inst) {
    const auto shift = GetSrcPk<IR::U32>(inst.src[0]);
    const auto src = GetSrcPk<IR::U32>(inst.src[1]);
    // Only the low 4 bits of each count are significant for a 16-bit shift. Masking also keeps
    // the count below the 32-bit width of the IR shift, where larger counts are undefined.
    const auto result_lo =
        ir.ShiftRightLogical(src.first, ir.BitwiseAnd(shift.first, ir.Imm32(0xF)));
    const auto result_hi =
        ir.ShiftRightLogical(src.second, ir.BitwiseAnd(shift.second, ir.Imm32(0xF)));
    SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
}
// V_PK_MAD_U16: per lane, computes src0 * src1 + src2 on the unsigned 16-bit lanes and
// stores the packed pair through the unsigned SetDstPk path.
void Translator::V_PK_MAD_U16(const GcnInst& inst) {
    const auto [a_lo, a_hi] = GetSrcPk<IR::U32>(inst.src[0]);
    const auto [b_lo, b_hi] = GetSrcPk<IR::U32>(inst.src[1]);
    const auto [c_lo, c_hi] = GetSrcPk<IR::U32>(inst.src[2]);
    const auto mad_lo = ir.IAdd(ir.IMul(a_lo, b_lo), c_lo);
    const auto mad_hi = ir.IAdd(ir.IMul(a_hi, b_hi), c_hi);
    SetDstPk<IR::U32, false>(inst.dst[0], {mad_lo, mad_hi});
}
// V_PK_ADD_U16: adds the unsigned 16-bit lanes of src0 and src1 pairwise and stores the
// packed pair through the unsigned SetDstPk path.
void Translator::V_PK_ADD_U16(const GcnInst& inst) {
    const auto [a_lo, a_hi] = GetSrcPk<IR::U32>(inst.src[0]);
    const auto [b_lo, b_hi] = GetSrcPk<IR::U32>(inst.src[1]);
    SetDstPk<IR::U32, false>(inst.dst[0], {ir.IAdd(a_lo, b_lo), ir.IAdd(a_hi, b_hi)});
}
// V_PK_SUB_U16: subtracts the unsigned 16-bit lanes of src1 from those of src0 pairwise and
// stores the packed pair through the unsigned SetDstPk path.
void Translator::V_PK_SUB_U16(const GcnInst& inst) {
    const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
    auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
    if (inst.dst[0].output_modifier.clamp) {
        // Saturating subtract: an underflow must clamp to 0. A plain 32-bit ISub would wrap to
        // 0xFFFFxxxx, which SetDstPk's UMin(…, 0xFFFF) then turns into 0xFFFF instead.
        // Limiting the subtrahend to the minuend gives a - min(a, b) == max(a - b, 0).
        src1 = {ir.UMin(src0.first, src1.first), ir.UMin(src0.second, src1.second)};
    }
    const auto result_lo = ir.ISub(src0.first, src1.first);
    const auto result_hi = ir.ISub(src0.second, src1.second);
    SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
}
// V_PK_MAX_U16: takes the unsigned maximum of the 16-bit lanes of src0 and src1 pairwise and
// stores the packed pair through the unsigned SetDstPk path.
void Translator::V_PK_MAX_U16(const GcnInst& inst) {
    const auto lhs = GetSrcPk<IR::U32>(inst.src[0]);
    const auto rhs = GetSrcPk<IR::U32>(inst.src[1]);
    SetDstPk<IR::U32, false>(inst.dst[0],
                             {ir.UMax(lhs.first, rhs.first), ir.UMax(lhs.second, rhs.second)});
}
// V_PK_MIN_U16: takes the unsigned minimum of the 16-bit lanes of src0 and src1 pairwise and
// stores the packed pair through the unsigned SetDstPk path.
void Translator::V_PK_MIN_U16(const GcnInst& inst) {
    const auto lhs = GetSrcPk<IR::U32>(inst.src[0]);
    const auto rhs = GetSrcPk<IR::U32>(inst.src[1]);
    SetDstPk<IR::U32, false>(inst.dst[0],
                             {ir.UMin(lhs.first, rhs.first), ir.UMin(lhs.second, rhs.second)});
}
void Translator::V_PK_FMA_F16(const GcnInst& inst) {
const auto src0 = GetSrcPk<IR::F32>(inst.src[0]);
const auto src1 = GetSrcPk<IR::F32>(inst.src[1]);
@@ -1688,8 +1809,7 @@ void Translator::V_PK_FMA_F16(const GcnInst& inst) {
const auto result_lo = ir.FPFma(src0.first, src1.first, src2.first);
const auto result_hi = ir.FPFma(src0.second, src1.second, src2.second);
SetDst(inst.dst[0],
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_PK_ADD_F16(const GcnInst& inst) {
@@ -1699,8 +1819,7 @@ void Translator::V_PK_ADD_F16(const GcnInst& inst) {
const auto result_lo = ir.FPAdd(src0.first, src1.first);
const auto result_hi = ir.FPAdd(src0.second, src1.second);
SetDst(inst.dst[0],
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_PK_MUL_F16(const GcnInst& inst) {
@@ -1710,8 +1829,7 @@ void Translator::V_PK_MUL_F16(const GcnInst& inst) {
const auto result_lo = ir.FPMul(src0.first, src1.first);
const auto result_hi = ir.FPMul(src0.second, src1.second);
SetDst(inst.dst[0],
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_PK_MIN_F16(const GcnInst& inst) {
@@ -1721,8 +1839,7 @@ void Translator::V_PK_MIN_F16(const GcnInst& inst) {
const auto result_lo = ir.FPMin(src0.first, src1.first);
const auto result_hi = ir.FPMin(src0.second, src1.second);
SetDst(inst.dst[0],
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_PK_MAX_F16(const GcnInst& inst) {
@@ -1732,8 +1849,7 @@ void Translator::V_PK_MAX_F16(const GcnInst& inst) {
const auto result_lo = ir.FPMax(src0.first, src1.first);
const auto result_hi = ir.FPMax(src0.second, src1.second);
SetDst(inst.dst[0],
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
}
void Translator::V_LSHL_OR_B32(const GcnInst& inst) {