mirror of
https://github.com/shadps4-emu/shadPS4.git
synced 2026-05-26 13:50:37 +00:00
Neo: packed math for unsigned/signed integers (#4407)
This commit is contained in:
committed by
GitHub
parent
fb6502b8f1
commit
0add6b8c1f
@@ -766,7 +766,7 @@ T Translator::GetSrc64(const InstOperand& operand) {
|
||||
template IR::U64 Translator::GetSrc64<IR::U64>(const InstOperand&);
|
||||
template IR::F64 Translator::GetSrc64<IR::F64>(const InstOperand&);
|
||||
|
||||
template <typename T>
|
||||
template <typename T, bool is_signed>
|
||||
pk_type<T> Translator::GetSrcPk(const InstOperand& operand) {
|
||||
constexpr bool is_float = std::is_same_v<T, IR::F32>;
|
||||
|
||||
@@ -794,7 +794,8 @@ pk_type<T> Translator::GetSrcPk(const InstOperand& operand) {
|
||||
if constexpr (is_float) {
|
||||
return value;
|
||||
} else {
|
||||
return ir.BitCast<IR::U32>(value);
|
||||
return ir.BitFieldExtract(ir.BitCast<IR::U32>(value), ir.Imm32(0), ir.Imm32(16),
|
||||
is_signed);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -891,8 +892,9 @@ pk_type<T> Translator::GetSrcPk(const InstOperand& operand) {
|
||||
return value;
|
||||
}
|
||||
|
||||
template pk_type<IR::U32> Translator::GetSrcPk<IR::U32>(const InstOperand&);
|
||||
template pk_type<IR::F32> Translator::GetSrcPk<IR::F32>(const InstOperand&);
|
||||
template pk_type<IR::U32> Translator::GetSrcPk<IR::U32, true>(const InstOperand&);
|
||||
template pk_type<IR::U32> Translator::GetSrcPk<IR::U32, false>(const InstOperand&);
|
||||
template pk_type<IR::F32> Translator::GetSrcPk<IR::F32, false>(const InstOperand&);
|
||||
|
||||
void Translator::SetDst1(const InstOperand& operand, const IR::U1& value) {
|
||||
switch (operand.field) {
|
||||
@@ -1035,6 +1037,46 @@ void Translator::SetDst64(const InstOperand& operand, const IR::U64F64& value_ra
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, bool is_signed>
|
||||
void Translator::SetDstPk(const InstOperand& operand, const pk_type<T>& value) {
|
||||
pk_type<T> v = value;
|
||||
|
||||
if constexpr (std::is_same_v<T, IR::F32>) {
|
||||
if (operand.output_modifier.clamp) {
|
||||
v = {ir.FPSaturate(v.first), ir.FPSaturate(v.second)};
|
||||
}
|
||||
} else {
|
||||
if (operand.output_modifier.clamp) {
|
||||
if constexpr (is_signed) {
|
||||
auto lower = ir.Imm32(-32768);
|
||||
auto upper = ir.Imm32(32767);
|
||||
v = {ir.SClamp(v.first, lower, upper), ir.SClamp(v.second, lower, upper)};
|
||||
} else {
|
||||
auto imm = ir.Imm32(0xFFFF);
|
||||
v = {ir.UMin(v.first, imm), ir.UMin(v.second, imm)};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
IR::U32 value_raw{};
|
||||
if constexpr (std::is_same_v<T, IR::F32>) {
|
||||
value_raw =
|
||||
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(v.first, v.second));
|
||||
} else {
|
||||
value_raw = ir.Pack2x16(AmdGpu::NumberFormat::Uint,
|
||||
ir.CompositeConstruct(ir.BitCast<IR::F32, IR::U32>(v.first),
|
||||
ir.BitCast<IR::F32, IR::U32>(v.second)));
|
||||
}
|
||||
SetDst(operand, value_raw);
|
||||
}
|
||||
|
||||
template void Translator::SetDstPk<IR::U32, false>(const InstOperand& operand,
|
||||
const pk_type<IR::U32>& value);
|
||||
template void Translator::SetDstPk<IR::U32, true>(const InstOperand& operand,
|
||||
const pk_type<IR::U32>& value);
|
||||
template void Translator::SetDstPk<IR::F32, false>(const InstOperand& operand,
|
||||
const pk_type<IR::F32>& value);
|
||||
|
||||
void Translator::EmitFetch(const GcnInst& inst) {
|
||||
const auto code_sgpr_base = inst.src[0].code;
|
||||
|
||||
|
||||
@@ -292,6 +292,16 @@ public:
|
||||
void V_OR3_B32(const GcnInst& inst);
|
||||
|
||||
// VOP3P
|
||||
void V_PK_MUL_LO_U16(const GcnInst& inst);
|
||||
void V_PK_ADD_I16(const GcnInst& inst);
|
||||
void V_PK_SUB_I16(const GcnInst& inst);
|
||||
void V_PK_LSHLREV_B16(const GcnInst& inst);
|
||||
void V_PK_LSHRREV_B16(const GcnInst& inst);
|
||||
void V_PK_MAD_U16(const GcnInst& inst);
|
||||
void V_PK_ADD_U16(const GcnInst& inst);
|
||||
void V_PK_SUB_U16(const GcnInst& inst);
|
||||
void V_PK_MAX_U16(const GcnInst& inst);
|
||||
void V_PK_MIN_U16(const GcnInst& inst);
|
||||
void V_PK_FMA_F16(const GcnInst& inst);
|
||||
void V_PK_ADD_F16(const GcnInst& inst);
|
||||
void V_PK_MUL_F16(const GcnInst& inst);
|
||||
@@ -345,13 +355,15 @@ private:
|
||||
template <typename T = IR::U64>
|
||||
[[nodiscard]] T GetSrc64(const InstOperand& operand);
|
||||
[[nodiscard]] IR::F32 GetSrcMix(const InstOperand& operand);
|
||||
template <typename T = IR::U32>
|
||||
template <typename T = IR::U32, bool is_signed = false>
|
||||
[[nodiscard]] pk_type<T> GetSrcPk(const InstOperand& operand);
|
||||
void SetDst1(const InstOperand& operand, const IR::U1& value);
|
||||
void SetDst(const InstOperand& operand, const IR::U32F32& value);
|
||||
template <bool is_signed = false>
|
||||
void SetDst16(const InstOperand& operand, const IR::U32F32& value);
|
||||
void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw);
|
||||
template <typename T = IR::U32, bool is_signed = false>
|
||||
void SetDstPk(const InstOperand& operand, const pk_type<T>& value);
|
||||
|
||||
// Vector ALU Helpers
|
||||
IR::U32 GetCarryIn(const GcnInst& inst);
|
||||
|
||||
@@ -484,6 +484,26 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
|
||||
return;
|
||||
|
||||
// VOP3P
|
||||
case Opcode::V_PK_MUL_LO_U16:
|
||||
return V_PK_MUL_LO_U16(inst);
|
||||
case Opcode::V_PK_ADD_I16:
|
||||
return V_PK_ADD_I16(inst);
|
||||
case Opcode::V_PK_SUB_I16:
|
||||
return V_PK_SUB_I16(inst);
|
||||
case Opcode::V_PK_LSHRREV_B16:
|
||||
return V_PK_LSHRREV_B16(inst);
|
||||
case Opcode::V_PK_LSHLREV_B16:
|
||||
return V_PK_LSHLREV_B16(inst);
|
||||
case Opcode::V_PK_MAD_U16:
|
||||
return V_PK_MAD_U16(inst);
|
||||
case Opcode::V_PK_ADD_U16:
|
||||
return V_PK_ADD_U16(inst);
|
||||
case Opcode::V_PK_SUB_U16:
|
||||
return V_PK_SUB_U16(inst);
|
||||
case Opcode::V_PK_MAX_U16:
|
||||
return V_PK_MAX_U16(inst);
|
||||
case Opcode::V_PK_MIN_U16:
|
||||
return V_PK_MIN_U16(inst);
|
||||
case Opcode::V_PK_FMA_F16:
|
||||
return V_PK_FMA_F16(inst);
|
||||
case Opcode::V_PK_ADD_F16:
|
||||
@@ -1680,6 +1700,107 @@ void Translator::V_ADD3_U32(const GcnInst& inst) {
|
||||
SetDst(inst.dst[0], ir.IAdd(src0, ir.IAdd(src1, src2)));
|
||||
}
|
||||
|
||||
void Translator::V_PK_MUL_LO_U16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.IAdd(src0.first, src1.first);
|
||||
const auto result_hi = ir.IAdd(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_ADD_I16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32, true>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32, true>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.IAdd(src0.first, src1.first);
|
||||
const auto result_hi = ir.IAdd(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, true>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_SUB_I16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32, true>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32, true>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.ISub(src0.first, src1.first);
|
||||
const auto result_hi = ir.ISub(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, true>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_LSHLREV_B16(const GcnInst& inst) {
|
||||
const auto shift = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.ShiftLeftLogical(src.first, shift.first);
|
||||
const auto result_hi = ir.ShiftLeftLogical(src.second, shift.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_LSHRREV_B16(const GcnInst& inst) {
|
||||
const auto shift = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.ShiftRightLogical(src.first, shift.first);
|
||||
const auto result_hi = ir.ShiftRightLogical(src.second, shift.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_MAD_U16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
const auto src2 = GetSrcPk<IR::U32>(inst.src[2]);
|
||||
|
||||
const auto result_lo = ir.IAdd(ir.IMul(src0.first, src1.first), src2.first);
|
||||
const auto result_hi = ir.IAdd(ir.IMul(src0.second, src1.second), src2.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_ADD_U16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.IAdd(src0.first, src1.first);
|
||||
const auto result_hi = ir.IAdd(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_SUB_U16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.ISub(src0.first, src1.first);
|
||||
const auto result_hi = ir.ISub(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_MAX_U16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.UMax(src0.first, src1.first);
|
||||
const auto result_hi = ir.UMax(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_MIN_U16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::U32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::U32>(inst.src[1]);
|
||||
|
||||
const auto result_lo = ir.UMin(src0.first, src1.first);
|
||||
const auto result_hi = ir.UMin(src0.second, src1.second);
|
||||
|
||||
SetDstPk<IR::U32, false>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_FMA_F16(const GcnInst& inst) {
|
||||
const auto src0 = GetSrcPk<IR::F32>(inst.src[0]);
|
||||
const auto src1 = GetSrcPk<IR::F32>(inst.src[1]);
|
||||
@@ -1688,8 +1809,7 @@ void Translator::V_PK_FMA_F16(const GcnInst& inst) {
|
||||
const auto result_lo = ir.FPFma(src0.first, src1.first, src2.first);
|
||||
const auto result_hi = ir.FPFma(src0.second, src1.second, src2.second);
|
||||
|
||||
SetDst(inst.dst[0],
|
||||
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
|
||||
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_ADD_F16(const GcnInst& inst) {
|
||||
@@ -1699,8 +1819,7 @@ void Translator::V_PK_ADD_F16(const GcnInst& inst) {
|
||||
const auto result_lo = ir.FPAdd(src0.first, src1.first);
|
||||
const auto result_hi = ir.FPAdd(src0.second, src1.second);
|
||||
|
||||
SetDst(inst.dst[0],
|
||||
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
|
||||
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_MUL_F16(const GcnInst& inst) {
|
||||
@@ -1710,8 +1829,7 @@ void Translator::V_PK_MUL_F16(const GcnInst& inst) {
|
||||
const auto result_lo = ir.FPMul(src0.first, src1.first);
|
||||
const auto result_hi = ir.FPMul(src0.second, src1.second);
|
||||
|
||||
SetDst(inst.dst[0],
|
||||
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
|
||||
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_MIN_F16(const GcnInst& inst) {
|
||||
@@ -1721,8 +1839,7 @@ void Translator::V_PK_MIN_F16(const GcnInst& inst) {
|
||||
const auto result_lo = ir.FPMin(src0.first, src1.first);
|
||||
const auto result_hi = ir.FPMin(src0.second, src1.second);
|
||||
|
||||
SetDst(inst.dst[0],
|
||||
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
|
||||
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_PK_MAX_F16(const GcnInst& inst) {
|
||||
@@ -1732,8 +1849,7 @@ void Translator::V_PK_MAX_F16(const GcnInst& inst) {
|
||||
const auto result_lo = ir.FPMax(src0.first, src1.first);
|
||||
const auto result_hi = ir.FPMax(src0.second, src1.second);
|
||||
|
||||
SetDst(inst.dst[0],
|
||||
ir.Pack2x16(AmdGpu::NumberFormat::Float, ir.CompositeConstruct(result_lo, result_hi)));
|
||||
SetDstPk<IR::F32>(inst.dst[0], {result_lo, result_hi});
|
||||
}
|
||||
|
||||
void Translator::V_LSHL_OR_B32(const GcnInst& inst) {
|
||||
|
||||
Reference in New Issue
Block a user