mirror of
https://github.com/scummvm/dockerized-bb.git
synced 2026-05-21 05:40:49 +00:00
MXE: Fix libtheora assembly
XMM6 and XMM7 are callee-saved in the Windows x64 ABI, so inline assembly that modifies them must declare them as clobbered.
This commit is contained in:
@@ -78,7 +78,7 @@ mxe_package(ogg)
|
||||
|
||||
mxe_package(vorbis)
|
||||
|
||||
mxe_package(theora)
|
||||
local_mxe_package(theora)
|
||||
|
||||
mxe_package(flac)
|
||||
|
||||
|
||||
@@ -0,0 +1,244 @@
|
||||
commit 305566da3d6e09deb014339d10eb0798be4791e8
|
||||
Author: Le Philousophe <lephilousophe@users.noreply.github.com>
|
||||
Date: Tue Oct 21 19:52:09 2025 +0200
|
||||
|
||||
Mark XMM registers as clobbered
|
||||
|
||||
diff --git a/lib/x86/sse2encfrag.c b/lib/x86/sse2encfrag.c
|
||||
index b7726c7..4da7d0c 100644
|
||||
--- a/lib/x86/sse2encfrag.c
|
||||
+++ b/lib/x86/sse2encfrag.c
|
||||
@@ -64,7 +64,7 @@
|
||||
"paddd %%xmm3,%%xmm2\n\t" \
|
||||
"paddd %%xmm2,%%xmm7\n\t" \
|
||||
|
||||
-unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
|
||||
+unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride){
|
||||
unsigned ret;
|
||||
__asm__ __volatile__(
|
||||
@@ -82,6 +82,8 @@ unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
|
||||
:[ret]"=a"(ret)
|
||||
:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
|
||||
[ystride3]"r"((ptrdiff_t)_ystride*3)
|
||||
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
);
|
||||
return ret;
|
||||
}
|
||||
@@ -92,7 +94,7 @@ static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
|
||||
|
||||
/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
|
||||
horizontal sums as well as their 16-bit differences subject to a mask.
|
||||
- %%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/
|
||||
+ %%xmm6 must contain OC_MASK_CONSTS[0...7] and %%xmm7 must contain 0.*/
|
||||
#define OC_LOAD_SUB_MASK_2x8 \
|
||||
"#OC_LOAD_SUB_MASK_2x8\n\t" \
|
||||
/*Start the loads and expand the next 8 bits of the mask.*/ \
|
||||
@@ -127,7 +129,7 @@ static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
|
||||
"psubw %%xmm2,%%xmm0\n\t" \
|
||||
"psubw %%xmm3,%%xmm1\n\t" \
|
||||
|
||||
-unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
+unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
|
||||
ptrdiff_t ystride;
|
||||
unsigned ret;
|
||||
@@ -138,6 +140,7 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
"movq %[c],%%xmm6\n\t"
|
||||
:
|
||||
:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
|
||||
+ :"%xmm6", "%xmm7"
|
||||
);
|
||||
for(i=0;i<4;i++){
|
||||
unsigned m;
|
||||
@@ -151,6 +154,9 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
"paddd %%xmm0,%%xmm7\n\t"
|
||||
"paddd %%xmm1,%%xmm7\n\t"
|
||||
:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
|
||||
+ :
|
||||
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", /*"%xmm5",*/ "%xmm6", "%xmm7"
|
||||
);
|
||||
}
|
||||
_src+=2*ystride;
|
||||
@@ -164,6 +170,8 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
"paddd %%xmm6,%%xmm7\n\t"
|
||||
"movd %%xmm7,%[ret]\n\t"
|
||||
:[ret]"=a"(ret)
|
||||
+ :
|
||||
+ :"%xmm6", "%xmm7"
|
||||
);
|
||||
return ret;
|
||||
}
|
||||
@@ -381,7 +389,7 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
OC_HADAMARD_AB_8x8 \
|
||||
OC_HADAMARD_C_ABS_ACCUM_8x8
|
||||
|
||||
-static unsigned oc_int_frag_satd_sse2(int *_dc,
|
||||
+static __attribute__((target("sse2"))) unsigned oc_int_frag_satd_sse2(int *_dc,
|
||||
const unsigned char *_src,int _src_ystride,
|
||||
const unsigned char *_ref,int _ref_ystride){
|
||||
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||
@@ -434,25 +442,27 @@ static unsigned oc_int_frag_satd_sse2(int *_dc,
|
||||
[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
|
||||
/*We have to use neg, so we actually clobber the condition codes for once
|
||||
(not to mention sub, and add).*/
|
||||
- :"cc"
|
||||
+ :"cc",
|
||||
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
);
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
-unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
|
||||
+unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride){
|
||||
return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
|
||||
}
|
||||
|
||||
-unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
|
||||
+unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||
OC_ALIGN8(unsigned char ref[64]);
|
||||
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||
return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
|
||||
}
|
||||
|
||||
-unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||
+unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||
const unsigned char *_src,int _ystride){
|
||||
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||
unsigned ret;
|
||||
@@ -491,7 +501,9 @@ unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
|
||||
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
|
||||
/*We have to use sub, so we actually clobber the condition codes for once.*/
|
||||
- :"cc"
|
||||
+ :"cc",
|
||||
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
);
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
diff --git a/lib/x86/sse2fdct.c b/lib/x86/sse2fdct.c
|
||||
index 7787cb9..e2d0a0d 100644
|
||||
--- a/lib/x86/sse2fdct.c
|
||||
+++ b/lib/x86/sse2fdct.c
|
||||
@@ -354,7 +354,7 @@
|
||||
/*SSE2 implementation of the fDCT for x86-64 only.
|
||||
Because of the 8 extra XMM registers on x86-64, this version can operate
|
||||
without any temporary stack access at all.*/
|
||||
-void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
+void __attribute__((target("sse2"))) oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
ptrdiff_t a;
|
||||
__asm__ __volatile__(
|
||||
/*Load the input.*/
|
||||
@@ -446,7 +446,11 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
#undef OC_ZZ_LOAD_ROW_HI
|
||||
:[a]"=&r"(a)
|
||||
:[y]"r"(_y),[x]"r"(_x)
|
||||
- :"memory"
|
||||
+ :"memory",
|
||||
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
||||
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
||||
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
diff --git a/lib/x86/sse2idct.c b/lib/x86/sse2idct.c
|
||||
index 1f71e3a..cf8c1e4 100644
|
||||
--- a/lib/x86/sse2idct.c
|
||||
+++ b/lib/x86/sse2idct.c
|
||||
@@ -206,7 +206,7 @@ const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
|
||||
"psraw $4,%%xmm7\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
|
||||
|
||||
-static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
+static __attribute__((target("sse2"))) void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||
int i;
|
||||
/*This routine accepts an 8x8 matrix pre-transposed.*/
|
||||
@@ -230,8 +230,10 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
|
||||
:[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
|
||||
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
|
||||
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
);
|
||||
- __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
|
||||
+ __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t":::"%xmm0");
|
||||
/*Clear input data for next block (decoder only).*/
|
||||
for(i=0;i<2;i++){
|
||||
__asm__ __volatile__(
|
||||
@@ -240,6 +242,8 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||
:[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
|
||||
+ :
|
||||
+ :"%xmm0"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -392,7 +396,7 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
"psubw %%xmm7,%%xmm4\n\t" \
|
||||
"psubw %%xmm6,%%xmm5\n\t" \
|
||||
|
||||
-static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
+static __attribute__((target("sse2"))) void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||
/*This routine accepts an 8x8 matrix pre-transposed.*/
|
||||
__asm__ __volatile__(
|
||||
@@ -408,6 +412,8 @@ static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
|
||||
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
|
||||
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
);
|
||||
/*Clear input data for next block (decoder only).*/
|
||||
__asm__ __volatile__(
|
||||
@@ -417,13 +423,15 @@ static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||
:[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
|
||||
+ :
|
||||
+ :"%xmm0"
|
||||
);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
-void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
+void __attribute__((target("sse2"))) oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
diff --git a/lib/x86/x86enquant.c b/lib/x86/x86enquant.c
|
||||
index 49368e8..8c01e51 100644
|
||||
--- a/lib/x86/x86enquant.c
|
||||
+++ b/lib/x86/x86enquant.c
|
||||
@@ -56,7 +56,7 @@ void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
|
||||
}
|
||||
}
|
||||
|
||||
-int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
+int __attribute__((target("sse2"))) oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
const ogg_uint16_t _dequant[64],const void *_enquant){
|
||||
ptrdiff_t r;
|
||||
__asm__ __volatile__(
|
||||
@@ -141,7 +141,9 @@ int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
"add %k[q],%k[r]\n\t"
|
||||
:[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
|
||||
:[dct]"r"(_dct),[qdct]"r"(_qdct)
|
||||
- :"cc","memory"
|
||||
+ :"cc","memory",
|
||||
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
|
||||
);
|
||||
return (int)r;
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
# This file is a placeholder because MXE expects to find .mk files in the plugins folders
|
||||
Reference in New Issue
Block a user