MXE: Fix libtheora assembly

XMM6 and XMM7 are callee-saved in the Windows x64 ABI, so inline assembly that modifies them must declare them as clobbered.
This commit is contained in:
Le Philousophe
2025-10-26 10:35:38 +00:00
parent 9f2837f1f9
commit da58f5b621
3 changed files with 246 additions and 1 deletions
+1 -1
View File
@@ -78,7 +78,7 @@ mxe_package(ogg)
mxe_package(vorbis)
mxe_package(theora)
local_mxe_package(theora)
mxe_package(flac)
@@ -0,0 +1,244 @@
commit 305566da3d6e09deb014339d10eb0798be4791e8
Author: Le Philousophe <lephilousophe@users.noreply.github.com>
Date: Tue Oct 21 19:52:09 2025 +0200
Mark XMM registers as clobbered
diff --git a/lib/x86/sse2encfrag.c b/lib/x86/sse2encfrag.c
index b7726c7..4da7d0c 100644
--- a/lib/x86/sse2encfrag.c
+++ b/lib/x86/sse2encfrag.c
@@ -64,7 +64,7 @@
"paddd %%xmm3,%%xmm2\n\t" \
"paddd %%xmm2,%%xmm7\n\t" \
-unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride){
unsigned ret;
__asm__ __volatile__(
@@ -82,6 +82,8 @@ unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
:[ret]"=a"(ret)
:[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
[ystride3]"r"((ptrdiff_t)_ystride*3)
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
return ret;
}
@@ -92,7 +94,7 @@ static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
horizontal sums as well as their 16-bit differences subject to a mask.
- %%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/
+ %%xmm6 must contain OC_MASK_CONSTS[0...7] and %%xmm7 must contain 0.*/
#define OC_LOAD_SUB_MASK_2x8 \
"#OC_LOAD_SUB_MASK_2x8\n\t" \
/*Start the loads and expand the next 8 bits of the mask.*/ \
@@ -127,7 +129,7 @@ static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
"psubw %%xmm2,%%xmm0\n\t" \
"psubw %%xmm3,%%xmm1\n\t" \
-unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
ptrdiff_t ystride;
unsigned ret;
@@ -138,6 +140,7 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
"movq %[c],%%xmm6\n\t"
:
:[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
+ :"%xmm6", "%xmm7"
);
for(i=0;i<4;i++){
unsigned m;
@@ -151,6 +154,9 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
"paddd %%xmm0,%%xmm7\n\t"
"paddd %%xmm1,%%xmm7\n\t"
:[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
+ :
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", /*"%xmm5",*/ "%xmm6", "%xmm7"
);
}
_src+=2*ystride;
@@ -164,6 +170,8 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
"paddd %%xmm6,%%xmm7\n\t"
"movd %%xmm7,%[ret]\n\t"
:[ret]"=a"(ret)
+ :
+ :"%xmm6", "%xmm7"
);
return ret;
}
@@ -381,7 +389,7 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
OC_HADAMARD_AB_8x8 \
OC_HADAMARD_C_ABS_ACCUM_8x8
-static unsigned oc_int_frag_satd_sse2(int *_dc,
+static __attribute__((target("sse2"))) unsigned oc_int_frag_satd_sse2(int *_dc,
const unsigned char *_src,int _src_ystride,
const unsigned char *_ref,int _ref_ystride){
OC_ALIGN16(ogg_int16_t buf[16]);
@@ -434,25 +442,27 @@ static unsigned oc_int_frag_satd_sse2(int *_dc,
[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
/*We have to use neg, so we actually clobber the condition codes for once
(not to mention sub, and add).*/
- :"cc"
+ :"cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
*_dc=dc;
return ret;
}
-unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref,int _ystride){
return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
}
-unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
OC_ALIGN8(unsigned char ref[64]);
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
}
-unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc,
const unsigned char *_src,int _ystride){
OC_ALIGN16(ogg_int16_t buf[16]);
unsigned ret;
@@ -491,7 +501,9 @@ unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
/*We have to use sub, so we actually clobber the condition codes for once.*/
- :"cc"
+ :"cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
*_dc=dc;
return ret;
diff --git a/lib/x86/sse2fdct.c b/lib/x86/sse2fdct.c
index 7787cb9..e2d0a0d 100644
--- a/lib/x86/sse2fdct.c
+++ b/lib/x86/sse2fdct.c
@@ -354,7 +354,7 @@
/*SSE2 implementation of the fDCT for x86-64 only.
Because of the 8 extra XMM registers on x86-64, this version can operate
without any temporary stack access at all.*/
-void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void __attribute__((target("sse2"))) oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
ptrdiff_t a;
__asm__ __volatile__(
/*Load the input.*/
@@ -446,7 +446,11 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
#undef OC_ZZ_LOAD_ROW_HI
:[a]"=&r"(a)
:[y]"r"(_y),[x]"r"(_x)
- :"memory"
+ :"memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15"
);
}
#endif
diff --git a/lib/x86/sse2idct.c b/lib/x86/sse2idct.c
index 1f71e3a..cf8c1e4 100644
--- a/lib/x86/sse2idct.c
+++ b/lib/x86/sse2idct.c
@@ -206,7 +206,7 @@ const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
"psraw $4,%%xmm7\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
-static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+static __attribute__((target("sse2"))) void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
int i;
/*This routine accepts an 8x8 matrix pre-transposed.*/
@@ -230,8 +230,10 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
:[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
- __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+ __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t":::"%xmm0");
/*Clear input data for next block (decoder only).*/
for(i=0;i<2;i++){
__asm__ __volatile__(
@@ -240,6 +242,8 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
"movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
"movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
:[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+ :
+ :"%xmm0"
);
}
}
@@ -392,7 +396,7 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
"psubw %%xmm7,%%xmm4\n\t" \
"psubw %%xmm6,%%xmm5\n\t" \
-static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+static __attribute__((target("sse2"))) void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
@@ -408,6 +412,8 @@ static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
[y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
[c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+ :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
/*Clear input data for next block (decoder only).*/
__asm__ __volatile__(
@@ -417,13 +423,15 @@ static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
:[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+ :
+ :"%xmm0"
);
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+void __attribute__((target("sse2"))) oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
diff --git a/lib/x86/x86enquant.c b/lib/x86/x86enquant.c
index 49368e8..8c01e51 100644
--- a/lib/x86/x86enquant.c
+++ b/lib/x86/x86enquant.c
@@ -56,7 +56,7 @@ void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
}
}
-int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+int __attribute__((target("sse2"))) oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
const ogg_uint16_t _dequant[64],const void *_enquant){
ptrdiff_t r;
__asm__ __volatile__(
@@ -141,7 +141,9 @@ int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
"add %k[q],%k[r]\n\t"
:[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
:[dct]"r"(_dct),[qdct]"r"(_qdct)
- :"cc","memory"
+ :"cc","memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
return (int)r;
}
+1
View File
@@ -0,0 +1 @@
# This file is a placeholder: MXE expects to find a .mk file in each plugin folder.