MXE: Fix libtheora assembly

XMM6 and XMM7 are callee-saved (nonvolatile) in the Windows x64 ABI, so inline assembly that modifies them must declare them as clobbered.
2026-05-21 05:40:49 +00:00 · 2025-10-26 10:35:38 +00:00
parent 9f2837f1f9
commit da58f5b621
3 changed files with 246 additions and 1 deletions
@@ -78,7 +78,7 @@ mxe_package(ogg)

 mxe_package(vorbis)

-mxe_package(theora)
+local_mxe_package(theora)

 mxe_package(flac)

@@ -0,0 +1,244 @@
+commit 305566da3d6e09deb014339d10eb0798be4791e8
+Author: Le Philousophe <lephilousophe@users.noreply.github.com>
+Date:   Tue Oct 21 19:52:09 2025 +0200
+
+    Mark XMM registers as clobbered
+
+diff --git a/lib/x86/sse2encfrag.c b/lib/x86/sse2encfrag.c
+index b7726c7..4da7d0c 100644
+--- a/lib/x86/sse2encfrag.c
+++ b/lib/x86/sse2encfrag.c
+@@ -64,7 +64,7 @@
+  "paddd %%xmm3,%%xmm2\n\t" \
+  "paddd %%xmm2,%%xmm7\n\t" \
+ 
+-unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src,
+  const unsigned char *_ref,int _ystride){
+   unsigned ret;
+   __asm__ __volatile__(
+@@ -82,6 +82,8 @@ unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
+     :[ret]"=a"(ret)
+     :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride),
+      [ystride3]"r"((ptrdiff_t)_ystride*3)
+    :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+   );
+   return ret;
+ }
+@@ -92,7 +94,7 @@ static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
+ 
+ /*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
+    horizontal sums as well as their 16-bit differences subject to a mask.
+-  %%xmm5 must contain OC_MASK_CONSTS[0...7] and %%xmm6 must contain 0.*/
+  %%xmm6 must contain OC_MASK_CONSTS[0...7] and %%xmm7 must contain 0.*/
+ #define OC_LOAD_SUB_MASK_2x8 \
+  "#OC_LOAD_SUB_MASK_2x8\n\t" \
+  /*Start the loads and expand the next 8 bits of the mask.*/ \
+@@ -127,7 +129,7 @@ static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
+  "psubw %%xmm2,%%xmm0\n\t" \
+  "psubw %%xmm3,%%xmm1\n\t" \
+ 
+-unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+  const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
+   ptrdiff_t ystride;
+   unsigned  ret;
+@@ -138,6 +140,7 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+     "movq %[c],%%xmm6\n\t"
+     :
+     :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
+    :"%xmm6", "%xmm7"
+   );
+   for(i=0;i<4;i++){
+     unsigned m;
+@@ -151,6 +154,9 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+         "paddd %%xmm0,%%xmm7\n\t"
+         "paddd %%xmm1,%%xmm7\n\t"
+         :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m)
+        :
+        :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+         "%xmm4", /*"%xmm5",*/ "%xmm6", "%xmm7"
+       );
+     }
+     _src+=2*ystride;
+@@ -164,6 +170,8 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+     "paddd %%xmm6,%%xmm7\n\t"
+     "movd %%xmm7,%[ret]\n\t"
+     :[ret]"=a"(ret)
+    :
+    :"%xmm6", "%xmm7"
+   );
+   return ret;
+ }
+@@ -381,7 +389,7 @@ unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
+  OC_HADAMARD_AB_8x8 \
+  OC_HADAMARD_C_ABS_ACCUM_8x8
+ 
+-static unsigned oc_int_frag_satd_sse2(int *_dc,
+static __attribute__((target("sse2"))) unsigned oc_int_frag_satd_sse2(int *_dc,
+  const unsigned char *_src,int _src_ystride,
+  const unsigned char *_ref,int _ref_ystride){
+   OC_ALIGN16(ogg_int16_t buf[16]);
+@@ -434,25 +442,27 @@ static unsigned oc_int_frag_satd_sse2(int *_dc,
+      [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
+     /*We have to use neg, so we actually clobber the condition codes for once
+        (not to mention sub, and add).*/
+-    :"cc"
+    :"cc",
+     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+   );
+   *_dc=dc;
+   return ret;
+ }
+ 
+-unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
+  const unsigned char *_ref,int _ystride){
+   return oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
+ }
+ 
+-unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
+  const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
+   OC_ALIGN8(unsigned char ref[64]);
+   oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
+   return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
+ }
+ 
+-unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc,
+  const unsigned char *_src,int _ystride){
+   OC_ALIGN16(ogg_int16_t buf[16]);
+   unsigned ret;
+@@ -491,7 +501,9 @@ unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
+     :[src]"r"(_src),[src4]"r"(_src+4*_ystride),
+      [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
+     /*We have to use sub, so we actually clobber the condition codes for once.*/
+-    :"cc"
+    :"cc",
+     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+   );
+   *_dc=dc;
+   return ret;
+diff --git a/lib/x86/sse2fdct.c b/lib/x86/sse2fdct.c
+index 7787cb9..e2d0a0d 100644
+--- a/lib/x86/sse2fdct.c
+++ b/lib/x86/sse2fdct.c
+@@ -354,7 +354,7 @@
+ /*SSE2 implementation of the fDCT for x86-64 only.
+   Because of the 8 extra XMM registers on x86-64, this version can operate
+    without any temporary stack access at all.*/
+-void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+void __attribute__((target("sse2"))) oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+   ptrdiff_t a;
+   __asm__ __volatile__(
+     /*Load the input.*/
+@@ -446,7 +446,11 @@ void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+ #undef OC_ZZ_LOAD_ROW_HI
+     :[a]"=&r"(a)
+     :[y]"r"(_y),[x]"r"(_x)
+-    :"memory"
+    :"memory",
+     "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
+     "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
+     "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
+     "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+   );
+ }
+ #endif
+diff --git a/lib/x86/sse2idct.c b/lib/x86/sse2idct.c
+index 1f71e3a..cf8c1e4 100644
+--- a/lib/x86/sse2idct.c
+++ b/lib/x86/sse2idct.c
+@@ -206,7 +206,7 @@ const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
+   "psraw $4,%%xmm7\n\t" \
+   "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
+ 
+-static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+static __attribute__((target("sse2"))) void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+   OC_ALIGN16(ogg_int16_t buf[16]);
+   int i;
+   /*This routine accepts an 8x8 matrix pre-transposed.*/
+@@ -230,8 +230,10 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+      [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+     :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+      [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+    :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+   );
+-  __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+  __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t":::"%xmm0");
+   /*Clear input data for next block (decoder only).*/
+   for(i=0;i<2;i++){
+     __asm__ __volatile__(
+@@ -240,6 +242,8 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+       "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+       "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+       :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+      :
+      :"%xmm0"
+     );
+   }
+ }
+@@ -392,7 +396,7 @@ static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+   "psubw %%xmm7,%%xmm4\n\t" \
+   "psubw %%xmm6,%%xmm5\n\t" \
+ 
+-static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+static __attribute__((target("sse2"))) void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+   OC_ALIGN16(ogg_int16_t buf[16]);
+   /*This routine accepts an 8x8 matrix pre-transposed.*/
+   __asm__ __volatile__(
+@@ -408,6 +412,8 @@ static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+      [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+      [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
+    :"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+   );
+   /*Clear input data for next block (decoder only).*/
+   __asm__ __volatile__(
+@@ -417,13 +423,15 @@ static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+     "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+     "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+     :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+    :
+    :"%xmm0"
+   );
+ }
+ 
+ /*Performs an inverse 8x8 Type-II DCT transform.
+   The input is assumed to be scaled by a factor of 4 relative to orthonormal
+    version of the transform.*/
+-void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+void __attribute__((target("sse2"))) oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+   /*_last_zzi is subtly different from an actual count of the number of
+      coefficients we decoded for this block.
+     It contains the value of zzi BEFORE the final token in the block was
+diff --git a/lib/x86/x86enquant.c b/lib/x86/x86enquant.c
+index 49368e8..8c01e51 100644
+--- a/lib/x86/x86enquant.c
+++ b/lib/x86/x86enquant.c
+@@ -56,7 +56,7 @@ void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
+   }
+ }
+ 
+-int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+int __attribute__((target("sse2"))) oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+  const ogg_uint16_t _dequant[64],const void *_enquant){
+   ptrdiff_t r;
+   __asm__ __volatile__(
+@@ -141,7 +141,9 @@ int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+     "add %k[q],%k[r]\n\t"
+     :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
+     :[dct]"r"(_dct),[qdct]"r"(_qdct)
+-    :"cc","memory"
+    :"cc","memory",
+     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+   );
+   return (int)r;
+ }
@@ -0,0 +1 @@
+# This file is a placeholder because MXE expects to find .mk files in the plugins folders
				`@@ -0,0 +1 @@`
				`# This file is a placeholder because MXE expects to find .mk files in the plugins folders`