diff --git a/Makefile b/Makefile index fb97d25..3e8f5e5 100644 --- a/Makefile +++ b/Makefile @@ -136,7 +136,7 @@ endif clean: rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS - rm -f checkasm checkasm.exe tools/checkasm.o + rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) - sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak diff --git a/common/common.c b/common/common.c index c163e09..18b9e1a 100644 --- a/common/common.c +++ b/common/common.c @@ -710,7 +710,7 @@ void *x264_malloc( int i_size ) buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) + sizeof( int ) ); align_buf = buf + 15 + sizeof( void ** ) + sizeof( int ); - align_buf -= (long) align_buf & 15; + align_buf -= (intptr_t) align_buf & 15; *( (void **) ( align_buf - sizeof( void ** ) ) ) = buf; *( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size; return align_buf; diff --git a/common/pixel.c b/common/pixel.c index f97f1b2..3f6aa3c 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -99,7 +99,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 { int64_t i_ssd = 0; int x, y; - int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15); + int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15); #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \ pix2 + y*i_pix2 + x, i_pix2 ); diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm index 4bfb330..dc03faa 100644 --- a/common/x86/cabac-a.asm +++ b/common/x86/cabac-a.asm @@ -32,7 +32,10 @@ cextern x264_cabac_transition cextern x264_cabac_renorm_shift ; t3 must be ecx, since it's used for shift. -%ifdef ARCH_X86_64 +%ifdef WIN64 + DECLARE_REG_TMP 3,1,2,0,4,5,6,10 + %define pointer resq +%elifdef ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,6,10 %define pointer resq %else @@ -67,10 +70,10 @@ endstruc %endmacro cglobal x264_cabac_encode_decision_asm, 0,7 - movifnidn t0d, r0m + movifnidn t0, r0mp movifnidn t1d, r1m - mov t5d, [r0+cb.range] - movzx t3d, byte [r0+cb.state+t1] + mov t5d, [t0+cb.range] + movzx t3d, byte [t0+cb.state+t1] mov t4d, t5d shr t5d, 6 and t5d, 3 @@ -80,23 +83,23 @@ cglobal x264_cabac_encode_decision_asm, 0,7 shr t6d, 6 movifnidn t2d, r2m cmp t6d, t2d - mov t6d, [r0+cb.low] + mov t6d, [t0+cb.low] lea t7, [t6+t4] cmovne t4d, t5d cmovne t6d, t7d LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2 movifnidn t1d, r1m - mov [r0+cb.state+t1], t3b + mov [t0+cb.state+t1], t3b .renorm: mov t3d, t4d shr t3d, 3 LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3 shl t4d, t3b shl t6d, t3b - add t3d, [r0+cb.queue] - mov [r0+cb.range], t4d - mov [r0+cb.low], t6d - mov [r0+cb.queue], t3d + add t3d, [t0+cb.queue] + mov [t0+cb.range], t4d + mov [t0+cb.low], t6d + mov [t0+cb.queue], t3d cmp t3d, 8 jge .putbyte REP_RET @@ -111,12 +114,12 @@ cglobal x264_cabac_encode_decision_asm, 0,7 sub t3d, 10 and t6d, t1d cmp t2b, 0xff ; FIXME is a 32bit op faster? 
- mov [r0+cb.queue], t3d - mov [r0+cb.low], t6d + mov [t0+cb.queue], t3d + mov [t0+cb.low], t6d mov t1d, t2d - mov t4, [r0+cb.p] + mov t4, [t0+cb.p] je .postpone - mov t5d, [r0+cb.bytes_outstanding] + mov t5d, [t0+cb.bytes_outstanding] shr t1d, 8 ; carry add [t4-1], t1b test t5d, t5d @@ -130,10 +133,10 @@ cglobal x264_cabac_encode_decision_asm, 0,7 .no_outstanding: mov [t4], t2b inc t4 - mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate - mov [r0+cb.p], t4 + mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate + mov [t0+cb.p], t4 RET .postpone: - inc dword [r0+cb.bytes_outstanding] + inc dword [t0+cb.bytes_outstanding] RET diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm index ebfe48d..f8f22bc 100644 --- a/common/x86/cpu-a.asm +++ b/common/x86/cpu-a.asm @@ -26,7 +26,26 @@ SECTION .text -%ifdef ARCH_X86_64 +%ifdef WIN64 + +;----------------------------------------------------------------------------- +; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) +;----------------------------------------------------------------------------- +cglobal x264_cpu_cpuid + push rbx + mov r10, rdx + mov r11, [rsp+48] + mov eax, ecx + cpuid + mov [r10], eax + mov [r8], ebx + mov [r9], ecx + mov [r11], edx + pop rbx + ret + +%elifdef ARCH_X86_64 + ;----------------------------------------------------------------------------- ; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- @@ -102,6 +121,7 @@ cglobal x264_stack_align call ecx leave ret + %endif ;----------------------------------------------------------------------------- diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm index be0008a..987c938 100644 --- a/common/x86/dct-32.asm +++ b/common/x86/dct-32.asm @@ -189,7 +189,7 @@ dct8_mmx: ; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal x264_sub8x8_dct8_mmx, 3,3 -global x264_sub8x8_dct8_mmx %+ .skip_prologue +global x264_sub8x8_dct8_mmx.skip_prologue .skip_prologue: INIT_MMX call load_diff_4x8_mmx @@ -255,7 +255,7 @@ idct8_mmx: ; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- cglobal x264_add8x8_idct8_mmx, 2,2 -global x264_add8x8_idct8_mmx %+ .skip_prologue +global x264_add8x8_idct8_mmx.skip_prologue .skip_prologue: INIT_MMX add word [r1], 32 @@ -348,7 +348,7 @@ INIT_XMM ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal x264_sub8x8_dct8_sse2, 3,3 -global x264_sub8x8_dct8_sse2 %+ .skip_prologue +global x264_sub8x8_dct8_sse2.skip_prologue .skip_prologue: LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] @@ -372,7 +372,7 @@ global x264_sub8x8_dct8_sse2 %+ .skip_prologue ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- cglobal x264_add8x8_idct8_sse2, 2,2 -global x264_add8x8_idct8_sse2 %+ .skip_prologue +global x264_add8x8_idct8_sse2.skip_prologue .skip_prologue: UNSPILL r1, 1,2,3,5,6,7 IDCT8_1D 0,1,2,3,4,5,6,7,r1 diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm index 171c4ab..2aecf3c 100644 --- 
a/common/x86/dct-64.asm +++ b/common/x86/dct-64.asm @@ -86,7 +86,7 @@ INIT_XMM ;----------------------------------------------------------------------------- ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- -cglobal x264_sub8x8_dct8_sse2 +cglobal x264_sub8x8_dct8_sse2, 3,3,10 LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] @@ -108,7 +108,7 @@ cglobal x264_sub8x8_dct8_sse2 movdqa [r0+0x50], m5 movdqa [r0+0x60], m6 movdqa [r0+0x70], m7 - ret + RET %macro IDCT8_1D 10 @@ -167,7 +167,7 @@ cglobal x264_sub8x8_dct8_sse2 ;----------------------------------------------------------------------------- ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- -cglobal x264_add8x8_idct8_sse2 +cglobal x264_add8x8_idct8_sse2, 2,2,10 movdqa m0, [r1+0x00] movdqa m1, [r1+0x10] movdqa m2, [r1+0x20] @@ -191,6 +191,6 @@ cglobal x264_add8x8_idct8_sse2 STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE] STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE] STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE] - ret + RET diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 012e25a..018cdf1 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -152,12 +152,16 @@ cglobal x264_add4x4_idct_mmx, 2,2 INIT_XMM -cglobal x264_sub8x8_dct_sse2, 3,3 +cglobal x264_sub8x8_dct_sse2, 3,3,8 .skip_prologue: call .8x4 add r0, 64 add r1, 4*FENC_STRIDE add r2, 4*FDEC_STRIDE +%ifdef WIN64 + call .8x4 + RET +%endif .8x4: SUB_DCT4 2x4x4W movhps [r0+32], m0 @@ -166,11 +170,15 @@ cglobal x264_sub8x8_dct_sse2, 3,3 movhps [r0+56], m3 ret -cglobal x264_add8x8_idct_sse2, 2,2 +cglobal x264_add8x8_idct_sse2, 2,2,8 .skip_prologue: call .8x4 add r1, 64 add r0, 4*FDEC_STRIDE +%ifdef WIN64 + call .8x4 + RET +%endif .8x4: movq m0, [r1+ 0] movq m1, [r1+ 8] @@ -189,6 +197,9 @@ cglobal x264_add8x8_idct_sse2, 2,2 %macro SUB_NxN_DCT 6 cglobal %1, 3,3 .skip_prologue: +%ifdef WIN64 + sub rsp, 8 +%endif call %2 add r0, %3 add r1, %4-%5-%6*FENC_STRIDE @@ -201,6 +212,9 @@ cglobal %1, 3,3 add r0, %3 add r1, %4-%5-%6*FENC_STRIDE add r2, %4-%5-%6*FDEC_STRIDE +%ifdef WIN64 + add rsp, 8 +%endif jmp %2 %endmacro @@ -210,6 +224,9 @@ cglobal %1, 3,3 %macro ADD_NxN_IDCT 6 cglobal %1, 2,2 .skip_prologue: +%ifdef WIN64 + sub rsp, 8 +%endif call %2 add r0, %4-%5-%6*FDEC_STRIDE add r1, %3 @@ -219,25 +236,30 @@ cglobal %1, 2,2 call %2 add r0, %4-%5-%6*FDEC_STRIDE add r1, %3 +%ifdef WIN64 + add rsp, 8 +%endif jmp %2 %endmacro %ifndef ARCH_X86_64 -SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0 -ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0 -SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4 -ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4 +SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0 +ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0 +SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4 +ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4 cextern x264_sub8x8_dct8_mmx.skip_prologue cextern x264_add8x8_idct8_mmx.skip_prologue -SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx 
%+ .skip_prologue, 128, 8, 0, 0 -ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0 -%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue +SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0 +ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0 +%define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue +%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue +%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue %define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue %endif -SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4 -ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4 +SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4 +ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4 cextern x264_sub8x8_dct8_sse2 cextern x264_add8x8_idct8_sse2 @@ -248,7 +270,7 @@ ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- %macro SCAN_8x8 1 -cglobal x264_zigzag_scan_8x8_frame_%1, 2,2 +cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8 movdqa xmm0, [r1] movdqa xmm1, [r1+16] movdq2q mm0, xmm0 @@ -523,7 +545,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 ;----------------------------------------------------------------------------- ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst ) ;----------------------------------------------------------------------------- -cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3 +cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8 movd xmm0, [r1+0*FENC_STRIDE] movd xmm1, [r1+1*FENC_STRIDE] movd xmm2, [r1+2*FENC_STRIDE] diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index c722b57..75b308f 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -278,7 +278,7 @@ SECTION .text ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_XMM -cglobal x264_deblock_v_luma_sse2 +cglobal x264_deblock_v_luma_sse2, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2 DEBLOCK_P0_Q0 mova [r4+2*r1], m1 mova [r0], m2 - ret + RET ;----------------------------------------------------------------------------- ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2 - movsxd r10, esi +cglobal x264_deblock_h_luma_sse2, 5,7 + movsxd r10, r1d lea r11, [r10+r10*2] - lea rax, [r0-4] - lea r9, [r0-4+r11] + lea r6, [r0-4] + lea r5, [r0-4+r11] +%ifdef WIN64 + sub rsp, 0x98 + %define pix_tmp rsp+0x30 +%else sub rsp, 0x68 %define pix_tmp rsp +%endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp - lea rax, [rax+r10*8] - lea r9, [r9 +r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp + lea r6, [r6+r10*8] + lea r5, [r5+r10*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; 
don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] - mov esi, 0x10 + mov r1d, 0x10 +%ifdef WIN64 + mov [rsp+0x20], r4 +%endif call x264_deblock_v_luma_sse2 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - add rax, 2 - add r9, 2 + add r6, 2 + add r5, 2 movq m0, [pix_tmp+0x18] movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 - sub rax, r10 - sub r9, r10 + sub r6, r10 + sub r5, r10 shr r10, 3 movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) +%ifdef WIN64 + add rsp, 0x98 +%else add rsp, 0x68 - ret +%endif + RET %else @@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 mova m3, [r0+r1] ; q1 LOAD_MASK r2, r3 - mov r3, r4m + mov r3, r4mp movd m4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] @@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 ;----------------------------------------------------------------------------- INIT_MMX cglobal x264_deblock_h_luma_%1, 0,5 - mov r0, r0m + mov r0, r0mp mov r3, r1m lea r4, [r3*3] sub r0, 4 @@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 ADD esp, 20 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - mov r0, r0m + mov r0, r0mp sub r0, 2 lea r1, [r0+r4] @@ -609,7 +621,7 @@ DEBLOCK_LUMA sse2, v, 16 ;----------------------------------------------------------------------------- ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6 +cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -671,34 +683,34 @@ INIT_MMX ;----------------------------------------------------------------------------- ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1 +cglobal x264_deblock_h_luma_intra_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] - lea rax, [r0-4] - lea r9, [r0-4+r11] + lea r6, [r0-4] + lea r5, [r0-4+r11] sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea rax, [rax+r10*8] - lea r9, [r9+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r10*8] + lea r5, [r5+r10*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) lea r0, [pix_tmp+0x40] mov r1, 0x10 call x264_deblock_v_luma_intra_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r9, [rax+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) + lea r5, [r6+r11] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) shl r10, 3 - sub rax, r10 - sub r9, r10 + sub r6, r10 + sub r5, r10 shr r10, 3 - 
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) add rsp, 0x88 - ret + RET %else cglobal x264_deblock_h_luma_intra_%1, 2,4 lea r3, [r1*3] @@ -727,7 +739,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 ADD esp, 16 mov r1, r1m - mov r0, r0m + mov r0, r0mp lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 3b0ffda..6a58700 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -42,21 +42,22 @@ SECTION .text ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 %ifdef ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,10,11 - %macro AVG_START 0 - PROLOGUE 6,7 - .height_loop: + %macro AVG_START 0-1 0 + PROLOGUE 6,7,%1 +%ifdef WIN64 + movsxd r5, r5d +%endif %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 - %macro AVG_START 0 - PROLOGUE 0,7 + %macro AVG_START 0-1 0 + PROLOGUE 0,7,%1 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m mov t4, r4m mov t5, r5m - .height_loop: %endmacro %endif @@ -126,10 +127,11 @@ SECTION .text ;----------------------------------------------------------------------------- ; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight ) ;----------------------------------------------------------------------------- -%macro AVG_WEIGHT 2 -cglobal x264_pixel_avg_weight_w%2_%1, 0,0 +%macro AVG_WEIGHT 3 +cglobal x264_pixel_avg_weight_w%2_%1 + AVG_START %3 BIWEIGHT_START - AVG_START +.height_loop: %if %2==8 && mmsize==16 BIWEIGHT [t2], [t4] SWAP 0, 2 @@ -156,20 +158,20 @@ cglobal x264_pixel_avg_weight_w%2_%1, 0,0 %define BIWEIGHT BIWEIGHT_MMX %define BIWEIGHT_START BIWEIGHT_START_MMX INIT_MMX -AVG_WEIGHT mmxext, 4 -AVG_WEIGHT mmxext, 8 -AVG_WEIGHT mmxext, 16 +AVG_WEIGHT mmxext, 4, 0 +AVG_WEIGHT mmxext, 8, 0 +AVG_WEIGHT mmxext, 16, 0 INIT_XMM %define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext -AVG_WEIGHT sse2, 8 -AVG_WEIGHT sse2, 16 +AVG_WEIGHT sse2, 8, 8 +AVG_WEIGHT sse2, 16, 8 %define BIWEIGHT BIWEIGHT_SSSE3 %define BIWEIGHT_START BIWEIGHT_START_SSSE3 INIT_MMX -AVG_WEIGHT ssse3, 4 +AVG_WEIGHT ssse3, 4, 0 INIT_XMM -AVG_WEIGHT ssse3, 8 -AVG_WEIGHT ssse3, 16 +AVG_WEIGHT ssse3, 8, 8 +AVG_WEIGHT ssse3, 16, 8 @@ -182,7 +184,7 @@ AVG_WEIGHT ssse3, 16 ; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 3 -cglobal x264_pixel_avg_%1x%2_%3,0,0 +cglobal x264_pixel_avg_%1x%2_%3 mov eax, %2 cmp dword r6m, 32 jne x264_pixel_avg_weight_w%1_%3 @@ -211,6 +213,7 @@ cglobal x264_pixel_avg_%1x%2_%3,0,0 %macro AVG_FUNC 3 cglobal %1 AVG_START +.height_loop: %2 m0, [t2] %2 m1, [t2+t3] pavgb m0, [t4] @@ -233,6 +236,7 @@ AVGH 8, 4, mmxext cglobal x264_pixel_avg_w16_mmxext AVG_START +.height_loop: movq mm0, [t2 ] movq mm1, [t2+8] movq mm2, [t2+t3 ] @@ -423,7 +427,7 @@ AVG2_W20 sse2_misalign %endmacro %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set -cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0 +cglobal x264_pixel_avg2_w%1_cache%2_%3 mov eax, r2m and eax, 0x1f|(%2>>1) cmp eax, (32-%1)|(%2>>1) @@ -624,7 +628,7 @@ cglobal x264_prefetch_fenc_mmxext, 5,5 lea r2, [r2+r4+64] prefetcht0 [r2] prefetcht0 [r2+r3] - ret + RET %else cglobal x264_prefetch_fenc_mmxext @@ -668,7 +672,7 @@ cglobal x264_prefetch_ref_mmxext, 3,3 prefetcht0 [r0+r1] prefetcht0 [r0+r1*2] prefetcht0 [r0+r2] - ret + RET @@ -684,7 +688,7 @@ cglobal 
x264_prefetch_ref_mmxext, 3,3 %endif %macro MC_CHROMA_START 0 - movifnidn r2d, r2m + movifnidn r2, r2mp movifnidn r3d, r3m movifnidn r4d, r4m movifnidn r5d, r5m @@ -704,13 +708,13 @@ cglobal x264_prefetch_ref_mmxext, 3,3 ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- -%macro MC_CHROMA 1 -cglobal x264_mc_chroma_%1, 0,6 +%macro MC_CHROMA 2 +cglobal x264_mc_chroma_%1 %if mmsize == 16 cmp dword r6m, 4 - jle x264_mc_chroma_mmxext %+ .skip_prologue + jle x264_mc_chroma_mmxext %endif -.skip_prologue: + PROLOGUE 0,6,%2 MC_CHROMA_START pxor m3, m3 and r4d, 7 ; dx &= 7 @@ -739,7 +743,7 @@ cglobal x264_mc_chroma_%1, 0,6 mov r10, r0 mov r11, r2 %else - mov r0, r0m + mov r0, r0mp mov r1, r1m mov r5, r2 %endif @@ -781,9 +785,9 @@ cglobal x264_mc_chroma_%1, 0,6 lea r0, [r10+4] ; dst lea r2, [r11+4] ; src %else - mov r0, r0m + mov r0, r0mp lea r2, [r5+4] - add r0, 4 + add r0, 4 %endif mov r4d, r7m ; height jmp .loop2d @@ -804,7 +808,7 @@ cglobal x264_mc_chroma_%1, 0,6 SPLATW m6, m6 mova m7, [pw_4 GLOBAL] psubw m5, m6 - movifnidn r0d, r0m + movifnidn r0, r0mp movifnidn r1d, r1m mov r4d, r7m %if mmsize == 8 @@ -862,12 +866,12 @@ cglobal x264_mc_chroma_%1, 0,6 %endmacro ; MC_CHROMA INIT_MMX -MC_CHROMA mmxext +MC_CHROMA mmxext, 0 INIT_XMM -MC_CHROMA sse2 +MC_CHROMA sse2, 8 INIT_MMX -cglobal x264_mc_chroma_ssse3, 0,6 +cglobal x264_mc_chroma_ssse3, 0,6,8 MC_CHROMA_START and r4d, 7 and r5d, 7 @@ -884,7 +888,7 @@ cglobal x264_mc_chroma_ssse3, 0,6 mova m5, [pw_32 GLOBAL] movd m6, r5d movd m7, r4d - movifnidn r0d, r0m + movifnidn r0, r0mp movifnidn r1d, r1m movifnidn r4d, r7m SPLATW m6, m6 @@ -925,7 +929,7 @@ INIT_XMM mova m5, [pw_32 GLOBAL] movd m6, r5d movd m7, r4d - movifnidn r0d, r0m + movifnidn r0, r0mp movifnidn r1d, r1m movifnidn r4d, r7m SPLATW m6, m6 diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index b392022..9a23fb8 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -121,11 +121,14 @@ SECTION .text INIT_MMX -%macro HPEL_V 1 +%macro HPEL_V 2 ;----------------------------------------------------------------------------- ; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width ); ;----------------------------------------------------------------------------- -cglobal x264_hpel_filter_v_%1, 5,6 +cglobal x264_hpel_filter_v_%1, 5,6,%2 +%ifdef WIN64 + movsxd r4, r4d +%endif lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -154,7 +157,7 @@ cglobal x264_hpel_filter_v_%1, 5,6 jl .loop REP_RET %endmacro -HPEL_V mmxext +HPEL_V mmxext, 0 ;----------------------------------------------------------------------------- ; void x264_hpel_filter_c_mmxext( uint8_t *dst, int16_t *buf, int width ); @@ -235,7 +238,7 @@ INIT_XMM ;----------------------------------------------------------------------------- ; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width ); ;----------------------------------------------------------------------------- -cglobal x264_hpel_filter_c_%1, 3,3 +cglobal x264_hpel_filter_c_%1, 3,3,9 add r0, r2 lea r1, [r1+r2*2] neg r2 @@ -287,7 +290,7 @@ cglobal x264_hpel_filter_c_%1, 3,3 ;----------------------------------------------------------------------------- ; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width ); ;----------------------------------------------------------------------------- -cglobal x264_hpel_filter_h_sse2, 3,3 +cglobal x264_hpel_filter_h_sse2, 3,3,8 add r0, r2 add r1, r2 neg r2 @@ -404,7 +407,7 @@ cglobal 
x264_hpel_filter_h_ssse3, 3,3 %ifndef ARCH_X86_64 HPEL_C sse2 %endif -HPEL_V sse2 +HPEL_V sse2, 8 HPEL_C sse2_misalign %define PALIGNR PALIGNR_SSSE3 HPEL_C ssse3 @@ -473,7 +476,11 @@ HPEL_C ssse3 ; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; uint8_t *src, int stride, int width, int height) ;----------------------------------------------------------------------------- -cglobal x264_hpel_filter_%1, 7,7 +cglobal x264_hpel_filter_%1, 7,7,16 +%ifdef WIN64 + movsxd r4, r4d + movsxd r5, r5d +%endif mov r10, r3 sub r5, 16 mov r11, r1 @@ -858,8 +865,11 @@ INTEGRAL_INIT sse2 ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; int src_stride, int dst_stride, int width, int height ) ;----------------------------------------------------------------------------- -%macro FRAME_INIT_LOWRES 1 ; FIXME -cglobal x264_frame_init_lowres_core_%1, 6,7 +%macro FRAME_INIT_LOWRES 2 ; FIXME +cglobal x264_frame_init_lowres_core_%1, 6,7,%2 +%ifdef WIN64 + movsxd r5, r5d +%endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m dec r6d @@ -964,11 +974,11 @@ cglobal x264_frame_init_lowres_core_%1, 6,7 INIT_MMX %define PALIGNR PALIGNR_MMX -FRAME_INIT_LOWRES mmxext +FRAME_INIT_LOWRES mmxext, 0 %ifndef ARCH_X86_64 -FRAME_INIT_LOWRES cache32_mmxext +FRAME_INIT_LOWRES cache32_mmxext, 0 %endif INIT_XMM -FRAME_INIT_LOWRES sse2 +FRAME_INIT_LOWRES sse2, 12 %define PALIGNR PALIGNR_SSSE3 -FRAME_INIT_LOWRES ssse3 +FRAME_INIT_LOWRES ssse3, 12 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 9056007..3272551 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -204,7 +204,7 @@ void x264_sfence( void );\ static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\ int stride, int width, int height, int16_t *buf )\ {\ - int realign = (long)src & (align-1);\ + int realign = (intptr_t)src & (align-1);\ src -= realign;\ dstv -= realign;\ dstc -= realign;\ diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 42f9113..4426e33 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -170,8 +170,8 @@ SECTION .text ;----------------------------------------------------------------------------- ; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -%macro SSD 3 -cglobal x264_pixel_ssd_%1x%2_%3, 4,4 +%macro SSD 4 +cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4 %if %1 >= mmsize pxor m7, m7 %endif @@ -193,19 +193,19 @@ cglobal x264_pixel_ssd_%1x%2_%3, 4,4 %endmacro INIT_MMX -SSD 16, 16, mmx -SSD 16, 8, mmx -SSD 8, 16, mmx -SSD 8, 8, mmx -SSD 8, 4, mmx -SSD 4, 8, mmx -SSD 4, 4, mmx +SSD 16, 16, mmx, 0 +SSD 16, 8, mmx, 0 +SSD 8, 16, mmx, 0 +SSD 8, 8, mmx, 0 +SSD 8, 4, mmx, 0 +SSD 4, 8, mmx, 0 +SSD 4, 4, mmx, 0 INIT_XMM -SSD 16, 16, sse2 -SSD 16, 8, sse2 -SSD 8, 16, sse2 -SSD 8, 8, sse2 -SSD 8, 4, sse2 +SSD 16, 16, sse2, 8 +SSD 16, 8, sse2, 8 +SSD 8, 16, sse2, 5 +SSD 8, 8, sse2, 5 +SSD 8, 4, sse2, 5 cglobal x264_pixel_ssd_4x8_sse4, 4,4 SSD_QUARTER 0, 0, r1, r3, 0, 1 @@ -294,12 +294,12 @@ cglobal x264_pixel_var_8x8_mmxext, 2,3 VAR_END 6 INIT_XMM -cglobal x264_pixel_var_16x16_sse2, 2,3 +cglobal x264_pixel_var_16x16_sse2, 2,3,8 VAR_START VAR_2ROW r1, 8 VAR_END 8 -cglobal x264_pixel_var_8x8_sse2, 2,3 +cglobal x264_pixel_var_8x8_sse2, 2,3,8 VAR_START mov t3d, 4 .loop: @@ -524,8 +524,8 @@ SATD_W4 mmxext lea r0, [r10+8] lea r2, [r11+8] %else - mov r0, r0m - mov r2, r2m + mov r0, r0mp + mov r2, r2mp add r0, 8 add r2, 8 
%endif @@ -548,7 +548,7 @@ x264_pixel_satd_4x8_internal_%1: SATD_8x4_SSE2 %1 ret -cglobal x264_pixel_satd_16x16_%1, 4,6 +cglobal x264_pixel_satd_16x16_%1, 4,6,8 SATD_START_SSE2 BACKUP_POINTERS call x264_pixel_satd_8x8_internal_%1 @@ -562,7 +562,7 @@ cglobal x264_pixel_satd_16x16_%1, 4,6 call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 -cglobal x264_pixel_satd_16x8_%1, 4,6 +cglobal x264_pixel_satd_16x8_%1, 4,6,8 SATD_START_SSE2 BACKUP_POINTERS call x264_pixel_satd_8x8_internal_%1 @@ -570,7 +570,7 @@ cglobal x264_pixel_satd_16x8_%1, 4,6 call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 -cglobal x264_pixel_satd_8x16_%1, 4,6 +cglobal x264_pixel_satd_8x16_%1, 4,6,8 SATD_START_SSE2 call x264_pixel_satd_8x8_internal_%1 lea r0, [r0+4*r1] @@ -578,17 +578,17 @@ cglobal x264_pixel_satd_8x16_%1, 4,6 call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 -cglobal x264_pixel_satd_8x8_%1, 4,6 +cglobal x264_pixel_satd_8x8_%1, 4,6,8 SATD_START_SSE2 call x264_pixel_satd_8x8_internal_%1 SATD_END_SSE2 -cglobal x264_pixel_satd_8x4_%1, 4,6 +cglobal x264_pixel_satd_8x4_%1, 4,6,8 SATD_START_SSE2 call x264_pixel_satd_8x4_internal_%1 SATD_END_SSE2 -cglobal x264_pixel_satd_4x8_%1, 4,6 +cglobal x264_pixel_satd_4x8_%1, 4,6,8 INIT_XMM LOAD_MM_PERMUTATION satd_4x8_internal %define movh movd @@ -636,7 +636,7 @@ cglobal x264_pixel_sa8d_8x8_internal_%1 pavgw m0, m4 ret -cglobal x264_pixel_sa8d_8x8_%1, 4,6 +cglobal x264_pixel_sa8d_8x8_%1, 4,6,10 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 @@ -644,9 +644,9 @@ cglobal x264_pixel_sa8d_8x8_%1, 4,6 movd eax, m0 add eax, 1 shr eax, 1 - ret + RET -cglobal x264_pixel_sa8d_16x16_%1, 4,6 +cglobal x264_pixel_sa8d_16x16_%1, 4,6,11 lea r4, [3*r1] lea r5, [3*r3] call x264_pixel_sa8d_8x8_internal_%1 ; pix[0] @@ -667,7 +667,7 @@ cglobal x264_pixel_sa8d_16x16_%1, 4,6 movd eax, m0 add eax, 1 shr eax, 1 - ret + RET %else ; ARCH_X86_32 cglobal x264_pixel_sa8d_8x8_internal_%1 @@ -783,7 +783,7 @@ INIT_XMM ;----------------------------------------------------------------------------- ; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res ) ;----------------------------------------------------------------------------- -cglobal x264_intra_sa8d_x3_8x8_core_%1 +cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] @@ -807,11 +807,11 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1 HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7 ; dc - movzx edi, word [r1+0] - add di, word [r1+16] - add edi, 8 - and edi, -16 - shl edi, 2 + movzx r0d, word [r1+0] + add r0w, word [r1+16] + add r0d, 8 + and r0d, -16 + shl r0d, 2 pxor m15, m15 movdqa m8, m2 @@ -839,7 +839,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1 movdqa m14, m15 ; 7x8 sum movdqa m8, [r1+0] ; left edge - movd m9, edi + movd m9, r0d psllw m8, 3 psubw m8, m0 psubw m9, m0 @@ -882,7 +882,7 @@ cglobal x264_intra_sa8d_x3_8x8_core_%1 movq [r2], m3 ; i8x8_v, i8x8_h psrldq m3, 8 movd [r2+8], m3 ; i8x8_dc - ret + RET %endif ; ARCH_X86_64 %endmacro ; INTRA_SA8D_SSE2 @@ -1055,7 +1055,7 @@ cglobal x264_intra_satd_x3_4x4_%1, 2,6 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw %ifndef ARCH_X86_64 - mov r2, r2m + mov r2, r2mp %endif movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd @@ -1087,7 +1087,7 @@ cglobal x264_intra_satd_x3_16x16_%1, 0,7 %define sums rsp+64 ; size 24 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 - movifnidn r1d, r1m + movifnidn r1, r1mp CLEAR_SUMS ; 1D hadamards @@ -1105,7 +1105,7 @@ cglobal x264_intra_satd_x3_16x16_%1, 0,7 and t2d, -16 
; dc ; 2D hadamards - movifnidn r0d, r0m + movifnidn r0, r0mp xor r3d, r3d .loop_y: xor r4d, r4d @@ -1133,7 +1133,7 @@ cglobal x264_intra_satd_x3_16x16_%1, 0,7 jl .loop_y ; horizontal sum - movifnidn r2d, r2m + movifnidn r2, r2mp movq m2, [sums+16] movq m1, [sums+8] movq m0, [sums+0] @@ -1160,7 +1160,7 @@ cglobal x264_intra_satd_x3_8x8c_%1, 0,6 %define dc_1d rsp+32 ; size 16 %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 - movifnidn r1d, r1m + movifnidn r1, r1mp CLEAR_SUMS ; 1D hadamards @@ -1193,8 +1193,8 @@ cglobal x264_intra_satd_x3_8x8c_%1, 0,6 lea r5, [dc_1d] ; 2D hadamards - movifnidn r0d, r0m - movifnidn r2d, r2m + movifnidn r0, r0mp + movifnidn r2, r2mp xor r3d, r3d .loop_y: xor r4d, r4d @@ -1511,7 +1511,7 @@ HADAMARD_AC_WXH_SSE2 8, 8, %1 ; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 3 -cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3 +cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 sub rsp, 48+pad @@ -1596,7 +1596,7 @@ SATDS_SSE2 ssse3_phadd ; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1, ; const uint8_t *pix2, int stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4 +cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8 pxor m0, m0 pxor m1, m1 pxor m2, m2 @@ -1635,11 +1635,14 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4 punpckldq m3, m4 punpckhdq m5, m4 -%ifdef ARCH_X86_64 +%ifdef WIN64 + %define t0 rax + mov t0, r4mp +%elifdef ARCH_X86_64 %define t0 r4 %else %define t0 eax - mov t0, r4m + mov t0, r4mp %endif movq [t0+ 0], m1 @@ -1652,7 +1655,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4 ;----------------------------------------------------------------------------- ; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_ssim_end4_sse2, 3,3 +cglobal x264_pixel_ssim_end4_sse2, 3,3,7 movdqa m0, [r0+ 0] movdqa m1, [r0+16] movdqa m2, [r0+32] @@ -1724,6 +1727,10 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3 %macro ADS_START 1 ; unroll_size %ifdef ARCH_X86_64 %define t0 r6 +%ifdef WIN64 + mov r4, r4mp + movsxd r5, dword r5m +%endif mov r10, rsp %else %define t0 r4 @@ -1743,6 +1750,9 @@ cglobal x264_pixel_ssim_end4_sse2, 3,3 add t0, 4*%1 sub r0d, 4*%1 jg .loop +%ifdef WIN64 + RESTORE_XMM r10 +%endif jmp ads_mvs %endmacro @@ -1776,7 +1786,9 @@ cglobal x264_pixel_ads4_mmxext, 4,7 ABS1 mm3, mm1 paddw mm0, mm2 paddw mm0, mm3 -%ifdef ARCH_X86_64 +%ifdef WIN64 + pshufw mm1, [r10+stack_offset+56], 0 +%elifdef ARCH_X86_64 pshufw mm1, [r10+8], 0 %else pshufw mm1, [ebp+stack_offset+28], 0 @@ -1830,7 +1842,7 @@ cglobal x264_pixel_ads1_mmxext, 4,7 ADS_END 2 %macro ADS_SSE2 1 -cglobal x264_pixel_ads4_%1, 4,7 +cglobal x264_pixel_ads4_%1, 4,7,12 movdqa xmm4, [r0] pshuflw xmm7, xmm4, 0 pshuflw xmm6, xmm4, 0xAA @@ -1899,7 +1911,7 @@ cglobal x264_pixel_ads4_%1, 4,7 %endif ; ARCH ADS_END 2 -cglobal x264_pixel_ads2_%1, 4,7 +cglobal x264_pixel_ads2_%1, 4,7,8 movq xmm6, [r0] movd xmm5, r6m pshuflw xmm7, xmm6, 0 @@ -1925,7 +1937,7 @@ cglobal x264_pixel_ads2_%1, 4,7 movq [t0], xmm1 ADS_END 2 -cglobal x264_pixel_ads1_%1, 4,7 +cglobal x264_pixel_ads1_%1, 4,7,8 movd xmm7, [r0] movd xmm6, r6m pshuflw xmm7, xmm7, 0 @@ -1971,20 +1983,24 @@ ADS_SSE2 ssse3 ; } ; return nmv; ; } -cglobal x264_pixel_ads_mvs +cglobal x264_pixel_ads_mvs, 0,7,0 
ads_mvs: - xor eax, eax - xor esi, esi %ifdef ARCH_X86_64 ; mvs = r4 ; masks = rsp ; width = r5 ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.) - mov dword [rsp+r5], 0 +%ifdef WIN64 + mov r8, r4 + mov r9, r5 +%endif + xor eax, eax + xor esi, esi + mov dword [rsp+r9], 0 jmp .loopi .loopi0: add esi, 8 - cmp esi, r5d + cmp esi, r9d jge .end .loopi: mov rdi, [rsp+rsi] @@ -1992,7 +2008,7 @@ ads_mvs: jz .loopi0 xor ecx, ecx %macro TEST 1 - mov [r4+rax*2], si + mov [r8+rax*2], si test edi, 0xff<<(%1*8) setne cl add eax, ecx @@ -2007,14 +2023,15 @@ ads_mvs: TEST 1 TEST 2 TEST 3 - cmp esi, r5d + cmp esi, r9d jl .loopi .end: mov rsp, r10 - ret + RET %else - ; no PROLOGUE, inherit from x264_pixel_ads1 + xor eax, eax + xor esi, esi mov ebx, [ebp+stack_offset+20] ; mvs mov edi, [ebp+stack_offset+24] ; width mov dword [esp+edi], 0 diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 394db3c..cececbe 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -71,16 +71,16 @@ decimate_mask_table4: SECTION .text %macro QUANT_DC_START 0 - movd m6, r1m ; mf - movd m7, r2m ; bias + movd m2, r1m ; mf + movd m3, r2m ; bias %ifidn m0, mm0 - pshufw m6, m6, 0 - pshufw m7, m7, 0 + pshufw m2, m2, 0 + pshufw m3, m3, 0 %else - pshuflw m6, m6, 0 - pshuflw m7, m7, 0 - punpcklqdq m6, m6 - punpcklqdq m7, m7 + pshuflw m2, m2, 0 + pshuflw m3, m3, 0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 %endif %endmacro @@ -125,7 +125,7 @@ cglobal %1, 1,1 QUANT_DC_START %assign x 0 %rep %2 - QUANT_ONE [r0+x], m6, m7 + QUANT_ONE [r0+x], m2, m3 %assign x x+mmsize %endrep RET @@ -177,32 +177,32 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1 %macro DEQUANT16_L 3 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] -;;; m5 i_qbits +;;; m2 i_qbits mova m0, %2 packssdw m0, %3 pmullw m0, %1 - psllw m0, m5 + psllw m0, m2 mova %1, m0 %endmacro %macro DEQUANT32_R 3 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] -;;; m5 -i_qbits -;;; m6 f -;;; m7 0 +;;; m2 -i_qbits +;;; m3 f +;;; m4 0 mova m0, %1 mova m1, m0 - punpcklwd m0, m7 - punpckhwd m1, m7 + punpcklwd m0, m4 + punpckhwd m1, m4 pmaddwd m0, %2 pmaddwd m1, %3 - paddd m0, m6 - paddd m1, m6 - psrad m0, m5 - psrad m1, m5 + paddd m0, m3 + paddd m1, m3 + psrad m0, m2 + psrad m1, m2 packssdw m0, m1 mova %1, m0 %endmacro @@ -215,15 +215,15 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1 %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3] sub t0d, 16*%3 jge %%loop - rep ret + REP_RET %else %1 [r0+8*%3], [r1+16*%3], [r1+24*%3] %1 [r0 ], [r1 ], [r1+ 8*%3] - ret + RET %endif %endmacro -%macro DEQUANT16_FLAT 2-8 +%macro DEQUANT16_FLAT 2-5 mova m0, %1 %assign i %0-2 %rep %0-1 @@ -233,14 +233,16 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1 %else pmullw m0, [r0+%2] %endif - psllw m %+ i, m7 + psllw m %+ i, m4 mova [r0+%2], m %+ i %assign i i-1 %rotate 1 %endrep %endmacro -%ifdef ARCH_X86_64 +%ifdef WIN64 + DECLARE_REG_TMP 6,3,2 +%elifdef ARCH_X86_64 DECLARE_REG_TMP 4,3,2 %else DECLARE_REG_TMP 2,0,1 @@ -257,8 +259,8 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1 %ifdef ARCH_X86_64 add r1, t2 ; dequant_mf[i_mf] %else - add r1, r1m ; dequant_mf[i_mf] - mov r0, r0m ; dct + add r1, r1mp ; dequant_mf[i_mf] + mov r0, r0mp ; dct %endif sub t0d, %2 jl .rshift32 ; negative qbits => rightshift @@ -269,26 +271,27 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1 ;----------------------------------------------------------------------------- %macro DEQUANT 4 cglobal x264_dequant_%2x%2_%1, 0,3 +x264_dequant_%2x%2_%1.skip_prologue: DEQUANT_START %3+2, %3 .lshift: - movd m5, t0d + movd m2, t0d 
DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4 .rshift32: neg t0d - movd m5, t0d - mova m6, [pd_1 GLOBAL] - pxor m7, m7 - pslld m6, m5 - psrld m6, 1 + movd m2, t0d + mova m3, [pd_1 GLOBAL] + pxor m4, m4 + pslld m3, m2 + psrld m3, 1 DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4 cglobal x264_dequant_%2x%2_flat16_%1, 0,3 movifnidn t2d, r2m %if %2 == 8 cmp t2d, 12 - jl x264_dequant_%2x%2_%1 + jl x264_dequant_%2x%2_%1.skip_prologue sub t2d, 12 %endif imul t0d, t2d, 0x2b @@ -303,8 +306,8 @@ cglobal x264_dequant_%2x%2_flat16_%1, 0,3 %else lea r1, [dequant%2_scale + t2 GLOBAL] %endif - movifnidn r0d, r0m - movd m7, t0d + movifnidn r0, r0mp + movd m4, t0d %if %2 == 4 %ifidn %1, mmx DEQUANT16_FLAT [r1], 0, 16 @@ -322,7 +325,7 @@ cglobal x264_dequant_%2x%2_flat16_%1, 0,3 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112 DEQUANT16_FLAT [r1+32], 32, 96 %endif - ret + RET %endmacro ; DEQUANT %ifndef ARCH_X86_64 @@ -339,21 +342,21 @@ cglobal x264_dequant_4x4dc_%1, 0,3 DEQUANT_START 6, 6 .lshift: - movd m6, [r1] - movd m5, t0d - pslld m6, m5 + movd m3, [r1] + movd m2, t0d + pslld m3, m2 %if mmsize==16 - pshuflw m6, m6, 0 - punpcklqdq m6, m6 + pshuflw m3, m3, 0 + punpcklqdq m3, m3 %else - pshufw m6, m6, 0 + pshufw m3, m3, 0 %endif %assign x 0 %rep 16/mmsize mova m0, [r0+mmsize*0+x] mova m1, [r0+mmsize*1+x] - pmullw m0, m6 - pmullw m1, m6 + pmullw m0, m3 + pmullw m1, m3 mova [r0+mmsize*0+x], m0 mova [r0+mmsize*1+x], m1 %assign x x+mmsize*2 @@ -362,28 +365,28 @@ cglobal x264_dequant_4x4dc_%1, 0,3 .rshift32: neg t0d - movd m5, t0d - mova m6, [pw_1 GLOBAL] - mova m7, m6 - pslld m6, m5 - psrld m6, 1 - movd m4, [r1] + movd m3, t0d + mova m4, [pw_1 GLOBAL] + mova m5, m4 + pslld m4, m3 + psrld m4, 1 + movd m2, [r1] %if mmsize==8 - punpcklwd m4, m4 + punpcklwd m2, m2 %else - pshuflw m4, m4, 0 + pshuflw m2, m2, 0 %endif - punpcklwd m4, m6 + punpcklwd m2, m4 %assign x 0 %rep 32/mmsize mova m0, [r0+x] mova m1, m0 - punpcklwd m0, m7 - punpckhwd m1, m7 - pmaddwd m0, m4 - pmaddwd m1, m4 - psrad m0, m5 - psrad m1, m5 + punpcklwd m0, m5 + punpckhwd m1, m5 + pmaddwd m0, m2 + pmaddwd m1, m2 + psrad m0, m3 + psrad m1, m3 packssdw m0, m1 mova [r0+x], m0 %assign x x+mmsize @@ -399,10 +402,10 @@ DEQUANT_DC sse2 ;----------------------------------------------------------------------------- ; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) ;----------------------------------------------------------------------------- -%macro DENOISE_DCT 1 -cglobal x264_denoise_dct_%1, 4,5 +%macro DENOISE_DCT 2 +cglobal x264_denoise_dct_%1, 4,5,%2 movzx r4d, word [r0] ; backup DC coefficient - pxor m7, m7 + pxor m6, m6 .loop: sub r3, mmsize mova m2, [r0+r3*2+0*mmsize] @@ -419,10 +422,10 @@ cglobal x264_denoise_dct_%1, 4,5 mova [r0+r3*2+1*mmsize], m1 mova m2, m4 mova m3, m5 - punpcklwd m4, m7 - punpckhwd m2, m7 - punpcklwd m5, m7 - punpckhwd m3, m7 + punpcklwd m4, m6 + punpckhwd m2, m6 + punpcklwd m5, m6 + punpckhwd m3, m6 paddd m4, [r1+r3*4+0*mmsize] paddd m2, [r1+r3*4+1*mmsize] paddd m5, [r1+r3*4+2*mmsize] @@ -440,13 +443,13 @@ cglobal x264_denoise_dct_%1, 4,5 %define PSIGNW PSIGNW_MMX %ifndef ARCH_X86_64 INIT_MMX -DENOISE_DCT mmx +DENOISE_DCT mmx, 0 %endif INIT_XMM -DENOISE_DCT sse2 +DENOISE_DCT sse2, 7 %define PABSW PABSW_SSSE3 %define PSIGNW PSIGNW_SSSE3 -DENOISE_DCT ssse3 +DENOISE_DCT ssse3, 7 @@ -562,18 +565,18 @@ cglobal x264_decimate_score64_%1, 1,4 %else %define table x264_decimate_table8 %endif - mova m7, [pb_1 GLOBAL] - DECIMATE_MASK r1d, eax, r0, m7, %1, null + mova m5, [pb_1 GLOBAL] + DECIMATE_MASK r1d, eax, r0, m5, %1, null 
test eax, eax jne .ret9 - DECIMATE_MASK r2d, eax, r0+32, m7, %1, null + DECIMATE_MASK r2d, eax, r0+32, m5, %1, null shl r2d, 16 or r1d, r2d - DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null + DECIMATE_MASK r2d, r3d, r0+64, m5, %1, null shl r2, 32 or eax, r3d or r1, r2 - DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null + DECIMATE_MASK r2d, r3d, r0+96, m5, %1, null shl r2, 48 or r1, r2 xor r1, -1 @@ -703,7 +706,7 @@ cglobal x264_coeff_last4_%1, 1,1 RET %else cglobal x264_coeff_last4_%1, 0,3 - mov edx, r0m + mov edx, r0mp mov eax, [edx+4] xor ecx, ecx test eax, eax @@ -809,7 +812,9 @@ COEFF_LAST sse2_lzcnt %endmacro ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args -%ifdef ARCH_X86_64 +%ifdef WIN64 + DECLARE_REG_TMP 3,1,2,0,4,5,6 +%elifdef ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,6 %else DECLARE_REG_TMP 6,3,2,1,4,5,0 @@ -817,8 +822,8 @@ COEFF_LAST sse2_lzcnt %macro COEFF_LEVELRUN 2 cglobal x264_coeff_level_run%2_%1,0,7 - movifnidn t0d, r0m - movifnidn t1d, r1m + movifnidn t0, r0mp + movifnidn t1, r1mp pxor m2, m2 LAST_MASK t5d, t0-(%2&1)*2, t4d not t5d diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index f600dc1..6b850ff 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -113,7 +113,7 @@ SAD 4, 4 ;----------------------------------------------------------------------------- ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x16_%1, 4,4 +cglobal x264_pixel_sad_16x16_%1, 4,4,8 movdqu m0, [r2] movdqu m1, [r2+r3] lea r2, [r2+2*r3] @@ -261,8 +261,8 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4 ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score -%macro INTRA_SAD16 1 -cglobal x264_intra_sad_x3_16x16_%1,3,5 +%macro INTRA_SAD16 2 +cglobal x264_intra_sad_x3_16x16_%1,3,5,%2 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1-FDEC_STRIDE+0] @@ -337,11 +337,11 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5 INIT_MMX %define SPLATB SPLATB_MMX -INTRA_SAD16 mmxext +INTRA_SAD16 mmxext, 0 INIT_XMM -INTRA_SAD16 sse2 +INTRA_SAD16 sse2, 8 %define SPLATB SPLATB_SSSE3 -INTRA_SAD16 ssse3 +INTRA_SAD16 ssse3, 8 @@ -538,12 +538,17 @@ INTRA_SAD16 ssse3 %endmacro %macro SAD_X3_END 0 -%ifdef ARCH_X86_64 +%ifdef WIN64 + mov r0, r5mp + movd [r0+0], mm0 + movd [r0+4], mm1 + movd [r0+8], mm2 +%elifdef ARCH_X86_64 movd [r5+0], mm0 movd [r5+4], mm1 movd [r5+8], mm2 %else - mov r0, r5m + mov r0, r5mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 @@ -552,7 +557,7 @@ INTRA_SAD16 ssse3 %endmacro %macro SAD_X4_END 0 - mov r0, r6m + mov r0, r6mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 @@ -566,6 +571,13 @@ INTRA_SAD16 ssse3 ;----------------------------------------------------------------------------- %macro SAD_X 3 cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2 +%ifdef WIN64 + %if %1 == 3 + movsxd r4, r4d + %elif %1 == 4 + movsxd r5, r5d + %endif +%endif SAD_X%1_2x%2P 1 %rep %3/2-1 SAD_X%1_2x%2P 0 @@ -803,12 +815,17 @@ SAD_X 4, 4, 4 paddw xmm0, xmm4 paddw xmm1, xmm5 paddw xmm2, xmm6 -%ifdef ARCH_X86_64 +%ifdef WIN64 + mov r0, r5mp + movd [r0+0], xmm0 + movd [r0+4], xmm1 + movd [r0+8], xmm2 +%elifdef ARCH_X86_64 movd [r5+0], xmm0 movd [r5+4], xmm1 movd [r5+8], xmm2 %else - mov r0, r5m + mov r0, r5mp movd [r0+0], xmm0 movd [r0+4], xmm1 movd [r0+8], xmm2 @@ -817,7 +834,7 @@ SAD_X 4, 4, 4 %endmacro %macro SAD_X4_END_SSE2 0 - mov r0, r6m + mov r0, r6mp psllq xmm1, 32 psllq xmm3, 32 paddw xmm0, xmm1 @@ -910,7 +927,14 
@@ SAD_X 4, 4, 4 ; uint8_t *pix2, int i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X_SSE2 4 -cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1 +cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9 +%ifdef WIN64 + %if %1 == 3 + movsxd r4, r4d + %elif %1 == 4 + movsxd r5, r5d + %endif +%endif SAD_X%1_2x%2P_SSE2 1 %rep %3/2-1 SAD_X%1_2x%2P_SSE2 0 @@ -919,7 +943,14 @@ cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1 %endmacro %macro SAD_X_SSE2_MISALIGN 4 -cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1 +cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9 +%ifdef WIN64 + %if %1 == 3 + movsxd r4, r4d + %elif %1 == 4 + movsxd r5, r5d + %endif +%endif SAD_X%1_2x%2P_SSE2_MISALIGN 1 %rep %3/2-1 SAD_X%1_2x%2P_SSE2_MISALIGN 0 @@ -1021,7 +1052,7 @@ sad_w16_align%1_ssse3: %endmacro %macro SAD16_CACHELINE_FUNC 2 ; cpu, height -cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0 +cglobal x264_pixel_sad_16x%2_cache64_%1 mov eax, r2m and eax, 0x37 cmp eax, 0x30 @@ -1069,7 +1100,7 @@ cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0 %endmacro %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline -cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0 +cglobal x264_pixel_sad_16x%1_cache%2_mmxext SAD_CACHELINE_START_MMX2 16, %1, %1, %2 .loop: movq mm1, [r2] @@ -1095,7 +1126,7 @@ cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0 %endmacro %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline -cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0 +cglobal x264_pixel_sad_8x%1_cache%2_mmxext SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 .loop: movq mm1, [r2+8] @@ -1131,13 +1162,18 @@ cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0 %endmacro %macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver -cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0 +cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 jmp x264_pixel_sad_x3_%1x%2_%4 .split: %ifdef ARCH_X86_64 + PROLOGUE 6,7 +%ifdef WIN64 + movsxd r4, r4d + sub rsp, 8 +%endif push r3 push r2 mov r2, r1 @@ -1147,14 +1183,26 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0 mov r11, r5 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11], eax +%ifdef WIN64 + mov r2, [rsp] +%else pop r2 +%endif mov r0, r10 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11+4], eax +%ifdef WIN64 + mov r2, [rsp+8] +%else pop r2 +%endif mov r0, r10 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11+8], eax +%ifdef WIN64 + add rsp, 24 +%endif + RET %else push edi mov edi, [esp+28] @@ -1174,12 +1222,12 @@ cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0 mov [edi+8], eax add esp, 16 pop edi -%endif ret +%endif %endmacro %macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver -cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0 +cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 @@ -1187,7 +1235,11 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0 jmp x264_pixel_sad_x4_%1x%2_%4 .split: %ifdef ARCH_X86_64 - mov r11, r6m + PROLOGUE 6,7 + mov r11, r6mp +%ifdef WIN64 + movsxd r5, r5d +%endif push r4 push r3 push r2 @@ -1197,18 +1249,34 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0 mov r10, r0 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11], eax +%ifdef WIN64 + mov r2, [rsp] +%else pop r2 +%endif mov r0, r10 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11+4], eax +%ifdef WIN64 + mov r2, [rsp+8] +%else pop r2 +%endif mov r0, r10 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11+8], eax +%ifdef WIN64 + mov 
r2, [rsp+16] +%else pop r2 +%endif mov r0, r10 call x264_pixel_sad_%1x%2_cache%3_%5 mov [r11+12], eax +%ifdef WIN64 + add rsp, 24 +%endif + RET %else push edi mov edi, [esp+32] @@ -1232,8 +1300,8 @@ cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0 mov [edi+12], eax add esp, 16 pop edi -%endif ret +%endif %endmacro %macro SADX34_CACHELINE_FUNC 5 diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index b2aee3f..bd4d154 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -18,6 +18,12 @@ ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;***************************************************************************** +%ifdef ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win64 + %define WIN64 + %endif +%endif + ; FIXME: All of the 64bit asm functions that take a stride as an argument ; via register, assume that the high dword of that register is filled with 0. ; This is true in practice (since we never do any 64bit arithmetic on strides, @@ -50,7 +56,9 @@ ; Some distros prefer shared objects to be PIC, but nothing breaks if ; the code contains a few textrels, so we'll skip that complexity. -%ifndef ARCH_X86_64 +%ifdef WIN64 + %define PIC +%elifndef ARCH_X86_64 %undef PIC %endif %ifdef PIC @@ -67,7 +75,8 @@ ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. -; %3 = list of names to define to registers +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. @@ -85,12 +94,13 @@ ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons ; which are slow when a normal ret follows a branch. -%macro DECLARE_REG 6 +%macro DECLARE_REG 7 %define r%1q %2 %define r%1d %3 %define r%1w %4 %define r%1b %5 %define r%1m %6 + %define r%1mp %7 %define r%1 %2 %endmacro @@ -213,15 +223,97 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7 %assign n_arg_names %%i %endmacro -%ifdef ARCH_X86_64 ;======================================================== +%ifdef WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx, ecx, cx, cl, ecx, rcx +DECLARE_REG 1, rdx, edx, dx, dl, edx, rdx +DECLARE_REG 2, r8, r8d, r8w, r8b, r8d, r8 +DECLARE_REG 3, r9, r9d, r9w, r9b, r9d, r9 +DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40], qword [rsp + stack_offset + 40] +DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48], qword [rsp + stack_offset + 48] +DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56], qword [rsp + stack_offset + 56] +%define r7m [rsp + stack_offset + 64] +%define r8m [rsp + stack_offset + 72] -DECLARE_REG 0, rdi, edi, di, dil, edi -DECLARE_REG 1, rsi, esi, si, sil, esi -DECLARE_REG 2, rdx, edx, dx, dl, edx -DECLARE_REG 3, rcx, ecx, cx, cl, ecx -DECLARE_REG 4, r8, r8d, r8w, r8b, r8d -DECLARE_REG 5, r9, r9d, r9w, r9b, r9d -DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [rsp + stack_offset + 8 + %1*8] + %endif +%endmacro + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ ASSERT %2 >= %1 + %assign regs_used %2 + ASSERT regs_used <= 7 + %if %0 > 2 + %assign xmm_regs_used %3 + %else + %assign xmm_regs_used 0 + %endif + ASSERT xmm_regs_used <= 16 + %if regs_used > 4 + push r4 + push r5 + %assign stack_offset stack_offset+16 + %endif + %if xmm_regs_used > 6 + sub rsp, (xmm_regs_used-6)*16+16 + %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i + %endrep + %endif + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + DEFINE_ARGS %4 +%endmacro + +%macro RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro RESTORE_XMM 1 + RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%macro RET 0 + RESTORE_XMM_INTERNAL rsp + %if regs_used > 4 + pop r5 + pop r4 + %endif + ret +%endmacro + +%macro REP_RET 0 + %if regs_used > 4 || xmm_regs_used > 6 + RET + %else + rep ret + %endif +%endmacro + +%elifdef ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi, edi, di, dil, edi, rdi +DECLARE_REG 1, rsi, esi, si, sil, esi, rsi +DECLARE_REG 2, rdx, edx, dx, dl, edx, rdx +DECLARE_REG 3, rcx, ecx, cx, cl, ecx, rcx +DECLARE_REG 4, r8, r8d, r8w, r8b, r8d, r8 +DECLARE_REG 5, r9, r9d, r9w, r9b, r9d, r9 +DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8], qword [rsp + stack_offset + 8] %define r7m [rsp + stack_offset + 16] %define r8m [rsp + stack_offset + 24] @@ -231,12 +323,11 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] %endif %endmacro -%macro PROLOGUE 2-3+ ; #args, #regs, arg_names... +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... ASSERT %2 >= %1 ASSERT %2 <= 7 - %assign stack_offset 0 LOAD_IF_USED 6, %1 - DEFINE_ARGS %3 + DEFINE_ARGS %4 %endmacro %macro RET 0 @@ -249,13 +340,13 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] %else ; X86_32 ;============================================================== -DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] -DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] -DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] -DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] -DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] -DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] -DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] +DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4], dword [esp + stack_offset + 4] +DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8], dword [esp + stack_offset + 8] +DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12], dword [esp + stack_offset + 12] +DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16], dword [esp + stack_offset + 16] +DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20], dword [esp + stack_offset + 20] +DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24], dword [esp + stack_offset + 24] +DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28], dword [esp + stack_offset + 28] %define r7m [esp + stack_offset + 32] %define r8m [esp + stack_offset + 36] %define rsp esp @@ -279,9 +370,8 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endif %endmacro -%macro PROLOGUE 2-3+ ; #args, #regs, arg_names... 
+%macro PROLOGUE 2-4+ ; #args, #regs, arg_names... ASSERT %2 >= %1 - %assign stack_offset 0 %assign regs_used %2 ASSERT regs_used <= 7 PUSH_IF_USED 3 @@ -295,7 +385,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] LOAD_IF_USED 4, %1 LOAD_IF_USED 5, %1 LOAD_IF_USED 6, %1 - DEFINE_ARGS %3 + DEFINE_ARGS %4 %endmacro %macro RET 0 @@ -326,24 +416,19 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] ; Symbol prefix for C linkage %macro cglobal 1-2+ + %ifdef PREFIX + %xdefine %1.skip_prologue _%1.skip_prologue + %xdefine %1 _%1 + %endif %ifidn __OUTPUT_FORMAT__,elf - %ifdef PREFIX - global _%1:function hidden - %define %1 _%1 - %else - global %1:function hidden - %endif + global %1:function hidden %else - %ifdef PREFIX - global _%1 - %define %1 _%1 - %else - global %1 - %endif + global %1 %endif align function_align %1: RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 %if %0 > 1 PROLOGUE %2 %endif @@ -351,11 +436,9 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %macro cextern 1 %ifdef PREFIX - extern _%1 - %define %1 _%1 - %else - extern %1 + %xdefine %1 _%1 %endif + extern %1 %endmacro ; This is needed for ELF, otherwise the GNU linker assumes the stack is diff --git a/configure b/configure index 599b105..7f4daa1 100755 --- a/configure +++ b/configure @@ -253,6 +253,8 @@ case $host_cpu in ASFLAGS="-f macho64 -m amd64 -DPIC -DPREFIX" CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" + elif [ "$SYS" = MINGW ]; then + ASFLAGS="-f win64 -m amd64 -DPREFIX" else ASFLAGS="-f elf -m amd64" fi @@ -340,6 +342,10 @@ if test "$pthread" = "auto" ; then pthread="yes" libpthread="-lpthreadGC2 -lwsock32" CFLAGS="$CFLAGS -DPTW32_STATIC_LIB" + elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then + pthread="yes" + libpthread="-lpthreadGC2 -lws2_32" + CFLAGS="$CFLAGS -DPTW32_STATIC_LIB" fi ;; *) @@ -373,8 +379,15 @@ if [ "$avis_input" = "auto" ] ; then fi fi if [ "$avis_input" = "yes" ] ; then - echo "#define AVIS_INPUT" >> config.h - LDFLAGS="$LDFLAGS -lvfw32" + if cc_check "stdlib.h" -lvfw32 ; then + echo "#define AVIS_INPUT" >> config.h + LDFLAGS="$LDFLAGS -lvfw32" + elif cc_check "stdlib.h" -lavifil32 ; then + echo "#define AVIS_INPUT" >> config.h + LDFLAGS="$LDFLAGS -lavifil32" + else + avis_input="no"; + fi fi if [ "$pic" = "yes" ] ; then diff --git a/extras/getopt.c b/extras/getopt.c index a890fde..d2dbd30 100644 --- a/extras/getopt.c +++ b/extras/getopt.c @@ -988,6 +988,11 @@ getopt (argc, argv, optstring) int getopt_long (argc, argv, optstring, long_options, opt_index) + int argc; + char *const *argv; + const char *optstring; + const struct option *long_options; + int *opt_index; { return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0); } diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm index c180abc..daa1a87 100644 --- a/tools/checkasm-a.asm +++ b/tools/checkasm-a.asm @@ -22,25 +22,96 @@ SECTION_RODATA -error_message: db "failed to preserve register", 10, 0 +error_message: db "failed to preserve register", 0 + +%ifdef WIN64 +; just random numbers to reduce the chance of incidental match +ALIGN 16 +n4: dq 0xa77809bf11b239d1 +n5: dq 0x2ba9bf3d2f05b389 +x6: ddq 0x79445c159ce790641a1b2550a612b48c +x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd +x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943 +x9: ddq 0xd229e1f5b281303facbd382dcf5b8de2 +x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9 +x11: ddq 0x77d410d5c42c882d89b0c0765892729a +x12: 
ddq 0x24b3c1d2a024048bc45ea11a955d8dd5 +x13: ddq 0xdd7b8919edd427862e8ec680de14b47c +x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf +x15: ddq 0x6de8f4c914c334d5011ff554472a7a10 +%endif SECTION .text -cextern printf +cextern puts ; max number of args used by any x264 asm function. ; (max_args % 4) must equal 3 for stack alignment %define max_args 11 +%ifdef WIN64 + +;----------------------------------------------------------------------------- +; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) +;----------------------------------------------------------------------------- +cglobal x264_checkasm_call, 4,7,16 + sub rsp, max_args*8 + %assign stack_offset stack_offset+max_args*8 + mov r6, r0 + mov [rsp+stack_offset+16], r1 + mov r0, r2 + mov r1, r3 + movsxd r2, dword r4m ; FIXME truncates pointer + movsxd r3, dword r5m ; FIXME truncates pointer +%assign i 4 +%rep max_args-4 + mov r4, [rsp+stack_offset+8+(i+2)*8] + mov [rsp+i*8], r4 + %assign i i+1 +%endrep +%assign i 6 +%rep 16-6 + movdqa xmm %+ i, [x %+ i GLOBAL] + %assign i i+1 +%endrep + mov r4, [n4 GLOBAL] + mov r5, [n5 GLOBAL] + call r6 + xor r4, [n4 GLOBAL] + xor r5, [n5 GLOBAL] + or r4, r5 + pxor xmm5, xmm5 +%assign i 6 +%rep 16-6 + pxor xmm %+ i, [x %+ i GLOBAL] + por xmm5, xmm %+ i + %assign i i+1 +%endrep + packsswb xmm5, xmm5 + movq r5, xmm5 + or r4, r5 + jz .ok + mov r4, rax + lea r0, [error_message GLOBAL] + call puts + mov r1, [rsp+stack_offset+16] + mov dword [r1], 0 + mov rax, r4 +.ok: + add rsp, max_args*8 + %assign stack_offset stack_offset-max_args*8 + RET + +%elifndef ARCH_X86_64 + ; just random numbers to reduce the chance of incidental match %define n3 dword 0x6549315c %define n4 dword 0xe02f3e23 %define n5 dword 0xb78d0d1d %define n6 dword 0x33627ba7 -%ifndef ARCH_X86_64 ;----------------------------------------------------------------------------- -; long x264_checkasm_call( long (*func)(), int *ok, ... ) +; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- cglobal x264_checkasm_call, 1,7 mov r3, n3 @@ -63,14 +134,14 @@ cglobal x264_checkasm_call, 1,7 mov r3, eax lea r1, [error_message GLOBAL] push r1 - xor eax, eax - call printf + call puts add esp, 4 mov r1, r1m mov dword [r1], 0 mov eax, r3 .ok: RET + %endif ; ARCH_X86_64 ;----------------------------------------------------------------------------- diff --git a/tools/checkasm.c b/tools/checkasm.c index aeaf5fb..6f09b4b 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -170,11 +170,12 @@ int x264_stack_pagealign( int (*func)(), int align ); #define call_c1(func,...) func(__VA_ARGS__) -#ifdef ARCH_X86 +//#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(_WIN64) /* detect when callee-saved regs aren't saved. * needs an explicit asm check because it only sometimes crashes in normal use. */ -long x264_checkasm_call( long (*func)(), int *ok, ... ); -#define call_a1(func,...) x264_checkasm_call((long(*)())func, &ok, __VA_ARGS__) +intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); +#define call_a1(func,...) x264_checkasm_call((intptr_t(*)())func, &ok, __VA_ARGS__) #else #define call_a1 call_c1 #endif
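
Note on the checkasm changes above: on Win64 the xmm6-xmm15 registers (plus rbx, rbp, rsi, rdi, r12-r15) are callee-saved, so x264_checkasm_call fills them with known constants before jumping to the function under test and verifies them afterwards, clearing *ok and printing the error message if anything was clobbered. The following is only an illustrative sketch of how the call_a1 path in tools/checkasm.c exercises that wrapper; the test_sad helper, the buffers, and the stride values are assumptions made up for the example, while the prototypes of x264_checkasm_call and x264_pixel_sad_16x16_sse2 are taken from this patch.

    /* Minimal usage sketch (not part of the patch): driving an asm function
     * through the register-preservation wrapper added for Win64. */
    #include <stdint.h>

    /* Prototypes as declared by this patch / the existing headers. */
    intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
    int x264_pixel_sad_16x16_sse2( uint8_t *pix1, int i_stride1,
                                   uint8_t *pix2, int i_stride2 );

    /* Hypothetical helper: returns the SAD, or -1 if the asm routine
     * clobbered a callee-saved GPR or xmm register. */
    static int test_sad( uint8_t *fenc, uint8_t *fref )
    {
        int ok = 1;
        /* The wrapper seeds rdi/rsi and xmm6-xmm15 with the constants from
         * checkasm-a.asm, forwards the arguments, calls the function, then
         * checks that every seeded register still holds its value. */
        intptr_t sad = x264_checkasm_call( (intptr_t(*)())x264_pixel_sad_16x16_sse2,
                                           &ok, fenc, 16, fref, 16 );
        return ok ? (int)sad : -1;
    }

In the real harness this indirection is hidden behind the call_a1 macro shown at the end of the diff, which expands to the plain C call on architectures where no such check is needed.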