diff -uNrp src/Makefile src.new/Makefile --- src/Makefile 2013-10-31 15:40:28 +0300 +++ src.new/Makefile 2013-10-31 15:46:38 +0300 @@ -221,7 +221,7 @@ OPT0 = --crf 30 -b1 -m1 -r1 --me dia --n OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50 OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500 OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree --aq-mode 2 --tff -OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4 --aq-mode 3 +OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4 --fgo 8 --aq-mode 3 OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2 OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac diff -uNrp src/common/arm/pixel.h src.new/common/arm/pixel.h --- src/common/arm/pixel.h 2013-01-10 12:18:26 +0300 +++ src.new/common/arm/pixel.h 2013-10-31 15:47:53 +0300 @@ -50,7 +50,7 @@ DECL_X1( sad_aligned, neon ) DECL_X1( sad_aligned, neon_dual ) DECL_X4( sad, neon ) DECL_X1( satd, neon ) -DECL_X1( ssd, neon ) +DECL_PIXELS( int, ssd, neon, ( uint8_t *, intptr_t, uint8_t *, intptr_t, intptr_t ) ) int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); diff -uNrp src/common/common.c src.new/common/common.c --- src/common/common.c 2013-10-31 15:40:29 +0300 +++ src.new/common/common.c 2013-10-31 15:48:43 +0300 @@ -140,6 +140,7 @@ void x264_param_default( x264_param_t *p param->analyse.f_psy_rd = 1.0; param->analyse.b_psy = 1; param->analyse.f_psy_trellis = 0; + param->analyse.i_fgo = 0; param->analyse.i_me_range = 16; param->analyse.i_subpel_refine = 7; param->analyse.b_mixed_references = 1; @@ -436,6 +437,7 @@ void x264_param_apply_fastfirstpass( x26 param->analyse.i_subpel_refine = X264_MIN( 2, 
param->analyse.i_subpel_refine ); param->analyse.i_trellis = 0; param->analyse.b_fast_pskip = 1; + param->analyse.i_fgo = 0; } } @@ -1002,6 +1004,8 @@ int x264_param_parse( x264_param_t *p, c p->rc.i_aq_mode = atoi(value); OPT("aq-strength") p->rc.f_aq_strength = atof(value); + OPT("fgo") + p->analyse.i_fgo = atoi(value); OPT("fade-compensate") p->rc.f_fade_compensate = atof(value); OPT("pass") @@ -1412,6 +1416,7 @@ char *x264_param2string( x264_param_t *p s += sprintf( s, " stitchable=%d", p->b_stitchable ); s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra ); + s += sprintf( s, " fgo=%d", p->analyse.i_fgo ); s += sprintf( s, " bframes=%d", p->i_bframe ); if( p->i_bframe ) diff -uNrp src/common/pixel.c src.new/common/pixel.c --- src/common/pixel.c 2013-08-26 14:10:46 +0300 +++ src.new/common/pixel.c 2013-10-31 16:25:19 +0300 @@ -77,7 +77,7 @@ PIXEL_SAD_C( x264_pixel_sad_4x4, 4, ****************************************************************************/ #define PIXEL_SSD_C( name, lx, ly ) \ static int name( pixel *pix1, intptr_t i_stride_pix1, \ - pixel *pix2, intptr_t i_stride_pix2 ) \ + pixel *pix2, intptr_t i_stride_pix2, intptr_t weight ) \ { \ int i_sum = 0; \ for( int y = 0; y < ly; y++ ) \ @@ -102,6 +102,81 @@ PIXEL_SSD_C( x264_pixel_ssd_4x16, 4, 1 PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) +#define PIXEL_NOISE_C( lx, ly ) \ +static int x264_pixel_noise_##lx##x##ly( pixel *pix, intptr_t i_stride ) \ +{\ + int score = 0;\ + for( int y = 0; y < ly; y++ ){\ + if( y+1 < ly ){\ + for( int x = 0; x < lx-1; x++ )\ + score += abs( pix[x] - pix[x+i_stride]\ + - pix[x+1] + pix[x+1+i_stride]);\ + }\ + pix += i_stride;\ + }\ + return score;\ +} + +PIXEL_NOISE_C( 16, 16 ) +PIXEL_NOISE_C( 16, 8 ) +PIXEL_NOISE_C( 8, 16 ) +PIXEL_NOISE_C( 8, 8 ) +PIXEL_NOISE_C( 8, 4 ) +PIXEL_NOISE_C( 4, 16 ) +PIXEL_NOISE_C( 4, 8 ) +PIXEL_NOISE_C( 4, 4 ) + +#define PIXEL_NSSD( nssdname, lx, ly, ssdname, noisename )\ +static int 
x264_pixel_nssd_##lx##x##ly##nssdname( pixel *pix1, intptr_t i_stride_pix1, \ + pixel *pix2, intptr_t i_stride_pix2, intptr_t weight ) \ +{\ + int ssd = x264_pixel_ssd_##lx##x##ly##ssdname( pix1, i_stride_pix1, \ + pix2, i_stride_pix2, weight );\ + int noise1 = x264_pixel_noise_##lx##x##ly##noisename( pix1, i_stride_pix1 );\ + int noise2 = x264_pixel_noise_##lx##x##ly##noisename( pix2, i_stride_pix2 );\ + return ssd + abs(noise1 - noise2) * weight;\ +} +#define PIXEL_NSSD5( nssdname, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 16, 16, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 16, 8, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 8, 16, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 8, 8, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 8, 4, ssdname, noisename ) +#define PIXEL_NSSD2( nssdname, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 4, 8, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 4, 4, ssdname, noisename ) +#define PIXEL_NSSD3( nssdname, ssdname, noisename )\ + PIXEL_NSSD( nssdname, 4, 16, ssdname, noisename )\ + PIXEL_NSSD2( nssdname, ssdname, noisename ) +#define PIXEL_NSSD7( nssdname, ssdname, noisename )\ + PIXEL_NSSD5( nssdname, ssdname, noisename )\ + PIXEL_NSSD2( nssdname, ssdname, noisename ) +#define PIXEL_NSSD8( nssdname, ssdname, noisename )\ + PIXEL_NSSD5( nssdname, ssdname, noisename )\ + PIXEL_NSSD3( nssdname, ssdname, noisename ) + +PIXEL_NSSD8( , , ) +#if HIGH_BIT_DEPTH +#if HAVE_MMX +PIXEL_NSSD8( _mmx2, _mmx2, ) +PIXEL_NSSD5( _sse2, _sse2, ) +#endif +#else +#if HAVE_MMX +PIXEL_NSSD8( _mmx, _mmx, ) +PIXEL_NSSD5( _mmx2, _mmx, _mmx2 ) +PIXEL_NSSD5( _sse2slow, _sse2slow, _mmx2 ) +PIXEL_NSSD5( _sse2, _sse2, _mmx2 ) +PIXEL_NSSD5( _ssse3, _ssse3, _mmx2 ) +PIXEL_NSSD3( _ssse3, _ssse3, ) +PIXEL_NSSD5( _avx, _avx, _mmx2 ) +PIXEL_NSSD5( _xop, _xop, _mmx2 ) +#endif +#if HAVE_ARMV6 +PIXEL_NSSD7( _neon, _neon, ) +#endif +#endif // HIGH_BIT_DEPTH + uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, 
intptr_t i_pix2, int i_width, int i_height ) { @@ -110,7 +185,7 @@ uint64_t x264_pixel_ssd_wxh( x264_pixel_ int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15); #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \ - pix2 + y*i_pix2 + x, i_pix2 ); + pix2 + y*i_pix2 + x, i_pix2, 0 ); for( y = 0; y < i_height-15; y += 16 ) { int x = 0; @@ -809,6 +884,7 @@ void x264_pixel_init( int cpu, x264_pixe INIT7( sad_x3, ); INIT7( sad_x4, ); INIT8( ssd, ); + INIT8( nssd, ); INIT8( satd, ); INIT7( satd_x3, ); INIT7( satd_x4, ); @@ -853,6 +929,7 @@ void x264_pixel_init( int cpu, x264_pixe INIT7( satd_x4, _mmx2 ); INIT4( hadamard_ac, _mmx2 ); INIT8( ssd, _mmx2 ); + INIT8( nssd, _mmx2 ); INIT_ADS( _mmx2 ); pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2; @@ -875,6 +952,7 @@ void x264_pixel_init( int cpu, x264_pixe { INIT4_NAME( sad_aligned, sad, _sse2_aligned ); INIT5( ssd, _sse2 ); + INIT5( nssd, _sse2 ); INIT6( satd, _sse2 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; @@ -1008,6 +1086,7 @@ void x264_pixel_init( int cpu, x264_pixe if( cpu&X264_CPU_MMX ) { INIT8( ssd, _mmx ); + INIT8( nssd, _mmx ); } if( cpu&X264_CPU_MMX2 ) @@ -1019,6 +1098,7 @@ void x264_pixel_init( int cpu, x264_pixe INIT8( satd, _mmx2 ); INIT7( satd_x3, _mmx2 ); INIT7( satd_x4, _mmx2 ); + INIT5( nssd, _mmx2 ); INIT4( hadamard_ac, _mmx2 ); INIT_ADS( _mmx2 ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; @@ -1072,6 +1152,7 @@ void x264_pixel_init( int cpu, x264_pixe if( cpu&X264_CPU_SSE2 ) { INIT5( ssd, _sse2slow ); + INIT5( nssd, _sse2slow ); INIT2_NAME( sad_aligned, sad, _sse2_aligned ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; @@ -1107,7 +1188,8 @@ void x264_pixel_init( int cpu, x264_pixe pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_sse2; if( cpu&X264_CPU_CACHELINE_64 ) { - INIT2( ssd, _sse2); /* faster for width 16 on p4 */ + INIT2( ssd, _sse2 ); /* faster for width 16 on p4 */ + 
INIT2( nssd, _sse2 ); #if ARCH_X86 INIT2( sad, _cache64_sse2 ); INIT2( sad_x3, _cache64_sse2 ); @@ -1169,6 +1251,7 @@ void x264_pixel_init( int cpu, x264_pixe else { INIT8( ssd, _ssse3 ); + INIT8( nssd, _ssse3 ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; INIT8( satd, _ssse3 ); @@ -1201,6 +1284,7 @@ void x264_pixel_init( int cpu, x264_pixe if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) ) { INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */ + INIT5( nssd, _sse2 ); } } @@ -1247,6 +1331,7 @@ void x264_pixel_init( int cpu, x264_pixe #endif } INIT5( ssd, _avx ); + INIT5( nssd, _avx ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx; @@ -1272,6 +1357,7 @@ void x264_pixel_init( int cpu, x264_pixe pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop; } INIT5( ssd, _xop ); + INIT5( nssd, _xop ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop; @@ -1322,6 +1408,7 @@ void x264_pixel_init( int cpu, x264_pixe INIT7( sad_x3, _neon ); INIT7( sad_x4, _neon ); INIT7( ssd, _neon ); + INIT7( nssd, _neon ); INIT7( satd, _neon ); INIT7( satd_x3, _neon ); INIT7( satd_x4, _neon ); diff -uNrp src/common/pixel.h src.new/common/pixel.h --- src/common/pixel.h 2013-02-27 14:52:00 +0300 +++ src.new/common/pixel.h 2013-10-31 16:26:48 +0300 @@ -31,6 +31,7 @@ // SSD assumes all args aligned // other cmp functions assume first arg aligned typedef int (*x264_pixel_cmp_t) ( pixel *, intptr_t, pixel *, intptr_t ); +typedef int (*x264_pixel_cmp_weight_t) ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ); typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, intptr_t, int[3] ); typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int[4] 
); @@ -78,10 +79,12 @@ static const uint8_t x264_luma2chroma_pi typedef struct { x264_pixel_cmp_t sad[8]; - x264_pixel_cmp_t ssd[8]; + x264_pixel_cmp_weight_t ssd[8]; + x264_pixel_cmp_weight_t nssd[8]; x264_pixel_cmp_t satd[8]; x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; + x264_pixel_cmp_weight_t rdcmp[8]; /* either ssd or nsse for mode decision */ x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */ x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */ x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */ diff -uNrp src/common/x86/pixel-a.asm src.new/common/x86/pixel-a.asm --- src/common/x86/pixel-a.asm 2013-05-22 12:42:04 +0300 +++ src.new/common/x86/pixel-a.asm 2013-10-31 16:27:30 +0300 @@ -5205,3 +5205,88 @@ ads_mvs_ssse3: jl .loop movifnidn eax, r0d RET + +;============================================================================= +; NSSD +;============================================================================= + +%macro NOISE_CORE_LOAD_FIRST 3 + mova %1, [r0+%3] + movu %2, [r0+%3+1] +%endmacro + +%macro NOISE_CORE_LOAD_LAST 3 + mova %1, [r0+%3] + mova %2, %1 + psllq %1, 8 + psrlq %2, 8 + psrlq %1, 8 +%endmacro + +%macro NOISE_CORE_START 5 + NOISE_CORE_LOAD %1, %2, %5 + mova %3, %1 + mova %4, %2 + punpcklbw %1, m7 + punpcklbw %2, m7 + punpckhbw %3, m7 + punpckhbw %4, m7 + psubw %1, %2 + psubw %3, %4 +%endmacro + +%macro NOISE_CORE 7 + NOISE_CORE_START %1, %2, %3, %4, %7 + psubw %5, %1 + psubw %6, %3 + ABSW2 %5, %6, %5, %6, %4, %2 + paddw %6, %5 + paddw m6, %6 +%endmacro + +;arguments: src, stride +;macro arguments: width, height +%macro NOISE 2 +%if %1 == 16 +cglobal pixel_noise_%1x%2, 2,3 + mov r2, r0 +%else +cglobal pixel_noise_%1x%2, 2,2 +pixel_noise_%1x%2 %+ .skip_prologue: +%endif + pxor m7, m7 + pxor m6, m6 + NOISE_CORE_START m0, m1, m2, m3, 0 + NOISE_CORE m4, m1, m5, m3, m0, m2, r1 + lea r0, [r0+r1*2] +%rep (%2 - 2) / 2 + NOISE_CORE m0, m1, m2, m3, m4, m5, 0 + 
NOISE_CORE m4, m1, m5, m3, m0, m2, r1 + lea r0, [r0+r1*2] +%endrep + mova m0, m6 + punpcklwd m0, m7 + punpckhwd m6, m7 + paddd m6, m0 + mova m0, m6 + psrlq m6, 32 + paddd m0, m6 +%if %1 == 16 + lea r0, [r2+8] + movd r2, m0 + call pixel_noise_8x%2 %+ .skip_prologue + add rax, r2 +%else + movd rax, m0 +%endif + RET +%endmacro + +INIT_MMX mmx2 +%define NOISE_CORE_LOAD NOISE_CORE_LOAD_LAST +NOISE 8, 16 +NOISE 8, 8 +NOISE 8, 4 +%define NOISE_CORE_LOAD NOISE_CORE_LOAD_FIRST +NOISE 16, 16 +NOISE 16, 8 diff -uNrp src/common/x86/pixel.h src.new/common/x86/pixel.h --- src/common/x86/pixel.h 2013-08-26 14:10:46 +0300 +++ src.new/common/x86/pixel.h 2013-10-31 16:29:52 +0300 @@ -58,14 +58,15 @@ DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) DECL_X4( sad, avx ) DECL_X4( sad, avx2 ) -DECL_X1( ssd, mmx ) -DECL_X1( ssd, mmx2 ) -DECL_X1( ssd, sse2slow ) -DECL_X1( ssd, sse2 ) -DECL_X1( ssd, ssse3 ) -DECL_X1( ssd, avx ) -DECL_X1( ssd, xop ) -DECL_X1( ssd, avx2 ) +DECL_PIXELS( int, ssd, mmx, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, mmx2, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, sse2slow, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, sse2, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, ssse3, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, avx, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, xop, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, ssd, avx2, ( pixel *, intptr_t, pixel *, intptr_t, intptr_t ) ) +DECL_PIXELS( int, noise, mmx2, ( pixel *, intptr_t ) ) DECL_X1( satd, mmx2 ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) diff -uNrp src/encoder/analyse.c src.new/encoder/analyse.c --- src/encoder/analyse.c 2013-10-31 14:19:54 +0300 +++ src.new/encoder/analyse.c 2013-10-31 16:32:50 +0300 @@ -1125,7 +1125,7 @@ static void x264_intra_rd( x264_t *h, x2 else a->i_satd_i4x4 = 
COST_MAX; - if( a->i_satd_i8x8 < i_satd_thresh ) + if( ( a->i_satd_i8x8 < i_satd_thresh || h->param.analyse.i_fgo ) && a->i_satd_i8x8 < COST_MAX ) { h->mb.i_type = I_8x8; x264_analyse_update_cache( h, a ); @@ -2752,7 +2752,7 @@ static void x264_mb_analyse_b_rd( x264_t //FIXME not all the update_cache calls are needed h->mb.i_partition = D_16x16; /* L0 */ - if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX ) + if( (a->l0.me16x16.cost < thresh || h->param.analyse.i_fgo) && a->l0.i_rd16x16 == COST_MAX ) { h->mb.i_type = B_L0_L0; x264_analyse_update_cache( h, a ); @@ -2760,7 +2760,7 @@ static void x264_mb_analyse_b_rd( x264_t } /* L1 */ - if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX ) + if( (a->l1.me16x16.cost < thresh || h->param.analyse.i_fgo) && a->l1.i_rd16x16 == COST_MAX ) { h->mb.i_type = B_L1_L1; x264_analyse_update_cache( h, a ); @@ -3319,7 +3319,8 @@ skip_analysis: if( analysis.i_mbrd ) { - x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) ); + x264_mb_analyse_p_rd( h, &analysis, + h->param.analyse.i_fgo ? 
i_satd_inter : X264_MIN(i_satd_inter, i_satd_intra) ); i_type = P_L0; i_partition = D_16x16; i_cost = analysis.l0.i_rd16x16; diff -uNrp src/encoder/encoder.c src.new/encoder/encoder.c --- src/encoder/encoder.c 2013-10-31 15:40:29 +0300 +++ src.new/encoder/encoder.c 2013-10-31 16:34:07 +0300 @@ -795,6 +795,7 @@ static int x264_validate_parameters( x26 h->param.analyse.i_trellis = 0; h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; + h->param.analyse.i_fgo = 0; h->param.analyse.b_psy = 0; h->param.i_bframe = 0; /* 8x8dct is not useful without RD in CAVLC lossless */ @@ -1167,6 +1168,25 @@ static int x264_validate_parameters( x26 if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy ) h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE; + /* Clamp the strength to [0,50] (arbitrary cap) before testing it, so a negative value disables fgo instead of reaching pow() as 0. */ + h->param.analyse.i_fgo = x264_clip3( h->param.analyse.i_fgo, 0, 50 ); + if( h->param.analyse.i_fgo ) + { + if( h->param.analyse.i_subpel_refine < 7 ) + { + x264_log( h, X264_LOG_WARNING, "fgo requires subme >= 7\n" ); + h->param.analyse.i_fgo = 0; + } + else + { + /* P-skip's threshold isn't necessarily accurate when using NSSD/FGO */ + h->param.analyse.b_fast_pskip = 0; + /* B-frame QPs need to be lower to retain grain */ + /* Arbitrary formula to scale pbratio based on fgo strength; i_fgo is in [1,50] here, so the divisor is never 0. */ + h->param.rc.f_pb_factor = 1 + (h->param.rc.f_pb_factor - 1) / pow(h->param.analyse.i_fgo, 0.3); + } + } + if( h->i_thread_frames > 1 ) { int r = h->param.analyse.i_mv_range_thread; @@ -1276,6 +1296,7 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) ); + memcpy( h->pixf.rdcmp, h->param.analyse.i_fgo ? 
h->pixf.nssd : h->pixf.ssd, sizeof(h->pixf.rdcmp) ); } static void chroma_dsp_init( x264_t *h ) @@ -1722,6 +1743,7 @@ static int x264_encoder_try_reconfig( x2 COPY( analyse.b_mixed_references ); COPY( analyse.f_psy_rd ); COPY( analyse.f_psy_trellis ); + COPY( analyse.i_fgo ); COPY( crop_rect ); // can only twiddle these if they were enabled to begin with: if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA ) diff -uNrp src/encoder/macroblock.c src.new/encoder/macroblock.c --- src/encoder/macroblock.c 2013-04-26 14:54:10 +0300 +++ src.new/encoder/macroblock.c 2013-10-31 16:34:30 +0300 @@ -1041,7 +1041,7 @@ static ALWAYS_INLINE int x264_macroblock /* there is almost never a termination during chroma, but we can't avoid the check entirely */ /* so instead we check SSD and skip the actual check if the score is low enough. */ - ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); + ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE, 0 ); if( ssd < thresh ) continue; diff -uNrp src/encoder/rdo.c src.new/encoder/rdo.c --- src/encoder/rdo.c 2013-04-26 14:54:10 +0300 +++ src.new/encoder/rdo.c 2013-10-31 16:35:11 +0300 @@ -141,7 +141,7 @@ static inline int ssd_plane( x264_t *h, } satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8; } - return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd; + return h->pixf.rdcmp[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE, h->param.analyse.i_fgo) + satd; } static inline int ssd_mb( x264_t *h ) diff -uNrp src/tools/checkasm.c src.new/tools/checkasm.c --- src/tools/checkasm.c 2013-08-26 14:10:46 +0300 +++ src.new/tools/checkasm.c 2013-10-31 16:36:04 +0300 @@ -340,9 +340,33 @@ static int check_pixel( int cpu_ref, int } \ report( "pixel " #name " :" ); +#define TEST_PIXEL_WEIGHT( name, align ) \ + ok = 1, used_asm = 0; \ + for( int i = 0; i < 8; i++ ) \ + { \ + int res_c, res_asm; \ + if( 
pixel_asm.name[i] != pixel_ref.name[i] ) \ + { \ + for( int j = 0; j < 64; j++ ) \ + { \ + used_asm = 1; \ + res_c = call_c( pixel_c.name[i], pbuf1, 32, pbuf2+j*!align, 16, j ); \ + res_asm = call_a( pixel_asm.name[i], pbuf1, 32, pbuf2+j*!align, 16, j ); \ + if( res_c != res_asm ) \ + { \ + ok = 0; \ + fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ + break; \ + } \ + } \ + } \ + } \ + report( "pixel " #name " :" ); + TEST_PIXEL( sad, 0 ); TEST_PIXEL( sad_aligned, 1 ); - TEST_PIXEL( ssd, 1 ); + TEST_PIXEL_WEIGHT( ssd, 1 ); + TEST_PIXEL_WEIGHT( nssd, 1 ); TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 1 ); diff -uNrp src/x264.c src.new/x264.c --- src/x264.c 2013-10-31 15:40:30 +0300 +++ src.new/x264.c 2013-10-31 16:37:16 +0300 @@ -863,6 +863,9 @@ static void help( x264_param_t *defaults defaults->analyse.f_psy_rd, defaults->analyse.f_psy_trellis ); H2( " --no-psy Disable all visual optimizations that worsen\n" " both PSNR and SSIM.\n" ); + H1( " --fgo Activates Film Grain Optimization. 
(requires subme>=7) [%d]\n", defaults->analyse.i_fgo ); + H2( " - 5: weak FGO\n" + " - 15: strong FGO\n" ); H2( " --no-mixed-refs Don't decide references on a per partition basis\n" ); H2( " --no-chroma-me Ignore chroma in motion estimation\n" ); H1( " --no-8x8dct Disable adaptive spatial transform size\n" ); @@ -1126,6 +1129,7 @@ static struct option long_options[] = { "no-dct-decimate", no_argument, NULL, 0 }, { "aq-strength", required_argument, NULL, 0 }, { "aq-mode", required_argument, NULL, 0 }, + { "fgo", required_argument, NULL, 0 }, { "fade-compensate", required_argument, NULL, 0 }, { "deadzone-inter", required_argument, NULL, 0 }, { "deadzone-intra", required_argument, NULL, 0 }, diff -uNrp src/x264.h src.new/x264.h --- src/x264.h 2013-10-31 15:40:29 +0300 +++ src.new/x264.h 2013-10-31 16:37:24 +0300 @@ -379,6 +379,7 @@ typedef struct x264_param_t int b_fast_pskip; /* early SKIP detection on P-frames */ int b_dct_decimate; /* transform coefficient thresholding on P-frames */ int i_noise_reduction; /* adaptive pseudo-deadzone */ + int i_fgo; /* psy film grain optimization */ float f_psy_rd; /* Psy RD strength */ float f_psy_trellis; /* Psy trellis strength */ int b_psy; /* Toggle all psy optimizations */