diff --git a/common/common.c b/common/common.c index 6669cae..4652996 100644 --- a/common/common.c +++ b/common/common.c @@ -267,6 +267,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) else p->i_threads = atoi(value); } + OPT("thread-queue") + { + if( !strcmp(value, "auto") ) + p->i_thread_queue = 0; + else + p->i_thread_queue = atoi(value); + } OPT2("deterministic", "n-deterministic") p->b_deterministic = atobool(value); OPT2("level", "level-idc") @@ -847,6 +854,7 @@ char *x264_param2string( x264_param_t *p, int b_res ) s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] ); s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset ); s += sprintf( s, " threads=%d", p->i_threads ); + s += sprintf( s, " thread_queue=%d", p->i_thread_queue ); s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction ); s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate ); s += sprintf( s, " mbaff=%d", p->b_interlaced ); diff --git a/common/common.h b/common/common.h index 1668a63..7c04230 100644 --- a/common/common.h +++ b/common/common.h @@ -253,10 +253,17 @@ struct x264_t /* encoder parameters */ x264_param_t param; - x264_t *thread[X264_THREAD_MAX]; - x264_pthread_t thread_handle; - int b_thread_active; - int i_thread_phase; /* which thread to use for the next frame */ + x264_t *thread[X264_THREAD_MAX]; /* contexts for each frame in progress */ + x264_t **thread_queue; /* frames that have been prepared but not yet claimed by a worker thread */ + x264_pthread_cond_t thread_queue_cv; + x264_pthread_mutex_t thread_queue_mutex; + x264_pthread_t *thread_handle; + int thread_active; + x264_pthread_cond_t thread_active_cv; + x264_pthread_mutex_t thread_active_mutex; + int thread_exit; + int b_thread_active; + int i_thread_phase; /* which thread to use for the next frame */ /* bitstream output */ struct diff --git a/encoder/encoder.c b/encoder/encoder.c index e462a1f..5cabc18 100644 --- 
a/encoder/encoder.c +++ b/encoder/encoder.c @@ -46,6 +46,34 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_out ); +/* threading */ + +static int x264_slices_write_thread( x264_t *h ); + +#ifdef HAVE_PTHREAD +void x264_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val ) +{ + x264_pthread_mutex_lock( mutex ); + *var = val; + x264_pthread_cond_broadcast( cv ); + x264_pthread_mutex_unlock( mutex ); +} + +void x264_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val ) +{ + x264_pthread_mutex_lock( mutex ); + while( *var != val ) + x264_pthread_cond_wait( cv, mutex ); + x264_pthread_mutex_unlock( mutex ); +} + +#else +void x264_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val ) +{} +void x264_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val ) +{} +#endif + /**************************************************************************** * ******************************* x264 libs ********************************** @@ -371,6 +399,14 @@ static int x264_validate_parameters( x264_t *h ) h->param.b_pre_scenecut = 1; #endif } + if( h->param.i_threads > 1 ) + { + if( h->param.i_thread_queue == 0 ) + h->param.i_thread_queue = h->param.i_threads; + h->param.i_thread_queue = x264_clip3( h->param.i_thread_queue, h->param.i_threads, X264_THREAD_MAX ); + } + else + h->param.i_thread_queue = 1; if( h->param.b_interlaced ) { @@ -701,9 +737,9 @@ x264_t *x264_encoder_open ( x264_param_t *param ) /* Init frames. 
*/ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) - h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_threads - 1; + h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4 + h->param.i_thread_queue - 1; else - h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1; + h->frames.i_delay = h->param.i_bframe + h->param.i_thread_queue - 1; h->frames.i_max_ref0 = h->param.i_frame_reference; h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames; h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering; @@ -770,16 +806,31 @@ x264_t *x264_encoder_open ( x264_param_t *param ) h->thread[0] = h; h->i_thread_num = 0; - for( i = 1; i < h->param.i_threads; i++ ) + for( i = 1; i < h->param.i_thread_queue; i++ ) h->thread[i] = x264_malloc( sizeof(x264_t) ); - for( i = 0; i < h->param.i_threads; i++ ) + if( h->param.i_threads > 1 ) + { + h->thread_handle = x264_malloc( h->param.i_threads * sizeof(x264_pthread_t) ); + h->thread_queue = x264_malloc( (h->param.i_thread_queue + 1) * sizeof(x264_t*) ); + memset( h->thread_queue, 0, (h->param.i_thread_queue + 1) * sizeof(x264_t*) ); + x264_pthread_cond_init( &h->thread_queue_cv, NULL ); + x264_pthread_mutex_init( &h->thread_queue_mutex, NULL ); + } + + for( i = 0; i < h->param.i_thread_queue; i++ ) { + x264_t *t = h->thread[i]; if( i > 0 ) - *h->thread[i] = *h; - h->thread[i]->fdec = x264_frame_pop_unused( h ); - h->thread[i]->out.p_bitstream = x264_malloc( h->out.i_bitstream ); - if( x264_macroblock_cache_init( h->thread[i] ) < 0 ) + *t = *h; + t->fdec = x264_frame_pop_unused( h ); + t->out.p_bitstream = x264_malloc( h->out.i_bitstream ); + if( h->param.i_threads > 1 ) + { + x264_pthread_cond_init( &t->thread_active_cv, NULL ); + x264_pthread_mutex_init( &t->thread_active_mutex, NULL ); + } + if( x264_macroblock_cache_init( t ) < 0 ) return NULL; } @@ -806,6 +857,12 @@ x264_t *x264_encoder_open ( x264_param_t *param ) h->sps->i_profile_idc == PROFILE_HIGH ? 
"High" : "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 ); + if( h->param.i_threads > 1 ) + { + for( i = 0; i < h->param.i_threads; i++ ) + x264_pthread_create( &h->thread_handle[i], NULL, (void*)x264_slices_write_thread, h ); + } + return h; } @@ -1343,6 +1400,41 @@ static int x264_slices_write( x264_t *h ) return 0; } +static int x264_slices_write_thread( x264_t *h ) +{ +#ifdef HAVE_MMX + /* Misalign mask has to be set separately for each thread. */ + if( h->param.cpu&X264_CPU_SSE_MISALIGN ) + x264_cpu_mask_misalign_sse(); +#endif + + for(;;) + { + int i_frame_size; + x264_t *t = NULL; + + // get one frame from the queue + x264_pthread_mutex_lock( &h->thread_queue_mutex ); + while( !h->thread_queue[0] && !h->thread_exit ) + x264_pthread_cond_wait( &h->thread_queue_cv, &h->thread_queue_mutex ); + if( h->thread_queue[0] ) + t = (void*)x264_frame_shift( (void*)h->thread_queue ); + x264_pthread_mutex_unlock( &h->thread_queue_mutex ); + if( h->thread_exit ) + return 0; + if( !t ) + continue; + + x264_stack_align( x264_slice_write, t ); + i_frame_size = t->out.nal[t->out.i_nal-1].i_payload; + t->out.i_frame_size = i_frame_size; + + x264_cond_broadcast( &t->thread_active_cv, &t->thread_active_mutex, &t->thread_active, 0 ); + } + + return 0; +} + /**************************************************************************** * x264_encoder_encode: * XXX: i_poc : is the poc of the current given picture @@ -1370,7 +1462,7 @@ int x264_encoder_encode( x264_t *h, if( h->param.i_threads > 1) { int i = ++h->i_thread_phase; - int t = h->param.i_threads; + int t = h->param.i_thread_queue; thread_current = h->thread[ i%t ]; thread_prev = h->thread[ (i-1)%t ]; thread_oldest = h->thread[ (i+1)%t ]; @@ -1417,7 +1509,7 @@ int x264_encoder_encode( x264_t *h, if( h->param.rc.i_aq_mode ) x264_adaptive_quant_frame( h, fenc ); - if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads ) + if( h->frames.i_input <= h->frames.i_delay + 1 - 
h->param.i_thread_queue ) { /* Nothing yet to encode */ /* waiting for filling bframe buffer */ @@ -1588,8 +1680,14 @@ do_encode: /* Write frame */ if( h->param.i_threads > 1 ) { - x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h ); + assert( h->thread_active == 0 ); + h->thread_active = 1; + assert( h->b_thread_active == 0 ); h->b_thread_active = 1; + x264_pthread_mutex_lock( &h->thread[0]->thread_queue_mutex ); + x264_frame_push( (void*)h->thread_queue, (void*)h ); + x264_pthread_cond_broadcast( &h->thread[0]->thread_queue_cv ); + x264_pthread_mutex_unlock( &h->thread[0]->thread_queue_mutex ); } else x264_slices_write( h ); @@ -1708,7 +1806,7 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current, if( h->b_thread_active ) { - x264_pthread_join( h->thread_handle, NULL ); + x264_cond_wait( &h->thread_active_cv, &h->thread_active_mutex, &h->thread_active, 0 ); h->b_thread_active = 0; } if( !h->out.i_nal ) @@ -1881,15 +1979,29 @@ void x264_encoder_close ( x264_t *h ) || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM] || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM]; - for( i=0; i<h->param.i_threads; i++ ) + if( h->param.i_threads > 1 ) { // don't strictly have to wait for the other threads, but it's simpler than canceling them - if( h->thread[i]->b_thread_active ) + h->thread_exit = 1; + x264_pthread_mutex_lock( &h->thread_queue_mutex ); + x264_pthread_cond_broadcast( &h->thread_queue_cv ); + x264_pthread_mutex_unlock( &h->thread_queue_mutex ); + for( i=0; i < h->param.i_threads; i++ ) + x264_pthread_join( h->thread_handle[i], NULL ); + for( i=0; i < h->param.i_thread_queue; i++ ) { - x264_pthread_join( h->thread[i]->thread_handle, NULL ); - assert( h->thread[i]->fenc->i_reference_count == 1 ); - x264_frame_delete( h->thread[i]->fenc ); + x264_pthread_cond_destroy( &h->thread[i]->thread_active_cv ); + x264_pthread_mutex_destroy( &h->thread[i]->thread_active_mutex ); + if( h->thread[i]->b_thread_active ) + { + assert( 
h->thread[i]->fenc->i_reference_count == 1 ); + x264_frame_delete( h->thread[i]->fenc ); + } } + x264_pthread_cond_destroy( &h->thread_queue_cv ); + x264_pthread_mutex_destroy( &h->thread_queue_mutex ); + x264_free( h->thread_handle ); + x264_free( h->thread_queue ); } /* Slices used and PSNR */ @@ -2087,7 +2199,7 @@ void x264_encoder_close ( x264_t *h ) x264_cqm_delete( h ); if( h->param.i_threads > 1) - h = h->thread[ h->i_thread_phase % h->param.i_threads ]; + h = h->thread[ h->i_thread_phase % h->param.i_thread_queue ]; /* frames */ for( i = 0; h->frames.current[i]; i++ ) @@ -2108,7 +2220,7 @@ void x264_encoder_close ( x264_t *h ) h = h->thread[0]; - for( i = h->param.i_threads - 1; i >= 0; i-- ) + for( i = h->param.i_thread_queue - 1; i >= 0; i-- ) { x264_frame_t **frame; diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 5a74172..47a12f4 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -271,8 +271,8 @@ int x264_ratecontrol_new( x264_t *h ) x264_emms(); - rc = h->rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) ); - memset( rc, 0, h->param.i_threads * sizeof(x264_ratecontrol_t) ); + rc = h->rc = x264_malloc( h->param.i_thread_queue * sizeof(x264_ratecontrol_t) ); + memset( rc, 0, h->param.i_thread_queue * sizeof(x264_ratecontrol_t) ); rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read; rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read; @@ -578,7 +578,7 @@ int x264_ratecontrol_new( x264_t *h ) x264_free( p ); } - for( i=0; i<h->param.i_threads; i++ ) + for( i=0; i<h->param.i_thread_queue; i++ ) { h->thread[i]->rc = rc+i; if( i ) @@ -1034,7 +1034,7 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num ) if( h->param.i_bframe_adaptive ) x264_log(h, X264_LOG_ERROR, "disabling adaptive B-frames\n"); - for( i = 0; i < h->param.i_threads; i++ ) + for( i = 0; i < h->param.i_thread_queue; i++ ) { h->thread[i]->rc->b_abr = 0; h->thread[i]->rc->b_2pass = 0; @@ -1294,9 
+1294,9 @@ static void update_vbv_plan( x264_t *h ) { int j = h->rc - h->thread[0]->rc; int i; - for( i=1; i<h->param.i_threads; i++ ) + for( i=1; i<h->param.i_thread_queue; i++ ) { - x264_t *t = h->thread[ (j+i)%h->param.i_threads ]; + x264_t *t = h->thread[ (j+i)%h->param.i_thread_queue ]; double bits = t->rc->frame_size_planned; if( !t->b_thread_active ) continue; @@ -1457,9 +1457,9 @@ static float rate_estimate_qscale( x264_t *h ) { int j = h->rc - h->thread[0]->rc; int i; - for( i=1; i<h->param.i_threads; i++ ) + for( i=1; i<h->param.i_thread_queue; i++ ) { - x264_t *t = h->thread[ (j+i)%h->param.i_threads ]; + x264_t *t = h->thread[ (j+i)%h->param.i_thread_queue ]; double bits = t->rc->frame_size_planned; if( !t->b_thread_active ) continue; @@ -1470,16 +1470,16 @@ static float rate_estimate_qscale( x264_t *h ) } else { - if( h->fenc->i_frame < h->param.i_threads ) + if( h->fenc->i_frame < h->param.i_thread_queue ) predicted_bits += (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps; else - predicted_bits += (int64_t)(h->param.i_threads - 1) * rcc->bitrate / rcc->fps; + predicted_bits += (int64_t)(h->param.i_thread_queue - 1) * rcc->bitrate / rcc->fps; } diff = predicted_bits - (int64_t)rce.expected_bits; q = rce.new_qscale; q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2); - if( ((h->fenc->i_frame + 1 - h->param.i_threads) >= rcc->fps) && + if( ((h->fenc->i_frame + 1 - h->param.i_thread_queue) >= rcc->fps) && (rcc->expected_bits_sum > 0)) { /* Adjust quant based on the difference between @@ -1546,7 +1546,7 @@ static float rate_estimate_qscale( x264_t *h ) } else { - int i_frame_done = h->fenc->i_frame + 1 - h->param.i_threads; + int i_frame_done = h->fenc->i_frame + 1 - h->param.i_thread_queue; q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame ); diff --git a/x264.c b/x264.c index a1a8c94..eedc3b3 100644 --- a/x264.c +++ b/x264.c @@ -332,6 +332,7 @@ static void Help( x264_param_t *defaults, int b_longhelp ) H0( " --no-psnr 
Disable PSNR computation\n" ); H0( " --no-ssim Disable SSIM computation\n" ); H0( " --threads Parallel encoding\n" ); + H1( " --thread-queue Number of delay frames for thread sync\n" ); H0( " --thread-input Run Avisynth in its own thread\n" ); H1( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" ); H1( " --asm Override CPU detection\n" ); @@ -457,6 +458,7 @@ static int Parse( int argc, char **argv, { "zones", required_argument, NULL, 0 }, { "qpfile", required_argument, NULL, OPT_QPFILE }, { "threads", required_argument, NULL, 0 }, + { "thread-queue", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, { "non-deterministic", no_argument, NULL, 0 }, { "no-psnr", no_argument, NULL, 0 }, diff --git a/x264.h b/x264.h index 51be79e..16b02c1 100644 --- a/x264.h +++ b/x264.h @@ -153,6 +153,7 @@ typedef struct x264_param_t /* CPU flags */ unsigned int cpu; int i_threads; /* encode multiple frames in parallel */ + int i_thread_queue; /* number of frames to prepare in advance (>= i_threads) */ int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */ /* Video Properties */