x264 thread-queue patch (common/, encoder/, x264.c, x264.h).

What the hunks below do:
  * Add a "thread-queue" option and x264_param_t.i_thread_queue.  "auto"
    stores 0; at validation time 0 becomes i_threads and the value is then
    clipped to [i_threads, X264_THREAD_MAX].  The else branch of the
    enclosing check sets it to 1.
  * Create i_threads worker threads once when the encoder is opened, instead
    of one pthread per frame.  Workers run x264_slices_write_thread(), which
    takes prepared frame contexts off h->thread_queue under
    thread_queue_mutex / thread_queue_cv.
  * Replace the per-frame pthread_join with x264_cond_wait() on the
    per-context thread_active flag; the worker clears it through
    x264_cond_broadcast() after x264_slice_write returns.
  * Size the per-frame contexts, the frame delay and the ratecontrol state by
    i_thread_queue instead of i_threads.

Reconstruction notes (this copy was rebuilt from a damaged extraction):
  * Loop headers that had degraded to "iparam.i_threads" and
    "iparam.i_thread_queue" are restored to "i<h->param.i_threads" and
    "i<h->param.i_thread_queue".
  * The "<integer>" placeholders and the column padding inside the x264.c
    help strings were reconstructed; confirm them against the target tree.
  * Line breaks and leading indentation were reconstructed so that every hunk
    matches its line counts.  Alignment padding inside other lines could not
    be recovered; if context fails to match, apply with "patch -l".

NOTE(review): thread_exit is written in x264_encoder_close before
thread_queue_mutex is taken and is read in x264_slices_write_thread after the
mutex has been released; consider keeping both accesses under the mutex.
NOTE(review): the x264_malloc results for thread_handle and thread_queue are
used without a NULL check.

diff -uNr a/common/common.c b/common/common.c
--- a/common/common.c	Wed May 28 05:53:15 2008
+++ b/common/common.c	Thu Jun 5 20:49:52 2008
@@ -266,6 +266,13 @@
         else
             p->i_threads = atoi(value);
     }
+    OPT("thread-queue")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_thread_queue = 0;
+        else
+            p->i_thread_queue = atoi(value);
+    }
     OPT2("deterministic", "n-deterministic")
         p->b_deterministic = atobool(value);
     OPT2("level", "level-idc")
diff -uNr a/common/common.h b/common/common.h
--- a/common/common.h	Wed Jun 4 08:20:37 2008
+++ b/common/common.h	Thu Jun 5 20:49:52 2008
@@ -238,10 +238,17 @@
     /* encoder parameters */
     x264_param_t param;
 
-    x264_t *thread[X264_THREAD_MAX];
-    x264_pthread_t thread_handle;
-    int b_thread_active;
-    int i_thread_phase; /* which thread to use for the next frame */
+    x264_t *thread[X264_THREAD_MAX]; /* contexts for each frame in progress */
+    x264_t **thread_queue; /* frames that have been prepared but not yet claimed by a worker thread */
+    x264_pthread_cond_t thread_queue_cv;
+    x264_pthread_mutex_t thread_queue_mutex;
+    x264_pthread_t *thread_handle;
+    int thread_active;
+    x264_pthread_cond_t thread_active_cv;
+    x264_pthread_mutex_t thread_active_mutex;
+    int thread_exit;
+    int b_thread_active;
+    int i_thread_phase; /* which thread to use for the next frame */
 
     /* bitstream output */
     struct
diff -uNr a/encoder/encoder.c b/encoder/encoder.c
--- a/encoder/encoder.c	Thu Jun 5 20:47:35 2008
+++ b/encoder/encoder.c	Thu Jun 5 20:49:52 2008
@@ -43,6 +43,34 @@
                                    x264_nal_t **pp_nal, int *pi_nal,
                                    x264_picture_t *pic_out );
 
+/* threading */
+
+static int x264_slices_write_thread( x264_t *h );
+
+#ifdef HAVE_PTHREAD
+void x264_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{
+    x264_pthread_mutex_lock( mutex );
+    *var = val;
+    x264_pthread_cond_broadcast( cv );
+    x264_pthread_mutex_unlock( mutex );
+}
+
+void x264_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{
+    x264_pthread_mutex_lock( mutex );
+    while( *var != val )
+        x264_pthread_cond_wait( cv, mutex );
+    x264_pthread_mutex_unlock( mutex );
+}
+
+#else
+void x264_cond_broadcast( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{}
+void x264_cond_wait( x264_pthread_cond_t *cv, x264_pthread_mutex_t *mutex, int *var, int val )
+{}
+#endif
+
 /****************************************************************************
  *
  ******************************* x264 libs **********************************
@@ -331,7 +359,12 @@
         if( h->param.i_scenecut_threshold >= 0 )
             h->param.b_pre_scenecut = 1;
 #endif
+        if( h->param.i_thread_queue == 0 )
+            h->param.i_thread_queue = h->param.i_threads;
+        h->param.i_thread_queue = x264_clip3( h->param.i_thread_queue, h->param.i_threads, X264_THREAD_MAX );
     }
+    else
+        h->param.i_thread_queue = 1;
 
     if( h->param.b_interlaced )
     {
@@ -622,7 +655,7 @@
     h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
 
     /* Init frames. */
-    h->frames.i_delay = h->param.i_bframe + h->param.i_threads - 1;
+    h->frames.i_delay = h->param.i_bframe + h->param.i_thread_queue - 1;
     h->frames.i_max_ref0 = h->param.i_frame_reference;
     h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames;
     h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering;
@@ -674,17 +707,39 @@
 
     h->thread[0] = h;
     h->i_thread_num = 0;
-    for( i = 1; i < h->param.i_threads; i++ )
+    for( i = 1; i < h->param.i_thread_queue; i++ )
         h->thread[i] = x264_malloc( sizeof(x264_t) );
 
-    for( i = 0; i < h->param.i_threads; i++ )
+    if( h->param.i_threads > 1 )
+    {
+        h->thread_handle = x264_malloc( h->param.i_threads * sizeof(x264_pthread_t) );
+        h->thread_queue = x264_malloc( (h->param.i_thread_queue + 1) * sizeof(x264_t*) );
+        memset( h->thread_queue, 0, (h->param.i_thread_queue + 1) * sizeof(x264_t*) );
+        x264_pthread_cond_init( &h->thread_queue_cv, NULL );
+        x264_pthread_mutex_init( &h->thread_queue_mutex, NULL );
+    }
+
+    for( i = 0; i < h->param.i_thread_queue; i++ )
     {
+        x264_t *t = h->thread[i];
         if( i > 0 )
-            *h->thread[i] = *h;
-        h->thread[i]->fdec = x264_frame_pop_unused( h );
-        h->thread[i]->out.p_bitstream = x264_malloc( h->out.i_bitstream );
-        if( x264_macroblock_cache_init( h->thread[i] ) < 0 )
+            *t = *h;
+        t->fdec = x264_frame_pop_unused( h );
+        t->out.p_bitstream = x264_malloc( h->out.i_bitstream );
+        if( x264_macroblock_cache_init( t ) < 0 )
             return NULL;
+
+        if( h->param.i_threads > 1 )
+        {
+            x264_pthread_cond_init( &t->thread_active_cv, NULL );
+            x264_pthread_mutex_init( &t->thread_active_mutex, NULL );
+        }
+    }
+
+    if( h->param.i_threads > 1 )
+    {
+        for( i = 0; i < h->param.i_threads; i++ )
+            x264_pthread_create( &h->thread_handle[i], NULL, (void*)x264_slices_write_thread, h );
     }
 
     if( x264_ratecontrol_new( h ) < 0 )
@@ -1198,6 +1253,35 @@
     return 0;
 }
 
+static int x264_slices_write_thread( x264_t *h )
+{
+    for(;;)
+    {
+        int i_frame_size;
+        x264_t *t = NULL;
+
+        // get one frame from the queue
+        x264_pthread_mutex_lock( &h->thread_queue_mutex );
+        while( !h->thread_queue[0] && !h->thread_exit )
+            x264_pthread_cond_wait( &h->thread_queue_cv, &h->thread_queue_mutex );
+        if( h->thread_queue[0] )
+            t = (void*)x264_frame_shift( (void*)h->thread_queue );
+        x264_pthread_mutex_unlock( &h->thread_queue_mutex );
+        if( h->thread_exit )
+            return 0;
+        if( !t )
+            continue;
+
+        x264_stack_align( x264_slice_write, t );
+        i_frame_size = t->out.nal[t->out.i_nal-1].i_payload;
+        t->out.i_frame_size = i_frame_size;
+
+        x264_cond_broadcast( &t->thread_active_cv, &t->thread_active_mutex, &t->thread_active, 0 );
+    }
+
+    return 0;
+}
+
 /****************************************************************************
  * x264_encoder_encode:
  * XXX: i_poc : is the poc of the current given picture
@@ -1225,7 +1309,7 @@
     if( h->param.i_threads > 1)
     {
         int i = ++h->i_thread_phase;
-        int t = h->param.i_threads;
+        int t = h->param.i_thread_queue;
         thread_current = h->thread[ i%t ];
         thread_prev = h->thread[ (i-1)%t ];
         thread_oldest = h->thread[ (i+1)%t ];
@@ -1269,7 +1353,7 @@
         if( h->frames.b_have_lowres )
             x264_frame_init_lowres( h, fenc );
 
-        if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
+        if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_thread_queue )
         {
             /* Nothing yet to encode */
             /* waiting for filling bframe buffer */
@@ -1441,8 +1525,14 @@
     /* Write frame */
     if( h->param.i_threads > 1 )
     {
-        x264_pthread_create( &h->thread_handle, NULL, (void*)x264_slices_write, h );
+        assert( h->thread_active == 0 );
+        h->thread_active = 1;
+        assert( h->b_thread_active == 0 );
         h->b_thread_active = 1;
+        x264_pthread_mutex_lock( &h->thread[0]->thread_queue_mutex );
+        x264_frame_push( (void*)h->thread_queue, (void*)h );
+        x264_pthread_cond_broadcast( &h->thread[0]->thread_queue_cv );
+        x264_pthread_mutex_unlock( &h->thread[0]->thread_queue_mutex );
     }
     else
         x264_slices_write( h );
@@ -1561,7 +1651,7 @@
 
     if( h->b_thread_active )
     {
-        x264_pthread_join( h->thread_handle, NULL );
+        x264_cond_wait( &h->thread_active_cv, &h->thread_active_mutex, &h->thread_active, 0 );
         h->b_thread_active = 0;
     }
     if( !h->out.i_nal )
@@ -1725,25 +1815,39 @@
     int64_t i_yuv_size = 3 * h->param.i_width * h->param.i_height / 2;
     int i;
 
-    for( i=0; i<h->param.i_threads; i++ )
+    if( h->param.i_threads > 1 )
     {
         // don't strictly have to wait for the other threads, but it's simpler than canceling them
-        if( h->thread[i]->b_thread_active )
-        {
-            x264_pthread_join( h->thread[i]->thread_handle, NULL );
-            assert( h->thread[i]->fenc->i_reference_count == 1 );
-            x264_frame_delete( h->thread[i]->fenc );
+        h->thread_exit = 1;
+        x264_pthread_mutex_lock( &h->thread_queue_mutex );
+        x264_pthread_cond_broadcast( &h->thread_queue_cv );
+        x264_pthread_mutex_unlock( &h->thread_queue_mutex );
+        for( i=0; i < h->param.i_threads; i++ )
+            x264_pthread_join( h->thread_handle[i], NULL );
+        for( i=0; i < h->param.i_thread_queue; i++ )
+        {
+            x264_pthread_cond_destroy( &h->thread[i]->thread_active_cv );
+            x264_pthread_mutex_destroy( &h->thread[i]->thread_active_mutex );
+            if( h->thread[i]->b_thread_active )
+            {
+                assert( h->thread[i]->fenc->i_reference_count == 1 );
+                x264_frame_delete( h->thread[i]->fenc );
+            }
         }
+        x264_pthread_cond_destroy( &h->thread_queue_cv );
+        x264_pthread_mutex_destroy( &h->thread_queue_mutex );
+        x264_free( h->thread_handle );
+        x264_free( h->thread_queue );
     }
 
     if( h->param.i_threads > 1)
     {
         x264_t *thread_prev;
 
-        thread_prev = h->thread[ h->i_thread_phase % h->param.i_threads ];
+        thread_prev = h->thread[ h->i_thread_phase % h->param.i_thread_queue ];
         x264_thread_sync_ratecontrol( h, thread_prev, h );
         x264_thread_sync_ratecontrol( thread_prev, thread_prev, h );
-        h->i_frame = thread_prev->i_frame + 1 - h->param.i_threads;
+        h->i_frame = thread_prev->i_frame + 1 - h->param.i_thread_queue;
     }
     h->i_frame++;
 
@@ -1914,7 +2018,7 @@
     x264_cqm_delete( h );
 
     if( h->param.i_threads > 1)
-        h = h->thread[ h->i_thread_phase % h->param.i_threads ];
+        h = h->thread[ h->i_thread_phase % h->param.i_thread_queue ];
 
     /* frames */
     for( i = 0; h->frames.current[i]; i++ )
@@ -1935,7 +2039,7 @@
 
     h = h->thread[0];
 
-    for( i = h->param.i_threads - 1; i >= 0; i-- )
+    for( i = h->param.i_thread_queue - 1; i >= 0; i-- )
     {
         x264_frame_t **frame;
 
diff -uNr a/encoder/ratecontrol.c b/encoder/ratecontrol.c
--- a/encoder/ratecontrol.c	Thu Jun 5 20:47:52 2008
+++ b/encoder/ratecontrol.c	Thu Jun 5 20:49:52 2008
@@ -267,8 +267,8 @@
 
     x264_emms();
 
-    rc = h->rc = x264_malloc( h->param.i_threads * sizeof(x264_ratecontrol_t) );
-    memset( rc, 0, h->param.i_threads * sizeof(x264_ratecontrol_t) );
+    rc = h->rc = x264_malloc( h->param.i_thread_queue * sizeof(x264_ratecontrol_t) );
+    memset( rc, 0, h->param.i_thread_queue * sizeof(x264_ratecontrol_t) );
 
     rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read;
     rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read;
@@ -546,7 +546,7 @@
         x264_free( p );
     }
 
-    for( i=0; i<h->param.i_threads; i++ )
+    for( i=0; i<h->param.i_thread_queue; i++ )
     {
         h->thread[i]->rc = rc+i;
         if( i )
@@ -714,7 +714,7 @@
                     x264_free( rc->zones[i].param );
         x264_free( rc->zones );
     }
-    for( i=0; i<h->param.i_threads; i++ )
+    for( i=0; i<h->param.i_thread_queue; i++ )
         x264_free( rc[i].ac_energy );
     x264_free( rc );
 }
@@ -1312,9 +1312,9 @@
     {
         int j = h->rc - h->thread[0]->rc;
         int i;
-        for( i=1; i<h->param.i_threads; i++ )
+        for( i=1; i<h->param.i_thread_queue; i++ )
         {
-            x264_t *t = h->thread[ (j+i)%h->param.i_threads ];
+            x264_t *t = h->thread[ (j+i)%h->param.i_thread_queue ];
             double bits = t->rc->frame_size_planned;
             if( !t->b_thread_active )
                 continue;
@@ -1467,13 +1467,13 @@
             //FIXME adjust abr_buffer based on distance to the end of the video
             int64_t diff;
 
-            if( h->fenc->i_frame < h->param.i_threads )
+            if( h->fenc->i_frame < h->param.i_thread_queue )
                 diff = total_bits + (int64_t)h->fenc->i_frame * rcc->bitrate / rcc->fps - (int64_t)rce.expected_bits;
             else
-                diff = total_bits + (int64_t)(h->param.i_threads - 1) * rcc->bitrate / rcc->fps - (int64_t)rce.expected_bits;
+                diff = total_bits + (int64_t)(h->param.i_thread_queue - 1) * rcc->bitrate / rcc->fps - (int64_t)rce.expected_bits;
             q = rce.new_qscale;
             q /= x264_clip3f((double)(abr_buffer - diff) / abr_buffer, .5, 2);
-            if( h->fenc->i_frame + 1 - h->param.i_threads >= rcc->fps && rcc->expected_bits_sum > 0)
+            if( h->fenc->i_frame + 1 - h->param.i_thread_queue >= rcc->fps && rcc->expected_bits_sum > 0)
             {
                 /* Adjust quant based on the difference between
                  * achieved and expected bitrate so far */
@@ -1537,7 +1537,7 @@
         }
         else
         {
-            int i_frame_done = h->fenc->i_frame + 1 - h->param.i_threads;
+            int i_frame_done = h->fenc->i_frame + 1 - h->param.i_thread_queue;
 
             q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame );
 
diff -uNr a/x264.c b/x264.c
--- a/x264.c	Wed Jun 4 08:20:37 2008
+++ b/x264.c	Thu Jun 5 20:49:52 2008
@@ -314,6 +314,7 @@
     H0( "      --no-psnr               Disable PSNR computation\n" );
     H0( "      --no-ssim               Disable SSIM computation\n" );
     H0( "      --threads <integer>     Parallel encoding\n" );
+    H1( "      --thread-queue <integer> Number of delay frames for thread sync\n" );
     H0( "      --thread-input          Run Avisynth in its own thread\n" );
     H1( "      --non-deterministic     Slightly improve quality of SMP, at the cost of repeatability\n" );
     H1( "      --asm <integer>         Override CPU detection\n" );
@@ -440,6 +441,7 @@
             { "zones", required_argument, NULL, 0 },
             { "qpfile", required_argument, NULL, OPT_QPFILE },
             { "threads", required_argument, NULL, 0 },
+            { "thread-queue", required_argument, NULL, 0 },
             { "thread-input", no_argument, NULL, OPT_THREAD_INPUT },
             { "non-deterministic", no_argument, NULL, 0 },
             { "no-psnr", no_argument, NULL, 0 },
diff -uNr a/x264.h b/x264.h
--- a/x264.h	Wed Jun 4 08:20:37 2008
+++ b/x264.h	Thu Jun 5 20:49:52 2008
@@ -147,6 +147,7 @@
     /* CPU flags */
     unsigned int cpu;
     int i_threads; /* encode multiple frames in parallel */
+    int i_thread_queue; /* number of frames to prepare in advance (>= i_threads) */
     int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
 
     /* Video Properties */