Index: source/libvpx/vp9/decoder/vp9_dthread.c |
=================================================================== |
--- source/libvpx/vp9/decoder/vp9_dthread.c (revision 263011) |
+++ source/libvpx/vp9/decoder/vp9_dthread.c (working copy) |
@@ -9,10 +9,13 @@ |
*/ |
#include "./vpx_config.h" |
+ |
+#include "vpx_mem/vpx_mem.h" |
+ |
#include "vp9/common/vp9_reconinter.h" |
+ |
#include "vp9/decoder/vp9_dthread.h" |
-#include "vp9/decoder/vp9_onyxd_int.h" |
-#include "vpx_mem/vpx_mem.h" |
+#include "vp9/decoder/vp9_decoder.h" |
#if CONFIG_MULTITHREAD |
static INLINE void mutex_lock(pthread_mutex_t *const mutex) { |
@@ -96,7 +99,7 @@ |
for (r = start; r < stop; r += num_lf_workers) { |
const int mi_row = r << MI_BLOCK_SIZE_LOG2; |
- MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; |
+ MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mi_stride; |
for (c = 0; c < sb_cols; ++c) { |
const int mi_col = c << MI_BLOCK_SIZE_LOG2; |
@@ -104,9 +107,8 @@ |
sync_read(lf_sync, r, c); |
- setup_dst_planes(xd, frame_buffer, mi_row, mi_col); |
- vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, |
- &lfm); |
+ vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col); |
+ vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mi_stride, &lfm); |
for (plane = 0; plane < num_planes; ++plane) { |
vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); |
@@ -130,13 +132,15 @@ |
// VP9 decoder: Implement multi-threaded loopfilter that uses the tile |
// threads. |
-void vp9_loop_filter_frame_mt(VP9D_COMP *pbi, |
+void vp9_loop_filter_frame_mt(VP9Decoder *pbi, |
VP9_COMMON *cm, |
MACROBLOCKD *xd, |
int frame_filter_level, |
int y_only, int partial_frame) { |
// Number of superblock rows and cols |
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; |
+ const int tile_cols = 1 << cm->log2_tile_cols; |
+ const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); |
int i; |
// Allocate memory used in thread synchronization. |
@@ -166,7 +170,16 @@ |
sizeof(*pbi->lf_row_sync.cur_sb_col) * sb_rows); |
// Set up loopfilter thread data. |
- for (i = 0; i < pbi->num_tile_workers; ++i) { |
// The decoder uses num_workers instead of pbi->num_tile_workers |
// because it has been observed that using more loopfilter threads |
// than there are tile columns in the frame hurts performance on |
// Android. This is because the system only schedules the tile decode |
// workers on a number of cores equal to the number of tile columns. |
// If the decoder then tries to use more threads for the loopfilter, |
// performance suffers because of contention. If the multithreading |
// code changes in the future, the number of workers used by the |
// loopfilter should be revisited. |
+ for (i = 0; i < num_workers; ++i) { |
VP9Worker *const worker = &pbi->tile_workers[i]; |
TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; |
LFWorkerData *const lf_data = &tile_data->lfdata; |
@@ -182,10 +195,10 @@ |
lf_data->y_only = y_only; // always do all planes in decoder |
lf_data->lf_sync = &pbi->lf_row_sync; |
- lf_data->num_lf_workers = pbi->num_tile_workers; |
+ lf_data->num_lf_workers = num_workers; |
// Start loopfiltering |
- if (i == pbi->num_tile_workers - 1) { |
+ if (i == num_workers - 1) { |
vp9_worker_execute(worker); |
} else { |
vp9_worker_launch(worker); |
@@ -193,7 +206,7 @@ |
} |
// Wait till all rows are finished |
- for (i = 0; i < pbi->num_tile_workers; ++i) { |
+ for (i = 0; i < num_workers; ++i) { |
vp9_worker_sync(&pbi->tile_workers[i]); |
} |
} |