Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(421)

Side by Side Diff: source/libvpx/vp9/decoder/vp9_dthread.c

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « source/libvpx/vp9/decoder/vp9_dthread.h ('k') | source/libvpx/vp9/encoder/vp9_aq_complexity.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
85 (void)c; 85 (void)c;
86 (void)sb_cols; 86 (void)sb_cols;
87 #endif // CONFIG_MULTITHREAD 87 #endif // CONFIG_MULTITHREAD
88 } 88 }
89 89
90 // Implement row loopfiltering for each thread. 90 // Implement row loopfiltering for each thread.
91 static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, 91 static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
92 VP9_COMMON *const cm, 92 VP9_COMMON *const cm,
93 struct macroblockd_plane planes[MAX_MB_PLANE], 93 struct macroblockd_plane planes[MAX_MB_PLANE],
94 int start, int stop, int y_only, 94 int start, int stop, int y_only,
95 VP9LfSync *const lf_sync, int num_lf_workers) { 95 VP9LfSync *const lf_sync) {
96 const int num_planes = y_only ? 1 : MAX_MB_PLANE; 96 const int num_planes = y_only ? 1 : MAX_MB_PLANE;
97 int r, c; // SB row and col 97 int r, c; // SB row and col
98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; 98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
99 99
100 for (r = start; r < stop; r += num_lf_workers) { 100 for (r = start; r < stop; r += lf_sync->num_workers) {
101 const int mi_row = r << MI_BLOCK_SIZE_LOG2; 101 const int mi_row = r << MI_BLOCK_SIZE_LOG2;
102 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride; 102 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
103 103
104 for (c = 0; c < sb_cols; ++c) { 104 for (c = 0; c < sb_cols; ++c) {
105 const int mi_col = c << MI_BLOCK_SIZE_LOG2; 105 const int mi_col = c << MI_BLOCK_SIZE_LOG2;
106 LOOP_FILTER_MASK lfm; 106 LOOP_FILTER_MASK lfm;
107 int plane; 107 int plane;
108 108
109 sync_read(lf_sync, r, c); 109 sync_read(lf_sync, r, c);
110 110
111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); 111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); 112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
113 113
114 for (plane = 0; plane < num_planes; ++plane) { 114 for (plane = 0; plane < num_planes; ++plane) {
115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); 115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
116 } 116 }
117 117
118 sync_write(lf_sync, r, c, sb_cols); 118 sync_write(lf_sync, r, c, sb_cols);
119 } 119 }
120 } 120 }
121 } 121 }
122 122
123 // Row-based multi-threaded loopfilter hook 123 // Row-based multi-threaded loopfilter hook
124 static int loop_filter_row_worker(TileWorkerData *const tile_data, 124 static int loop_filter_row_worker(VP9LfSync *const lf_sync,
125 void *unused) { 125 LFWorkerData *const lf_data) {
126 LFWorkerData *const lf_data = &tile_data->lfdata;
127 (void)unused;
128 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, 126 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
129 lf_data->start, lf_data->stop, lf_data->y_only, 127 lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);
130 lf_data->lf_sync, lf_data->num_lf_workers);
131 return 1; 128 return 1;
132 } 129 }
133 130
134 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile 131 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile
135 // threads. 132 // threads.
136 void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, 133 void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,
137 VP9Decoder *pbi, VP9_COMMON *cm, 134 YV12_BUFFER_CONFIG *frame,
135 struct macroblockd_plane planes[MAX_MB_PLANE],
136 VP9_COMMON *cm,
137 VP9Worker *workers, int nworkers,
138 int frame_filter_level, 138 int frame_filter_level,
139 int y_only) { 139 int y_only) {
140 VP9LfSync *const lf_sync = &pbi->lf_row_sync;
141 const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); 140 const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
142 // Number of superblock rows and cols 141 // Number of superblock rows and cols
143 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; 142 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
144 const int tile_cols = 1 << cm->log2_tile_cols; 143 const int tile_cols = 1 << cm->log2_tile_cols;
145 const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); 144 const int num_workers = MIN(nworkers, tile_cols);
146 int i; 145 int i;
147 146
148 if (!frame_filter_level) return; 147 if (!frame_filter_level) return;
149 148
150 if (!lf_sync->sync_range || cm->last_height != cm->height) { 149 if (!lf_sync->sync_range || cm->last_height != cm->height ||
150 num_workers > lf_sync->num_workers) {
151 vp9_loop_filter_dealloc(lf_sync); 151 vp9_loop_filter_dealloc(lf_sync);
152 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width); 152 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
153 } 153 }
154 154
155 vp9_loop_filter_frame_init(cm, frame_filter_level); 155 vp9_loop_filter_frame_init(cm, frame_filter_level);
156 156
157 // Initialize cur_sb_col to -1 for all SB rows. 157 // Initialize cur_sb_col to -1 for all SB rows.
158 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); 158 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
159 159
160 // Set up loopfilter thread data. 160 // Set up loopfilter thread data.
161 // The decoder is using num_workers instead of pbi->num_tile_workers 161 // The decoder is capping num_workers because it has been observed that using
162 // because it has been observed that using more threads on the 162 // more threads on the loopfilter than there are cores will hurt performance
163 // loopfilter, than there are tile columns in the frame will hurt 163 // on Android. This is because the system will only schedule the tile decode
164 // performance on Android. This is because the system will only 164 // workers on cores equal to the number of tile columns. Then if the decoder
165 // schedule the tile decode workers on cores equal to the number 165 // tries to use more threads for the loopfilter, it will hurt performance
166 // of tile columns. Then if the decoder tries to use more threads for the 166 // because of contention. If the multithreading code changes in the future
167 // loopfilter, it will hurt performance because of contention. If the 167 // then the number of workers used by the loopfilter should be revisited.
168 // multithreading code changes in the future then the number of workers
169 // used by the loopfilter should be revisited.
170 for (i = 0; i < num_workers; ++i) { 168 for (i = 0; i < num_workers; ++i) {
171 VP9Worker *const worker = &pbi->tile_workers[i]; 169 VP9Worker *const worker = &workers[i];
172 TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; 170 LFWorkerData *const lf_data = &lf_sync->lfdata[i];
173 LFWorkerData *const lf_data = &tile_data->lfdata;
174 171
175 worker->hook = (VP9WorkerHook)loop_filter_row_worker; 172 worker->hook = (VP9WorkerHook)loop_filter_row_worker;
173 worker->data1 = lf_sync;
174 worker->data2 = lf_data;
176 175
177 // Loopfilter data 176 // Loopfilter data
178 lf_data->frame_buffer = frame; 177 vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
179 lf_data->cm = cm;
180 vp9_copy(lf_data->planes, pbi->mb.plane);
181 lf_data->start = i; 178 lf_data->start = i;
182 lf_data->stop = sb_rows; 179 lf_data->stop = sb_rows;
183 lf_data->y_only = y_only; // always do all planes in decoder 180 lf_data->y_only = y_only;
184
185 lf_data->lf_sync = lf_sync;
186 lf_data->num_lf_workers = num_workers;
187 181
188 // Start loopfiltering 182 // Start loopfiltering
189 if (i == num_workers - 1) { 183 if (i == num_workers - 1) {
190 winterface->execute(worker); 184 winterface->execute(worker);
191 } else { 185 } else {
192 winterface->launch(worker); 186 winterface->launch(worker);
193 } 187 }
194 } 188 }
195 189
196 // Wait till all rows are finished 190 // Wait till all rows are finished
197 for (i = 0; i < num_workers; ++i) { 191 for (i = 0; i < num_workers; ++i) {
198 winterface->sync(&pbi->tile_workers[i]); 192 winterface->sync(&workers[i]);
199 } 193 }
200 } 194 }
201 195
202 // Set up nsync by width. 196 // Set up nsync by width.
203 static int get_sync_range(int width) { 197 static int get_sync_range(int width) {
204 // nsync numbers are picked by testing. For example, for 4k 198 // nsync numbers are picked by testing. For example, for 4k
205 // video, using 4 gives best performance. 199 // video, using 4 gives best performance.
206 if (width < 640) 200 if (width < 640)
207 return 1; 201 return 1;
208 else if (width <= 1280) 202 else if (width <= 1280)
209 return 2; 203 return 2;
210 else if (width <= 4096) 204 else if (width <= 4096)
211 return 4; 205 return 4;
212 else 206 else
213 return 8; 207 return 8;
214 } 208 }
215 209
216 // Allocate memory for lf row synchronization 210 // Allocate memory for lf row synchronization
217 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, 211 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
218 int width) { 212 int width, int num_workers) {
219 lf_sync->rows = rows; 213 lf_sync->rows = rows;
220 #if CONFIG_MULTITHREAD 214 #if CONFIG_MULTITHREAD
221 { 215 {
222 int i; 216 int i;
223 217
224 CHECK_MEM_ERROR(cm, lf_sync->mutex_, 218 CHECK_MEM_ERROR(cm, lf_sync->mutex_,
225 vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); 219 vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
226 if (lf_sync->mutex_) { 220 if (lf_sync->mutex_) {
227 for (i = 0; i < rows; ++i) { 221 for (i = 0; i < rows; ++i) {
228 pthread_mutex_init(&lf_sync->mutex_[i], NULL); 222 pthread_mutex_init(&lf_sync->mutex_[i], NULL);
229 } 223 }
230 } 224 }
231 225
232 CHECK_MEM_ERROR(cm, lf_sync->cond_, 226 CHECK_MEM_ERROR(cm, lf_sync->cond_,
233 vpx_malloc(sizeof(*lf_sync->cond_) * rows)); 227 vpx_malloc(sizeof(*lf_sync->cond_) * rows));
234 if (lf_sync->cond_) { 228 if (lf_sync->cond_) {
235 for (i = 0; i < rows; ++i) { 229 for (i = 0; i < rows; ++i) {
236 pthread_cond_init(&lf_sync->cond_[i], NULL); 230 pthread_cond_init(&lf_sync->cond_[i], NULL);
237 } 231 }
238 } 232 }
239 } 233 }
240 #endif // CONFIG_MULTITHREAD 234 #endif // CONFIG_MULTITHREAD
241 235
236 CHECK_MEM_ERROR(cm, lf_sync->lfdata,
237 vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
238 lf_sync->num_workers = num_workers;
239
242 CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, 240 CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
243 vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); 241 vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
244 242
245 // Set up nsync. 243 // Set up nsync.
246 lf_sync->sync_range = get_sync_range(width); 244 lf_sync->sync_range = get_sync_range(width);
247 } 245 }
248 246
249 // Deallocate lf synchronization related mutex and data 247 // Deallocate lf synchronization related mutex and data
250 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { 248 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
251 if (lf_sync != NULL) { 249 if (lf_sync != NULL) {
252 #if CONFIG_MULTITHREAD 250 #if CONFIG_MULTITHREAD
253 int i; 251 int i;
254 252
255 if (lf_sync->mutex_ != NULL) { 253 if (lf_sync->mutex_ != NULL) {
256 for (i = 0; i < lf_sync->rows; ++i) { 254 for (i = 0; i < lf_sync->rows; ++i) {
257 pthread_mutex_destroy(&lf_sync->mutex_[i]); 255 pthread_mutex_destroy(&lf_sync->mutex_[i]);
258 } 256 }
259 vpx_free(lf_sync->mutex_); 257 vpx_free(lf_sync->mutex_);
260 } 258 }
261 if (lf_sync->cond_ != NULL) { 259 if (lf_sync->cond_ != NULL) {
262 for (i = 0; i < lf_sync->rows; ++i) { 260 for (i = 0; i < lf_sync->rows; ++i) {
263 pthread_cond_destroy(&lf_sync->cond_[i]); 261 pthread_cond_destroy(&lf_sync->cond_[i]);
264 } 262 }
265 vpx_free(lf_sync->cond_); 263 vpx_free(lf_sync->cond_);
266 } 264 }
267 #endif // CONFIG_MULTITHREAD 265 #endif // CONFIG_MULTITHREAD
266 vpx_free(lf_sync->lfdata);
268 vpx_free(lf_sync->cur_sb_col); 267 vpx_free(lf_sync->cur_sb_col);
269 // clear the structure as the source of this call may be a resize in which 268 // clear the structure as the source of this call may be a resize in which
270 // case this call will be followed by an _alloc() which may fail. 269 // case this call will be followed by an _alloc() which may fail.
271 vp9_zero(*lf_sync); 270 vp9_zero(*lf_sync);
272 } 271 }
273 } 272 }
OLD NEW
« no previous file with comments | « source/libvpx/vp9/decoder/vp9_dthread.h ('k') | source/libvpx/vp9/encoder/vp9_aq_complexity.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698