Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(411)

Side by Side Diff: source/libvpx/vp9/common/vp9_loopfilter_thread.c

Issue 800493003: libvpx: Pull from upstream (Closed) | Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Keep vp9_iht8x8_add_neon disabled because of http://llvm.org/bugs/show_bug.cgi?id=22178 | Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include "./vpx_config.h" 11 #include "./vpx_config.h"
12
13 #include "vpx_mem/vpx_mem.h" 12 #include "vpx_mem/vpx_mem.h"
14 13 #include "vp9/common/vp9_loopfilter_thread.h"
15 #include "vp9/common/vp9_reconinter.h" 14 #include "vp9/common/vp9_reconinter.h"
16 15
17 #include "vp9/decoder/vp9_dthread.h"
18 #include "vp9/decoder/vp9_decoder.h"
19
20 #if CONFIG_MULTITHREAD 16 #if CONFIG_MULTITHREAD
21 static INLINE void mutex_lock(pthread_mutex_t *const mutex) { 17 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
22 const int kMaxTryLocks = 4000; 18 const int kMaxTryLocks = 4000;
23 int locked = 0; 19 int locked = 0;
24 int i; 20 int i;
25 21
26 for (i = 0; i < kMaxTryLocks; ++i) { 22 for (i = 0; i < kMaxTryLocks; ++i) {
27 if (!pthread_mutex_trylock(mutex)) { 23 if (!pthread_mutex_trylock(mutex)) {
28 locked = 1; 24 locked = 1;
29 break; 25 break;
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
81 } 77 }
82 #else 78 #else
83 (void)lf_sync; 79 (void)lf_sync;
84 (void)r; 80 (void)r;
85 (void)c; 81 (void)c;
86 (void)sb_cols; 82 (void)sb_cols;
87 #endif // CONFIG_MULTITHREAD 83 #endif // CONFIG_MULTITHREAD
88 } 84 }
89 85
90 // Implement row loopfiltering for each thread. 86 // Implement row loopfiltering for each thread.
91 static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, 87 static INLINE
92 VP9_COMMON *const cm, 88 void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
93 struct macroblockd_plane planes[MAX_MB_PLANE], 89 VP9_COMMON *const cm,
94 int start, int stop, int y_only, 90 struct macroblockd_plane planes[MAX_MB_PLANE],
95 VP9LfSync *const lf_sync) { 91 int start, int stop, int y_only,
92 VP9LfSync *const lf_sync) {
96 const int num_planes = y_only ? 1 : MAX_MB_PLANE; 93 const int num_planes = y_only ? 1 : MAX_MB_PLANE;
97 int r, c; // SB row and col 94 const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
95 planes[1].subsampling_x == 1);
98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; 96 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
97 int mi_row, mi_col;
99 98
100 for (r = start; r < stop; r += lf_sync->num_workers) { 99 for (mi_row = start; mi_row < stop;
101 const int mi_row = r << MI_BLOCK_SIZE_LOG2; 100 mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
102 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride; 101 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
103 102
104 for (c = 0; c < sb_cols; ++c) { 103 for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
105 const int mi_col = c << MI_BLOCK_SIZE_LOG2; 104 const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
105 const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
106 LOOP_FILTER_MASK lfm; 106 LOOP_FILTER_MASK lfm;
107 int plane; 107 int plane;
108 108
109 sync_read(lf_sync, r, c); 109 sync_read(lf_sync, r, c);
110 110
111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); 111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); 112
113 // TODO(JBB): Make setup_mask work for non 420.
114 if (use_420)
115 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
116 &lfm);
113 117
114 for (plane = 0; plane < num_planes; ++plane) { 118 for (plane = 0; plane < num_planes; ++plane) {
115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); 119 if (use_420)
120 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
121 else
122 vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
123 mi_row, mi_col);
116 } 124 }
117 125
118 sync_write(lf_sync, r, c, sb_cols); 126 sync_write(lf_sync, r, c, sb_cols);
119 } 127 }
120 } 128 }
121 } 129 }
122 130
123 // Row-based multi-threaded loopfilter hook 131 // Row-based multi-threaded loopfilter hook
124 static int loop_filter_row_worker(VP9LfSync *const lf_sync, 132 static int loop_filter_row_worker(VP9LfSync *const lf_sync,
125 LFWorkerData *const lf_data) { 133 LFWorkerData *const lf_data) {
126 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, 134 thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
127 lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); 135 lf_data->start, lf_data->stop, lf_data->y_only,
136 lf_sync);
128 return 1; 137 return 1;
129 } 138 }
130 139
131 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile 140 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
132 // threads. 141 VP9_COMMON *cm,
133 void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, 142 struct macroblockd_plane planes[MAX_MB_PLANE],
134 YV12_BUFFER_CONFIG *frame, 143 int start, int stop, int y_only,
135 struct macroblockd_plane planes[MAX_MB_PLANE], 144 VP9Worker *workers, int nworkers,
136 VP9_COMMON *cm, 145 VP9LfSync *lf_sync) {
137 VP9Worker *workers, int nworkers,
138 int frame_filter_level,
139 int y_only) {
140 const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); 146 const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
141 // Number of superblock rows and cols 147 // Number of superblock rows and cols
142 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; 148 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
149 // Decoder may allocate more threads than number of tiles based on user's
150 // input.
143 const int tile_cols = 1 << cm->log2_tile_cols; 151 const int tile_cols = 1 << cm->log2_tile_cols;
144 const int num_workers = MIN(nworkers, tile_cols); 152 const int num_workers = MIN(nworkers, tile_cols);
145 int i; 153 int i;
146 154
147 if (!frame_filter_level) return;
148
149 if (!lf_sync->sync_range || cm->last_height != cm->height || 155 if (!lf_sync->sync_range || cm->last_height != cm->height ||
150 num_workers > lf_sync->num_workers) { 156 num_workers > lf_sync->num_workers) {
151 vp9_loop_filter_dealloc(lf_sync); 157 vp9_loop_filter_dealloc(lf_sync);
152 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); 158 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
153 } 159 }
154 160
155 vp9_loop_filter_frame_init(cm, frame_filter_level);
156
157 // Initialize cur_sb_col to -1 for all SB rows. 161 // Initialize cur_sb_col to -1 for all SB rows.
158 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); 162 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
159 163
160 // Set up loopfilter thread data. 164 // Set up loopfilter thread data.
161 // The decoder is capping num_workers because it has been observed that using 165 // The decoder is capping num_workers because it has been observed that using
162 // more threads on the loopfilter than there are cores will hurt performance 166 // more threads on the loopfilter than there are cores will hurt performance
163 // on Android. This is because the system will only schedule the tile decode 167 // on Android. This is because the system will only schedule the tile decode
164 // workers on cores equal to the number of tile columns. Then if the decoder 168 // workers on cores equal to the number of tile columns. Then if the decoder
165 // tries to use more threads for the loopfilter, it will hurt performance 169 // tries to use more threads for the loopfilter, it will hurt performance
166 // because of contention. If the multithreading code changes in the future 170 // because of contention. If the multithreading code changes in the future
167 // then the number of workers used by the loopfilter should be revisited. 171 // then the number of workers used by the loopfilter should be revisited.
168 for (i = 0; i < num_workers; ++i) { 172 for (i = 0; i < num_workers; ++i) {
169 VP9Worker *const worker = &workers[i]; 173 VP9Worker *const worker = &workers[i];
170 LFWorkerData *const lf_data = &lf_sync->lfdata[i]; 174 LFWorkerData *const lf_data = &lf_sync->lfdata[i];
171 175
172 worker->hook = (VP9WorkerHook)loop_filter_row_worker; 176 worker->hook = (VP9WorkerHook)loop_filter_row_worker;
173 worker->data1 = lf_sync; 177 worker->data1 = lf_sync;
174 worker->data2 = lf_data; 178 worker->data2 = lf_data;
175 179
176 // Loopfilter data 180 // Loopfilter data
177 vp9_loop_filter_data_reset(lf_data, frame, cm, planes); 181 vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
178 lf_data->start = i; 182 lf_data->start = start + i * MI_BLOCK_SIZE;
179 lf_data->stop = sb_rows; 183 lf_data->stop = stop;
180 lf_data->y_only = y_only; 184 lf_data->y_only = y_only;
181 185
182 // Start loopfiltering 186 // Start loopfiltering
183 if (i == num_workers - 1) { 187 if (i == num_workers - 1) {
184 winterface->execute(worker); 188 winterface->execute(worker);
185 } else { 189 } else {
186 winterface->launch(worker); 190 winterface->launch(worker);
187 } 191 }
188 } 192 }
189 193
190 // Wait till all rows are finished 194 // Wait till all rows are finished
191 for (i = 0; i < num_workers; ++i) { 195 for (i = 0; i < num_workers; ++i) {
192 winterface->sync(&workers[i]); 196 winterface->sync(&workers[i]);
193 } 197 }
194 } 198 }
195 199
200 void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
201 VP9_COMMON *cm,
202 struct macroblockd_plane planes[MAX_MB_PLANE],
203 int frame_filter_level,
204 int y_only, int partial_frame,
205 VP9Worker *workers, int num_workers,
206 VP9LfSync *lf_sync) {
207 int start_mi_row, end_mi_row, mi_rows_to_filter;
208
209 if (!frame_filter_level) return;
210
211 start_mi_row = 0;
212 mi_rows_to_filter = cm->mi_rows;
213 if (partial_frame && cm->mi_rows > 8) {
214 start_mi_row = cm->mi_rows >> 1;
215 start_mi_row &= 0xfffffff8;
216 mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
217 }
218 end_mi_row = start_mi_row + mi_rows_to_filter;
219 vp9_loop_filter_frame_init(cm, frame_filter_level);
220
221 loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
222 y_only, workers, num_workers, lf_sync);
223 }
224
196 // Set up nsync by width. 225 // Set up nsync by width.
197 static int get_sync_range(int width) { 226 static INLINE int get_sync_range(int width) {
198 // nsync numbers are picked by testing. For example, for 4k 227 // nsync numbers are picked by testing. For example, for 4k
199 // video, using 4 gives best performance. 228 // video, using 4 gives best performance.
200 if (width < 640) 229 if (width < 640)
201 return 1; 230 return 1;
202 else if (width <= 1280) 231 else if (width <= 1280)
203 return 2; 232 return 2;
204 else if (width <= 4096) 233 else if (width <= 4096)
205 return 4; 234 return 4;
206 else 235 else
207 return 8; 236 return 8;
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
263 vpx_free(lf_sync->cond_); 292 vpx_free(lf_sync->cond_);
264 } 293 }
265 #endif // CONFIG_MULTITHREAD 294 #endif // CONFIG_MULTITHREAD
266 vpx_free(lf_sync->lfdata); 295 vpx_free(lf_sync->lfdata);
267 vpx_free(lf_sync->cur_sb_col); 296 vpx_free(lf_sync->cur_sb_col);
268 // clear the structure as the source of this call may be a resize in which 297 // clear the structure as the source of this call may be a resize in which
269 // case this call will be followed by an _alloc() which may fail. 298 // case this call will be followed by an _alloc() which may fail.
270 vp9_zero(*lf_sync); 299 vp9_zero(*lf_sync);
271 } 300 }
272 } 301 }
OLD | NEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_loopfilter_thread.h ('k') | source/libvpx/vp9/common/vp9_onyxc_int.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698