source/libvpx/vp9/decoder/vp9_dthread.c - Issue 756673003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/decoder/vp9_dthread.c

Issue 756673003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
85 (void)c;	85 (void)c;

86 (void)sb_cols;	86 (void)sb_cols;

87 #endif // CONFIG_MULTITHREAD	87 #endif // CONFIG_MULTITHREAD

88 }	88 }

89	89

90 // Implement row loopfiltering for each thread.	90 // Implement row loopfiltering for each thread.

91 static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,	91 static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,

92 VP9_COMMON *const cm,	92 VP9_COMMON *const cm,

93 struct macroblockd_plane planes[MAX_MB_PLANE],	93 struct macroblockd_plane planes[MAX_MB_PLANE],

94 int start, int stop, int y_only,	94 int start, int stop, int y_only,

95 VP9LfSync *const lf_sync, int num_lf_workers) {	95 VP9LfSync *const lf_sync) {

96 const int num_planes = y_only ? 1 : MAX_MB_PLANE;	96 const int num_planes = y_only ? 1 : MAX_MB_PLANE;

97 int r, c; // SB row and col	97 int r, c; // SB row and col

98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;	98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

99	99

100 for (r = start; r < stop; r += num_lf_workers) {	100 for (r = start; r < stop; r += lf_sync->num_workers) {

101 const int mi_row = r << MI_BLOCK_SIZE_LOG2;	101 const int mi_row = r << MI_BLOCK_SIZE_LOG2;

102 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;	102 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

103	103

104 for (c = 0; c < sb_cols; ++c) {	104 for (c = 0; c < sb_cols; ++c) {

105 const int mi_col = c << MI_BLOCK_SIZE_LOG2;	105 const int mi_col = c << MI_BLOCK_SIZE_LOG2;

106 LOOP_FILTER_MASK lfm;	106 LOOP_FILTER_MASK lfm;

107 int plane;	107 int plane;

108	108

109 sync_read(lf_sync, r, c);	109 sync_read(lf_sync, r, c);

110	110

111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);	111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);	112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);

113	113

114 for (plane = 0; plane < num_planes; ++plane) {	114 for (plane = 0; plane < num_planes; ++plane) {

115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);	115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

116 }	116 }

117	117

118 sync_write(lf_sync, r, c, sb_cols);	118 sync_write(lf_sync, r, c, sb_cols);

119 }	119 }

120 }	120 }

121 }	121 }

122	122

123 // Row-based multi-threaded loopfilter hook	123 // Row-based multi-threaded loopfilter hook

124 static int loop_filter_row_worker(TileWorkerData *const tile_data,	124 static int loop_filter_row_worker(VP9LfSync *const lf_sync,

125 void *unused) {	125 LFWorkerData *const lf_data) {

126 LFWorkerData *const lf_data = &tile_data->lfdata;

127 (void)unused;

128 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,	126 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

129 lf_data->start, lf_data->stop, lf_data->y_only,	127 lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);

130 lf_data->lf_sync, lf_data->num_lf_workers);

131 return 1;	128 return 1;

132 }	129 }

133	130

134 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile	131 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile

135 // threads.	132 // threads.

136 void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,	133 void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,

137 VP9Decoder *pbi, VP9_COMMON *cm,	134 YV12_BUFFER_CONFIG *frame,

	135 struct macroblockd_plane planes[MAX_MB_PLANE],

	136 VP9_COMMON *cm,

	137 VP9Worker *workers, int nworkers,

138 int frame_filter_level,	138 int frame_filter_level,

139 int y_only) {	139 int y_only) {

140 VP9LfSync *const lf_sync = &pbi->lf_row_sync;

141 const VP9WorkerInterface *const winterface = vp9_get_worker_interface();	140 const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

142 // Number of superblock rows and cols	141 // Number of superblock rows and cols

143 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;	142 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

144 const int tile_cols = 1 << cm->log2_tile_cols;	143 const int tile_cols = 1 << cm->log2_tile_cols;

145 const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);	144 const int num_workers = MIN(nworkers, tile_cols);

146 int i;	145 int i;

147	146

148 if (!frame_filter_level) return;	147 if (!frame_filter_level) return;

149	148

150 if (!lf_sync->sync_range || cm->last_height != cm->height) {	149 if (!lf_sync->sync_range || cm->last_height != cm->height ||

	150 num_workers > lf_sync->num_workers) {

151 vp9_loop_filter_dealloc(lf_sync);	151 vp9_loop_filter_dealloc(lf_sync);

152 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);	152 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

153 }	153 }

154	154

155 vp9_loop_filter_frame_init(cm, frame_filter_level);	155 vp9_loop_filter_frame_init(cm, frame_filter_level);

156	156

157 // Initialize cur_sb_col to -1 for all SB rows.	157 // Initialize cur_sb_col to -1 for all SB rows.

158 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);	158 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

159	159

160 // Set up loopfilter thread data.	160 // Set up loopfilter thread data.

161 // The decoder is using num_workers instead of pbi->num_tile_workers	161 // The decoder is capping num_workers because it has been observed that using

162 // because it has been observed that using more threads on the	162 // more threads on the loopfilter than there are cores will hurt performance

163 // loopfilter, than there are tile columns in the frame will hurt	163 // on Android. This is because the system will only schedule the tile decode

164 // performance on Android. This is because the system will only	164 // workers on cores equal to the number of tile columns. Then if the decoder

165 // schedule the tile decode workers on cores equal to the number	165 // tries to use more threads for the loopfilter, it will hurt performance

166 // of tile columns. Then if the decoder tries to use more threads for the	166 // because of contention. If the multithreading code changes in the future

167 // loopfilter, it will hurt performance because of contention. If the	167 // then the number of workers used by the loopfilter should be revisited.

168 // multithreading code changes in the future then the number of workers

169 // used by the loopfilter should be revisited.

170 for (i = 0; i < num_workers; ++i) {	168 for (i = 0; i < num_workers; ++i) {

171 VP9Worker *const worker = &pbi->tile_workers[i];	169 VP9Worker *const worker = &workers[i];

172 TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;	170 LFWorkerData *const lf_data = &lf_sync->lfdata[i];

173 LFWorkerData *const lf_data = &tile_data->lfdata;

174	171

175 worker->hook = (VP9WorkerHook)loop_filter_row_worker;	172 worker->hook = (VP9WorkerHook)loop_filter_row_worker;

	173 worker->data1 = lf_sync;

	174 worker->data2 = lf_data;

176	175

177 // Loopfilter data	176 // Loopfilter data

178 lf_data->frame_buffer = frame;	177 vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

179 lf_data->cm = cm;

180 vp9_copy(lf_data->planes, pbi->mb.plane);

181 lf_data->start = i;	178 lf_data->start = i;

182 lf_data->stop = sb_rows;	179 lf_data->stop = sb_rows;

183 lf_data->y_only = y_only; // always do all planes in decoder	180 lf_data->y_only = y_only;

184

185 lf_data->lf_sync = lf_sync;

186 lf_data->num_lf_workers = num_workers;

187	181

188 // Start loopfiltering	182 // Start loopfiltering

189 if (i == num_workers - 1) {	183 if (i == num_workers - 1) {

190 winterface->execute(worker);	184 winterface->execute(worker);

191 } else {	185 } else {

192 winterface->launch(worker);	186 winterface->launch(worker);

193 }	187 }

194 }	188 }

195	189

196 // Wait till all rows are finished	190 // Wait till all rows are finished

197 for (i = 0; i < num_workers; ++i) {	191 for (i = 0; i < num_workers; ++i) {

198 winterface->sync(&pbi->tile_workers[i]);	192 winterface->sync(&workers[i]);

199 }	193 }

200 }	194 }

201	195

202 // Set up nsync by width.	196 // Set up nsync by width.

203 static int get_sync_range(int width) {	197 static int get_sync_range(int width) {

204 // nsync numbers are picked by testing. For example, for 4k	198 // nsync numbers are picked by testing. For example, for 4k

205 // video, using 4 gives best performance.	199 // video, using 4 gives best performance.

206 if (width < 640)	200 if (width < 640)

207 return 1;	201 return 1;

208 else if (width <= 1280)	202 else if (width <= 1280)

209 return 2;	203 return 2;

210 else if (width <= 4096)	204 else if (width <= 4096)

211 return 4;	205 return 4;

212 else	206 else

213 return 8;	207 return 8;

214 }	208 }

215	209

216 // Allocate memory for lf row synchronization	210 // Allocate memory for lf row synchronization

217 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,	211 void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,

218 int width) {	212 int width, int num_workers) {

219 lf_sync->rows = rows;	213 lf_sync->rows = rows;

220 #if CONFIG_MULTITHREAD	214 #if CONFIG_MULTITHREAD

221 {	215 {

222 int i;	216 int i;

223	217

224 CHECK_MEM_ERROR(cm, lf_sync->mutex_,	218 CHECK_MEM_ERROR(cm, lf_sync->mutex_,

225 vpx_malloc(sizeof(*lf_sync->mutex_) * rows));	219 vpx_malloc(sizeof(*lf_sync->mutex_) * rows));

226 if (lf_sync->mutex_) {	220 if (lf_sync->mutex_) {

227 for (i = 0; i < rows; ++i) {	221 for (i = 0; i < rows; ++i) {

228 pthread_mutex_init(&lf_sync->mutex_[i], NULL);	222 pthread_mutex_init(&lf_sync->mutex_[i], NULL);

229 }	223 }

230 }	224 }

231	225

232 CHECK_MEM_ERROR(cm, lf_sync->cond_,	226 CHECK_MEM_ERROR(cm, lf_sync->cond_,

233 vpx_malloc(sizeof(*lf_sync->cond_) * rows));	227 vpx_malloc(sizeof(*lf_sync->cond_) * rows));

234 if (lf_sync->cond_) {	228 if (lf_sync->cond_) {

235 for (i = 0; i < rows; ++i) {	229 for (i = 0; i < rows; ++i) {

236 pthread_cond_init(&lf_sync->cond_[i], NULL);	230 pthread_cond_init(&lf_sync->cond_[i], NULL);

237 }	231 }

238 }	232 }

239 }	233 }

240 #endif // CONFIG_MULTITHREAD	234 #endif // CONFIG_MULTITHREAD

241	235

	236 CHECK_MEM_ERROR(cm, lf_sync->lfdata,

	237 vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));

	238 lf_sync->num_workers = num_workers;

	239

242 CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,	240 CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,

243 vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));	241 vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));

244	242

245 // Set up nsync.	243 // Set up nsync.

246 lf_sync->sync_range = get_sync_range(width);	244 lf_sync->sync_range = get_sync_range(width);

247 }	245 }

248	246

249 // Deallocate lf synchronization related mutex and data	247 // Deallocate lf synchronization related mutex and data

250 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {	248 void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {

251 if (lf_sync != NULL) {	249 if (lf_sync != NULL) {

252 #if CONFIG_MULTITHREAD	250 #if CONFIG_MULTITHREAD

253 int i;	251 int i;

254	252

255 if (lf_sync->mutex_ != NULL) {	253 if (lf_sync->mutex_ != NULL) {

256 for (i = 0; i < lf_sync->rows; ++i) {	254 for (i = 0; i < lf_sync->rows; ++i) {

257 pthread_mutex_destroy(&lf_sync->mutex_[i]);	255 pthread_mutex_destroy(&lf_sync->mutex_[i]);

258 }	256 }

259 vpx_free(lf_sync->mutex_);	257 vpx_free(lf_sync->mutex_);

260 }	258 }

261 if (lf_sync->cond_ != NULL) {	259 if (lf_sync->cond_ != NULL) {

262 for (i = 0; i < lf_sync->rows; ++i) {	260 for (i = 0; i < lf_sync->rows; ++i) {

263 pthread_cond_destroy(&lf_sync->cond_[i]);	261 pthread_cond_destroy(&lf_sync->cond_[i]);

264 }	262 }

265 vpx_free(lf_sync->cond_);	263 vpx_free(lf_sync->cond_);

266 }	264 }

267 #endif // CONFIG_MULTITHREAD	265 #endif // CONFIG_MULTITHREAD

	266 vpx_free(lf_sync->lfdata);

268 vpx_free(lf_sync->cur_sb_col);	267 vpx_free(lf_sync->cur_sb_col);

269 // clear the structure as the source of this call may be a resize in which	268 // clear the structure as the source of this call may be a resize in which

270 // case this call will be followed by an _alloc() which may fail.	269 // case this call will be followed by an _alloc() which may fail.

271 vp9_zero(*lf_sync);	270 vp9_zero(*lf_sync);

272 }	271 }

273 }	272 }

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/decoder/vp9_dthread.h ('k') | source/libvpx/vp9/encoder/vp9_aq_complexity.h » ('j') | no next file with comments »