source/libvpx/vp9/common/vp9_loopfilter_thread.c - Issue 800493003: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/vp9_loopfilter_thread.c

Issue 800493003: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Keep vp9_iht8x8_add_neon disabled because of http://llvm.org/bugs/show_bug.cgi?id=22178 Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include "./vpx_config.h"	11 #include "./vpx_config.h"

12

13 #include "vpx_mem/vpx_mem.h"	12 #include "vpx_mem/vpx_mem.h"

14	13 #include "vp9/common/vp9_loopfilter_thread.h"

15 #include "vp9/common/vp9_reconinter.h"	14 #include "vp9/common/vp9_reconinter.h"

16	15

17 #include "vp9/decoder/vp9_dthread.h"

18 #include "vp9/decoder/vp9_decoder.h"

19

20 #if CONFIG_MULTITHREAD	16 #if CONFIG_MULTITHREAD

21 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {	17 static INLINE void mutex_lock(pthread_mutex_t *const mutex) {

22 const int kMaxTryLocks = 4000;	18 const int kMaxTryLocks = 4000;

23 int locked = 0;	19 int locked = 0;

24 int i;	20 int i;

25	21

26 for (i = 0; i < kMaxTryLocks; ++i) {	22 for (i = 0; i < kMaxTryLocks; ++i) {

27 if (!pthread_mutex_trylock(mutex)) {	23 if (!pthread_mutex_trylock(mutex)) {

28 locked = 1;	24 locked = 1;

29 break;	25 break;

(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
81 }	77 }

82 #else	78 #else

83 (void)lf_sync;	79 (void)lf_sync;

84 (void)r;	80 (void)r;

85 (void)c;	81 (void)c;

86 (void)sb_cols;	82 (void)sb_cols;

87 #endif // CONFIG_MULTITHREAD	83 #endif // CONFIG_MULTITHREAD

88 }	84 }

89	85

90 // Implement row loopfiltering for each thread.	86 // Implement row loopfiltering for each thread.

91 static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,	87 static INLINE

92 VP9_COMMON *const cm,	88 void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,

93 struct macroblockd_plane planes[MAX_MB_PLANE],	89 VP9_COMMON *const cm,

94 int start, int stop, int y_only,	90 struct macroblockd_plane planes[MAX_MB_PLANE],

95 VP9LfSync *const lf_sync) {	91 int start, int stop, int y_only,

	92 VP9LfSync *const lf_sync) {

96 const int num_planes = y_only ? 1 : MAX_MB_PLANE;	93 const int num_planes = y_only ? 1 : MAX_MB_PLANE;

97 int r, c; // SB row and col	94 const int use_420 = y_only || (planes[1].subsampling_y == 1 &&

	95 planes[1].subsampling_x == 1);

98 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;	96 const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;

	97 int mi_row, mi_col;

99	98

100 for (r = start; r < stop; r += lf_sync->num_workers) {	99 for (mi_row = start; mi_row < stop;

101 const int mi_row = r << MI_BLOCK_SIZE_LOG2;	100 mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {

102 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;	101 MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;

103	102

104 for (c = 0; c < sb_cols; ++c) {	103 for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {

105 const int mi_col = c << MI_BLOCK_SIZE_LOG2;	104 const int r = mi_row >> MI_BLOCK_SIZE_LOG2;

	105 const int c = mi_col >> MI_BLOCK_SIZE_LOG2;

106 LOOP_FILTER_MASK lfm;	106 LOOP_FILTER_MASK lfm;

107 int plane;	107 int plane;

108	108

109 sync_read(lf_sync, r, c);	109 sync_read(lf_sync, r, c);

110	110

111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);	111 vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

112 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);	112

	113 // TODO(JBB): Make setup_mask work for non 420.

	114 if (use_420)

	115 vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,

	116 &lfm);

113	117

114 for (plane = 0; plane < num_planes; ++plane) {	118 for (plane = 0; plane < num_planes; ++plane) {

115 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);	119 if (use_420)

	120 vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);

	121 else

	122 vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,

	123 mi_row, mi_col);

116 }	124 }

117	125

118 sync_write(lf_sync, r, c, sb_cols);	126 sync_write(lf_sync, r, c, sb_cols);

119 }	127 }

120 }	128 }

121 }	129 }

122	130

123 // Row-based multi-threaded loopfilter hook	131 // Row-based multi-threaded loopfilter hook

124 static int loop_filter_row_worker(VP9LfSync *const lf_sync,	132 static int loop_filter_row_worker(VP9LfSync *const lf_sync,

125 LFWorkerData *const lf_data) {	133 LFWorkerData *const lf_data) {

126 loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,	134 thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,

127 lf_data->start, lf_data->stop, lf_data->y_only, lf_sync);	135 lf_data->start, lf_data->stop, lf_data->y_only,

	136 lf_sync);

128 return 1;	137 return 1;

129 }	138 }

130	139

131 // VP9 decoder: Implement multi-threaded loopfilter that uses the tile	140 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,

132 // threads.	141 VP9_COMMON *cm,

133 void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync,	142 struct macroblockd_plane planes[MAX_MB_PLANE],

134 YV12_BUFFER_CONFIG *frame,	143 int start, int stop, int y_only,

135 struct macroblockd_plane planes[MAX_MB_PLANE],	144 VP9Worker *workers, int nworkers,

136 VP9_COMMON *cm,	145 VP9LfSync *lf_sync) {

137 VP9Worker *workers, int nworkers,

138 int frame_filter_level,

139 int y_only) {

140 const VP9WorkerInterface *const winterface = vp9_get_worker_interface();	146 const VP9WorkerInterface *const winterface = vp9_get_worker_interface();

141 // Number of superblock rows and cols	147 // Number of superblock rows and cols

142 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;	148 const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;

	149 // Decoder may allocate more threads than number of tiles based on user's

	150 // input.

143 const int tile_cols = 1 << cm->log2_tile_cols;	151 const int tile_cols = 1 << cm->log2_tile_cols;

144 const int num_workers = MIN(nworkers, tile_cols);	152 const int num_workers = MIN(nworkers, tile_cols);

145 int i;	153 int i;

146	154

147 if (!frame_filter_level) return;

148

149 if (!lf_sync->sync_range || cm->last_height != cm->height ||	155 if (!lf_sync->sync_range || cm->last_height != cm->height ||

150 num_workers > lf_sync->num_workers) {	156 num_workers > lf_sync->num_workers) {

151 vp9_loop_filter_dealloc(lf_sync);	157 vp9_loop_filter_dealloc(lf_sync);

152 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);	158 vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);

153 }	159 }

154	160

155 vp9_loop_filter_frame_init(cm, frame_filter_level);

156

157 // Initialize cur_sb_col to -1 for all SB rows.	161 // Initialize cur_sb_col to -1 for all SB rows.

158 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);	162 vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

159	163

160 // Set up loopfilter thread data.	164 // Set up loopfilter thread data.

161 // The decoder is capping num_workers because it has been observed that using	165 // The decoder is capping num_workers because it has been observed that using

162 // more threads on the loopfilter than there are cores will hurt performance	166 // more threads on the loopfilter than there are cores will hurt performance

163 // on Android. This is because the system will only schedule the tile decode	167 // on Android. This is because the system will only schedule the tile decode

164 // workers on cores equal to the number of tile columns. Then if the decoder	168 // workers on cores equal to the number of tile columns. Then if the decoder

165 // tries to use more threads for the loopfilter, it will hurt performance	169 // tries to use more threads for the loopfilter, it will hurt performance

166 // because of contention. If the multithreading code changes in the future	170 // because of contention. If the multithreading code changes in the future

167 // then the number of workers used by the loopfilter should be revisited.	171 // then the number of workers used by the loopfilter should be revisited.

168 for (i = 0; i < num_workers; ++i) {	172 for (i = 0; i < num_workers; ++i) {

169 VP9Worker *const worker = &workers[i];	173 VP9Worker *const worker = &workers[i];

170 LFWorkerData *const lf_data = &lf_sync->lfdata[i];	174 LFWorkerData *const lf_data = &lf_sync->lfdata[i];

171	175

172 worker->hook = (VP9WorkerHook)loop_filter_row_worker;	176 worker->hook = (VP9WorkerHook)loop_filter_row_worker;

173 worker->data1 = lf_sync;	177 worker->data1 = lf_sync;

174 worker->data2 = lf_data;	178 worker->data2 = lf_data;

175	179

176 // Loopfilter data	180 // Loopfilter data

177 vp9_loop_filter_data_reset(lf_data, frame, cm, planes);	181 vp9_loop_filter_data_reset(lf_data, frame, cm, planes);

178 lf_data->start = i;	182 lf_data->start = start + i * MI_BLOCK_SIZE;

179 lf_data->stop = sb_rows;	183 lf_data->stop = stop;

180 lf_data->y_only = y_only;	184 lf_data->y_only = y_only;

181	185

182 // Start loopfiltering	186 // Start loopfiltering

183 if (i == num_workers - 1) {	187 if (i == num_workers - 1) {

184 winterface->execute(worker);	188 winterface->execute(worker);

185 } else {	189 } else {

186 winterface->launch(worker);	190 winterface->launch(worker);

187 }	191 }

188 }	192 }

189	193

190 // Wait till all rows are finished	194 // Wait till all rows are finished

191 for (i = 0; i < num_workers; ++i) {	195 for (i = 0; i < num_workers; ++i) {

192 winterface->sync(&workers[i]);	196 winterface->sync(&workers[i]);

193 }	197 }

194 }	198 }

195	199

	200 void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,

	201 VP9_COMMON *cm,

	202 struct macroblockd_plane planes[MAX_MB_PLANE],

	203 int frame_filter_level,

	204 int y_only, int partial_frame,

	205 VP9Worker *workers, int num_workers,

	206 VP9LfSync *lf_sync) {

	207 int start_mi_row, end_mi_row, mi_rows_to_filter;

	208

	209 if (!frame_filter_level) return;

	210

	211 start_mi_row = 0;

	212 mi_rows_to_filter = cm->mi_rows;

	213 if (partial_frame && cm->mi_rows > 8) {

	214 start_mi_row = cm->mi_rows >> 1;

	215 start_mi_row &= 0xfffffff8;

	216 mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);

	217 }

	218 end_mi_row = start_mi_row + mi_rows_to_filter;

	219 vp9_loop_filter_frame_init(cm, frame_filter_level);

	220

	221 loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,

	222 y_only, workers, num_workers, lf_sync);

	223 }

	224

196 // Set up nsync by width.	225 // Set up nsync by width.

197 static int get_sync_range(int width) {	226 static INLINE int get_sync_range(int width) {

198 // nsync numbers are picked by testing. For example, for 4k	227 // nsync numbers are picked by testing. For example, for 4k

199 // video, using 4 gives best performance.	228 // video, using 4 gives best performance.

200 if (width < 640)	229 if (width < 640)

201 return 1;	230 return 1;

202 else if (width <= 1280)	231 else if (width <= 1280)

203 return 2;	232 return 2;

204 else if (width <= 4096)	233 else if (width <= 4096)

205 return 4;	234 return 4;

206 else	235 else

207 return 8;	236 return 8;

(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
263 vpx_free(lf_sync->cond_);	292 vpx_free(lf_sync->cond_);

264 }	293 }

265 #endif // CONFIG_MULTITHREAD	294 #endif // CONFIG_MULTITHREAD

266 vpx_free(lf_sync->lfdata);	295 vpx_free(lf_sync->lfdata);

267 vpx_free(lf_sync->cur_sb_col);	296 vpx_free(lf_sync->cur_sb_col);

268 // clear the structure as the source of this call may be a resize in which	297 // clear the structure as the source of this call may be a resize in which

269 // case this call will be followed by an _alloc() which may fail.	298 // case this call will be followed by an _alloc() which may fail.

270 vp9_zero(*lf_sync);	299 vp9_zero(*lf_sync);

271 }	300 }

272 }	301 }

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/common/vp9_loopfilter_thread.h ('k') | source/libvpx/vp9/common/vp9_onyxc_int.h » ('j') | no next file with comments »