OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include "third_party/libyuv/include/libyuv/scale.h" | |
12 | |
13 #include <assert.h> | |
14 #include <string.h> | |
15 | |
16 #include "third_party/libyuv/include/libyuv/cpu_id.h" | |
17 #include "third_party/libyuv/source/row.h" | |
18 | |
19 #ifdef __cplusplus | |
20 namespace libyuv { | |
21 extern "C" { | |
22 #endif | |
23 | |
24 /* | |
25 * Note: Defining YUV_DISABLE_ASM allows to use c version. | |
26 */ | |
27 //#define YUV_DISABLE_ASM | |
28 | |
29 #if defined(_MSC_VER) | |
30 #define ALIGN16(var) __declspec(align(16)) var | |
31 #else | |
32 #define ALIGN16(var) var __attribute__((aligned(16))) | |
33 #endif | |
34 | |
35 // Note: A Neon reference manual | |
36 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG
.html | |
37 // Note: Some SSE2 reference manuals | |
38 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf | |
39 | |
40 // Set the following flag to true to revert to only | |
41 // using the reference implementation ScalePlaneBox(), and | |
42 // NOT the optimized versions. Useful for debugging and | |
43 // when comparing the quality of the resulting YUV planes | |
44 // as produced by the optimized and non-optimized versions. | |
45 | |
// When non-zero, the plain-C reference scalers (e.g. ScalePlaneBox) are used
// instead of the optimized NEON/SSE2 row functions. See the comment above.
static int use_reference_impl_ = 0;

// Select between the reference implementation and the optimized paths.
// use: non-zero forces the reference (unoptimized) implementation.
void SetUseReferenceImpl(int use) {
  use_reference_impl_ = use;
}
51 | |
52 // ScaleRowDown2Int also used by planar functions | |
53 | |
54 /** | |
55 * NEON downscalers with interpolation. | |
56 * | |
57 * Provided by Fritz Koenig | |
58 * | |
59 */ | |
60 | |
61 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) | |
62 #define HAS_SCALEROWDOWN2_NEON | |
// Halves a row by point sampling: keeps the even-indexed pixels, drops the
// odd ones. Processes 16 output pixels (32 source pixels) per iteration;
// assumes dst_width is a multiple of 16 -- TODO confirm with callers.
// src_stride is unused (single-row, horizontal-only operation).
void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld2.u8    {q0,q1}, [%0]!                 \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
    "subs       %2, %2, #16                    \n"  // 16 processed per loop
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst),              // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1"              // Clobber List
  );
}
78 | |
// Halves a row with 2x2 box filtering: each output pixel is the rounded
// average of a 2x2 block spanning two adjacent source rows.
// Processes 16 output pixels per iteration.
void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    "add        %1, %0                         \n"  // change the stride to row 2 pointer
    "1:                                        \n"
    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1                         \n"
    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent, add row 1 to row 2
    "vpadal.u8  q1, q3                         \n"
    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2                     \n"
    "vst1.u8    {q0}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(src_stride),       // %1
      "+r"(dst),              // %2
      "+r"(dst_width)         // %3
    :
    : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
103 | |
104 #define HAS_SCALEROWDOWN4_NEON | |
// Point samples a row down by 4: keeps 1 of every 4 source pixels.
// Writes 4 output pixels (reads 16 source bytes) per iteration.
// src_stride is unused (horizontal-only operation).
static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld2.u8    {d0, d1}, [%0]!                \n"  // even bytes -> d0, odd -> d1
    "vtrn.u8    d1, d0                         \n"
    "vshrn.u16  d0, q0, #8                     \n"  // keep 1 byte of each 4
    "vst1.u32   {d0[1]}, [%1]!                 \n"

    "subs       %2, #4                         \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1", "memory", "cc"
  );
}
123 | |
124 static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, | |
125 uint8* dst_ptr, int dst_width) { | |
126 asm volatile ( | |
127 "add r4, %0, %3 \n" | |
128 "add r5, r4, %3 \n" | |
129 "add %3, r5, %3 \n" | |
130 "1: \n" | |
131 "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of in
put data | |
132 "vld1.u8 {q1}, [r4]! \n" | |
133 "vld1.u8 {q2}, [r5]! \n" | |
134 "vld1.u8 {q3}, [%3]! \n" | |
135 | |
136 "vpaddl.u8 q0, q0 \n" | |
137 "vpadal.u8 q0, q1 \n" | |
138 "vpadal.u8 q0, q2 \n" | |
139 "vpadal.u8 q0, q3 \n" | |
140 | |
141 "vpaddl.u16 q0, q0 \n" | |
142 | |
143 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding | |
144 | |
145 "vmovn.u16 d0, q0 \n" | |
146 "vst1.u32 {d0[0]}, [%1]! \n" | |
147 | |
148 "subs %2, #4 \n" | |
149 "bhi 1b \n" | |
150 | |
151 : "+r"(src_ptr), // %0 | |
152 "+r"(dst_ptr), // %1 | |
153 "+r"(dst_width) // %2 | |
154 : "r"(src_stride) // %3 | |
155 : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" | |
156 ); | |
157 } | |
158 | |
159 #define HAS_SCALEROWDOWN34_NEON | |
// Down scale from 4 to 3 pixels. Uses the NEON multilane read/write to load
// every 4th pixel into a different register. Point samples 32 pixels to 24
// per iteration (the 4th lane, d3, overwrites d2 and is dropped).
// src_stride is unused (single-row operation).
static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "vmov       d2, d3                         \n"  // order needs to be d0, d1, d2
    "vst3.u8    {d0, d1, d2}, [%1]!            \n"
    "subs       %2, #24                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}
179 | |
// 4->3 downscale with filtering: blends the two source rows with a 3:1
// weighting (3 * line_0 + line_1, rounded) before the horizontal 3/4
// filter. Produces 24 output pixels from 32 source pixels per iteration.
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8    d24, #3                        \n"  // weight for the 3:1 blends
    "add        %3, %0                         \n"  // %3 -> src line 1
    "1:                                        \n"
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n" // src line 0
    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n" // src line 1

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8   q8, d4                         \n"
    "vmovl.u8   q9, d5                         \n"
    "vmovl.u8   q10, d6                        \n"
    "vmovl.u8   q11, d7                        \n"

    // 3 * line_0 + line_1
    "vmlal.u8   q8, d0, d24                    \n"
    "vmlal.u8   q9, d1, d24                    \n"
    "vmlal.u8   q10, d2, d24                   \n"
    "vmlal.u8   q11, d3, d24                   \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2                    \n"
    "vqrshrn.u16 d1, q9, #2                    \n"
    "vqrshrn.u16 d2, q10, #2                   \n"
    "vqrshrn.u16 d3, q11, #2                   \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8   q8, d1                         \n"
    "vmlal.u8   q8, d0, d24                    \n"
    "vqrshrn.u16 d0, q8, #2                    \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8  d1, d1, d2                     \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8   q8, d2                         \n"
    "vmlal.u8   q8, d3, d24                    \n"
    "vqrshrn.u16 d2, q8, #2                    \n"

    "vst3.u8    {d0, d1, d2}, [%1]!            \n"

    "subs       %2, #24                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    :
    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
234 | |
235 static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, | |
236 uint8* dst_ptr, int dst_width) { | |
237 asm volatile ( | |
238 "vmov.u8 d24, #3 \n" | |
239 "add %3, %0 \n" | |
240 "1: \n" | |
241 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 | |
242 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 | |
243 | |
244 // average src line 0 with src line 1 | |
245 "vrhadd.u8 q0, q0, q2 \n" | |
246 "vrhadd.u8 q1, q1, q3 \n" | |
247 | |
248 // a0 = (src[0] * 3 + s[1] * 1) >> 2 | |
249 "vmovl.u8 q3, d1 \n" | |
250 "vmlal.u8 q3, d0, d24 \n" | |
251 "vqrshrn.u16 d0, q3, #2 \n" | |
252 | |
253 // a1 = (src[1] * 1 + s[2] * 1) >> 1 | |
254 "vrhadd.u8 d1, d1, d2 \n" | |
255 | |
256 // a2 = (src[2] * 1 + s[3] * 3) >> 2 | |
257 "vmovl.u8 q3, d2 \n" | |
258 "vmlal.u8 q3, d3, d24 \n" | |
259 "vqrshrn.u16 d2, q3, #2 \n" | |
260 | |
261 "vst3.u8 {d0, d1, d2}, [%1]! \n" | |
262 | |
263 "subs %2, #24 \n" | |
264 "bhi 1b \n" | |
265 : "+r"(src_ptr), // %0 | |
266 "+r"(dst_ptr), // %1 | |
267 "+r"(dst_width), // %2 | |
268 "+r"(src_stride) // %3 | |
269 : | |
270 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" | |
271 ); | |
272 } | |
273 | |
274 #define HAS_SCALEROWDOWN38_NEON | |
// vtbl index table: picks 12 of 32 bytes (3 of every 8) for the 3/8
// point sampler.
const uint8 shuf38[16] __attribute__ ((aligned(16))) =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
// vtbl index table used by the filtering 3/8 scalers to gather the final
// 12 result bytes out of the three working registers.
const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
// Fixed-point multiplier 65536/12; vqrdmulh doubles the product, so this
// acts as a divide-by-6 for sums of 6 pixels (2 columns x 3 rows).
const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
// Fixed-point multiplier 65536/18; with vqrdmulh doubling this acts as a
// divide-by-9 for sums of 9 pixels (3 columns x 3 rows).
const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
285 | |
// 32 -> 12: point samples at 3/8 scale using a vtbl table lookup with the
// shuf38 index table. src_stride is unused (single-row operation).
static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u8    {q3}, [%3]                     \n"  // q3 = shuf38 indices
    "1:                                        \n"
    "vld1.u8    {d0, d1, d2, d3}, [%0]!      \n"
    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
    "vst1.u8    {d4}, [%1]!                    \n"  // store 8 + 4 = 12 pixels
    "vst1.u32   {d5[0]}, [%1]!                 \n"
    "subs       %2, #12                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    : "r"(shuf38)             // %3
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}
306 | |
// 32x3 -> 12x1: 3/8 box filter over three source rows. Sums of 9 pixels
// (3 columns x 3 rows) are scaled by mult38_div9 and sums of 6 pixels
// (2 columns x 3 rows) by mult38_div6 via vqrdmulh (which doubles the
// product, making them effective divides by 9 and 6).
static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u16   {q13}, [%4]                    \n"  // q13 = 65536/12
    "vld1.u8    {q14}, [%5]                    \n"  // q14 = shuf38_2
    "vld1.u8    {q15}, [%6]                    \n"  // q15 = 65536/18
    "add        r4, %0, %3, lsl #1             \n"  // r4 -> src line 2
    "add        %3, %0                         \n"  // %3 -> src line 1
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n"
    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n"
    "vld4.u8    {d16, d17, d18, d19}, [r4]!  \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"
    "vtrn.u8    d16, d17                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"
    "vtrn.u8    d18, d19                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"
    "vpaddl.u8  q8, q8                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"
    "vpaddl.u8  d19, d19                       \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   q0, q8                         \n"
    "vadd.u16   d4, d3, d7                     \n"
    "vadd.u16   d4, d19                        \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q13                      \n"
    "vmovn.u16  d4, q2                         \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg.  This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded.  Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"
    "vmovl.u8   q9, d18                        \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"
    "vadd.u16   q1, q9                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2.  So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q15                      \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.u8    {d3}, [%1]!                    \n"
    "vst1.u32   {d4[0]}, [%1]!                 \n"
    "subs       %2, #12                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    : "r"(mult38_div6),       // %4
      "r"(shuf38_2),          // %5
      "r"(mult38_div9)        // %6
    : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
      "q13", "q14", "q15", "memory", "cc"
  );
}
415 | |
// 32x2 -> 12x1: 3/8 box filter over two source rows. Sums of 6 pixels
// (3 columns x 2 rows) are scaled by mult38_div6 via vqrdmulh (which
// doubles the product, making it an effective divide by 6); sums of
// 4 pixels (2 columns x 2 rows) use a plain rounding shift by 2.
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.u16   {q13}, [%4]                    \n"  // q13 = 65536/12
    "vld1.u8    {q14}, [%5]                    \n"  // q14 = shuf38_2
    "add        %3, %0                         \n"  // %3 -> src line 1
    "1:                                        \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.u8    {d0, d1, d2, d3}, [%0]!      \n"
    "vld4.u8    {d4, d5, d6, d7}, [%3]!      \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8    d0, d1                         \n"
    "vtrn.u8    d4, d5                         \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8    d2, d3                         \n"
    "vtrn.u8    d6, d7                         \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8  q0, q0                         \n"
    "vpaddl.u8  q2, q2                         \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8  d3, d3                         \n"
    "vpaddl.u8  d7, d7                         \n"

    // combine source lines
    "vadd.u16   q0, q2                         \n"
    "vadd.u16   d4, d3, d7                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2                    \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg.  This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded.  Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8   q1, d2                         \n"
    "vmovl.u8   q3, d6                         \n"

    // combine source lines
    "vadd.u16   q1, q3                         \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32   d2, d3                         \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16   d2, d3                         \n"

    // 0+1+2, 3+4+5
    "vadd.u16   q0, q1                         \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2.  So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q13                      \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8    d2, d4                         \n"

    "vtbl.u8    d3, {d0, d1, d2}, d28          \n"
    "vtbl.u8    d4, {d0, d1, d2}, d29          \n"

    "vst1.u8    {d3}, [%1]!                    \n"
    "vst1.u32   {d4[0]}, [%1]!                 \n"
    "subs       %2, #12                        \n"
    "bhi        1b                             \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    : "r"(mult38_div6),       // %4
      "r"(shuf38_2)           // %5
    : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
508 | |
509 /** | |
510 * SSE2 downscalers with interpolation. | |
511 * | |
512 * Provided by Frank Barchard (fbarchard@google.com) | |
513 * | |
514 */ | |
515 | |
516 // Constants for SSE2 code | |
517 #elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ | |
518 !defined(YUV_DISABLE_ASM) | |
519 #if defined(_MSC_VER) | |
520 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var | |
521 #elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && def
ined(__i386__) | |
522 #define TALIGN16(t, var) t var __attribute__((aligned(16))) | |
523 #else | |
524 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) | |
525 #endif | |
526 | |
527 #if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \ | |
528 defined(__i386__) | |
529 #define DECLARE_FUNCTION(name) \ | |
530 ".text \n" \ | |
531 ".globl _" #name " \n" \ | |
532 "_" #name ": \n" | |
533 #else | |
534 #define DECLARE_FUNCTION(name) \ | |
535 ".text \n" \ | |
536 ".global " #name " \n" \ | |
537 #name ": \n" | |
538 #endif | |
539 | |
540 | |
// Shuffle index and multiply-add coefficient tables for the x86 row scalers
// below. Note: TALIGN16 prepends an underscore to the symbol on most
// compilers (see macro above), so asm code references them as _shuf0 etc.

// Offsets for source bytes 0 to 9
//extern "C"
TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
//extern "C"
TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
//extern "C"
TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
//extern "C"
TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
//extern "C"
TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
//extern "C"
TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
//extern "C"
TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 filters: (sum + 2) >> 2.
//extern "C"
TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

// Gather bytes 0,3,6,8,11,14 into the low 6 bytes (3/8 point sample).
//extern "C"
TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Same gather into bytes 6..11 so the two halves can be OR'd together.
//extern "C"
TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
//extern "C"
TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
//extern "C"
TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
//extern "C"
TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
//extern "C"
TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
633 #endif | |
634 | |
635 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER) | |
636 | |
637 #define HAS_SCALEROWDOWN2_SSE2 | |
// Reads 32 pixels, throws half away and writes 16 pixels (horizontal 1/2
// point sample: the 0x00ff00ff mask keeps the low/even byte of each word).
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5            // keep even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1            // pack 16 words down to 16 bytes
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}
// Blends 32x2 rectangle to 16x1: each output pixel is the average of a 2x2
// box (rows averaged with pavgb, then adjacent columns with pavgw).
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8               // odd bytes as words
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5            // even bytes as words
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    pop        esi
    ret
  }
}
708 | |
709 #define HAS_SCALEROWDOWN4_SSE2 | |
// Point samples 32 pixels to 8 pixels: the 0x000000ff mask keeps 1 byte of
// each dword, then two pack steps narrow dwords to bytes.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
    psrld      xmm5, 24

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}
741 | |
// Blends 32x4 rectangle to 8x1: averages 4 rows pairwise with pavgb, then
// averages columns twice (32 -> 16 -> 8 pixels) with the mask/pavgw trick.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8
    lea        edx, [ebx + ebx * 2]  // src_stride * 3

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}
799 | |
800 #define HAS_SCALEROWDOWN8_SSE2 | |
// Point samples 32 pixels to 4 pixels: the mask (all-ones >> 56) keeps the
// first byte of each 8, then three pack steps narrow qwords to bytes.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
    psrlq      xmm5, 56

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1            // 32->16
    packuswb   xmm0, xmm0            // 16->8
    packuswb   xmm0, xmm0            // 8->4
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
833 | |
// Blends 32x8 rectangle to 4x1: averages 8 rows down to 1 with a pavgb
// tree, then psadbw sums each 8-byte group and a shift by 3 completes the
// average of 8 columns.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    lea        edx, [ebx + ebx * 2]  // src_stride * 3
    pxor       xmm7, xmm7            // zero, for psadbw

  wloop:
    movdqa     xmm0, [esi]           // average 8 rows to 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        ebp, [esi + ebx * 4]  // ebp -> rows 4..7 (restored by popad)
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, [ebp]
    movdqa     xmm3, [ebp + 16]
    movdqa     xmm4, [ebp + ebx]
    movdqa     xmm5, [ebp + ebx + 16]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    movdqa     xmm4, [ebp + ebx * 2]
    movdqa     xmm5, [ebp + ebx * 2 + 16]
    movdqa     xmm6, [ebp + edx]
    pavgb      xmm4, xmm6
    movdqa     xmm6, [ebp + edx + 16]
    pavgb      xmm5, xmm6
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    psadbw     xmm0, xmm7            // average 32 pixels to 4
    psadbw     xmm1, xmm7
    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    por        xmm0, xmm1            //      -> 3201
    psrlw      xmm0, 3               // sum of 8 bytes -> average
    packuswb   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0

    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
901 | |
902 #define HAS_SCALEROWDOWN34_SSSE3 | |
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm3, _shuf0
    movdqa     xmm4, _shuf1
    movdqa     xmm5, _shuf2

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edi], xmm0
    movq       qword ptr [edi + 8], xmm1
    movq       qword ptr [edi + 16], xmm2
    lea        edi, [edi + 24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
942 | |
// Blends 32x2 rectangle to 24x1: rows are averaged 1:1 with pavgb, then the
// horizontal 3/4 filter is applied via pshufb + pmaddubsw with the shuf/madd
// tables, rounded by round34 and shifted by 2.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 round34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm0, xmm1            // average the two rows
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7            // round: (x + 2) >> 2
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}
1013 | |
1014 // Note that movdqa+palign may be better than movdqu. | |
1015 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | |
// 3/4 point downscaler with interpolation, 3:1 vertical weighting: the
// double pavgb gives row0 weight 3/4 and row1 weight 1/4
// (xmm1 = (r0+r1)/2, then xmm0 = (r0+xmm1)/2 ~= (3*r0+r1)/4), before the
// same 32 -> 24 horizontal filter as ScaleRowDown34_1_Int_SSSE3.
__declspec(naked)
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov ebx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm2, _shuf01
    movdqa xmm3, _shuf11
    movdqa xmm4, _shuf21
    movdqa xmm5, _madd01
    movdqa xmm6, _madd11
    movdqa xmm7, _round34

  wloop:
    movdqa xmm0, [esi]         // pixels 0..7
    movdqa xmm1, [esi+ebx]
    pavgb xmm1, xmm0           // xmm1 = (row0 + row1) / 2
    pavgb xmm0, xmm1           // xmm0 = (3*row0 + row1) / 4
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edi], xmm0
    movdqu xmm0, [esi+8]       // pixels 8..15 (unaligned load)
    movdqu xmm1, [esi+ebx+8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edi+8], xmm0
    movdqa xmm0, [esi+16]      // pixels 16..23
    movdqa xmm1, [esi+ebx+16]
    lea esi, [esi+32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, _madd21       // loaded late: all 8 xmm regs already in use
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edi+16], xmm0
    lea edi, [edi+24]
    sub ecx, 24                // 32 src pixels -> 24 dst pixels per pass
    ja wloop

    popad
    ret
  }
}
1073 | |
1074 #define HAS_SCALEROWDOWN38_SSSE3 | |
1075 // 3/8 point sampler | |
1076 | |
1077 // Scale 32 pixels to 12 | |
// 3/8 point sampler: 32 source pixels -> 12 output pixels, no vertical
// blend (src_stride is loaded but unused).  Each shuffle mask selects 6
// surviving pixels from its 16-byte half; presumably the masks place the
// selections in disjoint output lanes so paddusb merges them -- see the
// _shuf38a/_shuf38b tables defined elsewhere in this file.
__declspec(naked)
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride (unused by the point sampler)
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm4, _shuf38a
    movdqa xmm5, _shuf38b

  xloop:
    movdqa xmm0, [esi]         // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [esi + 16]    // 16 pixels -> 6,7,8,9,10,11
    lea esi, [esi + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    movq qword ptr [edi], xmm0 // write 12 pixels (8 + 4)
    movhlps xmm1, xmm0
    movd [edi + 8], xmm1
    lea edi, [edi + 12]
    sub ecx, 12
    ja xloop

    popad
    ret
  }
}
1109 | |
1110 // Scale 16x3 pixels to 6x1 with interpolation | |
// Scale 16x3 pixels to 6x1 with interpolation: widens and sums three
// source rows into 16-bit words, adds each group of 3 adjacent horizontal
// sums (psrldq + paddusw twice), then multiplies by the _scaleac3
// reciprocals with pmulhuw to divide the 9- or 6-sample box sums.
// paddusw saturates rather than wraps, so overflowing sums clamp.
__declspec(naked)
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm4, _shufac0
    movdqa xmm5, _shufac3
    movdqa xmm6, _scaleac3
    pxor xmm7, xmm7            // zero register for byte -> word unpacking

  xloop:
    movdqa xmm0, [esi]         // sum up 3 rows into xmm0/1
    movdqa xmm2, [esi + edx]
    movhlps xmm1, xmm0
    movhlps xmm3, xmm2
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddusw xmm0, xmm2
    paddusw xmm1, xmm3
    movdqa xmm2, [esi + edx * 2]
    lea esi, [esi + 16]
    movhlps xmm3, xmm2
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddusw xmm0, xmm2
    paddusw xmm1, xmm3

    movdqa xmm2, xmm0          // 8 pixels -> 0,1,2 of xmm2
    psrldq xmm0, 2             // shift in the next word and add, twice:
    paddusw xmm2, xmm0         // each word becomes a sum of 3 neighbors
    psrldq xmm0, 2
    paddusw xmm2, xmm0
    pshufb xmm2, xmm4

    movdqa xmm3, xmm1          // 8 pixels -> 3,4,5 of xmm2
    psrldq xmm1, 2
    paddusw xmm3, xmm1
    psrldq xmm1, 2
    paddusw xmm3, xmm1
    pshufb xmm3, xmm5
    paddusw xmm2, xmm3

    pmulhuw xmm2, xmm6         // divide by 9,9,6, 9,9,6
    packuswb xmm2, xmm2

    movd [edi], xmm2           // write 6 pixels (4 + 2)
    pextrw eax, xmm2, 2
    mov [edi + 4], ax
    lea edi, [edi + 6]
    sub ecx, 6
    ja xloop

    popad
    ret
  }
}
1173 | |
1174 // Scale 16x2 pixels to 6x1 with interpolation | |
// Scale 16x2 pixels to 6x1 with interpolation: averages the two rows with
// pavgb, gathers each output's horizontal neighbors via three shuffle
// masks and saturating adds, then pmulhuw by the _scaleab2 reciprocals to
// divide by the 3,3,2 box sizes.
__declspec(naked)
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // dst_width
    movdqa xmm4, _shufab0
    movdqa xmm5, _shufab1
    movdqa xmm6, _shufab2
    movdqa xmm7, _scaleab2

  xloop:
    movdqa xmm2, [esi]         // average 2 rows into xmm2
    pavgb xmm2, [esi + edx]
    lea esi, [esi + 16]

    movdqa xmm0, xmm2          // 16 pixels -> 0,1,2,3,4,5 of xmm0
    pshufb xmm0, xmm4
    movdqa xmm1, xmm2
    pshufb xmm1, xmm5
    paddusw xmm0, xmm1
    pshufb xmm2, xmm6
    paddusw xmm0, xmm2

    pmulhuw xmm0, xmm7         // divide by 3,3,2, 3,3,2
    packuswb xmm0, xmm0

    movd [edi], xmm0           // write 6 pixels (4 + 2)
    pextrw eax, xmm0, 2
    mov [edi + 4], ax
    lea edi, [edi + 6]
    sub ecx, 6
    ja xloop

    popad
    ret
  }
}
1216 | |
1217 #define HAS_SCALEADDROWS_SSE2 | |
1218 | |
// Reads 16xN bytes and produces 16 shorts at a time.
// Sums src_height rows of 16 source bytes into 16 uint16 column sums per
// pass, writing 32 bytes of word sums to dst_ptr per 16 input columns.
// The first row is loaded outside the inner loop and the inner loop is
// do-while style, so at least one additional row is always read.
// NOTE(review): that means src_height must be >= 2 -- confirm callers.
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  __asm {
    pushad
    mov esi, [esp + 32 + 4]    // src_ptr
    mov edx, [esp + 32 + 8]    // src_stride
    mov edi, [esp + 32 + 12]   // dst_ptr
    mov ecx, [esp + 32 + 16]   // src_width
    mov ebx, [esp + 32 + 20]   // height
    pxor xmm5, xmm5            // zero register for byte -> word unpacking
    dec ebx                    // inner loop runs height - 1 times

  xloop:
    // first row
    movdqa xmm2, [esi]
    lea eax, [esi + edx]
    movhlps xmm3, xmm2
    mov ebp, ebx
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5

    // sum remaining rows (saturating adds: sums clamp at 65535)
    yloop:
    movdqa xmm0, [eax]         // read 16 pixels
    lea eax, [eax + edx]       // advance to next row
    movhlps xmm1, xmm0
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    paddusw xmm2, xmm0         // sum 16 words
    paddusw xmm3, xmm1
    sub ebp, 1
    ja yloop

    movdqa [edi], xmm2
    movdqa [edi + 16], xmm3
    lea edi, [edi + 32]
    lea esi, [esi + 16]

    sub ecx, 16
    ja xloop

    popad
    ret
  }
}
1267 | |
1268 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. | |
1269 #define HAS_SCALEFILTERROWS_SSE2 | |
// Blends two adjacent source rows into one output row, weighting the
// second row by source_y_fraction/256.  Fast paths: fraction 0 copies
// row 0 (xloop1), fraction 128 uses pavgb (xloop2); otherwise pixels are
// widened to 16 bits, multiplied by (256 - f) and f, summed and shifted.
// After each loop the last output byte is replicated one past the end,
// so dst_ptr receives dst_width + 1 bytes.
__declspec(naked)
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]   // dst_ptr
    mov esi, [esp + 8 + 8]   // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp eax, 0
    je xloop1                // fraction 0: plain copy of row 0
    cmp eax, 128
    je xloop2                // fraction 1/2: pavgb the two rows

    movd xmm6, eax           // xmm6 = y fraction
    punpcklwd xmm6, xmm6     // broadcast to all 8 word lanes
    pshufd xmm6, xmm6, 0
    neg eax                  // xmm5 = 256 - y fraction
    add eax, 256
    movd xmm5, eax
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0
    pxor xmm7, xmm7          // zero register for byte -> word unpacking

  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    punpcklbw xmm0, xmm7
    punpcklbw xmm2, xmm7
    punpckhbw xmm1, xmm7
    punpckhbw xmm3, xmm7
    pmullw xmm0, xmm5        // scale row 0
    pmullw xmm1, xmm5
    pmullw xmm2, xmm6        // scale row 1
    pmullw xmm3, xmm6
    paddusw xmm0, xmm2       // sum rows
    paddusw xmm1, xmm3
    psrlw xmm0, 8            // divide by 256 and pack back to bytes
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop

    mov al, [edi - 1]        // duplicate last pixel one past the end
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop1:                    // fraction == 0: copy row 0 unchanged
    movdqa xmm0, [esi]
    lea esi, [esi + 16]
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop1

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop2:                    // fraction == 128: average the two rows
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    pavgb xmm0, xmm2
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop2

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret
  }
}
1358 | |
1359 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. | |
1360 #define HAS_SCALEFILTERROWS_SSSE3 | |
// SSSE3 row blend: same contract as ScaleFilterRows_SSE2 but halves the
// fraction to 7 bits and uses pmaddubsw on interleaved row bytes, so each
// output is (r0*(128-f) + r1*f) >> 7.  Fast paths for fraction 0 (copy)
// and 64 (pavgb).  Also writes dst_width + 1 bytes (last pixel repeated).
__declspec(naked)
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]   // dst_ptr
    mov esi, [esp + 8 + 8]   // src_ptr
    mov edx, [esp + 8 + 12]  // src_stride
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    shr eax, 1               // reduce to 7 bits (0..127) for pmaddubsw
    cmp eax, 0
    je xloop1                // fraction 0: plain copy of row 0
    cmp eax, 64
    je xloop2                // fraction 1/2: pavgb the two rows

    mov ah,al                // build byte pair: ah = f, al = 128 - f
    neg al
    add al, 128
    movd xmm5, eax
    punpcklwd xmm5, xmm5     // broadcast (128-f, f) to all byte pairs
    pshufd xmm5, xmm5, 0

  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2     // interleave row0/row1 bytes
    punpckhbw xmm1, xmm2
    pmaddubsw xmm0, xmm5     // r0*(128-f) + r1*f per pixel
    pmaddubsw xmm1, xmm5
    psrlw xmm0, 7            // divide by 128 and pack back to bytes
    psrlw xmm1, 7
    packuswb xmm0, xmm1
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop

    mov al, [edi - 1]        // duplicate last pixel one past the end
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop1:                    // fraction == 0: copy row 0 unchanged
    movdqa xmm0, [esi]
    lea esi, [esi + 16]
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop1

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret

  xloop2:                    // fraction == 1/2: average the two rows
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    lea esi, [esi + 16]
    pavgb xmm0, xmm2
    movdqa [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 16
    ja xloop2

    mov al, [edi - 1]
    mov [edi], al
    pop edi
    pop esi
    ret

  }
}
1441 | |
1442 // Note that movdqa+palign may be better than movdqu. | |
1443 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | |
// Horizontal-only 3/4 filter: 32 source pixels -> 24 output pixels using
// the same _shuf*/_madd*/_round34 tables as the Int variants, but with no
// vertical blend.  Only eax/ecx/edx (caller-saved) are used, so no
// registers need preserving.
__declspec(naked)
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
    mov edx, [esp + 4]       // dst_ptr
    mov eax, [esp + 8]       // src_ptr
    mov ecx, [esp + 12]      // dst_width
    movdqa xmm1, _round34
    movdqa xmm2, _shuf01
    movdqa xmm3, _shuf11
    movdqa xmm4, _shuf21
    movdqa xmm5, _madd01
    movdqa xmm6, _madd11
    movdqa xmm7, _madd21

  wloop:
    movdqa xmm0, [eax]       // pixels 0..7
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm1
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax+8]     // pixels 8..15 (unaligned load)
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm1
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx+8], xmm0
    movdqa xmm0, [eax+16]    // pixels 16..23
    lea eax, [eax+32]
    pshufb xmm0, xmm4
    pmaddubsw xmm0, xmm7
    paddsw xmm0, xmm1
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx+16], xmm0
    lea edx, [edx+24]
    sub ecx, 24              // 32 src pixels -> 24 dst pixels per pass
    ja wloop
    ret
  }
}
1488 | |
1489 #elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) | |
1490 | |
1491 // GCC versions of row functions are verbatim conversions from Visual C. | |
1492 // Generated using gcc disassembly on Visual C object file: | |
1493 // objdump -D yuvscaler.obj >yuvscaler.txt | |
1494 #define HAS_SCALEROWDOWN2_SSE2 | |
// Point-samples every other pixel: pcmpeqb + psrlw builds a 0x00FF word
// mask that keeps the even-indexed bytes, and packuswb compresses the
// surviving words back to bytes.  32 src bytes -> 16 dst bytes per pass.
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"   // xmm5 = all ones
    "psrlw $0x8,%%xmm5 \n"       // -> 0x00FF in every word lane
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep pixels 0,2,4,... as words
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
);
}
1518 | |
// Box-averages 2x2 pixel blocks: pavgb blends the two rows, then each
// horizontal pair is averaged with rounding by splitting odd bytes
// (psrlw 8) from even bytes (pand with 0x00FF) and combining with pavgw.
// 32 src bytes per row -> 16 dst bytes per pass.
static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"        // xmm5 = 0x00FF word mask
    "psrlw $0x8,%%xmm5 \n"
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%3,1),%%xmm2 \n"      // second row (src_ptr + stride)
    "movdqa 0x10(%0,%3,1),%%xmm3 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // vertical average of the rows
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"            // odd-indexed pixels
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"           // even-indexed pixels
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"          // horizontal average with rounding
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x10,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
);
}
1552 | |
1553 #define HAS_SCALEROWDOWN4_SSE2 | |
// Point-samples every fourth pixel: psrld 0x18 turns the all-ones mask
// into 0x000000FF per dword, keeping byte 0 of each group of 4; two
// packuswb passes compress to bytes.  32 src bytes -> 8 dst bytes.
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"   // xmm5 = all ones
    "psrld $0x18,%%xmm5 \n"      // -> 0x000000FF in every dword lane
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep pixels 0,4,8,...
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
);
}
1578 | |
// Box-averages 4x4 pixel blocks: pairwise pavgb of the 4 rows (a tree of
// averages, so rounding error is bounded but not a true sum/16), then the
// same odd/even split + pavgw trick as ScaleRowDown2Int applied twice to
// average 4 horizontal neighbors.  %3 holds src_stride * 3.
// NOTE(review): xmm0-xmm5 are modified but only xmm6/xmm7 appear in the
// x86_64 clobber list -- relies on the compiler not holding live values
// in the lower registers; consider listing them.
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  intptr_t temp = 0;
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"        // xmm7 = 0x00FF word mask
    "psrlw $0x8,%%xmm7 \n"
    "lea (%4,%4,2),%3 \n"             // %3 = src_stride * 3
    "1:"
    "movdqa (%0),%%xmm0 \n"           // rows 0 and 1
    "movdqa 0x10(%0),%%xmm1 \n"
    "movdqa (%0,%4,1),%%xmm2 \n"
    "movdqa 0x10(%0,%4,1),%%xmm3 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa (%0,%4,2),%%xmm2 \n"      // rows 2 and 3
    "movdqa 0x10(%0,%4,2),%%xmm3 \n"
    "movdqa (%0,%3,1),%%xmm4 \n"
    "movdqa 0x10(%0,%3,1),%%xmm5 \n"
    "lea 0x20(%0),%0 \n"
    "pavgb %%xmm4,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"          // vertical average of all 4 rows
    "pavgb %%xmm5,%%xmm3 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"         // first horizontal halving
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pand %%xmm7,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"         // second horizontal halving
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x8,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(temp)        // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc"
#if defined(__x86_64__)
    , "xmm6", "xmm7"
#endif
);
}
1631 | |
1632 #define HAS_SCALEROWDOWN8_SSE2 | |
// Point-samples every eighth pixel: psrlq 0x38 leaves 0x00000000000000FF
// per qword (byte 0 of each group of 8); three packuswb passes compress
// the 32 masked src bytes down to 4 dst bytes.
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"   // xmm5 = all ones
    "psrlq $0x38,%%xmm5 \n"      // -> 0x..00FF in every qword lane
    "1:"
    "movdqa (%0),%%xmm0 \n"
    "movdqa 0x10(%0),%%xmm1 \n"
    "lea 0x20(%0),%0 \n"
    "pand %%xmm5,%%xmm0 \n"      // keep pixels 0,8,16,24
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,(%1) \n"
    "lea 0x4(%1),%1 \n"
    "sub $0x4,%2 \n"
    "ja 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory", "cc"
);
}
1658 | |
1659 #if defined(__i386__) | |
// Box-averages 8x8 pixel blocks to single pixels: cascaded pavgb reduces
// the 8 rows to one averaged row pair, psadbw sums each group of 8 bytes
// horizontally against zero (xmm7), and psrlw 3 divides by 8.  Written as
// a naked global asm body (cdecl args at 0x24(%esp) after pusha saves 32
// bytes plus the 4-byte return address); %ebp is repurposed as a row
// pointer for rows 4..7.
void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
    "pusha \n"                             // save all GP registers
    "mov 0x24(%esp),%esi \n"               // src_ptr
    "mov 0x28(%esp),%ebx \n"               // src_stride
    "mov 0x2c(%esp),%edi \n"               // dst_ptr
    "mov 0x30(%esp),%ecx \n"               // dst_width
    "lea (%ebx,%ebx,2),%edx \n"            // edx = src_stride * 3
    "pxor %xmm7,%xmm7 \n"                  // zero, for psadbw

"1:"
    "movdqa (%esi),%xmm0 \n"               // rows 0..3 averaged into xmm0/1
    "movdqa 0x10(%esi),%xmm1 \n"
    "movdqa (%esi,%ebx,1),%xmm2 \n"
    "movdqa 0x10(%esi,%ebx,1),%xmm3 \n"
    "pavgb %xmm2,%xmm0 \n"
    "pavgb %xmm3,%xmm1 \n"
    "movdqa (%esi,%ebx,2),%xmm2 \n"
    "movdqa 0x10(%esi,%ebx,2),%xmm3 \n"
    "movdqa (%esi,%edx,1),%xmm4 \n"
    "movdqa 0x10(%esi,%edx,1),%xmm5 \n"
    "lea (%esi,%ebx,4),%ebp \n"            // ebp -> row 4
    "lea 0x20(%esi),%esi \n"
    "pavgb %xmm4,%xmm2 \n"
    "pavgb %xmm5,%xmm3 \n"
    "pavgb %xmm2,%xmm0 \n"
    "pavgb %xmm3,%xmm1 \n"
    "movdqa 0x0(%ebp),%xmm2 \n"            // rows 4..7 averaged into xmm2/3
    "movdqa 0x10(%ebp),%xmm3 \n"
    "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n"
    "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n"
    "pavgb %xmm4,%xmm2 \n"
    "pavgb %xmm5,%xmm3 \n"
    "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n"
    "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n"
    "movdqa 0x0(%ebp,%edx,1),%xmm6 \n"
    "pavgb %xmm6,%xmm4 \n"
    "movdqa 0x10(%ebp,%edx,1),%xmm6 \n"
    "pavgb %xmm6,%xmm5 \n"
    "pavgb %xmm4,%xmm2 \n"
    "pavgb %xmm5,%xmm3 \n"
    "pavgb %xmm2,%xmm0 \n"                 // combine both row groups
    "pavgb %xmm3,%xmm1 \n"
    "psadbw %xmm7,%xmm0 \n"                // horizontal sum of each 8 bytes
    "psadbw %xmm7,%xmm1 \n"
    "pshufd $0xd8,%xmm0,%xmm0 \n"          // gather the four sums
    "pshufd $0x8d,%xmm1,%xmm1 \n"
    "por %xmm1,%xmm0 \n"
    "psrlw $0x3,%xmm0 \n"                  // divide sums by 8
    "packuswb %xmm0,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movd %xmm0,(%edi) \n"                 // write 4 pixels
    "lea 0x4(%edi),%edi \n"
    "sub $0x4,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1720 | |
1721 // fpic is used for magiccam plugin | |
1722 #if !defined(__PIC__) | |
1723 #define HAS_SCALEROWDOWN34_SSSE3 | |
// GCC (AT&T syntax) translation of the MSVC 3/4 point sampler: 32 src
// pixels -> 24 dst pixels via palignr + three pshufb gathers.  References
// the _shuf* globals by absolute address, which is why this block is
// excluded when building position-independent code (see the __PIC__ guard
// above).
void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                          uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
    "pusha \n"                       // save all GP registers
    "mov 0x24(%esp),%esi \n"         // src_ptr
    "mov 0x2c(%esp),%edi \n"         // dst_ptr (src_stride is unused)
    "mov 0x30(%esp),%ecx \n"         // dst_width
    "movdqa _shuf0,%xmm3 \n"
    "movdqa _shuf1,%xmm4 \n"
    "movdqa _shuf2,%xmm5 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa 0x10(%esi),%xmm2 \n"
    "lea 0x20(%esi),%esi \n"
    "movdqa %xmm2,%xmm1 \n"
    "palignr $0x8,%xmm0,%xmm1 \n"    // xmm1 = bytes 8..23 of the 32
    "pshufb %xmm3,%xmm0 \n"
    "pshufb %xmm4,%xmm1 \n"
    "pshufb %xmm5,%xmm2 \n"
    "movq %xmm0,(%edi) \n"           // write 24 pixels as 3 x 8 bytes
    "movq %xmm1,0x8(%edi) \n"
    "movq %xmm2,0x10(%edi) \n"
    "lea 0x18(%edi),%edi \n"
    "sub $0x18,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1754 | |
// GCC translation of ScaleRowDown34_1_Int_SSSE3: averages two rows 1:1
// with pavgb, then filters 32 src pixels to 24 with the _shuf*/_madd*
// tables and _round34 rounding (psrlw 2).  %ebp carries src_stride
// (safe: pusha saved the frame pointer).
void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%ebp \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shuf01,%xmm2 \n"
    "movdqa _shuf11,%xmm3 \n"
    "movdqa _shuf21,%xmm4 \n"
    "movdqa _madd01,%xmm5 \n"
    "movdqa _madd11,%xmm6 \n"
    "movdqa _round34,%xmm7 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"          // pixels 0..7
    "movdqa (%esi,%ebp),%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"            // vertical 1:1 average
    "pshufb %xmm2,%xmm0 \n"
    "pmaddubsw %xmm5,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,(%edi) \n"
    "movdqu 0x8(%esi),%xmm0 \n"       // pixels 8..15 (unaligned)
    "movdqu 0x8(%esi,%ebp),%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm3,%xmm0 \n"
    "pmaddubsw %xmm6,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x8(%edi) \n"
    "movdqa 0x10(%esi),%xmm0 \n"      // pixels 16..23
    "movdqa 0x10(%esi,%ebp),%xmm1 \n"
    "lea 0x20(%esi),%esi \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm4,%xmm0 \n"
    "movdqa _madd21,%xmm1 \n"         // loaded late: all 8 xmm regs in use
    "pmaddubsw %xmm1,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x10(%edi) \n"
    "lea 0x18(%edi),%edi \n"
    "sub $0x18,%ecx \n"               // 32 src pixels -> 24 dst pixels
    "ja 1b \n"

    "popa \n"
    "ret \n"
);
1808 | |
// GCC translation of ScaleRowDown34_0_Int_SSSE3: the double pavgb weights
// row0 3/4 and row1 1/4 (xmm1 = (r0+r1)/2, xmm0 = (r0+xmm1)/2), then the
// same 32 -> 24 horizontal filter as the 1:1 variant above.
void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%ebp \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shuf01,%xmm2 \n"
    "movdqa _shuf11,%xmm3 \n"
    "movdqa _shuf21,%xmm4 \n"
    "movdqa _madd01,%xmm5 \n"
    "movdqa _madd11,%xmm6 \n"
    "movdqa _round34,%xmm7 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"          // pixels 0..7
    "movdqa (%esi,%ebp,1),%xmm1 \n"
    "pavgb %xmm0,%xmm1 \n"            // xmm1 = (row0 + row1) / 2
    "pavgb %xmm1,%xmm0 \n"            // xmm0 = (3*row0 + row1) / 4
    "pshufb %xmm2,%xmm0 \n"
    "pmaddubsw %xmm5,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,(%edi) \n"
    "movdqu 0x8(%esi),%xmm0 \n"       // pixels 8..15 (unaligned)
    "movdqu 0x8(%esi,%ebp,1),%xmm1 \n"
    "pavgb %xmm0,%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm3,%xmm0 \n"
    "pmaddubsw %xmm6,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x8(%edi) \n"
    "movdqa 0x10(%esi),%xmm0 \n"      // pixels 16..23
    "movdqa 0x10(%esi,%ebp,1),%xmm1 \n"
    "lea 0x20(%esi),%esi \n"
    "pavgb %xmm0,%xmm1 \n"
    "pavgb %xmm1,%xmm0 \n"
    "pshufb %xmm4,%xmm0 \n"
    "movdqa _madd21,%xmm1 \n"         // loaded late: all 8 xmm regs in use
    "pmaddubsw %xmm1,%xmm0 \n"
    "paddsw %xmm7,%xmm0 \n"
    "psrlw $0x2,%xmm0 \n"
    "packuswb %xmm0,%xmm0 \n"
    "movq %xmm0,0x10(%edi) \n"
    "lea 0x18(%edi),%edi \n"
    "sub $0x18,%ecx \n"               // 32 src pixels -> 24 dst pixels
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1864 | |
1865 #define HAS_SCALEROWDOWN38_SSSE3 | |
// GCC translation of the 3/8 point sampler: 32 src pixels -> 12 dst
// pixels via two pshufb gathers merged with paddusb; src_stride (edx) is
// loaded but unused.
void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                          uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
    "pusha \n"                     // save all GP registers
    "mov 0x24(%esp),%esi \n"       // src_ptr
    "mov 0x28(%esp),%edx \n"       // src_stride (unused)
    "mov 0x2c(%esp),%edi \n"       // dst_ptr
    "mov 0x30(%esp),%ecx \n"       // dst_width
    "movdqa _shuf38a ,%xmm4 \n"
    "movdqa _shuf38b ,%xmm5 \n"

"1:"
    "movdqa (%esi),%xmm0 \n"       // 16 pixels -> outputs 0..5
    "movdqa 0x10(%esi),%xmm1 \n"   // 16 pixels -> outputs 6..11
    "lea 0x20(%esi),%esi \n"
    "pshufb %xmm4,%xmm0 \n"
    "pshufb %xmm5,%xmm1 \n"
    "paddusb %xmm1,%xmm0 \n"
    "movq %xmm0,(%edi) \n"         // write 12 pixels (8 + 4)
    "movhlps %xmm0,%xmm1 \n"
    "movd %xmm1,0x8(%edi) \n"
    "lea 0xc(%edi),%edi \n"
    "sub $0xc,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1894 | |
// GCC translation of ScaleRowDown38_3_Int_SSSE3: sums 3 rows as 16-bit
// words, folds each group of 3 adjacent horizontal sums together
// (psrldq + paddusw twice), then pmulhuw by _scaleac3 reciprocals to
// divide by the 9,9,6 box sizes.  Writes 6 pixels (4 + 2) per pass.
void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%edx \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shufac0,%xmm4 \n"
    "movdqa _shufac3,%xmm5 \n"
    "movdqa _scaleac3,%xmm6 \n"
    "pxor %xmm7,%xmm7 \n"             // zero, for byte -> word unpacking

"1:"
    "movdqa (%esi),%xmm0 \n"          // sum 3 rows into xmm0/xmm1
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "movhlps %xmm0,%xmm1 \n"
    "movhlps %xmm2,%xmm3 \n"
    "punpcklbw %xmm7,%xmm0 \n"
    "punpcklbw %xmm7,%xmm1 \n"
    "punpcklbw %xmm7,%xmm2 \n"
    "punpcklbw %xmm7,%xmm3 \n"
    "paddusw %xmm2,%xmm0 \n"
    "paddusw %xmm3,%xmm1 \n"
    "movdqa (%esi,%edx,2),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movhlps %xmm2,%xmm3 \n"
    "punpcklbw %xmm7,%xmm2 \n"
    "punpcklbw %xmm7,%xmm3 \n"
    "paddusw %xmm2,%xmm0 \n"
    "paddusw %xmm3,%xmm1 \n"
    "movdqa %xmm0,%xmm2 \n"           // fold 3 horizontal neighbors
    "psrldq $0x2,%xmm0 \n"
    "paddusw %xmm0,%xmm2 \n"
    "psrldq $0x2,%xmm0 \n"
    "paddusw %xmm0,%xmm2 \n"
    "pshufb %xmm4,%xmm2 \n"
    "movdqa %xmm1,%xmm3 \n"
    "psrldq $0x2,%xmm1 \n"
    "paddusw %xmm1,%xmm3 \n"
    "psrldq $0x2,%xmm1 \n"
    "paddusw %xmm1,%xmm3 \n"
    "pshufb %xmm5,%xmm3 \n"
    "paddusw %xmm3,%xmm2 \n"
    "pmulhuw %xmm6,%xmm2 \n"          // divide by 9,9,6, 9,9,6
    "packuswb %xmm2,%xmm2 \n"
    "movd %xmm2,(%edi) \n"            // write 6 pixels (4 + 2)
    "pextrw $0x2,%xmm2,%eax \n"
    "mov %ax,0x4(%edi) \n"
    "lea 0x6(%edi),%edi \n"
    "sub $0x6,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1951 | |
// GCC translation of ScaleRowDown38_2_Int_SSSE3: pavgb blends the two
// rows, three shuffle masks gather each output's horizontal neighbors,
// and pmulhuw by _scaleab2 divides by the 3,3,2 box sizes.  Writes 6
// pixels (4 + 2) per pass.
void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width);
  asm(
    DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%edx \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // dst_width
    "movdqa _shufab0,%xmm4 \n"
    "movdqa _shufab1,%xmm5 \n"
    "movdqa _shufab2,%xmm6 \n"
    "movdqa _scaleab2,%xmm7 \n"

"1:"
    "movdqa (%esi),%xmm2 \n"          // average 2 rows into xmm2
    "pavgb (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm2,%xmm0 \n"           // 16 pixels -> outputs 0..5
    "pshufb %xmm4,%xmm0 \n"
    "movdqa %xmm2,%xmm1 \n"
    "pshufb %xmm5,%xmm1 \n"
    "paddusw %xmm1,%xmm0 \n"
    "pshufb %xmm6,%xmm2 \n"
    "paddusw %xmm2,%xmm0 \n"
    "pmulhuw %xmm7,%xmm0 \n"          // divide by 3,3,2, 3,3,2
    "packuswb %xmm0,%xmm0 \n"
    "movd %xmm0,(%edi) \n"            // write 6 pixels (4 + 2)
    "pextrw $0x2,%xmm0,%eax \n"
    "mov %ax,0x4(%edi) \n"
    "lea 0x6(%edi),%edi \n"
    "sub $0x6,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
1988 #endif // __PIC__ | |
1989 | |
1990 #define HAS_SCALEADDROWS_SSE2 | |
// GCC translation of ScaleAddRows_SSE2: sums src_height rows of 16 bytes
// into 16 uint16 column sums (saturating).  The first row is loaded in
// loop "1:" and loop "2:" is do-while style (ebp = height - 1), so at
// least one additional row is always read.
// NOTE(review): requires src_height >= 2 -- confirm callers.
void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height);
  asm(
    DECLARE_FUNCTION(ScaleAddRows_SSE2)
    "pusha \n"                        // save all GP registers
    "mov 0x24(%esp),%esi \n"          // src_ptr
    "mov 0x28(%esp),%edx \n"          // src_stride
    "mov 0x2c(%esp),%edi \n"          // dst_ptr
    "mov 0x30(%esp),%ecx \n"          // src_width
    "mov 0x34(%esp),%ebx \n"          // src_height
    "pxor %xmm5,%xmm5 \n"             // zero, for byte -> word unpacking

"1:"
    "movdqa (%esi),%xmm2 \n"          // first row
    "lea (%esi,%edx,1),%eax \n"
    "movhlps %xmm2,%xmm3 \n"
    "lea -0x1(%ebx),%ebp \n"          // inner loop runs height - 1 times
    "punpcklbw %xmm5,%xmm2 \n"
    "punpcklbw %xmm5,%xmm3 \n"

"2:"
    "movdqa (%eax),%xmm0 \n"          // read 16 pixels of the next row
    "lea (%eax,%edx,1),%eax \n"
    "movhlps %xmm0,%xmm1 \n"
    "punpcklbw %xmm5,%xmm0 \n"
    "punpcklbw %xmm5,%xmm1 \n"
    "paddusw %xmm0,%xmm2 \n"          // saturating word sums
    "paddusw %xmm1,%xmm3 \n"
    "sub $0x1,%ebp \n"
    "ja 2b \n"

    "movdqa %xmm2,(%edi) \n"          // store 16 word sums
    "movdqa %xmm3,0x10(%edi) \n"
    "lea 0x20(%edi),%edi \n"
    "lea 0x10(%esi),%esi \n"
    "sub $0x10,%ecx \n"
    "ja 1b \n"
    "popa \n"
    "ret \n"
);
2032 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
// Blends two adjacent source rows with an 8.8 fixed-point vertical
// fraction (0..256).  Special cases: fraction 0 copies row 0; fraction
// 0x80 uses pavgb for an exact 50/50 average.  All paths duplicate the
// last output byte one position past dst_width (callers rely on the
// extra pixel).  Standalone cdecl routine in top-level asm.
#define HAS_SCALEFILTERROWS_SSE2
void ScaleFilterRows_SSE2(uint8* dst_ptr,
                          const uint8* src_ptr, int src_stride,
                          int dst_width, int source_y_fraction);
asm(
  DECLARE_FUNCTION(ScaleFilterRows_SSE2)
    // After two pushes + return address, args start at esp+0xc.
    "push %esi \n"
    "push %edi \n"
    "mov 0xc(%esp),%edi \n"   // edi = dst_ptr
    "mov 0x10(%esp),%esi \n"  // esi = src_ptr
    "mov 0x14(%esp),%edx \n"  // edx = src_stride
    "mov 0x18(%esp),%ecx \n"  // ecx = dst_width
    "mov 0x1c(%esp),%eax \n"  // eax = source_y_fraction
    "cmp $0x0,%eax \n"
    "je 2f \n"                // fraction 0: plain copy
    "cmp $0x80,%eax \n"
    "je 3f \n"                // fraction 128: pavgb
    // Broadcast fraction into xmm6 and (256 - fraction) into xmm5.
    "movd %eax,%xmm6 \n"
    "punpcklwd %xmm6,%xmm6 \n"
    "pshufd $0x0,%xmm6,%xmm6 \n"
    "neg %eax \n"
    "add $0x100,%eax \n"
    "movd %eax,%xmm5 \n"
    "punpcklwd %xmm5,%xmm5 \n"
    "pshufd $0x0,%xmm5,%xmm5 \n"
    "pxor %xmm7,%xmm7 \n"

    // General case: (row0 * (256-f) + row1 * f) >> 8 on 16 pixels.
    "1:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,%xmm1 \n"
    "movdqa %xmm2,%xmm3 \n"
    "punpcklbw %xmm7,%xmm0 \n"
    "punpcklbw %xmm7,%xmm2 \n"
    "punpckhbw %xmm7,%xmm1 \n"
    "punpckhbw %xmm7,%xmm3 \n"
    "pmullw %xmm5,%xmm0 \n"
    "pmullw %xmm5,%xmm1 \n"
    "pmullw %xmm6,%xmm2 \n"
    "pmullw %xmm6,%xmm3 \n"
    "paddusw %xmm2,%xmm0 \n"
    "paddusw %xmm3,%xmm1 \n"
    "psrlw $0x8,%xmm0 \n"
    "psrlw $0x8,%xmm1 \n"
    "packuswb %xmm1,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 1b \n"
    "mov -0x1(%edi),%al \n"   // replicate last pixel one past the end
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 0: copy row 0 unchanged.
    "2:"
    "movdqa (%esi),%xmm0 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 2b \n"

    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 128: exact average of the two rows.
    "3:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "pavgb %xmm2,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 3b \n"

    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"
);
2120 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
// Same contract as ScaleFilterRows_SSE2, but the fraction is halved to
// 0..128 and both weights are packed into one byte pair (al/ah) so a
// single pmaddubsw blends interleaved row0/row1 bytes.  Duplicates the
// last output byte one position past dst_width.
#define HAS_SCALEFILTERROWS_SSSE3
void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr, int src_stride,
                           int dst_width, int source_y_fraction);
asm(
  DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
    "push %esi \n"
    "push %edi \n"
    "mov 0xc(%esp),%edi \n"   // edi = dst_ptr
    "mov 0x10(%esp),%esi \n"  // esi = src_ptr
    "mov 0x14(%esp),%edx \n"  // edx = src_stride
    "mov 0x18(%esp),%ecx \n"  // ecx = dst_width
    "mov 0x1c(%esp),%eax \n"  // eax = source_y_fraction
    "shr %eax \n"             // 7-bit fraction (0..128)
    "cmp $0x0,%eax \n"
    "je 2f \n"                // fraction 0: plain copy
    "cmp $0x40,%eax \n"
    "je 3f \n"                // fraction 64 (= 1/2): pavgb
    // Pack (128 - f) in al and f in ah, broadcast to all word lanes.
    "mov %al,%ah \n"
    "neg %al \n"
    "add $0x80,%al \n"
    "movd %eax,%xmm5 \n"
    "punpcklwd %xmm5,%xmm5 \n"
    "pshufd $0x0,%xmm5,%xmm5 \n"

    // General case: interleave rows and use pmaddubsw, >> 7.
    "1:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,%xmm1 \n"
    "punpcklbw %xmm2,%xmm0 \n"
    "punpckhbw %xmm2,%xmm1 \n"
    "pmaddubsw %xmm5,%xmm0 \n"
    "pmaddubsw %xmm5,%xmm1 \n"
    "psrlw $0x7,%xmm0 \n"
    "psrlw $0x7,%xmm1 \n"
    "packuswb %xmm1,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 1b \n"
    "mov -0x1(%edi),%al \n"   // replicate last pixel one past the end
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 0: copy row 0 unchanged.
    "2:"
    "movdqa (%esi),%xmm0 \n"
    "lea 0x10(%esi),%esi \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 2b \n"
    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"

    // Fraction 1/2: exact average of the two rows.
    "3:"
    "movdqa (%esi),%xmm0 \n"
    "movdqa (%esi,%edx,1),%xmm2 \n"
    "lea 0x10(%esi),%esi \n"
    "pavgb %xmm2,%xmm0 \n"
    "movdqa %xmm0,(%edi) \n"
    "lea 0x10(%edi),%edi \n"
    "sub $0x10,%ecx \n"
    "ja 3b \n"
    "mov -0x1(%edi),%al \n"
    "mov %al,(%edi) \n"
    "pop %edi \n"
    "pop %esi \n"
    "ret \n"
);
2197 | |
2198 #elif defined(__x86_64__) | |
// Scales 32 source columns x 8 rows down to 4 output pixels (1/8 in both
// directions).  Approximates the 8x8 box average with a pavgb reduction
// tree across the 8 rows, then psadbw to sum each 8-byte group
// horizontally and psrlw $3 to divide by 8.  x86-64 only.
// NOTE(review): xmm0-xmm5 are modified but only xmm6/xmm7 appear in the
// clobber list — this relies on all xmm registers being caller-saved in
// the SysV x86-64 ABI; verify if ported.
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
  "lea (%3,%3,2),%%r10 \n"          // r10 = src_stride * 3
  "pxor %%xmm7,%%xmm7 \n"
"1:"
  // Average rows 0-3 into xmm0/xmm1 (32 columns).
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa (%0,%3,1),%%xmm2 \n"
  "movdqa 0x10(%0,%3,1),%%xmm3 \n"
  "pavgb %%xmm2,%%xmm0 \n"
  "pavgb %%xmm3,%%xmm1 \n"
  "movdqa (%0,%3,2),%%xmm2 \n"
  "movdqa 0x10(%0,%3,2),%%xmm3 \n"
  "movdqa (%0,%%r10,1),%%xmm4 \n"
  "movdqa 0x10(%0,%%r10,1),%%xmm5 \n"
  "lea (%0,%3,4),%%r11 \n"          // r11 = start of rows 4-7
  "lea 0x20(%0),%0 \n"
  "pavgb %%xmm4,%%xmm2 \n"
  "pavgb %%xmm5,%%xmm3 \n"
  "pavgb %%xmm2,%%xmm0 \n"
  "pavgb %%xmm3,%%xmm1 \n"
  // Average rows 4-7 and fold into the running average.
  "movdqa 0x0(%%r11),%%xmm2 \n"
  "movdqa 0x10(%%r11),%%xmm3 \n"
  "movdqa 0x0(%%r11,%3,1),%%xmm4 \n"
  "movdqa 0x10(%%r11,%3,1),%%xmm5 \n"
  "pavgb %%xmm4,%%xmm2 \n"
  "pavgb %%xmm5,%%xmm3 \n"
  "movdqa 0x0(%%r11,%3,2),%%xmm4 \n"
  "movdqa 0x10(%%r11,%3,2),%%xmm5 \n"
  "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n"
  "pavgb %%xmm6,%%xmm4 \n"
  "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n"
  "pavgb %%xmm6,%%xmm5 \n"
  "pavgb %%xmm4,%%xmm2 \n"
  "pavgb %%xmm5,%%xmm3 \n"
  "pavgb %%xmm2,%%xmm0 \n"
  "pavgb %%xmm3,%%xmm1 \n"
  // Horizontal sum of each 8-byte group, divide by 8, pack to 4 bytes.
  "psadbw %%xmm7,%%xmm0 \n"
  "psadbw %%xmm7,%%xmm1 \n"
  "pshufd $0xd8,%%xmm0,%%xmm0 \n"
  "pshufd $0x8d,%%xmm1,%%xmm1 \n"
  "por %%xmm1,%%xmm0 \n"
  "psrlw $0x3,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movd %%xmm0,(%1) \n"
  "lea 0x4(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)) // %3
  : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
);
}
2256 | |
#define HAS_SCALEROWDOWN34_SSSE3
// Point-samples 32 source pixels down to 24 (3/4 width, no filtering)
// using three pshufb selection masks (_shuf0/1/2).  x86-64 only.
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%3),%%xmm3 \n"   // selection mask for output bytes 0-7
  "movdqa (%4),%%xmm4 \n"   // selection mask for output bytes 8-15
  "movdqa (%5),%%xmm5 \n"   // selection mask for output bytes 16-23
"1:"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm2 \n"
  "lea 0x20(%0),%0 \n"
  "movdqa %%xmm2,%%xmm1 \n"
  "palignr $0x8,%%xmm0,%%xmm1 \n"   // xmm1 = middle 16 bytes of the pair
  "pshufb %%xmm3,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "pshufb %%xmm5,%%xmm2 \n"
  "movq %%xmm0,(%1) \n"
  "movq %%xmm1,0x8(%1) \n"
  "movq %%xmm2,0x10(%1) \n"
  "lea 0x18(%1),%1 \n"
  "sub $0x18,%2 \n"                 // 24 output pixels per iteration
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(_shuf0), // %3
    "r"(_shuf1), // %4
    "r"(_shuf2) // %5
  : "memory", "cc"
);
}
2288 | |
// 3/4 horizontal scale with vertical filtering: averages the two source
// rows 1:1 (pavgb), then applies the 3/4 shuffle + pmaddubsw weights with
// rounding.  Produces 24 output pixels per loop.  x86-64 only.
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm2 \n" // _shuf01
  "movdqa (%5),%%xmm3 \n" // _shuf11
  "movdqa (%6),%%xmm4 \n" // _shuf21
  "movdqa (%7),%%xmm5 \n" // _madd01
  "movdqa (%8),%%xmm6 \n" // _madd11
  "movdqa (%9),%%xmm7 \n" // _round34
  "movdqa (%10),%%xmm8 \n" // _madd21
"1:"
  // First 8 output pixels.
  "movdqa (%0),%%xmm0 \n"
  "movdqa (%0,%3),%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"          // 1:1 vertical blend
  "pshufb %%xmm2,%%xmm0 \n"
  "pmaddubsw %%xmm5,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"         // round
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,(%1) \n"
  // Middle 8 output pixels (unaligned source offset 8).
  "movdqu 0x8(%0),%%xmm0 \n"
  "movdqu 0x8(%0,%3),%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm3,%%xmm0 \n"
  "pmaddubsw %%xmm6,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x8(%1) \n"
  // Last 8 output pixels.
  "movdqa 0x10(%0),%%xmm0 \n"
  "movdqa 0x10(%0,%3),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm8,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x10(%1) \n"
  "lea 0x18(%1),%1 \n"
  "sub $0x18,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shuf01), // %4
    "r"(_shuf11), // %5
    "r"(_shuf21), // %6
    "r"(_madd01), // %7
    "r"(_madd11), // %8
    "r"(_round34), // %9
    "r"(_madd21) // %10
  : "memory", "cc", "xmm6", "xmm7", "xmm8"
);
}
2345 | |
// 3/4 horizontal scale with vertical filtering biased 3:1 toward row 0:
// the double pavgb (row1 = avg(row0,row1); row0 = avg(row0,row1))
// approximates a 3/4 : 1/4 vertical blend.  Otherwise identical in
// structure to ScaleRowDown34_1_Int_SSSE3.  x86-64 only.
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm2 \n" // _shuf01
  "movdqa (%5),%%xmm3 \n" // _shuf11
  "movdqa (%6),%%xmm4 \n" // _shuf21
  "movdqa (%7),%%xmm5 \n" // _madd01
  "movdqa (%8),%%xmm6 \n" // _madd11
  "movdqa (%9),%%xmm7 \n" // _round34
  "movdqa (%10),%%xmm8 \n" // _madd21
"1:"
  "movdqa (%0),%%xmm0 \n"
  "movdqa (%0,%3,1),%%xmm1 \n"
  "pavgb %%xmm0,%%xmm1 \n"          // two pavgb steps => 3:1 weighting
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm2,%%xmm0 \n"
  "pmaddubsw %%xmm5,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,(%1) \n"
  "movdqu 0x8(%0),%%xmm0 \n"
  "movdqu 0x8(%0,%3,1),%%xmm1 \n"
  "pavgb %%xmm0,%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm3,%%xmm0 \n"
  "pmaddubsw %%xmm6,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x8(%1) \n"
  "movdqa 0x10(%0),%%xmm0 \n"
  "movdqa 0x10(%0,%3,1),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pavgb %%xmm0,%%xmm1 \n"
  "pavgb %%xmm1,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm8,%%xmm0 \n"
  "paddsw %%xmm7,%%xmm0 \n"
  "psrlw $0x2,%%xmm0 \n"
  "packuswb %%xmm0,%%xmm0 \n"
  "movq %%xmm0,0x10(%1) \n"
  "lea 0x18(%1),%1 \n"
  "sub $0x18,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shuf01), // %4
    "r"(_shuf11), // %5
    "r"(_shuf21), // %6
    "r"(_madd01), // %7
    "r"(_madd11), // %8
    "r"(_round34), // %9
    "r"(_madd21) // %10
  : "memory", "cc", "xmm6", "xmm7", "xmm8"
);
}
2405 | |
#define HAS_SCALEROWDOWN38_SSSE3
// Point-samples 32 source pixels down to 12 (3/8 width, no filtering)
// by shuffling each 16-byte half with _shuf38a/_shuf38b into disjoint
// lanes and combining with paddusb.  x86-64 only.
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%3),%%xmm4 \n"
  "movdqa (%4),%%xmm5 \n"
"1:"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "lea 0x20(%0),%0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "pshufb %%xmm5,%%xmm1 \n"
  "paddusb %%xmm1,%%xmm0 \n"        // masks select disjoint lanes
  "movq %%xmm0,(%1) \n"             // 8 bytes + 4 bytes = 12 pixels
  "movhlps %%xmm0,%%xmm1 \n"
  "movd %%xmm1,0x8(%1) \n"
  "lea 0xc(%1),%1 \n"
  "sub $0xc,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"(_shuf38a), // %3
    "r"(_shuf38b) // %4
  : "memory", "cc"
);
}
2433 | |
// 3/8 horizontal scale averaging 3 source rows: sums the rows into
// 16-bit lanes, does the horizontal 3-pixel adds via psrldq/paddusw,
// then multiplies by a fixed-point reciprocal table (_scaleac3) with
// pmulhuw.  Emits 6 output pixels per loop.  x86-64 only.
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm4 \n"   // _shufac0
  "movdqa (%5),%%xmm5 \n"   // _shufac3
  "movdqa (%6),%%xmm6 \n"   // _scaleac3: reciprocal multipliers
  "pxor %%xmm7,%%xmm7 \n"
"1:"
  // Vertical sum of 3 rows into word lanes (xmm0 low, xmm1 high).
  "movdqa (%0),%%xmm0 \n"
  "movdqa (%0,%3,1),%%xmm2 \n"
  "movhlps %%xmm0,%%xmm1 \n"
  "movhlps %%xmm2,%%xmm3 \n"
  "punpcklbw %%xmm7,%%xmm0 \n"
  "punpcklbw %%xmm7,%%xmm1 \n"
  "punpcklbw %%xmm7,%%xmm2 \n"
  "punpcklbw %%xmm7,%%xmm3 \n"
  "paddusw %%xmm2,%%xmm0 \n"
  "paddusw %%xmm3,%%xmm1 \n"
  "movdqa (%0,%3,2),%%xmm2 \n"
  "lea 0x10(%0),%0 \n"
  "movhlps %%xmm2,%%xmm3 \n"
  "punpcklbw %%xmm7,%%xmm2 \n"
  "punpcklbw %%xmm7,%%xmm3 \n"
  "paddusw %%xmm2,%%xmm0 \n"
  "paddusw %%xmm3,%%xmm1 \n"
  // Horizontal sum of each 3-pixel group via shifted self-adds.
  "movdqa %%xmm0,%%xmm2 \n"
  "psrldq $0x2,%%xmm0 \n"
  "paddusw %%xmm0,%%xmm2 \n"
  "psrldq $0x2,%%xmm0 \n"
  "paddusw %%xmm0,%%xmm2 \n"
  "pshufb %%xmm4,%%xmm2 \n"
  "movdqa %%xmm1,%%xmm3 \n"
  "psrldq $0x2,%%xmm1 \n"
  "paddusw %%xmm1,%%xmm3 \n"
  "psrldq $0x2,%%xmm1 \n"
  "paddusw %%xmm1,%%xmm3 \n"
  "pshufb %%xmm5,%%xmm3 \n"
  "paddusw %%xmm3,%%xmm2 \n"
  "pmulhuw %%xmm6,%%xmm2 \n"        // divide by 9 (or 6 for the tail)
  "packuswb %%xmm2,%%xmm2 \n"
  "movd %%xmm2,(%1) \n"             // 4 bytes + 2 bytes = 6 pixels
  "pextrw $0x2,%%xmm2,%%eax \n"
  "mov %%ax,0x4(%1) \n"
  "lea 0x6(%1),%1 \n"
  "sub $0x6,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shufac0), // %4
    "r"(_shufac3), // %5
    "r"(_scaleac3) // %6
  : "memory", "cc", "rax", "xmm6", "xmm7"
);
}
2490 | |
// 3/8 horizontal scale averaging 2 source rows: pavgb the two rows, then
// three pshufb masks gather the contributing pixels into word lanes,
// paddusw sums them, and pmulhuw applies the reciprocal table
// (_scaleab2).  Emits 6 output pixels per loop.  x86-64 only.
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile (
  "movdqa (%4),%%xmm4 \n"   // _shufab0
  "movdqa (%5),%%xmm5 \n"   // _shufab1
  "movdqa (%6),%%xmm6 \n"   // _shufab2
  "movdqa (%7),%%xmm7 \n"   // _scaleab2: reciprocal multipliers
"1:"
  "movdqa (%0),%%xmm2 \n"
  "pavgb (%0,%3,1),%%xmm2 \n"       // 1:1 vertical average
  "lea 0x10(%0),%0 \n"
  "movdqa %%xmm2,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm0 \n"
  "movdqa %%xmm2,%%xmm1 \n"
  "pshufb %%xmm5,%%xmm1 \n"
  "paddusw %%xmm1,%%xmm0 \n"
  "pshufb %%xmm6,%%xmm2 \n"
  "paddusw %%xmm2,%%xmm0 \n"
  "pmulhuw %%xmm7,%%xmm0 \n"        // divide by 3 (or 2 for the tail)
  "packuswb %%xmm0,%%xmm0 \n"
  "movd %%xmm0,(%1) \n"             // 4 bytes + 2 bytes = 6 pixels
  "pextrw $0x2,%%xmm0,%%eax \n"
  "mov %%ax,0x4(%1) \n"
  "lea 0x6(%1),%1 \n"
  "sub $0x6,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(dst_width) // %2
  : "r"((intptr_t)(src_stride)), // %3
    "r"(_shufab0), // %4
    "r"(_shufab1), // %5
    "r"(_shufab2), // %6
    "r"(_scaleab2) // %7
  : "memory", "cc", "rax", "xmm6", "xmm7"
);
}
2528 | |
#define HAS_SCALEADDROWS_SSE2
// Sums src_height rows of 8-bit pixels into a row of 16-bit sums,
// 16 columns per outer iteration.  x86-64 twin of the 32-bit asm
// version above; r10 walks rows, r11 counts remaining rows.
// NOTE(review): paddusw saturates, so sums clamp at 65535.
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  asm volatile (
  "pxor %%xmm5,%%xmm5 \n"           // zero register for unpacking
"1:"
  "movdqa (%0),%%xmm2 \n"           // first row seeds the accumulators
  "lea (%0,%4,1),%%r10 \n"
  "movhlps %%xmm2,%%xmm3 \n"
  "lea -0x1(%3),%%r11 \n"           // src_height - 1 rows remain
  "punpcklbw %%xmm5,%%xmm2 \n"
  "punpcklbw %%xmm5,%%xmm3 \n"

"2:"
  "movdqa (%%r10),%%xmm0 \n"
  "lea (%%r10,%4,1),%%r10 \n"
  "movhlps %%xmm0,%%xmm1 \n"
  "punpcklbw %%xmm5,%%xmm0 \n"
  "punpcklbw %%xmm5,%%xmm1 \n"
  "paddusw %%xmm0,%%xmm2 \n"
  "paddusw %%xmm1,%%xmm3 \n"
  "sub $0x1,%%r11 \n"
  "ja 2b \n"

  "movdqa %%xmm2,(%1) \n"           // 16 uint16 sums per iteration
  "movdqa %%xmm3,0x10(%1) \n"
  "lea 0x20(%1),%1 \n"
  "lea 0x10(%0),%0 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_ptr), // %0
    "+r"(dst_ptr), // %1
    "+r"(src_width), // %2
    "+r"(src_height) // %3
  : "r"((intptr_t)(src_stride)) // %4
  : "memory", "cc", "r10", "r11"
);
}
2568 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
// Blends two adjacent rows with an 8.8 fixed-point fraction (0..256).
// Three paths: fraction 0 copies row 0; fraction 128 uses pavgb; any
// other fraction does the full pmullw weighted blend.  Every path
// duplicates the final output byte one position past dst_width.
#define HAS_SCALEFILTERROWS_SSE2
static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                 const uint8* src_ptr, int src_stride,
                                 int dst_width, int source_y_fraction) {
  if (source_y_fraction == 0) {
    // Pure copy of the top row.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"          // replicate last pixel past the end
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      :
      : "memory", "cc", "rax"
    );
    return;
  } else if (source_y_fraction == 128) {
    // Exact 50/50 average of the two rows.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%3,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "rax"
    );
    return;
  } else {
    // General blend: (row0 * (256-f) + row1 * f) >> 8 per pixel.
    asm volatile (
      "mov %3,%%eax \n"
      "movd %%eax,%%xmm6 \n"          // xmm6 = f in every word lane
      "punpcklwd %%xmm6,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      "neg %%eax \n"
      "add $0x100,%%eax \n"
      "movd %%eax,%%xmm5 \n"          // xmm5 = 256 - f in every word lane
      "punpcklwd %%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
      "pxor %%xmm7,%%xmm7 \n"
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%4,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklbw %%xmm7,%%xmm0 \n"
      "punpcklbw %%xmm7,%%xmm2 \n"
      "punpckhbw %%xmm7,%%xmm1 \n"
      "punpckhbw %%xmm7,%%xmm3 \n"
      "pmullw %%xmm5,%%xmm0 \n"
      "pmullw %%xmm5,%%xmm1 \n"
      "pmullw %%xmm6,%%xmm2 \n"
      "pmullw %%xmm6,%%xmm3 \n"
      "paddusw %%xmm2,%%xmm0 \n"
      "paddusw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width), // %2
        "+r"(source_y_fraction) // %3
      : "r"((intptr_t)(src_stride)) // %4
      : "memory", "cc", "rax", "xmm6", "xmm7"
    );
  }
  return;
}
2659 | |
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
// Same contract as ScaleFilterRows_SSE2, but the fraction is halved to
// 0..128 and both weights packed into one byte pair so a single
// pmaddubsw blends interleaved row0/row1 bytes.  Duplicates the final
// output byte one position past dst_width.
#define HAS_SCALEFILTERROWS_SSSE3
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, int src_stride,
                                  int dst_width, int source_y_fraction) {
  source_y_fraction >>= 1;          // 7-bit fraction (0..128)
  if (source_y_fraction == 0) {
    // Pure copy of the top row.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"          // replicate last pixel past the end
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      :
      : "memory", "cc", "rax"
    );
    return;
  } else if (source_y_fraction == 64) {
    // Exact 50/50 average of the two rows.
    asm volatile (
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%3,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width) // %2
      : "r"((intptr_t)(src_stride)) // %3
      : "memory", "cc", "rax"
    );
    return;
  } else {
    // General blend via pmaddubsw with packed (128-f, f) byte weights.
    asm volatile (
      "mov %3,%%eax \n"
      "mov %%al,%%ah \n"              // ah = f
      "neg %%al \n"
      "add $0x80,%%al \n"             // al = 128 - f
      "movd %%eax,%%xmm5 \n"
      "punpcklwd %%xmm5,%%xmm5 \n"
      "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "1:"
      "movdqa (%1),%%xmm0 \n"
      "movdqa (%1,%4,1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"
      "punpckhbw %%xmm2,%%xmm1 \n"
      "pmaddubsw %%xmm5,%%xmm0 \n"
      "pmaddubsw %%xmm5,%%xmm1 \n"
      "psrlw $0x7,%%xmm0 \n"
      "psrlw $0x7,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,(%0) \n"
      "lea 0x10(%0),%0 \n"
      "sub $0x10,%2 \n"
      "ja 1b \n"
      "mov -0x1(%0),%%al \n"
      "mov %%al,(%0) \n"
      : "+r"(dst_ptr), // %0
        "+r"(src_ptr), // %1
        "+r"(dst_width), // %2
        "+r"(source_y_fraction) // %3
      : "r"((intptr_t)(src_stride)) // %4
      : "memory", "cc", "rax"
    );
  }
  return;
}
2741 #endif | |
2742 #endif | |
2743 | |
2744 // CPU agnostic row functions | |
2745 static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, | |
2746 uint8* dst, int dst_width) { | |
2747 int x; | |
2748 for (x = 0; x < dst_width; ++x) { | |
2749 *dst++ = *src_ptr; | |
2750 src_ptr += 2; | |
2751 } | |
2752 } | |
2753 | |
2754 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, | |
2755 uint8* dst, int dst_width) { | |
2756 int x; | |
2757 for (x = 0; x < dst_width; ++x) { | |
2758 *dst++ = (src_ptr[0] + src_ptr[1] + | |
2759 src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; | |
2760 src_ptr += 2; | |
2761 } | |
2762 } | |
2763 | |
2764 static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, | |
2765 uint8* dst, int dst_width) { | |
2766 int x; | |
2767 for (x = 0; x < dst_width; ++x) { | |
2768 *dst++ = *src_ptr; | |
2769 src_ptr += 4; | |
2770 } | |
2771 } | |
2772 | |
2773 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, | |
2774 uint8* dst, int dst_width) { | |
2775 int x; | |
2776 for (x = 0; x < dst_width; ++x) { | |
2777 *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + | |
2778 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + | |
2779 src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + | |
2780 src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + | |
2781 src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + | |
2782 src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + | |
2783 src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + | |
2784 8) >> 4; | |
2785 src_ptr += 4; | |
2786 } | |
2787 } | |
2788 | |
2789 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. | |
2790 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. | |
// The following 2 lines cause a compile error on Windows, so #define is used.
2792 //static const int kMaxOutputWidth = 640; | |
2793 //static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; | |
2794 #define kMaxOutputWidth 640 | |
2795 #define kMaxRow12 1280 | |
2796 | |
2797 static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, | |
2798 uint8* dst, int dst_width) { | |
2799 int x; | |
2800 for (x = 0; x < dst_width; ++x) { | |
2801 *dst++ = *src_ptr; | |
2802 src_ptr += 8; | |
2803 } | |
2804 } | |
2805 | |
2806 // Note calling code checks width is less than max and if not | |
2807 // uses ScaleRowDown8_C instead. | |
2808 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, | |
2809 uint8* dst, int dst_width) { | |
2810 ALIGN16(uint8 src_row[kMaxRow12 * 2]); | |
2811 assert(dst_width <= kMaxOutputWidth); | |
2812 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); | |
2813 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, | |
2814 src_row + kMaxOutputWidth, | |
2815 dst_width * 2); | |
2816 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); | |
2817 } | |
2818 | |
2819 static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, | |
2820 uint8* dst, int dst_width) { | |
2821 uint8* dend; | |
2822 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2823 dend = dst + dst_width; | |
2824 do { | |
2825 dst[0] = src_ptr[0]; | |
2826 dst[1] = src_ptr[1]; | |
2827 dst[2] = src_ptr[3]; | |
2828 dst += 3; | |
2829 src_ptr += 4; | |
2830 } while (dst < dend); | |
2831 } | |
2832 | |
2833 // Filter rows 0 and 1 together, 3 : 1 | |
2834 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, | |
2835 uint8* d, int dst_width) { | |
2836 uint8* dend; | |
2837 const uint8* s; | |
2838 const uint8* t; | |
2839 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2840 dend = d + dst_width; | |
2841 s = src_ptr; | |
2842 t = src_ptr + src_stride; | |
2843 do { | |
2844 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; | |
2845 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; | |
2846 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; | |
2847 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; | |
2848 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; | |
2849 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; | |
2850 d[0] = (a0 * 3 + b0 + 2) >> 2; | |
2851 d[1] = (a1 * 3 + b1 + 2) >> 2; | |
2852 d[2] = (a2 * 3 + b2 + 2) >> 2; | |
2853 d += 3; | |
2854 s += 4; | |
2855 t += 4; | |
2856 } while (d < dend); | |
2857 } | |
2858 | |
2859 // Filter rows 1 and 2 together, 1 : 1 | |
2860 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, | |
2861 uint8* d, int dst_width) { | |
2862 uint8* dend; | |
2863 const uint8* s; | |
2864 const uint8* t; | |
2865 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2866 dend = d + dst_width; | |
2867 s = src_ptr; | |
2868 t = src_ptr + src_stride; | |
2869 do { | |
2870 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; | |
2871 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; | |
2872 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; | |
2873 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; | |
2874 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; | |
2875 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; | |
2876 d[0] = (a0 + b0 + 1) >> 1; | |
2877 d[1] = (a1 + b1 + 1) >> 1; | |
2878 d[2] = (a2 + b2 + 1) >> 1; | |
2879 d += 3; | |
2880 s += 4; | |
2881 t += 4; | |
2882 } while (d < dend); | |
2883 } | |
2884 | |
2885 #if defined(HAS_SCALEFILTERROWS_SSE2) | |
2886 // Filter row to 3/4 | |
2887 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, | |
2888 int dst_width) { | |
2889 uint8* dend; | |
2890 const uint8* s; | |
2891 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2892 dend = dst_ptr + dst_width; | |
2893 s = src_ptr; | |
2894 do { | |
2895 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; | |
2896 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; | |
2897 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; | |
2898 dst_ptr += 3; | |
2899 s += 4; | |
2900 } while (dst_ptr < dend); | |
2901 } | |
2902 #endif | |
2903 | |
2904 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, | |
2905 int dst_width, int dx) { | |
2906 int x = 0; | |
2907 int j; | |
2908 for (j = 0; j < dst_width; ++j) { | |
2909 int xi = x >> 16; | |
2910 int xf1 = x & 0xffff; | |
2911 int xf0 = 65536 - xf1; | |
2912 | |
2913 *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; | |
2914 x += dx; | |
2915 } | |
2916 } | |
2917 | |
// A "static const int" declaration does not work on Windows here; use #define.
2919 //static const int kMaxInputWidth = 2560; | |
2920 #define kMaxInputWidth 2560 | |
2921 #if defined(HAS_SCALEFILTERROWS_SSE2) | |
2922 #define HAS_SCALEROWDOWN34_SSE2 | |
2923 // Filter rows 0 and 1 together, 3 : 1 | |
2924 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, | |
2925 uint8* dst_ptr, int dst_width) { | |
2926 ALIGN16(uint8 row[kMaxInputWidth]); | |
2927 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2928 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); | |
2929 ScaleFilterCols34_C(dst_ptr, row, dst_width); | |
2930 } | |
2931 | |
2932 // Filter rows 1 and 2 together, 1 : 1 | |
2933 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, | |
2934 uint8* dst_ptr, int dst_width) { | |
2935 ALIGN16(uint8 row[kMaxInputWidth]); | |
2936 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2937 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); | |
2938 ScaleFilterCols34_C(dst_ptr, row, dst_width); | |
2939 } | |
2940 #endif | |
2941 | |
2942 static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, | |
2943 uint8* dst, int dst_width) { | |
2944 int x; | |
2945 assert(dst_width % 3 == 0); | |
2946 for (x = 0; x < dst_width; x += 3) { | |
2947 dst[0] = src_ptr[0]; | |
2948 dst[1] = src_ptr[3]; | |
2949 dst[2] = src_ptr[6]; | |
2950 dst += 3; | |
2951 src_ptr += 8; | |
2952 } | |
2953 } | |
2954 | |
2955 // 8x3 -> 3x1 | |
2956 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, | |
2957 uint8* dst_ptr, int dst_width) { | |
2958 int i; | |
2959 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2960 for (i = 0; i < dst_width; i+=3) { | |
2961 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + | |
2962 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + | |
2963 src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + | |
2964 src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * | |
2965 (65536 / 9) >> 16; | |
2966 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + | |
2967 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + | |
2968 src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + | |
2969 src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * | |
2970 (65536 / 9) >> 16; | |
2971 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + | |
2972 src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + | |
2973 src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * | |
2974 (65536 / 6) >> 16; | |
2975 src_ptr += 8; | |
2976 dst_ptr += 3; | |
2977 } | |
2978 } | |
2979 | |
2980 // 8x2 -> 3x1 | |
2981 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, | |
2982 uint8* dst_ptr, int dst_width) { | |
2983 int i; | |
2984 assert((dst_width % 3 == 0) && (dst_width > 0)); | |
2985 for (i = 0; i < dst_width; i+=3) { | |
2986 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + | |
2987 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + | |
2988 src_ptr[src_stride + 2]) * (65536 / 6) >> 16; | |
2989 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + | |
2990 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + | |
2991 src_ptr[src_stride + 5]) * (65536 / 6) >> 16; | |
2992 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + | |
2993 src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * | |
2994 (65536 / 4) >> 16; | |
2995 src_ptr += 8; | |
2996 dst_ptr += 3; | |
2997 } | |
2998 } | |
2999 | |
3000 // C version 8x2 -> 8x1 | |
3001 static void ScaleFilterRows_C(uint8* dst_ptr, | |
3002 const uint8* src_ptr, int src_stride, | |
3003 int dst_width, int source_y_fraction) { | |
3004 int y1_fraction; | |
3005 int y0_fraction; | |
3006 const uint8* src_ptr1; | |
3007 uint8* end; | |
3008 assert(dst_width > 0); | |
3009 y1_fraction = source_y_fraction; | |
3010 y0_fraction = 256 - y1_fraction; | |
3011 src_ptr1 = src_ptr + src_stride; | |
3012 end = dst_ptr + dst_width; | |
3013 do { | |
3014 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; | |
3015 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; | |
3016 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; | |
3017 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; | |
3018 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; | |
3019 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; | |
3020 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; | |
3021 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; | |
3022 src_ptr += 8; | |
3023 src_ptr1 += 8; | |
3024 dst_ptr += 8; | |
3025 } while (dst_ptr < end); | |
3026 dst_ptr[0] = dst_ptr[-1]; | |
3027 } | |
3028 | |
3029 void ScaleAddRows_C(const uint8* src_ptr, int src_stride, | |
3030 uint16* dst_ptr, int src_width, int src_height) { | |
3031 int x,y; | |
3032 assert(src_width > 0); | |
3033 assert(src_height > 0); | |
3034 for (x = 0; x < src_width; ++x) { | |
3035 const uint8* s = src_ptr + x; | |
3036 int sum = 0; | |
3037 for (y = 0; y < src_height; ++y) { | |
3038 sum += s[0]; | |
3039 s += src_stride; | |
3040 } | |
3041 dst_ptr[x] = sum; | |
3042 } | |
3043 } | |
3044 | |
3045 /** | |
3046 * Scale plane, 1/2 | |
3047 * | |
3048 * This is an optimized version for scaling down a plane to 1/2 of | |
3049 * its original size. | |
3050 * | |
3051 */ | |
3052 static void ScalePlaneDown2(int src_width, int src_height, | |
3053 int dst_width, int dst_height, | |
3054 int src_stride, int dst_stride, | |
3055 const uint8* src_ptr, uint8* dst_ptr, | |
3056 FilterModeEnum filtering) { | |
3057 void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, | |
3058 uint8* dst_ptr, int dst_width); | |
3059 assert(IS_ALIGNED(src_width, 2)); | |
3060 assert(IS_ALIGNED(src_height, 2)); | |
3061 | |
3062 #if defined(HAS_SCALEROWDOWN2_NEON) | |
3063 if (TestCpuFlag(kCpuHasNEON) && | |
3064 IS_ALIGNED(dst_width, 16)) { | |
3065 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; | |
3066 } else | |
3067 #endif | |
3068 #if defined(HAS_SCALEROWDOWN2_SSE2) | |
3069 if (TestCpuFlag(kCpuHasSSE2) && | |
3070 IS_ALIGNED(dst_width, 16) && | |
3071 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3072 IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { | |
3073 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; | |
3074 } else | |
3075 #endif | |
3076 { | |
3077 ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; | |
3078 } | |
3079 | |
3080 { | |
3081 int y; | |
3082 for (y = 0; y < dst_height; ++y) { | |
3083 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); | |
3084 src_ptr += (src_stride << 1); | |
3085 dst_ptr += dst_stride; | |
3086 } | |
3087 } | |
3088 } | |
3089 | |
3090 /** | |
3091 * Scale plane, 1/4 | |
3092 * | |
3093 * This is an optimized version for scaling down a plane to 1/4 of | |
3094 * its original size. | |
3095 */ | |
3096 static void ScalePlaneDown4(int src_width, int src_height, | |
3097 int dst_width, int dst_height, | |
3098 int src_stride, int dst_stride, | |
3099 const uint8* src_ptr, uint8* dst_ptr, | |
3100 FilterModeEnum filtering) { | |
3101 void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, | |
3102 uint8* dst_ptr, int dst_width); | |
3103 assert(IS_ALIGNED(src_width, 4)); | |
3104 assert(IS_ALIGNED(src_height, 4)); | |
3105 | |
3106 #if defined(HAS_SCALEROWDOWN4_NEON) | |
3107 if (TestCpuFlag(kCpuHasNEON) && | |
3108 IS_ALIGNED(dst_width, 4)) { | |
3109 ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; | |
3110 } else | |
3111 #endif | |
3112 #if defined(HAS_SCALEROWDOWN4_SSE2) | |
3113 if (TestCpuFlag(kCpuHasSSE2) && | |
3114 IS_ALIGNED(dst_width, 8) && | |
3115 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3116 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { | |
3117 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; | |
3118 } else | |
3119 #endif | |
3120 { | |
3121 ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; | |
3122 } | |
3123 | |
3124 { | |
3125 int y; | |
3126 for (y = 0; y < dst_height; ++y) { | |
3127 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); | |
3128 src_ptr += (src_stride << 2); | |
3129 dst_ptr += dst_stride; | |
3130 } | |
3131 } | |
3132 } | |
3133 | |
3134 /** | |
3135 * Scale plane, 1/8 | |
3136 * | |
3137 * This is an optimized version for scaling down a plane to 1/8 | |
3138 * of its original size. | |
3139 * | |
3140 */ | |
3141 static void ScalePlaneDown8(int src_width, int src_height, | |
3142 int dst_width, int dst_height, | |
3143 int src_stride, int dst_stride, | |
3144 const uint8* src_ptr, uint8* dst_ptr, | |
3145 FilterModeEnum filtering) { | |
3146 void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, | |
3147 uint8* dst_ptr, int dst_width); | |
3148 assert(IS_ALIGNED(src_width, 8)); | |
3149 assert(IS_ALIGNED(src_height, 8)); | |
3150 | |
3151 #if defined(HAS_SCALEROWDOWN8_SSE2) | |
3152 if (TestCpuFlag(kCpuHasSSE2) && | |
3153 IS_ALIGNED(dst_width, 4) && | |
3154 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3155 IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { | |
3156 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; | |
3157 } else | |
3158 #endif | |
3159 { | |
3160 ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? | |
3161 ScaleRowDown8Int_C : ScaleRowDown8_C; | |
3162 } | |
3163 | |
3164 { | |
3165 int y; | |
3166 for (y = 0; y < dst_height; ++y) { | |
3167 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); | |
3168 src_ptr += (src_stride << 3); | |
3169 dst_ptr += dst_stride; | |
3170 } | |
3171 } | |
3172 } | |
3173 | |
3174 /** | |
3175 * Scale plane down, 3/4 | |
3176 * | |
3177 * Provided by Frank Barchard (fbarchard@google.com) | |
3178 * | |
3179 */ | |
3180 static void ScalePlaneDown34(int src_width, int src_height, | |
3181 int dst_width, int dst_height, | |
3182 int src_stride, int dst_stride, | |
3183 const uint8* src_ptr, uint8* dst_ptr, | |
3184 FilterModeEnum filtering) { | |
3185 void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, | |
3186 uint8* dst_ptr, int dst_width); | |
3187 void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, | |
3188 uint8* dst_ptr, int dst_width); | |
3189 assert(dst_width % 3 == 0); | |
3190 #if defined(HAS_SCALEROWDOWN34_NEON) | |
3191 if (TestCpuFlag(kCpuHasNEON) && | |
3192 (dst_width % 24 == 0)) { | |
3193 if (!filtering) { | |
3194 ScaleRowDown34_0 = ScaleRowDown34_NEON; | |
3195 ScaleRowDown34_1 = ScaleRowDown34_NEON; | |
3196 } else { | |
3197 ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; | |
3198 ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; | |
3199 } | |
3200 } else | |
3201 #endif | |
3202 | |
3203 #if defined(HAS_SCALEROWDOWN34_SSSE3) | |
3204 if (TestCpuFlag(kCpuHasSSSE3) && | |
3205 (dst_width % 24 == 0) && | |
3206 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && | |
3207 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { | |
3208 if (!filtering) { | |
3209 ScaleRowDown34_0 = ScaleRowDown34_SSSE3; | |
3210 ScaleRowDown34_1 = ScaleRowDown34_SSSE3; | |
3211 } else { | |
3212 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; | |
3213 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; | |
3214 } | |
3215 } else | |
3216 #endif | |
3217 #if defined(HAS_SCALEROWDOWN34_SSE2) | |
3218 if (TestCpuFlag(kCpuHasSSE2) && | |
3219 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && | |
3220 IS_ALIGNED(dst_stride, 8) && | |
3221 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && | |
3222 filtering) { | |
3223 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; | |
3224 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; | |
3225 } else | |
3226 #endif | |
3227 { | |
3228 if (!filtering) { | |
3229 ScaleRowDown34_0 = ScaleRowDown34_C; | |
3230 ScaleRowDown34_1 = ScaleRowDown34_C; | |
3231 } else { | |
3232 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; | |
3233 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; | |
3234 } | |
3235 } | |
3236 { | |
3237 int src_row = 0; | |
3238 int y; | |
3239 for (y = 0; y < dst_height; ++y) { | |
3240 switch (src_row) { | |
3241 case 0: | |
3242 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); | |
3243 break; | |
3244 | |
3245 case 1: | |
3246 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); | |
3247 break; | |
3248 | |
3249 case 2: | |
3250 ScaleRowDown34_0(src_ptr + src_stride, -src_stride, | |
3251 dst_ptr, dst_width); | |
3252 break; | |
3253 } | |
3254 ++src_row; | |
3255 src_ptr += src_stride; | |
3256 dst_ptr += dst_stride; | |
3257 if (src_row >= 3) { | |
3258 src_ptr += src_stride; | |
3259 src_row = 0; | |
3260 } | |
3261 } | |
3262 } | |
3263 } | |
3264 | |
3265 /** | |
3266 * Scale plane, 3/8 | |
3267 * | |
3268 * This is an optimized version for scaling down a plane to 3/8 | |
3269 * of its original size. | |
3270 * | |
3271 * Reduces 16x3 to 6x1 | |
3272 */ | |
3273 static void ScalePlaneDown38(int src_width, int src_height, | |
3274 int dst_width, int dst_height, | |
3275 int src_stride, int dst_stride, | |
3276 const uint8* src_ptr, uint8* dst_ptr, | |
3277 FilterModeEnum filtering) { | |
3278 void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, | |
3279 uint8* dst_ptr, int dst_width); | |
3280 void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, | |
3281 uint8* dst_ptr, int dst_width); | |
3282 assert(dst_width % 3 == 0); | |
3283 #if defined(HAS_SCALEROWDOWN38_NEON) | |
3284 if (TestCpuFlag(kCpuHasNEON) && | |
3285 (dst_width % 12 == 0)) { | |
3286 if (!filtering) { | |
3287 ScaleRowDown38_3 = ScaleRowDown38_NEON; | |
3288 ScaleRowDown38_2 = ScaleRowDown38_NEON; | |
3289 } else { | |
3290 ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; | |
3291 ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; | |
3292 } | |
3293 } else | |
3294 #endif | |
3295 | |
3296 #if defined(HAS_SCALEROWDOWN38_SSSE3) | |
3297 if (TestCpuFlag(kCpuHasSSSE3) && | |
3298 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && | |
3299 IS_ALIGNED(dst_stride, 8) && | |
3300 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { | |
3301 if (!filtering) { | |
3302 ScaleRowDown38_3 = ScaleRowDown38_SSSE3; | |
3303 ScaleRowDown38_2 = ScaleRowDown38_SSSE3; | |
3304 } else { | |
3305 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; | |
3306 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; | |
3307 } | |
3308 } else | |
3309 #endif | |
3310 { | |
3311 if (!filtering) { | |
3312 ScaleRowDown38_3 = ScaleRowDown38_C; | |
3313 ScaleRowDown38_2 = ScaleRowDown38_C; | |
3314 } else { | |
3315 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; | |
3316 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; | |
3317 } | |
3318 } | |
3319 { | |
3320 int src_row = 0; | |
3321 int y; | |
3322 for (y = 0; y < dst_height; ++y) { | |
3323 switch (src_row) { | |
3324 case 0: | |
3325 case 1: | |
3326 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); | |
3327 src_ptr += src_stride * 3; | |
3328 ++src_row; | |
3329 break; | |
3330 | |
3331 case 2: | |
3332 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); | |
3333 src_ptr += src_stride * 2; | |
3334 src_row = 0; | |
3335 break; | |
3336 } | |
3337 dst_ptr += dst_stride; | |
3338 } | |
3339 } | |
3340 } | |
3341 | |
3342 __inline static uint32 SumBox(int iboxwidth, int iboxheight, | |
3343 int src_stride, const uint8* src_ptr) { | |
3344 int x, y; | |
3345 uint32 sum; | |
3346 assert(iboxwidth > 0); | |
3347 assert(iboxheight > 0); | |
3348 sum = 0u; | |
3349 for (y = 0; y < iboxheight; ++y) { | |
3350 for (x = 0; x < iboxwidth; ++x) { | |
3351 sum += src_ptr[x]; | |
3352 } | |
3353 src_ptr += src_stride; | |
3354 } | |
3355 return sum; | |
3356 } | |
3357 | |
3358 static void ScalePlaneBoxRow(int dst_width, int boxheight, | |
3359 int dx, int src_stride, | |
3360 const uint8* src_ptr, uint8* dst_ptr) { | |
3361 int x = 0; | |
3362 int i; | |
3363 for (i = 0; i < dst_width; ++i) { | |
3364 int ix = x >> 16; | |
3365 int boxwidth; | |
3366 x += dx; | |
3367 boxwidth = (x >> 16) - ix; | |
3368 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / | |
3369 (boxwidth * boxheight); | |
3370 } | |
3371 } | |
3372 | |
3373 __inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { | |
3374 uint32 sum; | |
3375 int x; | |
3376 assert(iboxwidth > 0); | |
3377 sum = 0u; | |
3378 for (x = 0; x < iboxwidth; ++x) { | |
3379 sum += src_ptr[x]; | |
3380 } | |
3381 return sum; | |
3382 } | |
3383 | |
// Horizontal pass of the box filter for fractional scale factors: each
// output pixel averages either minboxwidth or minboxwidth + 1 columns of
// the 16 bit row sums, selected by how many whole source columns the
// 16.16 position crosses.
static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  // Precomputed 16.16 reciprocals of the two possible box areas.
  int scaletbl[2];
  int minboxwidth = (dx >> 16);
  scaletbl[0] = 65536 / (minboxwidth * boxheight);
  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
  {
    // Biased pointer so scaleptr[boxwidth] maps onto
    // scaletbl[boxwidth - minboxwidth] (boxwidth is always minboxwidth or
    // minboxwidth + 1).  NOTE(review): forming a pointer outside the array
    // is technically undefined in C, though it works on the targeted
    // compilers; confirm before reuse.
    int *scaleptr = scaletbl - minboxwidth;
    int x = 0;
    int i;
    for (i = 0; i < dst_width; ++i) {
      int ix = x >> 16;
      int boxwidth;
      x += dx;
      boxwidth = (x >> 16) - ix;
      *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
    }
  }
}
3403 | |
3404 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, | |
3405 const uint16* src_ptr, uint8* dst_ptr) { | |
3406 int boxwidth = (dx >> 16); | |
3407 int scaleval = 65536 / (boxwidth * boxheight); | |
3408 int x = 0; | |
3409 int i; | |
3410 for (i = 0; i < dst_width; ++i) { | |
3411 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; | |
3412 x += boxwidth; | |
3413 } | |
3414 } | |
3415 | |
3416 /** | |
3417 * Scale plane down to any dimensions, with interpolation. | |
3418 * (boxfilter). | |
3419 * | |
3420 * Same method as SimpleScale, which is fixed point, outputting | |
3421 * one pixel of destination using fixed point (16.16) to step | |
3422 * through source, sampling a box of pixel with simple | |
3423 * averaging. | |
3424 */ | |
3425 static void ScalePlaneBox(int src_width, int src_height, | |
3426 int dst_width, int dst_height, | |
3427 int src_stride, int dst_stride, | |
3428 const uint8* src_ptr, uint8* dst_ptr) { | |
3429 int dx, dy; | |
3430 assert(dst_width > 0); | |
3431 assert(dst_height > 0); | |
3432 dy = (src_height << 16) / dst_height; | |
3433 dx = (src_width << 16) / dst_width; | |
3434 if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || | |
3435 dst_height * 2 > src_height) { | |
3436 uint8* dst = dst_ptr; | |
3437 int dy = (src_height << 16) / dst_height; | |
3438 int dx = (src_width << 16) / dst_width; | |
3439 int y = 0; | |
3440 int j; | |
3441 for (j = 0; j < dst_height; ++j) { | |
3442 int iy = y >> 16; | |
3443 const uint8* const src = src_ptr + iy * src_stride; | |
3444 int boxheight; | |
3445 y += dy; | |
3446 if (y > (src_height << 16)) { | |
3447 y = (src_height << 16); | |
3448 } | |
3449 boxheight = (y >> 16) - iy; | |
3450 ScalePlaneBoxRow(dst_width, boxheight, | |
3451 dx, src_stride, | |
3452 src, dst); | |
3453 | |
3454 dst += dst_stride; | |
3455 } | |
3456 } else { | |
3457 ALIGN16(uint16 row[kMaxInputWidth]); | |
3458 void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, | |
3459 uint16* dst_ptr, int src_width, int src_height); | |
3460 void (*ScaleAddCols)(int dst_width, int boxheight, int dx, | |
3461 const uint16* src_ptr, uint8* dst_ptr); | |
3462 #if defined(HAS_SCALEADDROWS_SSE2) | |
3463 if (TestCpuFlag(kCpuHasSSE2) && | |
3464 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && | |
3465 IS_ALIGNED(src_width, 16)) { | |
3466 ScaleAddRows = ScaleAddRows_SSE2; | |
3467 } else | |
3468 #endif | |
3469 { | |
3470 ScaleAddRows = ScaleAddRows_C; | |
3471 } | |
3472 if (dx & 0xffff) { | |
3473 ScaleAddCols = ScaleAddCols2_C; | |
3474 } else { | |
3475 ScaleAddCols = ScaleAddCols1_C; | |
3476 } | |
3477 | |
3478 { | |
3479 int y = 0; | |
3480 int j; | |
3481 for (j = 0; j < dst_height; ++j) { | |
3482 int iy = y >> 16; | |
3483 const uint8* const src = src_ptr + iy * src_stride; | |
3484 int boxheight; | |
3485 y += dy; | |
3486 if (y > (src_height << 16)) { | |
3487 y = (src_height << 16); | |
3488 } | |
3489 boxheight = (y >> 16) - iy; | |
3490 ScaleAddRows(src, src_stride, row, src_width, boxheight); | |
3491 ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); | |
3492 dst_ptr += dst_stride; | |
3493 } | |
3494 } | |
3495 } | |
3496 } | |
3497 | |
3498 /** | |
3499 * Scale plane to/from any dimensions, with interpolation. | |
3500 */ | |
3501 static void ScalePlaneBilinearSimple(int src_width, int src_height, | |
3502 int dst_width, int dst_height, | |
3503 int src_stride, int dst_stride, | |
3504 const uint8* src_ptr, uint8* dst_ptr) { | |
3505 int i, j; | |
3506 uint8* dst = dst_ptr; | |
3507 int dx = (src_width << 16) / dst_width; | |
3508 int dy = (src_height << 16) / dst_height; | |
3509 int maxx = ((src_width - 1) << 16) - 1; | |
3510 int maxy = ((src_height - 1) << 16) - 1; | |
3511 int y = (dst_height < src_height) ? 32768 : | |
3512 (src_height << 16) / dst_height - 32768; | |
3513 for (i = 0; i < dst_height; ++i) { | |
3514 int cy = (y < 0) ? 0 : y; | |
3515 int yi = cy >> 16; | |
3516 int yf = cy & 0xffff; | |
3517 const uint8* const src = src_ptr + yi * src_stride; | |
3518 int x = (dst_width < src_width) ? 32768 : | |
3519 (src_width << 16) / dst_width - 32768; | |
3520 for (j = 0; j < dst_width; ++j) { | |
3521 int cx = (x < 0) ? 0 : x; | |
3522 int xi = cx >> 16; | |
3523 int xf = cx & 0xffff; | |
3524 int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; | |
3525 int r1 = (src[xi + src_stride] * (65536 - xf) + | |
3526 src[xi + src_stride + 1] * xf) >> 16; | |
3527 *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; | |
3528 x += dx; | |
3529 if (x > maxx) | |
3530 x = maxx; | |
3531 } | |
3532 dst += dst_stride - dst_width; | |
3533 y += dy; | |
3534 if (y > maxy) | |
3535 y = maxy; | |
3536 } | |
3537 } | |
3538 | |
3539 /** | |
3540 * Scale plane to/from any dimensions, with bilinear | |
3541 * interpolation. | |
3542 */ | |
3543 static void ScalePlaneBilinear(int src_width, int src_height, | |
3544 int dst_width, int dst_height, | |
3545 int src_stride, int dst_stride, | |
3546 const uint8* src_ptr, uint8* dst_ptr) { | |
3547 int dy; | |
3548 int dx; | |
3549 assert(dst_width > 0); | |
3550 assert(dst_height > 0); | |
3551 dy = (src_height << 16) / dst_height; | |
3552 dx = (src_width << 16) / dst_width; | |
3553 if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { | |
3554 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, | |
3555 src_stride, dst_stride, src_ptr, dst_ptr); | |
3556 | |
3557 } else { | |
3558 ALIGN16(uint8 row[kMaxInputWidth + 1]); | |
3559 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, | |
3560 int src_stride, | |
3561 int dst_width, int source_y_fraction); | |
3562 void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, | |
3563 int dst_width, int dx); | |
3564 #if defined(HAS_SCALEFILTERROWS_SSSE3) | |
3565 if (TestCpuFlag(kCpuHasSSSE3) && | |
3566 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && | |
3567 IS_ALIGNED(src_width, 16)) { | |
3568 ScaleFilterRows = ScaleFilterRows_SSSE3; | |
3569 } else | |
3570 #endif | |
3571 #if defined(HAS_SCALEFILTERROWS_SSE2) | |
3572 if (TestCpuFlag(kCpuHasSSE2) && | |
3573 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && | |
3574 IS_ALIGNED(src_width, 16)) { | |
3575 ScaleFilterRows = ScaleFilterRows_SSE2; | |
3576 } else | |
3577 #endif | |
3578 { | |
3579 ScaleFilterRows = ScaleFilterRows_C; | |
3580 } | |
3581 ScaleFilterCols = ScaleFilterCols_C; | |
3582 | |
3583 { | |
3584 int y = 0; | |
3585 int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. | |
3586 int j; | |
3587 for (j = 0; j < dst_height; ++j) { | |
3588 int iy = y >> 16; | |
3589 int fy = (y >> 8) & 255; | |
3590 const uint8* const src = src_ptr + iy * src_stride; | |
3591 ScaleFilterRows(row, src, src_stride, src_width, fy); | |
3592 ScaleFilterCols(dst_ptr, row, dst_width, dx); | |
3593 dst_ptr += dst_stride; | |
3594 y += dy; | |
3595 if (y > maxy) { | |
3596 y = maxy; | |
3597 } | |
3598 } | |
3599 } | |
3600 } | |
3601 } | |
3602 | |
3603 /** | |
3604 * Scale plane to/from any dimensions, without interpolation. | |
3605 * Fixed point math is used for performance: The upper 16 bits | |
3606 * of x and dx is the integer part of the source position and | |
3607 * the lower 16 bits are the fixed decimal part. | |
3608 */ | |
3609 static void ScalePlaneSimple(int src_width, int src_height, | |
3610 int dst_width, int dst_height, | |
3611 int src_stride, int dst_stride, | |
3612 const uint8* src_ptr, uint8* dst_ptr) { | |
3613 uint8* dst = dst_ptr; | |
3614 int dx = (src_width << 16) / dst_width; | |
3615 int y; | |
3616 for (y = 0; y < dst_height; ++y) { | |
3617 const uint8* const src = src_ptr + (y * src_height / dst_height) * | |
3618 src_stride; | |
3619 // TODO(fbarchard): Round X coordinate by setting x=0x8000. | |
3620 int x = 0; | |
3621 int i; | |
3622 for (i = 0; i < dst_width; ++i) { | |
3623 *dst++ = src[x >> 16]; | |
3624 x += dx; | |
3625 } | |
3626 dst += dst_stride - dst_width; | |
3627 } | |
3628 } | |
3629 | |
3630 /** | |
3631 * Scale plane to/from any dimensions. | |
3632 */ | |
3633 static void ScalePlaneAnySize(int src_width, int src_height, | |
3634 int dst_width, int dst_height, | |
3635 int src_stride, int dst_stride, | |
3636 const uint8* src_ptr, uint8* dst_ptr, | |
3637 FilterModeEnum filtering) { | |
3638 if (!filtering) { | |
3639 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, | |
3640 src_stride, dst_stride, src_ptr, dst_ptr); | |
3641 } else { | |
3642 // fall back to non-optimized version | |
3643 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, | |
3644 src_stride, dst_stride, src_ptr, dst_ptr); | |
3645 } | |
3646 } | |
3647 | |
3648 /** | |
3649 * Scale plane down, any size | |
3650 * | |
3651 * This is an optimized version for scaling down a plane to any size. | |
3652 * The current implementation is ~10 times faster compared to the | |
3653 * reference implementation for e.g. XGA->LowResPAL | |
3654 * | |
3655 */ | |
3656 static void ScalePlaneDown(int src_width, int src_height, | |
3657 int dst_width, int dst_height, | |
3658 int src_stride, int dst_stride, | |
3659 const uint8* src_ptr, uint8* dst_ptr, | |
3660 FilterModeEnum filtering) { | |
3661 if (!filtering) { | |
3662 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, | |
3663 src_stride, dst_stride, src_ptr, dst_ptr); | |
3664 } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { | |
3665 // between 1/2x and 1x use bilinear | |
3666 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, | |
3667 src_stride, dst_stride, src_ptr, dst_ptr); | |
3668 } else { | |
3669 ScalePlaneBox(src_width, src_height, dst_width, dst_height, | |
3670 src_stride, dst_stride, src_ptr, dst_ptr); | |
3671 } | |
3672 } | |
3673 | |
3674 /** | |
3675 * Copy plane, no scaling | |
3676 * | |
3677 * This simply copies the given plane without scaling. | |
3678 * The current implementation is ~115 times faster | |
3679 * compared to the reference implementation. | |
3680 * | |
3681 */ | |
3682 static void CopyPlane(int src_width, int src_height, | |
3683 int dst_width, int dst_height, | |
3684 int src_stride, int dst_stride, | |
3685 const uint8* src_ptr, uint8* dst_ptr) { | |
3686 if (src_stride == src_width && dst_stride == dst_width) { | |
3687 // All contiguous, so can use REALLY fast path. | |
3688 memcpy(dst_ptr, src_ptr, src_width * src_height); | |
3689 } else { | |
3690 // Not all contiguous; must copy scanlines individually | |
3691 const uint8* src = src_ptr; | |
3692 uint8* dst = dst_ptr; | |
3693 int i; | |
3694 for (i = 0; i < src_height; ++i) { | |
3695 memcpy(dst, src, src_width); | |
3696 dst += dst_stride; | |
3697 src += src_stride; | |
3698 } | |
3699 } | |
3700 } | |
3701 | |
// Dispatches one plane to the most specialized scaler available for the
// requested ratio.  use_ref forces the generic ScalePlaneDown path for
// comparison/testing.
static void ScalePlane(const uint8* src, int src_stride,
                       int src_width, int src_height,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height,
                       FilterModeEnum filtering, int use_ref) {
  // Use specialized scales to improve performance for common resolutions.
  // For example, all the 1/2 scalings will use ScalePlaneDown2()
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
              dst_stride, src, dst);
  } else if (dst_width <= src_width && dst_height <= src_height) {
    // Scale down.
    if (use_ref) {
      // For testing, allow the optimized versions to be disabled.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == 3 * src_width &&
               4 * dst_height == 3 * src_height) {
      // optimized, 3/4
      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // optimized, 1/2
      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    // 3/8 rounded up for odd sized chroma height.
    } else if (8 * dst_width == 3 * src_width &&
               dst_height == ((src_height * 3 + 7) / 8)) {
      // optimized, 3/8
      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
      // optimized, 1/4
      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
      // optimized, 1/8
      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else {
      // Arbitrary downsample
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    }
  } else {
    // Arbitrary scale up and/or down.
    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
  }
}
3753 | |
3754 /** | |
3755 * Scale a plane. | |
3756 * | |
3757 * This function in turn calls a scaling function | |
3758 * suitable for handling the desired resolutions. | |
3759 * | |
3760 */ | |
3761 | |
3762 int I420Scale(const uint8* src_y, int src_stride_y, | |
3763 const uint8* src_u, int src_stride_u, | |
3764 const uint8* src_v, int src_stride_v, | |
3765 int src_width, int src_height, | |
3766 uint8* dst_y, int dst_stride_y, | |
3767 uint8* dst_u, int dst_stride_u, | |
3768 uint8* dst_v, int dst_stride_v, | |
3769 int dst_width, int dst_height, | |
3770 FilterModeEnum filtering) { | |
3771 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || | |
3772 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { | |
3773 return -1; | |
3774 } | |
3775 // Negative height means invert the image. | |
3776 if (src_height < 0) { | |
3777 int halfheight; | |
3778 src_height = -src_height; | |
3779 halfheight = (src_height + 1) >> 1; | |
3780 src_y = src_y + (src_height - 1) * src_stride_y; | |
3781 src_u = src_u + (halfheight - 1) * src_stride_u; | |
3782 src_v = src_v + (halfheight - 1) * src_stride_v; | |
3783 src_stride_y = -src_stride_y; | |
3784 src_stride_u = -src_stride_u; | |
3785 src_stride_v = -src_stride_v; | |
3786 } | |
3787 { | |
3788 int src_halfwidth = (src_width + 1) >> 1; | |
3789 int src_halfheight = (src_height + 1) >> 1; | |
3790 int dst_halfwidth = (dst_width + 1) >> 1; | |
3791 int dst_halfheight = (dst_height + 1) >> 1; | |
3792 | |
3793 ScalePlane(src_y, src_stride_y, src_width, src_height, | |
3794 dst_y, dst_stride_y, dst_width, dst_height, | |
3795 filtering, use_reference_impl_); | |
3796 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, | |
3797 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, | |
3798 filtering, use_reference_impl_); | |
3799 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, | |
3800 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, | |
3801 filtering, use_reference_impl_); | |
3802 } | |
3803 return 0; | |
3804 } | |
3805 | |
// Deprecated api
// Same as I420Scale but with a different argument order and a boolean
// interpolate flag (box filter when set) instead of a FilterModeEnum.
// Returns 0 on success, -1 on invalid arguments.
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
          int dst_stride_y, int dst_stride_u, int dst_stride_v,
          int dst_width, int dst_height,
          int interpolate) {
  // src_height == 0 is rejected but negative is allowed (inversion below).
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    int halfheight;
    src_height = -src_height;
    halfheight = (src_height + 1) >> 1;
    // Point at the last row of each plane and walk upward via negative
    // strides.
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  {
    // Chroma planes are half size, rounded up for odd dimensions.
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;
    FilterModeEnum filtering = interpolate ? kFilterBox : kFilterNone;

    ScalePlane(src_y, src_stride_y, src_width, src_height,
               dst_y, dst_stride_y, dst_width, dst_height,
               filtering, use_reference_impl_);
    ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
    ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
  }
  return 0;
}
3849 | |
// Deprecated api
// Scales a packed I420 buffer (Y plane followed by U then V) into a packed
// destination, vertically centered by leaving dst_yoffset rows blank at the
// top (and, via aheight, the bottom).  Returns 0 on success, -1 on invalid
// arguments (propagates Scale()'s result otherwise).
int ScaleOffset(const uint8* src, int src_width, int src_height,
                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
                int interpolate) {
  if (!src || src_width <= 0 || src_height <= 0 ||
      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
      dst_yoffset >= dst_height) {
    return -1;
  }
  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
  {
    // Chroma planes are half size, rounded up for odd dimensions.
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;
    int aheight = dst_height - dst_yoffset * 2;  // actual output height
    // Plane base pointers within the packed I420 layout (Y, then U, then V).
    const uint8* const src_y = src;
    const uint8* const src_u = src + src_width * src_height;
    const uint8* const src_v = src + src_width * src_height +
                               src_halfwidth * src_halfheight;
    // Destination pointers advanced past the dst_yoffset blank rows
    // (dst_yoffset / 2 rows for the half-height chroma planes).
    uint8* dst_y = dst + dst_yoffset * dst_width;
    uint8* dst_u = dst + dst_width * dst_height +
                   (dst_yoffset >> 1) * dst_halfwidth;
    uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
                   (dst_yoffset >> 1) * dst_halfwidth;
    return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
                 src_width, src_height, dst_y, dst_u, dst_v, dst_width,
                 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
  }
}
3880 | |
3881 #ifdef __cplusplus | |
3882 } // extern "C" | |
3883 } // namespace libyuv | |
3884 #endif | |
OLD | NEW |