src/opts/SkBitmapProcState_matrix_repeat_neon.h - Issue 23835006: ARM Skia NEON patches - 20 - New improved BitmapProcState code

Side by Side Diff: src/opts/SkBitmapProcState_matrix_repeat_neon.h

Issue 23835006: ARM Skia NEON patches - 20 - New improved BitmapProcState code (Closed) Base URL: https://skia.googlecode.com/svn/trunk

Patch Set: Rebase to remove conflicts on ignored-tests.txt Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola

2 *

3 * Use of this source code is governed by a BSD-style license that can be

4 * found in the LICENSE file.

5 */

6

7 /*

8 * Modifications done in-house at Motorola

9 *

10 * this is a clone of SkBitmapProcState_matrix.h

11 * and has been tuned to work with the NEON unit.

12 *

13 * Still going back and forth between whether this approach

14 * (clone the entire SkBitmapProcState_matrix.h file or

15 * if I should put just the modified routines in here and

16 * then use a construct like #define DONT_DO_THIS_FUNCTION or

17 * something like that...

18 *

19 * This is for the RepeatX_RepeatY part of the world

20 */

21

22

23 #include <arm_neon.h>

24

25 /*

26 * This has been modified on the knowledge that (at the time)

27 * we had the following macro definitions in the parent file

28 *

29 * #define MAKENAME(suffix) RepeatX_RepeatY ## suffix

30 * #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)

31 * #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16)

32 * #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)

33 * #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)

34 */

35

36 /* SkClampMax(val,max) -- bound to 0..max */

37

38 #define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)

39 #define SCALE_FILTER_NAME MAKENAME(_filter_scale)

40 #define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)

41 #define AFFINE_FILTER_NAME MAKENAME(_filter_affine)

42 #define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)

43 #define PERSP_FILTER_NAME MAKENAME(_filter_persp)

44

45 #define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)

46 #define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)

47

48 #ifndef PREAMBLE

49 #define PREAMBLE(state)

50 #define PREAMBLE_PARAM_X

51 #define PREAMBLE_PARAM_Y

52 #define PREAMBLE_ARG_X

53 #define PREAMBLE_ARG_Y

54 #endif

55

56 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,

57 uint32_t xy[], int count, int x, int y) {

58 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|

59 SkMatrix::kScale_Mask)) == 0);

60

61 PREAMBLE(s);

62 // we store y, x, x, x, x, x

63

64 const unsigned maxX = s.fBitmap->width() - 1;

65 SkFixed fx;

66 {

67 SkPoint pt;

68 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,

69 SkIntToScalar(y) + SK_ScalarHalf, &pt);

70 fx = SkScalarToFixed(pt.fY);

71 const unsigned maxY = s.fBitmap->height() - 1;

72 *xy++ = TILEY_PROCF(fx, maxY);

73 fx = SkScalarToFixed(pt.fX);

74 }

75

76 if (0 == maxX) {

77 // all of the following X values must be 0

78 memset(xy, 0, count * sizeof(uint16_t));

79 return;

80 }

81

82 const SkFixed dx = s.fInvSx;

83

84 #ifdef CHECK_FOR_DECAL

85 // test if we don't need to apply the tile proc

86 if ((unsigned)(fx >> 16) <= maxX &&

87 (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {

88 decal_nofilter_scale_neon(xy, fx, dx, count);

89 } else

90 #endif

91 {

92 int i;

93

94 /* RBE: very much like done in decal_nofilter ,

95 * but some processing of the 'fx' information

96 * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)

97 */

98 if (count >= 8) {

99 /* SkFixed is 16.16 fixed point */

100 SkFixed dx2 = dx+dx;

101 SkFixed dx4 = dx2+dx2;

102 SkFixed dx8 = dx4+dx4;

103

104 /* now build fx/fx+dx/fx+2dx/fx+3dx */

105 SkFixed fx1, fx2, fx3;

106 int32x4_t lbase, hbase;

107 int16_t dst16 = (int16_t )xy;

108

109 fx1 = fx+dx;

110 fx2 = fx1+dx;

111 fx3 = fx2+dx;

112

113 lbase = vdupq_n_s32(fx);

114 lbase = vsetq_lane_s32(fx1, lbase, 1);

115 lbase = vsetq_lane_s32(fx2, lbase, 2);

116 lbase = vsetq_lane_s32(fx3, lbase, 3);

117 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));

118

119 /* store & bump */

120 do

121 {

122 int32x4_t lout;

123 int32x4_t hout;

124 int16x8_t hi16;

125

126 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)((max)+1)>> 16) /

127 /* mask to low 16 [would like to use uzp tricks) */

128 lout = vandq_s32(lbase, vdupq_n_s32(0xffff));

129 hout = vandq_s32(hbase, vdupq_n_s32(0xffff));

130 /* bare multiplication, not SkFixedMul */

131 lout = vmulq_s32(lout, vdupq_n_s32(maxX+1));

132 hout = vmulq_s32(hout, vdupq_n_s32(maxX+1));

133

134 /* extraction, using uzp */

135 /* this is ok -- we want all hi(lout)s then all hi(hout)s */

136 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));

137 hi16 = vreinterpretq_s16_s32(hout);

138 vst1q_s16(dst16, hi16);

139

140 /* bump our base on to the next */

141 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));

142 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));

143 dst16 += 8;

144 count -= 8;

145 fx += dx8;

146 } while (count >= 8);

147 xy = (uint32_t *) dst16;

148 }

149 uint16_t* xx = (uint16_t*)xy;

150 for (i = count; i > 0; --i) {

151 *xx++ = TILEX_PROCF(fx, maxX); fx += dx;

152 }

153 }

154 }

155

156 // note: we could special-case on a matrix which is skewed in X but not Y.

157 // this would require a more general setup thatn SCALE does, but could use

158 // SCALE's inner loop that only looks at dx

159

160

161 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,

162 uint32_t xy[], int count, int x, int y) {

163 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);

164 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|

165 SkMatrix::kScale_Mask \|

166 SkMatrix::kAffine_Mask)) == 0);

167

168 PREAMBLE(s);

169 SkPoint srcPt;

170 s.fInvProc(s.fInvMatrix,

171 SkIntToScalar(x) + SK_ScalarHalf,

172 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);

173

174 SkFixed fx = SkScalarToFixed(srcPt.fX);

175 SkFixed fy = SkScalarToFixed(srcPt.fY);

176 SkFixed dx = s.fInvSx;

177 SkFixed dy = s.fInvKy;

178 int maxX = s.fBitmap->width() - 1;

179 int maxY = s.fBitmap->height() - 1;

180

181 #if 0

182 int ocount = count;

183 uint32_t *oxy = xy;

184 SkFixed bfx = fx, bfy=fy, bdx=dx, bdy=dy;

185 #endif

186

187

188 if (0) { extern void rbe(void); rbe(); }

189

190 /* RBE: benchmarks show this eats up time; can we neonize it? */

191 /* RBE: very much like done in decal_nofilter ,

192 * but some processing of the 'fx' information

193 * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)

194 */

195 if (count >= 4) {

196 /* SkFixed is 16.16 fixed point */

197 SkFixed dx4 = dx*4;

198 SkFixed dy4 = dy*4;

199

200 /* now build fx/fx+dx/fx+2dx/fx+3dx */

201 int32x4_t xbase, ybase;

202 int16_t dst16 = (int16_t )xy;

203

204 /* synthesize 4x for both X and Y */

205 xbase = vdupq_n_s32(fx);

206 xbase = vsetq_lane_s32(fx+dx, xbase, 1);

207 xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);

208 xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);

209

210 ybase = vdupq_n_s32(fy);

211 ybase = vsetq_lane_s32(fy+dy, ybase, 1);

212 ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);

213 ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);

214

215 /* store & bump */

216 do {

217 int32x4_t xout;

218 int32x4_t yout;

219 int16x8_t hi16;

220

221 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)((max)+1)>> 16) /

222 /* mask to low 16 [would like to use uzp tricks) */

223 xout = vandq_s32(xbase, vdupq_n_s32(0xffff));

224 yout = vandq_s32(ybase, vdupq_n_s32(0xffff));

225 /* bare multiplication, not SkFixedMul */

226 xout = vmulq_s32(xout, vdupq_n_s32(maxX+1));

227 yout = vmulq_s32(yout, vdupq_n_s32(maxY+1));

228

229 /* put hi16 from xout over low16 from yout */

230 yout = vsriq_n_s32(yout, xout, 16);

231

232 /* and then yout has the interleaved upper 16's */

233 hi16 = vreinterpretq_s16_s32(yout);

234 vst1q_s16(dst16, hi16);

235

236 /* bump preserved base & on to the next */

237 xbase = vaddq_s32 (xbase, vdupq_n_s32(dx4));

238 ybase = vaddq_s32 (ybase, vdupq_n_s32(dy4));

239 dst16 += 8; /* 8 x16 aka 4x32 */

240 count -= 4;

241 fx += dx4;

242 fy += dy4;

243 } while (count >= 4);

244 xy = (uint32_t *) dst16;

245 }

246

247 #if 0

248 /* diagnostics... see whether we agree with the NEON code */

249 int bad = 0;

250 uint32_t *myxy = oxy;

251 int myi = (-1);

252 SkFixed ofx = bfx, ofy= bfy, odx= bdx, ody= bdy;

253 for (myi = ocount; myi > 0; --myi) {

254 uint32_t val = (TILEY_PROCF(ofy, maxY) << 16) \| TILEX_PROCF(ofx, maxX);

255 if (val != *myxy++) {

256 bad++;

257 break;

258 }

259 ofx += odx; ofy += ody;

260 }

261 if (bad) {

262 SkDebugf("repeat-nofilter-affine fails\n");

263 SkDebugf("count %d myi %d\n", ocount, myi);

264 SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",

265 bfx, bdx, bfy, bdy);

266 SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);

267 }

268 #endif

269

270 for (int i = count; i > 0; --i) {

271 /* fx, fy, dx, dy are all 32 bit 16.16 fixed point */

272 /* (((fx) & 0xFFFF) * ((max) + 1) >> 16) */

273 *xy++ = (TILEY_PROCF(fy, maxY) << 16) \| TILEX_PROCF(fx, maxX);

274 fx += dx; fy += dy;

275 }

276 }

277

278 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,

279 uint32_t* SK_RESTRICT xy,

280 int count, int x, int y) {

281 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

282

283 PREAMBLE(s);

284 int maxX = s.fBitmap->width() - 1;

285 int maxY = s.fBitmap->height() - 1;

286

287 SkPerspIter iter(s.fInvMatrix,

288 SkIntToScalar(x) + SK_ScalarHalf,

289 SkIntToScalar(y) + SK_ScalarHalf, count);

290

291 while ((count = iter.next()) != 0) {

292 const SkFixed* SK_RESTRICT srcXY = iter.getXY();

293

294 /* RBE: */

295 /* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */

296 /* it's a little more complicated than what I did for the

297 * clamp case -- where I could immediately snip to the top

298 * 16 bits and do my min/max games there.

299 * ... might only be able to get 4x unrolling here

300 */

301

302 /* vld2 to get a set of 32x4's ... */

303 /* do the tile[xy]_procf operations */

304 /* which includes doing vuzp to get hi16's */

305 /* store it */

306 /* -- inner loop (other than vld2) can be had from above */

307

308 /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...

309 * but we immediately discard the low 16 bits...

310 * so what we're going to do is vld4, which will give us

311 * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo'

312 * parts....

313 */

314 if (0) { extern void rbe(void); rbe(); }

315 if (count >= 8) {

316 int32_t mysrc = (int32_t ) srcXY;

317 int16_t mydst = (int16_t ) xy;

318 do {

319 int32x4_t x, y, x2, y2;

320 int16x8_t hi, hi2;

321

322 /* read array of x,y,x,y,x,y */

323 /* vld2 does the de-interleaving for us */

324 /* isolate reg-bound scopes; gcc will minimize register

325 * motion if possible; this ensures that we don't lose

326 * a register across a debugging call because it happens

327 * to be bound into a call-clobbered register

328 */

329 {

330 register int32x4_t q0 asm("q0");

331 register int32x4_t q1 asm("q1");

332 asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"

333 : "=w" (q0), "=w" (q1)

334 : "r" (mysrc)

335 );

336 x = q0; y = q1;

337 }

338

339 /* offset == 256 bits == 32 bytes == 8 longs */

340 {

341 register int32x4_t q2 asm("q2");

342 register int32x4_t q3 asm("q3");

343 asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"

344 : "=w" (q2), "=w" (q3)

345 : "r" (mysrc+8)

346 );

347 x2 = q2; y2 = q3;

348 }

349

350 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)((max)+1)>> 16) /

351 /* mask to low 16 [would like to use uzp tricks) */

352 /* bare multiplication, not SkFixedMul */

353 x = vandq_s32(x, vdupq_n_s32(0xffff));

354 x = vmulq_s32(x, vdupq_n_s32(maxX+1));

355 y = vandq_s32(y, vdupq_n_s32(0xffff));

356 y = vmulq_s32(y, vdupq_n_s32(maxY+1));

357

358 x2 = vandq_s32(x2, vdupq_n_s32(0xffff));

359 x2 = vmulq_s32(x2, vdupq_n_s32(maxX+1));

360 y2 = vandq_s32(y2, vdupq_n_s32(0xffff));

361 y2 = vmulq_s32(y2, vdupq_n_s32(maxY+1));

362

363 /* now collect interleaved high 16's */

364 /* (hi-x, hi-y)4 (hi-x2; hi-y2)4 */

365

366 /* extraction, using uzp, leaves hi16's in y */

367 y = vsriq_n_s32(y, x, 16);

368 hi = vreinterpretq_s16_s32(y);

369 vst1q_s16(mydst, hi);

370

371 /* and likewise for the second 8 entries */

372 y2 = vsriq_n_s32(y2, x2, 16);

373 hi2 = vreinterpretq_s16_s32(y2);

374 vst1q_s16(mydst+8, hi2);

375

376 /* XXX: gcc isn't interleaving these with the NEON ops

377 * but i think that all the scoreboarding works out */

378 count -= 8; /* 8 iterations */

379 mysrc += 16; /* 16 longs */

380 mydst += 16; /* 16 shorts, aka 8 longs */

381 } while (count >= 8);

382 /* get xy and srcXY fixed up */

383 srcXY = (const SkFixed *) mysrc;

384 xy = (uint32_t *) mydst;

385 }

386 while (--count >= 0) {

387 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) \|

388 TILEX_PROCF(srcXY[0], maxX);

389 srcXY += 2;

390 }

391 }

392 }

393

394 //////////////////////////////////////////////////////////////////////////////

395

396 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,

397 SkFixed one PREAMBLE_PARAM_Y) {

398 unsigned i = TILEY_PROCF(f, max);

399 i = (i << 4) \| TILEY_LOW_BITS(f, max);

400 return (i << 14) \| (TILEY_PROCF((f + one), max));

401 }

402

403 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,

404 SkFixed one PREAMBLE_PARAM_X) {

405 unsigned i = TILEX_PROCF(f, max);

406 i = (i << 4) \| TILEX_LOW_BITS(f, max);

407 return (i << 14) \| (TILEX_PROCF((f + one), max));

408 }

409

410 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,

411 uint32_t xy[], int count, int x, int y) {

412 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|

413 SkMatrix::kScale_Mask)) == 0);

414 SkASSERT(s.fInvKy == 0);

415

416 PREAMBLE(s);

417

418 const unsigned maxX = s.fBitmap->width() - 1;

419 const SkFixed one = s.fFilterOneX;

420 const SkFractionalInt dx = s.fInvSxFractionalInt;

421 SkFractionalInt fx;

422

423 {

424 SkPoint pt;

425 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,

426 SkIntToScalar(y) + SK_ScalarHalf, &pt);

427 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);

428 const unsigned maxY = s.fBitmap->height() - 1;

429 // compute our two Y values up front

430 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);

431 // now initialize fx

432 fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1) ;

433 }

434

435 #ifdef CHECK_FOR_DECAL

436 // test if we don't need to apply the tile proc

437 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {

438 decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),

439 SkFractionalIntToFixed(dx), count);

440 } else

441 #endif

442 {

443 do {

444 SkFixed fixedFx = SkFractionalIntToFixed(fx);

445 *xy++ = PACK_FILTER_X_NAME(fixedFx, maxX, one PREAMBLE_ARG_X);

446 fx += dx;

447 } while (--count != 0);

448 }

449 }

450

451 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,

452 uint32_t xy[], int count, int x, int y) {

453 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);

454 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask \|

455 SkMatrix::kScale_Mask \|

456 SkMatrix::kAffine_Mask)) == 0);

457

458 PREAMBLE(s);

459 SkPoint srcPt;

460 s.fInvProc(s.fInvMatrix,

461 SkIntToScalar(x) + SK_ScalarHalf,

462 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);

463

464 SkFixed oneX = s.fFilterOneX;

465 SkFixed oneY = s.fFilterOneY;

466 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);

467 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);

468 SkFixed dx = s.fInvSx;

469 SkFixed dy = s.fInvKy;

470 unsigned maxX = s.fBitmap->width() - 1;

471 unsigned maxY = s.fBitmap->height() - 1;

472

473 do {

474 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);

475 fy += dy;

476 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);

477 fx += dx;

478 } while (--count != 0);

479 }

480

481 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,

482 uint32_t* SK_RESTRICT xy, int count,

483 int x, int y) {

484 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);

485

486 extern void rbe(void);

487

488 PREAMBLE(s);

489 unsigned maxX = s.fBitmap->width() - 1;

490 unsigned maxY = s.fBitmap->height() - 1;

491 SkFixed oneX = s.fFilterOneX;

492 SkFixed oneY = s.fFilterOneY;

493

494

495

496 SkPerspIter iter(s.fInvMatrix,

497 SkIntToScalar(x) + SK_ScalarHalf,

498 SkIntToScalar(y) + SK_ScalarHalf, count);

499

500 while ((count = iter.next()) != 0) {

501 const SkFixed* SK_RESTRICT srcXY = iter.getXY();

502 do {

503 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,

504 oneY PREAMBLE_ARG_Y);

505 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,

506 oneX PREAMBLE_ARG_X);

507 srcXY += 2;

508 } while (--count != 0);

509 }

510 }

511

512 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {

513 SCALE_NOFILTER_NAME,

514 SCALE_FILTER_NAME,

515 AFFINE_NOFILTER_NAME,

516 AFFINE_FILTER_NAME,

517 PERSP_NOFILTER_NAME,

518 PERSP_FILTER_NAME

519 };

520

521 #undef MAKENAME

522 #undef TILEX_PROCF

523 #undef TILEY_PROCF

524 #ifdef CHECK_FOR_DECAL

525 #undef CHECK_FOR_DECAL

526 #endif

527

528 #undef SCALE_NOFILTER_NAME

529 #undef SCALE_FILTER_NAME

530 #undef AFFINE_NOFILTER_NAME

531 #undef AFFINE_FILTER_NAME

532 #undef PERSP_NOFILTER_NAME

533 #undef PERSP_FILTER_NAME

534

535 #undef PREAMBLE

536 #undef PREAMBLE_PARAM_X

537 #undef PREAMBLE_PARAM_Y

538 #undef PREAMBLE_ARG_X

539 #undef PREAMBLE_ARG_Y

540

541 #undef TILEX_LOW_BITS

542 #undef TILEY_LOW_BITS

OLD	NEW

« no previous file with comments | « src/opts/SkBitmapProcState_matrix_neon.h ('k') | no next file » | no next file with comments »