src/opts/SkBlitRow_opts_arm_neon.cpp - Issue 156113005: ARM Skia NEON patches - 27 - S32A_D565_Blend

Side by Side Diff: src/opts/SkBlitRow_opts_arm_neon.cpp

Issue 156113005: ARM Skia NEON patches - 27 - S32A_D565_Blend (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Add ignored-tests.txt Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include "SkBlitRow_opts_arm_neon.h"	8 #include "SkBlitRow_opts_arm_neon.h"

9	9

10 #include "SkBlitMask.h"	10 #include "SkBlitMask.h"

(...skipping 212 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
223 "21: \n\t"	223 "21: \n\t"

224 : [count] "+r" (count)	224 : [count] "+r" (count)

225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc)	225 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (s rc)

226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7",	226 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6" ,"d7",

227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29",	227 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25 ","d26","d27","d28","d29",

228 "d30","d31"	228 "d30","d31"

229 );	229 );

230 }	230 }

231 }	231 }

232	232

	233 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {

	234 prod += vdupq_n_u16(128);

	235 prod += vshrq_n_u16(prod, 8);

	236 return vshrq_n_u16(prod, 8);

	237 }

	238

233 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,	239 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,

234 const SkPMColor* SK_RESTRICT src, int count,	240 const SkPMColor* SK_RESTRICT src, int count,

235 U8CPU alpha, int /x/, int /y/) {	241 U8CPU alpha, int /x/, int /y/) {

	242 SkASSERT(255 > alpha);

236	243

237 U8CPU alpha_for_asm = alpha;	244 /* This code implements a Neon version of S32A_D565_Blend. The results have

238	245 * a few mismatches compared to the original code. These mismatches never

239 asm volatile (	246 * exceed 1.

240 /* This code implements a Neon version of S32A_D565_Blend. The output differ s from

241 * the original in two respects:

242 * 1. The results have a few mismatches compared to the original code. Thes e mismatches

243 * never exceed 1. It's possible to improve accuracy vs. a floating poin t

244 * implementation by introducing rounding right shifts (vrshr) for the f inal stage.

245 * Rounding is not present in the code below, because although results w ould be closer

246 * to a floating point implementation, the number of mismatches compared to the

247 * original code would be far greater.

248 * 2. On certain inputs, the original code can overflow, causing colour cha nnels to

249 * mix. Although the Neon code can also overflow, it doesn't allow one c olour channel

250 * to affect another.

251 */	247 */

252	248

253 #if 1	249 if (count >= 8) {

254 /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */	250 uint16x8_t valpha_max, vmask_blue;

255 "add %[alpha], %[alpha], #1 \n\t" // adjust r ange of alpha 0-256	251 uint8x8_t valpha;

	252

	253 // prepare constants

	254 valpha_max = vmovq_n_u16(255);

	255 valpha = vdup_n_u8(alpha);

	256 vmask_blue = vmovq_n_u16(SK_B16_MASK);

	257

	258 do {

	259 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;

	260 uint16x8_t vres_a, vres_r, vres_g, vres_b;

	261 uint8x8x4_t vsrc;

	262

	263 // load pixels

	264 vdst = vld1q_u16(dst);

	265 #if (__GNUC__ > 4) \|\| ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))

	266 asm (

	267 "vld4.u8 %h[vsrc], [%[src]]!"

	268 : [vsrc] "=w" (vsrc), [src] "+&r" (src)

	269 : :

	270 );

256 #else	271 #else

257 "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256	272 register uint8x8_t d0 asm("d0");

258 #endif	273 register uint8x8_t d1 asm("d1");

259 "vmov.u16 q3, #255 \n\t" // set up constant	274 register uint8x8_t d2 asm("d2");

260 "movs r4, %[count], lsr #3 \n\t" // calc. c ount>>3	275 register uint8x8_t d3 asm("d3");

261 "vmov.u16 d2[0], %[alpha] \n\t" // move al pha to Neon

262 "beq 2f \n\t" // if coun t8 == 0, exit

263 "vmov.u16 q15, #0x1f \n\t" // set up blue mask

264	276

265 "1: \n\t"	277 asm volatile (

266 "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load ei ght dst RGB565 pixels	278 "vld4.u8 {d0-d3},[%[src]]!;"

267 "subs r4, r4, #1 \n\t" // decreme nt loop counter	279 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),

268 "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load ei ght src ABGR32 pixels	280 [src] "+&r" (src)

269 // and deinterleave	281 : :

270	282 );

271 "vshl.u16 q9, q0, #5 \n\t" // shift g reen to top of lanes	283 vsrc.val[0] = d0;

272 "vand q10, q0, q15 \n\t" // extract blue	284 vsrc.val[1] = d1;

273 "vshr.u16 q8, q0, #11 \n\t" // extract red	285 vsrc.val[2] = d2;

274 "vshr.u16 q9, q9, #10 \n\t" // extract green	286 vsrc.val[3] = d3;

275 // dstrgb = {q8, q9, q10}

276

277 "vshr.u8 d24, d24, #3 \n\t" // shift r ed to 565 range

278 "vshr.u8 d25, d25, #2 \n\t" // shift g reen to 565 range

279 "vshr.u8 d26, d26, #3 \n\t" // shift b lue to 565 range

280

281 "vmovl.u8 q11, d24 \n\t" // widen r ed to 16 bits

282 "vmovl.u8 q12, d25 \n\t" // widen g reen to 16 bits

283 "vmovl.u8 q14, d27 \n\t" // widen a lpha to 16 bits

284 "vmovl.u8 q13, d26 \n\t" // widen b lue to 16 bits

285 // srcrgba = {q11, q12, q13, q14}

286

287 "vmul.u16 q2, q14, d2[0] \n\t" // sa * sr c_scale

288 "vmul.u16 q11, q11, d2[0] \n\t" // red res ult = src_red * src_scale

289 "vmul.u16 q12, q12, d2[0] \n\t" // grn res ult = src_grn * src_scale

290 "vmul.u16 q13, q13, d2[0] \n\t" // blu res ult = src_blu * src_scale

291

292 "vshr.u16 q2, q2, #8 \n\t" // sa * sr c_scale >> 8

293 "vsub.u16 q2, q3, q2 \n\t" // 255 - ( sa * src_scale >> 8)

294 // dst_scale = q2

295

296 "vmla.u16 q11, q8, q2 \n\t" // red res ult += dst_red * dst_scale

297 "vmla.u16 q12, q9, q2 \n\t" // grn res ult += dst_grn * dst_scale

298 "vmla.u16 q13, q10, q2 \n\t" // blu res ult += dst_blu * dst_scale

299

300 #if 1

301 // trying for a better match with SkDiv255Round(a)

302 // C alg is: a+=128; (a+a>>8)>>8

303 // we'll use just a rounding shift [q2 is available for scratch]

304 "vrshr.u16 q11, q11, #8 \n\t" // shift down red

305 "vrshr.u16 q12, q12, #8 \n\t" // shift down green

306 "vrshr.u16 q13, q13, #8 \n\t" // shift down blue

307 #else

308 // arm's original "truncating divide by 256"

309 "vshr.u16 q11, q11, #8 \n\t" // shift d own red

310 "vshr.u16 q12, q12, #8 \n\t" // shift d own green

311 "vshr.u16 q13, q13, #8 \n\t" // shift d own blue

312 #endif	287 #endif

313	288

314 "vsli.u16 q13, q12, #5 \n\t" // insert green into blue

315 "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue

316 "vst1.16 {d26, d27}, [%[dst]]! \n\t" // write p ixel back to dst, update ptr

317	289

318 "bne 1b \n\t" // if coun ter != 0, loop	290 // deinterleave dst

319 "2: \n\t" // exi t	291 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to to p of lanes

	292 vdst_b = vdst & vmask_blue; // extract blue

	293 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red

	294 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

320	295

321 : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [a lpha] "+r" (alpha_for_asm)	296 // shift src to 565

322 :	297 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);

323 : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d 6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"	298 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);

324 );	299 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

325	300

326 count &= 7;	301 // calc src * src_scale

327 if (count > 0) {	302 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);

328 do {	303 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);

329 SkPMColor sc = *src++;	304 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);

330 if (sc) {	305 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

331 uint16_t dc = *dst;	306

332 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);	307 // prepare dst_scale

333 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(Sk GetPackedR16(dc), dst_scale);	308 vres_a = SkDiv255Round_neon8(vres_a);

334 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(Sk GetPackedG16(dc), dst_scale);	309 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

335 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(Sk GetPackedB16(dc), dst_scale);	310

336 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv25 5Round(db));	311 // add dst * dst_scale to previous result

337 }	312 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);

338 dst += 1;	313 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);

339 } while (--count != 0);	314 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

	315

	316 #ifdef S32A_D565_BLEND_EXACT

	317 // It is possible to get exact results with this but it is slow,

	318 // even slower than C code in some cases

	319 vres_r = SkDiv255Round_neon8(vres_r);

	320 vres_g = SkDiv255Round_neon8(vres_g);

	321 vres_b = SkDiv255Round_neon8(vres_b);

	322 #else

	323 vres_r = vrshrq_n_u16(vres_r, 8);

	324 vres_g = vrshrq_n_u16(vres_g, 8);

	325 vres_b = vrshrq_n_u16(vres_b, 8);

	326 #endif

	327 // pack result

	328 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue

	329 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red in to green/blue

	330

	331 // store

	332 vst1q_u16(dst, vres_b);

	333 dst += 8;

	334 count -= 8;

	335 } while (count >= 8);

	336 }

	337

	338 // leftovers

	339 while (count-- > 0) {

	340 SkPMColor sc = *src++;

	341 if (sc) {

	342 uint16_t dc = *dst;

	343 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alph a);

	344 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetP ackedR16(dc), dst_scale);

	345 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetP ackedG16(dc), dst_scale);

	346 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetP ackedB16(dc), dst_scale);

	347 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Rou nd(db));

	348 }

	349 dst += 1;

340 }	350 }

341 }	351 }

342	352

343 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.	353 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.

344 * each dither value is spaced out into byte lanes, and repeated	354 * each dither value is spaced out into byte lanes, and repeated

345 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the	355 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the

346 * start of each row.	356 * start of each row.

347 */	357 */

348 static const uint8_t gDitherMatrix_Neon[48] = {	358 static const uint8_t gDitherMatrix_Neon[48] = {

349 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,	359 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,

(...skipping 1084 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1434 * case where we do not inspect the src alpha.	1444 * case where we do not inspect the src alpha.

1435 */	1445 */

1436 #if SK_A32_SHIFT == 24	1446 #if SK_A32_SHIFT == 24

1437 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor	1447 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor

1438 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,	1448 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,

1439 #else	1449 #else

1440 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,	1450 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,

1441 #endif	1451 #endif

1442 S32A_Blend_BlitRow32_neon // S32A_Blend	1452 S32A_Blend_BlitRow32_neon // S32A_Blend

1443 };	1453 };

OLD	NEW

« no previous file with comments | « expectations/gm/ignored-tests.txt ('k') | no next file » | no next file with comments »