src/opts/SkBlitRow_opts_SSE2.cpp - Issue 754733002: Add SkBlendARGB32_SSE2() to clean up code

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 754733002: Add SkBlendARGB32_SSE2() to clean up code (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: add fast path for SkBlendARGB32_SSE2 with constant alpha factor Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2012 The Android Open Source Project	2 * Copyright 2012 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #include <emmintrin.h>	8 #include <emmintrin.h>

9 #include "SkBitmapProcState_opts_SSE2.h"	9 #include "SkBitmapProcState_opts_SSE2.h"

10 #include "SkBlitRow_opts_SSE2.h"	10 #include "SkBlitRow_opts_SSE2.h"

(...skipping 20 matching lines...) Expand all Loading...
31 SkASSERT(((size_t)dst & 0x03) == 0);	31 SkASSERT(((size_t)dst & 0x03) == 0);

32 while (((size_t)dst & 0x0F) != 0) {	32 while (((size_t)dst & 0x0F) != 0) {

33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);	33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

34 src++;	34 src++;

35 dst++;	35 dst++;

36 count--;	36 count--;

37 }	37 }

38	38

39 const __m128i *s = reinterpret_cast<const __m128i*>(src);	39 const __m128i *s = reinterpret_cast<const __m128i*>(src);

40 __m128i *d = reinterpret_cast<__m128i*>(dst);	40 __m128i *d = reinterpret_cast<__m128i*>(dst);

41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

43	41

44 // Move scale factors to upper byte of word

45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);

47 while (count >= 4) {	42 while (count >= 4) {

48 // Load 4 pixels each of src and dest.	43 // Load 4 pixels each of src and dest.

49 __m128i src_pixel = _mm_loadu_si128(s);	44 __m128i src_pixel = _mm_loadu_si128(s);

50 __m128i dst_pixel = _mm_load_si128(d);	45 __m128i dst_pixel = _mm_load_si128(d);

51	46

52 // Interleave Atom port 0/1 operations based on the execution port	47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);

53 // constraints that multiply can only be executed on port 0 (while	48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);

54 // boolean operations can be executed on either port 0 or port 1)

55 // because GCC currently doesn't do a good job scheduling

56 // instructions based on these constraints.

57

58 // Get red and blue pixels into lower byte of each word.

59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)

60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

61

62 // Multiply by scale.

63 // (4 x (0, rs.h, 0, bs.h))

64 // where rs.h stands for the higher byte of r * scale, and

65 // bs.h the higher byte of b * scale.

66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

67

68 // Get alpha and green pixels into higher byte of each word.

69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)

70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

71

72 // Multiply by scale.

73 // (4 x (as.h, as.l, gs.h, gs.l))

74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

75

76 // Clear the lower byte of the ascale and gscale results

77 // (4 x (as.h, 0, gs.h, 0))

78 src_ag = _mm_and_si128(src_ag, ag_mask);

79

80 // Operations the destination pixels are the same as on the

81 // source pixels. See the comments above.

82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);

84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);

85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);

86 dst_ag = _mm_and_si128(dst_ag, ag_mask);

87

88 // Combine back into RGBA.

89 // (4 x (as.h, rs.h, gs.h, bs.h))

90 src_pixel = _mm_or_si128(src_rb, src_ag);

91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);

92	49

93 // Add result	50 // Add result

94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);	51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

95 _mm_store_si128(d, result);	52 _mm_store_si128(d, result);

96 s++;	53 s++;

97 d++;	54 d++;

98 count -= 4;	55 count -= 4;

99 }	56 }

100 src = reinterpret_cast<const SkPMColor*>(s);	57 src = reinterpret_cast<const SkPMColor*>(s);

101 dst = reinterpret_cast<SkPMColor*>(d);	58 dst = reinterpret_cast<SkPMColor*>(d);

(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
243 }	200 }

244	201

245 if (count >= 4) {	202 if (count >= 4) {

246 while (((size_t)dst & 0x0F) != 0) {	203 while (((size_t)dst & 0x0F) != 0) {

247 *dst = SkBlendARGB32(*src, *dst, alpha);	204 *dst = SkBlendARGB32(*src, *dst, alpha);

248 src++;	205 src++;

249 dst++;	206 dst++;

250 count--;	207 count--;

251 }	208 }

252	209

253 uint32_t src_scale = SkAlpha255To256(alpha);

254

255 const __m128i *s = reinterpret_cast<const __m128i*>(src);	210 const __m128i *s = reinterpret_cast<const __m128i*>(src);

256 __m128i *d = reinterpret_cast<__m128i*>(dst);	211 __m128i *d = reinterpret_cast<__m128i*>(dst);

257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)

260 while (count >= 4) {	212 while (count >= 4) {

261 // Load 4 pixels each of src and dest.	213 // Load 4 pixels each of src and dest.

262 __m128i src_pixel = _mm_loadu_si128(s);	214 __m128i src_pixel = _mm_loadu_si128(s);

263 __m128i dst_pixel = _mm_load_si128(d);	215 __m128i dst_pixel = _mm_load_si128(d);

264	216

265 // Get red and blue pixels into lower byte of each word.	217 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);

266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

268

269 // Get alpha and green into lower byte of each word.

270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

272

273 // Put per-pixel alpha in low byte of each word.

274 // After the following two statements, the dst_alpha looks like

275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)

276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

278

279 // dst_alpha = dst_alpha * src_scale

280 // Because src_scales are in the higher byte of each word and

281 // we use mulhi here, the resulting alpha values are already

282 // in the right place and don't need to be divided by 256.

283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)

284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

285

286 // Subtract alphas from 256, to get 1..256

287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

288

289 // Multiply red and blue by dst pixel alpha.

290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

291 // Multiply alpha and green by dst pixel alpha.

292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

293

294 // Multiply red and blue by global alpha.

295 // (4 x (0, rs.h, 0, bs.h))

296 // where rs.h stands for the higher byte of r * src_scale,

297 // and bs.h the higher byte of b * src_scale.

298 // Again, because we use mulhi, the resuling red and blue

299 // values are already in the right place and don't need to

300 // be divided by 256.

301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

302 // Multiply alpha and green by global alpha.

303 // (4 x (0, as.h, 0, gs.h))

304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

305

306 // Divide by 256.

307 dst_rb = _mm_srli_epi16(dst_rb, 8);

308

309 // Mask out low bits (goodies already in the right place; no need to divide)

310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

311 // Shift alpha and green to higher byte of each word.

312 // (4 x (as.h, 0, gs.h, 0))

313 src_ag = _mm_slli_epi16(src_ag, 8);

314

315 // Combine back into RGBA.

316 dst_pixel = _mm_or_si128(dst_rb, dst_ag);

317 src_pixel = _mm_or_si128(src_rb, src_ag);

318

319 // Add two pixels into result.

320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);

321 _mm_store_si128(d, result);	218 _mm_store_si128(d, result);

322 s++;	219 s++;

323 d++;	220 d++;

324 count -= 4;	221 count -= 4;

325 }	222 }

326 src = reinterpret_cast<const SkPMColor*>(s);	223 src = reinterpret_cast<const SkPMColor*>(s);

327 dst = reinterpret_cast<SkPMColor*>(d);	224 dst = reinterpret_cast<SkPMColor*>(d);

328 }	225 }

329	226

330 while (count > 0) {	227 while (count > 0) {

(...skipping 30 matching lines...) Expand all Loading...
361 SkASSERT(((size_t)dst & 0x03) == 0);	258 SkASSERT(((size_t)dst & 0x03) == 0);

362 while (((size_t)dst & 0x0F) != 0) {	259 while (((size_t)dst & 0x0F) != 0) {

363 *dst = color + SkAlphaMulQ(*src, scale);	260 *dst = color + SkAlphaMulQ(*src, scale);

364 src++;	261 src++;

365 dst++;	262 dst++;

366 count--;	263 count--;

367 }	264 }

368	265

369 const __m128i *s = reinterpret_cast<const __m128i*>(src);	266 const __m128i *s = reinterpret_cast<const __m128i*>(src);

370 __m128i *d = reinterpret_cast<__m128i*>(dst);	267 __m128i *d = reinterpret_cast<__m128i*>(dst);

371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

372 __m128i src_scale_wide = _mm_set1_epi16(scale);

373 __m128i color_wide = _mm_set1_epi32(color);	268 __m128i color_wide = _mm_set1_epi32(color);

374 while (count >= 4) {	269 while (count >= 4) {

375 // Load 4 pixels each of src and dest.

376 __m128i src_pixel = _mm_loadu_si128(s);	270 __m128i src_pixel = _mm_loadu_si128(s);

	271 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale);

377	272

378 // Get red and blue pixels into lower byte of each word.

379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

380

381 // Get alpha and green into lower byte of each word.

382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

383

384 // Multiply by scale.

385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

387

388 // Divide by 256.

389 src_rb = _mm_srli_epi16(src_rb, 8);

390 src_ag = _mm_andnot_si128(rb_mask, src_ag);

391

392 // Combine back into RGBA.

393 src_pixel = _mm_or_si128(src_rb, src_ag);

394

395 // Add color to result.

396 __m128i result = _mm_add_epi8(color_wide, src_pixel);	273 __m128i result = _mm_add_epi8(color_wide, src_pixel);

397

398 // Store result.

399 _mm_store_si128(d, result);	274 _mm_store_si128(d, result);

400 s++;	275 s++;

401 d++;	276 d++;

402 count -= 4;	277 count -= 4;

403 }	278 }

404 src = reinterpret_cast<const SkPMColor*>(s);	279 src = reinterpret_cast<const SkPMColor*>(s);

405 dst = reinterpret_cast<SkPMColor*>(d);	280 dst = reinterpret_cast<SkPMColor*>(d);

406 }	281 }

407	282

408 while (count > 0) {	283 while (count > 0) {

(...skipping 16 matching lines...) Expand all Loading...
425 do {	300 do {

426 int count = width;	301 int count = width;

427 if (count >= 4) {	302 if (count >= 4) {

428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {	303 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {

429 *dst = SkBlendARGB32(color, *dst, *mask);	304 *dst = SkBlendARGB32(color, *dst, *mask);

430 mask++;	305 mask++;

431 dst++;	306 dst++;

432 count--;	307 count--;

433 }	308 }

434 __m128i *d = reinterpret_cast<__m128i*>(dst);	309 __m128i *d = reinterpret_cast<__m128i*>(dst);

435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

436 __m128i c_256 = _mm_set1_epi16(256);

437 __m128i c_1 = _mm_set1_epi16(1);

438 __m128i src_pixel = _mm_set1_epi32(color);	310 __m128i src_pixel = _mm_set1_epi32(color);

439 while (count >= 4) {	311 while (count >= 4) {

440 // Load 4 pixels each of src and dest.	312 // Load 4 dst pixels

441 __m128i dst_pixel = _mm_load_si128(d);	313 __m128i dst_pixel = _mm_load_si128(d);

442	314

443 //set the aphla value	315 // Set the alpha value

444 __m128i src_scale_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));	316 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask));

445 src_scale_wide = _mm_unpacklo_epi8(src_scale_wide,	317 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128());

446 _mm_setzero_si128());	318 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128());

447 src_scale_wide = _mm_unpacklo_epi16(src_scale_wide, src_scale_wide);

448	319

449 //call SkAlpha255To256()	320 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide);

450 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

451

452 // Get red and blue pixels into lower byte of each word.

453 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

454 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

455

456 // Get alpha and green into lower byte of each word.

457 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

458 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

459

460 // Put per-pixel alpha in low byte of each word.

461 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

462 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

463

464 // dst_alpha = dst_alpha * src_scale

465 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

466

467 // Divide by 256.

468 dst_alpha = _mm_srli_epi16(dst_alpha, 8);

469

470 // Subtract alphas from 256, to get 1..256

471 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

472 // Multiply red and blue by dst pixel alpha.

473 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

474 // Multiply alpha and green by dst pixel alpha.

475 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

476

477 // Multiply red and blue by global alpha.

478 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

479 // Multiply alpha and green by global alpha.

480 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

481 // Divide by 256.

482 dst_rb = _mm_srli_epi16(dst_rb, 8);

483 src_rb = _mm_srli_epi16(src_rb, 8);

484

485 // Mask out low bits (goodies already in the right place; no need to divide)

486 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

487 src_ag = _mm_andnot_si128(rb_mask, src_ag);

488

489 // Combine back into RGBA.

490 dst_pixel = _mm_or_si128(dst_rb, dst_ag);

491 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

492

493 // Add two pixels into result.

494 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);

495 _mm_store_si128(d, result);	321 _mm_store_si128(d, result);

496 // load the next 4 pixel	322 // Load the next 4 dst pixels and alphas

497 mask = mask + 4;	323 mask = mask + 4;

498 d++;	324 d++;

499 count -= 4;	325 count -= 4;

500 }	326 }

501 dst = reinterpret_cast<SkPMColor *>(d);	327 dst = reinterpret_cast<SkPMColor*>(d);

502 }	328 }

503 while (count > 0) {	329 while (count > 0) {

504 *dst= SkBlendARGB32(color, *dst, *mask);	330 *dst= SkBlendARGB32(color, *dst, *mask);

505 dst += 1;	331 dst += 1;

506 mask++;	332 mask++;

507 count --;	333 count --;

508 }	334 }

509 dst = (SkPMColor *)((char*)dst + dstOffset);	335 dst = (SkPMColor *)((char*)dst + dstOffset);

510 mask += maskOffset;	336 mask += maskOffset;

511 } while (--height != 0);	337 } while (--height != 0);

(...skipping 839 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst);	1177 uint32_t dst_expanded = SkExpand_rgb_16(*dst);

1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);	1178 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

1353 // now src and dst expanded are in g:11 r:10 x:1 b:10	1179 // now src and dst expanded are in g:11 r:10 x:1 b:10

1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);	1180 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

1355 }	1181 }

1356 dst += 1;	1182 dst += 1;

1357 DITHER_INC_X(x);	1183 DITHER_INC_X(x);

1358 } while (--count != 0);	1184 } while (--count != 0);

1359 }	1185 }

1360 }	1186 }

OLD	NEW

« no previous file with comments | « no previous file | src/opts/SkColor_opts_SSE2.h » ('j') | no next file with comments »