Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(31)

Side by Side Diff: src/opts/SkBlitRow_opts_SSE2.cpp

Issue 755573002: Cleanup with SkAlphaMulQ_SSE2() (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: fix performance regression Created 6 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/opts/SkColor_opts_SSE2.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2012 The Android Open Source Project 2 * Copyright 2012 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include <emmintrin.h> 8 #include <emmintrin.h>
9 #include "SkBitmapProcState_opts_SSE2.h" 9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBlitRow_opts_SSE2.h" 10 #include "SkBlitRow_opts_SSE2.h"
(...skipping 20 matching lines...) Expand all
31 SkASSERT(((size_t)dst & 0x03) == 0); 31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) { 32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++; 34 src++;
35 dst++; 35 dst++;
36 count--; 36 count--;
37 } 37 }
38 38
39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 39 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst); 40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43 41
44 // Move scale factors to upper byte of word
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
47 while (count >= 4) { 42 while (count >= 4) {
48 // Load 4 pixels each of src and dest. 43 // Load 4 pixels each of src and dest.
49 __m128i src_pixel = _mm_loadu_si128(s); 44 __m128i src_pixel = _mm_loadu_si128(s);
50 __m128i dst_pixel = _mm_load_si128(d); 45 __m128i dst_pixel = _mm_load_si128(d);
51 46
52 // Interleave Atom port 0/1 operations based on the execution port 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
53 // constraints that multiply can only be executed on port 0 (while 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);
54 // boolean operations can be executed on either port 0 or port 1)
55 // because GCC currently doesn't do a good job scheduling
56 // instructions based on these constraints.
57
58 // Get red and blue pixels into lower byte of each word.
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
61
62 // Multiply by scale.
63 // (4 x (0, rs.h, 0, bs.h))
64 // where rs.h stands for the higher byte of r * scale, and
65 // bs.h the higher byte of b * scale.
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68 // Get alpha and green pixels into higher byte of each word.
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
71
72 // Multiply by scale.
73 // (4 x (as.h, as.l, gs.h, gs.l))
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
75
76 // Clear the lower byte of the a*scale and g*scale results
77 // (4 x (as.h, 0, gs.h, 0))
78 src_ag = _mm_and_si128(src_ag, ag_mask);
79
80 // Operations on the destination pixels are the same as on the
81 // source pixels. See the comments above.
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86 dst_ag = _mm_and_si128(dst_ag, ag_mask);
87
88 // Combine back into RGBA.
89 // (4 x (as.h, rs.h, gs.h, bs.h))
90 src_pixel = _mm_or_si128(src_rb, src_ag);
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92 49
93 // Add result 50 // Add result
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95 _mm_store_si128(d, result); 52 _mm_store_si128(d, result);
96 s++; 53 s++;
97 d++; 54 d++;
98 count -= 4; 55 count -= 4;
99 } 56 }
100 src = reinterpret_cast<const SkPMColor*>(s); 57 src = reinterpret_cast<const SkPMColor*>(s);
101 dst = reinterpret_cast<SkPMColor*>(d); 58 dst = reinterpret_cast<SkPMColor*>(d);
(...skipping 259 matching lines...) Expand 10 before | Expand all | Expand 10 after
361 SkASSERT(((size_t)dst & 0x03) == 0); 318 SkASSERT(((size_t)dst & 0x03) == 0);
362 while (((size_t)dst & 0x0F) != 0) { 319 while (((size_t)dst & 0x0F) != 0) {
363 *dst = color + SkAlphaMulQ(*src, scale); 320 *dst = color + SkAlphaMulQ(*src, scale);
364 src++; 321 src++;
365 dst++; 322 dst++;
366 count--; 323 count--;
367 } 324 }
368 325
369 const __m128i *s = reinterpret_cast<const __m128i*>(src); 326 const __m128i *s = reinterpret_cast<const __m128i*>(src);
370 __m128i *d = reinterpret_cast<__m128i*>(dst); 327 __m128i *d = reinterpret_cast<__m128i*>(dst);
371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372 __m128i src_scale_wide = _mm_set1_epi16(scale);
373 __m128i color_wide = _mm_set1_epi32(color); 328 __m128i color_wide = _mm_set1_epi32(color);
374 while (count >= 4) { 329 while (count >= 4) {
375 // Load 4 pixels each of src and dest.
376 __m128i src_pixel = _mm_loadu_si128(s); 330 __m128i src_pixel = _mm_loadu_si128(s);
331 src_pixel = SkAlphaMulQ_SSE2(src_pixel, scale);
377 332
378 // Get red and blue pixels into lower byte of each word.
379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
380
381 // Get alpha and green into lower byte of each word.
382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383
384 // Multiply by scale.
385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387
388 // Divide by 256.
389 src_rb = _mm_srli_epi16(src_rb, 8);
390 src_ag = _mm_andnot_si128(rb_mask, src_ag);
391
392 // Combine back into RGBA.
393 src_pixel = _mm_or_si128(src_rb, src_ag);
394
395 // Add color to result.
396 __m128i result = _mm_add_epi8(color_wide, src_pixel); 333 __m128i result = _mm_add_epi8(color_wide, src_pixel);
397
398 // Store result.
399 _mm_store_si128(d, result); 334 _mm_store_si128(d, result);
400 s++; 335 s++;
401 d++; 336 d++;
402 count -= 4; 337 count -= 4;
403 } 338 }
404 src = reinterpret_cast<const SkPMColor*>(s); 339 src = reinterpret_cast<const SkPMColor*>(s);
405 dst = reinterpret_cast<SkPMColor*>(d); 340 dst = reinterpret_cast<SkPMColor*>(d);
406 } 341 }
407 342
408 while (count > 0) { 343 while (count > 0) {
(...skipping 942 matching lines...) Expand 10 before | Expand all | Expand 10 after
1351 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1286 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1352 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1287 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1353 // now src and dst expanded are in g:11 r:10 x:1 b:10 1288 // now src and dst expanded are in g:11 r:10 x:1 b:10
1354 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1289 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1355 } 1290 }
1356 dst += 1; 1291 dst += 1;
1357 DITHER_INC_X(x); 1292 DITHER_INC_X(x);
1358 } while (--count != 0); 1293 } while (--count != 0);
1359 } 1294 }
1360 } 1295 }
OLDNEW
« no previous file with comments | « no previous file | src/opts/SkColor_opts_SSE2.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698