Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 #include "SkColorPriv.h" | 1 #include "SkColorPriv.h" |
| 2 #include "SkColor_opts_SSE2.h" | 2 #include "SkColor_opts_SSE2.h" |
| 3 #include "SkMathPriv.h" | 3 #include "SkMathPriv.h" |
| 4 #include "SkXfermode.h" | 4 #include "SkXfermode.h" |
| 5 #include "SkXfermode_opts_SSE2.h" | 5 #include "SkXfermode_opts_SSE2.h" |
| 6 #include "SkXfermode_proccoeff.h" | 6 #include "SkXfermode_proccoeff.h" |
| 7 | 7 |
| 8 //////////////////////////////////////////////////////////////////////////////// | 8 //////////////////////////////////////////////////////////////////////////////// |
| 9 // 4 pixels SSE2 version functions | 9 // 4 pixels SSE2 version functions |
| 10 //////////////////////////////////////////////////////////////////////////////// | 10 //////////////////////////////////////////////////////////////////////////////// |
| (...skipping 66 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 77 __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da); | 77 __m128i g = blendfunc_multiply_byte_SSE2(sg, dg, sa, da); |
| 78 | 78 |
| 79 | 79 |
| 80 __m128i sb = SkGetPackedB32_SSE2(src); | 80 __m128i sb = SkGetPackedB32_SSE2(src); |
| 81 __m128i db = SkGetPackedB32_SSE2(dst); | 81 __m128i db = SkGetPackedB32_SSE2(dst); |
| 82 __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da); | 82 __m128i b = blendfunc_multiply_byte_SSE2(sb, db, sa, da); |
| 83 | 83 |
| 84 return SkPackARGB32_SSE2(a, r, g, b); | 84 return SkPackARGB32_SSE2(a, r, g, b); |
| 85 } | 85 } |
| 86 | 86 |
| 87 //////////////////////////////////////////////////////////////////////////////// | 87 static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) { |
| 88 __m128i r1 = _mm_mul_epu32(a, b); | |
| 89 __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); | |
| 90 __m128i r = _mm_unpacklo_epi32(_mm_shuffle_epi32(r1, _MM_SHUFFLE(0,0,2,0)), | |
| 91 _mm_shuffle_epi32(r2, _MM_SHUFFLE(0,0,2,0))); | |
| 92 return r; | |
| 93 } | |
| 94 | |
| 95 // Portable version of SkSqrtBits is in SkMath.cpp. | |
| 96 static inline __m128i SkSqrtBits_SSE2(const __m128i& x, int count) {<br>mtklein, 2014/04/24 17:24:11: Think it's worth spinning off an SkMath_SSE2?<br>qiankun, 2014/04/25 08:13:06: Done. Create a new file SkMath_opts_SSE2.h to hold | |
| 97 __m128i root = _mm_setzero_si128(); | |
| 98 __m128i remHi = _mm_setzero_si128(); | |
| 99 __m128i remLo = x; | |
| 100 __m128i one128 = _mm_set1_epi32(1); | |
| 101 | |
| 102 do { | |
| 103 root = _mm_slli_epi32(root, 1); | |
| 104 | |
| 105 remHi = _mm_or_si128(_mm_slli_epi32(remHi, 2), | |
| 106 _mm_srli_epi32(remLo, 30)); | |
| 107 remLo = _mm_slli_epi32(remLo, 2); | |
| 108 | |
| 109 __m128i testDiv = _mm_slli_epi32(root, 1); | |
| 110 testDiv = _mm_add_epi32(testDiv, _mm_set1_epi32(1)); | |
| 111 | |
| 112 __m128i cmp = _mm_cmplt_epi32(remHi, testDiv); | |
| 113 __m128i remHi1 = _mm_and_si128(cmp, remHi); | |
| 114 __m128i root1 = _mm_and_si128(cmp, root); | |
| 115 __m128i remHi2 = _mm_andnot_si128(cmp, _mm_sub_epi32(remHi, testDiv)); | |
| 116 __m128i root2 = _mm_andnot_si128(cmp, _mm_add_epi32(root, one128)); | |
| 117 | |
| 118 remHi = _mm_or_si128(remHi1, remHi2); | |
| 119 root = _mm_or_si128(root1, root2); | |
| 120 } while (--count >= 0); | |
| 121 | |
| 122 return root; | |
| 123 } | |
| 124 | |
| 125 static __m128i sqrt_unit_byte_SSE2(const __m128i& n) { | |
| 126 return SkSqrtBits_SSE2(n, 15+4); | |
| 127 } | |
| 128 | |
| 129 static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc, | |
| 130 const __m128i& sa, const __m128i& da) { | |
| 131 __m128i tmp1, tmp2, tmp3; | |
| 132 | |
| 133 // int m = da ? dc * 256 / da : 0; | |
| 134 __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128()); | |
| 135 __m128i m = _mm_slli_epi32(dc, 8); | |
| 136 __m128 x = _mm_cvtepi32_ps(m); | |
| 137 __m128 y = _mm_cvtepi32_ps(da); | |
| 138 m = _mm_cvttps_epi32(_mm_div_ps(x, y)); | |
| 139 m = _mm_andnot_si128(cmp, m); | |
| 140 | |
| 141 // if (2 * sc <= sa) | |
| 142 tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc | |
| 143 __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); | |
| 144 tmp1 = _mm_sub_epi32(tmp1, sa); // 2*sc - sa | |
| 145 tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m | |
| 146 tmp1 = Multiply32_SSE2(tmp1, tmp2); | |
| 147 tmp1 = _mm_srai_epi32(tmp1, 8); | |
| 148 tmp1 = _mm_add_epi32(sa, tmp1); | |
| 149 tmp1 = Multiply32_SSE2(dc, tmp1); | |
| 150 __m128i rc1 = _mm_andnot_si128(cmp1, tmp1); | |
| 151 | |
| 152 // else if (4 * dc <= da) | |
| 153 tmp2 = _mm_slli_epi32(dc, 2); // dc * 4 | |
| 154 __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da); | |
| 155 __m128i i = _mm_slli_epi32(m, 2); // 4 * m | |
| 156 __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256 | |
| 157 __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256) | |
| 158 __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256 | |
| 159 i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256) | |
| 160 i = _mm_srai_epi32(i, 16); // >> 16 | |
| 161 j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m | |
| 162 tmp2 = _mm_add_epi32(i, j); | |
| 163 i = Multiply32_SSE2(dc, sa); // dc * sa | |
| 164 j = _mm_slli_epi32(sc, 1); // 2 * sc | |
| 165 j = _mm_sub_epi32(j, sa); // 2 * sc - sa | |
| 166 j = Multiply32_SSE2(da, j); // da * (2 * sc - sa) | |
| 167 tmp2 = Multiply32_SSE2(j, tmp2); // * tmp | |
| 168 tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8 | |
| 169 tmp2 = _mm_add_epi32(i, tmp2); | |
| 170 cmp = _mm_andnot_si128(cmp2, cmp1); | |
| 171 __m128i rc2 = _mm_and_si128(cmp, tmp2); | |
| 172 __m128i rc = _mm_or_si128(rc1, rc2); | |
| 173 | |
| 174 // else | |
| 175 tmp3 = sqrt_unit_byte_SSE2(m); | |
| 176 tmp3 = _mm_sub_epi32(tmp3, m); | |
| 177 tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa) | |
| 178 tmp3 = _mm_srai_epi32(tmp3, 8); | |
| 179 tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa | |
| 180 cmp = _mm_and_si128(cmp1, cmp2); | |
| 181 __m128i rc3 = _mm_and_si128(cmp, tmp3); | |
| 182 rc = _mm_or_si128(rc, rc3); | |
| 183 | |
| 184 tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da | |
| 185 tmp1 = _mm_mullo_epi16(sc, tmp1); | |
| 186 tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa | |
| 187 tmp2 = _mm_mullo_epi16(dc, tmp2); | |
| 188 rc = _mm_add_epi32(rc, tmp1); | |
| 189 rc = _mm_add_epi32(rc, tmp2); | |
| 190 return clamp_div255round_SSE2(rc); | |
| 191 } | |
| 192 | |
| 193 static __m128i softlight_modeproc_SSE2(const __m128i& src, const __m128i& dst) { | |
| 194 __m128i sa = SkGetPackedA32_SSE2(src); | |
| 195 __m128i da = SkGetPackedA32_SSE2(dst); | |
| 196 | |
| 197 __m128i a = srcover_byte_SSE2(sa, da); | |
| 198 __m128i r = softlight_byte_SSE2(SkGetPackedR32_SSE2(src), | |
| 199 SkGetPackedR32_SSE2(dst), sa, da); | |
| 200 __m128i g = softlight_byte_SSE2(SkGetPackedG32_SSE2(src), | |
| 201 SkGetPackedG32_SSE2(dst), sa, da); | |
| 202 __m128i b = softlight_byte_SSE2(SkGetPackedB32_SSE2(src), | |
| 203 SkGetPackedB32_SSE2(dst), sa, da); | |
| 204 return SkPackARGB32_SSE2(a, r, g, b); | |
| 205 } | |
| 206 /////////////////////////////////////////////////////////////////////////////// | |
| 88 | 207 |
| 89 typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst); | 208 typedef __m128i (*SkXfermodeProcSIMD)(const __m128i& src, const __m128i& dst); |
| 90 | 209 |
| 91 extern SkXfermodeProcSIMD gSSE2XfermodeProcs[]; | 210 extern SkXfermodeProcSIMD gSSE2XfermodeProcs[]; |
| 92 | 211 |
| 93 SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer) | 212 SkSSE2ProcCoeffXfermode::SkSSE2ProcCoeffXfermode(SkReadBuffer& buffer) |
| 94 : INHERITED(buffer) { | 213 : INHERITED(buffer) { |
| 95 fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]); | 214 fProcSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[this->getMode()]); |
| 96 } | 215 } |
| 97 | 216 |
| (...skipping 140 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading... | |
| 238 NULL, // kPlus_Mode | 357 NULL, // kPlus_Mode |
| 239 NULL, // kModulate_Mode | 358 NULL, // kModulate_Mode |
| 240 NULL, // kScreen_Mode | 359 NULL, // kScreen_Mode |
| 241 | 360 |
| 242 NULL, // kOverlay_Mode | 361 NULL, // kOverlay_Mode |
| 243 NULL, // kDarken_Mode | 362 NULL, // kDarken_Mode |
| 244 NULL, // kLighten_Mode | 363 NULL, // kLighten_Mode |
| 245 NULL, // kColorDodge_Mode | 364 NULL, // kColorDodge_Mode |
| 246 NULL, // kColorBurn_Mode | 365 NULL, // kColorBurn_Mode |
| 247 NULL, // kHardLight_Mode | 366 NULL, // kHardLight_Mode |
| 248 NULL, // kSoftLight_Mode | 367 softlight_modeproc_SSE2, |
| 249 NULL, // kDifference_Mode | 368 NULL, // kDifference_Mode |
| 250 NULL, // kExclusion_Mode | 369 NULL, // kExclusion_Mode |
| 251 multiply_modeproc_SSE2, | 370 multiply_modeproc_SSE2, |
| 252 | 371 |
| 253 NULL, // kHue_Mode | 372 NULL, // kHue_Mode |
| 254 NULL, // kSaturation_Mode | 373 NULL, // kSaturation_Mode |
| 255 NULL, // kColor_Mode | 374 NULL, // kColor_Mode |
| 256 NULL, // kLuminosity_Mode | 375 NULL, // kLuminosity_Mode |
| 257 }; | 376 }; |
| 258 | 377 |
| 259 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, | 378 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec, |
| 260 SkXfermode::Mode mode) { | 379 SkXfermode::Mode mode) { |
| 261 void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]); | 380 void* procSIMD = reinterpret_cast<void*>(gSSE2XfermodeProcs[mode]); |
| 262 | 381 |
| 263 if (procSIMD != NULL) { | 382 if (procSIMD != NULL) { |
| 264 return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD)); | 383 return SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, procSIMD)); |
| 265 } | 384 } |
| 266 return NULL; | 385 return NULL; |
| 267 } | 386 } |
| OLD | NEW |