Chromium Code Reviews| Index: third_party/qcms/src/transform-sse2.c |
| diff --git a/third_party/qcms/src/transform-sse2.c b/third_party/qcms/src/transform-sse2.c |
| index 34d0d8676c36f2d18e2f8b70b8e55cc2e30b8861..bdd3bf7c078f4337ef061924fffd7ce3d2dfc590 100644 |
| --- a/third_party/qcms/src/transform-sse2.c |
| +++ b/third_party/qcms/src/transform-sse2.c |
| @@ -25,8 +25,9 @@ |
| #include "qcmsint.h" |
| /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ |
| -#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) |
| -#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) |
| +#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE - 1) |
| +#define CLAMPMAXVAL 1.0f |
| + |
| static const ALIGN float floatScaleX4[4] = |
| { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; |
| static const ALIGN float clampMaxValueX4[4] = |
| @@ -103,13 +104,13 @@ void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, |
| vec_b = _mm_mul_ps(vec_b, mat2); |
| /* crunch, crunch, crunch */ |
| - vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
| + vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| vec_r = _mm_max_ps(min, vec_r); |
| vec_r = _mm_min_ps(max, vec_r); |
| result = _mm_mul_ps(vec_r, scale); |
| /* store calc'd output tables indices */ |
| - _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
| + _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
| /* load for next loop while store completes */ |
| vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
| @@ -134,12 +135,12 @@ void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, |
| vec_g = _mm_mul_ps(vec_g, mat1); |
| vec_b = _mm_mul_ps(vec_b, mat2); |
| - vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
| + vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| vec_r = _mm_max_ps(min, vec_r); |
| vec_r = _mm_min_ps(max, vec_r); |
| result = _mm_mul_ps(vec_r, scale); |
| - _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
| + _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
| dest[r_out] = otdata_r[output[0]]; |
| dest[1] = otdata_g[output[1]]; |
| @@ -223,13 +224,13 @@ void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, |
| alpha = src[3]; |
| /* crunch, crunch, crunch */ |
| - vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
| + vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| vec_r = _mm_max_ps(min, vec_r); |
| vec_r = _mm_min_ps(max, vec_r); |
| result = _mm_mul_ps(vec_r, scale); |
| /* store calc'd output tables indices */ |
| - _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
| + _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
|
Noel Gordon
2015/11/04 13:01:28
Now we match the s/w case. One question: if you ro… [comment truncated in source]
radu.velea
2015/11/04 13:12:19
Using SSE round (_mm_cvtps_epi32) generates the fo… [comment truncated in source]
|
| /* load gamma values for next loop while store completes */ |
| vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
| @@ -256,12 +257,12 @@ void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, |
| dest[3] = alpha; |
| - vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
| + vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
| vec_r = _mm_max_ps(min, vec_r); |
| vec_r = _mm_min_ps(max, vec_r); |
| result = _mm_mul_ps(vec_r, scale); |
| - _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
| + _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
| dest[r_out] = otdata_r[output[0]]; |
| dest[1] = otdata_g[output[1]]; |