Index: third_party/qcms/src/transform-sse2.c |
diff --git a/third_party/qcms/src/transform-sse2.c b/third_party/qcms/src/transform-sse2.c |
index 34d0d8676c36f2d18e2f8b70b8e55cc2e30b8861..bdd3bf7c078f4337ef061924fffd7ce3d2dfc590 100644 |
--- a/third_party/qcms/src/transform-sse2.c |
+++ b/third_party/qcms/src/transform-sse2.c |
@@ -25,8 +25,9 @@ |
#include "qcmsint.h" |
/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */ |
-#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) |
-#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) |
+#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE - 1) |
+#define CLAMPMAXVAL 1.0f |
+ |
static const ALIGN float floatScaleX4[4] = |
{ FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE}; |
static const ALIGN float clampMaxValueX4[4] = |
@@ -103,13 +104,13 @@ void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, |
vec_b = _mm_mul_ps(vec_b, mat2); |
/* crunch, crunch, crunch */ |
- vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
+ vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
vec_r = _mm_max_ps(min, vec_r); |
vec_r = _mm_min_ps(max, vec_r); |
result = _mm_mul_ps(vec_r, scale); |
/* store calc'd output tables indices */ |
- _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
+ _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
/* load for next loop while store completes */ |
vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
@@ -134,12 +135,12 @@ void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform, |
vec_g = _mm_mul_ps(vec_g, mat1); |
vec_b = _mm_mul_ps(vec_b, mat2); |
- vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
+ vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
vec_r = _mm_max_ps(min, vec_r); |
vec_r = _mm_min_ps(max, vec_r); |
result = _mm_mul_ps(vec_r, scale); |
- _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
+ _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
dest[r_out] = otdata_r[output[0]]; |
dest[1] = otdata_g[output[1]]; |
@@ -223,13 +224,13 @@ void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, |
alpha = src[3]; |
/* crunch, crunch, crunch */ |
- vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
+ vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
vec_r = _mm_max_ps(min, vec_r); |
vec_r = _mm_min_ps(max, vec_r); |
result = _mm_mul_ps(vec_r, scale); |
/* store calc'd output tables indices */ |
- _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
+ _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
[Review thread, pasted inline — original comments truncated in extraction]
Noel Gordon, 2015/11/04 13:01:28:
  Now we match the s/w case. One question: if you ro… [truncated — presumably
  asking about the effect of rounding instead of truncating]
radu.velea, 2015/11/04 13:12:19:
  Using SSE round (_mm_cvtps_epi32) generates the fo… [truncated — presumably
  describing output that differs from the software (truncating) path, hence
  the switch to _mm_cvttps_epi32]
|
/* load gamma values for next loop while store completes */ |
vec_r = _mm_load_ss(&igtbl_r[src[0]]); |
@@ -256,12 +257,12 @@ void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform, |
dest[3] = alpha; |
- vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b)); |
+ vec_r = _mm_add_ps(vec_g, _mm_add_ps(vec_r, vec_b)); |
vec_r = _mm_max_ps(min, vec_r); |
vec_r = _mm_min_ps(max, vec_r); |
result = _mm_mul_ps(vec_r, scale); |
- _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result)); |
+ _mm_store_si128((__m128i*)output, _mm_cvttps_epi32(result)); |
dest[r_out] = otdata_r[output[0]]; |
dest[1] = otdata_g[output[1]]; |