Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(643)

Side by Side Diff: third_party/libwebp/dsp/enc_sse41.c

Issue 2149863002: libwebp: update to v0.5.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/libwebp/dsp/enc_sse2.c ('k') | third_party/libwebp/dsp/filters.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2015 Google Inc. All Rights Reserved. 1 // Copyright 2015 Google Inc. All Rights Reserved.
2 // 2 //
3 // Use of this source code is governed by a BSD-style license 3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source 4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found 5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may 6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree. 7 // be found in the AUTHORS file in the root of the source tree.
8 // ----------------------------------------------------------------------------- 8 // -----------------------------------------------------------------------------
9 // 9 //
10 // SSE4 version of some encoding functions. 10 // SSE4 version of some encoding functions.
11 // 11 //
12 // Author: Skal (pascal.massimino@gmail.com) 12 // Author: Skal (pascal.massimino@gmail.com)
13 13
14 #include "./dsp.h" 14 #include "./dsp.h"
15 15
16 #if defined(WEBP_USE_SSE41) 16 #if defined(WEBP_USE_SSE41)
17 #include <smmintrin.h> 17 #include <smmintrin.h>
18 #include <stdlib.h> // for abs() 18 #include <stdlib.h> // for abs()
19 19
20 #include "./common_sse2.h"
20 #include "../enc/vp8enci.h" 21 #include "../enc/vp8enci.h"
21 22
22 //------------------------------------------------------------------------------ 23 //------------------------------------------------------------------------------
23 // Compute susceptibility based on DCT-coeff histograms. 24 // Compute susceptibility based on DCT-coeff histograms.
24 25
25 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, 26 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
26 int start_block, int end_block, 27 int start_block, int end_block,
27 VP8Histogram* const histo) { 28 VP8Histogram* const histo) {
28 const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); 29 const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
29 int j; 30 int j;
(...skipping 30 matching lines...) Expand all
60 VP8SetHistogramData(distribution, histo); 61 VP8SetHistogramData(distribution, histo);
61 } 62 }
62 63
63 //------------------------------------------------------------------------------ 64 //------------------------------------------------------------------------------
64 // Texture distortion 65 // Texture distortion
65 // 66 //
66 // We try to match the spectral content (weighted) between source and 67 // We try to match the spectral content (weighted) between source and
67 // reconstructed samples. 68 // reconstructed samples.
68 69
69 // Hadamard transform 70 // Hadamard transform
70 // Returns the difference between the weighted sum of the absolute value of 71 // Returns the weighted sum of the absolute value of transformed coefficients.
71 // transformed coefficients. 72 // w[] contains a row-major 4 by 4 symmetric matrix.
72 static int TTransform(const uint8_t* inA, const uint8_t* inB, 73 static int TTransform(const uint8_t* inA, const uint8_t* inB,
73 const uint16_t* const w) { 74 const uint16_t* const w) {
75 int32_t sum[4];
74 __m128i tmp_0, tmp_1, tmp_2, tmp_3; 76 __m128i tmp_0, tmp_1, tmp_2, tmp_3;
75 77
76 // Load, combine and transpose inputs. 78 // Load and combine inputs.
77 { 79 {
78 const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); 80 const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
79 const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); 81 const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
80 const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); 82 const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
83 // In SSE4.1, with gcc 4.8 at least (maybe other versions),
84 // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
85 // of inA and inB, _mm_loadl_epi64 is still used not to have an out of
86 // bound read.
81 const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); 87 const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
82 const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); 88 const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
83 const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); 89 const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
84 const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); 90 const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
85 const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); 91 const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
86 92
87 // Combine inA and inB (we'll do two transforms in parallel). 93 // Combine inA and inB (we'll do two transforms in parallel).
88 const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); 94 const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
89 const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); 95 const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
90 const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); 96 const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
91 const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); 97 const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
92 // a00 b00 a01 b01 a02 b02 a03 b03 0 0 0 0 0 0 0 0 98 tmp_0 = _mm_cvtepu8_epi16(inAB_0);
93 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 99 tmp_1 = _mm_cvtepu8_epi16(inAB_1);
94 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 100 tmp_2 = _mm_cvtepu8_epi16(inAB_2);
95 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 101 tmp_3 = _mm_cvtepu8_epi16(inAB_3);
96 102 // a00 a01 a02 a03 b00 b01 b02 b03
97 // Transpose the two 4x4, discarding the filling zeroes. 103 // a10 a11 a12 a13 b10 b11 b12 b13
98 const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); 104 // a20 a21 a22 a23 b20 b21 b22 b23
99 const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); 105 // a30 a31 a32 a33 b30 b31 b32 b33
100 // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
101 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
102 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
103 const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
104 // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
105 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33
106
107 // Convert to 16b.
108 tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
109 tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
110 tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
111 tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
112 // a00 a10 a20 a30 b00 b10 b20 b30
113 // a01 a11 a21 a31 b01 b11 b21 b31
114 // a02 a12 a22 a32 b02 b12 b22 b32
115 // a03 a13 a23 a33 b03 b13 b23 b33
116 } 106 }
117 107
118 // Horizontal pass and subsequent transpose. 108 // Vertical pass first to avoid a transpose (vertical and horizontal passes
109 // are commutative because w/kWeightY is symmetric) and subsequent transpose.
119 { 110 {
120 // Calculate a and b (two 4x4 at once). 111 // Calculate a and b (two 4x4 at once).
121 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); 112 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
122 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); 113 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
123 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); 114 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
124 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); 115 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
125 const __m128i b0 = _mm_add_epi16(a0, a1); 116 const __m128i b0 = _mm_add_epi16(a0, a1);
126 const __m128i b1 = _mm_add_epi16(a3, a2); 117 const __m128i b1 = _mm_add_epi16(a3, a2);
127 const __m128i b2 = _mm_sub_epi16(a3, a2); 118 const __m128i b2 = _mm_sub_epi16(a3, a2);
128 const __m128i b3 = _mm_sub_epi16(a0, a1); 119 const __m128i b3 = _mm_sub_epi16(a0, a1);
129 // a00 a01 a02 a03 b00 b01 b02 b03 120 // a00 a01 a02 a03 b00 b01 b02 b03
130 // a10 a11 a12 a13 b10 b11 b12 b13 121 // a10 a11 a12 a13 b10 b11 b12 b13
131 // a20 a21 a22 a23 b20 b21 b22 b23 122 // a20 a21 a22 a23 b20 b21 b22 b23
132 // a30 a31 a32 a33 b30 b31 b32 b33 123 // a30 a31 a32 a33 b30 b31 b32 b33
133 124
134 // Transpose the two 4x4. 125 // Transpose the two 4x4.
135 const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); 126 VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
136 const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
137 const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
138 const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
139 // a00 a10 a01 a11 a02 a12 a03 a13
140 // a20 a30 a21 a31 a22 a32 a23 a33
141 // b00 b10 b01 b11 b02 b12 b03 b13
142 // b20 b30 b21 b31 b22 b32 b23 b33
143 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
144 const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
145 const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
146 const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
147 // a00 a10 a20 a30 a01 a11 a21 a31
148 // b00 b10 b20 b30 b01 b11 b21 b31
149 // a02 a12 a22 a32 a03 a13 a23 a33
150 // b02 b12 b22 b32 b03 b13 b23 b33
151 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
152 tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
153 tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
154 tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
155 // a00 a10 a20 a30 b00 b10 b20 b30
156 // a01 a11 a21 a31 b01 b11 b21 b31
157 // a02 a12 a22 a32 b02 b12 b22 b32
158 // a03 a13 a23 a33 b03 b13 b23 b33
159 } 127 }
160 128
161 // Vertical pass and difference of weighted sums. 129 // Horizontal pass and difference of weighted sums.
162 { 130 {
163 // Load all inputs. 131 // Load all inputs.
164 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); 132 const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
165 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); 133 const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
166 134
167 // Calculate a and b (two 4x4 at once). 135 // Calculate a and b (two 4x4 at once).
168 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); 136 const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
169 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); 137 const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
170 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); 138 const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
171 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); 139 const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
(...skipping 16 matching lines...) Expand all
188 // weighted sums 156 // weighted sums
189 A_b0 = _mm_madd_epi16(A_b0, w_0); 157 A_b0 = _mm_madd_epi16(A_b0, w_0);
190 A_b2 = _mm_madd_epi16(A_b2, w_8); 158 A_b2 = _mm_madd_epi16(A_b2, w_8);
191 B_b0 = _mm_madd_epi16(B_b0, w_0); 159 B_b0 = _mm_madd_epi16(B_b0, w_0);
192 B_b2 = _mm_madd_epi16(B_b2, w_8); 160 B_b2 = _mm_madd_epi16(B_b2, w_8);
193 A_b0 = _mm_add_epi32(A_b0, A_b2); 161 A_b0 = _mm_add_epi32(A_b0, A_b2);
194 B_b0 = _mm_add_epi32(B_b0, B_b2); 162 B_b0 = _mm_add_epi32(B_b0, B_b2);
195 163
196 // difference of weighted sums 164 // difference of weighted sums
197 A_b2 = _mm_sub_epi32(A_b0, B_b0); 165 A_b2 = _mm_sub_epi32(A_b0, B_b0);
198 // cascading summation of the differences 166 _mm_storeu_si128((__m128i*)&sum[0], A_b2);
199 B_b0 = _mm_hadd_epi32(A_b2, A_b2);
200 B_b2 = _mm_hadd_epi32(B_b0, B_b0);
201 return _mm_cvtsi128_si32(B_b2);
202 } 167 }
168 return sum[0] + sum[1] + sum[2] + sum[3];
203 } 169 }
204 170
205 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, 171 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
206 const uint16_t* const w) { 172 const uint16_t* const w) {
207 const int diff_sum = TTransform(a, b, w); 173 const int diff_sum = TTransform(a, b, w);
208 return abs(diff_sum) >> 5; 174 return abs(diff_sum) >> 5;
209 } 175 }
210 176
211 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, 177 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
212 const uint16_t* const w) { 178 const uint16_t* const w) {
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
364 VP8EncQuantizeBlockWHT = QuantizeBlockWHT; 330 VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
365 VP8TDisto4x4 = Disto4x4; 331 VP8TDisto4x4 = Disto4x4;
366 VP8TDisto16x16 = Disto16x16; 332 VP8TDisto16x16 = Disto16x16;
367 } 333 }
368 334
369 #else // !WEBP_USE_SSE41 335 #else // !WEBP_USE_SSE41
370 336
371 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41) 337 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
372 338
373 #endif // WEBP_USE_SSE41 339 #endif // WEBP_USE_SSE41
OLDNEW
« no previous file with comments | « third_party/libwebp/dsp/enc_sse2.c ('k') | third_party/libwebp/dsp/filters.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698