Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(307)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c

Issue 668403002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h>
12 #include <xmmintrin.h>
13
14 #include "vpx/vpx_integer.h"
15
// Quantizes a block of 16-bit coefficients using SSE2 intrinsics.
//
// Work is done on groups of 16 coefficients (two 128-bit vectors of eight
// int16 lanes each). All loads and stores use _mm_load_si128 /
// _mm_store_si128, which require 16-byte aligned addresses. Both the
// skip and the non-skip path handle one group of 16 before n_coeffs is
// tested, so at least 16 entries of the output buffers are always written.
//
//   coeff_ptr        input coefficients.
//   n_coeffs         number of coefficients; the counter advances in steps
//                    of 16.
//   skip_block       when non-zero, qcoeff_ptr and dqcoeff_ptr are filled
//                    with zeros and *eob_ptr is set to 0.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                    eight-lane tables. Lanes 0..7 are applied to
//                    coefficients 0..7; after that the upper four lanes are
//                    duplicated and applied to every remaining coefficient.
//                    NOTE(review): presumably lane 0 is the DC entry and the
//                    remaining lanes hold the AC entry - confirm with caller.
//   qcoeff_ptr       output: quantized coefficients.
//   dqcoeff_ptr      output: low 16 bits of qcoeff * dequant.
//   zbin_oq_value    added to every zbin lane (as a 16-bit value).
//   eob_ptr          output: signed 16-bit maximum of (iscan value + 1) over
//                    the positions whose dequantized coefficient is
//                    non-zero, or 0 when there is none.
//   scan_ptr         unused by this implementation.
//   iscan_ptr        per-coefficient values read for the end-of-block scan.
16 void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
17 int skip_block, const int16_t* zbin_ptr,
18 const int16_t* round_ptr, const int16_t* quant_ptr,
19 const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
20 int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
21 int zbin_oq_value, uint16_t* eob_ptr,
22 const int16_t* scan_ptr,
23 const int16_t* iscan_ptr) {
24 __m128i zero;
25 (void)scan_ptr;
26
// Move every buffer pointer to its end and negate n_coeffs, so that
// "ptr + n_coeffs" walks the buffers front to back while n_coeffs counts
// up toward zero and the loop test is a simple sign check.
27 coeff_ptr += n_coeffs;
28 iscan_ptr += n_coeffs;
29 qcoeff_ptr += n_coeffs;
30 dqcoeff_ptr += n_coeffs;
31 n_coeffs = -n_coeffs;
32 zero = _mm_setzero_si128();
33 if (!skip_block) {
34 __m128i eob;
35 __m128i zbin;
36 __m128i round, quant, dequant, shift;
37 {
38 __m128i coeff0, coeff1;
39
40 // Setup global values
41 {
42 __m128i zbin_oq;
43 __m128i pw_1;
// zbin ends up as zbin + zbin_oq - 1, so the strict greater-than compare
// used below behaves as "abs(coeff) >= zbin + zbin_oq".
44 zbin_oq = _mm_set1_epi16(zbin_oq_value);
45 zbin = _mm_load_si128((const __m128i*)zbin_ptr);
46 round = _mm_load_si128((const __m128i*)round_ptr);
47 quant = _mm_load_si128((const __m128i*)quant_ptr);
48 zbin = _mm_add_epi16(zbin, zbin_oq);
49 pw_1 = _mm_set1_epi16(1);
50 zbin = _mm_sub_epi16(zbin, pw_1);
51 dequant = _mm_load_si128((const __m128i*)dequant_ptr);
52 shift = _mm_load_si128((const __m128i*)quant_shift_ptr);
53 }
54
55 {
56 __m128i coeff0_sign, coeff1_sign;
57 __m128i qcoeff0, qcoeff1;
58 __m128i qtmp0, qtmp1;
59 __m128i cmp_mask0, cmp_mask1;
60 // Do DC and first 15 AC
61 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
62 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
63
64 // Poor man's sign extract
// The arithmetic shift by 15 gives 0 for non-negative lanes and -1 (all
// ones) for negative lanes; (x ^ sign) - sign is then the absolute value.
65 coeff0_sign = _mm_srai_epi16(coeff0, 15);
66 coeff1_sign = _mm_srai_epi16(coeff1, 15);
67 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
68 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
69 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
70 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
71
// Each table is used as loaded for the first vector, then its upper 64 bits
// are copied into both halves (_mm_unpackhi_epi64(x, x)) before it is applied
// to the second vector. The tables keep that broadcast form for the loop.
// Per lane: tmp = saturating(abs + round);
//           q   = ((high16(tmp * quant) + tmp) * shift) >> 16 (high half).
72 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
73 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
74 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
75 qcoeff0 = _mm_adds_epi16(qcoeff0, round);
76 round = _mm_unpackhi_epi64(round, round);
77 qcoeff1 = _mm_adds_epi16(qcoeff1, round);
78 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
79 quant = _mm_unpackhi_epi64(quant, quant);
80 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
81 qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
82 qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
83 qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
84 shift = _mm_unpackhi_epi64(shift, shift);
85 qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
86
87 // Reinsert signs
88 qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
89 qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
90 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
91 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
92
93 // Mask out zbin threshold coeffs
// Lanes whose absolute value did not exceed the adjusted zbin become zero.
94 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
95 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
96
97 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
98 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
99
// coeff0/coeff1 are reused to hold the dequantized values (low 16 bits of
// the product); the end-of-block scan below reads them in that form.
100 coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
101 dequant = _mm_unpackhi_epi64(dequant, dequant);
102 coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
103
104 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
105 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
106 }
107
108 {
109 // Scan for eob
// nzero_coeff* is all ones (-1) in lanes whose dequantized value is
// non-zero and 0 elsewhere.
110 __m128i zero_coeff0, zero_coeff1;
111 __m128i nzero_coeff0, nzero_coeff1;
112 __m128i iscan0, iscan1;
113 __m128i eob1;
114 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
115 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
116 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
117 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
118 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
119 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
120 // Add one to convert from indices to counts
// Subtracting the -1 mask adds 1 in the non-zero lanes; the AND then clears
// the lanes that belong to zero coefficients.
121 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
122 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
123 eob = _mm_and_si128(iscan0, nzero_coeff0);
124 eob1 = _mm_and_si128(iscan1, nzero_coeff1);
125 eob = _mm_max_epi16(eob, eob1);
126 }
127 n_coeffs += 8 * 2;
128 }
129
130 // AC only loop
// Same per-lane arithmetic as the first group, but zbin, round, quant, shift
// and dequant are already in their broadcast form, so one vector of each
// serves both halves of the group.
131 while (n_coeffs < 0) {
132 __m128i coeff0, coeff1;
133 {
134 __m128i coeff0_sign, coeff1_sign;
135 __m128i qcoeff0, qcoeff1;
136 __m128i qtmp0, qtmp1;
137 __m128i cmp_mask0, cmp_mask1;
138
139 coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
140 coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
141
142 // Poor man's sign extract
143 coeff0_sign = _mm_srai_epi16(coeff0, 15);
144 coeff1_sign = _mm_srai_epi16(coeff1, 15);
145 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
146 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
147 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
148 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
149
150 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
151 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
152 qcoeff0 = _mm_adds_epi16(qcoeff0, round);
153 qcoeff1 = _mm_adds_epi16(qcoeff1, round);
154 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
155 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
156 qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
157 qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
158 qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
159 qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
160
161 // Reinsert signs
162 qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
163 qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
164 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
165 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
166
167 // Mask out zbin threshold coeffs
168 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
169 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
170
171 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
172 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
173
174 coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
175 coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
176
177 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
178 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
179 }
180
181 {
182 // Scan for eob
// Identical to the scan of the first group, with the result folded into the
// running per-lane maximum kept in eob.
183 __m128i zero_coeff0, zero_coeff1;
184 __m128i nzero_coeff0, nzero_coeff1;
185 __m128i iscan0, iscan1;
186 __m128i eob0, eob1;
187 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
188 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
189 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
190 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
191 iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
192 iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
193 // Add one to convert from indices to counts
194 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
195 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
196 eob0 = _mm_and_si128(iscan0, nzero_coeff0);
197 eob1 = _mm_and_si128(iscan1, nzero_coeff1);
198 eob0 = _mm_max_epi16(eob0, eob1);
199 eob = _mm_max_epi16(eob, eob0);
200 }
201 n_coeffs += 8 * 2;
202 }
203
204 // Accumulate EOB
// Horizontal signed maximum over the eight lanes of eob:
//   shuffle_epi32 0xe   brings the upper 64 bits down to the lower 64 bits,
//   shufflelo 0xe       brings words 2 and 3 down to words 0 and 1,
//   shufflelo 0x1       exchanges words 0 and 1,
// each followed by a max, which leaves the overall maximum in word 1.
205 {
206 __m128i eob_shuffled;
207 eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
208 eob = _mm_max_epi16(eob, eob_shuffled);
209 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
210 eob = _mm_max_epi16(eob, eob_shuffled);
211 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
212 eob = _mm_max_epi16(eob, eob_shuffled);
213 *eob_ptr = _mm_extract_epi16(eob, 1);
214 }
215 } else {
// Skip path: clear both output buffers 16 entries at a time. The do/while
// form writes one group of 16 before n_coeffs is tested.
216 do {
217 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
218 _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
219 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
220 _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
221 n_coeffs += 8 * 2;
222 } while (n_coeffs < 0);
223 *eob_ptr = 0;
224 }
225 }
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698