Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(196)

Side by Side Diff: source/scale_win.cc

Issue 1513183004: use rounding in scaledown by 2 (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: corrected version to 1554 Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/scale_gcc.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 /* 1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
88 // Arrange third value for pixels 0,1,2,3,4,5 88 // Arrange third value for pixels 0,1,2,3,4,5
89 static uvec8 kShufAb2 = 89 static uvec8 kShufAb2 =
90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; 90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
91 91
92 // Scaling values for boxes of 3x2 and 2x2 92 // Scaling values for boxes of 3x2 and 2x2
93 static uvec16 kScaleAb2 = 93 static uvec16 kScaleAb2 =
94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; 94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
95 95
96 // Reads 32 pixels, throws half away and writes 16 pixels. 96 // Reads 32 pixels, throws half away and writes 16 pixels.
97 __declspec(naked) 97 __declspec(naked)
98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 98 void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
99 uint8* dst_ptr, int dst_width) { 99 uint8* dst_ptr, int dst_width) {
100 __asm { 100 __asm {
101 mov eax, [esp + 4] // src_ptr 101 mov eax, [esp + 4] // src_ptr
102 // src_stride ignored 102 // src_stride ignored
103 mov edx, [esp + 12] // dst_ptr 103 mov edx, [esp + 12] // dst_ptr
104 mov ecx, [esp + 16] // dst_width 104 mov ecx, [esp + 16] // dst_width
105 105
106 wloop: 106 wloop:
107 movdqu xmm0, [eax] 107 movdqu xmm0, [eax]
108 movdqu xmm1, [eax + 16] 108 movdqu xmm1, [eax + 16]
109 lea eax, [eax + 32] 109 lea eax, [eax + 32]
110 psrlw xmm0, 8 // isolate odd pixels. 110 psrlw xmm0, 8 // isolate odd pixels.
111 psrlw xmm1, 8 111 psrlw xmm1, 8
112 packuswb xmm0, xmm1 112 packuswb xmm0, xmm1
113 movdqu [edx], xmm0 113 movdqu [edx], xmm0
114 lea edx, [edx + 16] 114 lea edx, [edx + 16]
115 sub ecx, 16 115 sub ecx, 16
116 jg wloop 116 jg wloop
117 117
118 ret 118 ret
119 } 119 }
120 } 120 }
121 121
122 // Blends 32x1 rectangle to 16x1. 122 // Blends 32x1 rectangle to 16x1.
123 __declspec(naked) 123 __declspec(naked)
124 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 124 void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
125 uint8* dst_ptr, int dst_width) { 125 uint8* dst_ptr, int dst_width) {
126 __asm { 126 __asm {
127 mov eax, [esp + 4] // src_ptr 127 mov eax, [esp + 4] // src_ptr
128 // src_stride 128 // src_stride
129 mov edx, [esp + 12] // dst_ptr 129 mov edx, [esp + 12] // dst_ptr
130 mov ecx, [esp + 16] // dst_width 130 mov ecx, [esp + 16] // dst_width
131 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 131
132 psrlw xmm5, 8 132 pcmpeqb xmm4, xmm4 // constant 0x0101
133 psrlw xmm4, 15
134 packuswb xmm4, xmm4
135 pxor xmm5, xmm5 // constant 0
133 136
134 wloop: 137 wloop:
135 movdqu xmm0, [eax] 138 movdqu xmm0, [eax]
136 movdqu xmm1, [eax + 16] 139 movdqu xmm1, [eax + 16]
137 lea eax, [eax + 32] 140 lea eax, [eax + 32]
138 141 pmaddubsw xmm0, xmm4 // horizontal add
139 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 142 pmaddubsw xmm1, xmm4
140 psrlw xmm0, 8 143 pavgw xmm0, xmm5 // (x + 1) / 2
141 movdqa xmm3, xmm1 144 pavgw xmm1, xmm5
142 psrlw xmm1, 8
143 pand xmm2, xmm5
144 pand xmm3, xmm5
145 pavgw xmm0, xmm2
146 pavgw xmm1, xmm3
147 packuswb xmm0, xmm1 145 packuswb xmm0, xmm1
148
149 movdqu [edx], xmm0 146 movdqu [edx], xmm0
150 lea edx, [edx + 16] 147 lea edx, [edx + 16]
151 sub ecx, 16 148 sub ecx, 16
152 jg wloop 149 jg wloop
153 150
154 ret 151 ret
155 } 152 }
156 } 153 }
157 154
158 // Blends 32x2 rectangle to 16x1. 155 // Blends 32x2 rectangle to 16x1.
159 __declspec(naked) 156 __declspec(naked)
160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 157 void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
161 uint8* dst_ptr, int dst_width) { 158 uint8* dst_ptr, int dst_width) {
162 __asm { 159 __asm {
163 push esi 160 push esi
164 mov eax, [esp + 4 + 4] // src_ptr 161 mov eax, [esp + 4 + 4] // src_ptr
165 mov esi, [esp + 4 + 8] // src_stride 162 mov esi, [esp + 4 + 8] // src_stride
166 mov edx, [esp + 4 + 12] // dst_ptr 163 mov edx, [esp + 4 + 12] // dst_ptr
167 mov ecx, [esp + 4 + 16] // dst_width 164 mov ecx, [esp + 4 + 16] // dst_width
168 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 165
169 psrlw xmm5, 8 166 pcmpeqb xmm4, xmm4 // constant 0x0101
167 psrlw xmm4, 15
168 packuswb xmm4, xmm4
169 pxor xmm5, xmm5 // constant 0
170 170
171 wloop: 171 wloop:
172 movdqu xmm0, [eax] 172 movdqu xmm0, [eax]
173 movdqu xmm1, [eax + 16] 173 movdqu xmm1, [eax + 16]
174 movdqu xmm2, [eax + esi] 174 movdqu xmm2, [eax + esi]
175 movdqu xmm3, [eax + esi + 16] 175 movdqu xmm3, [eax + esi + 16]
176 lea eax, [eax + 32] 176 lea eax, [eax + 32]
177 pavgb xmm0, xmm2 // average rows 177 pmaddubsw xmm0, xmm4 // horizontal add
178 pavgb xmm1, xmm3 178 pmaddubsw xmm1, xmm4
179 179 pmaddubsw xmm2, xmm4
180 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 180 pmaddubsw xmm3, xmm4
181 psrlw xmm0, 8 181 paddw xmm0, xmm2 // vertical add
182 movdqa xmm3, xmm1 182 paddw xmm1, xmm3
183 psrlw xmm1, 8 183 psrlw xmm0, 1
184 pand xmm2, xmm5 184 psrlw xmm1, 1
185 pand xmm3, xmm5 185 pavgw xmm0, xmm5 // (x + 1) / 2
186 pavgw xmm0, xmm2 186 pavgw xmm1, xmm5
187 pavgw xmm1, xmm3
188 packuswb xmm0, xmm1 187 packuswb xmm0, xmm1
189
190 movdqu [edx], xmm0 188 movdqu [edx], xmm0
191 lea edx, [edx + 16] 189 lea edx, [edx + 16]
192 sub ecx, 16 190 sub ecx, 16
193 jg wloop 191 jg wloop
194 192
195 pop esi 193 pop esi
196 ret 194 ret
197 } 195 }
198 } 196 }
199 197
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
238 236
239 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 237 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
240 vpsrlw ymm4, ymm4, 15 238 vpsrlw ymm4, ymm4, 15
241 vpackuswb ymm4, ymm4, ymm4 239 vpackuswb ymm4, ymm4, ymm4
242 vpxor ymm5, ymm5, ymm5 // constant 0 240 vpxor ymm5, ymm5, ymm5 // constant 0
243 241
244 wloop: 242 wloop:
245 vmovdqu ymm0, [eax] 243 vmovdqu ymm0, [eax]
246 vmovdqu ymm1, [eax + 32] 244 vmovdqu ymm1, [eax + 32]
247 lea eax, [eax + 64] 245 lea eax, [eax + 64]
248 246 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
249 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
250 vpmaddubsw ymm1, ymm1, ymm4 247 vpmaddubsw ymm1, ymm1, ymm4
251 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 248 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
252 vpavgw ymm1, ymm1, ymm5 249 vpavgw ymm1, ymm1, ymm5
253 vpackuswb ymm0, ymm0, ymm1 250 vpackuswb ymm0, ymm0, ymm1
254 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 251 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
255
256 vmovdqu [edx], ymm0 252 vmovdqu [edx], ymm0
257 lea edx, [edx + 32] 253 lea edx, [edx + 32]
258 sub ecx, 32 254 sub ecx, 32
259 jg wloop 255 jg wloop
260 256
261 vzeroupper 257 vzeroupper
262 ret 258 ret
263 } 259 }
264 } 260 }
265 261
262 // For rounding, average = (sum + 2) / 4
263 // becomes average((sum >> 1), 0)
266 // Blends 64x2 rectangle to 32x1. 264 // Blends 64x2 rectangle to 32x1.
267 __declspec(naked) 265 __declspec(naked)
268 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 266 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
269 uint8* dst_ptr, int dst_width) { 267 uint8* dst_ptr, int dst_width) {
270 __asm { 268 __asm {
271 push esi 269 push esi
272 mov eax, [esp + 4 + 4] // src_ptr 270 mov eax, [esp + 4 + 4] // src_ptr
273 mov esi, [esp + 4 + 8] // src_stride 271 mov esi, [esp + 4 + 8] // src_stride
274 mov edx, [esp + 4 + 12] // dst_ptr 272 mov edx, [esp + 4 + 12] // dst_ptr
275 mov ecx, [esp + 4 + 16] // dst_width 273 mov ecx, [esp + 4 + 16] // dst_width
276 274
277 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 275 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
278 vpsrlw ymm4, ymm4, 15 276 vpsrlw ymm4, ymm4, 15
279 vpackuswb ymm4, ymm4, ymm4 277 vpackuswb ymm4, ymm4, ymm4
280 vpxor ymm5, ymm5, ymm5 // constant 0 278 vpxor ymm5, ymm5, ymm5 // constant 0
281 279
282 wloop: 280 wloop:
283 vmovdqu ymm0, [eax] // average rows 281 vmovdqu ymm0, [eax]
284 vmovdqu ymm1, [eax + 32] 282 vmovdqu ymm1, [eax + 32]
285 vpavgb ymm0, ymm0, [eax + esi] 283 vmovdqu ymm2, [eax + esi]
286 vpavgb ymm1, ymm1, [eax + esi + 32] 284 vmovdqu ymm3, [eax + esi + 32]
287 lea eax, [eax + 64] 285 lea eax, [eax + 64]
288 286 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
289 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
290 vpmaddubsw ymm1, ymm1, ymm4 287 vpmaddubsw ymm1, ymm1, ymm4
288 vpmaddubsw ymm2, ymm2, ymm4
289 vpmaddubsw ymm3, ymm3, ymm4
290 vpaddw ymm0, ymm0, ymm2 // vertical add
291 vpaddw ymm1, ymm1, ymm3
292 vpsrlw ymm0, ymm0, 1
293 vpsrlw ymm1, ymm1, 1
291 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 294 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
292 vpavgw ymm1, ymm1, ymm5 295 vpavgw ymm1, ymm1, ymm5
293 vpackuswb ymm0, ymm0, ymm1 296 vpackuswb ymm0, ymm0, ymm1
294 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 297 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
295
296 vmovdqu [edx], ymm0 298 vmovdqu [edx], ymm0
297 lea edx, [edx + 32] 299 lea edx, [edx + 32]
298 sub ecx, 32 300 sub ecx, 32
299 jg wloop 301 jg wloop
300 302
301 pop esi 303 pop esi
302 vzeroupper 304 vzeroupper
303 ret 305 ret
304 } 306 }
305 } 307 }
(...skipping 1038 matching lines...) Expand 10 before | Expand all | Expand 10 after
1344 idiv ecx 1346 idiv ecx
1345 ret 1347 ret
1346 } 1348 }
1347 } 1349 }
1348 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 1350 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1349 1351
1350 #ifdef __cplusplus 1352 #ifdef __cplusplus
1351 } // extern "C" 1353 } // extern "C"
1352 } // namespace libyuv 1354 } // namespace libyuv
1353 #endif 1355 #endif
OLD | NEW
« no previous file with comments | « source/scale_gcc.cc ('k') | unit_test/planar_test.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698