Unified Diff: source/libvpx/third_party/libyuv/source/scale_win.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
 /*
  * Copyright 2013 The LibYuv Project Authors. All rights reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
  * tree. An additional intellectual property rights grant can be found
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */

 #include "libyuv/row.h"
+#include "libyuv/scale_row.h"

 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif

 // This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)

 // Offsets for source bytes 0 to 9
 static uvec8 kShuf0 =
   { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
 static uvec8 kShuf1 =
   { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
(...skipping 56 matching lines...)

 // Arrange third value for pixels 0,1,2,3,4,5
 static uvec8 kShufAb2 =
   { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

 // Scaling values for boxes of 3x2 and 2x2
 static uvec16 kScaleAb2 =
   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 // Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width

   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     psrlw      xmm0, 8               // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
     jg         wloop

     ret
   }
 }

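[Editor's note] For reference, a scalar C sketch of this 2:1 point-sampling kernel. The helper name is hypothetical, not libyuv code; the psrlw 8 + packuswb sequence keeps the high byte of each 16-bit lane, i.e. the odd source pixels.

    // Scalar model of ScaleRowDown2_SSE2: keep odd pixels, drop the rest.
    static void ScaleRowDown2_C_sketch(const uint8_t* src_ptr,
                                       uint8_t* dst_ptr, int dst_width) {
      for (int i = 0; i < dst_width; ++i) {
        dst_ptr[i] = src_ptr[i * 2 + 1];  // take every odd byte
      }
    }
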
 // Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8

(...skipping 15 matching lines...)
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
     jg         wloop

     ret
   }
 }

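[Editor's note] The pcmpeqb/psrlw pair above is the usual register-only way to build a byte-lane mask without loading a memory constant. A sketch of the values involved (illustrative only; the helper name is made up):

    #include <stdint.h>

    // Model of the mask idiom: all-ones, then a 16-bit right shift,
    // yields 0x00ff in each word lane, selecting the even byte of each pair.
    static uint16_t MaskLane(void) {
      uint16_t all_ones = 0xffffu;        // pcmpeqb xmm5, xmm5
      return (uint16_t)(all_ones >> 8);   // psrlw xmm5, 8 -> 0x00ff
    }
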
 // Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
     psrlw      xmm5, 8
(...skipping 20 matching lines...)
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
     jg         wloop

     pop        esi
     ret
   }
 }

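[Editor's note] A scalar sketch of the 2x2 box reduction the kernel performs. This is the exact rounded 4-tap mean; the asm reaches a close result through cascaded rounding averages (pavgb/pavgw), which can differ by a rounding step. Helper name is hypothetical.

    #include <stdint.h>

    // Scalar model of a 32x2 -> 16x1 box filter: average each 2x2 block,
    // rounding to nearest.
    static void ScaleRowDown2Box_C_sketch(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr, int dst_width) {
      const uint8_t* s = src_ptr;
      const uint8_t* t = src_ptr + src_stride;
      for (int i = 0; i < dst_width; ++i) {
        dst_ptr[i] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
        s += 2;
        t += 2;
      }
    }
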
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8         // isolate odd pixels.
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    vpcmpeqb   ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw     ymm4, ymm4, 15
+    vpackuswb  ymm4, ymm4, ymm4
+    vpxor      ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+
+    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw     ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+    vpcmpeqb   ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpsrlw     ymm4, ymm4, 15
+    vpackuswb  ymm4, ymm4, ymm4
+    vpxor      ymm5, ymm5, ymm5      // constant 0
+
+  wloop:
+    vmovdqu    ymm0, [eax]           // average rows
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+
+    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw     ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         wloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
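[Editor's note] The recurring "vpermq ymm0, ymm0, 0xd8" deserves a word: 256-bit vpackuswb packs each 128-bit lane independently, so the packed halves come out interleaved by 64-bit quarters as [A0 B0 A1 B1]. The immediate 0xd8 (binary 11 01 10 00, i.e. qword order 0,2,1,3) restores linear order [A0 A1 B0 B1]. An intrinsics sketch of the idiom (illustrative, not libyuv code):

    #include <immintrin.h>

    // Pack two vectors of 16-bit values to bytes, preserving element order.
    // _mm256_packus_epi16 packs within each 128-bit lane, so a cross-lane
    // 64-bit permute (imm 0xd8 = qwords 0,2,1,3) is needed afterwards.
    static __m256i PackUnmutate(__m256i lo16, __m256i hi16) {
      __m256i packed = _mm256_packus_epi16(lo16, hi16);
      return _mm256_permute4x64_epi64(packed, 0xd8);
    }
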
 // Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
(...skipping 10 matching lines...)
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
     sub        ecx, 8
     jg         wloop

     ret
   }
 }

 // Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     push       edi
     mov        eax, [esp + 8 + 4]    // src_ptr
     mov        esi, [esp + 8 + 8]    // src_stride
     mov        edx, [esp + 8 + 12]   // dst_ptr
     mov        ecx, [esp + 8 + 16]   // dst_width
     lea        edi, [esi + esi * 2]  // src_stride * 3
     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
     psrlw      xmm7, 8

   wloop:
-    movdqu     xmm0, [eax]
+    movdqu     xmm0, [eax]           // average rows
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
     movdqu     xmm2, [eax + esi * 2]
     movdqu     xmm3, [eax + esi * 2 + 16]
     movdqu     xmm4, [eax + edi]
     movdqu     xmm5, [eax + edi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm2, xmm4
     pavgb      xmm3, xmm5
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
(...skipping 18 matching lines...)
     lea        edx, [edx + 8]
     sub        ecx, 8
     jg         wloop

     pop        edi
     pop        esi
     ret
   }
 }

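[Editor's note] The skipped lines reduce columns the same way the AVX2 version below does (assumed from that analog): mask out the even bytes, shift down the odd bytes, and take a rounding average. A sketch of the idea on one 16-bit lane holding two adjacent pixels (illustrative only):

    #include <stdint.h>

    // Average the two bytes packed in a word lane, as pand/psrlw/pavgw do.
    static uint8_t AverageColumns(uint16_t lane) {
      uint16_t even = lane & 0x00ff;          // pand with 0x00ff00ff mask
      uint16_t odd  = lane >> 8;              // psrlw 8
      return (uint8_t)((even + odd + 1) >> 1);  // pavgw: rounding average
    }
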
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    vpsrld     ymm5, ymm5, 24
+    vpslld     ymm5, ymm5, 16
+
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpsrlw     ymm0, ymm0, 8
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vmovdqu    [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+    mov        esi, [esp + 8 + 8]    // src_stride
+    mov        edx, [esp + 8 + 12]   // dst_ptr
+    mov        ecx, [esp + 8 + 16]   // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    vpcmpeqb   ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
+    vpsrlw     ymm7, ymm7, 8
+
+  wloop:
+    vmovdqu    ymm0, [eax]           // average rows
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vmovdqu    ymm2, [eax + esi * 2]
+    vmovdqu    ymm3, [eax + esi * 2 + 32]
+    vpavgb     ymm2, ymm2, [eax + edi]
+    vpavgb     ymm3, ymm3, [eax + edi + 32]
+    lea        eax,  [eax + 64]
+    vpavgb     ymm0, ymm0, ymm2
+    vpavgb     ymm1, ymm1, ymm3
+
+    vpand      ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
+    vpand      ymm3, ymm1, ymm7
+    vpsrlw     ymm0, ymm0, 8
+    vpsrlw     ymm1, ymm1, 8
+    vpavgw     ymm0, ymm0, ymm2
+    vpavgw     ymm1, ymm1, ymm3
+    vpackuswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vpand      ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
+    vpsrlw     ymm0, ymm0, 8
+    vpavgw     ymm0, ymm0, ymm2
+    vpackuswb  ymm0, ymm0, ymm0
+    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
+
+    vmovdqu    [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEROWDOWN4_AVX2
+
 // Point samples 32 pixels to 24 pixels.
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.

-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     movdqa     xmm3, kShuf0
     movdqa     xmm4, kShuf1
     movdqa     xmm5, kShuf2
(...skipping 26 matching lines...)
     // xmm0 src_row 0
     // xmm1 src_row 1
     // xmm2 shuf 0
     // xmm3 shuf 1
     // xmm4 shuf 2
     // xmm5 madd 0
     // xmm6 madd 1
     // xmm7 kRound34

 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
     movdqa     xmm2, kShuf01
(...skipping 36 matching lines...)
     lea        edx, [edx + 24]
     sub        ecx, 24
     jg         wloop

     pop        esi
     ret
   }
 }

 // Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
     movdqa     xmm2, kShuf01
(...skipping 41 matching lines...)
     jg         wloop

     pop        esi
     ret
   }
 }

 // 3/8 point sampler

 // Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
     movdqa     xmm4, kShuf38a
     movdqa     xmm5, kShuf38b

(...skipping 10 matching lines...)
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
     sub        ecx, 12
     jg         xloop

     ret
   }
 }

 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
     movdqa     xmm2, kShufAc
(...skipping 44 matching lines...)
     lea        edx, [edx + 6]
     sub        ecx, 6
     jg         xloop

     pop        esi
     ret
   }
 }

 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
     movdqa     xmm2, kShufAb0
(...skipping 23 matching lines...)
     movd       [edx + 2], xmm1
     lea        edx, [edx + 6]
     sub        ecx, 6
     jg         xloop

     pop        esi
     ret
   }
 }

-// Reads 16xN bytes and produces 16 shorts at a time.
-// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
-__declspec(naked) __declspec(align(16))
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
-                       int src_height) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        esi, [esp + 16 + 4]   // src_ptr
-    mov        edx, [esp + 16 + 8]   // src_stride
-    mov        edi, [esp + 16 + 12]  // dst_ptr
-    mov        ecx, [esp + 16 + 16]  // dst_width
-    mov        ebx, [esp + 16 + 20]  // height
-    pxor       xmm4, xmm4
-    dec        ebx
-
-  xloop:
-    // first row
-    movdqu     xmm0, [esi]
-    lea        eax, [esi + edx]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm4
-    punpckhbw  xmm1, xmm4
-    lea        esi, [esi + 16]
-    mov        ebp, ebx
-    test       ebp, ebp
-    je         ydone
-
-    // sum remaining rows
-  yloop:
-    movdqu     xmm2, [eax]       // read 16 pixels
-    lea        eax, [eax + edx]  // advance to next row
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm2, xmm4
-    punpckhbw  xmm3, xmm4
-    paddusw    xmm0, xmm2        // sum 16 words
-    paddusw    xmm1, xmm3
-    sub        ebp, 1
-    jg         yloop
-
-  ydone:
-    movdqu     [edi], xmm0
-    movdqu     [edi + 16], xmm1
-    lea        edi, [edi + 32]
-
-    sub        ecx, 16
-    jg         xloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_ptr
+    mov        edx, [esp + 8]    // dst_ptr
+    mov        ecx, [esp + 12]   // src_width
+    pxor       xmm5, xmm5
+
+    // sum rows
+  xloop:
+    movdqu     xmm3, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm1, [edx + 16]
+    movdqa     xmm2, xmm3
+    punpcklbw  xmm2, xmm5
+    punpckhbw  xmm3, xmm5
+    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm1, xmm3
+    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 16
+    jg         xloop
+    ret
+  }
+}

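[Editor's note] The new single-row form accumulates one source row into a uint16 sum buffer per call, with the caller looping over rows, instead of walking src_stride internally as the old ScaleAddRows did. A scalar sketch (hypothetical helper; note that paddusw saturates, while this plain add wraps):

    #include <stdint.h>

    // Scalar model of ScaleAddRow_SSE2: widen 8-bit pixels and add them
    // into a running 16-bit accumulator row.
    static void ScaleAddRow_C_sketch(const uint8_t* src_ptr,
                                     uint16_t* dst_ptr, int src_width) {
      for (int i = 0; i < src_width; ++i) {
        dst_ptr[i] = (uint16_t)(dst_ptr[i] + src_ptr[i]);
      }
    }
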
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+  __asm {
+    mov        eax, [esp + 4]    // src_ptr
+    mov        edx, [esp + 8]    // dst_ptr
+    mov        ecx, [esp + 12]   // src_width
+    vpxor      ymm5, ymm5, ymm5
+
+    // sum rows
+  xloop:
+    vmovdqu    ymm3, [eax]        // read 32 bytes
+    lea        eax, [eax + 32]
+    vpermq     ymm3, ymm3, 0xd8   // unmutate for vpunpck
+    vpunpcklbw ymm2, ymm3, ymm5
+    vpunpckhbw ymm3, ymm3, ymm5
+    vpaddusw   ymm0, ymm2, [edx]  // sum 16 words
+    vpaddusw   ymm1, ymm3, [edx + 32]
+    vmovdqu    [edx], ymm0        // write 32 words to destination
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 32
+    jg         xloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SCALEADDROW_AVX2
+
 // Bilinear column filtering. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-// TODO(fbarchard): Switch the following:
-//     xor ebx, ebx
-//     mov bx, word ptr [esi + eax]  // 2 source x0 pixels
-// To
-//     movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
-// when drmemory bug fixed.
-// https://code.google.com/p/drmemory/issues/detail?id=1396
-
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
     mov        edi, [esp + 12 + 4]    // dst_ptr
     mov        esi, [esp + 12 + 8]    // src_ptr
     mov        ecx, [esp + 12 + 12]   // dst_width
     movd       xmm2, [esp + 12 + 16]  // x
(...skipping 56 matching lines...)
   xloop99:

     pop        edi
     pop        esi
     pop        ebx
     ret
   }
 }

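[Editor's note] For orientation, a scalar sketch of bilinear column filtering with a 16.16 fixed-point x/dx walk. The SSSE3 kernel blends with 7-bit weights via pmaddubsw, so its rounding differs slightly from this 8-bit model; the sketch also ignores the last-pixel edge case the real kernel handles. Helper name is hypothetical.

    #include <stdint.h>

    // Scalar model of bilinear column sampling, 16.16 fixed point.
    static void ScaleFilterCols_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                         int dst_width, int x, int dx) {
      for (int i = 0; i < dst_width; ++i) {
        int xi = x >> 16;          // integer source position
        int f = (x >> 8) & 0xff;   // 8-bit fraction
        dst_ptr[i] =
            (uint8_t)((src_ptr[xi] * (256 - f) + src_ptr[xi + 1] * f) >> 8);
        x += dx;
      }
    }
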
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx) {
   __asm {
     mov        edx, [esp + 4]    // dst_ptr
     mov        eax, [esp + 8]    // src_ptr
     mov        ecx, [esp + 12]   // dst_width

   wloop:
     movdqu     xmm0, [eax]
     lea        eax,  [eax + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm0
     punpckhbw  xmm1, xmm1
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 32
     jg         wloop

     ret
   }
 }

 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                             ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
                                  // src_stride ignored
     mov        edx, [esp + 12]   // dst_argb
     mov        ecx, [esp + 16]   // dst_width

   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     shufps     xmm0, xmm1, 0xdd
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
     jg         wloop

     ret
   }
 }

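[Editor's note] On "shufps xmm0, xmm1, 0xdd": the immediate decodes to the 2-bit fields 3,1,3,1, so the result takes dwords 1 and 3 from each source register. Every second ARGB pixel of the 8 read survives. A scalar model (illustrative; helper name is made up):

    #include <stdint.h>

    // Model of shufps xmm0, xmm1, 0xdd over two 4-pixel ARGB registers:
    // result = { xmm0.dw1, xmm0.dw3, xmm1.dw1, xmm1.dw3 }
    static void ScaleARGBRowDown2_C_sketch(const uint32_t* src_argb,
                                           uint32_t* dst_argb, int dst_width) {
      for (int i = 0; i < dst_width; ++i) {
        dst_argb[i] = src_argb[i * 2 + 1];  // keep every second pixel
      }
    }
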
 // Blends 8x1 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   uint8* dst_argb, int dst_width) {
   __asm {
     mov        eax, [esp + 4]    // src_argb
                                  // src_stride ignored
     mov        edx, [esp + 12]   // dst_argb
     mov        ecx, [esp + 16]   // dst_width

   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     shufps     xmm0, xmm1, 0x88  // even pixels
     shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
     jg         wloop

     ret
   }
 }

 // Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                ptrdiff_t src_stride,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_argb
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_argb
     mov        ecx, [esp + 4 + 16]   // dst_width

(...skipping 13 matching lines...)
     lea        edx, [edx + 16]
     sub        ecx, 4
     jg         wloop

     pop        esi
     ret
   }
 }

 // Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       edi
     mov        eax, [esp + 8 + 4]    // src_argb
                                      // src_stride ignored
     mov        ebx, [esp + 8 + 12]   // src_stepx
     mov        edx, [esp + 8 + 16]   // dst_argb
(...skipping 15 matching lines...)
     sub        ecx, 4
     jg         wloop

     pop        edi
     pop        ebx
     ret
   }
 }

 // Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                   ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
     mov        eax, [esp + 12 + 4]   // src_argb
     mov        esi, [esp + 12 + 8]   // src_stride
(...skipping 27 matching lines...)
     jg         wloop

     pop        edi
     pop        esi
     pop        ebx
     ret
   }
 }

 // Column scaling unfiltered. SSE2 version.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx) {
   __asm {
     push       edi
     push       esi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
(...skipping 70 matching lines...)
 static uvec8 kShuffleColARGB = {
   0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
   8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };

 // Shuffle table for duplicating 2 fractions into 8 bytes each
 static uvec8 kShuffleFractions = {
   0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };

-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
   __asm {
     push       esi
     push       edi
     mov        edi, [esp + 8 + 4]    // dst_argb
     mov        esi, [esp + 8 + 8]    // src_argb
     mov        ecx, [esp + 8 + 12]   // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
(...skipping 50 matching lines...)

   xloop99:

     pop        edi
     pop        esi
     ret
   }
 }

 // Reads 4 pixels, duplicates them and writes 8 pixels.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx) {
   __asm {
     mov        edx, [esp + 4]    // dst_argb
     mov        eax, [esp + 8]    // src_argb
     mov        ecx, [esp + 12]   // dst_width

   wloop:
     movdqu     xmm0, [eax]
     lea        eax,  [eax + 16]
     movdqa     xmm1, xmm0
     punpckldq  xmm0, xmm0
     punpckhdq  xmm1, xmm1
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         wloop

     ret
   }
 }

 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     cdq                          // extend num to 64 bits
     shld       edx, eax, 16      // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
   }
 }

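[Editor's note] The cdq/shld/shl sequence builds the 64-bit value num << 16 in edx:eax before the signed divide. A C equivalent (illustrative helper name):

    #include <stdint.h>

    // C model of FixedDiv_X86: (num << 16) / div in 16.16 fixed point.
    static int FixedDiv_C_sketch(int num, int div) {
      return (int)(((int64_t)num << 16) / div);
    }
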
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
+__declspec(naked)
 int FixedDiv1_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     mov        ecx, [esp + 8]    // denom
     cdq                          // extend num to 64 bits
     shld       edx, eax, 16      // 32.16
     shl        eax, 16
     sub        eax, 0x00010001
     sbb        edx, 0
     sub        ecx, 1
     idiv       ecx
     ret
   }
 }
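[Editor's note] FixedDiv1 is the inclusive-endpoint variant: the sub/sbb pair subtracts the 64-bit constant 0x00010001 from num << 16, and the divisor is decremented, matching the C model below (illustrative; assumes div > 1):

    #include <stdint.h>

    // C model of FixedDiv1_X86: ((num << 16) - 0x00010001) / (div - 1).
    static int FixedDiv1_C_sketch(int num, int div) {
      return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
    }
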
-
-#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif