OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include "libyuv/row.h" | 11 #include "libyuv/row.h" |
| 12 #include "libyuv/scale_row.h" |
12 | 13 |
13 #ifdef __cplusplus | 14 #ifdef __cplusplus |
14 namespace libyuv { | 15 namespace libyuv { |
15 extern "C" { | 16 extern "C" { |
16 #endif | 17 #endif |
17 | 18 |
18 // This module is for Visual C x86. | 19 // This module is for Visual C x86. |
19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) | 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ |
| 21 defined(_MSC_VER) && !defined(__clang__) |
20 | 22 |
21 // Offsets for source bytes 0 to 9 | 23 // Offsets for source bytes 0 to 9 |
22 static uvec8 kShuf0 = | 24 static uvec8 kShuf0 = |
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; | 25 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; |
24 | 26 |
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. | 27 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. |
26 static uvec8 kShuf1 = | 28 static uvec8 kShuf1 = |
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; | 29 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; |
28 | 30 |
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. | 31 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
86 | 88 |
87 // Arrange third value for pixels 0,1,2,3,4,5 | 89 // Arrange third value for pixels 0,1,2,3,4,5 |
88 static uvec8 kShufAb2 = | 90 static uvec8 kShufAb2 = |
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; | 91 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
90 | 92 |
91 // Scaling values for boxes of 3x2 and 2x2 | 93 // Scaling values for boxes of 3x2 and 2x2 |
92 static uvec16 kScaleAb2 = | 94 static uvec16 kScaleAb2 = |
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; | 95 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
94 | 96 |
95 // Reads 32 pixels, throws half away and writes 16 pixels. | 97 // Reads 32 pixels, throws half away and writes 16 pixels. |
96 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. | 98 __declspec(naked) |
97 __declspec(naked) __declspec(align(16)) | |
98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
99 uint8* dst_ptr, int dst_width) { | 100 uint8* dst_ptr, int dst_width) { |
100 __asm { | 101 __asm { |
101 mov eax, [esp + 4] // src_ptr | 102 mov eax, [esp + 4] // src_ptr |
102 // src_stride ignored | 103 // src_stride ignored |
103 mov edx, [esp + 12] // dst_ptr | 104 mov edx, [esp + 12] // dst_ptr |
104 mov ecx, [esp + 16] // dst_width | 105 mov ecx, [esp + 16] // dst_width |
105 | 106 |
106 wloop: | 107 wloop: |
107 movdqu xmm0, [eax] | 108 movdqu xmm0, [eax] |
108 movdqu xmm1, [eax + 16] | 109 movdqu xmm1, [eax + 16] |
109 lea eax, [eax + 32] | 110 lea eax, [eax + 32] |
110 psrlw xmm0, 8 // isolate odd pixels. | 111 psrlw xmm0, 8 // isolate odd pixels. |
111 psrlw xmm1, 8 | 112 psrlw xmm1, 8 |
112 packuswb xmm0, xmm1 | 113 packuswb xmm0, xmm1 |
113 movdqu [edx], xmm0 | 114 movdqu [edx], xmm0 |
114 lea edx, [edx + 16] | 115 lea edx, [edx + 16] |
115 sub ecx, 16 | 116 sub ecx, 16 |
116 jg wloop | 117 jg wloop |
117 | 118 |
118 ret | 119 ret |
119 } | 120 } |
120 } | 121 } |
121 | 122 |
122 // Blends 32x1 rectangle to 16x1. | 123 // Blends 32x1 rectangle to 16x1. |
123 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. | 124 __declspec(naked) |
124 __declspec(naked) __declspec(align(16)) | |
125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
126 uint8* dst_ptr, int dst_width) { | 126 uint8* dst_ptr, int dst_width) { |
127 __asm { | 127 __asm { |
128 mov eax, [esp + 4] // src_ptr | 128 mov eax, [esp + 4] // src_ptr |
129 // src_stride | 129 // src_stride |
130 mov edx, [esp + 12] // dst_ptr | 130 mov edx, [esp + 12] // dst_ptr |
131 mov ecx, [esp + 16] // dst_width | 131 mov ecx, [esp + 16] // dst_width |
132 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 132 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
133 psrlw xmm5, 8 | 133 psrlw xmm5, 8 |
134 | 134 |
(...skipping 15 matching lines...) Expand all Loading... |
150 movdqu [edx], xmm0 | 150 movdqu [edx], xmm0 |
151 lea edx, [edx + 16] | 151 lea edx, [edx + 16] |
152 sub ecx, 16 | 152 sub ecx, 16 |
153 jg wloop | 153 jg wloop |
154 | 154 |
155 ret | 155 ret |
156 } | 156 } |
157 } | 157 } |
158 | 158 |
159 // Blends 32x2 rectangle to 16x1. | 159 // Blends 32x2 rectangle to 16x1. |
160 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. | 160 __declspec(naked) |
161 __declspec(naked) __declspec(align(16)) | |
162 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 161 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
163 uint8* dst_ptr, int dst_width) { | 162 uint8* dst_ptr, int dst_width) { |
164 __asm { | 163 __asm { |
165 push esi | 164 push esi |
166 mov eax, [esp + 4 + 4] // src_ptr | 165 mov eax, [esp + 4 + 4] // src_ptr |
167 mov esi, [esp + 4 + 8] // src_stride | 166 mov esi, [esp + 4 + 8] // src_stride |
168 mov edx, [esp + 4 + 12] // dst_ptr | 167 mov edx, [esp + 4 + 12] // dst_ptr |
169 mov ecx, [esp + 4 + 16] // dst_width | 168 mov ecx, [esp + 4 + 16] // dst_width |
170 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 169 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
171 psrlw xmm5, 8 | 170 psrlw xmm5, 8 |
(...skipping 20 matching lines...) Expand all Loading... |
192 movdqu [edx], xmm0 | 191 movdqu [edx], xmm0 |
193 lea edx, [edx + 16] | 192 lea edx, [edx + 16] |
194 sub ecx, 16 | 193 sub ecx, 16 |
195 jg wloop | 194 jg wloop |
196 | 195 |
197 pop esi | 196 pop esi |
198 ret | 197 ret |
199 } | 198 } |
200 } | 199 } |
201 | 200 |
| 201 #ifdef HAS_SCALEROWDOWN2_AVX2 |
| 202 // Reads 64 pixels, throws half away and writes 32 pixels. |
| 203 __declspec(naked) |
| 204 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 205 uint8* dst_ptr, int dst_width) { |
| 206 __asm { |
| 207 mov eax, [esp + 4] // src_ptr |
| 208 // src_stride ignored |
| 209 mov edx, [esp + 12] // dst_ptr |
| 210 mov ecx, [esp + 16] // dst_width |
| 211 |
| 212 wloop: |
| 213 vmovdqu ymm0, [eax] |
| 214 vmovdqu ymm1, [eax + 32] |
| 215 lea eax, [eax + 64] |
| 216 vpsrlw ymm0, ymm0, 8 // isolate odd pixels. |
| 217 vpsrlw ymm1, ymm1, 8 |
| 218 vpackuswb ymm0, ymm0, ymm1 |
| 219 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 220 vmovdqu [edx], ymm0 |
| 221 lea edx, [edx + 32] |
| 222 sub ecx, 32 |
| 223 jg wloop |
| 224 |
| 225 vzeroupper |
| 226 ret |
| 227 } |
| 228 } |
| 229 |
| 230 // Blends 64x1 rectangle to 32x1. |
| 231 __declspec(naked) |
| 232 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 233 uint8* dst_ptr, int dst_width) { |
| 234 __asm { |
| 235 mov eax, [esp + 4] // src_ptr |
| 236 // src_stride ignored |
| 237 mov edx, [esp + 12] // dst_ptr |
| 238 mov ecx, [esp + 16] // dst_width |
| 239 |
| 240 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b |
| 241 vpsrlw ymm4, ymm4, 15 |
| 242 vpackuswb ymm4, ymm4, ymm4 |
| 243 vpxor ymm5, ymm5, ymm5 // constant 0 |
| 244 |
| 245 wloop: |
| 246 vmovdqu ymm0, [eax] |
| 247 vmovdqu ymm1, [eax + 32] |
| 248 lea eax, [eax + 64] |
| 249 |
| 250 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally |
| 251 vpmaddubsw ymm1, ymm1, ymm4 |
| 252 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 |
| 253 vpavgw ymm1, ymm1, ymm5 |
| 254 vpackuswb ymm0, ymm0, ymm1 |
| 255 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 256 |
| 257 vmovdqu [edx], ymm0 |
| 258 lea edx, [edx + 32] |
| 259 sub ecx, 32 |
| 260 jg wloop |
| 261 |
| 262 vzeroupper |
| 263 ret |
| 264 } |
| 265 } |
| 266 |
| 267 // Blends 64x2 rectangle to 32x1. |
| 268 __declspec(naked) |
| 269 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 270 uint8* dst_ptr, int dst_width) { |
| 271 __asm { |
| 272 push esi |
| 273 mov eax, [esp + 4 + 4] // src_ptr |
| 274 mov esi, [esp + 4 + 8] // src_stride |
| 275 mov edx, [esp + 4 + 12] // dst_ptr |
| 276 mov ecx, [esp + 4 + 16] // dst_width |
| 277 |
| 278 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b |
| 279 vpsrlw ymm4, ymm4, 15 |
| 280 vpackuswb ymm4, ymm4, ymm4 |
| 281 vpxor ymm5, ymm5, ymm5 // constant 0 |
| 282 |
| 283 wloop: |
| 284 vmovdqu ymm0, [eax] // average rows |
| 285 vmovdqu ymm1, [eax + 32] |
| 286 vpavgb ymm0, ymm0, [eax + esi] |
| 287 vpavgb ymm1, ymm1, [eax + esi + 32] |
| 288 lea eax, [eax + 64] |
| 289 |
| 290 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally |
| 291 vpmaddubsw ymm1, ymm1, ymm4 |
| 292 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 |
| 293 vpavgw ymm1, ymm1, ymm5 |
| 294 vpackuswb ymm0, ymm0, ymm1 |
| 295 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 296 |
| 297 vmovdqu [edx], ymm0 |
| 298 lea edx, [edx + 32] |
| 299 sub ecx, 32 |
| 300 jg wloop |
| 301 |
| 302 pop esi |
| 303 vzeroupper |
| 304 ret |
| 305 } |
| 306 } |
| 307 #endif // HAS_SCALEROWDOWN2_AVX2 |
| 308 |
202 // Point samples 32 pixels to 8 pixels. | 309 // Point samples 32 pixels to 8 pixels. |
203 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | 310 __declspec(naked) |
204 __declspec(naked) __declspec(align(16)) | |
205 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 311 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
206 uint8* dst_ptr, int dst_width) { | 312 uint8* dst_ptr, int dst_width) { |
207 __asm { | 313 __asm { |
208 mov eax, [esp + 4] // src_ptr | 314 mov eax, [esp + 4] // src_ptr |
209 // src_stride ignored | 315 // src_stride ignored |
210 mov edx, [esp + 12] // dst_ptr | 316 mov edx, [esp + 12] // dst_ptr |
211 mov ecx, [esp + 16] // dst_width | 317 mov ecx, [esp + 16] // dst_width |
212 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 | 318 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 |
213 psrld xmm5, 24 | 319 psrld xmm5, 24 |
214 pslld xmm5, 16 | 320 pslld xmm5, 16 |
(...skipping 10 matching lines...) Expand all Loading... |
225 movq qword ptr [edx], xmm0 | 331 movq qword ptr [edx], xmm0 |
226 lea edx, [edx + 8] | 332 lea edx, [edx + 8] |
227 sub ecx, 8 | 333 sub ecx, 8 |
228 jg wloop | 334 jg wloop |
229 | 335 |
230 ret | 336 ret |
231 } | 337 } |
232 } | 338 } |
233 | 339 |
234 // Blends 32x4 rectangle to 8x1. | 340 // Blends 32x4 rectangle to 8x1. |
235 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | 341 __declspec(naked) |
236 __declspec(naked) __declspec(align(16)) | |
237 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 342 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
238 uint8* dst_ptr, int dst_width) { | 343 uint8* dst_ptr, int dst_width) { |
239 __asm { | 344 __asm { |
240 push esi | 345 push esi |
241 push edi | 346 push edi |
242 mov eax, [esp + 8 + 4] // src_ptr | 347 mov eax, [esp + 8 + 4] // src_ptr |
243 mov esi, [esp + 8 + 8] // src_stride | 348 mov esi, [esp + 8 + 8] // src_stride |
244 mov edx, [esp + 8 + 12] // dst_ptr | 349 mov edx, [esp + 8 + 12] // dst_ptr |
245 mov ecx, [esp + 8 + 16] // dst_width | 350 mov ecx, [esp + 8 + 16] // dst_width |
246 lea edi, [esi + esi * 2] // src_stride * 3 | 351 lea edi, [esi + esi * 2] // src_stride * 3 |
247 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff | 352 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff |
248 psrlw xmm7, 8 | 353 psrlw xmm7, 8 |
249 | 354 |
250 wloop: | 355 wloop: |
251 movdqu xmm0, [eax] | 356 movdqu xmm0, [eax] // average rows |
252 movdqu xmm1, [eax + 16] | 357 movdqu xmm1, [eax + 16] |
253 movdqu xmm2, [eax + esi] | 358 movdqu xmm2, [eax + esi] |
254 movdqu xmm3, [eax + esi + 16] | 359 movdqu xmm3, [eax + esi + 16] |
255 pavgb xmm0, xmm2 // average rows | 360 pavgb xmm0, xmm2 |
256 pavgb xmm1, xmm3 | 361 pavgb xmm1, xmm3 |
257 movdqu xmm2, [eax + esi * 2] | 362 movdqu xmm2, [eax + esi * 2] |
258 movdqu xmm3, [eax + esi * 2 + 16] | 363 movdqu xmm3, [eax + esi * 2 + 16] |
259 movdqu xmm4, [eax + edi] | 364 movdqu xmm4, [eax + edi] |
260 movdqu xmm5, [eax + edi + 16] | 365 movdqu xmm5, [eax + edi + 16] |
261 lea eax, [eax + 32] | 366 lea eax, [eax + 32] |
262 pavgb xmm2, xmm4 | 367 pavgb xmm2, xmm4 |
263 pavgb xmm3, xmm5 | 368 pavgb xmm3, xmm5 |
264 pavgb xmm0, xmm2 | 369 pavgb xmm0, xmm2 |
265 pavgb xmm1, xmm3 | 370 pavgb xmm1, xmm3 |
(...skipping 18 matching lines...) Expand all Loading... |
284 lea edx, [edx + 8] | 389 lea edx, [edx + 8] |
285 sub ecx, 8 | 390 sub ecx, 8 |
286 jg wloop | 391 jg wloop |
287 | 392 |
288 pop edi | 393 pop edi |
289 pop esi | 394 pop esi |
290 ret | 395 ret |
291 } | 396 } |
292 } | 397 } |
293 | 398 |
| 399 #ifdef HAS_SCALEROWDOWN4_AVX2 |
| 400 // Point samples 64 pixels to 16 pixels. |
| 401 __declspec(naked) |
| 402 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 403 uint8* dst_ptr, int dst_width) { |
| 404 __asm { |
| 405 mov eax, [esp + 4] // src_ptr |
| 406 // src_stride ignored |
| 407 mov edx, [esp + 12] // dst_ptr |
| 408 mov ecx, [esp + 16] // dst_width |
| 409 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 |
| 410 vpsrld ymm5, ymm5, 24 |
| 411 vpslld ymm5, ymm5, 16 |
| 412 |
| 413 wloop: |
| 414 vmovdqu ymm0, [eax] |
| 415 vmovdqu ymm1, [eax + 32] |
| 416 lea eax, [eax + 64] |
| 417 vpand ymm0, ymm0, ymm5 |
| 418 vpand ymm1, ymm1, ymm5 |
| 419 vpackuswb ymm0, ymm0, ymm1 |
| 420 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 421 vpsrlw ymm0, ymm0, 8 |
| 422 vpackuswb ymm0, ymm0, ymm0 |
| 423 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 424 vmovdqu [edx], xmm0 |
| 425 lea edx, [edx + 16] |
| 426 sub ecx, 16 |
| 427 jg wloop |
| 428 |
| 429 vzeroupper |
| 430 ret |
| 431 } |
| 432 } |
| 433 |
| 434 // Blends 64x4 rectangle to 16x1. |
| 435 __declspec(naked) |
| 436 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 437 uint8* dst_ptr, int dst_width) { |
| 438 __asm { |
| 439 push esi |
| 440 push edi |
| 441 mov eax, [esp + 8 + 4] // src_ptr |
| 442 mov esi, [esp + 8 + 8] // src_stride |
| 443 mov edx, [esp + 8 + 12] // dst_ptr |
| 444 mov ecx, [esp + 8 + 16] // dst_width |
| 445 lea edi, [esi + esi * 2] // src_stride * 3 |
| 446 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff |
| 447 vpsrlw ymm7, ymm7, 8 |
| 448 |
| 449 wloop: |
| 450 vmovdqu ymm0, [eax] // average rows |
| 451 vmovdqu ymm1, [eax + 32] |
| 452 vpavgb ymm0, ymm0, [eax + esi] |
| 453 vpavgb ymm1, ymm1, [eax + esi + 32] |
| 454 vmovdqu ymm2, [eax + esi * 2] |
| 455 vmovdqu ymm3, [eax + esi * 2 + 32] |
| 456 vpavgb ymm2, ymm2, [eax + edi] |
| 457 vpavgb ymm3, ymm3, [eax + edi + 32] |
| 458 lea eax, [eax + 64] |
| 459 vpavgb ymm0, ymm0, ymm2 |
| 460 vpavgb ymm1, ymm1, ymm3 |
| 461 |
| 462 vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) |
| 463 vpand ymm3, ymm1, ymm7 |
| 464 vpsrlw ymm0, ymm0, 8 |
| 465 vpsrlw ymm1, ymm1, 8 |
| 466 vpavgw ymm0, ymm0, ymm2 |
| 467 vpavgw ymm1, ymm1, ymm3 |
| 468 vpackuswb ymm0, ymm0, ymm1 |
| 469 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 470 |
| 471 vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) |
| 472 vpsrlw ymm0, ymm0, 8 |
| 473 vpavgw ymm0, ymm0, ymm2 |
| 474 vpackuswb ymm0, ymm0, ymm0 |
| 475 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 476 |
| 477 vmovdqu [edx], xmm0 |
| 478 lea edx, [edx + 16] |
| 479 sub ecx, 16 |
| 480 jg wloop |
| 481 |
| 482 pop edi |
| 483 pop esi |
| 484 vzeroupper |
| 485 ret |
| 486 } |
| 487 } |
| 488 #endif // HAS_SCALEROWDOWN4_AVX2 |
| 489 |
294 // Point samples 32 pixels to 24 pixels. | 490 // Point samples 32 pixels to 24 pixels. |
295 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. | 491 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. |
296 // Then shuffled to do the scaling. | 492 // Then shuffled to do the scaling. |
297 | 493 |
298 // Note that movdqa+palign may be better than movdqu. | 494 __declspec(naked) |
299 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | |
300 __declspec(naked) __declspec(align(16)) | |
301 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 495 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
302 uint8* dst_ptr, int dst_width) { | 496 uint8* dst_ptr, int dst_width) { |
303 __asm { | 497 __asm { |
304 mov eax, [esp + 4] // src_ptr | 498 mov eax, [esp + 4] // src_ptr |
305 // src_stride ignored | 499 // src_stride ignored |
306 mov edx, [esp + 12] // dst_ptr | 500 mov edx, [esp + 12] // dst_ptr |
307 mov ecx, [esp + 16] // dst_width | 501 mov ecx, [esp + 16] // dst_width |
308 movdqa xmm3, kShuf0 | 502 movdqa xmm3, kShuf0 |
309 movdqa xmm4, kShuf1 | 503 movdqa xmm4, kShuf1 |
310 movdqa xmm5, kShuf2 | 504 movdqa xmm5, kShuf2 |
(...skipping 26 matching lines...) Expand all Loading... |
337 // xmm0 src_row 0 | 531 // xmm0 src_row 0 |
338 // xmm1 src_row 1 | 532 // xmm1 src_row 1 |
339 // xmm2 shuf 0 | 533 // xmm2 shuf 0 |
340 // xmm3 shuf 1 | 534 // xmm3 shuf 1 |
341 // xmm4 shuf 2 | 535 // xmm4 shuf 2 |
342 // xmm5 madd 0 | 536 // xmm5 madd 0 |
343 // xmm6 madd 1 | 537 // xmm6 madd 1 |
344 // xmm7 kRound34 | 538 // xmm7 kRound34 |
345 | 539 |
346 // Note that movdqa+palignr may be better than movdqu. | 540 // Note that movdqa+palignr may be better than movdqu. |
347 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | 541 __declspec(naked) |
348 __declspec(naked) __declspec(align(16)) | |
349 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, | 542 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, |
350 ptrdiff_t src_stride, | 543 ptrdiff_t src_stride, |
351 uint8* dst_ptr, int dst_width) { | 544 uint8* dst_ptr, int dst_width) { |
352 __asm { | 545 __asm { |
353 push esi | 546 push esi |
354 mov eax, [esp + 4 + 4] // src_ptr | 547 mov eax, [esp + 4 + 4] // src_ptr |
355 mov esi, [esp + 4 + 8] // src_stride | 548 mov esi, [esp + 4 + 8] // src_stride |
356 mov edx, [esp + 4 + 12] // dst_ptr | 549 mov edx, [esp + 4 + 12] // dst_ptr |
357 mov ecx, [esp + 4 + 16] // dst_width | 550 mov ecx, [esp + 4 + 16] // dst_width |
358 movdqa xmm2, kShuf01 | 551 movdqa xmm2, kShuf01 |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
395 lea edx, [edx + 24] | 588 lea edx, [edx + 24] |
396 sub ecx, 24 | 589 sub ecx, 24 |
397 jg wloop | 590 jg wloop |
398 | 591 |
399 pop esi | 592 pop esi |
400 ret | 593 ret |
401 } | 594 } |
402 } | 595 } |
403 | 596 |
404 // Note that movdqa+palignr may be better than movdqu. | 597 // Note that movdqa+palignr may be better than movdqu. |
405 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. | 598 __declspec(naked) |
406 __declspec(naked) __declspec(align(16)) | |
407 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, | 599 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, |
408 ptrdiff_t src_stride, | 600 ptrdiff_t src_stride, |
409 uint8* dst_ptr, int dst_width) { | 601 uint8* dst_ptr, int dst_width) { |
410 __asm { | 602 __asm { |
411 push esi | 603 push esi |
412 mov eax, [esp + 4 + 4] // src_ptr | 604 mov eax, [esp + 4 + 4] // src_ptr |
413 mov esi, [esp + 4 + 8] // src_stride | 605 mov esi, [esp + 4 + 8] // src_stride |
414 mov edx, [esp + 4 + 12] // dst_ptr | 606 mov edx, [esp + 4 + 12] // dst_ptr |
415 mov ecx, [esp + 4 + 16] // dst_width | 607 mov ecx, [esp + 4 + 16] // dst_width |
416 movdqa xmm2, kShuf01 | 608 movdqa xmm2, kShuf01 |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
458 jg wloop | 650 jg wloop |
459 | 651 |
460 pop esi | 652 pop esi |
461 ret | 653 ret |
462 } | 654 } |
463 } | 655 } |
464 | 656 |
465 // 3/8 point sampler | 657 // 3/8 point sampler |
466 | 658 |
467 // Scale 32 pixels to 12 | 659 // Scale 32 pixels to 12 |
468 __declspec(naked) __declspec(align(16)) | 660 __declspec(naked) |
469 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 661 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
470 uint8* dst_ptr, int dst_width) { | 662 uint8* dst_ptr, int dst_width) { |
471 __asm { | 663 __asm { |
472 mov eax, [esp + 4] // src_ptr | 664 mov eax, [esp + 4] // src_ptr |
473 // src_stride ignored | 665 // src_stride ignored |
474 mov edx, [esp + 12] // dst_ptr | 666 mov edx, [esp + 12] // dst_ptr |
475 mov ecx, [esp + 16] // dst_width | 667 mov ecx, [esp + 16] // dst_width |
476 movdqa xmm4, kShuf38a | 668 movdqa xmm4, kShuf38a |
477 movdqa xmm5, kShuf38b | 669 movdqa xmm5, kShuf38b |
478 | 670 |
(...skipping 10 matching lines...) Expand all Loading... |
489 movd [edx + 8], xmm1 | 681 movd [edx + 8], xmm1 |
490 lea edx, [edx + 12] | 682 lea edx, [edx + 12] |
491 sub ecx, 12 | 683 sub ecx, 12 |
492 jg xloop | 684 jg xloop |
493 | 685 |
494 ret | 686 ret |
495 } | 687 } |
496 } | 688 } |
497 | 689 |
498 // Scale 16x3 pixels to 6x1 with interpolation | 690 // Scale 16x3 pixels to 6x1 with interpolation |
499 __declspec(naked) __declspec(align(16)) | 691 __declspec(naked) |
500 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, | 692 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, |
501 ptrdiff_t src_stride, | 693 ptrdiff_t src_stride, |
502 uint8* dst_ptr, int dst_width) { | 694 uint8* dst_ptr, int dst_width) { |
503 __asm { | 695 __asm { |
504 push esi | 696 push esi |
505 mov eax, [esp + 4 + 4] // src_ptr | 697 mov eax, [esp + 4 + 4] // src_ptr |
506 mov esi, [esp + 4 + 8] // src_stride | 698 mov esi, [esp + 4 + 8] // src_stride |
507 mov edx, [esp + 4 + 12] // dst_ptr | 699 mov edx, [esp + 4 + 12] // dst_ptr |
508 mov ecx, [esp + 4 + 16] // dst_width | 700 mov ecx, [esp + 4 + 16] // dst_width |
509 movdqa xmm2, kShufAc | 701 movdqa xmm2, kShufAc |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
554 lea edx, [edx + 6] | 746 lea edx, [edx + 6] |
555 sub ecx, 6 | 747 sub ecx, 6 |
556 jg xloop | 748 jg xloop |
557 | 749 |
558 pop esi | 750 pop esi |
559 ret | 751 ret |
560 } | 752 } |
561 } | 753 } |
562 | 754 |
563 // Scale 16x2 pixels to 6x1 with interpolation | 755 // Scale 16x2 pixels to 6x1 with interpolation |
564 __declspec(naked) __declspec(align(16)) | 756 __declspec(naked) |
565 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, | 757 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, |
566 ptrdiff_t src_stride, | 758 ptrdiff_t src_stride, |
567 uint8* dst_ptr, int dst_width) { | 759 uint8* dst_ptr, int dst_width) { |
568 __asm { | 760 __asm { |
569 push esi | 761 push esi |
570 mov eax, [esp + 4 + 4] // src_ptr | 762 mov eax, [esp + 4 + 4] // src_ptr |
571 mov esi, [esp + 4 + 8] // src_stride | 763 mov esi, [esp + 4 + 8] // src_stride |
572 mov edx, [esp + 4 + 12] // dst_ptr | 764 mov edx, [esp + 4 + 12] // dst_ptr |
573 mov ecx, [esp + 4 + 16] // dst_width | 765 mov ecx, [esp + 4 + 16] // dst_width |
574 movdqa xmm2, kShufAb0 | 766 movdqa xmm2, kShufAb0 |
(...skipping 23 matching lines...) Expand all Loading... |
598 movd [edx + 2], xmm1 | 790 movd [edx + 2], xmm1 |
599 lea edx, [edx + 6] | 791 lea edx, [edx + 6] |
600 sub ecx, 6 | 792 sub ecx, 6 |
601 jg xloop | 793 jg xloop |
602 | 794 |
603 pop esi | 795 pop esi |
604 ret | 796 ret |
605 } | 797 } |
606 } | 798 } |
607 | 799 |
608 // Reads 16xN bytes and produces 16 shorts at a time. | 800 // Reads 16 bytes and accumulates to 16 shorts at a time. |
609 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. | 801 __declspec(naked) |
610 __declspec(naked) __declspec(align(16)) | 802 void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { |
611 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | |
612 uint16* dst_ptr, int src_width, | |
613 int src_height) { | |
614 __asm { | 803 __asm { |
615 push esi | 804 mov eax, [esp + 4] // src_ptr |
616 push edi | 805 mov edx, [esp + 8] // dst_ptr |
617 push ebx | 806 mov ecx, [esp + 12] // src_width |
618 push ebp | 807 pxor xmm5, xmm5 |
619 mov esi, [esp + 16 + 4] // src_ptr | |
620 mov edx, [esp + 16 + 8] // src_stride | |
621 mov edi, [esp + 16 + 12] // dst_ptr | |
622 mov ecx, [esp + 16 + 16] // dst_width | |
623 mov ebx, [esp + 16 + 20] // height | |
624 pxor xmm4, xmm4 | |
625 dec ebx | |
626 | 808 |
| 809 // accumulate one source row into the destination sums |
627 xloop: | 810 xloop: |
628 // first row | 811 movdqu xmm3, [eax] // read 16 bytes |
629 movdqu xmm0, [esi] | 812 lea eax, [eax + 16] |
630 lea eax, [esi + edx] | 813 movdqu xmm0, [edx] // read 16 words from destination |
631 movdqa xmm1, xmm0 | 814 movdqu xmm1, [edx + 16] |
632 punpcklbw xmm0, xmm4 | 815 movdqa xmm2, xmm3 |
633 punpckhbw xmm1, xmm4 | 816 punpcklbw xmm2, xmm5 |
634 lea esi, [esi + 16] | 817 punpckhbw xmm3, xmm5 |
635 mov ebp, ebx | |
636 test ebp, ebp | |
637 je ydone | |
638 | |
639 // sum remaining rows | |
640 yloop: | |
641 movdqu xmm2, [eax] // read 16 pixels | |
642 lea eax, [eax + edx] // advance to next row | |
643 movdqa xmm3, xmm2 | |
644 punpcklbw xmm2, xmm4 | |
645 punpckhbw xmm3, xmm4 | |
646 paddusw xmm0, xmm2 // sum 16 words | 818 paddusw xmm0, xmm2 // sum 16 words |
647 paddusw xmm1, xmm3 | 819 paddusw xmm1, xmm3 |
648 sub ebp, 1 | 820 movdqu [edx], xmm0 // write 16 words to destination |
649 jg yloop | 821 movdqu [edx + 16], xmm1 |
650 | 822 lea edx, [edx + 32] |
651 ydone: | |
652 movdqu [edi], xmm0 | |
653 movdqu [edi + 16], xmm1 | |
654 lea edi, [edi + 32] | |
655 | |
656 sub ecx, 16 | 823 sub ecx, 16 |
657 jg xloop | 824 jg xloop |
658 | |
659 pop ebp | |
660 pop ebx | |
661 pop edi | |
662 pop esi | |
663 ret | 825 ret |
664 } | 826 } |
665 } | 827 } |
666 | 828 |
| 829 #ifdef HAS_SCALEADDROW_AVX2 |
| 830 // Reads 32 bytes and accumulates to 32 shorts at a time. |
| 831 __declspec(naked) |
| 832 void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { |
| 833 __asm { |
| 834 mov eax, [esp + 4] // src_ptr |
| 835 mov edx, [esp + 8] // dst_ptr |
| 836 mov ecx, [esp + 12] // src_width |
| 837 vpxor ymm5, ymm5, ymm5 |
| 838 |
| 839 // accumulate one source row into the destination sums |
| 840 xloop: |
| 841 vmovdqu ymm3, [eax] // read 32 bytes |
| 842 lea eax, [eax + 32] |
| 843 vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck |
| 844 vpunpcklbw ymm2, ymm3, ymm5 |
| 845 vpunpckhbw ymm3, ymm3, ymm5 |
| 846 vpaddusw ymm0, ymm2, [edx] // sum 16 words |
| 847 vpaddusw ymm1, ymm3, [edx + 32] |
| 848 vmovdqu [edx], ymm0 // write 32 words to destination |
| 849 vmovdqu [edx + 32], ymm1 |
| 850 lea edx, [edx + 64] |
| 851 sub ecx, 32 |
| 852 jg xloop |
| 853 |
| 854 vzeroupper |
| 855 ret |
| 856 } |
| 857 } |
| 858 #endif // HAS_SCALEADDROW_AVX2 |
| 859 |
667 // Bilinear column filtering. SSSE3 version. | 860 // Bilinear column filtering. SSSE3 version. |
668 // TODO(fbarchard): Port to Neon | 861 __declspec(naked) |
669 // TODO(fbarchard): Switch the following: | |
670 // xor ebx, ebx | |
671 // mov bx, word ptr [esi + eax] // 2 source x0 pixels | |
672 // To | |
673 // movzx ebx, word ptr [esi + eax] // 2 source x0 pixels | |
674 // when drmemory bug fixed. | |
675 // https://code.google.com/p/drmemory/issues/detail?id=1396 | |
676 | |
677 __declspec(naked) __declspec(align(16)) | |
678 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 862 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
679 int dst_width, int x, int dx) { | 863 int dst_width, int x, int dx) { |
680 __asm { | 864 __asm { |
681 push ebx | 865 push ebx |
682 push esi | 866 push esi |
683 push edi | 867 push edi |
684 mov edi, [esp + 12 + 4] // dst_ptr | 868 mov edi, [esp + 12 + 4] // dst_ptr |
685 mov esi, [esp + 12 + 8] // src_ptr | 869 mov esi, [esp + 12 + 8] // src_ptr |
686 mov ecx, [esp + 12 + 12] // dst_width | 870 mov ecx, [esp + 12 + 12] // dst_width |
687 movd xmm2, [esp + 12 + 16] // x | 871 movd xmm2, [esp + 12 + 16] // x |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
744 xloop99: | 928 xloop99: |
745 | 929 |
746 pop edi | 930 pop edi |
747 pop esi | 931 pop esi |
748 pop ebx | 932 pop ebx |
749 ret | 933 ret |
750 } | 934 } |
751 } | 935 } |
752 | 936 |
753 // Reads 16 pixels, duplicates them and writes 32 pixels. | 937 // Reads 16 pixels, duplicates them and writes 32 pixels. |
754 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 938 __declspec(naked) |
755 __declspec(naked) __declspec(align(16)) | |
756 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 939 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
757 int dst_width, int x, int dx) { | 940 int dst_width, int x, int dx) { |
758 __asm { | 941 __asm { |
759 mov edx, [esp + 4] // dst_ptr | 942 mov edx, [esp + 4] // dst_ptr |
760 mov eax, [esp + 8] // src_ptr | 943 mov eax, [esp + 8] // src_ptr |
761 mov ecx, [esp + 12] // dst_width | 944 mov ecx, [esp + 12] // dst_width |
762 | 945 |
763 wloop: | 946 wloop: |
764 movdqu xmm0, [eax] | 947 movdqu xmm0, [eax] |
765 lea eax, [eax + 16] | 948 lea eax, [eax + 16] |
766 movdqa xmm1, xmm0 | 949 movdqa xmm1, xmm0 |
767 punpcklbw xmm0, xmm0 | 950 punpcklbw xmm0, xmm0 |
768 punpckhbw xmm1, xmm1 | 951 punpckhbw xmm1, xmm1 |
769 movdqu [edx], xmm0 | 952 movdqu [edx], xmm0 |
770 movdqu [edx + 16], xmm1 | 953 movdqu [edx + 16], xmm1 |
771 lea edx, [edx + 32] | 954 lea edx, [edx + 32] |
772 sub ecx, 32 | 955 sub ecx, 32 |
773 jg wloop | 956 jg wloop |
774 | 957 |
775 ret | 958 ret |
776 } | 959 } |
777 } | 960 } |
778 | 961 |
779 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) | 962 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) |
780 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 963 __declspec(naked) |
781 __declspec(naked) __declspec(align(16)) | |
782 void ScaleARGBRowDown2_SSE2(const uint8* src_argb, | 964 void ScaleARGBRowDown2_SSE2(const uint8* src_argb, |
783 ptrdiff_t src_stride, | 965 ptrdiff_t src_stride, |
784 uint8* dst_argb, int dst_width) { | 966 uint8* dst_argb, int dst_width) { |
785 __asm { | 967 __asm { |
786 mov eax, [esp + 4] // src_argb | 968 mov eax, [esp + 4] // src_argb |
787 // src_stride ignored | 969 // src_stride ignored |
788 mov edx, [esp + 12] // dst_argb | 970 mov edx, [esp + 12] // dst_argb |
789 mov ecx, [esp + 16] // dst_width | 971 mov ecx, [esp + 16] // dst_width |
790 | 972 |
791 wloop: | 973 wloop: |
792 movdqu xmm0, [eax] | 974 movdqu xmm0, [eax] |
793 movdqu xmm1, [eax + 16] | 975 movdqu xmm1, [eax + 16] |
794 lea eax, [eax + 32] | 976 lea eax, [eax + 32] |
795 shufps xmm0, xmm1, 0xdd | 977 shufps xmm0, xmm1, 0xdd |
796 movdqu [edx], xmm0 | 978 movdqu [edx], xmm0 |
797 lea edx, [edx + 16] | 979 lea edx, [edx + 16] |
798 sub ecx, 4 | 980 sub ecx, 4 |
799 jg wloop | 981 jg wloop |
800 | 982 |
801 ret | 983 ret |
802 } | 984 } |
803 } | 985 } |
804 | 986 |
805 // Blends 8x1 rectangle to 4x1. | 987 // Blends 8x1 rectangle to 4x1. |
// Halves one row horizontally by averaging each pair of neighbouring 4-byte
// pixels: 32 source bytes in, 16 destination bytes out per iteration.
// src_stride is never read. dst_width is counted down by 4 per iteration and
// the loop test follows the body, so at least one iteration always runs.
806 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 988 __declspec(naked) |
807 __declspec(naked) __declspec(align(16)) | |
808 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, | 989 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, |
809 ptrdiff_t src_stride, | 990 ptrdiff_t src_stride, |
810 uint8* dst_argb, int dst_width) { | 991 uint8* dst_argb, int dst_width) { |
811 __asm { | 992 __asm { |
// Naked function: arguments are read directly relative to esp.
812 mov eax, [esp + 4] // src_argb | 993 mov eax, [esp + 4] // src_argb |
813 // src_stride ignored | 994 // src_stride ignored |
814 mov edx, [esp + 12] // dst_argb | 995 mov edx, [esp + 12] // dst_argb |
815 mov ecx, [esp + 16] // dst_width | 996 mov ecx, [esp + 16] // dst_width |
816 | 997 |
817 wloop: | 998 wloop: |
818 movdqu xmm0, [eax] | 999 movdqu xmm0, [eax] |
819 movdqu xmm1, [eax + 16] | 1000 movdqu xmm1, [eax + 16] |
820 lea eax, [eax + 32] | 1001 lea eax, [eax + 32] |
// shufps overwrites its destination, so keep a second copy of the first
// 4 pixels for the odd-pixel selection.
821 movdqa xmm2, xmm0 | 1002 movdqa xmm2, xmm0 |
822 shufps xmm0, xmm1, 0x88 // even pixels | 1003 shufps xmm0, xmm1, 0x88 // even pixels |
823 shufps xmm2, xmm1, 0xdd // odd pixels | 1004 shufps xmm2, xmm1, 0xdd // odd pixels |
// Per-byte rounded average of each even pixel with its odd neighbour.
824 pavgb xmm0, xmm2 | 1005 pavgb xmm0, xmm2 |
825 movdqu [edx], xmm0 | 1006 movdqu [edx], xmm0 |
826 lea edx, [edx + 16] | 1007 lea edx, [edx + 16] |
827 sub ecx, 4 | 1008 sub ecx, 4 |
828 jg wloop | 1009 jg wloop |
829 | 1010 |
830 ret | 1011 ret |
831 } | 1012 } |
832 } | 1013 } |
833 | 1014 |
834 // Blends 8x2 rectangle to 4x1. | 1015 // Blends 8x2 rectangle to 4x1. |
// Unlike the single-row Down2 variants, src_stride is loaded (into esi).
// NOTE(review): the loop body is collapsed in this listing; presumably esi
// is used to address the second source row - confirm against the full file.
835 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 1016 __declspec(naked) |
836 __declspec(naked) __declspec(align(16)) | |
837 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, | 1017 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, |
838 ptrdiff_t src_stride, | 1018 ptrdiff_t src_stride, |
839 uint8* dst_argb, int dst_width) { | 1019 uint8* dst_argb, int dst_width) { |
840 __asm { | 1020 __asm { |
// esi is saved here and restored before ret; the push moves esp down by
// 4, which is why every argument offset below carries an extra "4 +".
841 push esi | 1021 push esi |
842 mov eax, [esp + 4 + 4] // src_argb | 1022 mov eax, [esp + 4 + 4] // src_argb |
843 mov esi, [esp + 4 + 8] // src_stride | 1023 mov esi, [esp + 4 + 8] // src_stride |
844 mov edx, [esp + 4 + 12] // dst_argb | 1024 mov edx, [esp + 4 + 12] // dst_argb |
845 mov ecx, [esp + 4 + 16] // dst_width | 1025 mov ecx, [esp + 4 + 16] // dst_width |
846 | 1026 |
(...skipping 13 matching lines...) Expand all Loading... |
// Each iteration advances dst by 16 bytes and consumes 4 of dst_width.
860 lea edx, [edx + 16] | 1040 lea edx, [edx + 16] |
861 sub ecx, 4 | 1041 sub ecx, 4 |
862 jg wloop | 1042 jg wloop |
863 | 1043 |
864 pop esi | 1044 pop esi |
865 ret | 1045 ret |
866 } | 1046 } |
867 } | 1047 } |
868 | 1048 |
869 // Reads 4 pixels at a time. | 1049 // Reads 4 pixels at a time. |
// Takes an extra src_stepx argument compared with the Down2 variants;
// src_stride is not read.
// NOTE(review): the loop body is collapsed in this listing, so how src_stepx
// is applied (presumably as the pixel step between samples) must be
// confirmed against the full file.
870 // Alignment requirement: dst_argb 16 byte aligned. | 1050 __declspec(naked) |
871 __declspec(naked) __declspec(align(16)) | |
872 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, | 1051 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, |
873 int src_stepx, | 1052 int src_stepx, |
874 uint8* dst_argb, int dst_width) { | 1053 uint8* dst_argb, int dst_width) { |
875 __asm { | 1054 __asm { |
// ebx and edi are saved here and restored (in reverse order) before ret;
// the two pushes account for the "8 +" in every argument offset.
876 push ebx | 1055 push ebx |
877 push edi | 1056 push edi |
878 mov eax, [esp + 8 + 4] // src_argb | 1057 mov eax, [esp + 8 + 4] // src_argb |
879 // src_stride ignored | 1058 // src_stride ignored |
880 mov ebx, [esp + 8 + 12] // src_stepx | 1059 mov ebx, [esp + 8 + 12] // src_stepx |
881 mov edx, [esp + 8 + 16] // dst_argb | 1060 mov edx, [esp + 8 + 16] // dst_argb |
(...skipping 15 matching lines...) Expand all Loading... |
// dst_width is consumed 4 pixels per iteration.
897 sub ecx, 4 | 1076 sub ecx, 4 |
898 jg wloop | 1077 jg wloop |
899 | 1078 |
900 pop edi | 1079 pop edi |
901 pop ebx | 1080 pop ebx |
902 ret | 1081 ret |
903 } | 1082 } |
904 } | 1083 } |
905 | 1084 |
906 // Blends four 2x2 to 4x1. | 1085 // Blends four 2x2 to 4x1. |
// Box-filter counterpart of ScaleARGBRowDownEven_SSE2: here src_stride is
// loaded (into esi) rather than ignored.
// NOTE(review): the loop body is collapsed in this listing; the way
// src_stride and src_stepx are combined must be confirmed in the full file.
907 // Alignment requirement: dst_argb 16 byte aligned. | 1086 __declspec(naked) |
908 __declspec(naked) __declspec(align(16)) | |
909 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, | 1087 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, |
910 ptrdiff_t src_stride, | 1088 ptrdiff_t src_stride, |
911 int src_stepx, | 1089 int src_stepx, |
912 uint8* dst_argb, int dst_width) { | 1090 uint8* dst_argb, int dst_width) { |
913 __asm { | 1091 __asm { |
// Three registers are saved here and restored in reverse order before
// ret; the pushes account for the "12 +" in every argument offset.
914 push ebx | 1092 push ebx |
915 push esi | 1093 push esi |
916 push edi | 1094 push edi |
917 mov eax, [esp + 12 + 4] // src_argb | 1095 mov eax, [esp + 12 + 4] // src_argb |
918 mov esi, [esp + 12 + 8] // src_stride | 1096 mov esi, [esp + 12 + 8] // src_stride |
(...skipping 27 matching lines...) Expand all Loading... |
946 jg wloop | 1124 jg wloop |
947 | 1125 |
948 pop edi | 1126 pop edi |
949 pop esi | 1127 pop esi |
950 pop ebx | 1128 pop ebx |
951 ret | 1129 ret |
952 } | 1130 } |
953 } | 1131 } |
954 | 1132 |
955 // Column scaling unfiltered. SSE2 version. | 1133 // Column scaling unfiltered. SSE2 version. |
// Note the argument order: destination first, then source, then dst_width
// and the x / dx pair.
// NOTE(review): x and dx are presumably the 16.16 fixed-point start position
// and step used elsewhere in this module - the loop that consumes them is
// collapsed in this listing, so confirm in the full file.
956 __declspec(naked) __declspec(align(16)) | 1134 __declspec(naked) |
957 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, | 1135 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, |
958 int dst_width, int x, int dx) { | 1136 int dst_width, int x, int dx) { |
959 __asm { | 1137 __asm { |
// edi and esi are saved first, so argument offsets carry an extra "8 +".
960 push edi | 1138 push edi |
961 push esi | 1139 push esi |
962 mov edi, [esp + 8 + 4] // dst_argb | 1140 mov edi, [esp + 8 + 4] // dst_argb |
963 mov esi, [esp + 8 + 8] // src_argb | 1141 mov esi, [esp + 8 + 8] // src_argb |
964 mov ecx, [esp + 8 + 12] // dst_width | 1142 mov ecx, [esp + 8 + 12] // dst_width |
// x and dx go straight from the stack into the low dword of xmm2 / xmm3.
965 movd xmm2, [esp + 8 + 16] // x | 1143 movd xmm2, [esp + 8 + 16] // x |
966 movd xmm3, [esp + 8 + 20] // dx | 1144 movd xmm3, [esp + 8 + 20] // dx |
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
// Byte-shuffle control table: within each 8-byte half, byte i is paired with
// byte i + 4 (0,4 1,5 2,6 3,7), so the same channel of two adjacent 4-byte
// pixels ends up side by side.
// NOTE(review): presumably used as a pshufb mask by ScaleARGBFilterCols_SSSE3;
// the use site is not visible in this listing - confirm.
1037 static uvec8 kShuffleColARGB = { | 1215 static uvec8 kShuffleColARGB = { |
1038 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel | 1216 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel |
1039 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel | 1217 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel |
1040 }; | 1218 }; |
1041 | 1219 |
1042 // Shuffle table for duplicating 2 fractions into 8 bytes each | 1220 // Shuffle table for duplicating 2 fractions into 8 bytes each |
// Selector 0 fills the first 8 output bytes and selector 4 the last 8, i.e.
// the lowest byte of dword 0 and the lowest byte of dword 1 are each
// replicated 8 times.
// NOTE(review): presumably applied with pshufb - use site not visible here.
1043 static uvec8 kShuffleFractions = { | 1221 static uvec8 kShuffleFractions = { |
1044 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, | 1222 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, |
1045 }; | 1223 }; |
1046 | 1224 |
// SSSE3 column scaler taking the same (dst_argb, src_argb, dst_width, x, dx)
// arguments as ScaleARGBCols_SSE2.
// NOTE(review): the name indicates a filtered (interpolating) variant, but
// the loop body is collapsed in this listing - confirm in the full file.
1047 __declspec(naked) __declspec(align(16)) | 1225 __declspec(naked) |
1048 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, | 1226 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, |
1049 int dst_width, int x, int dx) { | 1227 int dst_width, int x, int dx) { |
1050 __asm { | 1228 __asm { |
// esi and edi are saved here and popped in reverse order at xloop99; the
// two pushes account for the "8 +" in every argument offset.
1051 push esi | 1229 push esi |
1052 push edi | 1230 push edi |
1053 mov edi, [esp + 8 + 4] // dst_argb | 1231 mov edi, [esp + 8 + 4] // dst_argb |
1054 mov esi, [esp + 8 + 8] // src_argb | 1232 mov esi, [esp + 8 + 8] // src_argb |
1055 mov ecx, [esp + 8 + 12] // dst_width | 1233 mov ecx, [esp + 8 + 12] // dst_width |
// x and dx go straight from the stack into the low dword of xmm2 / xmm3.
1056 movd xmm2, [esp + 8 + 16] // x | 1234 movd xmm2, [esp + 8 + 16] // x |
1057 movd xmm3, [esp + 8 + 20] // dx | 1235 movd xmm3, [esp + 8 + 20] // dx |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1108 | 1286 |
// Common exit: restore the saved registers and return.
1109 xloop99: | 1287 xloop99: |
1110 | 1288 |
1111 pop edi | 1289 pop edi |
1112 pop esi | 1290 pop esi |
1113 ret | 1291 ret |
1114 } | 1292 } |
1115 } | 1293 } |
1116 | 1294 |
1117 // Reads 4 pixels, duplicates them and writes 8 pixels. | 1295 // Reads 4 pixels, duplicates them and writes 8 pixels. |
// Doubles a row horizontally by writing every 4-byte source pixel twice:
// 16 source bytes in, 32 destination bytes out per iteration.
// The x and dx arguments are accepted but never read. dst_width is counted
// down by 8 per iteration and the loop test follows the body, so at least
// one iteration always runs.
1118 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 1296 __declspec(naked) |
1119 __declspec(naked) __declspec(align(16)) | |
1120 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, | 1297 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, |
1121 int dst_width, int x, int dx) { | 1298 int dst_width, int x, int dx) { |
1122 __asm { | 1299 __asm { |
// Naked function: arguments are read directly relative to esp.
1123 mov edx, [esp + 4] // dst_argb | 1300 mov edx, [esp + 4] // dst_argb |
1124 mov eax, [esp + 8] // src_argb | 1301 mov eax, [esp + 8] // src_argb |
1125 mov ecx, [esp + 12] // dst_width | 1302 mov ecx, [esp + 12] // dst_width |
1126 | 1303 |
1127 wloop: | 1304 wloop: |
1128 movdqu xmm0, [eax] | 1305 movdqu xmm0, [eax] |
1129 lea eax, [eax + 16] | 1306 lea eax, [eax + 16] |
1130 movdqa xmm1, xmm0 | 1307 movdqa xmm1, xmm0 |
// Interleaving a register with itself duplicates each dword:
// xmm0 = p0 p0 p1 p1, xmm1 = p2 p2 p3 p3.
1131 punpckldq xmm0, xmm0 | 1308 punpckldq xmm0, xmm0 |
1132 punpckhdq xmm1, xmm1 | 1309 punpckhdq xmm1, xmm1 |
1133 movdqu [edx], xmm0 | 1310 movdqu [edx], xmm0 |
1134 movdqu [edx + 16], xmm1 | 1311 movdqu [edx + 16], xmm1 |
1135 lea edx, [edx + 32] | 1312 lea edx, [edx + 32] |
1136 sub ecx, 8 | 1313 sub ecx, 8 |
1137 jg wloop | 1314 jg wloop |
1138 | 1315 |
1139 ret | 1316 ret |
1140 } | 1317 } |
1141 } | 1318 } |
1142 | 1319 |
1143 // Divide num by div and return as 16.16 fixed point result. | 1320 // Divide num by div and return as 16.16 fixed point result. |
// Computes (num << 16) / div with a 64-bit signed dividend and returns the
// 32-bit quotient in eax; the remainder left in edx is discarded.
// idiv raises a divide error when div is 0 or when the quotient does not fit
// in 32 bits; nothing in this function guards against either case.
1144 __declspec(naked) __declspec(align(16)) | 1321 __declspec(naked) |
1145 int FixedDiv_X86(int num, int div) { | 1322 int FixedDiv_X86(int num, int div) { |
1146 __asm { | 1323 __asm { |
1147 mov eax, [esp + 4] // num | 1324 mov eax, [esp + 4] // num |
1148 cdq // extend num to 64 bits | 1325 cdq // extend num to 64 bits |
// shld + shl together shift the 64-bit value in edx:eax left by 16.
1149 shld edx, eax, 16 // 32.16 | 1326 shld edx, eax, 16 // 32.16 |
1150 shl eax, 16 | 1327 shl eax, 16 |
// Divide edx:eax by the second argument (div), read straight from the stack.
1151 idiv dword ptr [esp + 8] | 1328 idiv dword ptr [esp + 8] |
1152 ret | 1329 ret |
1153 } | 1330 } |
1154 } | 1331 } |
1155 | 1332 |
1156 // Divide num by div and return as 16.16 fixed point result. | 1333 // Divide num by div and return as 16.16 fixed point result. |
// Despite sharing its header comment with FixedDiv_X86, this variant
// computes ((num << 16) - 0x00010001) / (div - 1) using a 64-bit signed
// dividend, and returns the 32-bit quotient in eax.
// idiv raises a divide error when div is 1 (the divisor becomes 0) or when
// the quotient does not fit in 32 bits; nothing here guards against that.
1157 __declspec(naked) __declspec(align(16)) | 1334 __declspec(naked) |
1158 int FixedDiv1_X86(int num, int div) { | 1335 int FixedDiv1_X86(int num, int div) { |
1159 __asm { | 1336 __asm { |
1160 mov eax, [esp + 4] // num | 1337 mov eax, [esp + 4] // num |
1161 mov ecx, [esp + 8] // denom | 1338 mov ecx, [esp + 8] // denom |
1162 cdq // extend num to 64 bits | 1339 cdq // extend num to 64 bits |
// shld + shl together shift the 64-bit value in edx:eax left by 16.
1163 shld edx, eax, 16 // 32.16 | 1340 shld edx, eax, 16 // 32.16 |
1164 shl eax, 16 | 1341 shl eax, 16 |
// 64-bit subtract of 0x00010001: sbb propagates the borrow into edx.
1165 sub eax, 0x00010001 | 1342 sub eax, 0x00010001 |
1166 sbb edx, 0 | 1343 sbb edx, 0 |
// Divisor is div - 1.
1167 sub ecx, 1 | 1344 sub ecx, 1 |
1168 idiv ecx | 1345 idiv ecx |
1169 ret | 1346 ret |
1170 } | 1347 } |
1171 } | 1348 } |
1172 | 1349 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
1173 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) | |
1174 | 1350 |
1175 #ifdef __cplusplus | 1351 #ifdef __cplusplus |
1176 } // extern "C" | 1352 } // extern "C" |
1177 } // namespace libyuv | 1353 } // namespace libyuv |
1178 #endif | 1354 #endif |
OLD | NEW |