/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "third_party/libyuv/include/libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the final >> 2 of the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
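
// The kScaleAc33/kScaleAb2 tables above hold 16-bit reciprocals: pmulhuw
// keeps the high 16 bits of an unsigned 16x16 multiply, so multiplying a
// box sum by 65536 / n approximates division by n. A minimal C sketch of
// that trick (illustrative helper, not part of the libyuv API):
static uint16 ScaleSumByReciprocal_CRef(uint16 sum, uint16 reciprocal) {
  // e.g. reciprocal == 65536 / 9 divides a 3x3 box sum by 9.
  return (uint16)(((uint32)sum * reciprocal) >> 16);
}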

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}
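
// For reference, a minimal C equivalent of ScaleRowDown2_SSE2 (hypothetical
// name, illustrative only): the psrlw/packuswb pair above selects the odd
// source byte of every pair.
static void ScaleRowDown2_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  // Point sampling uses a single row.
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // Keep the odd pixel, as the asm does.
  }
}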

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}
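
// For reference, a C sketch of the 2x2 box filter above (illustrative name;
// the asm's cascaded pavgb/pavgw rounds each average, so results can differ
// from this exact sum by one):
static void ScaleRowDown2Box_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}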

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none; uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none; uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none; uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]    // src_ptr
    mov esi, [esp + 8 + 8]    // src_stride
    mov edx, [esp + 8 + 12]   // dst_ptr
    mov ecx, [esp + 8 + 16]   // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw xmm7, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, [eax + esi * 2]
    movdqa xmm3, [eax + esi * 2 + 16]
    movdqa xmm4, [eax + edi]
    movdqa xmm5, [eax + edi + 16]
    lea eax, [eax + 32]
    pavgb xmm2, xmm4
    pavgb xmm3, xmm5
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm7
    pand xmm3, xmm7
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    movdqa xmm2, xmm0         // average columns (16 to 8 pixels)
    psrlw xmm0, 8
    pand xmm2, xmm7
    pavgw xmm0, xmm2
    packuswb xmm0, xmm0

    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    pop edi
    pop esi
    ret
  }
}
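
// For reference, a C sketch of the 4x4 box filter above (illustrative name;
// the asm averages pairs of rows, then pairs of columns twice, so its
// per-stage rounding can differ slightly from this exact 16-pixel sum):
static void ScaleRowDown4Box_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    int i, j;
    for (j = 0; j < 4; ++j) {    // 4 rows
      for (i = 0; i < 4; ++i) {  // 4 columns
        sum += src_ptr[j * src_stride + x * 4 + i];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}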

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm3, kShuf0
    movdqa xmm4, kShuf1
    movdqa xmm5, kShuf2

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}
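
// For reference, the 3/4 point sample above keeps source pixels 0, 1 and 3
// of every group of 4 (see the kShuf0/kShuf1/kShuf2 patterns). A C sketch
// (illustrative name):
static void ScaleRowDown34_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}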

// Blends 32x2 rectangle to 24x1.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}
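
// For reference, a C sketch of one output triple of the filtered 3/4 scaler
// above (illustrative name). After the two source rows are averaged, the
// kShuf*/kMadd* pairs weight 4 source pixels into 3 results with weights
// (3,1), (2,2) and (1,3), then add kRound34 and shift right by 2:
static void ScaleRowDown34_Filter_CRef(const uint8* s /* averaged row */,
                                       uint8* d) {
  d[0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
}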

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm4, kShuf38a
    movdqa xmm5, kShuf38b

    align 4
  xloop:
    movdqa xmm0, [eax]        // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [eax + 16]   // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    sub ecx, 12
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    jg xloop

    ret
  }
}
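
// For reference, the 3/8 point sample above keeps source pixels 0, 3, 6, 8,
// 11 and 14 of every group of 16 (see kShuf38a/kShuf38b). A C sketch
// (illustrative name):
static void ScaleRowDown38_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; x += 6) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr[3] = src_ptr[8];
    dst_ptr[4] = src_ptr[11];
    dst_ptr[5] = src_ptr[14];
    dst_ptr += 6;
    src_ptr += 16;
  }
}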

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAc
    movdqa xmm3, kShufAc3
    movdqa xmm4, kScaleAc33
    pxor xmm5, xmm5

    align 4
  xloop:
    movdqa xmm0, [eax]        // sum up 3 rows into xmm0/1
    movdqa xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0         // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1         // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4        // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    sub ecx, 6
    movd [edx], xmm6          // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAb0
    movdqa xmm3, kShufAb1
    movdqa xmm4, kShufAb2
    movdqa xmm5, kScaleAb2

    align 4
  xloop:
    movdqa xmm0, [eax]        // average 2 rows into xmm0
    pavgb xmm0, [eax + esi]
    lea eax, [eax + 16]

    movdqa xmm1, xmm0         // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5        // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    sub ecx, 6
    movd [edx], xmm1          // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov esi, [esp + 16 + 4]    // src_ptr
    mov edx, [esp + 16 + 8]    // src_stride
    mov edi, [esp + 16 + 12]   // dst_ptr
    mov ecx, [esp + 16 + 16]   // src_width
    mov ebx, [esp + 16 + 20]   // src_height
    pxor xmm4, xmm4
    dec ebx

    align 4
  xloop:
    // first row
    movdqa xmm0, [esi]
    lea eax, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    lea esi, [esi + 16]
    mov ebp, ebx
    test ebp, ebp
    je ydone

    // sum remaining rows
    align 4
  yloop:
    movdqa xmm2, [eax]        // read 16 pixels
    lea eax, [eax + edx]      // advance to next row
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    paddusw xmm0, xmm2        // sum 16 words
    paddusw xmm1, xmm3
    sub ebp, 1
    jg yloop

    align 4
  ydone:
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm1
    lea edi, [edi + 32]

    sub ecx, 16
    jg xloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}
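
// For reference, a C equivalent of ScaleAddRows_SSE2 (illustrative name):
// each output word is the sum of src_height bytes down one column. The asm
// uses saturating adds (paddusw), which only matters for very tall sums.
static void ScaleAddRows_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint32 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);  // mimic saturation
  }
}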

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//     xor ebx, ebx
//     mov bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//     movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]     // dst_ptr
    mov esi, [esp + 12 + 8]     // src_ptr
    mov ecx, [esp + 12 + 12]    // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000         // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6          // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1         // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2           // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0        // x0 x1
    punpckldq xmm3, xmm3        // dx dx
    paddd xmm3, xmm3            // dx * 2, dx * 2
    pextrw edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2           // x0, x1 fractions.
    paddd xmm2, xmm3            // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9               // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5           // 0011
    punpcklwd xmm0, xmm4
    pxor xmm1, xmm6             // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1        // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0         // 8 bits, 2 pixels.
    movd ebx, xmm0
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2                  // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9               // 7 bit fractions.
    pshufb xmm2, xmm5           // 0011
    pxor xmm2, xmm6             // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2        // 16 bit
    psrlw xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0         // 8 bits
    movd ebx, xmm0
    mov [edi], bl

    align 4
  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}
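
// For reference, a C sketch of the per-pixel blend performed above
// (illustrative helper). x is a 16.16 fixed-point source position; the asm
// keeps a 7-bit fraction (psrlw xmm1, 9), inverts it with pxor 0x7f, and
// blends two neighboring pixels with pmaddubsw:
static uint8 ScaleFilterPixel_CRef(const uint8* src_ptr, int x) {
  int xi = x >> 16;         // integer pixel position
  int f = (x >> 9) & 0x7f;  // 7-bit fraction
  return (uint8)((src_ptr[xi] * (127 - f) + src_ptr[xi + 1] * f) >> 7);
}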

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_ptr
    mov eax, [esp + 8]    // src_ptr
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    sub ecx, 32
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
    // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
    // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_argb
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_argb
    mov ecx, [esp + 4 + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0         // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}
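
// For reference, a C sketch of the ARGB 2x2 box filter above (illustrative
// name; pavgb rounds each average, so results can differ from this exact
// sum by one per channel):
static void ScaleARGBRowDown2Box_CRef(const uint8* src_argb,
                                      ptrdiff_t src_stride,
                                      uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
      int i = x * 8 + c;       // left pixel of the 2x2 block, this channel
      dst_argb[x * 4 + c] = (uint8)((src_argb[i] + src_argb[i + 4] +
                                     src_argb[i + src_stride] +
                                     src_argb[i + src_stride + 4] + 2) >> 2);
    }
  }
}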

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]    // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]   // src_stepx
    mov edx, [esp + 8 + 16]   // dst_argb
    mov ecx, [esp + 8 + 20]   // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]    // src_argb
    mov esi, [esp + 12 + 8]    // src_stride
    mov ebx, [esp + 12 + 12]   // src_stepx
    mov edx, [esp + 12 + 16]   // dst_argb
    mov ecx, [esp + 12 + 20]   // dst_width
    lea esi, [eax + esi]       // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movq xmm0, qword ptr [eax]        // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]        // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2                  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0                 // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88           // even pixels
    shufps xmm2, xmm1, 0xdd           // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]     // dst_argb
    mov esi, [esp + 8 + 8]     // src_argb
    mov ecx, [esp + 8 + 12]    // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0       // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11    // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3           // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05    // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0           // x3 x2 x1 x0
    paddd xmm3, xmm3           // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0       // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1        // get x0 integer.
    pextrw edx, xmm2, 3        // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
    align 4
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    pextrw edx, xmm2, 7         // get x3 integer.
    paddd xmm2, xmm3            // x += dx
    punpckldq xmm0, xmm1        // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    punpckldq xmm1, xmm4        // x2 x3
    punpcklqdq xmm0, xmm1       // x0 x1 x2 x3
    sub ecx, 4                  // 4 pixels
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    jge xloop4

    align 4
  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    punpckldq xmm0, xmm1        // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixel.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
    align 4
  xloop99:

    pop esi
    pop edi
    ret
  }
}
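
// For reference, a C equivalent of ScaleARGBCols_SSE2 (illustrative name):
// point-sample whole ARGB pixels along a 16.16 fixed-point position.
static void ScaleARGBCols_CRef(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}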

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]     // dst_argb
    mov esi, [esp + 8 + 8]     // src_argb
    mov ecx, [esp + 8 + 12]    // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, kShuffleColARGB
    movdqa xmm5, kShuffleFractions
    pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1        // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2          // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0       // x0 x1
    punpckldq xmm3, xmm3       // dx dx
    paddd xmm3, xmm3           // dx * 2, dx * 2
    pextrw edx, xmm2, 3        // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2          // x0, x1 fractions.
    paddd xmm2, xmm3           // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9              // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5          // 0000000011111111
    pshufb xmm0, xmm4          // arrange pixels into pairs
    pxor xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1       // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1        // get x0 integer. next iteration.
    pextrw edx, xmm2, 3        // get x1 integer. next iteration.
    psrlw xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2                 // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9              // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5          // 00000000
    pshufb xmm0, xmm4          // arrange pixels into pairs
    pxor xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2       // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0        // argb 8 bits, 1 pixel.
    movd [edi], xmm0

    align 4
  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_argb
    mov eax, [esp + 8]    // src_argb
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}
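
// For reference, a portable C equivalent of FixedDiv_X86 (illustrative
// name): widen to 64 bits, shift into 16.16 and divide.
static int FixedDiv_CRef(int num, int div) {
  return (int)((((int64)(num)) << 16) / div);
}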

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    mov ecx, [esp + 8]    // denom
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
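
// For reference, a portable C equivalent of FixedDiv1_X86 (illustrative
// name), matching the asm's bias: subtract 0x00010001 from the 16.16
// numerator and 1 from the divisor.
static int FixedDiv1_CRef(int num, int div) {
  return (int)(((((int64)(num)) << 16) - 0x00010001) / (div - 1));
}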

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif