OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "third_party/libyuv/include/libyuv/row.h" |
| 12 |
| 13 #ifdef __cplusplus |
| 14 namespace libyuv { |
| 15 extern "C" { |
| 16 #endif |
| 17 |
| 18 // This module is for Visual C x86. |
| 19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
| 20 |
| 21 #ifdef HAS_ARGBTOYROW_SSSE3 |
| 22 |
| 23 // Constants for ARGB. |
| 24 static const vec8 kARGBToY = { |
| 25 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
| 26 }; |
| 27 |
| 28 // JPeg full range. |
| 29 static const vec8 kARGBToYJ = { |
| 30 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
| 31 }; |
| 32 |
| 33 static const vec8 kARGBToU = { |
| 34 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 |
| 35 }; |
| 36 |
| 37 static const vec8 kARGBToUJ = { |
| 38 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 |
| 39 }; |
| 40 |
| 41 static const vec8 kARGBToV = { |
| 42 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, |
| 43 }; |
| 44 |
| 45 static const vec8 kARGBToVJ = { |
| 46 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 |
| 47 }; |
| 48 |
| 49 // vpermd for vphaddw + vpackuswb vpermd. |
| 50 static const lvec32 kPermdARGBToY_AVX = { |
| 51 0, 4, 1, 5, 2, 6, 3, 7 |
| 52 }; |
| 53 |
| 54 // vpshufb for vphaddw + vpackuswb packed to shorts. |
| 55 static const lvec8 kShufARGBToUV_AVX = { |
| 56 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
| 57 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
| 58 }; |
| 59 |
| 60 // Constants for BGRA. |
| 61 static const vec8 kBGRAToY = { |
| 62 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 |
| 63 }; |
| 64 |
| 65 static const vec8 kBGRAToU = { |
| 66 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 |
| 67 }; |
| 68 |
| 69 static const vec8 kBGRAToV = { |
| 70 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 |
| 71 }; |
| 72 |
| 73 // Constants for ABGR. |
| 74 static const vec8 kABGRToY = { |
| 75 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 |
| 76 }; |
| 77 |
| 78 static const vec8 kABGRToU = { |
| 79 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 |
| 80 }; |
| 81 |
| 82 static const vec8 kABGRToV = { |
| 83 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 |
| 84 }; |
| 85 |
| 86 // Constants for RGBA. |
| 87 static const vec8 kRGBAToY = { |
| 88 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 |
| 89 }; |
| 90 |
| 91 static const vec8 kRGBAToU = { |
| 92 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 |
| 93 }; |
| 94 |
| 95 static const vec8 kRGBAToV = { |
| 96 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 |
| 97 }; |
| 98 |
| 99 static const uvec8 kAddY16 = { |
| 100 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u |
| 101 }; |
| 102 |
| 103 static const vec16 kAddYJ64 = { |
| 104 64, 64, 64, 64, 64, 64, 64, 64 |
| 105 }; |
| 106 |
| 107 static const uvec8 kAddUV128 = { |
| 108 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
| 109 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u |
| 110 }; |
| 111 |
| 112 static const uvec16 kAddUVJ128 = { |
| 113 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u |
| 114 }; |
| 115 |
| 116 // Shuffle table for converting RGB24 to ARGB. |
| 117 static const uvec8 kShuffleMaskRGB24ToARGB = { |
| 118 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u |
| 119 }; |
| 120 |
| 121 // Shuffle table for converting RAW to ARGB. |
| 122 static const uvec8 kShuffleMaskRAWToARGB = { |
| 123 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u |
| 124 }; |
| 125 |
| 126 // Shuffle table for converting ARGB to RGB24. |
| 127 static const uvec8 kShuffleMaskARGBToRGB24 = { |
| 128 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u |
| 129 }; |
| 130 |
| 131 // Shuffle table for converting ARGB to RAW. |
| 132 static const uvec8 kShuffleMaskARGBToRAW = { |
| 133 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u |
| 134 }; |
| 135 |
| 136 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
| 137 static const uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 138 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
| 139 }; |
| 140 |
| 141 // Shuffle table for converting ARGB to RAW. |
| 142 static const uvec8 kShuffleMaskARGBToRAW_0 = { |
| 143 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
| 144 }; |
| 145 |
// Duplicates gray value 3 times and fills in alpha opaque.
// Converts 8 Y pixels per iteration to 32 bytes of ARGB; stores with movdqa,
// so dst_argb must be 16-byte aligned.  Loop structure (sub ecx, 8 / jg)
// expects pix to be a positive multiple of 8.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]  // 8 gray bytes
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0       // YY pairs
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0       // YYYY (low 4 pixels)
    punpckhwd  xmm1, xmm1       // YYYY (high 4 pixels)
    por        xmm0, xmm5       // set alpha = 0xff
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
| 174 |
// Same as I400ToARGBRow_SSE2 but stores with movdqu, so dst_argb may be
// unaligned.  Still processes 8 pixels per iteration.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5       // set alpha = 0xff
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
| 203 |
// Expand 16 RGB24 pixels (48 bytes) to 16 ARGB pixels (64 bytes) per
// iteration, filling alpha with 0xff.  Loads are movdqu (src may be
// unaligned); stores are movdqa, so dst_argb must be 16-byte aligned.
// pix is expected to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_rgb24
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRGB24ToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm1[8:15] xmm3[0:7] } low..high
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm0[12:15] xmm1[0:11] } low..high
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // rotate xmm3 right 4 bytes; first 12 used
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}
| 242 |
// Expand 16 RAW (BGR order) pixels (48 bytes) to 16 ARGB pixels (64 bytes)
// per iteration, filling alpha with 0xff.  Identical structure to
// RGB24ToARGBRow_SSSE3 except for the shuffle table, which also swaps R/B.
// movdqa stores require dst_argb to be 16-byte aligned; pix is expected to
// be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_raw
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRAWToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm1[8:15] xmm3[0:7] } low..high
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm0[12:15] xmm1[0:11] } low..high
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // rotate xmm3 right 4 bytes; first 12 used
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}
| 282 |
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
// Converts 8 RGB565 pixels per iteration to 32 bytes of ARGB.  Source is
// read with movdqu (may be unaligned); destination is written with movdqa,
// so dst_argb must be 16-byte aligned.  edx is pre-biased to dst - 2*src so
// [eax*2 + edx] addresses the output without a second counter.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw      xmm4, 10
    psrlw      xmm4, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax         // bias edx so dst = [eax*2 + edx]
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]      // fetch 8 pixels of bgr565
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm3       // R in upper 5 bits
    psllw      xmm2, 11         // B in upper 5 bits
    pmulhuw    xmm1, xmm5       // * (256 + 8)
    pmulhuw    xmm2, xmm5       // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2       // RB
    pand       xmm0, xmm4       // G in middle 6 bits
    pmulhuw    xmm0, xmm6       // << 5 * (256 + 4)
    por        xmm0, xmm7       // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
| 339 |
// 24 instructions
// Converts 8 ARGB1555 pixels per iteration to 32 bytes of ARGB, replicating
// the 1-bit alpha across the byte via psraw.  Source read with movdqu;
// destination written with movdqa, so dst_argb must be 16-byte aligned.
// edx is pre-biased to dst - 2*src (same trick as RGB565ToARGBRow_SSE2).
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    movdqa     xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw      xmm4, 6
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax         // bias edx so dst = [eax*2 + edx]
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]      // fetch 8 pixels of 1555
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    psllw      xmm1, 1          // R in upper 5 bits
    psllw      xmm2, 11         // B in upper 5 bits
    pand       xmm1, xmm3
    pmulhuw    xmm2, xmm5       // * (256 + 8)
    pmulhuw    xmm1, xmm5       // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2       // RB
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4       // G in middle 5 bits
    psraw      xmm2, 8          // A (arithmetic shift replicates sign bit)
    pmulhuw    xmm0, xmm6       // << 6 * (256 + 8)
    pand       xmm2, xmm7
    por        xmm0, xmm2       // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
| 393 |
// 18 instructions.
// Converts 8 ARGB4444 pixels per iteration to 32 bytes of ARGB by
// duplicating each nibble into a full byte.  Source read with movdqu;
// destination written with movdqa, so dst_argb must be 16-byte aligned.
// edx is pre-biased to dst - 2*src.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0
    movdqa     xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld      xmm5, 4
    mov        eax, [esp + 4]   // src_argb4444
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax         // bias edx so dst = [eax*2 + edx]
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]      // fetch 8 pixels of bgra4444
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4       // mask low nibbles
    pand       xmm2, xmm5       // mask high nibbles
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    psllw      xmm1, 4
    psrlw      xmm3, 4
    por        xmm0, xmm1       // low nibble duplicated into both halves
    por        xmm2, xmm3       // high nibble duplicated into both halves
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
| 433 |
// Pack 16 ARGB pixels (64 bytes) to 16 RGB24 pixels (48 bytes) per
// iteration, dropping alpha.  All loads/stores are movdqu, so neither
// pointer needs alignment.  pix is expected to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRGB24

    align      4
  convertloop:
    movdqu     xmm0, [eax]      // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1       // 4 bytes from 1 for 0
    psrldq     xmm1, 4          // 8 bytes from 1
    pslldq     xmm4, 12         // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2       // 8 bytes from 2 for 1
    por        xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq     xmm5, 8          // 8 bytes from 2 for 1
    movdqu     [edx], xmm0      // store 0
    por        xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq     xmm2, 8          // 4 bytes from 2
    pslldq     xmm3, 4          // 12 bytes from 3 for 2
    por        xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1 // store 1
    movdqu     [edx + 32], xmm2 // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
| 472 |
// Pack 16 ARGB pixels (64 bytes) to 16 RAW (BGR order) pixels (48 bytes) per
// iteration.  Identical to ARGBToRGB24Row_SSSE3 except the shuffle table
// also swaps R and B.  Unaligned loads/stores throughout.
__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRAW

    align      4
  convertloop:
    movdqu     xmm0, [eax]      // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1       // 4 bytes from 1 for 0
    psrldq     xmm1, 4          // 8 bytes from 1
    pslldq     xmm4, 12         // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2       // 8 bytes from 2 for 1
    por        xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq     xmm5, 8          // 8 bytes from 2 for 1
    movdqu     [edx], xmm0      // store 0
    por        xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq     xmm2, 8          // 4 bytes from 2
    pslldq     xmm3, 4          // 12 bytes from 3 for 2
    por        xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1 // store 1
    movdqu     [edx + 32], xmm2 // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
| 511 |
// Pack 4 ARGB pixels (16 bytes) to 4 RGB565 pixels (8 bytes) per iteration.
// Loads with movdqa, so src_argb must be 16-byte aligned.  pix is expected
// to be a positive multiple of 4.
__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm3, xmm3       // generate mask 0x0000001f
    psrld      xmm3, 27
    pcmpeqb    xmm4, xmm4       // generate mask 0x000007e0
    psrld      xmm4, 26
    pslld      xmm4, 5
    pcmpeqb    xmm5, xmm5       // generate mask 0xfffff800
    pslld      xmm5, 11

    align      4
  convertloop:
    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
    movdqa     xmm1, xmm0       // B
    movdqa     xmm2, xmm0       // G
    pslld      xmm0, 8          // R
    psrld      xmm1, 3          // B
    psrld      xmm2, 5          // G
    psrad      xmm0, 16         // R
    pand       xmm1, xmm3       // B
    pand       xmm2, xmm4       // G
    pand       xmm0, xmm5       // R
    por        xmm1, xmm2       // BG
    por        xmm0, xmm1       // BGR
    packssdw   xmm0, xmm0       // 4 dwords -> 4 words of RGB565
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
| 549 |
// TODO(fbarchard): Improve sign extension/packing.
// Pack 4 ARGB pixels (16 bytes) to 4 ARGB1555 pixels (8 bytes) per
// iteration.  Loads with movdqa, so src_argb must be 16-byte aligned.
// pix is expected to be a positive multiple of 4.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0x0000001f
    psrld      xmm4, 27
    movdqa     xmm5, xmm4       // generate mask 0x000003e0
    pslld      xmm5, 5
    movdqa     xmm6, xmm4       // generate mask 0x00007c00
    pslld      xmm6, 10
    pcmpeqb    xmm7, xmm7       // generate mask 0xffff8000
    pslld      xmm7, 15

    align      4
  convertloop:
    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
    movdqa     xmm1, xmm0       // B
    movdqa     xmm2, xmm0       // G
    movdqa     xmm3, xmm0       // R
    psrad      xmm0, 16         // A (arithmetic shift keeps alpha sign bit)
    psrld      xmm1, 3          // B
    psrld      xmm2, 6          // G
    psrld      xmm3, 9          // R
    pand       xmm0, xmm7       // A
    pand       xmm1, xmm4       // B
    pand       xmm2, xmm5       // G
    pand       xmm3, xmm6       // R
    por        xmm0, xmm1       // BA
    por        xmm2, xmm3       // GR
    por        xmm0, xmm2       // BGRA
    packssdw   xmm0, xmm0       // 4 dwords -> 4 words of ARGB1555
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
| 592 |
| 593 __declspec(naked) __declspec(align(16)) |
| 594 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { |
| 595 __asm { |
| 596 mov eax, [esp + 4] // src_argb |
| 597 mov edx, [esp + 8] // dst_rgb |
| 598 mov ecx, [esp + 12] // pix |
| 599 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 |
| 600 psllw xmm4, 12 |
| 601 movdqa xmm3, xmm4 // generate mask 0x00f000f0 |
| 602 psrlw xmm3, 8 |
| 603 |
| 604 align 4 |
| 605 convertloop: |
| 606 movdqa xmm0, [eax] // fetch 4 pixels of argb |
| 607 movdqa xmm1, xmm0 |
| 608 pand xmm0, xmm3 // low nibble |
| 609 pand xmm1, xmm4 // high nibble |
| 610 psrl xmm0, 4 |
| 611 psrl xmm1, 8 |
| 612 por xmm0, xmm1 |
| 613 packuswb xmm0, xmm0 |
| 614 lea eax, [eax + 16] |
| 615 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 |
| 616 lea edx, [edx + 8] |
| 617 sub ecx, 4 |
| 618 jg convertloop |
| 619 ret |
| 620 } |
| 621 } |
| 622 |
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Weights in kARGBToY are scaled by 128; psrlw 7 normalizes and kAddY16
// adds the +16 bias.  movdqa load/store require src and dst to be 16-byte
// aligned; pix is expected to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4       // weighted sums of adjacent byte pairs
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1       // complete the per-pixel dot product
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2       // 16 words -> 16 Y bytes
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 657 |
// Convert 16 ARGB pixels (64 bytes) to 16 Y values (JPeg full range).
// Unlike ARGBToYRow_SSSE3 there is no +16 bias; kAddYJ64 adds 0.5 in the
// *128 fixed-point domain before the shift for rounding.  movdqa load/store
// require src and dst to be 16-byte aligned; pix a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5       // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 693 |
#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// vphaddw and vpackuswb operate per 128-bit lane, so kPermdARGBToY_AVX
// re-orders the lanes afterwards.  Unaligned loads/stores (vmovdqu);
// pix is expected to be a positive multiple of 32.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov         eax, [esp + 4]   /* src_argb */
    mov         edx, [esp + 8]   /* dst_y */
    mov         ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa     ymm6, kPermdARGBToY_AVX

    align       4
  convertloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + 64]
    vmovdqu     ymm3, [eax + 96]
    vpmaddubsw  ymm0, ymm0, ymm4
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    lea         eax, [eax + 128]
    vphaddw     ymm0, ymm0, ymm1  // mutates.
    vphaddw     ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 7
    vpsrlw      ymm2, ymm2, 7
    vpackuswb   ymm0, ymm0, ymm2  // mutates.
    vpermd      ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb      ymm0, ymm0, ymm5
    sub         ecx, 32
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    jg          convertloop
    vzeroupper                    // avoid AVX->SSE transition penalty
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2
| 733 |
// NOTE(review): this guard reuses HAS_ARGBTOYROW_AVX2 even though it wraps
// the YJ variant, and the original #endif comment said HAS_ARGBTOYJROW_AVX2.
// Upstream libyuv guards ARGBToYJRow_AVX2 with HAS_ARGBTOYJROW_AVX2 --
// verify against row.h before changing the macro itself.
#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values (JPeg full range).
// Same lane-permute scheme as ARGBToYRow_AVX2; adds the 64 rounding bias
// before the shift instead of the +16 bias after it.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov         eax, [esp + 4]   /* src_argb */
    mov         edx, [esp + 8]   /* dst_y */
    mov         ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa     ymm6, kPermdARGBToY_AVX

    align       4
  convertloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + 64]
    vmovdqu     ymm3, [eax + 96]
    vpmaddubsw  ymm0, ymm0, ymm4
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    lea         eax, [eax + 128]
    vphaddw     ymm0, ymm0, ymm1  // mutates.
    vphaddw     ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw      ymm2, ymm2, ymm5
    vpsrlw      ymm0, ymm0, 7
    vpsrlw      ymm2, ymm2, 7
    vpackuswb   ymm0, ymm0, ymm2  // mutates.
    vpermd      ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    sub         ecx, 32
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    jg          convertloop

    vzeroupper                    // avoid AVX->SSE transition penalty
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2 (see NOTE above re HAS_ARGBTOYJROW_AVX2)
| 775 |
// Same as ARGBToYRow_SSSE3 but with movdqu loads/stores, so neither
// src_argb nor dst_y needs alignment.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 809 |
// Same as ARGBToYJRow_SSSE3 but with movdqu loads/stores, so neither
// src_argb nor dst_y needs alignment.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5       // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 844 |
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.  Same structure as
// ARGBToYRow_SSSE3 with the BGRA weight table.  movdqa load/store require
// src and dst to be 16-byte aligned; pix a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 878 |
// Same as BGRAToYRow_SSSE3 but with movdqu loads/stores, so neither pointer
// needs alignment.
__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 912 |
// Convert 16 ABGR pixels (64 bytes) to 16 Y values.  Same structure as
// ARGBToYRow_SSSE3 with the ABGR weight table.  movdqa load/store require
// src and dst to be 16-byte aligned; pix a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 946 |
// Same as ABGRToYRow_SSSE3 but with movdqu loads/stores, so neither pointer
// needs alignment.
__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 980 |
// Convert 16 RGBA pixels (64 bytes) to 16 Y values.  Same structure as
// ARGBToYRow_SSSE3 with the RGBA weight table.  movdqa load/store require
// src and dst to be 16-byte aligned; pix a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 1014 |
// Same as RGBAToYRow_SSSE3 but with movdqu loads/stores, so neither pointer
// needs alignment.
__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5       // + 16
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 1048 |
// Convert a 16x2 block of ARGB pixels into 8 U and 8 V bytes per iteration
// (4:2:0 chroma subsampling): averages vertically with the next row
// (src_stride_argb), then averages horizontal pixel pairs, weights the
// result with kARGBToU / kARGBToV, and re-biases the signed chroma to
// unsigned by adding 128 (kAddUV128). edi holds dst_v - dst_u so a single
// pointer (edx) walks both output planes.
// NOTE(review): movdqa requires both input rows 16-byte aligned; width is
// assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]       // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1114 |
// JPeg (full-range) variant of ARGBToUVRow_SSSE3: 16x2 ARGB pixels -> 8 U
// and 8 V per iteration, using the full-range kARGBToUJ / kARGBToVJ
// weights. Unlike the video-range path, the bias (kAddUVJ128) is added as
// words BEFORE the arithmetic shift, so rounding and the 128 offset happen
// in one step and no paddb is needed after packing.
// NOTE(review): movdqa requires both rows 16-byte aligned; width is assumed
// to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]       // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1181 |
#ifdef HAS_ARGBTOUVROW_AVX2
// AVX2 version: converts a 32x2 block of ARGB pixels into 16 U and 16 V
// bytes per iteration. Same subsample/weight/bias scheme as the SSSE3
// version, but several 256-bit ops (vshufps, vphaddw, vpacksswb) operate
// per 128-bit lane ("mutate" the element order); vpermq and the
// kShufARGBToUV_AVX vpshufb restore the correct ordering before the store.
// NOTE(review): width is assumed to be a positive multiple of 32 (no tail).
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]      // average with the row below
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax,  [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub        ecx, 32
    vextractf128 [edx], ymm0, 0 // U
    vextractf128 [edx + edi], ymm0, 1 // V
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2
| 1248 |
// Unaligned variant of ARGBToUVRow_SSSE3: identical arithmetic, but all
// row loads use movdqu (pavgb cannot take an unaligned memory operand, so
// the second row is staged through xmm4). 16x2 ARGB pixels -> 8 U + 8 V
// per iteration with 4:2:0 subsampling and +128 re-bias (kAddUV128).
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]       // stage row below through xmm4
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1318 |
// Unaligned variant of ARGBToUVJRow_SSSE3 (JPeg full-range chroma):
// movdqu loads with the second row staged through xmm4, full-range
// kARGBToUJ / kARGBToVJ weights, and kAddUVJ128 added as words before the
// shift so rounding and the 128 offset happen together (no paddb after
// packing). 16x2 ARGB pixels -> 8 U + 8 V per iteration.
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]       // stage row below through xmm4
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1389 |
// 4:4:4 chroma: one U and one V per pixel, no subsampling. Processes 16
// ARGB pixels per iteration, reading the same 64 source bytes twice -
// once weighted with kARGBToU, once with kARGBToV - and re-biases the
// signed results by 128 (kAddUV128). edi holds dst_v - dst_u.
// NOTE(review): movdqa requires 16-byte aligned src/dst; width is assumed
// to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* convert to U and V */
    movdqa     xmm0, [eax]          // U
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5           // -> unsigned
    sub        ecx, 16
    movdqa     [edx], xmm0

    movdqa     xmm0, [eax]          // V: re-read same 16 pixels
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5           // -> unsigned
    lea        eax,  [eax + 64]
    movdqa     [edx + edi], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}
| 1447 |
// Unaligned variant of ARGBToUV444Row_SSSE3: identical arithmetic with
// movdqu loads/stores, so no 16-byte alignment is required. One U and one
// V per pixel (4:4:4), 16 ARGB pixels per iteration, source read twice.
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* convert to U and V */
    movdqu     xmm0, [eax]          // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5           // -> unsigned
    sub        ecx, 16
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]          // V: re-read same 16 pixels
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5           // -> unsigned
    lea        eax,  [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}
| 1505 |
// 4:2:2 chroma: horizontal-only 2:1 subsample of a single row (no second
// row, no vertical averaging - compare ARGBToUVRow_SSSE3). Processes 16
// ARGB pixels into 8 U + 8 V per iteration: averages horizontal pixel
// pairs via the shufps even/odd split, weights with kARGBToU / kARGBToV,
// and re-biases by 128 (kAddUV128). edi holds dst_v - dst_u.
// NOTE(review): movdqa requires 16-byte aligned src; width is assumed to
// be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}
| 1564 |
// Unaligned variant of ARGBToUV422Row_SSSE3: identical arithmetic with
// movdqu loads, so src_argb0 needs no 16-byte alignment. Horizontal-only
// 2:1 subsample of one row: 16 ARGB pixels -> 8 U + 8 V per iteration.
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}
| 1623 |
// BGRA-order variant of ARGBToUVRow_SSSE3: identical 4:2:0 subsample and
// bias scheme, but weights channels with kBGRAToU / kBGRAToV to match the
// BGRA byte layout. 16x2 pixels -> 8 U + 8 V per iteration.
// NOTE(review): movdqa requires both rows 16-byte aligned; width is
// assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]       // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1689 |
// Unaligned BGRA-order variant: movdqu loads with the second row staged
// through xmm4, kBGRAToU / kBGRAToV weights, 4:2:0 subsample, +128 bias.
// 16x2 pixels -> 8 U + 8 V per iteration.
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]       // stage row below through xmm4
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1759 |
// ABGR-order variant of ARGBToUVRow_SSSE3: identical 4:2:0 subsample and
// bias scheme, but weights channels with kABGRToU / kABGRToV to match the
// ABGR byte layout. 16x2 pixels -> 8 U + 8 V per iteration.
// NOTE(review): movdqa requires both rows 16-byte aligned; width is
// assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]       // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1825 |
// Unaligned ABGR-order variant: movdqu loads with the second row staged
// through xmm4, kABGRToU / kABGRToV weights, 4:2:0 subsample, +128 bias.
// 16x2 pixels -> 8 U + 8 V per iteration.
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]       // stage row below through xmm4
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1895 |
// RGBA-order variant of ARGBToUVRow_SSSE3: identical 4:2:0 subsample and
// bias scheme, but weights channels with kRGBAToU / kRGBAToV to match the
// RGBA byte layout. 16x2 pixels -> 8 U + 8 V per iteration.
// NOTE(review): movdqa requires both rows 16-byte aligned; width is
// assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]       // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 1961 |
// Unaligned RGBA-order variant: movdqu loads with the second row staged
// through xmm4, kRGBAToU / kRGBAToV weights, 4:2:0 subsample, +128 bias.
// 16x2 pixels -> 8 U + 8 V per iteration.
// NOTE(review): width is assumed to be a positive multiple of 16.
__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]       // stage row below through xmm4
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88        // gather even pixels
    shufps     xmm4, xmm1, 0xdd        // gather odd pixels
    pavgb      xmm0, xmm4              // average horizontal pairs
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5            // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 2031 #endif // HAS_ARGBTOYROW_SSSE3 |
| 2032 |
// YUV-to-RGB conversion coefficients in 6-bit (x64) fixed point.
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

#define UB 127 /* 2.018 * 64 = 129; clamped to int8 max 127 */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias: U/V zero level (128) folded into each channel. Parenthesized so
// the macros expand safely inside larger expressions.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)
| 2047 |
| 2048 #ifdef HAS_I422TOARGBROW_AVX2 |
| 2049 |
// Interleaved (U,V) coefficient tables for the AVX2 YUV-to-RGB path.
// Each 16-byte pattern is written twice so the full 256-bit constant can
// be loaded directly into a ymm register.
static const lvec8 kUVToB_AVX = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const lvec8 kUVToR_AVX = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const lvec8 kUVToG_AVX = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
// Per-word luma gain and the 16-level video-range luma offset.
static const lvec16 kYToRgb_AVX = {
  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
};
static const lvec16 kYSub16_AVX = {
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
};
// Per-channel bias words (U/V zero level folded in - see BB/BG/BR).
static const lvec16 kUVBiasB_AVX = {
  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
};
static const lvec16 kUVBiasG_AVX = {
  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
};
static const lvec16 kUVBiasR_AVX = {
  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
};
| 2077 |
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// cdecl; arguments read relative to esp after the two pushes below.
// Register roles: eax=Y row, esi=U row, edi=(V - U) delta so V is read as
// [esi + edi], edx=dst_argb, ecx=width.  ymm4=zero, ymm5=all ones (alpha).
// Loop consumes 16 pixels per iteration; assumes width is a multiple of
// 16 -- TODO confirm callers guarantee this.
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // argb
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
    vpxor ymm4, ymm4, ymm4

    align 4
 convertloop:
    // Step 1: 8 U and 8 V bytes -> 16 interleaved, duplicated UV pairs.
    vmovq xmm0, qword ptr [esi] // U
    vmovq xmm1, qword ptr [esi + edi] // V
    lea esi, [esi + 8]
    vpunpcklbw ymm0, ymm0, ymm1 // UV
    vpermq ymm0, ymm0, 0xd8
    vpunpcklwd ymm0, ymm0, ymm0 // UVUV
    vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
    vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
    vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
    vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
    vpsubw ymm1, ymm1, kUVBiasG_AVX
    vpsubw ymm0, ymm0, kUVBiasR_AVX

    // Step 2: Find Y contribution to 16 R,G,B values
    vmovdqu xmm3, [eax] // NOLINT
    lea eax, [eax + 16]
    vpermq ymm3, ymm3, 0xd8
    vpunpcklbw ymm3, ymm3, ymm4
    vpsubsw ymm3, ymm3, kYSub16_AVX
    vpmullw ymm3, ymm3, kYToRgb_AVX
    vpaddsw ymm2, ymm2, ymm3 // B += Y
    vpaddsw ymm1, ymm1, ymm3 // G += Y
    vpaddsw ymm0, ymm0, ymm3 // R += Y
    vpsraw ymm2, ymm2, 6 // undo the 2.6 fixed-point scale
    vpsraw ymm1, ymm1, 6
    vpsraw ymm0, ymm0, 6
    vpackuswb ymm2, ymm2, ymm2 // B
    vpackuswb ymm1, ymm1, ymm1 // G
    vpackuswb ymm0, ymm0, ymm0 // R

    // Step 3: Weave into ARGB
    vpunpcklbw ymm2, ymm2, ymm1 // BG
    vpermq ymm2, ymm2, 0xd8
    vpunpcklbw ymm0, ymm0, ymm5 // RA
    vpermq ymm0, ymm0, 0xd8
    vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
    vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper // avoid AVX->SSE transition penalty for callers

    pop edi
    pop esi
    ret
  }
}
| 2149 #endif // HAS_I422TOARGBROW_AVX2 |
| 2150 |
| 2151 #ifdef HAS_I422TOARGBROW_SSSE3 |
| 2152 |
// SSSE3 (128-bit) UV coefficient tables for pmaddubsw: interleaved (U, V)
// signed byte pairs.  kVU* variants are (V, U) order for NV21 input.
static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};

// Y scale, video-range Y floor, and chroma un-bias words (see BB/BG/BR).
static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
| 2182 |
| 2183 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
| 2184 |
// Read 8 UV from 444.
// Loads 8 U bytes from [esi] and 8 V bytes from [esi + edi] and interleaves
// them into xmm0 as 8 (U,V) byte pairs; advances esi by 8.  Clobbers xmm1.
#define READYUV444 __asm { \
    __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
  }
| 2192 |
// Read 4 UV from 422, upsample to 8 UV.
// Loads 4 U and 4 V bytes, interleaves to (U,V) pairs, then doubles each
// pair horizontally so xmm0 holds 8 UV pairs; advances esi by 4.
// Clobbers xmm1.
#define READYUV422 __asm { \
    __asm movd xmm0, [esi] /* U */ \
    __asm movd xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  }
| 2201 |
// Read 2 UV from 411, upsample to 8 UV.
// Loads 2 U and 2 V bytes via ebx (word loads), interleaves, then doubles
// twice so each UV pair covers 4 pixels; advances esi by 2.
// Clobbers ebx and xmm1 -- callers must preserve ebx.
#define READYUV411 __asm { \
    __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
    __asm movd xmm0, ebx \
    __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
    __asm movd xmm1, ebx \
    __asm lea esi, [esi + 2] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
  }
| 2213 |
// Read 4 UV from NV12, upsample to 8 UV.
// UV is already interleaved in memory; load 8 bytes (4 pairs) and double
// each pair; advances esi by 8.  Same layout works for NV21 when the
// caller pairs it with YVUTORGB instead of YUVTORGB.
#define READNV12 __asm { \
    __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
  }
| 2220 |
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved (U,V) byte pairs, eax -> 8 Y bytes (advanced by
//      8), xmm4 must be zero (callers clear it with pxor).
// Out: low 8 bytes of xmm0/xmm1/xmm2 = unsigned B/G/R respectively.
// Clobbers xmm3.  Arithmetic is 2.6 fixed point (coefficients are *64,
// see YG/UB/... above), hence the final psraw by 6.
#define YUVTORGB __asm { \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
    __asm movdqa xmm1, xmm0 \
    __asm movdqa xmm2, xmm0 \
    __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
    __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
    __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
    __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
    __asm psubw xmm1, kUVBiasG \
    __asm psubw xmm2, kUVBiasR \
    /* Step 2: Find Y contribution to 8 R,G,B values */ \
    __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
    __asm lea eax, [eax + 8] \
    __asm punpcklbw xmm3, xmm4 \
    __asm psubsw xmm3, kYSub16 \
    __asm pmullw xmm3, kYToRgb \
    __asm paddsw xmm0, xmm3 /* B += Y */ \
    __asm paddsw xmm1, xmm3 /* G += Y */ \
    __asm paddsw xmm2, xmm3 /* R += Y */ \
    __asm psraw xmm0, 6 \
    __asm psraw xmm1, 6 \
    __asm psraw xmm2, 6 \
    __asm packuswb xmm0, xmm0 /* B */ \
    __asm packuswb xmm1, xmm1 /* G */ \
    __asm packuswb xmm2, xmm2 /* R */ \
  }
| 2248 |
// Convert 8 pixels: 8 VU and 8 Y.
// Identical to YUVTORGB except it uses the kVUTo* tables, for input where
// chroma bytes are interleaved (V,U) -- i.e. NV21.  Same register contract:
// in xmm0 (VU pairs) + [eax] (Y, advanced by 8), xmm4 zero; out B/G/R in
// xmm0/xmm1/xmm2; clobbers xmm3.
#define YVUTORGB __asm { \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
    __asm movdqa xmm1, xmm0 \
    __asm movdqa xmm2, xmm0 \
    __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
    __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
    __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
    __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
    __asm psubw xmm1, kUVBiasG \
    __asm psubw xmm2, kUVBiasR \
    /* Step 2: Find Y contribution to 8 R,G,B values */ \
    __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
    __asm lea eax, [eax + 8] \
    __asm punpcklbw xmm3, xmm4 \
    __asm psubsw xmm3, kYSub16 \
    __asm pmullw xmm3, kYToRgb \
    __asm paddsw xmm0, xmm3 /* B += Y */ \
    __asm paddsw xmm1, xmm3 /* G += Y */ \
    __asm paddsw xmm2, xmm3 /* R += Y */ \
    __asm psraw xmm0, 6 \
    __asm psraw xmm1, 6 \
    __asm psraw xmm2, 6 \
    __asm packuswb xmm0, xmm0 /* B */ \
    __asm packuswb xmm1, xmm1 /* G */ \
    __asm packuswb xmm2, xmm2 /* R */ \
  }
| 2276 |
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// cdecl; eax=Y, esi=U, edi=(V - U) delta, edx=dst, ecx=width.
// dst_argb must be 16-byte aligned (movdqa stores); width assumed to be a
// multiple of 8 -- TODO confirm callers guarantee this.
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // argb
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2319 |
// 8 pixels per iteration; dest need not be 16-byte aligned (movq/movdqu
// stores, 24 bytes written per 8 pixels).
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 pixels.
// Uses kShuffleMaskARGBToRGB24_0 / kShuffleMaskARGBToRGB24 (defined
// elsewhere in this file) to drop the alpha lane when packing.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgb24,
                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // rgb24
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB
    movdqa xmm5, kShuffleMaskARGBToRGB24_0
    movdqa xmm6, kShuffleMaskARGBToRGB24

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm2 // RR (R duplicated into the alpha slot)
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRR first 4 pixels
    punpckhwd xmm1, xmm2 // BGRR next 4 pixels
    pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
    pshufb xmm1, xmm6 // Pack into first 12 bytes.
    palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
    movq qword ptr [edx], xmm0 // First 8 bytes
    movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2366 |
// 8 pixels per iteration; dest need not be 16-byte aligned (movq/movdqu
// stores, 24 bytes written per 8 pixels).
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (BGR-order
// RGB24) pixels.  Same structure as I422ToRGB24Row_SSSE3 but uses the
// kShuffleMaskARGBToRAW* masks instead.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_raw,
                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // raw
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB
    movdqa xmm5, kShuffleMaskARGBToRAW_0
    movdqa xmm6, kShuffleMaskARGBToRAW

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm2 // RR (R duplicated into the alpha slot)
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRR first 4 pixels
    punpckhwd xmm1, xmm2 // BGRR next 4 pixels
    pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
    pshufb xmm1, xmm6 // Pack into first 12 bytes.
    palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
    movq qword ptr [edx], xmm0 // First 8 bytes
    movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
    lea edx, [edx + 24]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2413 |
// 8 pixels, dest unaligned (movdqu store, 16 bytes per 8 pixels).
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 pixels.
// xmm5/xmm6/xmm7 hold the dword field masks for the 5/6/5 bit channels.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* rgb565_buf,
                           int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // rgb565
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB
    pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
    psrld xmm5, 27
    pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
    psrld xmm6, 26
    pslld xmm6, 5
    pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
    pslld xmm7, 11

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm2 // RR
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRR first 4 pixels
    punpckhwd xmm1, xmm2 // BGRR next 4 pixels

    // Step 3b: RRGB -> RGB565; each dword is shifted so the surviving
    // channel bits land in the 5/6/5 field positions, then masked and ORed.
    movdqa xmm3, xmm0 // B first 4 pixels of argb
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm3, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm3, xmm5 // B
    pand xmm2, xmm6 // G
    pand xmm0, xmm7 // R
    por xmm3, xmm2 // BG
    por xmm0, xmm3 // BGR
    movdqa xmm3, xmm1 // B next 4 pixels of argb
    movdqa xmm2, xmm1 // G
    pslld xmm1, 8 // R
    psrld xmm3, 3 // B
    psrld xmm2, 5 // G
    psrad xmm1, 16 // R
    pand xmm3, xmm5 // B
    pand xmm2, xmm6 // G
    pand xmm1, xmm7 // R
    por xmm3, xmm2 // BG
    por xmm1, xmm3 // BGR
    packssdw xmm0, xmm1 // narrow 8 dwords to 8 RGB565 words
    sub ecx, 8
    movdqu [edx], xmm0 // store 8 pixels of RGB565
    lea edx, [edx + 16]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2486 |
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// cdecl; eax=Y, esi=U, edi=(V - U) delta, edx=dst, ecx=width.
// dst_argb must be 16-byte aligned (movdqa stores); width assumed to be a
// multiple of 8 -- TODO confirm callers guarantee this.
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // argb
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2529 |
// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
// ebx is pushed because READYUV411 clobbers it.
// dst_argb must be 16-byte aligned (movdqa stores); width assumed to be a
// multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4] // Y
    mov esi, [esp + 12 + 8] // U
    mov edi, [esp + 12 + 12] // V
    mov edx, [esp + 12 + 16] // argb
    mov ecx, [esp + 12 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV411 // modifies EBX
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}
| 2575 |
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV12: chroma plane is pre-interleaved (U,V), so only one chroma pointer.
// dst_argb must be 16-byte aligned (movdqa stores); width assumed to be a
// multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // Y
    mov esi, [esp + 4 + 8] // UV
    mov edx, [esp + 4 + 12] // argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}
| 2613 |
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV21: chroma plane is interleaved (V,U); identical to NV12ToARGBRow_SSSE3
// except it uses YVUTORGB (kVUTo* tables) for the swapped byte order.
// dst_argb must be 16-byte aligned (movdqa stores); width assumed to be a
// multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // Y
    mov esi, [esp + 4 + 8] // VU
    mov edx, [esp + 4 + 12] // argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YVUTORGB

    align 4
 convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}
| 2651 |
// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// Identical to I444ToARGBRow_SSSE3 except the stores are movdqu, so
// dst_argb need not be 16-byte aligned.  Width assumed to be a multiple
// of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // argb
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2694 |
// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Identical to I422ToARGBRow_SSSE3 except the stores are movdqu, so
// dst_argb need not be 16-byte aligned.  Width assumed to be a multiple
// of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // argb
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2737 |
// 8 pixels, unaligned.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
// Identical to I411ToARGBRow_SSSE3 except the stores are movdqu, so
// dst_argb need not be 16-byte aligned.  ebx is pushed because READYUV411
// clobbers it.  Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4] // Y
    mov esi, [esp + 12 + 8] // U
    mov edi, [esp + 12 + 12] // V
    mov edx, [esp + 12 + 16] // argb
    mov ecx, [esp + 12 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV411 // modifies EBX
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}
| 2783 |
// 8 pixels, dest unaligned (movdqu stores -- the old "dest aligned 16"
// comment did not match the code).
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Identical to NV12ToARGBRow_SSSE3 except for the unaligned stores.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // Y
    mov esi, [esp + 4 + 8] // UV
    mov edx, [esp + 4 + 12] // argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}
| 2821 |
// 8 pixels, dest unaligned (movdqu stores -- the old "dest aligned 16"
// comment did not match the code).
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// NV21 (V,U interleaved chroma): uses YVUTORGB.  Identical to
// NV21ToARGBRow_SSSE3 except for the unaligned stores.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // Y
    mov esi, [esp + 4 + 8] // VU
    mov edx, [esp + 4 + 12] // argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YVUTORGB

    align 4
 convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm0, xmm1 // BG
    punpcklbw xmm2, xmm5 // RA
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    ret
  }
}
| 2859 |
// 8 pixels, dest aligned 16 (movdqa stores).
// Same conversion as I422ToARGBRow_SSSE3 but weaves the channels in BGRA
// byte order.  xmm5 (alpha) is regenerated inside the loop because the
// weave below overwrites it.  Width assumed to be a multiple of
// 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_bgra,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // bgra
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0 // GB
    punpcklbw xmm5, xmm2 // AR
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1 // BGRA first 4 pixels
    punpckhwd xmm0, xmm1 // BGRA next 4 pixels
    movdqa [edx], xmm5
    movdqa [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2900 |
// 8 pixels, dest unaligned (movdqu stores).
// Identical to I422ToBGRARow_SSSE3 except for the unaligned stores.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_bgra,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // bgra
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm0 // GB
    punpcklbw xmm5, xmm2 // AR
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1 // BGRA first 4 pixels
    punpckhwd xmm0, xmm1 // BGRA next 4 pixels
    movdqu [edx], xmm5
    movdqu [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2941 |
// 8 pixels, dest aligned 16 (movdqa stores).
// Same conversion as I422ToARGBRow_SSSE3 but weaves the channels in ABGR
// byte order (R,G,B,A in memory).  Width assumed to be a multiple of
// 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_abgr,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // abgr
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm2, xmm1 // RG
    punpcklbw xmm0, xmm5 // BA
    movdqa xmm1, xmm2
    punpcklwd xmm2, xmm0 // RGBA first 4 pixels
    punpckhwd xmm1, xmm0 // RGBA next 4 pixels
    movdqa [edx], xmm2
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 2982 |
// 8 pixels, dest unaligned (movdqu stores).
// Identical to I422ToABGRRow_SSSE3 except for the unaligned stores.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_abgr,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // abgr
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw xmm2, xmm1 // RG
    punpcklbw xmm0, xmm5 // BA
    movdqa xmm1, xmm2
    punpcklwd xmm2, xmm0 // RGBA first 4 pixels
    punpckhwd xmm1, xmm0 // RGBA next 4 pixels
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 3023 |
// 8 pixels, dest aligned 16 (movdqa stores).
// Same conversion as I422ToARGBRow_SSSE3 but weaves the channels in RGBA
// byte order (A,B,G,R in memory).  xmm5 (alpha) is regenerated inside the
// loop because the weave below overwrites it.  Width assumed to be a
// multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_rgba,
                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // rgba
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm2 // GR
    punpcklbw xmm5, xmm0 // AB
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1 // RGBA first 4 pixels
    punpckhwd xmm0, xmm1 // RGBA next 4 pixels
    movdqa [edx], xmm5
    movdqa [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 3064 |
// 8 pixels, dest unaligned (movdqu stores).
// Identical to I422ToRGBARow_SSSE3 except for the unaligned stores.
// Width assumed to be a multiple of 8 -- TODO confirm.
__declspec(naked) __declspec(align(16))
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_rgba,
                                   int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // Y
    mov esi, [esp + 8 + 8] // U
    mov edi, [esp + 8 + 12] // V
    mov edx, [esp + 8 + 16] // rgba
    mov ecx, [esp + 8 + 20] // width
    sub edi, esi
    pxor xmm4, xmm4 // zero, required by YUVTORGB

    align 4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
    punpcklbw xmm1, xmm2 // GR
    punpcklbw xmm5, xmm0 // AB
    movdqa xmm0, xmm5
    punpcklwd xmm5, xmm1 // RGBA first 4 pixels
    punpckhwd xmm0, xmm1 // RGBA next 4 pixels
    movdqu [edx], xmm5
    movdqu [edx + 16], xmm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
| 3105 |
| 3106 #endif // HAS_I422TOARGBROW_SSSE3 |
| 3107 |
| 3108 #ifdef HAS_YTOARGBROW_SSE2 |
__declspec(naked) __declspec(align(16))
// Expands 8 luma (Y) values per iteration into grey ARGB pixels:
// G = (Y - 16) * 1.164 (fixed point: (Y - 16) * 74 >> 6), replicated into
// B, G and R, with alpha forced to 0xff.  Stores use movdqa, so rgb_buf
// must be 16-byte aligned; width is assumed to be a multiple of 8.
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  __asm {
    pxor       xmm5, xmm5
    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
    pslld      xmm4, 24
    mov        eax, 0x00100010      // 16, the Y offset, in each 16-bit lane
    movd       xmm3, eax
    pshufd     xmm3, xmm3, 0
    mov        eax, 0x004a004a      // 74
    movd       xmm2, eax
    pshufd     xmm2, xmm2,0
    mov        eax, [esp + 4]       // Y
    mov        edx, [esp + 8]       // rgb
    mov        ecx, [esp + 12]      // width

    align      4
 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm5           // 0.Y
    psubusw    xmm0, xmm3           // saturating, so Y < 16 clamps to 0
    pmullw     xmm0, xmm2
    psrlw      xmm0, 6
    packuswb   xmm0, xmm0           // G

    // Step 2: Weave into ARGB
    punpcklbw  xmm0, xmm0           // GG
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
    por        xmm0, xmm4           // set alpha bytes to 0xff
    por        xmm1, xmm4
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
| 3154 #endif // HAS_YTOARGBROW_SSE2 |
| 3155 |
| 3156 #ifdef HAS_MIRRORROW_SSSE3 |
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
// Copies 'width' bytes from src to dst in reverse order, 16 bytes per
// iteration.  Reads backwards from the end of src ([src - 16 + width]
// downwards) while writing dst forwards.  Uses movdqa, so both buffers must
// be 16-byte aligned; width is assumed to be a multiple of 16.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm5, kShuffleMirror
    lea       eax, [eax - 16]  // so [eax + ecx] is the last 16 bytes

    align      4
 convertloop:
    movdqa    xmm0, [eax + ecx]
    pshufb    xmm0, xmm5       // reverse the 16 bytes
    sub       ecx, 16
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    jg        convertloop
    ret
  }
}
| 3182 #endif // HAS_MIRRORROW_SSSE3 |
| 3183 |
| 3184 #ifdef HAS_MIRRORROW_AVX2 |
// Shuffle table for reversing the bytes within each 128-bit lane.
static const ulvec8 kShuffleMirror_AVX2 = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
// AVX2 byte mirror: 32 bytes per iteration.  vpshufb reverses bytes within
// each 128-bit lane, then vpermq swaps the two lanes to complete the
// reversal.  Unaligned loads/stores; width assumed a multiple of 32.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    vmovdqa   ymm5, kShuffleMirror_AVX2
    lea       eax, [eax - 32]  // so [eax + ecx] is the last 32 bytes

    align      4
 convertloop:
    vmovdqu   ymm0, [eax + ecx]
    vpshufb   ymm0, ymm0, ymm5
    vpermq    ymm0, ymm0, 0x4e  // swap high and low halfs
    sub       ecx, 32
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    jg        convertloop
    vzeroupper
    ret
  }
}
| 3213 #endif // HAS_MIRRORROW_AVX2 |
| 3214 |
| 3215 #ifdef HAS_MIRRORROW_SSE2 |
| 3216 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 |
| 3217 // version can not. |
__declspec(naked) __declspec(align(16))
// Byte mirror without SSSE3's pshufb: reverses 16 bytes per iteration by
// swapping bytes within words, then words within qwords, then qwords.
// Uses movdqu so src/dst may be unaligned; width assumed a multiple of 16.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 16]  // so [eax + ecx] is the last 16 bytes

    align      4
 convertloop:
    movdqu    xmm0, [eax + ecx]
    movdqa    xmm1, xmm0        // swap bytes
    psllw     xmm0, 8
    psrlw     xmm1, 8
    por       xmm0, xmm1
    pshuflw   xmm0, xmm0, 0x1b  // swap words
    pshufhw   xmm0, xmm0, 0x1b
    pshufd    xmm0, xmm0, 0x4e  // swap qwords
    sub       ecx, 16
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    jg        convertloop
    ret
  }
}
| 3243 #endif // HAS_MIRRORROW_SSE2 |
| 3244 |
| 3245 #ifdef HAS_MIRRORROW_UV_SSSE3 |
// Shuffle table for reversing the bytes of UV channels: gathers reversed U
// bytes into the low 8 bytes and reversed V bytes into the high 8 bytes.
static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked) __declspec(align(16))
// Mirrors an interleaved UV row while deinterleaving it: reads 8 UV pairs
// per iteration from the end of src backwards, writing reversed U to dst_u
// and reversed V to dst_v.  movdqa load requires src 16-byte aligned;
// width (in UV pairs) assumed a multiple of 8.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push      edi
    mov       eax, [esp + 4 + 4]   // src
    mov       edx, [esp + 4 + 8]   // dst_u
    mov       edi, [esp + 4 + 12]  // dst_v
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm1, kShuffleMirrorUV
    lea       eax, [eax + ecx * 2 - 16]  // last 16 bytes (8 UV pairs)
    sub       edi, edx                   // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm1
    sub       ecx, 8
    movlpd    qword ptr [edx], xmm0        // low 8 bytes = reversed U
    movhpd    qword ptr [edx + edi], xmm0  // high 8 bytes = reversed V
    lea       edx, [edx + 8]
    jg        convertloop

    pop       edi
    ret
  }
}
| 3279 #endif // HAS_MIRRORROW_UV_SSSE3 |
| 3280 |
| 3281 #ifdef HAS_ARGBMIRRORROW_SSSE3 |
// Shuffle table for reversing 4-byte pixels (dword order reversed, byte
// order within each pixel preserved).
static const uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

__declspec(naked) __declspec(align(16))
// Mirrors a row of ARGB pixels, 4 pixels (16 bytes) per iteration, reading
// from the end of src backwards.  movdqa requires both buffers 16-byte
// aligned; width (in pixels) assumed a multiple of 4.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
    movdqa    xmm5, kARGBShuffleMirror

    align      4
 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm5
    sub       ecx, 4
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    jg        convertloop
    ret
  }
}
| 3308 #endif // HAS_ARGBMIRRORROW_SSSE3 |
| 3309 |
| 3310 #ifdef HAS_ARGBMIRRORROW_AVX2 |
// Permute table (dword indices) for reversing 8 ARGB pixels across the
// whole 256-bit register.
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
// AVX2 ARGB mirror: 8 pixels (32 bytes) per iteration using a single
// cross-lane vpermd with a memory source.  Unaligned access; width assumed
// a multiple of 8.
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 32]  // so [eax + ecx * 4] is the last 8 pixels
    vmovdqa   ymm5, kARGBShuffleMirror_AVX2

    align      4
 convertloop:
    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
    sub       ecx, 8
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    jg        convertloop
    vzeroupper
    ret
  }
}
| 3336 #endif // HAS_ARGBMIRRORROW_AVX2 |
| 3337 |
| 3338 #ifdef HAS_SPLITUVROW_SSE2 |
__declspec(naked) __declspec(align(16))
// Deinterleaves a packed UV row into separate U and V planes, 16 UV pairs
// (32 bytes) per iteration.  Even bytes -> dst_u, odd bytes -> dst_v.
// movdqa requires all three buffers 16-byte aligned; pix assumed a
// multiple of 16.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm0
    movdqa     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 3374 |
__declspec(naked) __declspec(align(16))
// Same as SplitUVRow_SSE2 but with movdqu, so all buffers may be
// unaligned.  16 UV pairs per iteration; pix assumed a multiple of 16.
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [edx], xmm0
    movdqu     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 3411 #endif // HAS_SPLITUVROW_SSE2 |
| 3412 |
| 3413 #ifdef HAS_SPLITUVROW_AVX2 |
__declspec(naked) __declspec(align(16))
// AVX2 UV deinterleave: 32 UV pairs (64 bytes) per iteration.  vpackuswb
// operates per-lane, so vpermq 0xd8 restores linear byte order afterwards.
// Unaligned access; pix assumed a multiple of 32.
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm2, ymm0, 8      // odd bytes
    vpsrlw     ymm3, ymm1, 8
    vpand      ymm0, ymm0, ymm5   // even bytes
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpackuswb  ymm2, ymm2, ymm3
    vpermq     ymm0, ymm0, 0xd8   // undo per-lane pack mutation
    vpermq     ymm2, ymm2, 0xd8
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + edi], ymm2
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
| 3450 #endif // HAS_SPLITUVROW_AVX2 |
| 3451 |
| 3452 #ifdef HAS_MERGEUVROW_SSE2 |
__declspec(naked) __declspec(align(16))
// Interleaves separate U and V planes into a packed UV row, 16 pairs per
// iteration.  movdqa requires all buffers 16-byte aligned; width assumed a
// multiple of 16.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_u
    mov        edx, [esp + 4 + 8]    // src_v
    mov        edi, [esp + 4 + 12]   // dst_uv
    mov        ecx, [esp + 4 + 16]   // width
    sub        edx, eax              // src_v addressed as [src_u + edx]

    align      4
 convertloop:
    movdqa     xmm0, [eax]      // read 16 U's
    movdqa     xmm1, [eax + edx]  // and 16 V's
    lea        eax,  [eax + 16]
    movdqa     xmm2, xmm0
    punpcklbw  xmm0, xmm1       // first 8 UV pairs
    punpckhbw  xmm2, xmm1       // next 8 UV pairs
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 3482 |
__declspec(naked) __declspec(align(16))
// Same as MergeUVRow_SSE2 but with movdqu, so buffers may be unaligned.
// 16 UV pairs per iteration; width assumed a multiple of 16.
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_u
    mov        edx, [esp + 4 + 8]    // src_v
    mov        edi, [esp + 4 + 12]   // dst_uv
    mov        ecx, [esp + 4 + 16]   // width
    sub        edx, eax              // src_v addressed as [src_u + edx]

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // read 16 U's
    movdqu     xmm1, [eax + edx]  // and 16 V's
    lea        eax,  [eax + 16]
    movdqa     xmm2, xmm0
    punpcklbw  xmm0, xmm1       // first 8 UV pairs
    punpckhbw  xmm2, xmm1       // next 8 UV pairs
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 3512 #endif // HAS_MERGEUVROW_SSE2 |
| 3513 |
| 3514 #ifdef HAS_MERGEUVROW_AVX2 |
__declspec(naked) __declspec(align(16))
// AVX2 UV interleave: 32 pairs per iteration.  vpunpck works per 128-bit
// lane, so vperm2i128 reassembles the lanes into linear order before
// storing.  Unaligned access; width assumed a multiple of 32.
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_u
    mov        edx, [esp + 4 + 8]    // src_v
    mov        edi, [esp + 4 + 12]   // dst_uv
    mov        ecx, [esp + 4 + 16]   // width
    sub        edx, eax              // src_v addressed as [src_u + edx]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]           // read 32 U's
    vmovdqu    ymm1, [eax + edx]     // and 32 V's
    lea        eax,  [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
    vmovdqu    [edi], ymm1
    vmovdqu    [edi + 32], ymm2
    lea        edi, [edi + 64]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
| 3546 #endif // HAS_MERGEUVROW_AVX2 |
| 3547 |
| 3548 #ifdef HAS_COPYROW_SSE2 |
| 3549 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. |
__declspec(naked) __declspec(align(16))
// Copies 'count' bytes 32 at a time with aligned 16-byte loads/stores.
// Both buffers must be 16-byte aligned; count assumed a multiple of 32.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    ret
  }
}
| 3570 #endif // HAS_COPYROW_SSE2 |
| 3571 |
| 3572 // Unaligned Multiple of 1. |
__declspec(naked) __declspec(align(16))
// Byte copy using 'rep movsb' (fast on CPUs with Enhanced REP MOVSB).
// Handles any alignment and any count (multiple of 1).  Callee-saved
// esi/edi are preserved in eax/edx instead of on the stack.
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi          // save esi
    mov        edx, edi          // save edi
    mov        esi, [esp + 4]   // src
    mov        edi, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    rep movsb
    mov        edi, edx          // restore edi
    mov        esi, eax          // restore esi
    ret
  }
}
| 3587 |
| 3588 #ifdef HAS_COPYROW_X86 |
__declspec(naked) __declspec(align(16))
// Copies count/4 dwords with 'rep movsd'; count assumed a multiple of 4
// (the low two bits are discarded by the shr).  esi/edi preserved in
// eax/edx as in CopyRow_ERMS.
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi          // save esi
    mov        edx, edi          // save edi
    mov        esi, [esp + 4]   // src
    mov        edi, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2           // dword count
    rep movsd
    mov        edi, edx          // restore edi
    mov        esi, eax          // restore esi
    ret
  }
}
| 3604 #endif // HAS_COPYROW_X86 |
| 3605 |
| 3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
| 3607 // width in pixels |
| 3608 __declspec(naked) __declspec(align(16)) |
| 3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 3610 __asm { |
| 3611 mov eax, [esp + 4] // src |
| 3612 mov edx, [esp + 8] // dst |
| 3613 mov ecx, [esp + 12] // count |
| 3614 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| 3615 pslld xmm0, 24 |
| 3616 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| 3617 psrld xmm1, 8 |
| 3618 |
| 3619 align 4 |
| 3620 convertloop: |
| 3621 movdqa xmm2, [eax] |
| 3622 movdqa xmm3, [eax + 16] |
| 3623 lea eax, [eax + 32] |
| 3624 movdqa xmm4, [edx] |
| 3625 movdqa xmm5, [edx + 16] |
| 3626 pand xmm2, xmm0 |
| 3627 pand xmm3, xmm0 |
| 3628 pand xmm4, xmm1 |
| 3629 pand xmm5, xmm1 |
| 3630 por xmm2, xmm4 |
| 3631 por xmm3, xmm5 |
| 3632 movdqa [edx], xmm2 |
| 3633 movdqa [edx + 16], xmm3 |
| 3634 lea edx, [edx + 32] |
| 3635 sub ecx, 8 |
| 3636 jg convertloop |
| 3637 |
| 3638 ret |
| 3639 } |
| 3640 } |
| 3641 #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
| 3642 |
| 3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
| 3644 // width in pixels |
| 3645 __declspec(naked) __declspec(align(16)) |
| 3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 3647 __asm { |
| 3648 mov eax, [esp + 4] // src |
| 3649 mov edx, [esp + 8] // dst |
| 3650 mov ecx, [esp + 12] // count |
| 3651 vpcmpeqb ymm0, ymm0, ymm0 |
| 3652 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| 3653 |
| 3654 align 4 |
| 3655 convertloop: |
| 3656 vmovdqu ymm1, [eax] |
| 3657 vmovdqu ymm2, [eax + 32] |
| 3658 lea eax, [eax + 64] |
| 3659 vpblendvb ymm1, ymm1, [edx], ymm0 |
| 3660 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
| 3661 vmovdqu [edx], ymm1 |
| 3662 vmovdqu [edx + 32], ymm2 |
| 3663 lea edx, [edx + 64] |
| 3664 sub ecx, 16 |
| 3665 jg convertloop |
| 3666 |
| 3667 vzeroupper |
| 3668 ret |
| 3669 } |
| 3670 } |
| 3671 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
| 3672 |
| 3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 3674 // width in pixels |
| 3675 __declspec(naked) __declspec(align(16)) |
| 3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 3677 __asm { |
| 3678 mov eax, [esp + 4] // src |
| 3679 mov edx, [esp + 8] // dst |
| 3680 mov ecx, [esp + 12] // count |
| 3681 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
| 3682 pslld xmm0, 24 |
| 3683 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
| 3684 psrld xmm1, 8 |
| 3685 |
| 3686 align 4 |
| 3687 convertloop: |
| 3688 movq xmm2, qword ptr [eax] // 8 Y's |
| 3689 lea eax, [eax + 8] |
| 3690 punpcklbw xmm2, xmm2 |
| 3691 punpckhwd xmm3, xmm2 |
| 3692 punpcklwd xmm2, xmm2 |
| 3693 movdqa xmm4, [edx] |
| 3694 movdqa xmm5, [edx + 16] |
| 3695 pand xmm2, xmm0 |
| 3696 pand xmm3, xmm0 |
| 3697 pand xmm4, xmm1 |
| 3698 pand xmm5, xmm1 |
| 3699 por xmm2, xmm4 |
| 3700 por xmm3, xmm5 |
| 3701 movdqa [edx], xmm2 |
| 3702 movdqa [edx + 16], xmm3 |
| 3703 lea edx, [edx + 32] |
| 3704 sub ecx, 8 |
| 3705 jg convertloop |
| 3706 |
| 3707 ret |
| 3708 } |
| 3709 } |
| 3710 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
| 3711 |
| 3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3713 // width in pixels |
| 3714 __declspec(naked) __declspec(align(16)) |
| 3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 3716 __asm { |
| 3717 mov eax, [esp + 4] // src |
| 3718 mov edx, [esp + 8] // dst |
| 3719 mov ecx, [esp + 12] // count |
| 3720 vpcmpeqb ymm0, ymm0, ymm0 |
| 3721 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
| 3722 |
| 3723 align 4 |
| 3724 convertloop: |
| 3725 vpmovzxbd ymm1, qword ptr [eax] |
| 3726 vpmovzxbd ymm2, qword ptr [eax + 8] |
| 3727 lea eax, [eax + 16] |
| 3728 vpslld ymm1, ymm1, 24 |
| 3729 vpslld ymm2, ymm2, 24 |
| 3730 vpblendvb ymm1, ymm1, [edx], ymm0 |
| 3731 vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
| 3732 vmovdqu [edx], ymm1 |
| 3733 vmovdqu [edx + 32], ymm2 |
| 3734 lea edx, [edx + 64] |
| 3735 sub ecx, 16 |
| 3736 jg convertloop |
| 3737 |
| 3738 vzeroupper |
| 3739 ret |
| 3740 } |
| 3741 } |
| 3742 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 3743 |
| 3744 #ifdef HAS_SETROW_X86 |
| 3745 // SetRow8 writes 'count' bytes using a 32 bit value repeated. |
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
// Fills count/4 dwords with v32 via 'rep stosd'; count assumed a multiple
// of 4.  edi is preserved in edx (no stack spill).
void SetRow_X86(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        edx, edi          // save edi
    mov        edi, [esp + 4]   // dst
    mov        eax, [esp + 8]   // v32
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2           // dword count
    rep stosd
    mov        edi, edx          // restore edi
    ret
  }
}
| 3759 |
| 3760 // SetRow32 writes 'count' words using a 32 bit value repeated. |
// SetRow32 writes 'count' words using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
// Fills a width x height rectangle of 32-bit pixels with v32, one
// 'rep stosd' per row, advancing dst by dst_stride between rows.
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
  __asm {
    push       esi
    push       edi
    push       ebp
    mov        edi, [esp + 12 + 4]   // dst
    mov        eax, [esp + 12 + 8]   // v32
    mov        ebp, [esp + 12 + 12]  // width
    mov        edx, [esp + 12 + 16]  // dst_stride
    mov        esi, [esp + 12 + 20]  // height
    lea        ecx, [ebp * 4]
    sub        edx, ecx              // stride - width * 4

    align      4
 convertloop:
    mov        ecx, ebp              // pixels per row
    rep stosd
    add        edi, edx              // skip stride padding
    sub        esi, 1
    jg         convertloop

    pop        ebp
    pop        edi
    pop        esi
    ret
  }
}
| 3790 #endif // HAS_SETROW_X86 |
| 3791 |
| 3792 #ifdef HAS_YUY2TOYROW_AVX2 |
__declspec(naked) __declspec(align(16))
// Extracts the Y channel (even bytes) from 32 YUY2 pixels per iteration.
// Unaligned access; pix assumed a multiple of 32.
void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5   // even bytes are Y
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8   // undo per-lane pack mutation
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
| 3820 |
__declspec(naked) __declspec(align(16))
// Extracts subsampled U and V planes from two YUY2 rows (current row and
// the row at stride_yuy2), averaging vertically.  Processes 32 source
// pixels -> 16 U and 16 V bytes per iteration; pix assumed a multiple
// of 32.
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]       // average with next row
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
| 3865 |
__declspec(naked) __declspec(align(16))
// Extracts subsampled U and V from a single YUY2 row (no vertical
// averaging, i.e. 4:2:2 output).  32 pixels -> 16 U + 16 V bytes per
// iteration; pix assumed a multiple of 32.
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
| 3905 |
| 3906 __declspec(naked) __declspec(align(16)) |
| 3907 void UYVYToYRow_AVX2(const uint8* src_uyvy, |
| 3908 uint8* dst_y, int pix) { |
| 3909 __asm { |
| 3910 mov eax, [esp + 4] // src_uyvy |
| 3911 mov edx, [esp + 8] // dst_y |
| 3912 mov ecx, [esp + 12] // pix |
| 3913 |
| 3914 align 4 |
| 3915 convertloop: |
| 3916 vmovdqu ymm0, [eax] |
| 3917 vmovdqu ymm1, [eax + 32] |
| 3918 lea eax, [eax + 64] |
| 3919 vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
| 3920 vpsrlw ymm1, ymm1, 8 |
| 3921 vpackuswb ymm0, ymm0, ymm1 // mutates. |
| 3922 vpermq ymm0, ymm0, 0xd8 |
| 3923 sub ecx, 32 |
| 3924 vmovdqu [edx], ymm0 |
| 3925 lea edx, [edx + 32] |
| 3926 jg convertloop |
| 3927 ret |
| 3928 vzeroupper |
| 3929 } |
| 3930 } |
| 3931 |
__declspec(naked) __declspec(align(16))
// Extracts subsampled U and V planes from two UYVY rows, averaging
// vertically.  32 source pixels -> 16 U and 16 V bytes per iteration;
// pix assumed a multiple of 32.
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]       // average with next row
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
| 3976 |
__declspec(naked) __declspec(align(16))
// Extracts subsampled U and V from a single UYVY row (no vertical
// averaging).  32 pixels -> 16 U + 16 V bytes per iteration; pix assumed
// a multiple of 32.
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0 // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
| 4016 #endif // HAS_YUY2TOYROW_AVX2 |
| 4017 |
| 4018 #ifdef HAS_YUY2TOYROW_SSE2 |
__declspec(naked) __declspec(align(16))
// Extracts the Y channel (even bytes) from 16 YUY2 pixels per iteration.
// movdqa requires src and dst 16-byte aligned; pix assumed a multiple
// of 16.
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 4044 |
__declspec(naked) __declspec(align(16))
// Extracts subsampled U and V planes from two YUY2 rows, averaging
// vertically.  16 source pixels -> 8 U and 8 V bytes per iteration.
// movdqa requires src rows 16-byte aligned; pix assumed a multiple of 16.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]       // next row
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2              // vertical average
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 4088 |
__declspec(naked) __declspec(align(16))
// Extracts subsampled U and V from a single YUY2 row (no vertical
// averaging).  16 pixels -> 8 U + 8 V bytes per iteration; movdqa requires
// src 16-byte aligned; pix assumed a multiple of 16.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // dst_v addressed as [dst_u + edi]

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5  // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 4125 |
__declspec(naked) __declspec(align(16))
// Same as YUY2ToYRow_SSE2 but with movdqu, so buffers may be unaligned.
// 16 pixels per iteration; pix assumed a multiple of 16.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 4151 |
// Extract U and V from YUY2, vertically averaging two rows (this row and
// the row stride_yuy2 bytes below) — the 4:2:0 chroma path.  Unaligned
// variant (movdqu).  Processes 16 pixels (32 bytes) per iteration,
// writing 8 U and 8 V bytes; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u; one index serves both

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]        // same columns, next row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2               // average the two rows
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV (keep odd chroma bytes)
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 4195 |
// Extract U and V from a single row of YUY2 (no vertical averaging).
// Unaligned variant of YUY2ToUV422Row_SSE2 (movdqu loads).  Processes
// 16 pixels (32 bytes) per iteration; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 4232 |
// Extract the Y plane from a row of UYVY.  UYVY is packed 4:2:2 with bytes
// U0 Y0 V0 Y1, so luma occupies the odd bytes (hence psrlw instead of pand).
// Aligned variant (movdqa): src/dst must be 16-byte aligned; processes
// 16 pixels (32 bytes) per iteration; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void UYVYToYRow_SSE2(const uint8* src_uyvy,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 4256 |
// Extract U and V from UYVY, vertically averaging two rows (4:2:0 chroma
// path).  In UYVY chroma occupies the even bytes, so pand isolates it.
// Aligned variant (movdqa).  Processes 16 pixels (32 bytes) per iteration,
// writing 8 U and 8 V bytes; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]        // same columns, next row
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2               // average the two rows
    pavgb      xmm1, xmm3
    pand       xmm0, xmm5   // UYVY -> UVUV (keep even chroma bytes)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 4300 |
// Extract U and V from a single row of UYVY (no vertical averaging — the
// "422" variant).  Chroma is in the even bytes.  Aligned variant (movdqa).
// Processes 16 pixels (32 bytes) per iteration; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // UYVY -> UVUV (keep even chroma bytes)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 4337 |
// Extract the Y plane from a row of UYVY (Y is in the odd bytes).
// Unaligned variant (movdqu).  Processes 16 pixels (32 bytes) per
// iteration; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
| 4361 |
// Extract U and V from UYVY with vertical averaging of two rows (4:2:0
// chroma path).  Unaligned variant (movdqu).  Processes 16 pixels
// (32 bytes) per iteration; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]        // same columns, next row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2               // average the two rows
    pavgb      xmm1, xmm3
    pand       xmm0, xmm5   // UYVY -> UVUV (keep even chroma bytes)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 4405 |
// Extract U and V from a single row of UYVY (no vertical averaging).
// Unaligned variant of UYVYToUV422Row_SSE2 (movdqu loads).  Processes
// 16 pixels (32 bytes) per iteration; pix must be a multiple of 16.
__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // UYVY -> UVUV (keep even chroma bytes)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
| 4442 #endif // HAS_YUY2TOYROW_SSE2 |
| 4443 |
| 4444 #ifdef HAS_ARGBBLENDROW_SSE2 |
// Alpha-blend src_argb0 over src_argb1 into dst_argb, 4 pixels at a time:
//   dst = src0 + src1 * (256 - src0.alpha) >> 8, dst alpha forced to 255.
// Red/blue and alpha/green channels are processed in separate 16-bit lanes
// (masks 0x00ff00ff / 0xff00ff00).  Any width is handled: a 1-pixel lead-in
// loop runs until dst is 16-byte aligned, then a 4-pixel main loop, then a
// 1-pixel tail loop.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb0
    mov        esi, [esp + 4 + 8]    // src_argb1
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm7, xmm7            // generate constant 1 (0x0001 per word)
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6            // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5            // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4            // generate mask 0xff000000
    pslld      xmm4, 24

    sub        ecx, 1
    je         convertloop1          // only 1 pixel?
    jl         convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test       edx, 15               // aligned?
    je         alignloop1b
    movd       xmm3, [eax]
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movd       xmm2, [esi]           // _r_b
    psrlw      xmm3, 8               // alpha
    pshufhw    xmm3, xmm3, 0F5h      // 8 alpha words
    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movd       xmm1, [esi]           // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        alignloop1

  alignloop1b:
    add        ecx, 1 - 4
    jl         convertloop4b

    // 4 pixel loop.
  convertloop4:
    movdqu     xmm3, [eax]           // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movdqu     xmm2, [esi]           // _r_b
    psrlw      xmm3, 8               // alpha
    pshufhw    xmm3, xmm3, 0F5h      // 8 alpha words
    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movdqu     xmm1, [esi]           // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0           // dst is aligned by the lead-in loop
    lea        edx, [edx + 16]
    jge        convertloop4

  convertloop4b:
    add        ecx, 4 - 1
    jl         convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]           // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movd       xmm2, [esi]           // _r_b
    psrlw      xmm3, 8               // alpha
    pshufhw    xmm3, xmm3, 0F5h      // 8 alpha words
    pshuflw    xmm3, xmm3, 0F5h
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movd       xmm1, [esi]           // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
| 4564 #endif // HAS_ARGBBLENDROW_SSE2 |
| 4565 |
| 4566 #ifdef HAS_ARGBBLENDROW_SSSE3 |
// Shuffle table for isolating alpha.
// Replicates each pixel's alpha byte (offsets 3/7/11/15) into the low byte
// of both words of its pixel; 0x80 zeroes the other byte.  Produces the
// 8 alpha words that pmullw needs in one pshufb.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
| 4572 // Same as SSE2, but replaces: |
| 4573 // psrlw xmm3, 8 // alpha |
| 4574 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
| 4575 // pshuflw xmm3, xmm3, 0F5h |
| 4576 // with.. |
| 4577 // pshufb xmm3, kShuffleAlpha // alpha |
| 4578 // Blend 8 pixels at a time. |
| 4579 |
// Alpha-blend src_argb0 over src_argb1, 4 pixels at a time (SSSE3).
// Same math as ARGBBlendRow_SSE2 but uses one pshufb (kShuffleAlpha) in
// place of the psrlw/pshufhw/pshuflw sequence, and adds an aligned 4-pixel
// fast path when both sources are 16-byte aligned.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb0
    mov        esi, [esp + 4 + 8]    // src_argb1
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm7, xmm7            // generate constant 0x0001
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6            // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5            // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4            // generate mask 0xff000000
    pslld      xmm4, 24

    sub        ecx, 1
    je         convertloop1          // only 1 pixel?
    jl         convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test       edx, 15               // aligned?
    je         alignloop1b
    movd       xmm3, [eax]
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movd       xmm2, [esi]           // _r_b
    pshufb     xmm3, kShuffleAlpha   // alpha
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movd       xmm1, [esi]           // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        alignloop1

  alignloop1b:
    add        ecx, 1 - 4
    jl         convertloop4b

    test       eax, 15               // unaligned?
    jne        convertuloop4
    test       esi, 15               // unaligned?
    jne        convertuloop4

    // 4 pixel loop (both sources aligned).
  convertloop4:
    movdqa     xmm3, [eax]           // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movdqa     xmm2, [esi]           // _r_b
    pshufb     xmm3, kShuffleAlpha   // alpha
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movdqa     xmm1, [esi]           // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4
    jmp        convertloop4b

    // 4 pixel unaligned loop.
  convertuloop4:
    movdqu     xmm3, [eax]           // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movdqu     xmm2, [esi]           // _r_b
    pshufb     xmm3, kShuffleAlpha   // alpha
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movdqu     xmm1, [esi]           // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0           // dst is aligned by the lead-in loop
    lea        edx, [edx + 16]
    jge        convertuloop4

  convertloop4b:
    add        ecx, 4 - 1
    jl         convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]           // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3            // src argb
    pxor       xmm3, xmm4            // ~alpha
    movd       xmm2, [esi]           // _r_b
    pshufb     xmm3, kShuffleAlpha   // alpha
    pand       xmm2, xmm6            // _r_b
    paddw      xmm3, xmm7            // 256 - alpha
    pmullw     xmm2, xmm3            // _r_b * alpha
    movd       xmm1, [esi]           // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8               // _a_g
    por        xmm0, xmm4            // set alpha to 255
    pmullw     xmm1, xmm3            // _a_g * alpha
    psrlw      xmm2, 8               // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2            // + src argb
    pand       xmm1, xmm5            // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1            // + src argb
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
| 4723 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 4724 |
| 4725 #ifdef HAS_ARGBATTENUATEROW_SSE2 |
// Attenuate (premultiply) 4 ARGB pixels at a time: rgb = rgb * a >> 8,
// alpha channel preserved.  punpck*bw duplicates each byte into a word so
// pmulhuw computes (c*257 * a*257) >> 16 ~= c * a / 255.
// Aligned to 16 bytes (movdqa loads/stores); width must be a multiple of 4.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb0
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // width
    pcmpeqb    xmm4, xmm4        // generate mask 0xff000000
    pslld      xmm4, 24
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ffffff
    psrld      xmm5, 8

    align      4
  convertloop:
    movdqa     xmm0, [eax]       // read 4 pixels
    punpcklbw  xmm0, xmm0        // first 2
    pshufhw    xmm2, xmm0, 0FFh  // 8 alpha words
    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm0, xmm2        // rgb * a
    movdqa     xmm1, [eax]       // read 4 pixels
    punpckhbw  xmm1, xmm1        // next 2 pixels
    pshufhw    xmm2, xmm1, 0FFh  // 8 alpha words
    pshuflw    xmm2, xmm2, 0FFh
    pmulhuw    xmm1, xmm2        // rgb * a
    movdqa     xmm2, [eax]       // alphas
    lea        eax, [eax + 16]
    psrlw      xmm0, 8
    pand       xmm2, xmm4
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    pand       xmm0, xmm5        // keep original alphas
    por        xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    ret
  }
}
| 4767 #endif // HAS_ARGBATTENUATEROW_SSE2 |
| 4768 |
| 4769 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
// Shuffle tables duplicating alpha for ARGBAttenuateRow_SSSE3.
// Input is a raw (un-unpacked) 4-pixel ARGB register: kShuffleAlpha0
// broadcasts the alphas of pixels 0-1 (bytes 3 and 7) across their six rgb
// bytes, kShuffleAlpha1 does pixels 2-3 (bytes 11 and 15).  128u (0x80)
// zeroes the two bytes corresponding to each pixel's alpha word.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate (premultiply) 4 ARGB pixels at a time, SSSE3 version: alpha
// replication is done with pshufb tables instead of pshufhw/pshuflw.
// Unaligned safe (movdqu); width must be a multiple of 4.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb0
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // width
    pcmpeqb    xmm3, xmm3        // generate mask 0xff000000
    pslld      xmm3, 24
    movdqa     xmm4, kShuffleAlpha0
    movdqa     xmm5, kShuffleAlpha1

    align      4
  convertloop:
    movdqu     xmm0, [eax]       // read 4 pixels
    pshufb     xmm0, xmm4        // isolate first 2 alphas
    movdqu     xmm1, [eax]       // read 4 pixels
    punpcklbw  xmm1, xmm1        // first 2 pixel rgbs
    pmulhuw    xmm0, xmm1        // rgb * a
    movdqu     xmm1, [eax]       // read 4 pixels
    pshufb     xmm1, xmm5        // isolate next 2 alphas
    movdqu     xmm2, [eax]       // read 4 pixels
    punpckhbw  xmm2, xmm2        // next 2 pixel rgbs
    pmulhuw    xmm1, xmm2        // rgb * a
    movdqu     xmm2, [eax]       // mask original alpha
    lea        eax, [eax + 16]
    pand       xmm2, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    por        xmm0, xmm2        // copy original alpha
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    ret
  }
}
| 4816 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
| 4817 |
| 4818 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
// Shuffle table duplicating alpha for ARGBAttenuateRow_AVX2.
// Operates on pixels already expanded to words by vpunpck*bw: broadcasts
// each pixel's alpha word (bytes 6-7 and 14-15 of each 8-byte pixel) over
// its three rgb words; 128u (0x80) zeroes the alpha word itself.
static const ulvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
};
// Attenuate (premultiply) 8 ARGB pixels at a time with AVX2.
// dst is addressed as [src + (dst - src)] so one pointer increment serves
// both.  The unpack/pack pair mutates lane order symmetrically, so the
// final vpackuswb restores the original pixel order ("unmutated").
// width must be a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb0
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // width
    sub        edx, eax          // edx = dst - src
    vmovdqa    ymm4, kShuffleAlpha_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24

    align      4
  convertloop:
    vmovdqu    ymm6, [eax]           // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6      // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6      // high 4 pixels. mutated.
    vpshufb    ymm2, ymm0, ymm4      // low 4 alphas
    vpshufb    ymm3, ymm1, ymm4      // high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2      // rgb * a
    vpmulhuw   ymm1, ymm1, ymm3      // rgb * a
    vpand      ymm6, ymm6, ymm5      // isolate alpha
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1      // unmutated.
    vpor       ymm0, ymm0, ymm6      // copy original alpha
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    jg         convertloop

    vzeroupper
    ret
  }
}
| 4860 #endif // HAS_ARGBATTENUATEROW_AVX2 |
| 4861 |
| 4862 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
// Unattenuate (un-premultiply) 4 ARGB pixels at a time: rgb = rgb / a,
// implemented as a fixed-point multiply by fixed_invtbl8[a] (a reciprocal
// table declared elsewhere in this file; entry 0 presumably maps a=0
// safely — confirm at the table definition).  Alpha bytes are fetched with
// scalar movzx, so esi/edi are saved/restored.
// Aligned to 16 bytes; width must be a multiple of 4.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb0
    mov        edx, [esp + 8 + 8]    // dst_argb
    mov        ecx, [esp + 8 + 12]   // width

    align      4
  convertloop:
    movdqu     xmm0, [eax]           // read 4 pixels
    movzx      esi, byte ptr [eax + 3]   // first alpha
    movzx      edi, byte ptr [eax + 7]   // second alpha
    punpcklbw  xmm0, xmm0            // first 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw    xmm2, xmm2, 040h      // first 4 inv_alpha words.  1, a, a, a
    pshuflw    xmm3, xmm3, 040h      // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2            // rgb * a

    movdqu     xmm1, [eax]           // read 4 pixels
    movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // fourth alpha
    punpckhbw  xmm1, xmm1            // next 2
    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw    xmm2, xmm2, 040h      // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 040h      // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2            // rgb * a
    lea        eax, [eax + 16]

    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    pop        edi
    pop        esi
    ret
  }
}
| 4910 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
| 4911 |
| 4912 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
// Shuffle table duplicating alpha for the AVX2 unattenuate kernels.
// Applied after vpunpck*wd has expanded the [1,a] pairs to dwords:
// replicates each inverse-alpha word over the three rgb word positions
// while leaving the "1" word (bytes 6-7 / 14-15) for the alpha channel.
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
};
| 4918 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
| 4919 // USE_GATHER is not on by default, due to being a slow instruction. |
| 4920 #ifdef USE_GATHER |
// Unattenuate 8 ARGB pixels at a time with AVX2, using vpgatherdd to fetch
// the 8 inverse-alpha table entries in one instruction (USE_GATHER build
// only; gather is slow on early AVX2 hardware, hence off by default).
// width must be a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb0
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // width
    sub        edx, eax          // edx = dst - src
    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2

    align      4
  convertloop:
    vmovdqu    ymm6, [eax]           // read 8 pixels.
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24        // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6      // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6      // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
    vpunpcklwd ymm2, ymm3, ymm3      // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3      // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4      // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm4      // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2      // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3      // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1      // unmutated.
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    jg         convertloop

    vzeroupper
    ret
  }
}
| 4955 #else // USE_GATHER |
// Unattenuate 8 ARGB pixels at a time with AVX2, default (non-gather)
// build: the 8 fixed_invtbl8 lookups are emulated with scalar movzx/vmovd
// loads assembled into one ymm register — see the "replace VPGATHER"
// section.  Arguments are read before esi/edi are pushed, so the [esp+N]
// offsets carry no push bias.  width must be a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {

    mov        eax, [esp + 4]    // src_argb0
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // width
    sub        edx, eax          // edx = dst - src
    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2

    push       esi
    push       edi

    align      4
  convertloop:
    // replace VPGATHER
    movzx      esi, byte ptr [eax + 3]                 // alpha0
    movzx      edi, byte ptr [eax + 7]                 // alpha1
    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
    movzx      esi, byte ptr [eax + 11]                // alpha2
    movzx      edi, byte ptr [eax + 15]                // alpha3
    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
    movzx      esi, byte ptr [eax + 19]                // alpha4
    movzx      edi, byte ptr [eax + 23]                // alpha5
    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
    movzx      esi, byte ptr [eax + 27]                // alpha6
    movzx      edi, byte ptr [eax + 31]                // alpha7
    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu    ymm6, [eax]           // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6      // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6      // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3      // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3      // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm5      // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm5      // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2      // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3      // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1      // unmutated.
    sub        ecx, 8
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
| 5019 #endif // USE_GATHER |
#endif  // HAS_ARGBUNATTENUATEROW_AVX2
| 5021 |
| 5022 #ifdef HAS_ARGBGRAYROW_SSSE3 |
| 5023 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. |
| 5024 __declspec(naked) __declspec(align(16)) |
| 5025 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
| 5026 __asm { |
| 5027 mov eax, [esp + 4] /* src_argb */ |
| 5028 mov edx, [esp + 8] /* dst_argb */ |
| 5029 mov ecx, [esp + 12] /* width */ |
| 5030 movdqa xmm4, kARGBToYJ |
| 5031 movdqa xmm5, kAddYJ64 |
| 5032 |
| 5033 align 4 |
| 5034 convertloop: |
| 5035 movdqa xmm0, [eax] // G |
| 5036 movdqa xmm1, [eax + 16] |
| 5037 pmaddubsw xmm0, xmm4 |
| 5038 pmaddubsw xmm1, xmm4 |
| 5039 phaddw xmm0, xmm1 |
| 5040 paddw xmm0, xmm5 // Add .5 for rounding. |
| 5041 psrlw xmm0, 7 |
| 5042 packuswb xmm0, xmm0 // 8 G bytes |
| 5043 movdqa xmm2, [eax] // A |
| 5044 movdqa xmm3, [eax + 16] |
| 5045 lea eax, [eax + 32] |
| 5046 psrld xmm2, 24 |
| 5047 psrld xmm3, 24 |
| 5048 packuswb xmm2, xmm3 |
| 5049 packuswb xmm2, xmm2 // 8 A bytes |
| 5050 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA |
| 5051 punpcklbw xmm0, xmm0 // 8 GG words |
| 5052 punpcklbw xmm3, xmm2 // 8 GA words |
| 5053 movdqa xmm1, xmm0 |
| 5054 punpcklwd xmm0, xmm3 // GGGA first 4 |
| 5055 punpckhwd xmm1, xmm3 // GGGA next 4 |
| 5056 sub ecx, 8 |
| 5057 movdqa [edx], xmm0 |
| 5058 movdqa [edx + 16], xmm1 |
| 5059 lea edx, [edx + 32] |
| 5060 jg convertloop |
| 5061 ret |
| 5062 } |
| 5063 } |
| 5064 #endif // HAS_ARGBGRAYROW_SSSE3 |
| 5065 |
| 5066 #ifdef HAS_ARGBSEPIAROW_SSSE3 |
// Sepia-tone weights, applied per output channel with pmaddubsw
// (byte order in memory is B, G, R, A — hence B first in each quad):
//   b = (r * 35 + g * 68 + b * 17) >> 7
//   g = (r * 45 + g * 88 + b * 22) >> 7
//   r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
| 5082 |
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// Each channel is a weighted sum of B/G/R via pmaddubsw + phaddw >> 7;
// alpha passes through.  Channels are computed as 8-byte vectors, then
// woven back into BGRA with punpcklbw/punpck*wd.
// dst_argb must be 16-byte aligned; width must be a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
    mov        ecx, [esp + 8]   /* width */
    movdqa     xmm2, kARGBToSepiaB
    movdqa     xmm3, kARGBToSepiaG
    movdqa     xmm4, kARGBToSepiaR

    align      4
  convertloop:
    movdqa     xmm0, [eax]      // B
    movdqa     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0       // 8 B values
    movdqa     xmm5, [eax]      // G
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5       // 8 G values
    punpcklbw  xmm0, xmm5       // 8 BG values
    movdqa     xmm5, [eax]      // R
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5       // 8 R values
    movdqa     xmm6, [eax]      // A
    movdqa     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6       // 8 A values
    punpcklbw  xmm5, xmm6       // 8 RA values
    movdqa     xmm1, xmm0       // Weave BG, RA together
    punpcklwd  xmm0, xmm5       // BGRA first 4
    punpckhwd  xmm1, xmm5       // BGRA next 4
    sub        ecx, 8
    movdqa     [eax], xmm0
    movdqa     [eax + 16], xmm1
    lea        eax, [eax + 32]
    jg         convertloop
    ret
  }
}
| 5135 #endif // HAS_ARGBSEPIAROW_SSSE3 |
| 5136 |
| 5137 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb points to 16 signed bytes: bytes 0-3 are the B output
// coefficients, 4-7 G, 8-11 R, 12-15 A (broadcast via pshufd below).
// Each channel is pmaddubsw'd then arithmetic-shifted right by 6.
// Requires 16-byte aligned src_argb/dst_argb; 8 pixels per iteration.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* matrix_argb */
    movdqu     xmm5, [ecx]
    pshufd     xmm2, xmm5, 0x00 // broadcast B coefficients
    pshufd     xmm3, xmm5, 0x55 // broadcast G coefficients
    pshufd     xmm4, xmm5, 0xaa // broadcast R coefficients
    pshufd     xmm5, xmm5, 0xff // broadcast A coefficients
    mov        ecx, [esp + 16]  /* width */

    align      4
 convertloop:
    movdqa     xmm0, [eax]      // B
    movdqa     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
    movdqa     xmm6, [eax]      // G
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7       // B
    phaddsw    xmm6, xmm1       // G
    psraw      xmm0, 6          // B
    psraw      xmm6, 6          // G
    packuswb   xmm0, xmm0       // 8 B values
    packuswb   xmm6, xmm6       // 8 G values
    punpcklbw  xmm0, xmm6       // 8 BG values
    movdqa     xmm1, [eax]      // R
    movdqa     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7       // R
    movdqa     xmm6, [eax]      // A
    movdqa     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7       // A
    psraw      xmm1, 6          // R
    psraw      xmm6, 6          // A
    packuswb   xmm1, xmm1       // 8 R values
    packuswb   xmm6, xmm6       // 8 A values
    punpcklbw  xmm1, xmm6       // 8 RA values
    movdqa     xmm6, xmm0       // Weave BG, RA together
    punpcklwd  xmm0, xmm1       // BGRA first 4
    punpckhwd  xmm6, xmm1       // BGRA next 4
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    jg         convertloop
    ret
  }
}
| 5200 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 5201 |
| 5202 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
// In place on dst_argb: each channel becomes
//   ((pixel * scale) >> 16) * interval_size + interval_offset,
// with alpha preserved via the 0xff000000 mask. 4 pixels per iteration.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov        eax, [esp + 4]    /* dst_argb */
    movd       xmm2, [esp + 8]   /* scale */
    movd       xmm3, [esp + 12]  /* interval_size */
    movd       xmm4, [esp + 16]  /* interval_offset */
    mov        ecx, [esp + 20]   /* width */
    pshuflw    xmm2, xmm2, 040h  // broadcast low word to all 8 lanes
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5  // constant 0
    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
    pslld      xmm6, 24

    align      4
 convertloop:
    movdqa     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5   // first 2 pixels
    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
    movdqa     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5   // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3   // * interval_size
    movdqa     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6   // mask alpha
    paddw      xmm0, xmm4   // + interval_offset (xmm4)
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7   // restore original alpha
    sub        ecx, 4
    movdqa     [eax], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    ret
  }
}
| 5247 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
| 5248 |
| 5249 #ifdef HAS_ARGBSHADEROW_SSE2 |
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
// value is one ARGB pixel; each byte is replicated to a word (punpcklbw
// with itself) so pmulhuw + psrlw 8 scales each channel by its byte.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16] // value
    punpcklbw  xmm2, xmm2       // duplicate each byte into a word
    punpcklqdq xmm2, xmm2       // replicate to both pixels' lanes

    align      4
 convertloop:
    movdqa     xmm0, [eax]      // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0       // first 2
    punpckhbw  xmm1, xmm1       // next 2
    pmulhuw    xmm0, xmm2       // argb * value
    pmulhuw    xmm1, xmm2       // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    ret
  }
}
| 5283 #endif // HAS_ARGBSHADEROW_SSE2 |
| 5284 |
| 5285 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// src_argb0 bytes are expanded with themselves (b -> b*257 as a word) and
// src_argb1 with zero, so pmulhuw yields approximately (a * b) / 255.
// Uses unaligned loads/stores; width processed 4 pixels per iteration.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pxor       xmm5, xmm5  // constant 0

    align      4
 convertloop:
    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm0, xmm0         // first 2
    punpckhbw  xmm1, xmm1         // next 2
    punpcklbw  xmm2, xmm5         // first 2
    punpckhbw  xmm3, xmm5         // next 2
    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
    lea        eax, [eax + 16]
    lea        esi, [esi + 16]
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        esi
    ret
  }
}
| 5322 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
| 5323 |
| 5324 #ifdef HAS_ARGBADDROW_SSE2 |
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Saturating byte add (paddusb). Handles any width: a 4-pixel loop
// followed by a 1-pixel tail loop. Unaligned loads/stores throughout.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    sub        ecx, 4
    jl         convertloop49        // fewer than 4 pixels: go to tail

    align      4
 convertloop4:
    movdqu     xmm0, [eax]      // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]      // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    paddusb    xmm0, xmm1       // src_argb0 + src_argb1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jge        convertloop4

 convertloop49:
    add        ecx, 4 - 1       // restore count for 1-pixel loop
    jl         convertloop19

 convertloop1:
    movd       xmm0, [eax]      // read 1 pixels from src_argb0
    lea        eax, [eax + 4]
    movd       xmm1, [esi]      // read 1 pixels from src_argb1
    lea        esi, [esi + 4]
    paddusb    xmm0, xmm1       // src_argb0 + src_argb1
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        convertloop1

 convertloop19:
    pop        esi
    ret
  }
}
| 5372 #endif // HAS_ARGBADDROW_SSE2 |
| 5373 |
| 5374 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
// Saturating byte subtract (psubusb), clamping at 0. Unaligned I/O.
// Unlike ARGBAddRow_SSE2 there is no 1-pixel tail loop here.
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]      // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    psubusb    xmm0, xmm1       // src_argb0 - src_argb1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        esi
    ret
  }
}
| 5402 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
| 5403 |
| 5404 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBMultiplyRow_SSE2. vpunpckl/hbw operate per 128-bit
// lane, but both inputs are mutated identically and vpackuswb restores
// the original order, so the lane crossing cancels out.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    vpxor      ymm5, ymm5, ymm5     // constant 0

    align      4
 convertloop:
    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1   // low 4
    vpunpckhbw ymm1, ymm1, ymm1   // high 4
    vpunpcklbw ymm2, ymm3, ymm5   // low 4
    vpunpckhbw ymm3, ymm3, ymm5   // high 4
    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
    vpackuswb  ymm0, ymm0, ymm1
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
| 5440 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
| 5441 |
| 5442 #ifdef HAS_ARGBADDROW_AVX2 |
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Saturating byte add with a memory operand (vpaddusb ymm, ymm, [mem]).
// Unaligned I/O; vzeroupper before ret to avoid AVX/SSE transition stalls.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
| 5470 #endif // HAS_ARGBADDROW_AVX2 |
| 5471 |
| 5472 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
// Saturating byte subtract with a memory operand; mirrors ARGBAddRow_AVX2.
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
| 5500 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
| 5501 |
| 5502 #ifdef HAS_SOBELXROW_SSE2 |
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// Computes |gx| for 8 pixels at a time from 3 source rows.
// src_y1/src_y2/dst_sobelx are addressed as offsets from src_y0 (eax),
// so only one pointer advances per iteration. Reads 2 bytes beyond
// width on each row (the +2 column offsets).
__declspec(naked) __declspec(align(16))
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y0
    mov        esi, [esp + 8 + 8]   // src_y1
    mov        edi, [esp + 8 + 12]  // src_y2
    mov        edx, [esp + 8 + 16]  // dst_sobelx
    mov        ecx, [esp + 8 + 20]  // width
    sub        esi, eax             // keep deltas, advance eax only
    sub        edi, eax
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

    align      4
 convertloop:
    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2       // row0 + row2
    paddw      xmm0, xmm1       // + 2 * row1 (added twice)
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0       // saturate to 8 bytes
    sub        ecx, 8
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 5557 #endif // HAS_SOBELXROW_SSE2 |
| 5558 |
| 5559 #ifdef HAS_SOBELYROW_SSE2 |
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Computes |gy| for 8 pixels at a time from 2 source rows.
// src_y1/dst_sobely are addressed as offsets from src_y0 (eax).
// Reads 2 bytes beyond width on each row (the +1/+2 column offsets).
__declspec(naked) __declspec(align(16))
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_y0
    mov        esi, [esp + 4 + 8]   // src_y1
    mov        edx, [esp + 4 + 12]  // dst_sobely
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // keep deltas, advance eax only
    sub        edx, eax
    pxor       xmm5, xmm5  // constant 0

    align      4
 convertloop:
    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2       // col0 + col2
    paddw      xmm0, xmm1       // + 2 * col1 (added twice)
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0       // saturate to 8 bytes
    sub        ecx, 8
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    jg         convertloop

    pop        esi
    ret
  }
}
| 5610 #endif // HAS_SOBELYROW_SSE2 |
| 5611 |
| 5612 #ifdef HAS_SOBELROW_SSE2 |
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// 16 grey pixels expand to 64 bytes of ARGB per iteration.
// Requires 16-byte aligned pointers (movdqa).
__declspec(naked) __declspec(align(16))
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_sobely as delta from src_sobelx
    pcmpeqb    xmm5, xmm5           // alpha 255
    pslld      xmm5, 24             // 0xff000000

    align      4
 convertloop:
    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
    movdqa     xmm2, xmm0             // GG
    punpcklbw  xmm2, xmm0             // First 8
    punpckhbw  xmm0, xmm0             // Next 8
    movdqa     xmm1, xmm2             // GGGG
    punpcklwd  xmm1, xmm2             // First 4
    punpckhwd  xmm2, xmm2             // Next 4
    por        xmm1, xmm5             // GGGA
    por        xmm2, xmm5
    movdqa     xmm3, xmm0             // GGGG
    punpcklwd  xmm3, xmm0             // Next 4
    punpckhwd  xmm0, xmm0             // Last 4
    por        xmm3, xmm5             // GGGA
    por        xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm1
    movdqa     [edx + 16], xmm2
    movdqa     [edx + 32], xmm3
    movdqa     [edx + 48], xmm0
    lea        edx, [edx + 64]
    jg         convertloop

    pop        esi
    ret
  }
}
| 5662 #endif // HAS_SOBELROW_SSE2 |
| 5663 |
| 5664 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
// Saturating byte add of the two gradient planes, 16 pixels at a time.
// Requires 16-byte aligned pointers (movdqa).
__declspec(naked) __declspec(align(16))
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_y
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_sobely as delta from src_sobelx

    align      4
 convertloop:
    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        esi
    ret
  }
}
| 5692 #endif // HAS_SOBELTOPLANEROW_SSE2 |
| 5693 |
| 5694 #ifdef HAS_SOBELXYROW_SSE2 |
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// 16 pixels per iteration; requires 16-byte aligned pointers (movdqa).
__declspec(naked) __declspec(align(16))
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_sobely as delta from src_sobelx
    pcmpeqb    xmm5, xmm5           // alpha 255

    align      4
 convertloop:
    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
    movdqa     xmm3, xmm0             // XA
    punpcklbw  xmm3, xmm5
    punpckhbw  xmm0, xmm5
    movdqa     xmm4, xmm1             // YS
    punpcklbw  xmm4, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     xmm6, xmm4             // YSXA
    punpcklwd  xmm6, xmm3             // First 4
    punpckhwd  xmm4, xmm3             // Next 4
    movdqa     xmm7, xmm1             // YSXA
    punpcklwd  xmm7, xmm0             // Next 4
    punpckhwd  xmm1, xmm0             // Last 4
    sub        ecx, 16
    movdqa     [edx], xmm6
    movdqa     [edx + 16], xmm4
    movdqa     [edx + 32], xmm7
    movdqa     [edx + 48], xmm1
    lea        edx, [edx + 64]
    jg         convertloop

    pop        esi
    ret
  }
}
| 5743 #endif // HAS_SOBELXYROW_SSE2 |
| 5744 |
| 5745 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
// aligned.
// Non-naked __asm: MSVC saves/restores esi/edi used inside the block.
// For area <= 128 a 0.16 fixed-point reciprocal (pmulhuw) fast path is
// used; larger areas go through float multiply by 1/area (rcpss).
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov        eax, topleft  // eax topleft
    mov        esi, botleft  // esi botleft
    mov        edx, width
    movd       xmm5, area
    mov        edi, dst
    mov        ecx, count
    cvtdq2ps   xmm5, xmm5
    rcpss      xmm4, xmm5  // 1.0f / area
    pshufd     xmm4, xmm4, 0
    sub        ecx, 4
    jl         l4b

    cmp        area, 128  // 128 pixels will not overflow 15 bits.
    ja         l4

    pshufd     xmm5, xmm5, 0        // area
    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
    psrld      xmm6, 16
    cvtdq2ps   xmm6, xmm6
    addps      xmm5, xmm6           // (65536.0 + area - 1)
    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
    packssdw   xmm5, xmm5           // 16 bit shorts

    // 4 pixel loop small blocks.
    align      4
  s4:
    // top left
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
    packssdw   xmm2, xmm3

    pmulhuw    xmm0, xmm5  // sum * (65536 / area) >> 16
    pmulhuw    xmm2, xmm5

    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        s4

    jmp        l4b

    // 4 pixel loop
    align      4
  l4:
    // top left
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
    cvtdq2ps   xmm1, xmm1
    mulps      xmm0, xmm4
    mulps      xmm1, xmm4
    cvtdq2ps   xmm2, xmm2
    cvtdq2ps   xmm3, xmm3
    mulps      xmm2, xmm4
    mulps      xmm3, xmm4
    cvtps2dq   xmm0, xmm0
    cvtps2dq   xmm1, xmm1
    cvtps2dq   xmm2, xmm2
    cvtps2dq   xmm3, xmm3
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    movdqa     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
    paddd      xmm0, [esi + edx * 4]
    lea        esi, [esi + 16]
    cvtdq2ps   xmm0, xmm0
    mulps      xmm0, xmm4
    cvtps2dq   xmm0, xmm0
    packssdw   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 1
    jge        l1
  l1b:
  }
}
| 5904 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
| 5905 |
| 5906 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
// cumsum[i] = previous_cumsum[i] + running per-channel sum of row[0..i],
// accumulated in xmm0 as 4 x int32 (one per ARGB channel).
// Non-naked __asm: MSVC saves/restores esi used inside the block.
// Falls back to the 1-pixel loop when cumsum is not 16-byte aligned.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    pxor       xmm0, xmm0  // running sum
    pxor       xmm1, xmm1  // constant 0 for unpacking

    sub        ecx, 4
    jl         l4b
    test       edx, 15     // unaligned cumsum: use 1-pixel loop
    jne        l4b

    // 4 pixel loop
    align      4
  l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    punpcklbw  xmm2, xmm1   // expand pixels 0,1 to words
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1   // pixel 0 as 4 x int32
    punpckhwd  xmm3, xmm1   // pixel 1 as 4 x int32

    punpckhbw  xmm4, xmm1   // expand pixels 2,3 to words
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1   // pixel 2 as 4 x int32
    punpckhwd  xmm5, xmm1   // pixel 3 as 4 x int32

    paddd      xmm0, xmm2
    movdqa     xmm2, [esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqa     xmm3, [esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqa     xmm4, [esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqa     xmm5, [esi + 48]
    lea        esi, [esi + 64]
    paddd      xmm5, xmm0

    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm3
    movdqa     [edx + 32], xmm4
    movdqa     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea        eax, [eax + 4]
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]
    lea        esi, [esi + 16]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

  l1b:
  }
}
| 5990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| 5991 |
| 5992 #ifdef HAS_ARGBAFFINEROW_SSE2 |
// Copy ARGB pixels from source image with slope to a row of destination.
// uv_dudv holds 4 floats: starting u, v and per-pixel du, dv.
// For each destination pixel, (u,v) is truncated to integers and the
// byte offset u * 4 + v * src_argb_stride is gathered from src_argb
// (offsets computed 4 at a time with pmaddwd; u/v must fit in int16).
__declspec(naked) __declspec(align(16))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 12]  // src_argb
    mov        esi, [esp + 16]  // stride
    mov        edx, [esp + 20]  // dst_argb
    mov        ecx, [esp + 24]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]      // uv
    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 28]  // width
    shl        esi, 16          // 4, stride packed as two int16s
    add        esi, 4
    movd       xmm5, esi
    sub        ecx, 4
    jl         l4b

    // setup for 4 pixel loop
    pshufd     xmm7, xmm7, 0x44  // dup dudv
    pshufd     xmm5, xmm5, 0     // dup 4, stride
    movdqa     xmm0, xmm2        // x0, y0, x1, y1
    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
    movdqa     xmm4, xmm7
    addps      xmm4, xmm4    // dudv *= 2
    movdqa     xmm3, xmm2    // x2, y2, x3, y3
    addps      xmm3, xmm4
    addps      xmm4, xmm4    // dudv *= 4

    // 4 pixel loop
    align      4
  l4:
    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
    packssdw   xmm0, xmm1    // x, y as 8 shorts
    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       xmm1, [eax + esi]  // read pixel 0
    movd       xmm6, [eax + edi]  // read pixel 1
    punpckldq  xmm1, xmm6         // combine pixel 0 and 1
    addps      xmm2, xmm4    // x, y += dx, dy first 2
    movq       qword ptr [edx], xmm1
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    movd       xmm6, [eax + esi]  // read pixel 2
    movd       xmm0, [eax + edi]  // read pixel 3
    punpckldq  xmm6, xmm0         // combine pixel 2 and 3
    addps      xmm3, xmm4    // x, y += dx, dy next 2
    sub        ecx, 4
    movq       qword ptr 8[edx], xmm6
    lea        edx, [edx + 16]
    jge        l4

  l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
    align      4
  l1:
    cvttps2dq  xmm0, xmm2    // x, y float to int
    packssdw   xmm0, xmm0    // x, y as shorts
    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
    addps      xmm2, xmm7    // x, y += dx, dy
    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        l1
  l1b:
    pop        edi
    pop        esi
    ret
  }
}
| 6077 #endif // HAS_ARGBAFFINEROW_SSE2 |
| 6078 |
| 6079 #ifdef HAS_INTERPOLATEROW_AVX2 |
// Bilinear filter 32x2 -> 32x1 (AVX2; 32 bytes per iteration).
// Vertically blends two rows: dst = src * (1 - f) + src[src_stride] * f,
// with f = source_y_fraction / 256.  The fraction is halved to 7 bits so
// the per-byte weights fit the signed operand of vpmaddubsw.
// Fractions of exactly 0, 1/4, 1/2 and 3/4 dispatch to faster pavgb loops.
// Loop steps 32 bytes; dst_width is expected to be a multiple of 32.
__declspec(naked) __declspec(align(16))
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    shr        eax, 1               // scale fraction to 0..127
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 128. Blend 100 / 0.
    sub        edi, esi  // edi = dst - src so stores address via [esi + edi]
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

    vmovd      xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
    vmovd      xmm5, eax  // low fraction 128..1
    vpunpcklbw xmm5, xmm5, xmm0   // interleave (low, high) weight byte pairs
    vpunpcklwd xmm5, xmm5, xmm5
    vpxor      ymm0, ymm0, ymm0
    vpermd     ymm5, ymm0, ymm5   // broadcast weight pair to all 32 lanes

    align      4
  xloop:
    vmovdqu    ymm0, [esi]
    vmovdqu    ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2  // mutates
    vpunpcklbw ymm0, ymm0, ymm2  // mutates
    vpmaddubsw ymm0, ymm0, ymm5  // row0 * low + row1 * high per byte pair
    vpmaddubsw ymm1, ymm1, ymm5
    vpsrlw     ymm0, ymm0, 7     // /128 back to 8 bits
    vpsrlw     ymm1, ymm1, 7
    vpackuswb  ymm0, ymm0, ymm1  // unmutates
    sub        ecx, 32
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
    align      4
  xloop25:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
    vpavgb     ymm0, ymm0, [esi + edx]
    sub        ecx, 32
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
    align      4
  xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
    sub        ecx, 32
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
    align      4
  xloop75:
    vmovdqu    ymm0, [esi + edx]
    vpavgb     ymm0, ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi]
    sub        ecx, 32
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
    // Reached before 'sub edi, esi', so edi is still the absolute dst
    // pointer rep movsb needs; ecx holds the byte count.
    align      4
  xloop100:
    rep movsb

  xloop99:
    pop        edi
    pop        esi
    vzeroupper  // avoid AVX-SSE transition penalties in the caller
    ret
  }
}
| 6178 #endif // HAS_INTERPOLATEROW_AVX2 |
| 6179 |
| 6180 #ifdef HAS_INTERPOLATEROW_SSSE3 |
// Bilinear filter 16x2 -> 16x1
// Vertically blends two rows: dst = src * (1 - f) + src[src_stride] * f,
// with f = source_y_fraction / 256, using pmaddubsw on interleaved
// (row0, row1) bytes against interleaved (low, high) 7-bit weights.
// Aligned variant: uses movdqa; src, dst and src_stride are assumed
// 16-byte aligned and dst_width a multiple of 16.
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi             // edi = dst - src; store via [esi + edi]
    shr        eax, 1               // scale fraction to 0..127
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 128. Blend 100 / 0.
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

    movd       xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
    movd       xmm5, eax  // low fraction 128..1
    punpcklbw  xmm5, xmm0   // interleave (low, high) weight byte pairs
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0  // broadcast weight pair to all 16 lanes

    align      4
  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2   // interleave row0/row1 bytes
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm5   // row0 * low + row1 * high per byte pair
    pmaddubsw  xmm1, xmm5
    psrlw      xmm0, 7      // /128 back to 8 bits
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
    align      4
  xloop25:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
    align      4
  xloop50:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
    align      4
  xloop75:
    movdqa     xmm1, [esi]
    movdqa     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align      4
  xloop100:
    movdqa     xmm0, [esi]
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
| 6285 #endif // HAS_INTERPOLATEROW_SSSE3 |
| 6286 |
| 6287 #ifdef HAS_INTERPOLATEROW_SSE2 |
// Bilinear filter 16x2 -> 16x1
// SSE2 fallback: widens bytes to 16-bit, computes
//   dst = row0 + ((row1 - row0) * 2 * (f << 7)) >> 16
// via pmulhw, which approximates row0 + (row1 - row0) * f / 256.
// Aligned variant: uses movdqa; pointers/stride assumed 16-byte aligned
// and dst_width a multiple of 16.
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi             // edi = dst - src; store via [esi + edi]
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256. Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

    movd       xmm5, eax            // xmm5 = y fraction
    punpcklbw  xmm5, xmm5           // duplicate byte into each word
    psrlw      xmm5, 1              // words now approx f << 7
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5           // broadcast to all 8 words
    pxor       xmm4, xmm4           // zero, for byte -> word widening

    align      4
  xloop:
    movdqa     xmm0, [esi]  // row0
    movdqa     xmm2, [esi + edx]  // row1
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
    align      4
  xloop25:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
    align      4
  xloop50:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
    align      4
  xloop75:
    movdqa     xmm1, [esi]
    movdqa     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align      4
  xloop100:
    movdqa     xmm0, [esi]
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
| 6398 #endif // HAS_INTERPOLATEROW_SSE2 |
| 6399 |
| 6400 // Bilinear filter 16x2 -> 16x1 |
| 6401 __declspec(naked) __declspec(align(16)) |
| 6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
| 6403 ptrdiff_t src_stride, int dst_width, |
| 6404 int source_y_fraction) { |
| 6405 __asm { |
| 6406 push esi |
| 6407 push edi |
| 6408 mov edi, [esp + 8 + 4] // dst_ptr |
| 6409 mov esi, [esp + 8 + 8] // src_ptr |
| 6410 mov edx, [esp + 8 + 12] // src_stride |
| 6411 mov ecx, [esp + 8 + 16] // dst_width |
| 6412 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
| 6413 sub edi, esi |
| 6414 shr eax, 1 |
| 6415 // Dispatch to specialized filters if applicable. |
| 6416 cmp eax, 0 |
| 6417 je xloop100 // 0 / 128. Blend 100 / 0. |
| 6418 cmp eax, 32 |
| 6419 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
| 6420 cmp eax, 64 |
| 6421 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
| 6422 cmp eax, 96 |
| 6423 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
| 6424 |
| 6425 movd xmm0, eax // high fraction 0..127 |
| 6426 neg eax |
| 6427 add eax, 128 |
| 6428 movd xmm5, eax // low fraction 128..1 |
| 6429 punpcklbw xmm5, xmm0 |
| 6430 punpcklwd xmm5, xmm5 |
| 6431 pshufd xmm5, xmm5, 0 |
| 6432 |
| 6433 align 4 |
| 6434 xloop: |
| 6435 movdqu xmm0, [esi] |
| 6436 movdqu xmm2, [esi + edx] |
| 6437 movdqu xmm1, xmm0 |
| 6438 punpcklbw xmm0, xmm2 |
| 6439 punpckhbw xmm1, xmm2 |
| 6440 pmaddubsw xmm0, xmm5 |
| 6441 pmaddubsw xmm1, xmm5 |
| 6442 psrlw xmm0, 7 |
| 6443 psrlw xmm1, 7 |
| 6444 packuswb xmm0, xmm1 |
| 6445 sub ecx, 16 |
| 6446 movdqu [esi + edi], xmm0 |
| 6447 lea esi, [esi + 16] |
| 6448 jg xloop |
| 6449 jmp xloop99 |
| 6450 |
| 6451 // Blend 25 / 75. |
| 6452 align 4 |
| 6453 xloop25: |
| 6454 movdqu xmm0, [esi] |
| 6455 movdqu xmm1, [esi + edx] |
| 6456 pavgb xmm0, xmm1 |
| 6457 pavgb xmm0, xmm1 |
| 6458 sub ecx, 16 |
| 6459 movdqu [esi + edi], xmm0 |
| 6460 lea esi, [esi + 16] |
| 6461 jg xloop25 |
| 6462 jmp xloop99 |
| 6463 |
| 6464 // Blend 50 / 50. |
| 6465 align 4 |
| 6466 xloop50: |
| 6467 movdqu xmm0, [esi] |
| 6468 movdqu xmm1, [esi + edx] |
| 6469 pavgb xmm0, xmm1 |
| 6470 sub ecx, 16 |
| 6471 movdqu [esi + edi], xmm0 |
| 6472 lea esi, [esi + 16] |
| 6473 jg xloop50 |
| 6474 jmp xloop99 |
| 6475 |
| 6476 // Blend 75 / 25. |
| 6477 align 4 |
| 6478 xloop75: |
| 6479 movdqu xmm1, [esi] |
| 6480 movdqu xmm0, [esi + edx] |
| 6481 pavgb xmm0, xmm1 |
| 6482 pavgb xmm0, xmm1 |
| 6483 sub ecx, 16 |
| 6484 movdqu [esi + edi], xmm0 |
| 6485 lea esi, [esi + 16] |
| 6486 jg xloop75 |
| 6487 jmp xloop99 |
| 6488 |
| 6489 // Blend 100 / 0 - Copy row unchanged. |
| 6490 align 4 |
| 6491 xloop100: |
| 6492 movdqu xmm0, [esi] |
| 6493 sub ecx, 16 |
| 6494 movdqu [esi + edi], xmm0 |
| 6495 lea esi, [esi + 16] |
| 6496 jg xloop100 |
| 6497 |
| 6498 xloop99: |
| 6499 pop edi |
| 6500 pop esi |
| 6501 ret |
| 6502 } |
| 6503 } |
| 6504 |
| 6505 #ifdef HAS_INTERPOLATEROW_SSE2 |
| 6506 // Bilinear filter 16x2 -> 16x1 |
| 6507 __declspec(naked) __declspec(align(16)) |
| 6508 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
| 6509 ptrdiff_t src_stride, int dst_width, |
| 6510 int source_y_fraction) { |
| 6511 __asm { |
| 6512 push esi |
| 6513 push edi |
| 6514 mov edi, [esp + 8 + 4] // dst_ptr |
| 6515 mov esi, [esp + 8 + 8] // src_ptr |
| 6516 mov edx, [esp + 8 + 12] // src_stride |
| 6517 mov ecx, [esp + 8 + 16] // dst_width |
| 6518 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
| 6519 sub edi, esi |
| 6520 // Dispatch to specialized filters if applicable. |
| 6521 cmp eax, 0 |
| 6522 je xloop100 // 0 / 256. Blend 100 / 0. |
| 6523 cmp eax, 64 |
| 6524 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. |
| 6525 cmp eax, 128 |
| 6526 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
| 6527 cmp eax, 192 |
| 6528 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. |
| 6529 |
| 6530 movd xmm5, eax // xmm5 = y fraction |
| 6531 punpcklbw xmm5, xmm5 |
| 6532 psrlw xmm5, 1 |
| 6533 punpcklwd xmm5, xmm5 |
| 6534 punpckldq xmm5, xmm5 |
| 6535 punpcklqdq xmm5, xmm5 |
| 6536 pxor xmm4, xmm4 |
| 6537 |
| 6538 align 4 |
| 6539 xloop: |
| 6540 movdqu xmm0, [esi] // row0 |
| 6541 movdqu xmm2, [esi + edx] // row1 |
| 6542 movdqu xmm1, xmm0 |
| 6543 movdqu xmm3, xmm2 |
| 6544 punpcklbw xmm2, xmm4 |
| 6545 punpckhbw xmm3, xmm4 |
| 6546 punpcklbw xmm0, xmm4 |
| 6547 punpckhbw xmm1, xmm4 |
| 6548 psubw xmm2, xmm0 // row1 - row0 |
| 6549 psubw xmm3, xmm1 |
| 6550 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 |
| 6551 paddw xmm3, xmm3 |
| 6552 pmulhw xmm2, xmm5 // scale diff |
| 6553 pmulhw xmm3, xmm5 |
| 6554 paddw xmm0, xmm2 // sum rows |
| 6555 paddw xmm1, xmm3 |
| 6556 packuswb xmm0, xmm1 |
| 6557 sub ecx, 16 |
| 6558 movdqu [esi + edi], xmm0 |
| 6559 lea esi, [esi + 16] |
| 6560 jg xloop |
| 6561 jmp xloop99 |
| 6562 |
| 6563 // Blend 25 / 75. |
| 6564 align 4 |
| 6565 xloop25: |
| 6566 movdqu xmm0, [esi] |
| 6567 movdqu xmm1, [esi + edx] |
| 6568 pavgb xmm0, xmm1 |
| 6569 pavgb xmm0, xmm1 |
| 6570 sub ecx, 16 |
| 6571 movdqu [esi + edi], xmm0 |
| 6572 lea esi, [esi + 16] |
| 6573 jg xloop25 |
| 6574 jmp xloop99 |
| 6575 |
| 6576 // Blend 50 / 50. |
| 6577 align 4 |
| 6578 xloop50: |
| 6579 movdqu xmm0, [esi] |
| 6580 movdqu xmm1, [esi + edx] |
| 6581 pavgb xmm0, xmm1 |
| 6582 sub ecx, 16 |
| 6583 movdqu [esi + edi], xmm0 |
| 6584 lea esi, [esi + 16] |
| 6585 jg xloop50 |
| 6586 jmp xloop99 |
| 6587 |
| 6588 // Blend 75 / 25. |
| 6589 align 4 |
| 6590 xloop75: |
| 6591 movdqu xmm1, [esi] |
| 6592 movdqu xmm0, [esi + edx] |
| 6593 pavgb xmm0, xmm1 |
| 6594 pavgb xmm0, xmm1 |
| 6595 sub ecx, 16 |
| 6596 movdqu [esi + edi], xmm0 |
| 6597 lea esi, [esi + 16] |
| 6598 jg xloop75 |
| 6599 jmp xloop99 |
| 6600 |
| 6601 // Blend 100 / 0 - Copy row unchanged. |
| 6602 align 4 |
| 6603 xloop100: |
| 6604 movdqu xmm0, [esi] |
| 6605 sub ecx, 16 |
| 6606 movdqu [esi + edi], xmm0 |
| 6607 lea esi, [esi + 16] |
| 6608 jg xloop100 |
| 6609 |
| 6610 xloop99: |
| 6611 pop edi |
| 6612 pop esi |
| 6613 ret |
| 6614 } |
| 6615 } |
| 6616 #endif // HAS_INTERPOLATEROW_SSE2 |
| 6617 |
// Vertically averages two rows of interleaved UV (or any bytes):
// dst_uv[i] = avg(src_uv[i], src_uv[i + src_uv_stride]), 16 bytes per
// iteration.  Aligned variant (movdqa); pix assumed a multiple of 16.
__declspec(naked) __declspec(align(16))
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // src_uv_stride
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // pix
    sub        edi, eax             // edi = dst - src; store via [eax + edi]

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    pavgb      xmm0, [eax + edx]    // rounded average with row below
    sub        ecx, 16
    movdqa     [eax + edi], xmm0
    lea        eax,  [eax + 16]
    jg         convertloop
    pop        edi
    ret
  }
}
| 6641 |
| 6642 #ifdef HAS_HALFROW_AVX2 |
// Vertically averages two rows of interleaved UV (or any bytes):
// dst_uv[i] = avg(src_uv[i], src_uv[i + src_uv_stride]), 32 bytes per
// iteration.  Unaligned loads/stores; pix assumed a multiple of 32.
__declspec(naked) __declspec(align(16))
void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // src_uv_stride
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // pix
    sub        edi, eax             // edi = dst - src; store via [eax + edi]

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vpavgb     ymm0, ymm0, [eax + edx]  // rounded average with row below
    sub        ecx, 32
    vmovdqu    [eax + edi], ymm0
    lea        eax,  [eax + 32]
    jg         convertloop

    pop        edi
    vzeroupper  // avoid AVX-SSE transition penalties in the caller
    ret
  }
}
| 6668 #endif // HAS_HALFROW_AVX2 |
| 6669 |
// Extracts one byte per ARGB pixel according to 'selector' (4 pshufb byte
// indices, broadcast across the register) to produce a Bayer row.
// Processes 8 ARGB pixels (32 bytes in, 8 bytes out) per iteration.
// Aligned variant (movdqa source loads); pix assumed a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
    movd       xmm5, [esp + 12]  // selector
    mov        ecx, [esp + 16]   // pix
    pshufd     xmm5, xmm5, 0     // broadcast selector to all 4 dwords

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5        // pick 1 byte from each of 4 pixels
    pshufb     xmm1, xmm5
    punpckldq  xmm0, xmm1        // gather the two 4-byte results
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop
    ret
  }
}
| 6695 |
// Specialized ARGB to Bayer that just isolates G channel.
// Processes 8 ARGB pixels (32 bytes in, 8 bytes out) per iteration.
// 'selector' is unused; pix assumed a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
                                 // selector
    mov        ecx, [esp + 16]   // pix
    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
    psrld      xmm5, 24

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 8           // Move green to bottom.
    psrld      xmm1, 8
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packssdw   xmm0, xmm1        // 8 green values as words
    packuswb   xmm0, xmm1        // low 8 bytes valid; only those are stored
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop
    ret
  }
}
| 6726 |
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 channels of each ARGB pixel per the 16-byte pshufb mask
// 'shuffler'.  Processes 8 pixels (32 bytes) per iteration.
// Aligned variant (movdqa); pix assumed a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // shuffler
    movdqa     xmm5, [ecx]       // load 16-byte shuffle mask
    mov        ecx, [esp + 16]   // pix

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop
    ret
  }
}
| 6753 |
// Same as ARGBShuffleRow_SSSE3 but with unaligned loads/stores (movdqu).
// Note the shuffler itself is still loaded with movdqa, so the mask must
// be 16-byte aligned.  pix assumed a multiple of 8.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // shuffler
    movdqa     xmm5, [ecx]       // load 16-byte shuffle mask
    mov        ecx, [esp + 16]   // pix

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    sub        ecx, 8
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop
    ret
  }
}
| 6779 |
| 6780 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
// AVX2 channel reorder: applies the 16-byte pshufb mask 'shuffler' to both
// 128-bit lanes (vbroadcastf128).  Processes 16 pixels (64 bytes) per
// iteration; pix assumed a multiple of 16.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]     // src_argb
    mov        edx, [esp + 8]     // dst_argb
    mov        ecx, [esp + 12]    // shuffler
    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
    mov        ecx, [esp + 16]    // pix

    align      4
  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    sub        ecx, 16
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    jg         wloop

    vzeroupper  // avoid AVX-SSE transition penalties in the caller
    ret
  }
}
| 6808 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
| 6809 |
// SSE2 channel reorder (no pshufb available).  Inspects the first 4 bytes
// of 'shuffler' and dispatches to one of four specialized pshuflw/pshufhw
// loops for the common channel orders; any other order falls back to a
// scalar byte-by-byte loop.  Fast paths do 4 pixels per iteration with
// unaligned loads/stores.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        esi, [esp + 8 + 12]  // shuffler
    mov        ecx, [esp + 8 + 16]  // pix
    pxor       xmm5, xmm5           // zero, for byte -> word widening

    mov        ebx, [esi]   // shuffler: first 4 mask bytes as a dword
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

  // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]        // mask index for channel 0
    movzx      ebx, byte ptr [eax + ebx]  // fetch selected source byte
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

    align      4
  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5    // widen bytes to words so pshuflw/hw can move them
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_0123
    jmp        shuf99

    align      4
  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_0321
    jmp        shuf99

    align      4
  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_2103
    jmp        shuf99

    align      4
  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}
| 6929 |
| 6930 // YUY2 - Macro-pixel = 2 image pixels |
| 6931 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
| 6932 |
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1....U2Y2V2Y3....U4Y4V4Y5....
| 6935 |
// Interleaves planar I422 (16 Y, 8 U, 8 V) into 16 YUY2 macro pixels
// (32 bytes: Y0 U0 Y1 V0 ...).  width assumed a multiple of 16;
// unaligned loads/stores.
__declspec(naked) __declspec(align(16))
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi              // edx = v - u; read V via [esi + edx]

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi] // U
    movq       xmm3, qword ptr [esi + edx] // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3 // UV
    movdqu     xmm0, [eax] // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2 // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 6973 |
// Interleaves planar I422 (16 Y, 8 U, 8 V) into 16 UYVY macro pixels
// (32 bytes: U0 Y0 V0 Y1 ...).  width assumed a multiple of 16;
// unaligned loads/stores.
__declspec(naked) __declspec(align(16))
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi              // edx = v - u; read V via [esi + edx]

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi] // U
    movq       xmm3, qword ptr [esi + edx] // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3 // UV
    movdqu     xmm0, [eax] // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0 // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
| 7011 |
| 7012 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
// Applies a cubic polynomial C0 + C1*x + C2*x^2 + C3*x^3 to every channel
// of every ARGB pixel.  'poly' points to 4 consecutive vec4f coefficients
// (C0 at +0, C1 at +16, C2 at +32, C3 at +48), one float per channel.
// Results are truncated and saturated back to bytes.  2 pixels per
// iteration; width assumed a multiple of 2.
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
    align      4
  convertloop:
//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0  // truncate floats back to ints
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4  // saturate to bytes
    packuswb   xmm0, xmm0
    sub        ecx, 2
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         convertloop
    pop        esi
    ret
  }
}
| 7070 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
| 7071 |
| 7072 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
// AVX2 + FMA3 version of ARGBPolynomialRow: evaluates the per-channel
// cubic C0 + C1*x + C2*x^2 + C3*x^3 with fused multiply-adds, 2 pixels
// (8 channels) per iteration.  'poly' layout matches the SSE2 version.
// width assumed a multiple of 2.  Requires FMA3 as well as AVX2.
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]   /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
    align      4
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0        // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    sub         ecx, 2
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    jg          convertloop
    vzeroupper  // avoid AVX-SSE transition penalties in the caller
    ret
  }
}
| 7110 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
| 7111 |
| 7112 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
// Transform ARGB pixels with color table.
// Remaps each B, G, R and A byte in place through 'table_argb', a
// 256-entry table with 4 bytes per entry: new = table[old * 4 + channel].
// Plain x86 scalar loop, 1 pixel per iteration.
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]            // B
    lea        eax, [eax + 4]                  // advance; pixel now at eax - 4
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]     // G
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]     // R
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]     // A
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
| 7145 #endif // HAS_ARGBCOLORTABLEROW_X86 |
| 7146 |
| 7147 #ifdef HAS_RGBCOLORTABLEROW_X86 |
// Transform RGB pixels with color table.
// Like ARGBColorTableRow_X86, but remaps only B, G and R through
// 'table_argb' (new = table[old * 4 + channel]); alpha is left untouched.
// Plain x86 scalar loop, 1 pixel per iteration.
__declspec(naked) __declspec(align(16))
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]            // B
    lea        eax, [eax + 4]                  // advance; pixel now at eax - 4
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]     // G
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]     // R
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
| 7177 #endif // HAS_RGBCOLORTABLEROW_X86 |
| 7178 |
| 7179 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
// Transform RGB pixels with luma table.
| 7181 __declspec(naked) __declspec(align(16)) |
| 7182 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
| 7183 int width, |
| 7184 const uint8* luma, uint32 lumacoeff) { |
| 7185 __asm { |
| 7186 push esi |
| 7187 push edi |
| 7188 mov eax, [esp + 8 + 4] /* src_argb */ |
| 7189 mov edi, [esp + 8 + 8] /* dst_argb */ |
| 7190 mov ecx, [esp + 8 + 12] /* width */ |
| 7191 movd xmm2, dword ptr [esp + 8 + 16] // luma table |
| 7192 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff |
| 7193 pshufd xmm2, xmm2, 0 |
| 7194 pshufd xmm3, xmm3, 0 |
| 7195 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 |
| 7196 psllw xmm4, 8 |
| 7197 pxor xmm5, xmm5 |
| 7198 |
| 7199 // 4 pixel loop. |
| 7200 align 4 |
| 7201 convertloop: |
| 7202 movdqu xmm0, qword ptr [eax] // generate luma ptr |
| 7203 pmaddubsw xmm0, xmm3 |
| 7204 phaddw xmm0, xmm0 |
| 7205 pand xmm0, xmm4 // mask out low bits |
| 7206 punpcklwd xmm0, xmm5 |
| 7207 paddd xmm0, xmm2 // add table base |
| 7208 movd esi, xmm0 |
| 7209 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
| 7210 |
| 7211 movzx edx, byte ptr [eax] |
| 7212 movzx edx, byte ptr [esi + edx] |
| 7213 mov byte ptr [edi], dl |
| 7214 movzx edx, byte ptr [eax + 1] |
| 7215 movzx edx, byte ptr [esi + edx] |
| 7216 mov byte ptr [edi + 1], dl |
| 7217 movzx edx, byte ptr [eax + 2] |
| 7218 movzx edx, byte ptr [esi + edx] |
| 7219 mov byte ptr [edi + 2], dl |
| 7220 movzx edx, byte ptr [eax + 3] // copy alpha. |
| 7221 mov byte ptr [edi + 3], dl |
| 7222 |
| 7223 movd esi, xmm0 |
| 7224 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
| 7225 |
| 7226 movzx edx, byte ptr [eax + 4] |
| 7227 movzx edx, byte ptr [esi + edx] |
| 7228 mov byte ptr [edi + 4], dl |
| 7229 movzx edx, byte ptr [eax + 5] |
| 7230 movzx edx, byte ptr [esi + edx] |
| 7231 mov byte ptr [edi + 5], dl |
| 7232 movzx edx, byte ptr [eax + 6] |
| 7233 movzx edx, byte ptr [esi + edx] |
| 7234 mov byte ptr [edi + 6], dl |
| 7235 movzx edx, byte ptr [eax + 7] // copy alpha. |
| 7236 mov byte ptr [edi + 7], dl |
| 7237 |
| 7238 movd esi, xmm0 |
| 7239 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
| 7240 |
| 7241 movzx edx, byte ptr [eax + 8] |
| 7242 movzx edx, byte ptr [esi + edx] |
| 7243 mov byte ptr [edi + 8], dl |
| 7244 movzx edx, byte ptr [eax + 9] |
| 7245 movzx edx, byte ptr [esi + edx] |
| 7246 mov byte ptr [edi + 9], dl |
| 7247 movzx edx, byte ptr [eax + 10] |
| 7248 movzx edx, byte ptr [esi + edx] |
| 7249 mov byte ptr [edi + 10], dl |
| 7250 movzx edx, byte ptr [eax + 11] // copy alpha. |
| 7251 mov byte ptr [edi + 11], dl |
| 7252 |
| 7253 movd esi, xmm0 |
| 7254 |
| 7255 movzx edx, byte ptr [eax + 12] |
| 7256 movzx edx, byte ptr [esi + edx] |
| 7257 mov byte ptr [edi + 12], dl |
| 7258 movzx edx, byte ptr [eax + 13] |
| 7259 movzx edx, byte ptr [esi + edx] |
| 7260 mov byte ptr [edi + 13], dl |
| 7261 movzx edx, byte ptr [eax + 14] |
| 7262 movzx edx, byte ptr [esi + edx] |
| 7263 mov byte ptr [edi + 14], dl |
| 7264 movzx edx, byte ptr [eax + 15] // copy alpha. |
| 7265 mov byte ptr [edi + 15], dl |
| 7266 |
| 7267 sub ecx, 4 |
| 7268 lea eax, [eax + 16] |
| 7269 lea edi, [edi + 16] |
| 7270 jg convertloop |
| 7271 |
| 7272 pop edi |
| 7273 pop esi |
| 7274 ret |
| 7275 } |
| 7276 } |
| 7277 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 7278 |
| 7279 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
| 7280 |
| 7281 #ifdef __cplusplus |
| 7282 } // extern "C" |
| 7283 } // namespace libyuv |
| 7284 #endif |
OLD | NEW |