OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "third_party/libyuv/include/libyuv/row.h" |
| 12 |
| 13 #ifdef __cplusplus |
| 14 namespace libyuv { |
| 15 extern "C" { |
| 16 #endif |
| 17 |
| 18 // This module is for GCC x86 and x64. |
| 19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) |
| 20 |
// Shuffle and multiply-add tables used by the SSSE3 scale-down kernels below.
// In the shuffle tables a value of 128 sets the pshufb destination byte to 0.

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 filters: added before the >>2 so the
// weighted sum rounds to nearest instead of truncating.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
| 94 |
| 95 // GCC versions of row functions are verbatim conversions from Visual C. |
| 96 // Generated using gcc disassembly on Visual C object file: |
| 97 // objdump -D yuvscaler.obj >yuvscaler.txt |
| 98 |
// Halve a row by point sampling: keeps the high byte of every 16-bit pair
// (psrlw $8 + packuswb), i.e. every second source byte. Reads 32 source
// bytes and writes 16 per iteration; src_stride is unused. Uses movdqa, so
// src_ptr and dst_ptr must be 16-byte aligned, and dst_width should be a
// multiple of 16.
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 124 |
| 125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 126 uint8* dst_ptr, int dst_width) { |
| 127 asm volatile ( |
| 128 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 129 "psrlw $0x8,%%xmm5 \n" |
| 130 |
| 131 LABELALIGN |
| 132 "1: \n" |
| 133 "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
| 134 "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" |
| 135 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 136 "movdqa %%xmm0,%%xmm2 \n" |
| 137 "psrlw $0x8,%%xmm0 \n" |
| 138 "movdqa %%xmm1,%%xmm3 \n" |
| 139 "psrlw $0x8,%%xmm1 \n" |
| 140 "pand %%xmm5,%%xmm2 \n" |
| 141 "pand %%xmm5,%%xmm3 \n" |
| 142 "pavgw %%xmm2,%%xmm0 \n" |
| 143 "pavgw %%xmm3,%%xmm1 \n" |
| 144 "packuswb %%xmm1,%%xmm0 \n" |
| 145 "movdqa %%xmm0," MEMACCESS(1) " \n" |
| 146 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 147 "sub $0x10,%2 \n" |
| 148 "jg 1b \n" |
| 149 : "+r"(src_ptr), // %0 |
| 150 "+r"(dst_ptr), // %1 |
| 151 "+r"(dst_width) // %2 |
| 152 : |
| 153 : "memory", "cc" |
| 154 #if defined(__SSE2__) |
| 155 , "xmm0", "xmm1", "xmm5" |
| 156 #endif |
| 157 ); |
| 158 } |
| 159 |
// Halve a row with a 2x2 box filter: averages the current row with the row at
// src_ptr + src_stride (pavgb), then averages each horizontal byte pair.
// Reads 2x32 aligned source bytes, writes 16 per iteration. dst_width should
// be a multiple of 16.
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0x00ff mask
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"  // horizontal average
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 202 |
// Same as ScaleRowDown2_SSE2 (point sample every second byte) but uses
// movdqu, so neither src_ptr nor dst_ptr needs 16-byte alignment.
// src_stride is unused; dst_width should be a multiple of 16.
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 228 |
| 229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, |
| 230 ptrdiff_t src_stride, |
| 231 uint8* dst_ptr, int dst_width) { |
| 232 asm volatile ( |
| 233 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 234 "psrlw $0x8,%%xmm5 \n" |
| 235 |
| 236 LABELALIGN |
| 237 "1: \n" |
| 238 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 239 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 240 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 241 "movdqa %%xmm0,%%xmm2 \n" |
| 242 "psrlw $0x8,%%xmm0 \n" |
| 243 "movdqa %%xmm1,%%xmm3 \n" |
| 244 "psrlw $0x8,%%xmm1 \n" |
| 245 "pand %%xmm5,%%xmm2 \n" |
| 246 "pand %%xmm5,%%xmm3 \n" |
| 247 "pavgw %%xmm2,%%xmm0 \n" |
| 248 "pavgw %%xmm3,%%xmm1 \n" |
| 249 "packuswb %%xmm1,%%xmm0 \n" |
| 250 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| 251 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 252 "sub $0x10,%2 \n" |
| 253 "jg 1b \n" |
| 254 : "+r"(src_ptr), // %0 |
| 255 "+r"(dst_ptr), // %1 |
| 256 "+r"(dst_width) // %2 |
| 257 : |
| 258 : "memory", "cc" |
| 259 #if defined(__SSE2__) |
| 260 , "xmm0", "xmm1", "xmm5" |
| 261 #endif |
| 262 ); |
| 263 } |
| 264 |
// Same as ScaleRowDown2Box_SSE2 (2x2 box filter) but uses movdqu for loads
// and stores, so no 16-byte alignment is required. Reads 2x32 source bytes
// (current row and row at src_stride), writes 16 per iteration.
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0x00ff mask
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"  // horizontal average
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
| 308 |
// Quarter a row by point sampling: masks one byte out of every dword
// (mask 0x00ff0000 built in xmm5), then packs twice to keep every fourth
// byte. Reads 32 aligned source bytes and writes 8 per iteration;
// src_stride is unused. dst_width should be a multiple of 8.
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // build 0x00ff0000 mask
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
| 340 |
// Quarter a row with a 4x4 box filter: averages four source rows (stride,
// stride*2 and stride*3 offsets) pairwise with pavgb, then averages
// horizontally twice. Reads 4x32 aligned source bytes, writes 8 per
// iteration. dst_width should be a multiple of 8.
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;  // computed below as src_stride * 3
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // xmm7 = 0x00ff mask
    "psrlw     $0x8,%%xmm7                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"  // %3 = stride * 3

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "pavgb     %%xmm2,%%xmm0                   \n"  // rows 0+1
    "pavgb     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
    MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm4,%%xmm2                   \n"  // rows 2+3
    "pavgb     %%xmm2,%%xmm0                   \n"  // combine all four rows
    "pavgb     %%xmm5,%%xmm3                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"  // first horizontal pass
    "psrlw     $0x8,%%xmm0                     \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pand      %%xmm7,%%xmm3                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "pavgw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"  // second horizontal pass
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "pavgw     %%xmm2,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width), // %2
    "+r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
  );
}
| 400 |
// 3/4 point-sample scaler: shuffles 32 source bytes down to 24 output bytes
// using the kShuf0/1/2 tables. src_stride is unused; dst_width should be a
// multiple of 24.
// NOTE(review): the shuffle constants are loaded into xmm3-xmm5 in a separate
// asm statement with no clobber list; this relies on the compiler not
// touching those registers between the two statements — confirm against the
// project's other row functions before restructuring.
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"  // bytes 8..23 into xmm1
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
| 439 |
// 3/4 box scaler, variant weighting the two rows equally: averages the
// current row with the row at src_stride (pavgb), then applies the kShuf/
// kMadd 3-tap horizontal filters with kRound34 rounding (>>2). Produces 24
// output bytes per 32 input bytes; dst_width should be a multiple of 24.
// Shuffle and madd constants are preloaded into xmm0-xmm5 by the two
// preceding asm statements (no clobber lists — same preload pattern as
// ScaleRowDown34_SSSE3).
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"  // vertical average
    "pshufb    %%xmm2,%%xmm6                   \n"  // output bytes 0..7
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"  // round
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"  // output bytes 8..15
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"  // output bytes 16..23
    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 from memory
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 510 |
// 3/4 box scaler, variant weighting the current row more heavily: the double
// pavgb (row1 = avg(row0,row1); row0 = avg(row0,row1)) gives an effective
// 3:1 row blend before the same kShuf/kMadd horizontal filtering used by
// ScaleRowDown34_1_Box_SSSE3. Produces 24 output bytes per 32 input bytes;
// dst_width should be a multiple of 24.
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"  // 3:1 weighted row blend
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"  // output bytes 0..7
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"  // round
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"  // output bytes 8..15
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"  // output bytes 16..23
    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21 from memory
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 584 |
// 3/8 point-sample scaler: shuffles every third byte out of 32 source bytes
// via kShuf38a/kShuf38b and merges with paddusb, producing 12 output bytes
// per iteration. src_stride is unused; dst_width should be a multiple of 12.
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kShuf38a
    "movdqa    %4,%%xmm5                       \n"  // kShuf38b

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // merge the two halves
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // write 8 + 4 bytes
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm4", "xmm5"
#endif
  );
}
| 616 |
// 3/8 box scaler over 2 rows: averages the current row with the row at
// src_stride, gathers each 3-byte group via kShufAb0/1/2, sums the three
// taps with paddusw, and normalizes with pmulhuw by kScaleAb2. Reads 16
// source bytes, writes 6 output bytes per iteration; dst_width should be a
// multiple of 6.
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShufAb0
    "movdqa    %1,%%xmm3                       \n"  // kShufAb1
    "movdqa    %2,%%xmm4                       \n"  // kShufAb2
    "movdqa    %3,%%xmm5                       \n"  // kScaleAb2
  :
  : "m"(kShufAb0),  // %0
    "m"(kShufAb1),  // %1
    "m"(kShufAb2),  // %2
    "m"(kScaleAb2)  // %3
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb  (%0,%3,1),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"  // first tap per pixel
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"  // second tap
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"  // third tap
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // scale sums to averages
    "packuswb  %%xmm1,%%xmm1                   \n"
    "sub       $0x6,%2                         \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"  // write 4 + 2 bytes
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 665 |
// 3/8 box scaler over 3 rows: widens three source rows to words
// (punpcklbw against zeroed xmm5), sums them vertically with paddusw, sums
// each 3-column group by shifted self-adds, then rearranges via
// kShufAc/kShufAc3 and normalizes with pmulhuw by kScaleAc33. Reads 16
// source bytes per row, writes 6 output bytes per iteration; dst_width
// should be a multiple of 6.
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShufAc
    "movdqa    %1,%%xmm3                       \n"  // kShufAc3
    "movdqa    %2,%%xmm4                       \n"  // kScaleAc33
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack
  :
  : "m"(kShufAc),   // %0
    "m"(kShufAc3),  // %1
    "m"(kScaleAc33) // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen bytes to words
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"  // rows 0+1
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"  // + row 2
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"  // sum 3 adjacent columns
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"  // words 0,3,6 -> 0,1,2
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"  // words 0,3,6 -> 3,4,5
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"  // scale sums to averages
    "packuswb  %%xmm6,%%xmm6                   \n"
    "sub       $0x6,%2                         \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"  // write 4 + 2 bytes
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
| 733 |
// Vertically sums src_height rows of bytes into 16-bit accumulators:
// for each group of 16 source columns, widens to words and paddusw-adds
// every row, writing 16 uint16 sums (32 bytes) to dst_ptr. src_width should
// be a multiple of 16. Inner loop (label 2) handles rows beyond the first;
// label 3 stores the accumulated column sums.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;      // per-column-group row counter
  intptr_t tmp_src = 0;    // saved top-of-column pointer
  asm volatile (
    "pxor      %%xmm4,%%xmm4                   \n"  // zero for unpack
    "sub       $0x1,%5                         \n"  // remaining rows after first

    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // first row seeds the sum
    "mov       %0,%3                           \n"  // remember column start
    "add       %6,%0                           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "mov       %5,%2                           \n"
    "test      %2,%2                           \n"
    "je        3f                              \n"  // only one row

    LABELALIGN
  "2:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"  // add remaining rows
    "add       %6,%0                           \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "sub       $0x1,%2                         \n"
    "jg        2b                              \n"

    LABELALIGN
  "3:                                          \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 word sums
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x10,3) ",%0           \n"  // next 16 columns
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%4                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
| 787 |
// Bilinear column filtering. SSSE3 version.
// Walks x in 16.16 fixed point by dx per output pixel, loading the two
// neighbouring source bytes and blending them with pmaddubsw using the
// fractional weights (top 7 bits of the fraction). The main loop (label 2)
// produces 2 pixels per iteration; label 29 handles an odd final pixel.
// temp_pixel is pinned to eax ("+a") because the code uses %b2/%w2 subregister
// stores.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd      %6,%%xmm2                       \n"  // x
    "movd      %7,%%xmm3                       \n"  // dx
    "movl      $0x04040000,%k2                 \n"  // shuffle for weights
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"  // 0x007f fraction mask
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = integer part of x
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"  // set up x, x+dx pair
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // dx *= 2 (2 px/iter)
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"  // advance x pair
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"  // src[x0], src[x0+1]
    "psrlw     $0x9,%%xmm1                     \n"  // fractional weights
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"  // src[x1], src[x1+1]
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"  // w, 128-w pairs
    "pmaddubsw %%xmm1,%%xmm0                   \n"  // blend neighbours
    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0/x1
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 pixels
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "sub       $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"  // one pixel left?
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"  // store final pixel
  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),   // %6
    "rm"(dx)   // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
| 865 |
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// Doubles each byte horizontally via punpcklbw/punpckhbw against itself;
// reads 16 bytes, writes 32 per iteration. x and dx are unused (fixed 2x).
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate low 8 bytes
    "punpckhbw %%xmm1,%%xmm1                   \n"  // duplicate high 8 bytes
    "sub       $0x20,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 894 |
// Halve an ARGB row by point sampling: shufps 0xdd keeps the odd-indexed
// 32-bit pixels of each 8-pixel group. Reads 8 aligned ARGB pixels
// (32 bytes), writes 4 (16 bytes) per iteration; src_stride is unused.
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // keep odd pixels
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 919 |
| 920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, |
| 921 ptrdiff_t src_stride, |
| 922 uint8* dst_argb, int dst_width) { |
| 923 asm volatile ( |
| 924 LABELALIGN |
| 925 "1: \n" |
| 926 "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
| 927 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 928 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 929 "movdqa %%xmm0,%%xmm2 \n" |
| 930 "shufps $0x88,%%xmm1,%%xmm0 \n" |
| 931 "shufps $0xdd,%%xmm1,%%xmm2 \n" |
| 932 "pavgb %%xmm2,%%xmm0 \n" |
| 933 "sub $0x4,%2 \n" |
| 934 "movdqa %%xmm0," MEMACCESS(1) " \n" |
| 935 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 936 "jg 1b \n" |
| 937 : "+r"(src_argb), // %0 |
| 938 "+r"(dst_argb), // %1 |
| 939 "+r"(dst_width) // %2 |
| 940 : |
| 941 : "memory", "cc" |
| 942 #if defined(__SSE2__) |
| 943 , "xmm0", "xmm1" |
| 944 #endif |
| 945 ); |
| 946 } |
| 947 |
// Halve an ARGB row with a 2x2 box filter: averages the current row with the
// row at src_stride (pavgb), then averages horizontal pixel pairs via the
// shufps 0x88/0xdd split. Reads 2x8 aligned ARGB pixels, writes 4 per
// iteration.
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"  // vertical average
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
    "pavgb     %%xmm2,%%xmm0                   \n"  // horizontal average
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
| 983 |
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
// Point-samples every src_stepx-th ARGB pixel: gathers 4 dwords at byte
// offsets 0, step*4, step*8, step*12 and packs them with punpckldq/
// punpcklqdq. Writes 4 pixels per iteration; src_stride is unused.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;  // computed below as step * 12 bytes
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // %1 = step * 4 bytes
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // %4 = step * 12 bytes
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd  (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd  (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd  (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // advance 4 steps
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "+r"(src_stepx_x12)   // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
| 1024 |
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
// Like ScaleARGBRowDownEven_SSE2 but each sampled pixel is averaged with its
// right neighbor and with the matching pair on the next row (2x2 box filter).
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  // Byte strides: %1 = src_stepx * 4, %4 = src_stepx * 12;
  // %5 = pointer to the second source row (src_argb + src_stride).
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"

    LABELALIGN
  "1: \n"
    // Row 0: movq loads a horizontal pair (pixel + right neighbor) at each
    // of the 4 sample positions; movhps packs the next pair alongside it.
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    // Row 1: same 4 pairs from the second row.
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    // Vertical average, then horizontal average of each pair.
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_stepx_x4), // %1
    "+r"(dst_argb), // %2
    "+rm"(dst_width), // %3
    "+r"(src_stepx_x12), // %4
    "+r"(row1) // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
| 1078 |
// Point-samples ARGB pixels along a row using 16.16 fixed-point positions:
// dst_argb[i] = src_argb[(x + i * dx) >> 16].  Handles any dst_width >= 0:
// the main loop emits 4 pixels per iteration, then 2- and 1-pixel tails.
// x0/x1 are pinned to eax/edx so pextrw can extract pixel indices into them.
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    // Build xmm2 = {x, x+dx, x+2dx, x+3dx} and xmm3 = {4dx, 4dx, 4dx, 4dx}.
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n" // {0, dx, 0, dx}
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n" // {0, 0, 2dx, 2dx}
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    // pextrw of the odd 16-bit lanes yields the integer part (index >> 16).
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    LABELALIGN
  "40: \n"
    // Main loop: gather 4 pixels at the 4 current indices.
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n" // advance positions by 4*dx
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "sub $0x4,%4 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 40b \n"

  "49: \n"
    // Tail: 2 remaining pixels.
    "test $0x2,%4 \n"
    "je 29f \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
  "29: \n"
    // Tail: 1 remaining pixel.
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
  "99: \n"
  : "+a"(x0), // %0
    "+d"(x1), // %1
    "+r"(dst_argb), // %2
    "+r"(src_argb), // %3
    "+r"(dst_width) // %4
  : "rm"(x), // %5
    "rm"(dx) // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
| 1151 |
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// Specialized 2x horizontal up-scaler: x and dx are accepted for signature
// compatibility but are not used (the step is a fixed 1/2 pixel).  Assumes
// dst_width is a positive multiple of 8 -- TODO confirm against callers.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    // punpckldq/punpckhdq duplicate each 32-bit pixel in place.
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_argb), // %0
    "+r"(src_argb), // %1
    "+r"(dst_width) // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
| 1183 |
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
// Interleaves the channels of two adjacent ARGB pixels so that each
// byte-pair fed to pmaddubsw holds the same channel from both pixels.
// Used by ScaleARGBFilterCols_SSSE3.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
// (one fraction per output pixel, broadcast across its 8 channel bytes).
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
| 1194 |
| 1195 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version |
| 1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, |
| 1197 int dst_width, int x, int dx) { |
| 1198 intptr_t x0 = 0, x1 = 0; |
| 1199 asm volatile ( |
| 1200 "movdqa %0,%%xmm4 \n" |
| 1201 "movdqa %1,%%xmm5 \n" |
| 1202 : |
| 1203 : "m"(kShuffleColARGB), // %0 |
| 1204 "m"(kShuffleFractions) // %1 |
| 1205 ); |
| 1206 |
| 1207 asm volatile ( |
| 1208 "movd %5,%%xmm2 \n" |
| 1209 "movd %6,%%xmm3 \n" |
| 1210 "pcmpeqb %%xmm6,%%xmm6 \n" |
| 1211 "psrlw $0x9,%%xmm6 \n" |
| 1212 "pextrw $0x1,%%xmm2,%k3 \n" |
| 1213 "sub $0x2,%2 \n" |
| 1214 "jl 29f \n" |
| 1215 "movdqa %%xmm2,%%xmm0 \n" |
| 1216 "paddd %%xmm3,%%xmm0 \n" |
| 1217 "punpckldq %%xmm0,%%xmm2 \n" |
| 1218 "punpckldq %%xmm3,%%xmm3 \n" |
| 1219 "paddd %%xmm3,%%xmm3 \n" |
| 1220 "pextrw $0x3,%%xmm2,%k4 \n" |
| 1221 |
| 1222 LABELALIGN |
| 1223 "2: \n" |
| 1224 "movdqa %%xmm2,%%xmm1 \n" |
| 1225 "paddd %%xmm3,%%xmm2 \n" |
| 1226 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 |
| 1227 "psrlw $0x9,%%xmm1 \n" |
| 1228 BUNDLEALIGN |
| 1229 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 |
| 1230 "pshufb %%xmm5,%%xmm1 \n" |
| 1231 "pshufb %%xmm4,%%xmm0 \n" |
| 1232 "pxor %%xmm6,%%xmm1 \n" |
| 1233 "pmaddubsw %%xmm1,%%xmm0 \n" |
| 1234 "psrlw $0x7,%%xmm0 \n" |
| 1235 "pextrw $0x1,%%xmm2,%k3 \n" |
| 1236 "pextrw $0x3,%%xmm2,%k4 \n" |
| 1237 "packuswb %%xmm0,%%xmm0 \n" |
| 1238 "movq %%xmm0," MEMACCESS(0) " \n" |
| 1239 "lea " MEMLEA(0x8,0) ",%0 \n" |
| 1240 "sub $0x2,%2 \n" |
| 1241 "jge 2b \n" |
| 1242 |
| 1243 LABELALIGN |
| 1244 "29: \n" |
| 1245 "add $0x1,%2 \n" |
| 1246 "jl 99f \n" |
| 1247 "psrlw $0x9,%%xmm2 \n" |
| 1248 BUNDLEALIGN |
| 1249 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 |
| 1250 "pshufb %%xmm5,%%xmm2 \n" |
| 1251 "pshufb %%xmm4,%%xmm0 \n" |
| 1252 "pxor %%xmm6,%%xmm2 \n" |
| 1253 "pmaddubsw %%xmm2,%%xmm0 \n" |
| 1254 "psrlw $0x7,%%xmm0 \n" |
| 1255 "packuswb %%xmm0,%%xmm0 \n" |
| 1256 "movd %%xmm0," MEMACCESS(0) " \n" |
| 1257 |
| 1258 LABELALIGN |
| 1259 "99: \n" |
| 1260 : "+r"(dst_argb), // %0 |
| 1261 "+r"(src_argb), // %1 |
| 1262 "+rm"(dst_width), // %2 |
| 1263 "+r"(x0), // %3 |
| 1264 "+r"(x1) // %4 |
| 1265 : "rm"(x), // %5 |
| 1266 "rm"(dx) // %6 |
| 1267 : "memory", "cc" |
| 1268 #if defined(__native_client__) && defined(__x86_64__) |
| 1269 , "r14" |
| 1270 #endif |
| 1271 #if defined(__SSE2__) |
| 1272 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 1273 #endif |
| 1274 ); |
| 1275 } |
| 1276 |
// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div with a 64-bit dividend in edx:eax, so the
// shifted numerator cannot overflow before the divide.  Behavior is
// undefined if div is 0 or the quotient overflows 32 bits (idiv faults).
// Fix: removed the trailing "mov %0, %%eax" -- %0 is pinned to eax by the
// "+a" constraint, so it assembled to the no-op "mov %eax,%eax".
// __asm__ (rather than asm) keeps this valid under strict ISO modes.
int FixedDiv_X86(int num, int div) {
  __asm__ volatile (
    "cdq \n"                    // sign-extend eax (num) into edx
    "shld $0x10,%%eax,%%edx \n" // edx:eax = (int64)num << 16 (high half)
    "shl $0x10,%%eax \n"        // ... low half
    "idiv %1 \n"                // eax = (num << 16) / div
  : "+a"(num) // %0
  : "c"(div)  // %1
  : "memory", "cc", "edx"
  );
  return num;
}
| 1291 |
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Computes ((num << 16) - 0x10001) / (div - 1) with a 64-bit dividend.
// Behavior is undefined if div is 1 (divide by zero) or the quotient
// overflows 32 bits (idiv faults).
// Fixes: (1) div is now an in/out operand ("+c") -- the asm modifies %1
// with "sub $0x1,%1", which is invalid for a pure input operand and let the
// compiler assume ecx was unchanged; (2) removed the trailing no-op
// "mov %0, %%eax" (%0 is already eax via "+a").  __asm__ keeps this valid
// under strict ISO modes.
int FixedDiv1_X86(int num, int div) {
  __asm__ volatile (
    "cdq \n"                    // sign-extend eax (num) into edx
    "shld $0x10,%%eax,%%edx \n" // edx:eax = (int64)num << 16 (high half)
    "shl $0x10,%%eax \n"        // ... low half
    "sub $0x10001,%%eax \n"     // 64-bit subtract of 0x10001 ...
    "sbb $0x0,%%edx \n"         // ... with borrow into the high half
    "sub $0x1,%1 \n"            // div - 1 (clobbers %1; hence "+c")
    "idiv %1 \n"                // eax = ((num << 16) - 0x10001) / (div - 1)
  : "+a"(num), // %0
    "+c"(div)  // %1
  :
  : "memory", "cc", "edx"
  );
  return num;
}
| 1309 |
| 1310 #endif // defined(__x86_64__) || defined(__i386__) |
| 1311 |
| 1312 #ifdef __cplusplus |
| 1313 } // extern "C" |
| 1314 } // namespace libyuv |
| 1315 #endif |
OLD | NEW |