OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 268 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
279 : "+r"(src_ptr), // %0 | 279 : "+r"(src_ptr), // %0 |
280 "+r"(dst_ptr), // %1 | 280 "+r"(dst_ptr), // %1 |
281 "+r"(dst_width) // %2 | 281 "+r"(dst_width) // %2 |
282 : "r"((intptr_t)(src_stride)) // %3 | 282 : "r"((intptr_t)(src_stride)) // %3 |
283 : "memory", "cc", NACL_R14 | 283 : "memory", "cc", NACL_R14 |
284 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 284 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
285 ); | 285 ); |
286 } | 286 } |
287 #endif // HAS_SCALEROWDOWN2_AVX2 | 287 #endif // HAS_SCALEROWDOWN2_AVX2 |
288 | 288 |
289 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 289 void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
290 uint8* dst_ptr, int dst_width) { | 290 uint8* dst_ptr, int dst_width) { |
291 asm volatile ( | 291 asm volatile ( |
292 "pcmpeqb %%xmm5,%%xmm5 \n" | 292 "pcmpeqb %%xmm5,%%xmm5 \n" |
293 "psrld $0x18,%%xmm5 \n" | 293 "psrld $0x18,%%xmm5 \n" |
294 "pslld $0x10,%%xmm5 \n" | 294 "pslld $0x10,%%xmm5 \n" |
295 | 295 |
296 LABELALIGN | 296 LABELALIGN |
297 "1: \n" | 297 "1: \n" |
298 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 298 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
299 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 299 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
300 "lea " MEMLEA(0x20,0) ",%0 \n" | 300 "lea " MEMLEA(0x20,0) ",%0 \n" |
301 "pand %%xmm5,%%xmm0 \n" | 301 "pand %%xmm5,%%xmm0 \n" |
302 "pand %%xmm5,%%xmm1 \n" | 302 "pand %%xmm5,%%xmm1 \n" |
303 "packuswb %%xmm1,%%xmm0 \n" | 303 "packuswb %%xmm1,%%xmm0 \n" |
304 "psrlw $0x8,%%xmm0 \n" | 304 "psrlw $0x8,%%xmm0 \n" |
305 "packuswb %%xmm0,%%xmm0 \n" | 305 "packuswb %%xmm0,%%xmm0 \n" |
306 "movq %%xmm0," MEMACCESS(1) " \n" | 306 "movq %%xmm0," MEMACCESS(1) " \n" |
307 "lea " MEMLEA(0x8,1) ",%1 \n" | 307 "lea " MEMLEA(0x8,1) ",%1 \n" |
308 "sub $0x8,%2 \n" | 308 "sub $0x8,%2 \n" |
309 "jg 1b \n" | 309 "jg 1b \n" |
310 : "+r"(src_ptr), // %0 | 310 : "+r"(src_ptr), // %0 |
311 "+r"(dst_ptr), // %1 | 311 "+r"(dst_ptr), // %1 |
312 "+r"(dst_width) // %2 | 312 "+r"(dst_width) // %2 |
313 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 313 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
314 ); | 314 ); |
315 } | 315 } |
316 | 316 |
317 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 317 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
318 uint8* dst_ptr, int dst_width) { | 318 uint8* dst_ptr, int dst_width) { |
319 intptr_t stridex3 = 0; | 319 intptr_t stridex3 = 0; |
320 asm volatile ( | 320 asm volatile ( |
321 "pcmpeqb %%xmm7,%%xmm7 \n" | 321 "pcmpeqb %%xmm4,%%xmm4 \n" |
322 "psrlw $0x8,%%xmm7 \n" | 322 "psrlw $0xf,%%xmm4 \n" |
| 323 "movdqa %%xmm4,%%xmm5 \n" |
| 324 "packuswb %%xmm4,%%xmm4 \n" |
| 325 "psllw $0x3,%%xmm5 \n" |
323 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" | 326 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" |
324 | 327 |
325 LABELALIGN | 328 LABELALIGN |
326 "1: \n" | 329 "1: \n" |
327 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 330 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
328 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 331 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
329 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 332 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
330 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 333 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
331 "pavgb %%xmm2,%%xmm0 \n" | 334 "pmaddubsw %%xmm4,%%xmm0 \n" |
332 "pavgb %%xmm3,%%xmm1 \n" | 335 "pmaddubsw %%xmm4,%%xmm1 \n" |
| 336 "pmaddubsw %%xmm4,%%xmm2 \n" |
| 337 "pmaddubsw %%xmm4,%%xmm3 \n" |
| 338 "paddw %%xmm2,%%xmm0 \n" |
| 339 "paddw %%xmm3,%%xmm1 \n" |
333 MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 | 340 MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 |
334 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 | 341 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 |
335 MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 | 342 "pmaddubsw %%xmm4,%%xmm2 \n" |
336 MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 | 343 "pmaddubsw %%xmm4,%%xmm3 \n" |
| 344 "paddw %%xmm2,%%xmm0 \n" |
| 345 "paddw %%xmm3,%%xmm1 \n" |
| 346 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 |
| 347 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 |
337 "lea " MEMLEA(0x20,0) ",%0 \n" | 348 "lea " MEMLEA(0x20,0) ",%0 \n" |
338 "pavgb %%xmm4,%%xmm2 \n" | 349 "pmaddubsw %%xmm4,%%xmm2 \n" |
339 "pavgb %%xmm2,%%xmm0 \n" | 350 "pmaddubsw %%xmm4,%%xmm3 \n" |
340 "pavgb %%xmm5,%%xmm3 \n" | 351 "paddw %%xmm2,%%xmm0 \n" |
341 "pavgb %%xmm3,%%xmm1 \n" | 352 "paddw %%xmm3,%%xmm1 \n" |
342 "movdqa %%xmm0,%%xmm2 \n" | 353 "phaddw %%xmm1,%%xmm0 \n" |
343 "psrlw $0x8,%%xmm0 \n" | 354 "paddw %%xmm5,%%xmm0 \n" |
344 "movdqa %%xmm1,%%xmm3 \n" | 355 "psrlw $0x4,%%xmm0 \n" |
345 "psrlw $0x8,%%xmm1 \n" | 356 "packuswb %%xmm0,%%xmm0 \n" |
346 "pand %%xmm7,%%xmm2 \n" | |
347 "pand %%xmm7,%%xmm3 \n" | |
348 "pavgw %%xmm2,%%xmm0 \n" | |
349 "pavgw %%xmm3,%%xmm1 \n" | |
350 "packuswb %%xmm1,%%xmm0 \n" | |
351 "movdqa %%xmm0,%%xmm2 \n" | |
352 "psrlw $0x8,%%xmm0 \n" | |
353 "pand %%xmm7,%%xmm2 \n" | |
354 "pavgw %%xmm2,%%xmm0 \n" | |
355 "packuswb %%xmm0,%%xmm0 \n" | |
356 "movq %%xmm0," MEMACCESS(1) " \n" | 357 "movq %%xmm0," MEMACCESS(1) " \n" |
357 "lea " MEMLEA(0x8,1) ",%1 \n" | 358 "lea " MEMLEA(0x8,1) ",%1 \n" |
358 "sub $0x8,%2 \n" | 359 "sub $0x8,%2 \n" |
359 "jg 1b \n" | 360 "jg 1b \n" |
360 : "+r"(src_ptr), // %0 | 361 : "+r"(src_ptr), // %0 |
361 "+r"(dst_ptr), // %1 | 362 "+r"(dst_ptr), // %1 |
362 "+r"(dst_width), // %2 | 363 "+r"(dst_width), // %2 |
363 "+r"(stridex3) // %3 | 364 "+r"(stridex3) // %3 |
364 : "r"((intptr_t)(src_stride)) // %4 | 365 : "r"((intptr_t)(src_stride)) // %4 |
365 : "memory", "cc", NACL_R14 | 366 : "memory", "cc", NACL_R14 |
366 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" | 367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
367 ); | 368 ); |
368 } | 369 } |
369 | 370 |
370 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 371 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
371 uint8* dst_ptr, int dst_width) { | 372 uint8* dst_ptr, int dst_width) { |
372 asm volatile ( | 373 asm volatile ( |
373 "movdqa %0,%%xmm3 \n" | 374 "movdqa %0,%%xmm3 \n" |
374 "movdqa %1,%%xmm4 \n" | 375 "movdqa %1,%%xmm4 \n" |
375 "movdqa %2,%%xmm5 \n" | 376 "movdqa %2,%%xmm5 \n" |
376 : | 377 : |
(...skipping 818 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1195 ); | 1196 ); |
1196 return num; | 1197 return num; |
1197 } | 1198 } |
1198 | 1199 |
1199 #endif // defined(__x86_64__) || defined(__i386__) | 1200 #endif // defined(__x86_64__) || defined(__i386__) |
1200 | 1201 |
1201 #ifdef __cplusplus | 1202 #ifdef __cplusplus |
1202 } // extern "C" | 1203 } // extern "C" |
1203 } // namespace libyuv | 1204 } // namespace libyuv |
1204 #endif | 1205 #endif |
OLD | NEW |