Chromium Code Reviews

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_posix.cc

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
(The old side of the diff is empty; this file is new in this patch.)
1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "third_party/libyuv/include/libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
21 // Offsets for source bytes 0 to 9
22 static uvec8 kShuf0 =
23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
24
25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
26 static uvec8 kShuf1 =
27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
28
29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
30 static uvec8 kShuf2 =
31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
32
33 // Offsets for source bytes 0 to 10
34 static uvec8 kShuf01 =
35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
36
37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
38 static uvec8 kShuf11 =
39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
40
41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
42 static uvec8 kShuf21 =
43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
44
45 // Coefficients for source bytes 0 to 10
46 static uvec8 kMadd01 =
47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
48
49 // Coefficients for source bytes 10 to 21
50 static uvec8 kMadd11 =
51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
52
53 // Coefficients for source bytes 21 to 31
54 static uvec8 kMadd21 =
55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
56
57 // Rounding constant (2) added before the >>2 in the 3/4 box scalers
58 static vec16 kRound34 =
59 { 2, 2, 2, 2, 2, 2, 2, 2 };
60
61 static uvec8 kShuf38a =
62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
63
64 static uvec8 kShuf38b =
65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
66
67 // Arrange words 0,3,6 into 0,1,2
68 static uvec8 kShufAc =
69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
70
71 // Arrange words 0,3,6 into 3,4,5
72 static uvec8 kShufAc3 =
73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
74
75 // Scaling values for boxes of 3x3 and 2x3
76 static uvec16 kScaleAc33 =
77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
78
79 // Arrange first value for pixels 0,1,2,3,4,5
80 static uvec8 kShufAb0 =
81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
82
83 // Arrange second value for pixels 0,1,2,3,4,5
84 static uvec8 kShufAb1 =
85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
86
87 // Arrange third value for pixels 0,1,2,3,4,5
88 static uvec8 kShufAb2 =
89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
90
91 // Scaling values for boxes of 3x2 and 2x2
92 static uvec16 kScaleAb2 =
93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94
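The 65536 / N entries above are 16-bit fixed-point reciprocals: pmulhuw keeps the high 16 bits of a 16x16 multiply, so multiplying a box sum by 65536 / N approximates dividing it by N. A minimal editor's sketch of the idea, not part of the file (the truncated reciprocal can come out one low when the sum is an exact multiple of N):

// Illustrative only: approximate sum / 9 using the 65536 / 9 reciprocal,
// the same trick pmulhuw performs with kScaleAc33.
static inline uint8 ScaleSumBy9_Sketch(uint16 sum) {
  return (uint8)((sum * (65536 / 9)) >> 16);
}
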
95 // GCC versions of row functions are verbatim conversions from Visual C.
96 // Generated using gcc disassembly of the Visual C object file:
97 // objdump -D yuvscaler.obj >yuvscaler.txt
98
99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100 uint8* dst_ptr, int dst_width) {
101 asm volatile (
102 LABELALIGN
103 "1: \n"
104 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
105 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
106 "lea " MEMLEA(0x20,0) ",%0 \n"
107 "psrlw $0x8,%%xmm0 \n"
108 "psrlw $0x8,%%xmm1 \n"
109 "packuswb %%xmm1,%%xmm0 \n"
110 "movdqa %%xmm0," MEMACCESS(1) " \n"
111 "lea " MEMLEA(0x10,1) ",%1 \n"
112 "sub $0x10,%2 \n"
113 "jg 1b \n"
114 : "+r"(src_ptr), // %0
115 "+r"(dst_ptr), // %1
116 "+r"(dst_width) // %2
117 :
118 : "memory", "cc"
119 #if defined(__SSE2__)
120 , "xmm0", "xmm1"
121 #endif
122 );
123 }
124
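For reference, the routine above point-samples by 1/2: psrlw $0x8 keeps the high (odd) byte of each 16-bit pair and packuswb repacks the results. An equivalent plain-C sketch (editor's illustration, not part of the patch):

// Keep every second source byte; the SIMD loop above emits 16 bytes per pass.
static void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                 int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}
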
125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
126 uint8* dst_ptr, int dst_width) {
127 asm volatile (
128 "pcmpeqb %%xmm5,%%xmm5 \n"
129 "psrlw $0x8,%%xmm5 \n"
130
131 LABELALIGN
132 "1: \n"
133 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
134 "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
135 "lea " MEMLEA(0x20,0) ",%0 \n"
136 "movdqa %%xmm0,%%xmm2 \n"
137 "psrlw $0x8,%%xmm0 \n"
138 "movdqa %%xmm1,%%xmm3 \n"
139 "psrlw $0x8,%%xmm1 \n"
140 "pand %%xmm5,%%xmm2 \n"
141 "pand %%xmm5,%%xmm3 \n"
142 "pavgw %%xmm2,%%xmm0 \n"
143 "pavgw %%xmm3,%%xmm1 \n"
144 "packuswb %%xmm1,%%xmm0 \n"
145 "movdqa %%xmm0," MEMACCESS(1) " \n"
146 "lea " MEMLEA(0x10,1) ",%1 \n"
147 "sub $0x10,%2 \n"
148 "jg 1b \n"
149 : "+r"(src_ptr), // %0
150 "+r"(dst_ptr), // %1
151 "+r"(dst_width) // %2
152 :
153 : "memory", "cc"
154 #if defined(__SSE2__)
155 , "xmm0", "xmm1", "xmm5"
156 #endif
157 );
158 }
159
160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
161 uint8* dst_ptr, int dst_width) {
162 asm volatile (
163 "pcmpeqb %%xmm5,%%xmm5 \n"
164 "psrlw $0x8,%%xmm5 \n"
165
166 LABELALIGN
167 "1: \n"
168 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
169 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
170 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
171 BUNDLEALIGN
172 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
173 "lea " MEMLEA(0x20,0) ",%0 \n"
174 "pavgb %%xmm2,%%xmm0 \n"
175 "pavgb %%xmm3,%%xmm1 \n"
176 "movdqa %%xmm0,%%xmm2 \n"
177 "psrlw $0x8,%%xmm0 \n"
178 "movdqa %%xmm1,%%xmm3 \n"
179 "psrlw $0x8,%%xmm1 \n"
180 "pand %%xmm5,%%xmm2 \n"
181 "pand %%xmm5,%%xmm3 \n"
182 "pavgw %%xmm2,%%xmm0 \n"
183 "pavgw %%xmm3,%%xmm1 \n"
184 "packuswb %%xmm1,%%xmm0 \n"
185 "movdqa %%xmm0," MEMACCESS(1) " \n"
186 "lea " MEMLEA(0x10,1) ",%1 \n"
187 "sub $0x10,%2 \n"
188 "jg 1b \n"
189 : "+r"(src_ptr), // %0
190 "+r"(dst_ptr), // %1
191 "+r"(dst_width) // %2
192 : "r"((intptr_t)(src_stride)) // %3
193 : "memory", "cc"
194 #if defined(__native_client__) && defined(__x86_64__)
195 , "r14"
196 #endif
197 #if defined(__SSE2__)
198 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
199 #endif
200 );
201 }
202
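The box variant above averages a 2x2 neighborhood: pavgb blends the two rows, then the pand/psrlw/pavgw sequence averages adjacent columns. A plain-C sketch of the intended result (editor's illustration; the chained pavg rounding can differ from this by one in the low bit):

static void ScaleRowDown2Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    // Rounded average of the 2x2 box covering output pixel x.
    dst_ptr[x] =
        (uint8)((s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}
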
203 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
204 uint8* dst_ptr, int dst_width) {
205 asm volatile (
206 LABELALIGN
207 "1: \n"
208 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
209 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
210 "lea " MEMLEA(0x20,0) ",%0 \n"
211 "psrlw $0x8,%%xmm0 \n"
212 "psrlw $0x8,%%xmm1 \n"
213 "packuswb %%xmm1,%%xmm0 \n"
214 "movdqu %%xmm0," MEMACCESS(1) " \n"
215 "lea " MEMLEA(0x10,1) ",%1 \n"
216 "sub $0x10,%2 \n"
217 "jg 1b \n"
218 : "+r"(src_ptr), // %0
219 "+r"(dst_ptr), // %1
220 "+r"(dst_width) // %2
221 :
222 : "memory", "cc"
223 #if defined(__SSE2__)
224 , "xmm0", "xmm1"
225 #endif
226 );
227 }
228
229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
230 ptrdiff_t src_stride,
231 uint8* dst_ptr, int dst_width) {
232 asm volatile (
233 "pcmpeqb %%xmm5,%%xmm5 \n"
234 "psrlw $0x8,%%xmm5 \n"
235
236 LABELALIGN
237 "1: \n"
238 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
239 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
240 "lea " MEMLEA(0x20,0) ",%0 \n"
241 "movdqa %%xmm0,%%xmm2 \n"
242 "psrlw $0x8,%%xmm0 \n"
243 "movdqa %%xmm1,%%xmm3 \n"
244 "psrlw $0x8,%%xmm1 \n"
245 "pand %%xmm5,%%xmm2 \n"
246 "pand %%xmm5,%%xmm3 \n"
247 "pavgw %%xmm2,%%xmm0 \n"
248 "pavgw %%xmm3,%%xmm1 \n"
249 "packuswb %%xmm1,%%xmm0 \n"
250 "movdqu %%xmm0," MEMACCESS(1) " \n"
251 "lea " MEMLEA(0x10,1) ",%1 \n"
252 "sub $0x10,%2 \n"
253 "jg 1b \n"
254 : "+r"(src_ptr), // %0
255 "+r"(dst_ptr), // %1
256 "+r"(dst_width) // %2
257 :
258 : "memory", "cc"
259 #if defined(__SSE2__)
260 , "xmm0", "xmm1", "xmm5"
261 #endif
262 );
263 }
264
265 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
266 ptrdiff_t src_stride,
267 uint8* dst_ptr, int dst_width) {
268 asm volatile (
269 "pcmpeqb %%xmm5,%%xmm5 \n"
270 "psrlw $0x8,%%xmm5 \n"
271
272 LABELALIGN
273 "1: \n"
274 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
275 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
276 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
277 BUNDLEALIGN
278 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
279 "lea " MEMLEA(0x20,0) ",%0 \n"
280 "pavgb %%xmm2,%%xmm0 \n"
281 "pavgb %%xmm3,%%xmm1 \n"
282 "movdqa %%xmm0,%%xmm2 \n"
283 "psrlw $0x8,%%xmm0 \n"
284 "movdqa %%xmm1,%%xmm3 \n"
285 "psrlw $0x8,%%xmm1 \n"
286 "pand %%xmm5,%%xmm2 \n"
287 "pand %%xmm5,%%xmm3 \n"
288 "pavgw %%xmm2,%%xmm0 \n"
289 "pavgw %%xmm3,%%xmm1 \n"
290 "packuswb %%xmm1,%%xmm0 \n"
291 "movdqu %%xmm0," MEMACCESS(1) " \n"
292 "lea " MEMLEA(0x10,1) ",%1 \n"
293 "sub $0x10,%2 \n"
294 "jg 1b \n"
295 : "+r"(src_ptr), // %0
296 "+r"(dst_ptr), // %1
297 "+r"(dst_width) // %2
298 : "r"((intptr_t)(src_stride)) // %3
299 : "memory", "cc"
300 #if defined(__native_client__) && defined(__x86_64__)
301 , "r14"
302 #endif
303 #if defined(__SSE2__)
304 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
305 #endif
306 );
307 }
308
309 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
310 uint8* dst_ptr, int dst_width) {
311 asm volatile (
312 "pcmpeqb %%xmm5,%%xmm5 \n"
313 "psrld $0x18,%%xmm5 \n"
314 "pslld $0x10,%%xmm5 \n"
315
316 LABELALIGN
317 "1: \n"
318 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
319 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
320 "lea " MEMLEA(0x20,0) ",%0 \n"
321 "pand %%xmm5,%%xmm0 \n"
322 "pand %%xmm5,%%xmm1 \n"
323 "packuswb %%xmm1,%%xmm0 \n"
324 "psrlw $0x8,%%xmm0 \n"
325 "packuswb %%xmm0,%%xmm0 \n"
326 "movq %%xmm0," MEMACCESS(1) " \n"
327 "lea " MEMLEA(0x8,1) ",%1 \n"
328 "sub $0x8,%2 \n"
329 "jg 1b \n"
330 : "+r"(src_ptr), // %0
331 "+r"(dst_ptr), // %1
332 "+r"(dst_width) // %2
333 :
334 : "memory", "cc"
335 #if defined(__SSE2__)
336 , "xmm0", "xmm1", "xmm5"
337 #endif
338 );
339 }
340
341 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
342 uint8* dst_ptr, int dst_width) {
343 intptr_t stridex3 = 0;
344 asm volatile (
345 "pcmpeqb %%xmm7,%%xmm7 \n"
346 "psrlw $0x8,%%xmm7 \n"
347 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
348
349 LABELALIGN
350 "1: \n"
351 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
352 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
353 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
354 BUNDLEALIGN
355 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
356 "pavgb %%xmm2,%%xmm0 \n"
357 "pavgb %%xmm3,%%xmm1 \n"
358 MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
359 BUNDLEALIGN
360 MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
361 MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
362 MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
363 "lea " MEMLEA(0x20,0) ",%0 \n"
364 "pavgb %%xmm4,%%xmm2 \n"
365 "pavgb %%xmm2,%%xmm0 \n"
366 "pavgb %%xmm5,%%xmm3 \n"
367 "pavgb %%xmm3,%%xmm1 \n"
368 "movdqa %%xmm0,%%xmm2 \n"
369 "psrlw $0x8,%%xmm0 \n"
370 "movdqa %%xmm1,%%xmm3 \n"
371 "psrlw $0x8,%%xmm1 \n"
372 "pand %%xmm7,%%xmm2 \n"
373 "pand %%xmm7,%%xmm3 \n"
374 "pavgw %%xmm2,%%xmm0 \n"
375 "pavgw %%xmm3,%%xmm1 \n"
376 "packuswb %%xmm1,%%xmm0 \n"
377 "movdqa %%xmm0,%%xmm2 \n"
378 "psrlw $0x8,%%xmm0 \n"
379 "pand %%xmm7,%%xmm2 \n"
380 "pavgw %%xmm2,%%xmm0 \n"
381 "packuswb %%xmm0,%%xmm0 \n"
382 "movq %%xmm0," MEMACCESS(1) " \n"
383 "lea " MEMLEA(0x8,1) ",%1 \n"
384 "sub $0x8,%2 \n"
385 "jg 1b \n"
386 : "+r"(src_ptr), // %0
387 "+r"(dst_ptr), // %1
388 "+r"(dst_width), // %2
389 "+r"(stridex3) // %3
390 : "r"((intptr_t)(src_stride)) // %4
391 : "memory", "cc"
392 #if defined(__native_client__) && defined(__x86_64__)
393 , "r14"
394 #endif
395 #if defined(__SSE2__)
396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
397 #endif
398 );
399 }
400
401 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
402 uint8* dst_ptr, int dst_width) {
403 asm volatile (
404 "movdqa %0,%%xmm3 \n"
405 "movdqa %1,%%xmm4 \n"
406 "movdqa %2,%%xmm5 \n"
407 :
408 : "m"(kShuf0), // %0
409 "m"(kShuf1), // %1
410 "m"(kShuf2) // %2
411 );
412 asm volatile (
413 LABELALIGN
414 "1: \n"
415 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
416 "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
417 "lea " MEMLEA(0x20,0) ",%0 \n"
418 "movdqa %%xmm2,%%xmm1 \n"
419 "palignr $0x8,%%xmm0,%%xmm1 \n"
420 "pshufb %%xmm3,%%xmm0 \n"
421 "pshufb %%xmm4,%%xmm1 \n"
422 "pshufb %%xmm5,%%xmm2 \n"
423 "movq %%xmm0," MEMACCESS(1) " \n"
424 "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
425 "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
426 "lea " MEMLEA(0x18,1) ",%1 \n"
427 "sub $0x18,%2 \n"
428 "jg 1b \n"
429 : "+r"(src_ptr), // %0
430 "+r"(dst_ptr), // %1
431 "+r"(dst_width) // %2
432 :
433 : "memory", "cc"
434 #if defined(__SSE2__)
435 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
436 #endif
437 );
438 }
439
440 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
441 ptrdiff_t src_stride,
442 uint8* dst_ptr, int dst_width) {
443 asm volatile (
444 "movdqa %0,%%xmm2 \n" // kShuf01
445 "movdqa %1,%%xmm3 \n" // kShuf11
446 "movdqa %2,%%xmm4 \n" // kShuf21
447 :
448 : "m"(kShuf01), // %0
449 "m"(kShuf11), // %1
450 "m"(kShuf21) // %2
451 );
452 asm volatile (
453 "movdqa %0,%%xmm5 \n" // kMadd01
454 "movdqa %1,%%xmm0 \n" // kMadd11
455 "movdqa %2,%%xmm1 \n" // kRound34
456 :
457 : "m"(kMadd01), // %0
458 "m"(kMadd11), // %1
459 "m"(kRound34) // %2
460 );
461 asm volatile (
462 LABELALIGN
463 "1: \n"
464 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
465 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
466 "pavgb %%xmm7,%%xmm6 \n"
467 "pshufb %%xmm2,%%xmm6 \n"
468 "pmaddubsw %%xmm5,%%xmm6 \n"
469 "paddsw %%xmm1,%%xmm6 \n"
470 "psrlw $0x2,%%xmm6 \n"
471 "packuswb %%xmm6,%%xmm6 \n"
472 "movq %%xmm6," MEMACCESS(1) " \n"
473 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
474 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
475 "pavgb %%xmm7,%%xmm6 \n"
476 "pshufb %%xmm3,%%xmm6 \n"
477 "pmaddubsw %%xmm0,%%xmm6 \n"
478 "paddsw %%xmm1,%%xmm6 \n"
479 "psrlw $0x2,%%xmm6 \n"
480 "packuswb %%xmm6,%%xmm6 \n"
481 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
482 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
483 BUNDLEALIGN
484 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
485 "lea " MEMLEA(0x20,0) ",%0 \n"
486 "pavgb %%xmm7,%%xmm6 \n"
487 "pshufb %%xmm4,%%xmm6 \n"
488 "pmaddubsw %4,%%xmm6 \n"
489 "paddsw %%xmm1,%%xmm6 \n"
490 "psrlw $0x2,%%xmm6 \n"
491 "packuswb %%xmm6,%%xmm6 \n"
492 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
493 "lea " MEMLEA(0x18,1) ",%1 \n"
494 "sub $0x18,%2 \n"
495 "jg 1b \n"
496 : "+r"(src_ptr), // %0
497 "+r"(dst_ptr), // %1
498 "+r"(dst_width) // %2
499 : "r"((intptr_t)(src_stride)), // %3
500 "m"(kMadd21) // %4
501 : "memory", "cc"
502 #if defined(__native_client__) && defined(__x86_64__)
503 , "r14"
504 #endif
505 #if defined(__SSE2__)
506 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
507 #endif
508 );
509 }
510
511 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
512 ptrdiff_t src_stride,
513 uint8* dst_ptr, int dst_width) {
514 asm volatile (
515 "movdqa %0,%%xmm2 \n" // kShuf01
516 "movdqa %1,%%xmm3 \n" // kShuf11
517 "movdqa %2,%%xmm4 \n" // kShuf21
518 :
519 : "m"(kShuf01), // %0
520 "m"(kShuf11), // %1
521 "m"(kShuf21) // %2
522 );
523 asm volatile (
524 "movdqa %0,%%xmm5 \n" // kMadd01
525 "movdqa %1,%%xmm0 \n" // kMadd11
526 "movdqa %2,%%xmm1 \n" // kRound34
527 :
528 : "m"(kMadd01), // %0
529 "m"(kMadd11), // %1
530 "m"(kRound34) // %2
531 );
532
533 asm volatile (
534 LABELALIGN
535 "1: \n"
536 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
537 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
538 "pavgb %%xmm6,%%xmm7 \n"
539 "pavgb %%xmm7,%%xmm6 \n"
540 "pshufb %%xmm2,%%xmm6 \n"
541 "pmaddubsw %%xmm5,%%xmm6 \n"
542 "paddsw %%xmm1,%%xmm6 \n"
543 "psrlw $0x2,%%xmm6 \n"
544 "packuswb %%xmm6,%%xmm6 \n"
545 "movq %%xmm6," MEMACCESS(1) " \n"
546 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
547 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
548 "pavgb %%xmm6,%%xmm7 \n"
549 "pavgb %%xmm7,%%xmm6 \n"
550 "pshufb %%xmm3,%%xmm6 \n"
551 "pmaddubsw %%xmm0,%%xmm6 \n"
552 "paddsw %%xmm1,%%xmm6 \n"
553 "psrlw $0x2,%%xmm6 \n"
554 "packuswb %%xmm6,%%xmm6 \n"
555 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
556 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
557 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
558 "lea " MEMLEA(0x20,0) ",%0 \n"
559 "pavgb %%xmm6,%%xmm7 \n"
560 "pavgb %%xmm7,%%xmm6 \n"
561 "pshufb %%xmm4,%%xmm6 \n"
562 "pmaddubsw %4,%%xmm6 \n"
563 "paddsw %%xmm1,%%xmm6 \n"
564 "psrlw $0x2,%%xmm6 \n"
565 "packuswb %%xmm6,%%xmm6 \n"
566 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
567 "lea " MEMLEA(0x18,1) ",%1 \n"
568 "sub $0x18,%2 \n"
569 "jg 1b \n"
570 : "+r"(src_ptr), // %0
571 "+r"(dst_ptr), // %1
572 "+r"(dst_width) // %2
573 : "r"((intptr_t)(src_stride)), // %3
574 "m"(kMadd21) // %4
575 : "memory", "cc"
576 #if defined(__native_client__) && defined(__x86_64__)
577 , "r14"
578 #endif
579 #if defined(__SSE2__)
580 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
581 #endif
582 );
583 }
584
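Both 3/4 box routines above share the same horizontal weighting: the kShuf01/11/21 tables gather overlapping byte pairs, pmaddubsw applies the kMadd weights (3:1, 2:2, 1:3), and kRound34 adds 2 before the >>2. An editor's sketch of how one group of 4 source bytes becomes 3 output bytes (illustrative only; the vertical pavgb blend is omitted):

static inline void ScaleDown34_4To3_Sketch(const uint8 s[4], uint8 d[3]) {
  d[0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
}
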
585 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
586 uint8* dst_ptr, int dst_width) {
587 asm volatile (
588 "movdqa %3,%%xmm4 \n"
589 "movdqa %4,%%xmm5 \n"
590
591 LABELALIGN
592 "1: \n"
593 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
594 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
595 "lea " MEMLEA(0x20,0) ",%0 \n"
596 "pshufb %%xmm4,%%xmm0 \n"
597 "pshufb %%xmm5,%%xmm1 \n"
598 "paddusb %%xmm1,%%xmm0 \n"
599 "movq %%xmm0," MEMACCESS(1) " \n"
600 "movhlps %%xmm0,%%xmm1 \n"
601 "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
602 "lea " MEMLEA(0xc,1) ",%1 \n"
603 "sub $0xc,%2 \n"
604 "jg 1b \n"
605 : "+r"(src_ptr), // %0
606 "+r"(dst_ptr), // %1
607 "+r"(dst_width) // %2
608 : "m"(kShuf38a), // %3
609 "m"(kShuf38b) // %4
610 : "memory", "cc"
611 #if defined(__SSE2__)
612 , "xmm0", "xmm1", "xmm4", "xmm5"
613 #endif
614 );
615 }
616
617 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
618 ptrdiff_t src_stride,
619 uint8* dst_ptr, int dst_width) {
620 asm volatile (
621 "movdqa %0,%%xmm2 \n"
622 "movdqa %1,%%xmm3 \n"
623 "movdqa %2,%%xmm4 \n"
624 "movdqa %3,%%xmm5 \n"
625 :
626 : "m"(kShufAb0), // %0
627 "m"(kShufAb1), // %1
628 "m"(kShufAb2), // %2
629 "m"(kScaleAb2) // %3
630 );
631 asm volatile (
632 LABELALIGN
633 "1: \n"
634 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
635 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
636 "lea " MEMLEA(0x10,0) ",%0 \n"
637 "movdqa %%xmm0,%%xmm1 \n"
638 "pshufb %%xmm2,%%xmm1 \n"
639 "movdqa %%xmm0,%%xmm6 \n"
640 "pshufb %%xmm3,%%xmm6 \n"
641 "paddusw %%xmm6,%%xmm1 \n"
642 "pshufb %%xmm4,%%xmm0 \n"
643 "paddusw %%xmm0,%%xmm1 \n"
644 "pmulhuw %%xmm5,%%xmm1 \n"
645 "packuswb %%xmm1,%%xmm1 \n"
646 "sub $0x6,%2 \n"
647 "movd %%xmm1," MEMACCESS(1) " \n"
648 "psrlq $0x10,%%xmm1 \n"
649 "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
650 "lea " MEMLEA(0x6,1) ",%1 \n"
651 "jg 1b \n"
652 : "+r"(src_ptr), // %0
653 "+r"(dst_ptr), // %1
654 "+r"(dst_width) // %2
655 : "r"((intptr_t)(src_stride)) // %3
656 : "memory", "cc"
657 #if defined(__native_client__) && defined(__x86_64__)
658 , "r14"
659 #endif
660 #if defined(__SSE2__)
661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
662 #endif
663 );
664 }
665
666 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
667 ptrdiff_t src_stride,
668 uint8* dst_ptr, int dst_width) {
669 asm volatile (
670 "movdqa %0,%%xmm2 \n"
671 "movdqa %1,%%xmm3 \n"
672 "movdqa %2,%%xmm4 \n"
673 "pxor %%xmm5,%%xmm5 \n"
674 :
675 : "m"(kShufAc), // %0
676 "m"(kShufAc3), // %1
677 "m"(kScaleAc33) // %2
678 );
679 asm volatile (
680 LABELALIGN
681 "1: \n"
682 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
683 MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
684 "movhlps %%xmm0,%%xmm1 \n"
685 "movhlps %%xmm6,%%xmm7 \n"
686 "punpcklbw %%xmm5,%%xmm0 \n"
687 "punpcklbw %%xmm5,%%xmm1 \n"
688 "punpcklbw %%xmm5,%%xmm6 \n"
689 "punpcklbw %%xmm5,%%xmm7 \n"
690 "paddusw %%xmm6,%%xmm0 \n"
691 "paddusw %%xmm7,%%xmm1 \n"
692 MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
693 "lea " MEMLEA(0x10,0) ",%0 \n"
694 "movhlps %%xmm6,%%xmm7 \n"
695 "punpcklbw %%xmm5,%%xmm6 \n"
696 "punpcklbw %%xmm5,%%xmm7 \n"
697 "paddusw %%xmm6,%%xmm0 \n"
698 "paddusw %%xmm7,%%xmm1 \n"
699 "movdqa %%xmm0,%%xmm6 \n"
700 "psrldq $0x2,%%xmm0 \n"
701 "paddusw %%xmm0,%%xmm6 \n"
702 "psrldq $0x2,%%xmm0 \n"
703 "paddusw %%xmm0,%%xmm6 \n"
704 "pshufb %%xmm2,%%xmm6 \n"
705 "movdqa %%xmm1,%%xmm7 \n"
706 "psrldq $0x2,%%xmm1 \n"
707 "paddusw %%xmm1,%%xmm7 \n"
708 "psrldq $0x2,%%xmm1 \n"
709 "paddusw %%xmm1,%%xmm7 \n"
710 "pshufb %%xmm3,%%xmm7 \n"
711 "paddusw %%xmm7,%%xmm6 \n"
712 "pmulhuw %%xmm4,%%xmm6 \n"
713 "packuswb %%xmm6,%%xmm6 \n"
714 "sub $0x6,%2 \n"
715 "movd %%xmm6," MEMACCESS(1) " \n"
716 "psrlq $0x10,%%xmm6 \n"
717 "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
718 "lea " MEMLEA(0x6,1) ",%1 \n"
719 "jg 1b \n"
720 : "+r"(src_ptr), // %0
721 "+r"(dst_ptr), // %1
722 "+r"(dst_width) // %2
723 : "r"((intptr_t)(src_stride)) // %3
724 : "memory", "cc"
725 #if defined(__native_client__) && defined(__x86_64__)
726 , "r14"
727 #endif
728 #if defined(__SSE2__)
729 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
730 #endif
731 );
732 }
733
734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
735 uint16* dst_ptr, int src_width, int src_height) {
736 int tmp_height = 0;
737 intptr_t tmp_src = 0;
738 asm volatile (
739 "pxor %%xmm4,%%xmm4 \n"
740 "sub $0x1,%5 \n"
741
742 LABELALIGN
743 "1: \n"
744 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
745 "mov %0,%3 \n"
746 "add %6,%0 \n"
747 "movdqa %%xmm0,%%xmm1 \n"
748 "punpcklbw %%xmm4,%%xmm0 \n"
749 "punpckhbw %%xmm4,%%xmm1 \n"
750 "mov %5,%2 \n"
751 "test %2,%2 \n"
752 "je 3f \n"
753
754 LABELALIGN
755 "2: \n"
756 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
757 "add %6,%0 \n"
758 "movdqa %%xmm2,%%xmm3 \n"
759 "punpcklbw %%xmm4,%%xmm2 \n"
760 "punpckhbw %%xmm4,%%xmm3 \n"
761 "paddusw %%xmm2,%%xmm0 \n"
762 "paddusw %%xmm3,%%xmm1 \n"
763 "sub $0x1,%2 \n"
764 "jg 2b \n"
765
766 LABELALIGN
767 "3: \n"
768 "movdqa %%xmm0," MEMACCESS(1) " \n"
769 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
770 "lea " MEMLEA(0x10,3) ",%0 \n"
771 "lea " MEMLEA(0x20,1) ",%1 \n"
772 "sub $0x10,%4 \n"
773 "jg 1b \n"
774 : "+r"(src_ptr), // %0
775 "+r"(dst_ptr), // %1
776 "+r"(tmp_height), // %2
777 "+r"(tmp_src), // %3
778 "+r"(src_width), // %4
779 "+rm"(src_height) // %5
780 : "rm"((intptr_t)(src_stride)) // %6
781 : "memory", "cc"
782 #if defined(__SSE2__)
783 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
784 #endif
785 );
786 }
787
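ScaleAddRows_SSE2 above accumulates src_height rows into 16-bit column sums for a later box-averaging pass. A plain-C sketch of the accumulation (editor's illustration; the asm uses paddusw, i.e. saturating adds, which only matters if a column sum could exceed 65535):

static void ScaleAddRows_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    const uint8* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum = (uint16)(sum + s[0]);  // column sum of src_height rows
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}
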
788 // Bilinear column filtering. SSSE3 version.
789 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
790 int dst_width, int x, int dx) {
791 intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
792 asm volatile (
793 "movd %6,%%xmm2 \n"
794 "movd %7,%%xmm3 \n"
795 "movl $0x04040000,%k2 \n"
796 "movd %k2,%%xmm5 \n"
797 "pcmpeqb %%xmm6,%%xmm6 \n"
798 "psrlw $0x9,%%xmm6 \n"
799 "pextrw $0x1,%%xmm2,%k3 \n"
800 "subl $0x2,%5 \n"
801 "jl 29f \n"
802 "movdqa %%xmm2,%%xmm0 \n"
803 "paddd %%xmm3,%%xmm0 \n"
804 "punpckldq %%xmm0,%%xmm2 \n"
805 "punpckldq %%xmm3,%%xmm3 \n"
806 "paddd %%xmm3,%%xmm3 \n"
807 "pextrw $0x3,%%xmm2,%k4 \n"
808
809 LABELALIGN
810 "2: \n"
811 "movdqa %%xmm2,%%xmm1 \n"
812 "paddd %%xmm3,%%xmm2 \n"
813 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
814 "movd %k2,%%xmm0 \n"
815 "psrlw $0x9,%%xmm1 \n"
816 BUNDLEALIGN
817 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
818 "movd %k2,%%xmm4 \n"
819 "pshufb %%xmm5,%%xmm1 \n"
820 "punpcklwd %%xmm4,%%xmm0 \n"
821 "pxor %%xmm6,%%xmm1 \n"
822 "pmaddubsw %%xmm1,%%xmm0 \n"
823 "pextrw $0x1,%%xmm2,%k3 \n"
824 "pextrw $0x3,%%xmm2,%k4 \n"
825 "psrlw $0x7,%%xmm0 \n"
826 "packuswb %%xmm0,%%xmm0 \n"
827 "movd %%xmm0,%k2 \n"
828 "mov %w2," MEMACCESS(0) " \n"
829 "lea " MEMLEA(0x2,0) ",%0 \n"
830 "sub $0x2,%5 \n"
831 "jge 2b \n"
832
833 LABELALIGN
834 "29: \n"
835 "addl $0x1,%5 \n"
836 "jl 99f \n"
837 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
838 "movd %k2,%%xmm0 \n"
839 "psrlw $0x9,%%xmm2 \n"
840 "pshufb %%xmm5,%%xmm2 \n"
841 "pxor %%xmm6,%%xmm2 \n"
842 "pmaddubsw %%xmm2,%%xmm0 \n"
843 "psrlw $0x7,%%xmm0 \n"
844 "packuswb %%xmm0,%%xmm0 \n"
845 "movd %%xmm0,%k2 \n"
846 "mov %b2," MEMACCESS(0) " \n"
847 "99: \n"
848 : "+r"(dst_ptr), // %0
849 "+r"(src_ptr), // %1
850 "+a"(temp_pixel), // %2
851 "+r"(x0), // %3
852 "+r"(x1), // %4
853 "+rm"(dst_width) // %5
854 : "rm"(x), // %6
855 "rm"(dx) // %7
856 : "memory", "cc"
857 #if defined(__native_client__) && defined(__x86_64__)
858 , "r14"
859 #endif
860 #if defined(__SSE2__)
861 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
862 #endif
863 );
864 }
865
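In ScaleFilterCols_SSSE3 above, x and dx are 16.16 fixed-point source positions; each output pixel blends the two source pixels at x>>16 by the fractional part. The asm keeps only a 7-bit fraction (psrlw $0x9) and blends with pmaddubsw, so its output can differ in the last bit from this full-precision editor's sketch (illustrative, not part of the file):

static void ScaleFilterCols_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;      // integer source index
    int f = x & 0xffff;    // 16-bit blend fraction
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)((a * (65536 - f) + b * f) >> 16);  // linear blend
    x += dx;
  }
}
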
866 // Reads 16 pixels, duplicates them and writes 32 pixels.
867 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
868 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
869 int dst_width, int x, int dx) {
870 asm volatile (
871 LABELALIGN
872 "1: \n"
873 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
874 "lea " MEMLEA(0x10,1) ",%1 \n"
875 "movdqa %%xmm0,%%xmm1 \n"
876 "punpcklbw %%xmm0,%%xmm0 \n"
877 "punpckhbw %%xmm1,%%xmm1 \n"
878 "sub $0x20,%2 \n"
879 "movdqa %%xmm0," MEMACCESS(0) " \n"
880 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
881 "lea " MEMLEA(0x20,0) ",%0 \n"
882 "jg 1b \n"
883
884 : "+r"(dst_ptr), // %0
885 "+r"(src_ptr), // %1
886 "+r"(dst_width) // %2
887 :
888 : "memory", "cc"
889 #if defined(__SSE2__)
890 , "xmm0", "xmm1"
891 #endif
892 );
893 }
894
895 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
896 ptrdiff_t src_stride,
897 uint8* dst_argb, int dst_width) {
898 asm volatile (
899 LABELALIGN
900 "1: \n"
901 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
902 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
903 "lea " MEMLEA(0x20,0) ",%0 \n"
904 "shufps $0xdd,%%xmm1,%%xmm0 \n"
905 "sub $0x4,%2 \n"
906 "movdqa %%xmm0," MEMACCESS(1) " \n"
907 "lea " MEMLEA(0x10,1) ",%1 \n"
908 "jg 1b \n"
909 : "+r"(src_argb), // %0
910 "+r"(dst_argb), // %1
911 "+r"(dst_width) // %2
912 :
913 : "memory", "cc"
914 #if defined(__SSE2__)
915 , "xmm0", "xmm1"
916 #endif
917 );
918 }
919
920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
921 ptrdiff_t src_stride,
922 uint8* dst_argb, int dst_width) {
923 asm volatile (
924 LABELALIGN
925 "1: \n"
926 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
927 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
928 "lea " MEMLEA(0x20,0) ",%0 \n"
929 "movdqa %%xmm0,%%xmm2 \n"
930 "shufps $0x88,%%xmm1,%%xmm0 \n"
931 "shufps $0xdd,%%xmm1,%%xmm2 \n"
932 "pavgb %%xmm2,%%xmm0 \n"
933 "sub $0x4,%2 \n"
934 "movdqa %%xmm0," MEMACCESS(1) " \n"
935 "lea " MEMLEA(0x10,1) ",%1 \n"
936 "jg 1b \n"
937 : "+r"(src_argb), // %0
938 "+r"(dst_argb), // %1
939 "+r"(dst_width) // %2
940 :
941 : "memory", "cc"
942 #if defined(__SSE2__)
943 , "xmm0", "xmm1"
944 #endif
945 );
946 }
947
948 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
949 ptrdiff_t src_stride,
950 uint8* dst_argb, int dst_width) {
951 asm volatile (
952 LABELALIGN
953 "1: \n"
954 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
955 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
956 BUNDLEALIGN
957 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
958 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
959 "lea " MEMLEA(0x20,0) ",%0 \n"
960 "pavgb %%xmm2,%%xmm0 \n"
961 "pavgb %%xmm3,%%xmm1 \n"
962 "movdqa %%xmm0,%%xmm2 \n"
963 "shufps $0x88,%%xmm1,%%xmm0 \n"
964 "shufps $0xdd,%%xmm1,%%xmm2 \n"
965 "pavgb %%xmm2,%%xmm0 \n"
966 "sub $0x4,%2 \n"
967 "movdqa %%xmm0," MEMACCESS(1) " \n"
968 "lea " MEMLEA(0x10,1) ",%1 \n"
969 "jg 1b \n"
970 : "+r"(src_argb), // %0
971 "+r"(dst_argb), // %1
972 "+r"(dst_width) // %2
973 : "r"((intptr_t)(src_stride)) // %3
974 : "memory", "cc"
975 #if defined(__native_client__) && defined(__x86_64__)
976 , "r14"
977 #endif
978 #if defined(__SSE2__)
979 , "xmm0", "xmm1", "xmm2", "xmm3"
980 #endif
981 );
982 }
983
984 // Reads 4 pixels at a time.
985 // Alignment requirement: dst_argb 16 byte aligned.
986 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
987 int src_stepx,
988 uint8* dst_argb, int dst_width) {
989 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
990 intptr_t src_stepx_x12 = 0;
991 asm volatile (
992 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
993 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
994 LABELALIGN
995 "1: \n"
996 "movd " MEMACCESS(0) ",%%xmm0 \n"
997 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
998 "punpckldq %%xmm1,%%xmm0 \n"
999 BUNDLEALIGN
1000 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
1001 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
1002 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
1003 "punpckldq %%xmm3,%%xmm2 \n"
1004 "punpcklqdq %%xmm2,%%xmm0 \n"
1005 "sub $0x4,%3 \n"
1006 "movdqa %%xmm0," MEMACCESS(2) " \n"
1007 "lea " MEMLEA(0x10,2) ",%2 \n"
1008 "jg 1b \n"
1009 : "+r"(src_argb), // %0
1010 "+r"(src_stepx_x4), // %1
1011 "+r"(dst_argb), // %2
1012 "+r"(dst_width), // %3
1013 "+r"(src_stepx_x12) // %4
1014 :
1015 : "memory", "cc"
1016 #if defined(__native_client__) && defined(__x86_64__)
1017 , "r14"
1018 #endif
1019 #if defined(__SSE2__)
1020 , "xmm0", "xmm1", "xmm2", "xmm3"
1021 #endif
1022 );
1023 }
1024
1025 // Blends four 2x2 pixel blocks down to 4x1.
1026 // Alignment requirement: dst_argb 16 byte aligned.
1027 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1028 ptrdiff_t src_stride, int src_stepx,
1029 uint8* dst_argb, int dst_width) {
1030 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
1031 intptr_t src_stepx_x12 = 0;
1032 intptr_t row1 = (intptr_t)(src_stride);
1033 asm volatile (
1034 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
1035 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
1036 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
1037
1038 LABELALIGN
1039 "1: \n"
1040 "movq " MEMACCESS(0) ",%%xmm0 \n"
1041 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
1042 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
1043 BUNDLEALIGN
1044 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
1045 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
1046 "movq " MEMACCESS(5) ",%%xmm2 \n"
1047 BUNDLEALIGN
1048 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
1049 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
1050 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
1051 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
1052 "pavgb %%xmm2,%%xmm0 \n"
1053 "pavgb %%xmm3,%%xmm1 \n"
1054 "movdqa %%xmm0,%%xmm2 \n"
1055 "shufps $0x88,%%xmm1,%%xmm0 \n"
1056 "shufps $0xdd,%%xmm1,%%xmm2 \n"
1057 "pavgb %%xmm2,%%xmm0 \n"
1058 "sub $0x4,%3 \n"
1059 "movdqa %%xmm0," MEMACCESS(2) " \n"
1060 "lea " MEMLEA(0x10,2) ",%2 \n"
1061 "jg 1b \n"
1062 : "+r"(src_argb), // %0
1063 "+r"(src_stepx_x4), // %1
1064 "+r"(dst_argb), // %2
1065 "+rm"(dst_width), // %3
1066 "+r"(src_stepx_x12), // %4
1067 "+r"(row1) // %5
1068 :
1069 : "memory", "cc"
1070 #if defined(__native_client__) && defined(__x86_64__)
1071 , "r14"
1072 #endif
1073 #if defined(__SSE2__)
1074 , "xmm0", "xmm1", "xmm2", "xmm3"
1075 #endif
1076 );
1077 }
1078
1079 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1080 int dst_width, int x, int dx) {
1081 intptr_t x0 = 0, x1 = 0;
1082 asm volatile (
1083 "movd %5,%%xmm2 \n"
1084 "movd %6,%%xmm3 \n"
1085 "pshufd $0x0,%%xmm2,%%xmm2 \n"
1086 "pshufd $0x11,%%xmm3,%%xmm0 \n"
1087 "paddd %%xmm0,%%xmm2 \n"
1088 "paddd %%xmm3,%%xmm3 \n"
1089 "pshufd $0x5,%%xmm3,%%xmm0 \n"
1090 "paddd %%xmm0,%%xmm2 \n"
1091 "paddd %%xmm3,%%xmm3 \n"
1092 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1093 "pextrw $0x1,%%xmm2,%k0 \n"
1094 "pextrw $0x3,%%xmm2,%k1 \n"
1095 "cmp $0x0,%4 \n"
1096 "jl 99f \n"
1097 "sub $0x4,%4 \n"
1098 "jl 49f \n"
1099
1100 LABELALIGN
1101 "40: \n"
1102 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1103 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
1104 "pextrw $0x5,%%xmm2,%k0 \n"
1105 "pextrw $0x7,%%xmm2,%k1 \n"
1106 "paddd %%xmm3,%%xmm2 \n"
1107 "punpckldq %%xmm1,%%xmm0 \n"
1108 MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
1109 MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
1110 "pextrw $0x1,%%xmm2,%k0 \n"
1111 "pextrw $0x3,%%xmm2,%k1 \n"
1112 "punpckldq %%xmm4,%%xmm1 \n"
1113 "punpcklqdq %%xmm1,%%xmm0 \n"
1114 "sub $0x4,%4 \n"
1115 "movdqu %%xmm0," MEMACCESS(2) " \n"
1116 "lea " MEMLEA(0x10,2) ",%2 \n"
1117 "jge 40b \n"
1118
1119 "49: \n"
1120 "test $0x2,%4 \n"
1121 "je 29f \n"
1122 BUNDLEALIGN
1123 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1124 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
1125 "pextrw $0x5,%%xmm2,%k0 \n"
1126 "punpckldq %%xmm1,%%xmm0 \n"
1127 "movq %%xmm0," MEMACCESS(2) " \n"
1128 "lea " MEMLEA(0x8,2) ",%2 \n"
1129 "29: \n"
1130 "test $0x1,%4 \n"
1131 "je 99f \n"
1132 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
1133 "movd %%xmm0," MEMACCESS(2) " \n"
1134 "99: \n"
1135 : "+a"(x0), // %0
1136 "+d"(x1), // %1
1137 "+r"(dst_argb), // %2
1138 "+r"(src_argb), // %3
1139 "+r"(dst_width) // %4
1140 : "rm"(x), // %5
1141 "rm"(dx) // %6
1142 : "memory", "cc"
1143 #if defined(__native_client__) && defined(__x86_64__)
1144 , "r14"
1145 #endif
1146 #if defined(__SSE2__)
1147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1148 #endif
1149 );
1150 }
1151
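ScaleARGBCols_SSE2 above is nearest-neighbor column sampling of 32-bit ARGB pixels, unrolled four at a time; the pextrw/movd sequence extracts the integer part of the stepping 16.16 position x. Editor's scalar sketch (illustrative only):

static void ScaleARGBCols_Sketch(uint8* dst_argb, const uint8* src_argb,
                                 int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // pick the source pixel at the integer part of x
    x += dx;
  }
}
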
1152 // Reads 4 pixels, duplicates them and writes 8 pixels.
1153 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1154 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1155 int dst_width, int x, int dx) {
1156 asm volatile (
1157 LABELALIGN
1158 "1: \n"
1159 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
1160 "lea " MEMLEA(0x10,1) ",%1 \n"
1161 "movdqa %%xmm0,%%xmm1 \n"
1162 "punpckldq %%xmm0,%%xmm0 \n"
1163 "punpckhdq %%xmm1,%%xmm1 \n"
1164 "sub $0x8,%2 \n"
1165 "movdqa %%xmm0," MEMACCESS(0) " \n"
1166 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
1167 "lea " MEMLEA(0x20,0) ",%0 \n"
1168 "jg 1b \n"
1169
1170 : "+r"(dst_argb), // %0
1171 "+r"(src_argb), // %1
1172 "+r"(dst_width) // %2
1173 :
1174 : "memory", "cc"
1175 #if defined(__native_client__) && defined(__x86_64__)
1176 , "r14"
1177 #endif
1178 #if defined(__SSE2__)
1179 , "xmm0", "xmm1"
1180 #endif
1181 );
1182 }
1183
1184 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
1185 static uvec8 kShuffleColARGB = {
1186 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
1187 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
1188 };
1189
1190 // Shuffle table for duplicating 2 fractions into 8 bytes each
1191 static uvec8 kShuffleFractions = {
1192 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1193 };
1194
1195 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1197 int dst_width, int x, int dx) {
1198 intptr_t x0 = 0, x1 = 0;
1199 asm volatile (
1200 "movdqa %0,%%xmm4 \n"
1201 "movdqa %1,%%xmm5 \n"
1202 :
1203 : "m"(kShuffleColARGB), // %0
1204 "m"(kShuffleFractions) // %1
1205 );
1206
1207 asm volatile (
1208 "movd %5,%%xmm2 \n"
1209 "movd %6,%%xmm3 \n"
1210 "pcmpeqb %%xmm6,%%xmm6 \n"
1211 "psrlw $0x9,%%xmm6 \n"
1212 "pextrw $0x1,%%xmm2,%k3 \n"
1213 "sub $0x2,%2 \n"
1214 "jl 29f \n"
1215 "movdqa %%xmm2,%%xmm0 \n"
1216 "paddd %%xmm3,%%xmm0 \n"
1217 "punpckldq %%xmm0,%%xmm2 \n"
1218 "punpckldq %%xmm3,%%xmm3 \n"
1219 "paddd %%xmm3,%%xmm3 \n"
1220 "pextrw $0x3,%%xmm2,%k4 \n"
1221
1222 LABELALIGN
1223 "2: \n"
1224 "movdqa %%xmm2,%%xmm1 \n"
1225 "paddd %%xmm3,%%xmm2 \n"
1226 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1227 "psrlw $0x9,%%xmm1 \n"
1228 BUNDLEALIGN
1229 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
1230 "pshufb %%xmm5,%%xmm1 \n"
1231 "pshufb %%xmm4,%%xmm0 \n"
1232 "pxor %%xmm6,%%xmm1 \n"
1233 "pmaddubsw %%xmm1,%%xmm0 \n"
1234 "psrlw $0x7,%%xmm0 \n"
1235 "pextrw $0x1,%%xmm2,%k3 \n"
1236 "pextrw $0x3,%%xmm2,%k4 \n"
1237 "packuswb %%xmm0,%%xmm0 \n"
1238 "movq %%xmm0," MEMACCESS(0) " \n"
1239 "lea " MEMLEA(0x8,0) ",%0 \n"
1240 "sub $0x2,%2 \n"
1241 "jge 2b \n"
1242
1243 LABELALIGN
1244 "29: \n"
1245 "add $0x1,%2 \n"
1246 "jl 99f \n"
1247 "psrlw $0x9,%%xmm2 \n"
1248 BUNDLEALIGN
1249 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1250 "pshufb %%xmm5,%%xmm2 \n"
1251 "pshufb %%xmm4,%%xmm0 \n"
1252 "pxor %%xmm6,%%xmm2 \n"
1253 "pmaddubsw %%xmm2,%%xmm0 \n"
1254 "psrlw $0x7,%%xmm0 \n"
1255 "packuswb %%xmm0,%%xmm0 \n"
1256 "movd %%xmm0," MEMACCESS(0) " \n"
1257
1258 LABELALIGN
1259 "99: \n"
1260 : "+r"(dst_argb), // %0
1261 "+r"(src_argb), // %1
1262 "+rm"(dst_width), // %2
1263 "+r"(x0), // %3
1264 "+r"(x1) // %4
1265 : "rm"(x), // %5
1266 "rm"(dx) // %6
1267 : "memory", "cc"
1268 #if defined(__native_client__) && defined(__x86_64__)
1269 , "r14"
1270 #endif
1271 #if defined(__SSE2__)
1272 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1273 #endif
1274 );
1275 }
1276
1277 // Divide num by div and return as 16.16 fixed point result.
1278 int FixedDiv_X86(int num, int div) {
1279 asm volatile (
1280 "cdq \n"
1281 "shld $0x10,%%eax,%%edx \n"
1282 "shl $0x10,%%eax \n"
1283 "idiv %1 \n"
1284 "mov %0, %%eax \n"
1285 : "+a"(num) // %0
1286 : "c"(div) // %1
1287 : "memory", "cc", "edx"
1288 );
1289 return num;
1290 }
1291
1292 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
1293 int FixedDiv1_X86(int num, int div) {
1294 asm volatile (
1295 "cdq \n"
1296 "shld $0x10,%%eax,%%edx \n"
1297 "shl $0x10,%%eax \n"
1298 "sub $0x10001,%%eax \n"
1299 "sbb $0x0,%%edx \n"
1300 "sub $0x1,%1 \n"
1301 "idiv %1 \n"
1302 "mov %0, %%eax \n"
1303 : "+a"(num) // %0
1304 : "c"(div) // %1
1305 : "memory", "cc", "edx"
1306 );
1307 return num;
1308 }
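
The two division helpers above build a 64-bit 16.16 numerator in edx:eax and divide with idiv; FixedDiv1 first subtracts 0x10001 from the numerator and 1 from the divisor. Editor's portable sketch of the same arithmetic (illustrative; assumes a 64-bit long long):

static int FixedDiv_Sketch(int num, int div) {
  return (int)(((long long)(num) << 16) / div);
}

static int FixedDiv1_Sketch(int num, int div) {
  return (int)((((long long)(num) << 16) - 0x00010001LL) / (div - 1));
}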
1309
1310 #endif // defined(__x86_64__) || defined(__i386__)
1311
1312 #ifdef __cplusplus
1313 } // extern "C"
1314 } // namespace libyuv
1315 #endif