/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "third_party/libyuv/include/libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the final >> 2 of the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
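
// The kScaleAc33/kScaleAb2 tables above hold 16-bit reciprocals: pmulhuw
// keeps the high 16 bits of an unsigned 16x16 multiply, so multiplying a
// box sum by 65536 / n approximates division by n. A minimal C sketch of
// that trick (illustrative helper, not part of the libyuv API):
static uint16 ScaleSumByReciprocal_CRef(uint16 sum, uint16 reciprocal) {
  // e.g. reciprocal == 65536 / 9 divides a 3x3 box sum by 9.
  return (uint16)(((uint32)sum * reciprocal) >> 16);
}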

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}
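
// For reference, a minimal C equivalent of ScaleRowDown2_SSE2 (hypothetical
// name, illustrative only): the psrlw/packuswb pair above selects the odd
// source byte of every pair.
static void ScaleRowDown2_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  // Point sampling uses a single row.
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // Keep the odd pixel, as the asm does.
  }
}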

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}
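
// For reference, a C sketch of the 2x2 box filter above (illustrative name;
// the asm's cascaded pavgb/pavgw rounds each average, so results can differ
// from this exact sum by one):
static void ScaleRowDown2Box_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}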

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none; uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none; uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none; uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]    // src_ptr
    mov esi, [esp + 8 + 8]    // src_stride
    mov edx, [esp + 8 + 12]   // dst_ptr
    mov ecx, [esp + 8 + 16]   // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw xmm7, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, [eax + esi * 2]
    movdqa xmm3, [eax + esi * 2 + 16]
    movdqa xmm4, [eax + edi]
    movdqa xmm5, [eax + edi + 16]
    lea eax, [eax + 32]
    pavgb xmm2, xmm4
    pavgb xmm3, xmm5
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm7
    pand xmm3, xmm7
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    movdqa xmm2, xmm0         // average columns (16 to 8 pixels)
    psrlw xmm0, 8
    pand xmm2, xmm7
    pavgw xmm0, xmm2
    packuswb xmm0, xmm0

    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    pop edi
    pop esi
    ret
  }
}
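
// For reference, a C sketch of the 4x4 box filter above (illustrative name;
// the asm averages pairs of rows, then pairs of columns twice, so its
// per-stage rounding can differ slightly from this exact 16-pixel sum):
static void ScaleRowDown4Box_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    int i, j;
    for (j = 0; j < 4; ++j) {    // 4 rows
      for (i = 0; i < 4; ++i) {  // 4 columns
        sum += src_ptr[j * src_stride + x * 4 + i];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}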

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm3, kShuf0
    movdqa xmm4, kShuf1
    movdqa xmm5, kShuf2

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}
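
// For reference, the 3/4 point sample above keeps source pixels 0, 1 and 3
// of every group of 4 (see the kShuf0/kShuf1/kShuf2 patterns). A C sketch
// (illustrative name):
static void ScaleRowDown34_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}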

// Blends 32x2 rectangle to 24x1.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}
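
// For reference, a C sketch of one output triple of the filtered 3/4 scaler
// above (illustrative name). After the two source rows are averaged, the
// kShuf*/kMadd* pairs weight 4 source pixels into 3 results with weights
// (3,1), (2,2) and (1,3), then add kRound34 and shift right by 2:
static void ScaleRowDown34_Filter_CRef(const uint8* s /* averaged row */,
                                       uint8* d) {
  d[0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
  d[1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
  d[2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
}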

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm4, kShuf38a
    movdqa xmm5, kShuf38b

    align 4
  xloop:
    movdqa xmm0, [eax]        // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [eax + 16]   // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    sub ecx, 12
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    jg xloop

    ret
  }
}
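
// For reference, the 3/8 point sample above keeps source pixels 0, 3, 6, 8,
// 11 and 14 of every group of 16 (see kShuf38a/kShuf38b). A C sketch
// (illustrative name):
static void ScaleRowDown38_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; x += 6) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr[3] = src_ptr[8];
    dst_ptr[4] = src_ptr[11];
    dst_ptr[5] = src_ptr[14];
    dst_ptr += 6;
    src_ptr += 16;
  }
}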

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAc
    movdqa xmm3, kShufAc3
    movdqa xmm4, kScaleAc33
    pxor xmm5, xmm5

    align 4
  xloop:
    movdqa xmm0, [eax]        // sum up 3 rows into xmm0/1
    movdqa xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0         // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1         // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4        // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    sub ecx, 6
    movd [edx], xmm6          // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAb0
    movdqa xmm3, kShufAb1
    movdqa xmm4, kShufAb2
    movdqa xmm5, kScaleAb2

    align 4
  xloop:
    movdqa xmm0, [eax]        // average 2 rows into xmm0
    pavgb xmm0, [eax + esi]
    lea eax, [eax + 16]

    movdqa xmm1, xmm0         // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5        // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    sub ecx, 6
    movd [edx], xmm1          // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov esi, [esp + 16 + 4]    // src_ptr
    mov edx, [esp + 16 + 8]    // src_stride
    mov edi, [esp + 16 + 12]   // dst_ptr
    mov ecx, [esp + 16 + 16]   // src_width
    mov ebx, [esp + 16 + 20]   // src_height
    pxor xmm4, xmm4
    dec ebx

    align 4
  xloop:
    // first row
    movdqa xmm0, [esi]
    lea eax, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    lea esi, [esi + 16]
    mov ebp, ebx
    test ebp, ebp
    je ydone

    // sum remaining rows
    align 4
  yloop:
    movdqa xmm2, [eax]        // read 16 pixels
    lea eax, [eax + edx]      // advance to next row
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    paddusw xmm0, xmm2        // sum 16 words
    paddusw xmm1, xmm3
    sub ebp, 1
    jg yloop

    align 4
  ydone:
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm1
    lea edi, [edi + 32]

    sub ecx, 16
    jg xloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}
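
// For reference, a C equivalent of ScaleAddRows_SSE2 (illustrative name):
// each output word is the sum of src_height bytes down one column. The asm
// uses saturating adds (paddusw), which only matters for very tall sums.
static void ScaleAddRows_CRef(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint32 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);  // mimic saturation
  }
}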

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//     xor ebx, ebx
//     mov bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//     movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]     // dst_ptr
    mov esi, [esp + 12 + 8]     // src_ptr
    mov ecx, [esp + 12 + 12]    // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000         // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6          // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1         // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2           // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0        // x0 x1
    punpckldq xmm3, xmm3        // dx dx
    paddd xmm3, xmm3            // dx * 2, dx * 2
    pextrw edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2           // x0, x1 fractions.
    paddd xmm2, xmm3            // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9               // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5           // 0011
    punpcklwd xmm0, xmm4
    pxor xmm1, xmm6             // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1        // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0         // 8 bits, 2 pixels.
    movd ebx, xmm0
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2                  // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9               // 7 bit fractions.
    pshufb xmm2, xmm5           // 0011
    pxor xmm2, xmm6             // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2        // 16 bit
    psrlw xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0         // 8 bits
    movd ebx, xmm0
    mov [edi], bl

    align 4
  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}
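
// For reference, a C sketch of the per-pixel blend performed above
// (illustrative helper). x is a 16.16 fixed-point source position; the asm
// keeps a 7-bit fraction (psrlw xmm1, 9), inverts it with pxor 0x7f, and
// blends two neighboring pixels with pmaddubsw:
static uint8 ScaleFilterPixel_CRef(const uint8* src_ptr, int x) {
  int xi = x >> 16;         // integer pixel position
  int f = (x >> 9) & 0x7f;  // 7-bit fraction
  return (uint8)((src_ptr[xi] * (127 - f) + src_ptr[xi + 1] * f) >> 7);
}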

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_ptr
    mov eax, [esp + 8]    // src_ptr
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    sub ecx, 32
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
    // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
    // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_argb
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_argb
    mov ecx, [esp + 4 + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0         // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}
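
// For reference, a C sketch of the ARGB 2x2 box filter above (illustrative
// name; pavgb rounds each average, so results can differ from this exact
// sum by one per channel):
static void ScaleARGBRowDown2Box_CRef(const uint8* src_argb,
                                      ptrdiff_t src_stride,
                                      uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A channels
      int i = x * 8 + c;       // left pixel of the 2x2 block, this channel
      dst_argb[x * 4 + c] = (uint8)((src_argb[i] + src_argb[i + 4] +
                                     src_argb[i + src_stride] +
                                     src_argb[i + src_stride + 4] + 2) >> 2);
    }
  }
}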

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]    // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]   // src_stepx
    mov edx, [esp + 8 + 16]   // dst_argb
    mov ecx, [esp + 8 + 20]   // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]    // src_argb
    mov esi, [esp + 12 + 8]    // src_stride
    mov ebx, [esp + 12 + 12]   // src_stepx
    mov edx, [esp + 12 + 16]   // dst_argb
    mov ecx, [esp + 12 + 20]   // dst_width
    lea esi, [eax + esi]       // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movq xmm0, qword ptr [eax]        // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]        // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2                  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0                 // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88           // even pixels
    shufps xmm2, xmm1, 0xdd           // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]     // dst_argb
    mov esi, [esp + 8 + 8]     // src_argb
    mov ecx, [esp + 8 + 12]    // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0       // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11    // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3           // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05    // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0           // x3 x2 x1 x0
    paddd xmm3, xmm3           // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0       // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1        // get x0 integer.
    pextrw edx, xmm2, 3        // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
    align 4
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    pextrw edx, xmm2, 7         // get x3 integer.
    paddd xmm2, xmm3            // x += dx
    punpckldq xmm0, xmm1        // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    punpckldq xmm1, xmm4        // x2 x3
    punpcklqdq xmm0, xmm1       // x0 x1 x2 x3
    sub ecx, 4                  // 4 pixels
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    jge xloop4

    align 4
  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5         // get x2 integer.
    punpckldq xmm0, xmm1        // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixel.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
    align 4
  xloop99:

    pop esi
    pop edi
    ret
  }
}
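
// For reference, a C equivalent of ScaleARGBCols_SSE2 (illustrative name):
// point-sample whole ARGB pixels along a 16.16 fixed-point position.
static void ScaleARGBCols_CRef(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}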

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]     // dst_argb
    mov esi, [esp + 8 + 8]     // src_argb
    mov ecx, [esp + 8 + 12]    // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, kShuffleColARGB
    movdqa xmm5, kShuffleFractions
    pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1        // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2          // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0       // x0 x1
    punpckldq xmm3, xmm3       // dx dx
    paddd xmm3, xmm3           // dx * 2, dx * 2
    pextrw edx, xmm2, 3        // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2          // x0, x1 fractions.
    paddd xmm2, xmm3           // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9              // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5          // 0000000011111111
    pshufb xmm0, xmm4          // arrange pixels into pairs
    pxor xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1       // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1        // get x0 integer. next iteration.
    pextrw edx, xmm2, 3        // get x1 integer. next iteration.
    psrlw xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2                 // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9              // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5          // 00000000
    pshufb xmm0, xmm4          // arrange pixels into pairs
    pxor xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2       // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0        // argb 8 bits, 1 pixel.
    movd [edi], xmm0

    align 4
  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_argb
    mov eax, [esp + 8]    // src_argb
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}
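
// For reference, a portable C equivalent of FixedDiv_X86 (illustrative
// name): widen to 64 bits, shift into 16.16 and divide.
static int FixedDiv_CRef(int num, int div) {
  return (int)((((int64)(num)) << 16) / div);
}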

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    mov ecx, [esp + 8]    // denom
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
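
// For reference, a portable C equivalent of FixedDiv1_X86 (illustrative
// name), matching the asm's bias: subtract 0x00010001 from the 16.16
// numerator and 1 from the divisor.
static int FixedDiv1_CRef(int num, int div) {
  return (int)(((((int64)(num)) << 16) - 0x00010001) / (div - 1));
}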

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif