source/libvpx/third_party/libyuv/source/scale_win.cc - Issue 1302353004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/third_party/libyuv/source/scale_win.cc

Issue 1302353004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.	2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include "libyuv/row.h"	11 #include "libyuv/row.h"

	12 #include "libyuv/scale_row.h"

12	13

13 #ifdef __cplusplus	14 #ifdef __cplusplus

14 namespace libyuv {	15 namespace libyuv {

15 extern "C" {	16 extern "C" {

16 #endif	17 #endif

17	18

18 // This module is for Visual C x86.	19 // This module is for Visual C x86.

19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)	20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \

	21 defined(_MSC_VER) && !defined(__clang__)

20	22

21 // Offsets for source bytes 0 to 9	23 // Offsets for source bytes 0 to 9

22 static uvec8 kShuf0 =	24 static uvec8 kShuf0 =

23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };	25 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

24	26

25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.	27 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.

26 static uvec8 kShuf1 =	28 static uvec8 kShuf1 =

27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };	29 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

28	30

29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.	31 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
86	88

87 // Arrange third value for pixels 0,1,2,3,4,5	89 // Arrange third value for pixels 0,1,2,3,4,5

88 static uvec8 kShufAb2 =	90 static uvec8 kShufAb2 =

89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };	91 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

90	92

91 // Scaling values for boxes of 3x2 and 2x2	93 // Scaling values for boxes of 3x2 and 2x2

92 static uvec16 kScaleAb2 =	94 static uvec16 kScaleAb2 =

93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };	95 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

94	96

95 // Reads 32 pixels, throws half away and writes 16 pixels.	97 // Reads 32 pixels, throws half away and writes 16 pixels.

96 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.	98 __declspec(naked)

97 __declspec(naked) __declspec(align(16))

98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,	99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

99 uint8* dst_ptr, int dst_width) {	100 uint8* dst_ptr, int dst_width) {

100 __asm {	101 __asm {

101 mov eax, [esp + 4] // src_ptr	102 mov eax, [esp + 4] // src_ptr

102 // src_stride ignored	103 // src_stride ignored

103 mov edx, [esp + 12] // dst_ptr	104 mov edx, [esp + 12] // dst_ptr

104 mov ecx, [esp + 16] // dst_width	105 mov ecx, [esp + 16] // dst_width

105	106

106 wloop:	107 wloop:

107 movdqu xmm0, [eax]	108 movdqu xmm0, [eax]

108 movdqu xmm1, [eax + 16]	109 movdqu xmm1, [eax + 16]

109 lea eax, [eax + 32]	110 lea eax, [eax + 32]

110 psrlw xmm0, 8 // isolate odd pixels.	111 psrlw xmm0, 8 // isolate odd pixels.

111 psrlw xmm1, 8	112 psrlw xmm1, 8

112 packuswb xmm0, xmm1	113 packuswb xmm0, xmm1

113 movdqu [edx], xmm0	114 movdqu [edx], xmm0

114 lea edx, [edx + 16]	115 lea edx, [edx + 16]

115 sub ecx, 16	116 sub ecx, 16

116 jg wloop	117 jg wloop

117	118

118 ret	119 ret

119 }	120 }

120 }	121 }

121	122

122 // Blends 32x1 rectangle to 16x1.	123 // Blends 32x1 rectangle to 16x1.

123 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.	124 __declspec(naked)

124 __declspec(naked) __declspec(align(16))

125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,	125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

126 uint8* dst_ptr, int dst_width) {	126 uint8* dst_ptr, int dst_width) {

127 __asm {	127 __asm {

128 mov eax, [esp + 4] // src_ptr	128 mov eax, [esp + 4] // src_ptr

129 // src_stride	129 // src_stride

130 mov edx, [esp + 12] // dst_ptr	130 mov edx, [esp + 12] // dst_ptr

131 mov ecx, [esp + 16] // dst_width	131 mov ecx, [esp + 16] // dst_width

132 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff	132 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff

133 psrlw xmm5, 8	133 psrlw xmm5, 8

134	134

(...skipping 15 matching lines...) Expand all Loading...
150 movdqu [edx], xmm0	150 movdqu [edx], xmm0

151 lea edx, [edx + 16]	151 lea edx, [edx + 16]

152 sub ecx, 16	152 sub ecx, 16

153 jg wloop	153 jg wloop

154	154

155 ret	155 ret

156 }	156 }

157 }	157 }

158	158

159 // Blends 32x2 rectangle to 16x1.	159 // Blends 32x2 rectangle to 16x1.

160 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.	160 __declspec(naked)

161 __declspec(naked) __declspec(align(16))

162 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,	161 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

163 uint8* dst_ptr, int dst_width) {	162 uint8* dst_ptr, int dst_width) {

164 __asm {	163 __asm {

165 push esi	164 push esi

166 mov eax, [esp + 4 + 4] // src_ptr	165 mov eax, [esp + 4 + 4] // src_ptr

167 mov esi, [esp + 4 + 8] // src_stride	166 mov esi, [esp + 4 + 8] // src_stride

168 mov edx, [esp + 4 + 12] // dst_ptr	167 mov edx, [esp + 4 + 12] // dst_ptr

169 mov ecx, [esp + 4 + 16] // dst_width	168 mov ecx, [esp + 4 + 16] // dst_width

170 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff	169 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff

171 psrlw xmm5, 8	170 psrlw xmm5, 8

(...skipping 20 matching lines...) Expand all Loading...
192 movdqu [edx], xmm0	191 movdqu [edx], xmm0

193 lea edx, [edx + 16]	192 lea edx, [edx + 16]

194 sub ecx, 16	193 sub ecx, 16

195 jg wloop	194 jg wloop

196	195

197 pop esi	196 pop esi

198 ret	197 ret

199 }	198 }

200 }	199 }

201	200

	201 #ifdef HAS_SCALEROWDOWN2_AVX2

	202 // Reads 64 pixels, throws half away and writes 32 pixels.

	203 __declspec(naked)

	204 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

	205 uint8* dst_ptr, int dst_width) {

	206 __asm {

	207 mov eax, [esp + 4] // src_ptr

	208 // src_stride ignored

	209 mov edx, [esp + 12] // dst_ptr

	210 mov ecx, [esp + 16] // dst_width

	211

	212 wloop:

	213 vmovdqu ymm0, [eax]

	214 vmovdqu ymm1, [eax + 32]

	215 lea eax, [eax + 64]

	216 vpsrlw ymm0, ymm0, 8 // isolate odd pixels.

	217 vpsrlw ymm1, ymm1, 8

	218 vpackuswb ymm0, ymm0, ymm1

	219 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	220 vmovdqu [edx], ymm0

	221 lea edx, [edx + 32]

	222 sub ecx, 32

	223 jg wloop

	224

	225 vzeroupper

	226 ret

	227 }

	228 }

	229

	230 // Blends 64x1 rectangle to 32x1.

	231 __declspec(naked)

	232 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

	233 uint8* dst_ptr, int dst_width) {

	234 __asm {

	235 mov eax, [esp + 4] // src_ptr

	236 // src_stride

	237 mov edx, [esp + 12] // dst_ptr

	238 mov ecx, [esp + 16] // dst_width

	239

	240 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b

	241 vpsrlw ymm4, ymm4, 15

	242 vpackuswb ymm4, ymm4, ymm4

	243 vpxor ymm5, ymm5, ymm5 // constant 0

	244

	245 wloop:

	246 vmovdqu ymm0, [eax]

	247 vmovdqu ymm1, [eax + 32]

	248 lea eax, [eax + 64]

	249

	250 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally

	251 vpmaddubsw ymm1, ymm1, ymm4

	252 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2

	253 vpavgw ymm1, ymm1, ymm5

	254 vpackuswb ymm0, ymm0, ymm1

	255 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	256

	257 vmovdqu [edx], ymm0

	258 lea edx, [edx + 32]

	259 sub ecx, 32

	260 jg wloop

	261

	262 vzeroupper

	263 ret

	264 }

	265 }

	266

	267 // Blends 64x2 rectangle to 32x1.

	268 __declspec(naked)

	269 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

	270 uint8* dst_ptr, int dst_width) {

	271 __asm {

	272 push esi

	273 mov eax, [esp + 4 + 4] // src_ptr

	274 mov esi, [esp + 4 + 8] // src_stride

	275 mov edx, [esp + 4 + 12] // dst_ptr

	276 mov ecx, [esp + 4 + 16] // dst_width

	277

	278 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b

	279 vpsrlw ymm4, ymm4, 15

	280 vpackuswb ymm4, ymm4, ymm4

	281 vpxor ymm5, ymm5, ymm5 // constant 0

	282

	283 wloop:

	284 vmovdqu ymm0, [eax] // average rows

	285 vmovdqu ymm1, [eax + 32]

	286 vpavgb ymm0, ymm0, [eax + esi]

	287 vpavgb ymm1, ymm1, [eax + esi + 32]

	288 lea eax, [eax + 64]

	289

	290 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally

	291 vpmaddubsw ymm1, ymm1, ymm4

	292 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2

	293 vpavgw ymm1, ymm1, ymm5

	294 vpackuswb ymm0, ymm0, ymm1

	295 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	296

	297 vmovdqu [edx], ymm0

	298 lea edx, [edx + 32]

	299 sub ecx, 32

	300 jg wloop

	301

	302 pop esi

	303 vzeroupper

	304 ret

	305 }

	306 }

	307 #endif // HAS_SCALEROWDOWN2_AVX2

	308

202 // Point samples 32 pixels to 8 pixels.	309 // Point samples 32 pixels to 8 pixels.

203 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.	310 __declspec(naked)

204 __declspec(naked) __declspec(align(16))

205 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,	311 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

206 uint8* dst_ptr, int dst_width) {	312 uint8* dst_ptr, int dst_width) {

207 __asm {	313 __asm {

208 mov eax, [esp + 4] // src_ptr	314 mov eax, [esp + 4] // src_ptr

209 // src_stride ignored	315 // src_stride ignored

210 mov edx, [esp + 12] // dst_ptr	316 mov edx, [esp + 12] // dst_ptr

211 mov ecx, [esp + 16] // dst_width	317 mov ecx, [esp + 16] // dst_width

212 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000	318 pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000

213 psrld xmm5, 24	319 psrld xmm5, 24

214 pslld xmm5, 16	320 pslld xmm5, 16

(...skipping 10 matching lines...) Expand all Loading...
225 movq qword ptr [edx], xmm0	331 movq qword ptr [edx], xmm0

226 lea edx, [edx + 8]	332 lea edx, [edx + 8]

227 sub ecx, 8	333 sub ecx, 8

228 jg wloop	334 jg wloop

229	335

230 ret	336 ret

231 }	337 }

232 }	338 }

233	339

234 // Blends 32x4 rectangle to 8x1.	340 // Blends 32x4 rectangle to 8x1.

235 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.	341 __declspec(naked)

236 __declspec(naked) __declspec(align(16))

237 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,	342 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

238 uint8* dst_ptr, int dst_width) {	343 uint8* dst_ptr, int dst_width) {

239 __asm {	344 __asm {

240 push esi	345 push esi

241 push edi	346 push edi

242 mov eax, [esp + 8 + 4] // src_ptr	347 mov eax, [esp + 8 + 4] // src_ptr

243 mov esi, [esp + 8 + 8] // src_stride	348 mov esi, [esp + 8 + 8] // src_stride

244 mov edx, [esp + 8 + 12] // dst_ptr	349 mov edx, [esp + 8 + 12] // dst_ptr

245 mov ecx, [esp + 8 + 16] // dst_width	350 mov ecx, [esp + 8 + 16] // dst_width

246 lea edi, [esi + esi * 2] // src_stride * 3	351 lea edi, [esi + esi * 2] // src_stride * 3

247 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff	352 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff

248 psrlw xmm7, 8	353 psrlw xmm7, 8

249	354

250 wloop:	355 wloop:

251 movdqu xmm0, [eax]	356 movdqu xmm0, [eax] // average rows

252 movdqu xmm1, [eax + 16]	357 movdqu xmm1, [eax + 16]

253 movdqu xmm2, [eax + esi]	358 movdqu xmm2, [eax + esi]

254 movdqu xmm3, [eax + esi + 16]	359 movdqu xmm3, [eax + esi + 16]

255 pavgb xmm0, xmm2 // average rows	360 pavgb xmm0, xmm2

256 pavgb xmm1, xmm3	361 pavgb xmm1, xmm3

257 movdqu xmm2, [eax + esi * 2]	362 movdqu xmm2, [eax + esi * 2]

258 movdqu xmm3, [eax + esi * 2 + 16]	363 movdqu xmm3, [eax + esi * 2 + 16]

259 movdqu xmm4, [eax + edi]	364 movdqu xmm4, [eax + edi]

260 movdqu xmm5, [eax + edi + 16]	365 movdqu xmm5, [eax + edi + 16]

261 lea eax, [eax + 32]	366 lea eax, [eax + 32]

262 pavgb xmm2, xmm4	367 pavgb xmm2, xmm4

263 pavgb xmm3, xmm5	368 pavgb xmm3, xmm5

264 pavgb xmm0, xmm2	369 pavgb xmm0, xmm2

265 pavgb xmm1, xmm3	370 pavgb xmm1, xmm3

(...skipping 18 matching lines...) Expand all Loading...
284 lea edx, [edx + 8]	389 lea edx, [edx + 8]

285 sub ecx, 8	390 sub ecx, 8

286 jg wloop	391 jg wloop

287	392

288 pop edi	393 pop edi

289 pop esi	394 pop esi

290 ret	395 ret

291 }	396 }

292 }	397 }

293	398

	399 #ifdef HAS_SCALEROWDOWN4_AVX2

	400 // Point samples 64 pixels to 16 pixels.

	401 __declspec(naked)

	402 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

	403 uint8* dst_ptr, int dst_width) {

	404 __asm {

	405 mov eax, [esp + 4] // src_ptr

	406 // src_stride ignored

	407 mov edx, [esp + 12] // dst_ptr

	408 mov ecx, [esp + 16] // dst_width

	409 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000

	410 vpsrld ymm5, ymm5, 24

	411 vpslld ymm5, ymm5, 16

	412

	413 wloop:

	414 vmovdqu ymm0, [eax]

	415 vmovdqu ymm1, [eax + 32]

	416 lea eax, [eax + 64]

	417 vpand ymm0, ymm0, ymm5

	418 vpand ymm1, ymm1, ymm5

	419 vpackuswb ymm0, ymm0, ymm1

	420 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	421 vpsrlw ymm0, ymm0, 8

	422 vpackuswb ymm0, ymm0, ymm0

	423 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	424 vmovdqu [edx], xmm0

	425 lea edx, [edx + 16]

	426 sub ecx, 16

	427 jg wloop

	428

	429 vzeroupper

	430 ret

	431 }

	432 }

	433

	434 // Blends 64x4 rectangle to 16x1.

	435 __declspec(naked)

	436 void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,

	437 uint8* dst_ptr, int dst_width) {

	438 __asm {

	439 push esi

	440 push edi

	441 mov eax, [esp + 8 + 4] // src_ptr

	442 mov esi, [esp + 8 + 8] // src_stride

	443 mov edx, [esp + 8 + 12] // dst_ptr

	444 mov ecx, [esp + 8 + 16] // dst_width

	445 lea edi, [esi + esi * 2] // src_stride * 3

	446 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff

	447 vpsrlw ymm7, ymm7, 8

	448

	449 wloop:

	450 vmovdqu ymm0, [eax] // average rows

	451 vmovdqu ymm1, [eax + 32]

	452 vpavgb ymm0, ymm0, [eax + esi]

	453 vpavgb ymm1, ymm1, [eax + esi + 32]

	454 vmovdqu ymm2, [eax + esi * 2]

	455 vmovdqu ymm3, [eax + esi * 2 + 32]

	456 vpavgb ymm2, ymm2, [eax + edi]

	457 vpavgb ymm3, ymm3, [eax + edi + 32]

	458 lea eax, [eax + 64]

	459 vpavgb ymm0, ymm0, ymm2

	460 vpavgb ymm1, ymm1, ymm3

	461

	462 vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)

	463 vpand ymm3, ymm1, ymm7

	464 vpsrlw ymm0, ymm0, 8

	465 vpsrlw ymm1, ymm1, 8

	466 vpavgw ymm0, ymm0, ymm2

	467 vpavgw ymm1, ymm1, ymm3

	468 vpackuswb ymm0, ymm0, ymm1

	469 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	470

	471 vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)

	472 vpsrlw ymm0, ymm0, 8

	473 vpavgw ymm0, ymm0, ymm2

	474 vpackuswb ymm0, ymm0, ymm0

	475 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

	476

	477 vmovdqu [edx], xmm0

	478 lea edx, [edx + 16]

	479 sub ecx, 16

	480 jg wloop

	481

	482 pop edi

	483 pop esi

	484 vzeroupper

	485 ret

	486 }

	487 }

	488 #endif // HAS_SCALEROWDOWN4_AVX2

	489

294 // Point samples 32 pixels to 24 pixels.	490 // Point samples 32 pixels to 24 pixels.

295 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.	491 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.

296 // Then shuffled to do the scaling.	492 // Then shuffled to do the scaling.

297	493

298 // Note that movdqa+palign may be better than movdqu.	494 __declspec(naked)

299 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.

300 __declspec(naked) __declspec(align(16))

301 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,	495 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

302 uint8* dst_ptr, int dst_width) {	496 uint8* dst_ptr, int dst_width) {

303 __asm {	497 __asm {

304 mov eax, [esp + 4] // src_ptr	498 mov eax, [esp + 4] // src_ptr

305 // src_stride ignored	499 // src_stride ignored

306 mov edx, [esp + 12] // dst_ptr	500 mov edx, [esp + 12] // dst_ptr

307 mov ecx, [esp + 16] // dst_width	501 mov ecx, [esp + 16] // dst_width

308 movdqa xmm3, kShuf0	502 movdqa xmm3, kShuf0

309 movdqa xmm4, kShuf1	503 movdqa xmm4, kShuf1

310 movdqa xmm5, kShuf2	504 movdqa xmm5, kShuf2

(...skipping 26 matching lines...) Expand all Loading...
337 // xmm0 src_row 0	531 // xmm0 src_row 0

338 // xmm1 src_row 1	532 // xmm1 src_row 1

339 // xmm2 shuf 0	533 // xmm2 shuf 0

340 // xmm3 shuf 1	534 // xmm3 shuf 1

341 // xmm4 shuf 2	535 // xmm4 shuf 2

342 // xmm5 madd 0	536 // xmm5 madd 0

343 // xmm6 madd 1	537 // xmm6 madd 1

344 // xmm7 kRound34	538 // xmm7 kRound34

345	539

346 // Note that movdqa+palign may be better than movdqu.	540 // Note that movdqa+palign may be better than movdqu.

347 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.	541 __declspec(naked)

348 __declspec(naked) __declspec(align(16))

349 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,	542 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,

350 ptrdiff_t src_stride,	543 ptrdiff_t src_stride,

351 uint8* dst_ptr, int dst_width) {	544 uint8* dst_ptr, int dst_width) {

352 __asm {	545 __asm {

353 push esi	546 push esi

354 mov eax, [esp + 4 + 4] // src_ptr	547 mov eax, [esp + 4 + 4] // src_ptr

355 mov esi, [esp + 4 + 8] // src_stride	548 mov esi, [esp + 4 + 8] // src_stride

356 mov edx, [esp + 4 + 12] // dst_ptr	549 mov edx, [esp + 4 + 12] // dst_ptr

357 mov ecx, [esp + 4 + 16] // dst_width	550 mov ecx, [esp + 4 + 16] // dst_width

358 movdqa xmm2, kShuf01	551 movdqa xmm2, kShuf01

(...skipping 36 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
395 lea edx, [edx + 24]	588 lea edx, [edx + 24]

396 sub ecx, 24	589 sub ecx, 24

397 jg wloop	590 jg wloop

398	591

399 pop esi	592 pop esi

400 ret	593 ret

401 }	594 }

402 }	595 }

403	596

404 // Note that movdqa+palign may be better than movdqu.	597 // Note that movdqa+palign may be better than movdqu.

405 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.	598 __declspec(naked)

406 __declspec(naked) __declspec(align(16))

407 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,	599 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,

408 ptrdiff_t src_stride,	600 ptrdiff_t src_stride,

409 uint8* dst_ptr, int dst_width) {	601 uint8* dst_ptr, int dst_width) {

410 __asm {	602 __asm {

411 push esi	603 push esi

412 mov eax, [esp + 4 + 4] // src_ptr	604 mov eax, [esp + 4 + 4] // src_ptr

413 mov esi, [esp + 4 + 8] // src_stride	605 mov esi, [esp + 4 + 8] // src_stride

414 mov edx, [esp + 4 + 12] // dst_ptr	606 mov edx, [esp + 4 + 12] // dst_ptr

415 mov ecx, [esp + 4 + 16] // dst_width	607 mov ecx, [esp + 4 + 16] // dst_width

416 movdqa xmm2, kShuf01	608 movdqa xmm2, kShuf01

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
458 jg wloop	650 jg wloop

459	651

460 pop esi	652 pop esi

461 ret	653 ret

462 }	654 }

463 }	655 }

464	656

465 // 3/8 point sampler	657 // 3/8 point sampler

466	658

467 // Scale 32 pixels to 12	659 // Scale 32 pixels to 12

468 __declspec(naked) __declspec(align(16))	660 __declspec(naked)

469 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,	661 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,

470 uint8* dst_ptr, int dst_width) {	662 uint8* dst_ptr, int dst_width) {

471 __asm {	663 __asm {

472 mov eax, [esp + 4] // src_ptr	664 mov eax, [esp + 4] // src_ptr

473 // src_stride ignored	665 // src_stride ignored

474 mov edx, [esp + 12] // dst_ptr	666 mov edx, [esp + 12] // dst_ptr

475 mov ecx, [esp + 16] // dst_width	667 mov ecx, [esp + 16] // dst_width

476 movdqa xmm4, kShuf38a	668 movdqa xmm4, kShuf38a

477 movdqa xmm5, kShuf38b	669 movdqa xmm5, kShuf38b

478	670

(...skipping 10 matching lines...) Expand all Loading...
489 movd [edx + 8], xmm1	681 movd [edx + 8], xmm1

490 lea edx, [edx + 12]	682 lea edx, [edx + 12]

491 sub ecx, 12	683 sub ecx, 12

492 jg xloop	684 jg xloop

493	685

494 ret	686 ret

495 }	687 }

496 }	688 }

497	689

498 // Scale 16x3 pixels to 6x1 with interpolation	690 // Scale 16x3 pixels to 6x1 with interpolation

499 __declspec(naked) __declspec(align(16))	691 __declspec(naked)

500 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,	692 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,

501 ptrdiff_t src_stride,	693 ptrdiff_t src_stride,

502 uint8* dst_ptr, int dst_width) {	694 uint8* dst_ptr, int dst_width) {

503 __asm {	695 __asm {

504 push esi	696 push esi

505 mov eax, [esp + 4 + 4] // src_ptr	697 mov eax, [esp + 4 + 4] // src_ptr

506 mov esi, [esp + 4 + 8] // src_stride	698 mov esi, [esp + 4 + 8] // src_stride

507 mov edx, [esp + 4 + 12] // dst_ptr	699 mov edx, [esp + 4 + 12] // dst_ptr

508 mov ecx, [esp + 4 + 16] // dst_width	700 mov ecx, [esp + 4 + 16] // dst_width

509 movdqa xmm2, kShufAc	701 movdqa xmm2, kShufAc

(...skipping 44 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
554 lea edx, [edx + 6]	746 lea edx, [edx + 6]

555 sub ecx, 6	747 sub ecx, 6

556 jg xloop	748 jg xloop

557	749

558 pop esi	750 pop esi

559 ret	751 ret

560 }	752 }

561 }	753 }

562	754

563 // Scale 16x2 pixels to 6x1 with interpolation	755 // Scale 16x2 pixels to 6x1 with interpolation

564 __declspec(naked) __declspec(align(16))	756 __declspec(naked)

565 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,	757 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,

566 ptrdiff_t src_stride,	758 ptrdiff_t src_stride,

567 uint8* dst_ptr, int dst_width) {	759 uint8* dst_ptr, int dst_width) {

568 __asm {	760 __asm {

569 push esi	761 push esi

570 mov eax, [esp + 4 + 4] // src_ptr	762 mov eax, [esp + 4 + 4] // src_ptr

571 mov esi, [esp + 4 + 8] // src_stride	763 mov esi, [esp + 4 + 8] // src_stride

572 mov edx, [esp + 4 + 12] // dst_ptr	764 mov edx, [esp + 4 + 12] // dst_ptr

573 mov ecx, [esp + 4 + 16] // dst_width	765 mov ecx, [esp + 4 + 16] // dst_width

574 movdqa xmm2, kShufAb0	766 movdqa xmm2, kShufAb0

(...skipping 23 matching lines...) Expand all Loading...
598 movd [edx + 2], xmm1	790 movd [edx + 2], xmm1

599 lea edx, [edx + 6]	791 lea edx, [edx + 6]

600 sub ecx, 6	792 sub ecx, 6

601 jg xloop	793 jg xloop

602	794

603 pop esi	795 pop esi

604 ret	796 ret

605 }	797 }

606 }	798 }

607	799

608 // Reads 16xN bytes and produces 16 shorts at a time.	800 // Reads 16 bytes and accumulates to 16 shorts at a time.

609 // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.	801 __declspec(naked)

610 __declspec(naked) __declspec(align(16))	802 void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {

611 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

612 uint16* dst_ptr, int src_width,

613 int src_height) {

614 __asm {	803 __asm {

615 push esi	804 mov eax, [esp + 4] // src_ptr

616 push edi	805 mov edx, [esp + 8] // dst_ptr

617 push ebx	806 mov ecx, [esp + 12] // src_width

618 push ebp	807 pxor xmm5, xmm5

619 mov esi, [esp + 16 + 4] // src_ptr

620 mov edx, [esp + 16 + 8] // src_stride

621 mov edi, [esp + 16 + 12] // dst_ptr

622 mov ecx, [esp + 16 + 16] // dst_width

623 mov ebx, [esp + 16 + 20] // height

624 pxor xmm4, xmm4

625 dec ebx

626	808

	809 // sum rows

627 xloop:	810 xloop:

628 // first row	811 movdqu xmm3, [eax] // read 16 bytes

629 movdqu xmm0, [esi]	812 lea eax, [eax + 16]

630 lea eax, [esi + edx]	813 movdqu xmm0, [edx] // read 16 words from destination

631 movdqa xmm1, xmm0	814 movdqu xmm1, [edx + 16]

632 punpcklbw xmm0, xmm4	815 movdqa xmm2, xmm3

633 punpckhbw xmm1, xmm4	816 punpcklbw xmm2, xmm5

634 lea esi, [esi + 16]	817 punpckhbw xmm3, xmm5

635 mov ebp, ebx

636 test ebp, ebp

637 je ydone

638

639 // sum remaining rows

640 yloop:

641 movdqu xmm2, [eax] // read 16 pixels

642 lea eax, [eax + edx] // advance to next row

643 movdqa xmm3, xmm2

644 punpcklbw xmm2, xmm4

645 punpckhbw xmm3, xmm4

646 paddusw xmm0, xmm2 // sum 16 words	818 paddusw xmm0, xmm2 // sum 16 words

647 paddusw xmm1, xmm3	819 paddusw xmm1, xmm3

648 sub ebp, 1	820 movdqu [edx], xmm0 // write 16 words to destination

649 jg yloop	821 movdqu [edx + 16], xmm1

650	822 lea edx, [edx + 32]

651 ydone:

652 movdqu [edi], xmm0

653 movdqu [edi + 16], xmm1

654 lea edi, [edi + 32]

655

656 sub ecx, 16	823 sub ecx, 16

657 jg xloop	824 jg xloop

658

659 pop ebp

660 pop ebx

661 pop edi

662 pop esi

663 ret	825 ret

664 }	826 }

665 }	827 }

666	828

	829 #ifdef HAS_SCALEADDROW_AVX2

	830 // Reads 32 bytes and accumulates to 32 shorts at a time.

	831 __declspec(naked)

	832 void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {

	833 __asm {

	834 mov eax, [esp + 4] // src_ptr

	835 mov edx, [esp + 8] // dst_ptr

	836 mov ecx, [esp + 12] // src_width

	837 vpxor ymm5, ymm5, ymm5

	838

	839 // sum rows

	840 xloop:

	841 vmovdqu ymm3, [eax] // read 32 bytes

	842 lea eax, [eax + 32]

	843 vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck

	844 vpunpcklbw ymm2, ymm3, ymm5

	845 vpunpckhbw ymm3, ymm3, ymm5

	846 vpaddusw ymm0, ymm2, [edx] // sum 16 words

	847 vpaddusw ymm1, ymm3, [edx + 32]

	848 vmovdqu [edx], ymm0 // write 32 words to destination

	849 vmovdqu [edx + 32], ymm1

	850 lea edx, [edx + 64]

	851 sub ecx, 32

	852 jg xloop

	853

	854 vzeroupper

	855 ret

	856 }

	857 }

	858 #endif // HAS_SCALEADDROW_AVX2

	859

667 // Bilinear column filtering. SSSE3 version.	860 // Bilinear column filtering. SSSE3 version.

668 // TODO(fbarchard): Port to Neon	861 __declspec(naked)

669 // TODO(fbarchard): Switch the following:

670 // xor ebx, ebx

671 // mov bx, word ptr [esi + eax] // 2 source x0 pixels

672 // To

673 // movzx ebx, word ptr [esi + eax] // 2 source x0 pixels

674 // when drmemory bug fixed.

675 // https://code.google.com/p/drmemory/issues/detail?id=1396

676

677 __declspec(naked) __declspec(align(16))

678 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,	862 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

679 int dst_width, int x, int dx) {	863 int dst_width, int x, int dx) {

680 __asm {	864 __asm {

681 push ebx	865 push ebx

682 push esi	866 push esi

683 push edi	867 push edi

684 mov edi, [esp + 12 + 4] // dst_ptr	868 mov edi, [esp + 12 + 4] // dst_ptr

685 mov esi, [esp + 12 + 8] // src_ptr	869 mov esi, [esp + 12 + 8] // src_ptr

686 mov ecx, [esp + 12 + 12] // dst_width	870 mov ecx, [esp + 12 + 12] // dst_width

687 movd xmm2, [esp + 12 + 16] // x	871 movd xmm2, [esp + 12 + 16] // x

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
744 xloop99:	928 xloop99:

745	929

746 pop edi	930 pop edi

747 pop esi	931 pop esi

748 pop ebx	932 pop ebx

749 ret	933 ret

750 }	934 }

751 }	935 }

752	936

753 // Reads 16 pixels, duplicates them and writes 32 pixels.	937 // Reads 16 pixels, duplicates them and writes 32 pixels.

754 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.	938 __declspec(naked)

755 __declspec(naked) __declspec(align(16))

756 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,	939 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,

757 int dst_width, int x, int dx) {	940 int dst_width, int x, int dx) {

758 __asm {	941 __asm {

759 mov edx, [esp + 4] // dst_ptr	942 mov edx, [esp + 4] // dst_ptr

760 mov eax, [esp + 8] // src_ptr	943 mov eax, [esp + 8] // src_ptr

761 mov ecx, [esp + 12] // dst_width	944 mov ecx, [esp + 12] // dst_width

762	945

763 wloop:	946 wloop:

764 movdqu xmm0, [eax]	947 movdqu xmm0, [eax]

765 lea eax, [eax + 16]	948 lea eax, [eax + 16]

766 movdqa xmm1, xmm0	949 movdqa xmm1, xmm0

767 punpcklbw xmm0, xmm0	950 punpcklbw xmm0, xmm0

768 punpckhbw xmm1, xmm1	951 punpckhbw xmm1, xmm1

769 movdqu [edx], xmm0	952 movdqu [edx], xmm0

770 movdqu [edx + 16], xmm1	953 movdqu [edx + 16], xmm1

771 lea edx, [edx + 32]	954 lea edx, [edx + 32]

772 sub ecx, 32	955 sub ecx, 32

773 jg wloop	956 jg wloop

774	957

775 ret	958 ret

776 }	959 }

777 }	960 }

778	961

779 // Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)	962 // Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)

780 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.	963 __declspec(naked)

781 __declspec(naked) __declspec(align(16))

782 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,	964 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,

783 ptrdiff_t src_stride,	965 ptrdiff_t src_stride,

784 uint8* dst_argb, int dst_width) {	966 uint8* dst_argb, int dst_width) {

785 __asm {	967 __asm {

786 mov eax, [esp + 4] // src_argb	968 mov eax, [esp + 4] // src_argb

787 // src_stride ignored	969 // src_stride ignored

788 mov edx, [esp + 12] // dst_argb	970 mov edx, [esp + 12] // dst_argb

789 mov ecx, [esp + 16] // dst_width	971 mov ecx, [esp + 16] // dst_width

790	972

791 wloop:	973 wloop:

792 movdqu xmm0, [eax]	974 movdqu xmm0, [eax]

793 movdqu xmm1, [eax + 16]	975 movdqu xmm1, [eax + 16]

794 lea eax, [eax + 32]	976 lea eax, [eax + 32]

795 shufps xmm0, xmm1, 0xdd	977 shufps xmm0, xmm1, 0xdd

796 movdqu [edx], xmm0	978 movdqu [edx], xmm0

797 lea edx, [edx + 16]	979 lea edx, [edx + 16]

798 sub ecx, 4	980 sub ecx, 4

799 jg wloop	981 jg wloop

800	982

801 ret	983 ret

802 }	984 }

803 }	985 }

804	986

805 // Blends 8x1 rectangle to 4x1.	987 // Blends 8x1 rectangle to 4x1.

806 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.	988 __declspec(naked)

807 __declspec(naked) __declspec(align(16))

808 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,	989 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,

809 ptrdiff_t src_stride,	990 ptrdiff_t src_stride,

810 uint8* dst_argb, int dst_width) {	991 uint8* dst_argb, int dst_width) {

811 __asm {	992 __asm {

812 mov eax, [esp + 4] // src_argb	993 mov eax, [esp + 4] // src_argb

813 // src_stride ignored	994 // src_stride ignored

814 mov edx, [esp + 12] // dst_argb	995 mov edx, [esp + 12] // dst_argb

815 mov ecx, [esp + 16] // dst_width	996 mov ecx, [esp + 16] // dst_width

816	997

817 wloop:	998 wloop:

818 movdqu xmm0, [eax]	999 movdqu xmm0, [eax]

819 movdqu xmm1, [eax + 16]	1000 movdqu xmm1, [eax + 16]

820 lea eax, [eax + 32]	1001 lea eax, [eax + 32]

821 movdqa xmm2, xmm0	1002 movdqa xmm2, xmm0

822 shufps xmm0, xmm1, 0x88 // even pixels	1003 shufps xmm0, xmm1, 0x88 // even pixels

823 shufps xmm2, xmm1, 0xdd // odd pixels	1004 shufps xmm2, xmm1, 0xdd // odd pixels

824 pavgb xmm0, xmm2	1005 pavgb xmm0, xmm2

825 movdqu [edx], xmm0	1006 movdqu [edx], xmm0

826 lea edx, [edx + 16]	1007 lea edx, [edx + 16]

827 sub ecx, 4	1008 sub ecx, 4

828 jg wloop	1009 jg wloop

829	1010

830 ret	1011 ret

831 }	1012 }

832 }	1013 }

833	1014

834 // Blends 8x2 rectangle to 4x1.	1015 // Blends 8x2 rectangle to 4x1.

835 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.	1016 __declspec(naked)

836 __declspec(naked) __declspec(align(16))

837 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,	1017 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,

838 ptrdiff_t src_stride,	1018 ptrdiff_t src_stride,

839 uint8* dst_argb, int dst_width) {	1019 uint8* dst_argb, int dst_width) {

840 __asm {	1020 __asm {

841 push esi	1021 push esi

842 mov eax, [esp + 4 + 4] // src_argb	1022 mov eax, [esp + 4 + 4] // src_argb

843 mov esi, [esp + 4 + 8] // src_stride	1023 mov esi, [esp + 4 + 8] // src_stride

844 mov edx, [esp + 4 + 12] // dst_argb	1024 mov edx, [esp + 4 + 12] // dst_argb

845 mov ecx, [esp + 4 + 16] // dst_width	1025 mov ecx, [esp + 4 + 16] // dst_width

846	1026

(...skipping 13 matching lines...) Expand all Loading...
860 lea edx, [edx + 16]	1040 lea edx, [edx + 16]

861 sub ecx, 4	1041 sub ecx, 4

862 jg wloop	1042 jg wloop

863	1043

864 pop esi	1044 pop esi

865 ret	1045 ret

866 }	1046 }

867 }	1047 }

868	1048

869 // Reads 4 pixels at a time.	1049 // Reads 4 pixels at a time.

870 // Alignment requirement: dst_argb 16 byte aligned.	1050 __declspec(naked)

871 __declspec(naked) __declspec(align(16))

872 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,	1051 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,

873 int src_stepx,	1052 int src_stepx,

874 uint8* dst_argb, int dst_width) {	1053 uint8* dst_argb, int dst_width) {

875 __asm {	1054 __asm {

876 push ebx	1055 push ebx

877 push edi	1056 push edi

878 mov eax, [esp + 8 + 4] // src_argb	1057 mov eax, [esp + 8 + 4] // src_argb

879 // src_stride ignored	1058 // src_stride ignored

880 mov ebx, [esp + 8 + 12] // src_stepx	1059 mov ebx, [esp + 8 + 12] // src_stepx

881 mov edx, [esp + 8 + 16] // dst_argb	1060 mov edx, [esp + 8 + 16] // dst_argb

(...skipping 15 matching lines...) Expand all Loading...
897 sub ecx, 4	1076 sub ecx, 4

898 jg wloop	1077 jg wloop

899	1078

900 pop edi	1079 pop edi

901 pop ebx	1080 pop ebx

902 ret	1081 ret

903 }	1082 }

904 }	1083 }

905	1084

906 // Blends four 2x2 to 4x1.	1085 // Blends four 2x2 to 4x1.

907 // Alignment requirement: dst_argb 16 byte aligned.	1086 __declspec(naked)

908 __declspec(naked) __declspec(align(16))

909 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,	1087 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,

910 ptrdiff_t src_stride,	1088 ptrdiff_t src_stride,

911 int src_stepx,	1089 int src_stepx,

912 uint8* dst_argb, int dst_width) {	1090 uint8* dst_argb, int dst_width) {

913 __asm {	1091 __asm {

914 push ebx	1092 push ebx

915 push esi	1093 push esi

916 push edi	1094 push edi

917 mov eax, [esp + 12 + 4] // src_argb	1095 mov eax, [esp + 12 + 4] // src_argb

918 mov esi, [esp + 12 + 8] // src_stride	1096 mov esi, [esp + 12 + 8] // src_stride

(...skipping 27 matching lines...) Expand all Loading...
946 jg wloop	1124 jg wloop

947	1125

948 pop edi	1126 pop edi

949 pop esi	1127 pop esi

950 pop ebx	1128 pop ebx

951 ret	1129 ret

952 }	1130 }

953 }	1131 }

954	1132

955 // Column scaling unfiltered. SSE2 version.	1133 // Column scaling unfiltered. SSE2 version.

956 __declspec(naked) __declspec(align(16))	1134 __declspec(naked)

957 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,	1135 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,

958 int dst_width, int x, int dx) {	1136 int dst_width, int x, int dx) {

959 __asm {	1137 __asm {

960 push edi	1138 push edi

961 push esi	1139 push esi

962 mov edi, [esp + 8 + 4] // dst_argb	1140 mov edi, [esp + 8 + 4] // dst_argb

963 mov esi, [esp + 8 + 8] // src_argb	1141 mov esi, [esp + 8 + 8] // src_argb

964 mov ecx, [esp + 8 + 12] // dst_width	1142 mov ecx, [esp + 8 + 12] // dst_width

965 movd xmm2, [esp + 8 + 16] // x	1143 movd xmm2, [esp + 8 + 16] // x

966 movd xmm3, [esp + 8 + 20] // dx	1144 movd xmm3, [esp + 8 + 20] // dx

(...skipping 70 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1037 static uvec8 kShuffleColARGB = {	1215 static uvec8 kShuffleColARGB = {

1038 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel	1216 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel

1039 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel	1217 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel

1040 };	1218 };

1041	1219

1042 // Shuffle table for duplicating 2 fractions into 8 bytes each	1220 // Shuffle table for duplicating 2 fractions into 8 bytes each

1043 static uvec8 kShuffleFractions = {	1221 static uvec8 kShuffleFractions = {

1044 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,	1222 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,

1045 };	1223 };

1046	1224

1047 __declspec(naked) __declspec(align(16))	1225 __declspec(naked)

1048 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,	1226 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,

1049 int dst_width, int x, int dx) {	1227 int dst_width, int x, int dx) {

1050 __asm {	1228 __asm {

1051 push esi	1229 push esi

1052 push edi	1230 push edi

1053 mov edi, [esp + 8 + 4] // dst_argb	1231 mov edi, [esp + 8 + 4] // dst_argb

1054 mov esi, [esp + 8 + 8] // src_argb	1232 mov esi, [esp + 8 + 8] // src_argb

1055 mov ecx, [esp + 8 + 12] // dst_width	1233 mov ecx, [esp + 8 + 12] // dst_width

1056 movd xmm2, [esp + 8 + 16] // x	1234 movd xmm2, [esp + 8 + 16] // x

1057 movd xmm3, [esp + 8 + 20] // dx	1235 movd xmm3, [esp + 8 + 20] // dx

(...skipping 50 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1108	1286

1109 xloop99:	1287 xloop99:

1110	1288

1111 pop edi	1289 pop edi

1112 pop esi	1290 pop esi

1113 ret	1291 ret

1114 }	1292 }

1115 }	1293 }

1116	1294

1117 // Reads 4 pixels, duplicates them and writes 8 pixels.	1295 // Reads 4 pixels, duplicates them and writes 8 pixels.

1118 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.	1296 __declspec(naked)

1119 __declspec(naked) __declspec(align(16))

1120 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,	1297 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,

1121 int dst_width, int x, int dx) {	1298 int dst_width, int x, int dx) {

1122 __asm {	1299 __asm {

1123 mov edx, [esp + 4] // dst_argb	1300 mov edx, [esp + 4] // dst_argb

1124 mov eax, [esp + 8] // src_argb	1301 mov eax, [esp + 8] // src_argb

1125 mov ecx, [esp + 12] // dst_width	1302 mov ecx, [esp + 12] // dst_width

1126	1303

1127 wloop:	1304 wloop:

1128 movdqu xmm0, [eax]	1305 movdqu xmm0, [eax]

1129 lea eax, [eax + 16]	1306 lea eax, [eax + 16]

1130 movdqa xmm1, xmm0	1307 movdqa xmm1, xmm0

1131 punpckldq xmm0, xmm0	1308 punpckldq xmm0, xmm0

1132 punpckhdq xmm1, xmm1	1309 punpckhdq xmm1, xmm1

1133 movdqu [edx], xmm0	1310 movdqu [edx], xmm0

1134 movdqu [edx + 16], xmm1	1311 movdqu [edx + 16], xmm1

1135 lea edx, [edx + 32]	1312 lea edx, [edx + 32]

1136 sub ecx, 8	1313 sub ecx, 8

1137 jg wloop	1314 jg wloop

1138	1315

1139 ret	1316 ret

1140 }	1317 }

1141 }	1318 }

1142	1319

1143 // Divide num by div and return as 16.16 fixed point result.	1320 // Divide num by div and return as 16.16 fixed point result.

1144 __declspec(naked) __declspec(align(16))	1321 __declspec(naked)

1145 int FixedDiv_X86(int num, int div) {	1322 int FixedDiv_X86(int num, int div) {

1146 __asm {	1323 __asm {

1147 mov eax, [esp + 4] // num	1324 mov eax, [esp + 4] // num

1148 cdq // extend num to 64 bits	1325 cdq // extend num to 64 bits

1149 shld edx, eax, 16 // 32.16	1326 shld edx, eax, 16 // 32.16

1150 shl eax, 16	1327 shl eax, 16

1151 idiv dword ptr [esp + 8]	1328 idiv dword ptr [esp + 8]

1152 ret	1329 ret

1153 }	1330 }

1154 }	1331 }

1155	1332

1156 // Divide num by div and return as 16.16 fixed point result.	1333 // Divide num by div and return as 16.16 fixed point result.

1157 __declspec(naked) __declspec(align(16))	1334 __declspec(naked)

1158 int FixedDiv1_X86(int num, int div) {	1335 int FixedDiv1_X86(int num, int div) {

1159 __asm {	1336 __asm {

1160 mov eax, [esp + 4] // num	1337 mov eax, [esp + 4] // num

1161 mov ecx, [esp + 8] // denom	1338 mov ecx, [esp + 8] // denom

1162 cdq // extend num to 64 bits	1339 cdq // extend num to 64 bits

1163 shld edx, eax, 16 // 32.16	1340 shld edx, eax, 16 // 32.16

1164 shl eax, 16	1341 shl eax, 16

1165 sub eax, 0x00010001	1342 sub eax, 0x00010001

1166 sbb edx, 0	1343 sbb edx, 0

1167 sub ecx, 1	1344 sub ecx, 1

1168 idiv ecx	1345 idiv ecx

1169 ret	1346 ret

1170 }	1347 }

1171 }	1348 }

1172	1349 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

1173 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

1174	1350

1175 #ifdef __cplusplus	1351 #ifdef __cplusplus

1176 } // extern "C"	1352 } // extern "C"

1177 } // namespace libyuv	1353 } // namespace libyuv

1178 #endif	1354 #endif

OLD	NEW

« no previous file with comments | « source/libvpx/third_party/libyuv/source/scale_posix.cc ('k') | source/libvpx/third_party/x86inc/README.libvpx » ('j') | no next file with comments »