source/libvpx/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm - Issue 958693004: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm

Issue 958693004: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master

Patch Set: Created 5 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 ;	1 ;

2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.	2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.

3 ;	3 ;

4 ; Use of this source code is governed by a BSD-style license	4 ; Use of this source code is governed by a BSD-style license

5 ; that can be found in the LICENSE file in the root of the source	5 ; that can be found in the LICENSE file in the root of the source

6 ; tree. An additional intellectual property rights grant can be found	6 ; tree. An additional intellectual property rights grant can be found

7 ; in the file PATENTS. All contributing project authors may	7 ; in the file PATENTS. All contributing project authors may

8 ; be found in the AUTHORS file in the root of the source tree.	8 ; be found in the AUTHORS file in the root of the source tree.

9 ;	9 ;

10	10

(...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
208 HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)	208 HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)

209 HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6	209 HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6

210 %endmacro	210 %endmacro

211	211

212 ; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,	212 ; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,

213 ; uint8_t *ref[4], int ref_stride,	213 ; uint8_t *ref[4], int ref_stride,

214 ; unsigned int res[4]);	214 ; unsigned int res[4]);

215 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8	215 ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8

216 %macro HIGH_SADNXN4D 2	216 %macro HIGH_SADNXN4D 2

217 %if UNIX64	217 %if UNIX64

218 cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \	218 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \

219 res, ref2, ref3, ref4, one	219 res, ref2, ref3, ref4

220 %else	220 %else

221 cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \	221 cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \

222 ref2, ref3, ref4, one	222 ref2, ref3, ref4

223 %endif	223 %endif

224	224

	225 ; set m1

	226 push srcq

	227 mov srcd, 0x00010001

	228 movd m1, srcd

	229 pshufd m1, m1, 0x0

	230 pop srcq

	231

225 movsxdifnidn src_strideq, src_strided	232 movsxdifnidn src_strideq, src_strided

226 movsxdifnidn ref_strideq, ref_strided	233 movsxdifnidn ref_strideq, ref_strided

227 mov ref2q, [ref1q+gprsize*1]	234 mov ref2q, [ref1q+gprsize*1]

228 mov ref3q, [ref1q+gprsize*2]	235 mov ref3q, [ref1q+gprsize*2]

229 mov ref4q, [ref1q+gprsize*3]	236 mov ref4q, [ref1q+gprsize*3]

230 mov ref1q, [ref1q+gprsize*0]	237 mov ref1q, [ref1q+gprsize*0]

231	238

232 ; convert byte pointers to short pointers	239 ; convert byte pointers to short pointers

233 shl srcq, 1	240 shl srcq, 1

234 shl ref2q, 1	241 shl ref2q, 1

235 shl ref3q, 1	242 shl ref3q, 1

236 shl ref4q, 1	243 shl ref4q, 1

237 shl ref1q, 1	244 shl ref1q, 1

238	245

239 mov oned, 0x00010001

240 movd m1, oned

241 pshufd m1, m1, 0x0

242

243 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1	246 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1

244 %rep (%2-4)/2	247 %rep (%2-4)/2

245 HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1	248 HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1

246 %endrep	249 %endrep

247 HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0	250 HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

248 ; N.B. HIGH_PROCESS outputs dwords (32 bits)	251 ; N.B. HIGH_PROCESS outputs dwords (32 bits)

249 ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM	252 ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM

250 movhlps m0, m4	253 movhlps m0, m4

251 movhlps m1, m5	254 movhlps m1, m5

252 movhlps m2, m6	255 movhlps m2, m6

(...skipping 22 matching lines...) Expand all Loading...
275 HIGH_SADNXN4D 32, 32	278 HIGH_SADNXN4D 32, 32

276 HIGH_SADNXN4D 32, 16	279 HIGH_SADNXN4D 32, 16

277 HIGH_SADNXN4D 16, 32	280 HIGH_SADNXN4D 16, 32

278 HIGH_SADNXN4D 16, 16	281 HIGH_SADNXN4D 16, 16

279 HIGH_SADNXN4D 16, 8	282 HIGH_SADNXN4D 16, 8

280 HIGH_SADNXN4D 8, 16	283 HIGH_SADNXN4D 8, 16

281 HIGH_SADNXN4D 8, 8	284 HIGH_SADNXN4D 8, 8

282 HIGH_SADNXN4D 8, 4	285 HIGH_SADNXN4D 8, 4

283 HIGH_SADNXN4D 4, 8	286 HIGH_SADNXN4D 4, 8

284 HIGH_SADNXN4D 4, 4	287 HIGH_SADNXN4D 4, 4

OLD	NEW

« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c ('k') | source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm » ('j') | no next file with comments »