Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

Issue 341293003: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/
Patch Set: Created 6 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11 %include "third_party/x86inc/x86inc.asm"
12
; Read-only constant data. SECTION_RODATA is not defined in this file;
; presumably it is a macro supplied by the included x86inc.asm - confirm.
13 SECTION_RODATA
; pw_1: eight 16-bit words, each equal to 1 (one 128-bit vector).
; QUANTIZE_FN subtracts it from the zbin vector (psubw m0, [pw_1]).
14 pw_1: times 8 dw 1
15
; Everything below is code.
16 SECTION .text
17
;-----------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
; Emits one function named quantize_<name_suffix> through cglobal. The second
; macro argument is forwarded to cglobal; the body uses r0..r6 directly.
;
; Named arguments (in cglobal order):
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
;   dequant, zbin_oq, eob, scan, iscan
;
; Behaviour visible in this macro:
;   * skip != 0 : qcoeff[] and dqcoeff[] are zero-filled and a 16-bit zero
;                 is stored to [eob].
;   * skip == 0 : coefficients are handled as 16-bit words, two XMM vectors
;                 (16 words) per iteration. A lane is kept only if
;                 abs(c) >= zbin + zbin_oq; otherwise it is written as zero.
;                 The running maximum of (iscan[i] + 1) over lanes whose
;                 dequantized value is non-zero is reduced horizontally and
;                 stored to [eob] as a 16-bit value.
;   * When the first macro argument is b_32x32, zbin and round are halved
;     with rounding, shift is doubled, the dequantized magnitude is halved,
;     and iterations with no lane above zbin take a zero-fill shortcut.
;
; Both loops advance by 16 words per pass and test the counter after the
; body, so ncoeff is presumably a positive multiple of 16 - confirm against
; the callers. All vector loads/stores use mova, so the buffers are assumed
; to be 16-byte aligned - confirm against the callers.
;
; Vector register roles in the quantize path:
;   m0 = zbin threshold (zbin + zbin_oq - 1)   m1 = round
;   m2 = quant                                 m3 = dequant
;   m4 = shift                                 m5 = all zero
;   m8 = running per-lane eob maximum
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; ---- setup: load pointers and the per-block quantizer vectors ----
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  movd        m4, dword zbin_oqm          ; m4 = zbin_oq (low lane)
  mova        m0, [zbinq]                 ; m0 = zbin
  punpcklwd   m4, m4
  mova        m1, [roundq]                ; m1 = round
  pshufd      m4, m4, 0                   ; broadcast zbin_oq to all 8 words
  mova        m2, [quantq]                ; m2 = quant
  paddw       m0, m4                      ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                      ; m5 = 1 in every word
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                       ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                       ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [r2q]                   ; m3 = dequant
  psubw       m0, [pw_1]                  ; zbin - 1: signed '>' below acts as '>= zbin'
  mov         r2, shiftmp
  mov         r3, qcoeffmp
  mova        m4, [r2]                    ; m4 = shift
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1
%endif
  pxor        m5, m5                      ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob

  ; Point every array at its end and count a negative index up towards zero,
  ; so a single add both advances the index and sets the loop-exit flags.
  lea         coeffq, [  coeffq+ncoeffq*2]
  lea         iscanq, [  iscanq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg         ncoeffq

  ; ---- first 16 coefficients: DC and first 15 AC ----
  ; The first vector uses the parameter vectors as loaded. After each first
  ; use, punpckhqdq copies the high quadword into both halves, and that form
  ; is used for the second vector and for every later iteration.
  mova        m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw       m6, m9                      ; m6 = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                  ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0
  pcmpgtw     m12, m11, m0                ; m12 = c[i] >= zbin
  paddsw      m6, m1                      ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m8, m6, m2                  ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
  paddw       m8, m6                      ; m8 += m6
  paddw       m13, m11                    ; m13 += m11
  pmulhw      m8, m4                      ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                     ; m13 = m13*qsh>>16
  psignw      m8, m9                      ; m8 = reinsert sign
  psignw      m13, m10                    ; m13 = reinsert sign
  pand        m8, m7                      ; zero lanes below zbin
  pand        m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                      ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                     ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1                       ; halve the magnitude, then restore sign
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m8, m5                      ; m8 = mask of lanes whose dqc[i] == 0
  pcmpeqw     m13, m5                     ; m13 = mask of lanes whose dqc[i] == 0
  mova        m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                      ; m6 = scan[i] + 1 (mask is -1 where kept)
  psubw       m11, m12                    ; m11 = scan[i] + 1
  pandn       m8, m6                      ; m8 = max(eob)
  pandn       m13, m11                    ; m13 = max(eob)
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jz          .accumulate_eob

  ; ---- remaining coefficients, 16 per iteration ----
.ac_only_loop:
  mova        m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw       m6, m9                      ; m6 = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                  ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0                ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; Shortcut: if no lane in either vector reaches zbin, just store zeros.
  pmovmskb    r6, m7
  pmovmskb    r2, m12
  or          r6, r2
  jz          .skip_iter
%endif
  paddsw      m6, m1                      ; m6 += round
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m14, m6, m2                 ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
  paddw       m14, m6                     ; m14 += m6
  paddw       m13, m11                    ; m13 += m11
  pmulhw      m14, m4                     ; m14 = m14*qsh>>16
  pmulhw      m13, m4                     ; m13 = m13*qsh>>16
  psignw      m14, m9                     ; m14 = reinsert sign
  psignw      m13, m10                    ; m13 = reinsert sign
  pand        m14, m7
  pand        m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                     ; dqc[i] = qc[i] * q
  pmullw      m13, m3                     ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                     ; m14 = mask of lanes whose dqc[i] == 0
  pcmpeqw     m13, m5                     ; m13 = mask of lanes whose dqc[i] == 0
  mova        m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                      ; m6 = scan[i] + 1
  psubw       m11, m12                    ; m11 = scan[i] + 1
  pandn       m14, m6                     ; m14 = max(eob)
  pandn       m13, m11                    ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

%ifidn %1, b_32x32
  jmp         .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova        [dqcoeffq+ncoeffq*2+ 0], m5
  mova        [dqcoeffq+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop
  ; falls through to .accumulate_eob once the index reaches zero
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe                 ; fold upper 4 words onto lower 4
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe                 ; fold words 2-3 onto words 0-1
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1                 ; fold word 1 onto word 0
  pmaxsw      m8, m7
  pextrw      r6, m8, 0                   ; zero-extended 16-bit maximum
  ; Store exactly one word, matching the word-sized store in .blank. A
  ; full-width GPR store here would write bytes beyond a 16-bit eob slot.
  mov         [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
  mova        [dqcoeffq+ncoeffq*2+ 0], m7
  mova        [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [eobq], 0
  RET
%endmacro
216
; INIT_XMM is not defined in this file; presumably an x86inc.asm macro that
; selects 128-bit XMM registers (m0..m15, mmsize) and the ssse3 suffix - confirm.
217 INIT_XMM ssse3
; Instantiate the macro twice. The first argument becomes the function name
; suffix (quantize_b, quantize_b_32x32) and, when it is b_32x32, enables the
; %ifidn-guarded code paths in the macro. The second argument (7) is passed
; to cglobal; the macro body uses r0..r6 directly.
218 QUANTIZE_FN b, 7
219 QUANTIZE_FN b_32x32, 7
OLDNEW
« no previous file with comments | « source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm ('k') | source/libvpx/vp9/encoder/x86/vp9_ssim_opt.asm » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698