OLD | NEW |
(Empty) | |
| 1 ; vim:filetype=nasm ts=8 |
| 2 |
| 3 ; libFLAC - Free Lossless Audio Codec library |
| 4 ; Copyright (C) 2001-2009 Josh Coalson |
| 5 ; Copyright (C) 2011-2014 Xiph.Org Foundation |
| 6 ; |
| 7 ; Redistribution and use in source and binary forms, with or without |
| 8 ; modification, are permitted provided that the following conditions |
| 9 ; are met: |
| 10 ; |
| 11 ; - Redistributions of source code must retain the above copyright |
| 12 ; notice, this list of conditions and the following disclaimer. |
| 13 ; |
| 14 ; - Redistributions in binary form must reproduce the above copyright |
| 15 ; notice, this list of conditions and the following disclaimer in the |
| 16 ; documentation and/or other materials provided with the distribution. |
| 17 ; |
| 18 ; - Neither the name of the Xiph.org Foundation nor the names of its |
| 19 ; contributors may be used to endorse or promote products derived from |
| 20 ; this software without specific prior written permission. |
| 21 ; |
| 22 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 23 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 24 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 25 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 26 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 27 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 28 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 29 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 30 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 31 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 32 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 33 |
| 34 %include "nasm.h" |
| 35 |
| 36 data_section |
| 37 |
| 38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 |
| 39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 |
| 40 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 |
| 41 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 |
| 42 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 |
| 43 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 |
| 44 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx |
| 45 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 |
| 46 cglobal FLAC__lpc_restore_signal_asm_ia32 |
| 47 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx |
| 48 cglobal FLAC__lpc_restore_signal_wide_asm_ia32 |
| 49 |
| 50 code_section |
| 51 |
| 52 ; ********************************************************************** |
| 53 ; |
| 54 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) |
| 55 ; { |
| 56 ; FLAC__real d; |
| 57 ; unsigned sample, coeff; |
| 58 ; const unsigned limit = data_len - lag; |
| 59 ; |
| 60 ; FLAC__ASSERT(lag > 0); |
| 61 ; FLAC__ASSERT(lag <= data_len); |
| 62 ; |
| 63 ; for(coeff = 0; coeff < lag; coeff++) |
| 64 ; autoc[coeff] = 0.0; |
| 65 ; for(sample = 0; sample <= limit; sample++) { |
| 66 ; d = data[sample]; |
| 67 ; for(coeff = 0; coeff < lag; coeff++) |
| 68 ; autoc[coeff] += d * data[sample+coeff]; |
| 69 ; } |
| 70 ; for(; sample < data_len; sample++) { |
| 71 ; d = data[sample]; |
| 72 ; for(coeff = 0; coeff < data_len - sample; coeff++) |
| 73 ; autoc[coeff] += d * data[sample+coeff]; |
| 74 ; } |
| 75 ; } |
| 76 ; |
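| ; A small worked example of the reference code above (illustrative values only, not from the codebase): |
| ; with data[] = {1, 2, 3}, data_len = 3 and lag = 2, the limit is 1, so: |
| ;   sample 0: autoc[0] += 1*1, autoc[1] += 1*2 |
| ;   sample 1: autoc[0] += 2*2, autoc[1] += 2*3 |
| ;   sample 2 (tail loop, only coeff 0 in range): autoc[0] += 3*3 |
| ; giving autoc = {14, 8}. |
| ; |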
| 77 ALIGN 16 |
| 78 cident FLAC__lpc_compute_autocorrelation_asm_ia32 |
| 79 ;[esp + 28] == autoc[] |
| 80 ;[esp + 24] == lag |
| 81 ;[esp + 20] == data_len |
| 82 ;[esp + 16] == data[] |
| 83 |
| 84 ;ASSERT(lag > 0) |
| 85 ;ASSERT(lag <= 33) |
| 86 ;ASSERT(lag <= data_len) |
| 87 |
| 88 .begin: |
| 89 push esi |
| 90 push edi |
| 91 push ebx |
| 92 |
| 93 ; for(coeff = 0; coeff < lag; coeff++) |
| 94 ; autoc[coeff] = 0.0; |
| 95 mov edi, [esp + 28] ; edi == autoc |
| 96 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write |
| 97 xor eax, eax |
| 98 rep stosd |
| 99 |
| 100 ; const unsigned limit = data_len - lag; |
| 101 mov eax, [esp + 24] ; eax == lag |
| 102 mov ecx, [esp + 20] |
| 103 sub ecx, eax ; ecx == limit |
| 104 |
| 105 mov edi, [esp + 28] ; edi == autoc |
| 106 mov esi, [esp + 16] ; esi == data |
| 107 inc ecx ; we are looping <= limit so we add one to the counter |
| 108 |
| 109 ; for(sample = 0; sample <= limit; sample++) { |
| 110 ; d = data[sample]; |
| 111 ; for(coeff = 0; coeff < lag; coeff++) |
| 112 ; autoc[coeff] += d * data[sample+coeff]; |
| 113 ; } |
| 114 fld dword [esi] ; ST = d <- data[sample] |
| 115 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) |
| 116 lea edx, [eax + eax*2] |
| 117 neg edx |
| 118 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] |
| 119 call .mov_eip_to_ebx |
| 120 .get_eip1: |
| 121 add edx, ebx |
| 122 inc edx ; compensate for the shorter opcode on the last iteration |
| 123 inc edx ; compensate for the shorter opcode on the last iteration |
| 124 inc edx ; compensate for the shorter opcode on the last iteration |
| 125 cmp eax, 33 |
| 126 jne .loop1_start |
| 127 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration |
| 128 .loop1_start: |
| 129 jmp edx |
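| ; (Why (-12*eax + eax) works: each unrolled coeff step below is 11 bytes of code |
| ;  -- fld st0 = 2 bytes, fmul/fadd/fstp m32 with a disp8 = 3 bytes each -- so |
| ;  jumping to .jumper1_0 - 11*lag starts exactly lag steps before the end. |
| ;  The two lea's compute -11*eax as eax + 4*(-(3*eax)); the three 'inc edx' above |
| ;  account for the last step having no displacement bytes (8 bytes instead of 11), |
| ;  and 'sub edx, byte 9' accounts for lag == 33, whose first step needs 32-bit |
| ;  displacements, 3 extra bytes per memory operand.) |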
| 130 |
| 131 .mov_eip_to_ebx: |
| 132 mov ebx, [esp] |
| 133 ret |
| 134 |
| 135 fld st0 ; ST = d d |
| 136 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here! |
| 137 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here! |
| 138 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here! |
| 139 fld st0 ; ST = d d |
| 140 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d |
| 141 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+3
1] d |
| 142 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31]
ST = d |
| 143 fld st0 ; ST = d d |
| 144 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d |
| 145 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+3
0] d |
| 146 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30]
ST = d |
| 147 fld st0 ; ST = d d |
| 148 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d |
| 149 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+2
9] d |
| 150 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29]
ST = d |
| 151 fld st0 ; ST = d d |
| 152 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d |
| 153 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+2
8] d |
| 154 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28]
ST = d |
| 155 fld st0 ; ST = d d |
| 156 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d |
| 157 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+2
7] d |
| 158 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27]
ST = d |
| 159 fld st0 ; ST = d d |
| 160 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d |
| 161 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+2
6] d |
| 162 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26]
ST = d |
| 163 fld st0 ; ST = d d |
| 164 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d |
| 165 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+2
5] d |
| 166 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25]
ST = d |
| 167 fld st0 ; ST = d d |
| 168 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d |
| 169 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+2
4] d |
| 170 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24]
ST = d |
| 171 fld st0 ; ST = d d |
| 172 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d |
| 173 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+2
3] d |
| 174 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23]
ST = d |
| 175 fld st0 ; ST = d d |
| 176 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d |
| 177 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+2
2] d |
| 178 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22]
ST = d |
| 179 fld st0 ; ST = d d |
| 180 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d |
| 181 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+2
1] d |
| 182 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21]
ST = d |
| 183 fld st0 ; ST = d d |
| 184 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d |
| 185 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+2
0] d |
| 186 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20]
ST = d |
| 187 fld st0 ; ST = d d |
| 188 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d |
| 189 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+1
9] d |
| 190 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19]
ST = d |
| 191 fld st0 ; ST = d d |
| 192 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d |
| 193 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+1
8] d |
| 194 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18]
ST = d |
| 195 fld st0 ; ST = d d |
| 196 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d |
| 197 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+1
7] d |
| 198 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17]
ST = d |
| 199 fld st0 ; ST = d d |
| 200 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d |
| 201 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+1
6] d |
| 202 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16]
ST = d |
| 203 fld st0 ; ST = d d |
| 204 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d |
| 205 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+1
5] d |
| 206 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15]
ST = d |
| 207 fld st0 ; ST = d d |
| 208 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d |
| 209 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+1
4] d |
| 210 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14]
ST = d |
| 211 fld st0 ; ST = d d |
| 212 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d |
| 213 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+1
3] d |
| 214 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13]
ST = d |
| 215 fld st0 ; ST = d d |
| 216 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d |
| 217 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+1
2] d |
| 218 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12]
ST = d |
| 219 fld st0 ; ST = d d |
| 220 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d |
| 221 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+1
1] d |
| 222 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11]
ST = d |
| 223 fld st0 ; ST = d d |
| 224 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d |
| 225 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+1
0] d |
| 226 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10]
ST = d |
| 227 fld st0 ; ST = d d |
| 228 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d |
| 229 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9]
d |
| 230 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST
= d |
| 231 fld st0 ; ST = d d |
| 232 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d |
| 233 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8]
d |
| 234 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST
= d |
| 235 fld st0 ; ST = d d |
| 236 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d |
| 237 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7]
d |
| 238 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST
= d |
| 239 fld st0 ; ST = d d |
| 240 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d |
| 241 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6]
d |
| 242 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST
= d |
| 243 fld st0 ; ST = d d |
| 244 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d |
| 245 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d |
| 246 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d |
| 247 fld st0 ; ST = d d |
| 248 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d |
| 249 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4]
d |
| 250 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST
= d |
| 251 fld st0 ; ST = d d |
| 252 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d |
| 253 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3]
d |
| 254 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST
= d |
| 255 fld st0 ; ST = d d |
| 256 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d |
| 257 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2]
d |
| 258 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST
= d |
| 259 fld st0 ; ST = d d |
| 260 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d |
| 261 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1]
d |
| 262 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST
= d |
| 263 fld st0 ; ST = d d |
| 264 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! |
| 265 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! |
| 266 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! |
| 267 .jumper1_0: |
| 268 |
| 269 fstp st0 ; pop d, ST = empty |
| 270 add esi, byte 4 ; sample++ |
| 271 dec ecx |
| 272 jz .loop1_end |
| 273 fld dword [esi] ; ST = d <- data[sample] |
| 274 jmp edx |
| 275 .loop1_end: |
| 276 |
| 277 ; for(; sample < data_len; sample++) { |
| 278 ; d = data[sample]; |
| 279 ; for(coeff = 0; coeff < data_len - sample; coeff++) |
| 280 ; autoc[coeff] += d * data[sample+coeff]; |
| 281 ; } |
| 282 mov ecx, [esp + 24] ; ecx <- lag |
| 283 dec ecx ; ecx <- lag - 1 |
| 284 jz near .end ; skip loop if 0 (i.e. lag == 1) |
| 285 |
| 286 fld dword [esi] ; ST = d <- data[sample] |
| 287 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through |
| 288 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) |
| 289 lea edx, [eax + eax*2] |
| 290 neg edx |
| 291 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] |
| 292 call .mov_eip_to_ebx |
| 293 .get_eip2: |
| 294 add edx, ebx |
| 295 inc edx ; compensate for the shorter opcode on the last iteration |
| 296 inc edx ; compensate for the shorter opcode on the last iteration |
| 297 inc edx ; compensate for the shorter opcode on the last iteration |
| 298 jmp edx |
| 299 |
| 300 fld st0 ; ST = d d |
| 301 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d |
| 302 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+3
1] d |
| 303 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31]
ST = d |
| 304 fld st0 ; ST = d d |
| 305 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d |
| 306 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+3
0] d |
| 307 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30]
ST = d |
| 308 fld st0 ; ST = d d |
| 309 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d |
| 310 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+2
9] d |
| 311 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29]
ST = d |
| 312 fld st0 ; ST = d d |
| 313 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d |
| 314 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+2
8] d |
| 315 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28]
ST = d |
| 316 fld st0 ; ST = d d |
| 317 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d |
| 318 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+2
7] d |
| 319 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27]
ST = d |
| 320 fld st0 ; ST = d d |
| 321 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d |
| 322 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+2
6] d |
| 323 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26]
ST = d |
| 324 fld st0 ; ST = d d |
| 325 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d |
| 326 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+2
5] d |
| 327 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25]
ST = d |
| 328 fld st0 ; ST = d d |
| 329 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d |
| 330 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+2
4] d |
| 331 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24]
ST = d |
| 332 fld st0 ; ST = d d |
| 333 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d |
| 334 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+2
3] d |
| 335 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23]
ST = d |
| 336 fld st0 ; ST = d d |
| 337 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d |
| 338 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+2
2] d |
| 339 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22]
ST = d |
| 340 fld st0 ; ST = d d |
| 341 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d |
| 342 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+2
1] d |
| 343 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21]
ST = d |
| 344 fld st0 ; ST = d d |
| 345 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d |
| 346 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+2
0] d |
| 347 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20]
ST = d |
| 348 fld st0 ; ST = d d |
| 349 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d |
| 350 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+1
9] d |
| 351 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19]
ST = d |
| 352 fld st0 ; ST = d d |
| 353 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d |
| 354 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+1
8] d |
| 355 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18]
ST = d |
| 356 fld st0 ; ST = d d |
| 357 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d |
| 358 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+1
7] d |
| 359 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17]
ST = d |
| 360 fld st0 ; ST = d d |
| 361 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d |
| 362 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+1
6] d |
| 363 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16]
ST = d |
| 364 fld st0 ; ST = d d |
| 365 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d |
| 366 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+1
5] d |
| 367 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15]
ST = d |
| 368 fld st0 ; ST = d d |
| 369 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d |
| 370 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+1
4] d |
| 371 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14]
ST = d |
| 372 fld st0 ; ST = d d |
| 373 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d |
| 374 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+1
3] d |
| 375 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13]
ST = d |
| 376 fld st0 ; ST = d d |
| 377 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d |
| 378 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+1
2] d |
| 379 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12]
ST = d |
| 380 fld st0 ; ST = d d |
| 381 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d |
| 382 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+1
1] d |
| 383 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11]
ST = d |
| 384 fld st0 ; ST = d d |
| 385 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d |
| 386 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+1
0] d |
| 387 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10]
ST = d |
| 388 fld st0 ; ST = d d |
| 389 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d |
| 390 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9]
d |
| 391 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST
= d |
| 392 fld st0 ; ST = d d |
| 393 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d |
| 394 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8]
d |
| 395 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST
= d |
| 396 fld st0 ; ST = d d |
| 397 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d |
| 398 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7]
d |
| 399 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST
= d |
| 400 fld st0 ; ST = d d |
| 401 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d |
| 402 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6]
d |
| 403 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST
= d |
| 404 fld st0 ; ST = d d |
| 405 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d |
| 406 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d |
| 407 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d |
| 408 fld st0 ; ST = d d |
| 409 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d |
| 410 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4]
d |
| 411 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST
= d |
| 412 fld st0 ; ST = d d |
| 413 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d |
| 414 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3]
d |
| 415 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST
= d |
| 416 fld st0 ; ST = d d |
| 417 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d |
| 418 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2]
d |
| 419 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST
= d |
| 420 fld st0 ; ST = d d |
| 421 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d |
| 422 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1]
d |
| 423 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST
= d |
| 424 fld st0 ; ST = d d |
| 425 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! |
| 426 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! |
| 427 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! |
| 428 .jumper2_0: |
| 429 |
| 430 fstp st0 ; pop d, ST = empty |
| 431 add esi, byte 4 ; sample++ |
| 432 dec ecx |
| 433 jz .loop2_end |
| 434 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target |
| 435 fld dword [esi] ; ST = d <- data[sample] |
| 436 jmp edx |
| 437 .loop2_end: |
| 438 |
| 439 .end: |
| 440 pop ebx |
| 441 pop edi |
| 442 pop esi |
| 443 ret |
| 444 |
| 445 ALIGN 16 |
| 446 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 |
| 447 ;[esp + 16] == autoc[] |
| 448 ;[esp + 12] == lag |
| 449 ;[esp + 8] == data_len |
| 450 ;[esp + 4] == data[] |
| 451 |
| 452 ;ASSERT(lag > 0) |
| 453 ;ASSERT(lag <= 4) |
| 454 ;ASSERT(lag <= data_len) |
| 455 |
| 456 ; for(coeff = 0; coeff < lag; coeff++) |
| 457 ; autoc[coeff] = 0.0; |
| 458 xorps xmm5, xmm5 |
| 459 |
| 460 mov edx, [esp + 8] ; edx == data_len |
| 461 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] |
| 462 |
| 463 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] |
| 464 add eax, 4 |
| 465 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] |
| 466 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] |
| 467 .warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample] |
| 468 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 |
| 469 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 |
| 470 dec edx |
| 471 jz .loop_end |
| 472 ALIGN 16 |
| 473 .loop_start: |
| 474 ; start by reading the next sample |
| 475 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] |
| 476 add eax, 4 |
| 477 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] |
| 478 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float |
| 479 movss xmm2, xmm0 |
| 480 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 |
| 481 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 |
| 482 dec edx |
| 483 jnz .loop_start |
| 484 .loop_end: |
| 485 ; store autoc |
| 486 mov edx, [esp + 16] ; edx == autoc |
| 487 movups [edx], xmm5 |
| 488 |
| 489 .end: |
| 490 ret |
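| ; For reference, a rough C translation of the lag-4 kernel above, written with |
| ; SSE intrinsics (a sketch only, not part of libFLAC; assumes <xmmintrin.h> and |
| ; data_len >= 1, as the asm does): |
| ; |
| ;   #include <xmmintrin.h> |
| ;   static void autoc_lag4_sketch(const float *data, unsigned data_len, float *autoc) |
| ;   { |
| ;       __m128 acc  = _mm_setzero_ps();               /* autoc[0..3] accumulators */ |
| ;       __m128 hist = _mm_set_ss(data[0]);            /* lane k holds data[n-k]   */ |
| ;       __m128 d    = _mm_set1_ps(data[0]); |
| ;       acc = _mm_add_ps(acc, _mm_mul_ps(d, hist));   /* warmup iteration         */ |
| ;       for (unsigned n = 1; n < data_len; n++) { |
| ;           d    = _mm_set1_ps(data[n]); |
| ;           hist = _mm_shuffle_ps(hist, hist, 0x93);  /* rotate left by one float */ |
| ;           hist = _mm_move_ss(hist, d);              /* insert the new sample    */ |
| ;           acc  = _mm_add_ps(acc, _mm_mul_ps(d, hist)); |
| ;       } |
| ;       _mm_storeu_ps(autoc, acc);                    /* autoc[k] = sum of data[n]*data[n-k] */ |
| ;   } |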
| 491 |
| 492 ALIGN 16 |
| 493 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 |
| 494 ;[esp + 16] == autoc[] |
| 495 ;[esp + 12] == lag |
| 496 ;[esp + 8] == data_len |
| 497 ;[esp + 4] == data[] |
| 498 |
| 499 ;ASSERT(lag > 0) |
| 500 ;ASSERT(lag <= 8) |
| 501 ;ASSERT(lag <= data_len) |
| 502 |
| 503 ; for(coeff = 0; coeff < lag; coeff++) |
| 504 ; autoc[coeff] = 0.0; |
| 505 xorps xmm5, xmm5 |
| 506 xorps xmm6, xmm6 |
| 507 |
| 508 mov edx, [esp + 8] ; edx == data_len |
| 509 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] |
| 510 |
| 511 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] |
| 512 add eax, 4 |
| 513 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] |
| 514 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] |
| 515 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] |
| 516 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 |
| 517 .warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] |
| 518 mulps xmm0, xmm2 |
| 519 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 |
| 520 addps xmm5, xmm0 |
| 521 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 |
| 522 dec edx |
| 523 jz .loop_end |
| 524 ALIGN 16 |
| 525 .loop_start: |
| 526 ; start by reading the next sample |
| 527 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] |
| 528 ; here we reorder the instructions; see the (#) indexes for a logical order |
| 529 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float |
| 530 add eax, 4 ; (0) |
| 531 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float |
| 532 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample] |
| 533 movss xmm3, xmm2 ; (5) |
| 534 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample] |
| 535 movss xmm2, xmm0 ; (6) |
| 536 mulps xmm1, xmm3 ; (8) |
| 537 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 |
| 538 addps xmm6, xmm1 ; (10) |
| 539 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 |
| 540 dec edx |
| 541 jnz .loop_start |
| 542 .loop_end: |
| 543 ; store autoc |
| 544 mov edx, [esp + 16] ; edx == autoc |
| 545 movups [edx], xmm5 |
| 546 movups [edx + 16], xmm6 |
| 547 |
| 548 .end: |
| 549 ret |
| 550 |
| 551 ALIGN 16 |
| 552 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 |
| 553 ;[esp + 16] == autoc[] |
| 554 ;[esp + 12] == lag |
| 555 ;[esp + 8] == data_len |
| 556 ;[esp + 4] == data[] |
| 557 |
| 558 ;ASSERT(lag > 0) |
| 559 ;ASSERT(lag <= 12) |
| 560 ;ASSERT(lag <= data_len) |
| 561 |
| 562 ; for(coeff = 0; coeff < lag; coeff++) |
| 563 ; autoc[coeff] = 0.0; |
| 564 xorps xmm5, xmm5 |
| 565 xorps xmm6, xmm6 |
| 566 xorps xmm7, xmm7 |
| 567 |
| 568 mov edx, [esp + 8] ; edx == data_len |
| 569 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] |
| 570 |
| 571 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] |
| 572 add eax, 4 |
| 573 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] |
| 574 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] |
| 575 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 |
| 576 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 |
| 577 .warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample] |
| 578 movaps xmm1, xmm0 |
| 579 mulps xmm1, xmm2 |
| 580 addps xmm5, xmm1 |
| 581 movaps xmm1, xmm0 |
| 582 mulps xmm1, xmm3 |
| 583 addps xmm6, xmm1 |
| 584 mulps xmm0, xmm4 |
| 585 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 |
| 586 dec edx |
| 587 jz .loop_end |
| 588 ALIGN 16 |
| 589 .loop_start: |
| 590 ; start by reading the next sample |
| 591 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] |
| 592 add eax, 4 |
| 593 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] |
| 594 |
| 595 ; shift xmm4:xmm3:xmm2 left by one float |
| 596 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float |
| 597 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float |
| 598 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float |
| 599 movss xmm4, xmm3 |
| 600 movss xmm3, xmm2 |
| 601 movss xmm2, xmm0 |
| 602 |
| 603 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2 |
| 604 movaps xmm1, xmm0 |
| 605 mulps xmm1, xmm2 |
| 606 addps xmm5, xmm1 |
| 607 movaps xmm1, xmm0 |
| 608 mulps xmm1, xmm3 |
| 609 addps xmm6, xmm1 |
| 610 mulps xmm0, xmm4 |
| 611 addps xmm7, xmm0 |
| 612 |
| 613 dec edx |
| 614 jnz .loop_start |
| 615 .loop_end: |
| 616 ; store autoc |
| 617 mov edx, [esp + 16] ; edx == autoc |
| 618 movups [edx], xmm5 |
| 619 movups [edx + 16], xmm6 |
| 620 movups [edx + 32], xmm7 |
| 621 |
| 622 .end: |
| 623 ret |
| 624 |
| 625 ALIGN 16 |
| 626 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16 |
| 627 ;[ebp + 20] == autoc[] |
| 628 ;[ebp + 16] == lag |
| 629 ;[ebp + 12] == data_len |
| 630 ;[ebp + 8] == data[] |
| 631 ;[esp] == __m128 |
| 632 ;[esp + 16] == __m128 |
| 633 |
| 634 push ebp |
| 635 mov ebp, esp |
| 636 and esp, -16 ; stack realign for SSE instructions 'movaps' and 'addps' |
| 637 sub esp, 32 |
| 638 |
| 639 ;ASSERT(lag > 0) |
| 640 ;ASSERT(lag <= 16) |
| 641 ;ASSERT(lag <= data_len) |
| 642 ;ASSERT(data_len > 0) |
| 643 |
| 644 ; for(coeff = 0; coeff < lag; coeff++) |
| 645 ; autoc[coeff] = 0.0; |
| 646 xorps xmm5, xmm5 |
| 647 xorps xmm6, xmm6 |
| 648 movaps [esp], xmm5 |
| 649 movaps [esp + 16], xmm6 |
| 650 |
| 651 mov edx, [ebp + 12] ; edx == data_len |
| 652 mov eax, [ebp + 8] ; eax == &data[sample] <- &data[0] |
| 653 |
| 654 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] |
| 655 add eax, 4 |
| 656 movaps xmm1, xmm0 ; xmm1 = 0,0,0,data[0] |
| 657 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] |
| 658 xorps xmm2, xmm2 ; xmm2 = 0,0,0,0 |
| 659 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 |
| 660 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0 |
| 661 movaps xmm7, xmm0 |
| 662 mulps xmm7, xmm1 |
| 663 addps xmm5, xmm7 |
| 664 dec edx |
| 665 jz .loop_end |
| 666 ALIGN 16 |
| 667 .loop_start: |
| 668 ; start by reading the next sample |
| 669 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] |
| 670 add eax, 4 |
| 671 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] |
| 672 |
| 673 ; shift xmm4:xmm3:xmm2:xmm1 left by one float |
| 674 shufps xmm1, xmm1, 93h |
| 675 shufps xmm2, xmm2, 93h |
| 676 shufps xmm3, xmm3, 93h |
| 677 shufps xmm4, xmm4, 93h |
| 678 movss xmm4, xmm3 |
| 679 movss xmm3, xmm2 |
| 680 movss xmm2, xmm1 |
| 681 movss xmm1, xmm0 |
| 682 |
| 683 ; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1 |
| 684 movaps xmm7, xmm0 |
| 685 mulps xmm7, xmm1 |
| 686 addps xmm5, xmm7 |
| 687 movaps xmm7, xmm0 |
| 688 mulps xmm7, xmm2 |
| 689 addps xmm6, xmm7 |
| 690 movaps xmm7, xmm0 |
| 691 mulps xmm7, xmm3 |
| 692 mulps xmm0, xmm4 |
| 693 addps xmm7, [esp] |
| 694 addps xmm0, [esp + 16] |
| 695 movaps [esp], xmm7 |
| 696 movaps [esp + 16], xmm0 |
| 697 |
| 698 dec edx |
| 699 jnz .loop_start |
| 700 .loop_end: |
| 701 ; store autoc |
| 702 mov edx, [ebp + 20] ; edx == autoc |
| 703 movups [edx], xmm5 |
| 704 movups [edx + 16], xmm6 |
| 705 movaps xmm5, [esp] |
| 706 movaps xmm6, [esp + 16] |
| 707 movups [edx + 32], xmm5 |
| 708 movups [edx + 48], xmm6 |
| 709 .end: |
| 710 mov esp, ebp |
| 711 pop ebp |
| 712 ret |
| 713 |
| 714 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
| 715 ; |
| 716 ; for(i = 0; i < data_len; i++) { |
| 717 ; sum = 0; |
| 718 ; for(j = 0; j < order; j++) |
| 719 ; sum += qlp_coeff[j] * data[i-j-1]; |
| 720 ; residual[i] = data[i] - (sum >> lp_quantization); |
| 721 ; } |
| 722 ; |
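| ; A small worked example (illustrative numbers only, not from the codebase): |
| ; with order = 2, lp_quantization = 1, qlp_coeff = {3, -1}, warm-up history |
| ; data[-2] = 2, data[-1] = 4 and data[0] = 5: |
| ;   sum         = 3*data[-1] + (-1)*data[-2] = 12 - 2 = 10 |
| ;   residual[0] = data[0] - (sum >> 1)       = 5 - 5 = 0 |
| ; |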
| 723 ALIGN 16 |
| 724 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 |
| 725 ;[esp + 40] residual[] |
| 726 ;[esp + 36] lp_quantization |
| 727 ;[esp + 32] order |
| 728 ;[esp + 28] qlp_coeff[] |
| 729 ;[esp + 24] data_len |
| 730 ;[esp + 20] data[] |
| 731 |
| 732 ;ASSERT(order > 0) |
| 733 |
| 734 push ebp |
| 735 push ebx |
| 736 push esi |
| 737 push edi |
| 738 |
| 739 mov esi, [esp + 20] ; esi = data[] |
| 740 mov edi, [esp + 40] ; edi = residual[] |
| 741 mov eax, [esp + 32] ; eax = order |
| 742 mov ebx, [esp + 24] ; ebx = data_len |
| 743 |
| 744 test ebx, ebx |
| 745 jz near .end ; do nothing if data_len == 0 |
| 746 .begin: |
| 747 cmp eax, byte 1 |
| 748 jg short .i_1more |
| 749 |
| 750 mov ecx, [esp + 28] |
| 751 mov edx, [ecx] ; edx = qlp_coeff[0] |
| 752 mov eax, [esi - 4] ; eax = data[-1] |
| 753 mov ecx, [esp + 36] ; cl = lp_quantization |
| 754 ALIGN 16 |
| 755 .i_1_loop_i: |
| 756 imul eax, edx |
| 757 sar eax, cl |
| 758 neg eax |
| 759 add eax, [esi] |
| 760 mov [edi], eax |
| 761 mov eax, [esi] |
| 762 add edi, byte 4 |
| 763 add esi, byte 4 |
| 764 dec ebx |
| 765 jnz .i_1_loop_i |
| 766 |
| 767 jmp .end |
| 768 |
| 769 .i_1more: |
| 770 cmp eax, byte 32 ; for order <= 32 there is a faster routine |
| 771 jbe short .i_32 |
| 772 |
| 773 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 |
| 774 ALIGN 16 |
| 775 .i_32more_loop_i: |
| 776 xor ebp, ebp |
| 777 mov ecx, [esp + 32] |
| 778 mov edx, ecx |
| 779 shl edx, 2 |
| 780 add edx, [esp + 28] |
| 781 neg ecx |
| 782 ALIGN 16 |
| 783 .i_32more_loop_j: |
| 784 sub edx, byte 4 |
| 785 mov eax, [edx] |
| 786 imul eax, [esi + 4 * ecx] |
| 787 add ebp, eax |
| 788 inc ecx |
| 789 jnz short .i_32more_loop_j |
| 790 |
| 791 mov ecx, [esp + 36] |
| 792 sar ebp, cl |
| 793 neg ebp |
| 794 add ebp, [esi] |
| 795 mov [edi], ebp |
| 796 add esi, byte 4 |
| 797 add edi, byte 4 |
| 798 |
| 799 dec ebx |
| 800 jnz .i_32more_loop_i |
| 801 |
| 802 jmp .end |
| 803 |
| 804 .mov_eip_to_eax: |
| 805 mov eax, [esp] |
| 806 ret |
| 807 |
| 808 .i_32: |
| 809 sub edi, esi |
| 810 neg eax |
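| ; each unrolled step below is 9 bytes (mov r32,m32 = 3, imul r32,m32 = 4, |
| ; add r32,r32 = 2, all with a disp8), so with eax = -order the lea lands the |
| ; jump 'order' steps before .jumper_0; the 'inc edx' accounts for the last |
| ; step's mov having no displacement byte |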
| 811 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] |
| 812 call .mov_eip_to_eax |
| 813 .get_eip0: |
| 814 add edx, eax |
| 815 inc edx |
| 816 mov eax, [esp + 28] ; eax = qlp_coeff[] |
| 817 xor ebp, ebp |
| 818 jmp edx |
| 819 |
| 820 mov ecx, [eax + 124] |
| 821 imul ecx, [esi - 128] |
| 822 add ebp, ecx |
| 823 mov ecx, [eax + 120] |
| 824 imul ecx, [esi - 124] |
| 825 add ebp, ecx |
| 826 mov ecx, [eax + 116] |
| 827 imul ecx, [esi - 120] |
| 828 add ebp, ecx |
| 829 mov ecx, [eax + 112] |
| 830 imul ecx, [esi - 116] |
| 831 add ebp, ecx |
| 832 mov ecx, [eax + 108] |
| 833 imul ecx, [esi - 112] |
| 834 add ebp, ecx |
| 835 mov ecx, [eax + 104] |
| 836 imul ecx, [esi - 108] |
| 837 add ebp, ecx |
| 838 mov ecx, [eax + 100] |
| 839 imul ecx, [esi - 104] |
| 840 add ebp, ecx |
| 841 mov ecx, [eax + 96] |
| 842 imul ecx, [esi - 100] |
| 843 add ebp, ecx |
| 844 mov ecx, [eax + 92] |
| 845 imul ecx, [esi - 96] |
| 846 add ebp, ecx |
| 847 mov ecx, [eax + 88] |
| 848 imul ecx, [esi - 92] |
| 849 add ebp, ecx |
| 850 mov ecx, [eax + 84] |
| 851 imul ecx, [esi - 88] |
| 852 add ebp, ecx |
| 853 mov ecx, [eax + 80] |
| 854 imul ecx, [esi - 84] |
| 855 add ebp, ecx |
| 856 mov ecx, [eax + 76] |
| 857 imul ecx, [esi - 80] |
| 858 add ebp, ecx |
| 859 mov ecx, [eax + 72] |
| 860 imul ecx, [esi - 76] |
| 861 add ebp, ecx |
| 862 mov ecx, [eax + 68] |
| 863 imul ecx, [esi - 72] |
| 864 add ebp, ecx |
| 865 mov ecx, [eax + 64] |
| 866 imul ecx, [esi - 68] |
| 867 add ebp, ecx |
| 868 mov ecx, [eax + 60] |
| 869 imul ecx, [esi - 64] |
| 870 add ebp, ecx |
| 871 mov ecx, [eax + 56] |
| 872 imul ecx, [esi - 60] |
| 873 add ebp, ecx |
| 874 mov ecx, [eax + 52] |
| 875 imul ecx, [esi - 56] |
| 876 add ebp, ecx |
| 877 mov ecx, [eax + 48] |
| 878 imul ecx, [esi - 52] |
| 879 add ebp, ecx |
| 880 mov ecx, [eax + 44] |
| 881 imul ecx, [esi - 48] |
| 882 add ebp, ecx |
| 883 mov ecx, [eax + 40] |
| 884 imul ecx, [esi - 44] |
| 885 add ebp, ecx |
| 886 mov ecx, [eax + 36] |
| 887 imul ecx, [esi - 40] |
| 888 add ebp, ecx |
| 889 mov ecx, [eax + 32] |
| 890 imul ecx, [esi - 36] |
| 891 add ebp, ecx |
| 892 mov ecx, [eax + 28] |
| 893 imul ecx, [esi - 32] |
| 894 add ebp, ecx |
| 895 mov ecx, [eax + 24] |
| 896 imul ecx, [esi - 28] |
| 897 add ebp, ecx |
| 898 mov ecx, [eax + 20] |
| 899 imul ecx, [esi - 24] |
| 900 add ebp, ecx |
| 901 mov ecx, [eax + 16] |
| 902 imul ecx, [esi - 20] |
| 903 add ebp, ecx |
| 904 mov ecx, [eax + 12] |
| 905 imul ecx, [esi - 16] |
| 906 add ebp, ecx |
| 907 mov ecx, [eax + 8] |
| 908 imul ecx, [esi - 12] |
| 909 add ebp, ecx |
| 910 mov ecx, [eax + 4] |
| 911 imul ecx, [esi - 8] |
| 912 add ebp, ecx |
| 913 mov ecx, [eax] ; there is one byte missing |
| 914 imul ecx, [esi - 4] |
| 915 add ebp, ecx |
| 916 .jumper_0: |
| 917 |
| 918 mov ecx, [esp + 36] |
| 919 sar ebp, cl |
| 920 neg ebp |
| 921 add ebp, [esi] |
| 922 mov [edi + esi], ebp |
| 923 add esi, byte 4 |
| 924 |
| 925 dec ebx |
| 926 jz short .end |
| 927 xor ebp, ebp |
| 928 jmp edx |
| 929 |
| 930 .end: |
| 931 pop edi |
| 932 pop esi |
| 933 pop ebx |
| 934 pop ebp |
| 935 ret |
| 936 |
| 937 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for |
| 938 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine |
| 939 ; cannot be used for side-channel coded 16bps channels since the effective bps |
| 940 ; is 17. |
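| ; For example, with mid/side coding of 16-bit input the side channel |
| ; (left minus right) can reach +/-65535, which needs 17 bits, so such |
| ; channels must take the plain ia32 routine above instead. |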
| 941 ALIGN 16 |
| 942 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx |
| 943 ;[esp + 40] residual[] |
| 944 ;[esp + 36] lp_quantization |
| 945 ;[esp + 32] order |
| 946 ;[esp + 28] qlp_coeff[] |
| 947 ;[esp + 24] data_len |
| 948 ;[esp + 20] data[] |
| 949 |
| 950 ;ASSERT(order > 0) |
| 951 |
| 952 push ebp |
| 953 push ebx |
| 954 push esi |
| 955 push edi |
| 956 |
| 957 mov esi, [esp + 20] ; esi = data[] |
| 958 mov edi, [esp + 40] ; edi = residual[] |
| 959 mov eax, [esp + 32] ; eax = order |
| 960 mov ebx, [esp + 24] ; ebx = data_len |
| 961 |
| 962 test ebx, ebx |
| 963 jz near .end ; do nothing if data_len == 0 |
| 964 dec ebx |
| 965 test ebx, ebx |
| 966 jz near .last_one |
| 967 |
| 968 mov edx, [esp + 28] ; edx = qlp_coeff[] |
| 969 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization |
| 970 mov ebp, esp |
| 971 |
| 972 and esp, 0xfffffff8 |
| 973 |
| 974 xor ecx, ecx |
| 975 .copy_qlp_loop: |
| 976 push word [edx + 4 * ecx] |
| 977 inc ecx |
| 978 cmp ecx, eax |
| 979 jnz short .copy_qlp_loop |
| 980 |
| 981 and ecx, 0x3 |
| 982 test ecx, ecx |
| 983 je short .za_end |
| 984 sub ecx, byte 4 |
| 985 .za_loop: |
| 986 push word 0 |
| 987 inc eax |
| 988 inc ecx |
| 989 jnz short .za_loop |
| 990 .za_end: |
| 991 |
| 992 movq mm5, [esp + 2 * eax - 8] |
| 993 movd mm4, [esi - 16] |
| 994 punpckldq mm4, [esi - 12] |
| 995 movd mm0, [esi - 8] |
| 996 punpckldq mm0, [esi - 4] |
| 997 packssdw mm4, mm0 |
| 998 |
| 999 cmp eax, byte 4 |
| 1000 jnbe short .mmx_4more |
| 1001 |
| 1002 ALIGN 16 |
| 1003 .mmx_4_loop_i: |
| 1004 movd mm1, [esi] |
| 1005 movq mm3, mm4 |
| 1006 punpckldq mm1, [esi + 4] |
| 1007 psrlq mm4, 16 |
| 1008 movq mm0, mm1 |
| 1009 psllq mm0, 48 |
| 1010 por mm4, mm0 |
| 1011 movq mm2, mm4 |
| 1012 psrlq mm4, 16 |
| 1013 pxor mm0, mm0 |
| 1014 punpckhdq mm0, mm1 |
| 1015 pmaddwd mm3, mm5 |
| 1016 pmaddwd mm2, mm5 |
| 1017 psllq mm0, 16 |
| 1018 por mm4, mm0 |
| 1019 movq mm0, mm3 |
| 1020 punpckldq mm3, mm2 |
| 1021 punpckhdq mm0, mm2 |
| 1022 paddd mm3, mm0 |
| 1023 psrad mm3, mm6 |
| 1024 psubd mm1, mm3 |
| 1025 movd [edi], mm1 |
| 1026 punpckhdq mm1, mm1 |
| 1027 movd [edi + 4], mm1 |
| 1028 |
| 1029 add edi, byte 8 |
| 1030 add esi, byte 8 |
| 1031 |
| 1032 sub ebx, 2 |
| 1033 jg .mmx_4_loop_i |
| 1034 jmp .mmx_end |
| 1035 |
| 1036 .mmx_4more: |
| 1037 shl eax, 2 |
| 1038 neg eax |
| 1039 add eax, byte 16 |
| 1040 |
| 1041 ALIGN 16 |
| 1042 .mmx_4more_loop_i: |
| 1043 movd mm1, [esi] |
| 1044 punpckldq mm1, [esi + 4] |
| 1045 movq mm3, mm4 |
| 1046 psrlq mm4, 16 |
| 1047 movq mm0, mm1 |
| 1048 psllq mm0, 48 |
| 1049 por mm4, mm0 |
| 1050 movq mm2, mm4 |
| 1051 psrlq mm4, 16 |
| 1052 pxor mm0, mm0 |
| 1053 punpckhdq mm0, mm1 |
| 1054 pmaddwd mm3, mm5 |
| 1055 pmaddwd mm2, mm5 |
| 1056 psllq mm0, 16 |
| 1057 por mm4, mm0 |
| 1058 |
| 1059 mov ecx, esi |
| 1060 add ecx, eax |
| 1061 mov edx, esp |
| 1062 |
| 1063 ALIGN 16 |
| 1064 .mmx_4more_loop_j: |
| 1065 movd mm0, [ecx - 16] |
| 1066 movd mm7, [ecx - 8] |
| 1067 punpckldq mm0, [ecx - 12] |
| 1068 punpckldq mm7, [ecx - 4] |
| 1069 packssdw mm0, mm7 |
| 1070 pmaddwd mm0, [edx] |
| 1071 punpckhdq mm7, mm7 |
| 1072 paddd mm3, mm0 |
| 1073 movd mm0, [ecx - 12] |
| 1074 punpckldq mm0, [ecx - 8] |
| 1075 punpckldq mm7, [ecx] |
| 1076 packssdw mm0, mm7 |
| 1077 pmaddwd mm0, [edx] |
| 1078 paddd mm2, mm0 |
| 1079 |
| 1080 add edx, byte 8 |
| 1081 add ecx, byte 16 |
| 1082 cmp ecx, esi |
| 1083 jnz .mmx_4more_loop_j |
| 1084 |
| 1085 movq mm0, mm3 |
| 1086 punpckldq mm3, mm2 |
| 1087 punpckhdq mm0, mm2 |
| 1088 paddd mm3, mm0 |
| 1089 psrad mm3, mm6 |
| 1090 psubd mm1, mm3 |
| 1091 movd [edi], mm1 |
| 1092 punpckhdq mm1, mm1 |
| 1093 movd [edi + 4], mm1 |
| 1094 |
| 1095 add edi, byte 8 |
| 1096 add esi, byte 8 |
| 1097 |
| 1098 sub ebx, 2 |
| 1099 jg near .mmx_4more_loop_i |
| 1100 |
| 1101 .mmx_end: |
| 1102 emms |
| 1103 mov esp, ebp |
| 1104 .last_one: |
| 1105 mov eax, [esp + 32] |
| 1106 inc ebx |
| 1107 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin |
| 1108 |
| 1109 .end: |
| 1110 pop edi |
| 1111 pop esi |
| 1112 pop ebx |
| 1113 pop ebp |
| 1114 ret |
| 1115 |
| 1116 ; ********************************************************************** |
| 1117 ; |
| 1118 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) |
| 1119 ; { |
| 1120 ; unsigned i, j; |
| 1121 ; FLAC__int32 sum; |
| 1122 ; |
| 1123 ; FLAC__ASSERT(order > 0); |
| 1124 ; |
| 1125 ; for(i = 0; i < data_len; i++) { |
| 1126 ; sum = 0; |
| 1127 ; for(j = 0; j < order; j++) |
| 1128 ; sum += qlp_coeff[j] * data[i-j-1]; |
| 1129 ; data[i] = residual[i] + (sum >> lp_quantization); |
| 1130 ; } |
| 1131 ; } |
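| ; A small worked example (illustrative numbers only, not from the codebase), |
| ; inverting the residual example given earlier: with order = 2, |
| ; lp_quantization = 1, qlp_coeff = {3, -1}, warm-up history data[-2] = 2, |
| ; data[-1] = 4 and residual[0] = 0: |
| ;   sum     = 3*data[-1] + (-1)*data[-2] = 10 |
| ;   data[0] = residual[0] + (sum >> 1)   = 0 + 5 = 5 |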
| 1132 ALIGN 16 |
| 1133 cident FLAC__lpc_restore_signal_asm_ia32 |
| 1134 ;[esp + 40] data[] |
| 1135 ;[esp + 36] lp_quantization |
| 1136 ;[esp + 32] order |
| 1137 ;[esp + 28] qlp_coeff[] |
| 1138 ;[esp + 24] data_len |
| 1139 ;[esp + 20] residual[] |
| 1140 |
| 1141 ;ASSERT(order > 0) |
| 1142 |
| 1143 push ebp |
| 1144 push ebx |
| 1145 push esi |
| 1146 push edi |
| 1147 |
| 1148 mov esi, [esp + 20] ; esi = residual[] |
| 1149 mov edi, [esp + 40] ; edi = data[] |
| 1150 mov eax, [esp + 32] ; eax = order |
| 1151 mov ebx, [esp + 24] ; ebx = data_len |
| 1152 |
| 1153 test ebx, ebx |
| 1154 jz near .end ; do nothing if data_len == 0 |
| 1155 |
| 1156 .begin: |
| 1157 cmp eax, byte 1 |
| 1158 jg short .x87_1more |
| 1159 |
| 1160 mov ecx, [esp + 28] |
| 1161 mov edx, [ecx] |
| 1162 mov eax, [edi - 4] |
| 1163 mov ecx, [esp + 36] |
| 1164 ALIGN 16 |
| 1165 .x87_1_loop_i: |
| 1166 imul eax, edx |
| 1167 sar eax, cl |
| 1168 add eax, [esi] |
| 1169 mov [edi], eax |
| 1170 add esi, byte 4 |
| 1171 add edi, byte 4 |
| 1172 dec ebx |
| 1173 jnz .x87_1_loop_i |
| 1174 |
| 1175 jmp .end |
| 1176 |
| 1177 .x87_1more: |
| 1178 cmp eax, byte 32 ; for order <= 32 there is a faster routine |
| 1179 jbe short .x87_32 |
| 1180 |
| 1181 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 |
| 1182 ALIGN 16 |
| 1183 .x87_32more_loop_i: |
| 1184 xor ebp, ebp |
| 1185 mov ecx, [esp + 32] |
| 1186 mov edx, ecx |
| 1187 shl edx, 2 |
| 1188 add edx, [esp + 28] |
| 1189 neg ecx |
| 1190 ALIGN 16 |
| 1191 .x87_32more_loop_j: |
| 1192 sub edx, byte 4 |
| 1193 mov eax, [edx] |
| 1194 imul eax, [edi + 4 * ecx] |
| 1195 add ebp, eax |
| 1196 inc ecx |
| 1197 jnz short .x87_32more_loop_j |
| 1198 |
| 1199 mov ecx, [esp + 36] |
| 1200 sar ebp, cl |
| 1201 add ebp, [esi] |
| 1202 mov [edi], ebp |
| 1203 add edi, byte 4 |
| 1204 add esi, byte 4 |
| 1205 |
| 1206 dec ebx |
| 1207 jnz .x87_32more_loop_i |
| 1208 |
| 1209 jmp .end |
| 1210 |
| 1211 .mov_eip_to_eax: |
| 1212 mov eax, [esp] |
| 1213 ret |
| 1214 |
| 1215 .x87_32: |
| 1216 sub esi, edi |
| 1217 neg eax |
| 1218 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] |
| 1219 call .mov_eip_to_eax |
| 1220 .get_eip0: |
| 1221 add edx, eax |
| 1222 inc edx ; compensate for the shorter opcode on the last iteration |
| 1223 mov eax, [esp + 28] ; eax = qlp_coeff[] |
| 1224 xor ebp, ebp |
| 1225 jmp edx |
| 1226 |
| 1227 mov ecx, [eax + 124] ; ecx = qlp_coeff[31] |
| 1228 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-
32] |
| 1229 add ebp, ecx ; sum += qlp_coeff[31] * data[i-
32] |
| 1230 mov ecx, [eax + 120] ; ecx = qlp_coeff[30] |
| 1231 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-
31] |
| 1232 add ebp, ecx ; sum += qlp_coeff[30] * data[i-
31] |
| 1233 mov ecx, [eax + 116] ; ecx = qlp_coeff[29] |
| 1234 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-
30] |
| 1235 add ebp, ecx ; sum += qlp_coeff[29] * data[i-
30] |
| 1236 mov ecx, [eax + 112] ; ecx = qlp_coeff[28] |
| 1237 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-
29] |
| 1238 add ebp, ecx ; sum += qlp_coeff[28] * data[i-
29] |
| 1239 mov ecx, [eax + 108] ; ecx = qlp_coeff[27] |
| 1240 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-
28] |
| 1241 add ebp, ecx ; sum += qlp_coeff[27] * data[i-
28] |
| 1242 mov ecx, [eax + 104] ; ecx = qlp_coeff[26] |
| 1243 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-
27] |
| 1244 add ebp, ecx ; sum += qlp_coeff[26] * data[i-
27] |
| 1245 mov ecx, [eax + 100] ; ecx = qlp_coeff[25] |
| 1246 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-
26] |
| 1247 add ebp, ecx ; sum += qlp_coeff[25] * data[i-
26] |
| 1248 mov ecx, [eax + 96] ; ecx = qlp_coeff[24] |
| 1249 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-
25] |
| 1250 add ebp, ecx ; sum += qlp_coeff[24] * data[i-
25] |
| 1251 mov ecx, [eax + 92] ; ecx = qlp_coeff[23] |
| 1252 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-
24] |
| 1253 add ebp, ecx ; sum += qlp_coeff[23] * data[i-
24] |
| 1254 mov ecx, [eax + 88] ; ecx = qlp_coeff[22] |
| 1255 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-
23] |
| 1256 add ebp, ecx ; sum += qlp_coeff[22] * data[i-
23] |
| 1257 mov ecx, [eax + 84] ; ecx = qlp_coeff[21] |
| 1258 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-
22] |
| 1259 add ebp, ecx ; sum += qlp_coeff[21] * data[i-
22] |
| 1260 mov ecx, [eax + 80] ; ecx = qlp_coeff[20] |
| 1261 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-
21] |
| 1262 add ebp, ecx ; sum += qlp_coeff[20] * data[i-
21] |
| 1263 mov ecx, [eax + 76] ; ecx = qlp_coeff[19] |
| 1264 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-
20] |
| 1265 add ebp, ecx ; sum += qlp_coeff[19] * data[i-
20] |
| 1266 mov ecx, [eax + 72] ; ecx = qlp_coeff[18] |
| 1267 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-
19] |
| 1268 add ebp, ecx ; sum += qlp_coeff[18] * data[i-
19] |
| 1269 mov ecx, [eax + 68] ; ecx = qlp_coeff[17] |
| 1270 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-
18] |
| 1271 add ebp, ecx ; sum += qlp_coeff[17] * data[i-
18] |
| 1272 mov ecx, [eax + 64] ; ecx = qlp_coeff[16] |
| 1273 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-
17] |
| 1274 add ebp, ecx ; sum += qlp_coeff[16] * data[i-
17] |
| 1275 mov ecx, [eax + 60] ; ecx = qlp_coeff[15] |
| 1276 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-
16] |
| 1277 add ebp, ecx ; sum += qlp_coeff[15] * data[i-
16] |
| 1278 mov ecx, [eax + 56] ; ecx = qlp_coeff[14] |
| 1279 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-
15] |
| 1280 add ebp, ecx ; sum += qlp_coeff[14] * data[i-
15] |
| 1281 mov ecx, [eax + 52] ; ecx = qlp_coeff[13] |
| 1282 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-
14] |
| 1283 add ebp, ecx ; sum += qlp_coeff[13] * data[i-
14] |
| 1284 mov ecx, [eax + 48] ; ecx = qlp_coeff[12] |
| 1285 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-
13] |
| 1286 add ebp, ecx ; sum += qlp_coeff[12] * data[i-
13] |
| 1287 mov ecx, [eax + 44] ; ecx = qlp_coeff[11] |
| 1288 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-
12] |
| 1289 add ebp, ecx ; sum += qlp_coeff[11] * data[i-
12] |
| 1290 mov ecx, [eax + 40] ; ecx = qlp_coeff[10] |
| 1291 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-
11] |
| 1292 add ebp, ecx ; sum += qlp_coeff[10] * data[i-
11] |
| 1293 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9] |
| 1294 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-
10] |
| 1295 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-
10] |
| 1296 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8] |
| 1297 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i-
9] |
| 1298 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i-
9] |
| 1299 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7] |
| 1300 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i-
8] |
| 1301 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i-
8] |
| 1302 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6] |
| 1303 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i-
7] |
| 1304 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i-
7] |
| 1305 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5] |
| 1306 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i-
6] |
| 1307 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i-
6] |
| 1308 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4] |
| 1309 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i-
5] |
| 1310 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i-
5] |
| 1311 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3] |
| 1312 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i-
4] |
| 1313 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i-
4] |
| 1314 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2] |
| 1315 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i-
3] |
| 1316 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i-
3] |
| 1317 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1] |
| 1318 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i-
2] |
| 1319 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i-
2] |
| 1320 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: on
e byte missing from instruction) |
| 1321 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i-
1] |
| 1322 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i-
1] |
| 1323 .jumper_0: |
| 1324 |
| 1325 mov ecx, [esp + 36] |
| 1326 sar ebp, cl ; ebp = (sum >> lp_quantization) |
| 1327 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization) |
| 1328 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization) |
| 1329 add edi, byte 4 |
| 1330 |
| 1331 dec ebx |
| 1332 jz short .end |
| 1333 xor ebp, ebp |
| 1334 jmp edx |
| 1335 |
| 1336 .end: |
| 1337 pop edi |
| 1338 pop esi |
| 1339 pop ebx |
| 1340 pop ebp |
| 1341 ret |
| 1342 |
| 1343 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for |
| 1344 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine |
| 1345 ; cannot be used for side-channel coded 16bps channels since the effective bps |
| 1346 ; is 17. |
| 1347 ; WATCHOUT: this routine requires that each data array have a buffer of up to |
| 1348 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each |
| 1349 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero. |
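| ; (For example, with order == 5 the coefficient vector is zero-padded to 8 taps |
| ; below, so the first iterations also read up to 3 samples before the start of |
| ; the channel buffer; those extra slots are multiplied by the zero taps, which |
| ; is why they only need to exist and hold zero.) |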
| 1350 ALIGN 16 |
| 1351 cident FLAC__lpc_restore_signal_asm_ia32_mmx |
| 1352 ;[esp + 40] data[] |
| 1353 ;[esp + 36] lp_quantization |
| 1354 ;[esp + 32] order |
| 1355 ;[esp + 28] qlp_coeff[] |
| 1356 ;[esp + 24] data_len |
| 1357 ;[esp + 20] residual[] |
| 1358 |
| 1359 ;ASSERT(order > 0) |
| 1360 |
| 1361 push ebp |
| 1362 push ebx |
| 1363 push esi |
| 1364 push edi |
| 1365 |
| 1366 mov esi, [esp + 20] |
| 1367 mov edi, [esp + 40] |
| 1368 mov eax, [esp + 32] |
| 1369 mov ebx, [esp + 24] |
| 1370 |
| 1371 test ebx, ebx |
| 1372 jz near .end ; do nothing if data_len == 0 |
| 1373 cmp eax, byte 4 |
| 1374 jb near FLAC__lpc_restore_signal_asm_ia32.begin |
| 1375 |
| 1376 mov edx, [esp + 28] |
| 1377 movd mm6, [esp + 36] |
| 1378 mov ebp, esp |
| 1379 |
| 1380 and esp, 0xfffffff8 |
| 1381 |
| 1382 xor ecx, ecx |
| 1383 .copy_qlp_loop: |
| 1384 push word [edx + 4 * ecx] |
| 1385 inc ecx |
| 1386 cmp ecx, eax |
| 1387 jnz short .copy_qlp_loop |
| 1388 |
| 1389 and ecx, 0x3 |
| 1390 test ecx, ecx |
| 1391 je short .za_end |
| 1392 sub ecx, byte 4 |
| 1393 .za_loop: |
| 1394 push word 0 |
| 1395 inc eax |
| 1396 inc ecx |
| 1397 jnz short .za_loop |
| 1398 .za_end: |
| 1399 |
| 1400 movq mm5, [esp + 2 * eax - 8] |
| 1401 movd mm4, [edi - 16] |
| 1402 punpckldq mm4, [edi - 12] |
| 1403 movd mm0, [edi - 8] |
| 1404 punpckldq mm0, [edi - 4] |
| 1405 packssdw mm4, mm0 |
| 1406 |
| 1407 cmp eax, byte 4 |
| 1408 jnbe short .mmx_4more |
| 1409 |
| 1410 ALIGN 16 |
| 1411 .mmx_4_loop_i: |
| 1412 movq mm7, mm4 |
| 1413 pmaddwd mm7, mm5 |
| 1414 movq mm0, mm7 |
| 1415 punpckhdq mm7, mm7 |
| 1416 paddd mm7, mm0 |
| 1417 psrad mm7, mm6 |
| 1418 movd mm1, [esi] |
| 1419 paddd mm7, mm1 |
| 1420 movd [edi], mm7 |
| 1421 psllq mm7, 48 |
| 1422 psrlq mm4, 16 |
| 1423 por mm4, mm7 |
| 1424 |
| 1425 add esi, byte 4 |
| 1426 add edi, byte 4 |
| 1427 |
| 1428 dec ebx |
| 1429 jnz .mmx_4_loop_i |
| 1430 jmp .mmx_end |
| 1431 .mmx_4more: |
| 1432 shl eax, 2 |
| 1433 neg eax |
| 1434 add eax, byte 16 |
| 1435 ALIGN 16 |
| 1436 .mmx_4more_loop_i: |
| 1437 mov ecx, edi |
| 1438 add ecx, eax |
| 1439 mov edx, esp |
| 1440 |
| 1441 movq mm7, mm4 |
| 1442 pmaddwd mm7, mm5 |
| 1443 |
| 1444 ALIGN 16 |
| 1445 .mmx_4more_loop_j: |
| 1446 movd mm0, [ecx - 16] |
| 1447 punpckldq mm0, [ecx - 12] |
| 1448 movd mm1, [ecx - 8] |
| 1449 punpckldq mm1, [ecx - 4] |
| 1450 packssdw mm0, mm1 |
| 1451 pmaddwd mm0, [edx] |
| 1452 paddd mm7, mm0 |
| 1453 |
| 1454 add edx, byte 8 |
| 1455 add ecx, byte 16 |
| 1456 cmp ecx, edi |
| 1457 jnz .mmx_4more_loop_j |
| 1458 |
| 1459 movq mm0, mm7 |
| 1460 punpckhdq mm7, mm7 |
| 1461 paddd mm7, mm0 |
| 1462 psrad mm7, mm6 |
| 1463 movd mm1, [esi] |
| 1464 paddd mm7, mm1 |
| 1465 movd [edi], mm7 |
| 1466 psllq mm7, 48 |
| 1467 psrlq mm4, 16 |
| 1468 por mm4, mm7 |
| 1469 |
| 1470 add esi, byte 4 |
| 1471 add edi, byte 4 |
| 1472 |
| 1473 dec ebx |
| 1474 jnz short .mmx_4more_loop_i |
| 1475 .mmx_end: |
| 1476 emms |
| 1477 mov esp, ebp |
| 1478 |
| 1479 .end: |
| 1480 pop edi |
| 1481 pop esi |
| 1482 pop ebx |
| 1483 pop ebp |
| 1484 ret |
| 1485 |
| 1486 |
| 1487 ; ********************************************************************** |
| 1488 ; |
| 1489 ;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) |
| 1490 ; { |
| 1491 ; unsigned i, j; |
| 1492 ; FLAC__int64 sum; |
| 1493 ; |
| 1494 ; FLAC__ASSERT(order > 0); |
| 1495 ; |
| 1496 ; for(i = 0; i < data_len; i++) { |
| 1497 ; sum = 0; |
| 1498 ; for(j = 0; j < order; j++) |
| 1499 ; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; |
| 1500 ; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); |
| 1501 ; } |
| 1502 ; } |
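| ; The routine below keeps the 64-bit sum in esi:ecx: each one-operand |
| ; 'imul dword [...]' leaves a signed 64-bit product in edx:eax, which is |
| ; accumulated with 'add ecx, eax' / 'adc esi, edx' (low half, then high half |
| ; plus carry). |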
| 1503 ALIGN 16 |
| 1504 cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 |
| 1505 ;[esp + 40] residual[] |
| 1506 ;[esp + 36] lp_quantization |
| 1507 ;[esp + 32] order |
| 1508 ;[esp + 28] qlp_coeff[] |
| 1509 ;[esp + 24] data_len |
| 1510 ;[esp + 20] data[] |
| 1511 |
| 1512 ;ASSERT(order > 0) |
| 1513 ;ASSERT(order <= 32) |
| 1514 ;ASSERT(lp_quantization <= 31) |
| 1515 |
| 1516 push ebp |
| 1517 push ebx |
| 1518 push esi |
| 1519 push edi |
| 1520 |
| 1521 mov ebx, [esp + 24] ; ebx = data_len |
| 1522 test ebx, ebx |
| 1523 jz near .end ; do nothing if data_len == 0 |
| 1524 |
| 1525 .begin: |
| 1526 mov eax, [esp + 32] ; eax = order |
| 1527 cmp eax, 1 |
| 1528 jg short .i_32 |
| 1529 |
| 1530 mov esi, [esp + 40] ; esi = residual[] |
| 1531 mov edi, [esp + 20] ; edi = data[] |
| 1532 mov ecx, [esp + 28] ; ecx = qlp_coeff[] |
| 1533 mov ebp, [ecx] ; ebp = qlp_coeff[0] |
| 1534 mov eax, [edi - 4] ; eax = data[-1] |
| 1535 mov ecx, [esp + 36] ; cl = lp_quantization |
| 1536 ALIGN 16 |
| 1537 .i_1_loop_i: |
| 1538 imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] |
| 1539 shrd eax, edx, cl ; 0 <= lp_quantization <= 15 |
| 1540 neg eax |
| 1541 add eax, [edi] |
| 1542 mov [esi], eax |
| 1543 mov eax, [edi] |
| 1544 add esi, 4 |
| 1545 add edi, 4 |
| 1546 dec ebx |
| 1547 jnz .i_1_loop_i |
| 1548 jmp .end |
| 1549 |
| 1550 .mov_eip_to_eax: |
| 1551 mov eax, [esp] |
| 1552 ret |
| 1553 |
| 1554 .i_32: ; eax = order |
| 1555 neg eax |
| 1556 add eax, eax |
| 1557 lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] |
| 1558 call .mov_eip_to_eax |
| 1559 .get_eip0: |
| 1560 add ebp, eax |
| 1561 inc ebp ; compensate for the shorter opcode on the last iteration |
| 1562 |
| 1563 mov ebx, [esp + 28] ; ebx = qlp_coeff[] |
| 1564 mov edi, [esp + 20] ; edi = data[] |
| 1565 sub [esp + 40], edi ; residual[] -= data[] |
| 1566 |
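; Note on "residual[] -= data[]": the stored residual pointer is turned into
; a byte offset relative to data[], so the loop advances only one pointer
; (edi, over data[]) and still reaches residual[i] as [edi + offset].
; Roughly, in C terms (illustrative only; `d` is just a name for the pointer
; that edi carries):
;
;	ptrdiff_t off = (char *)residual - (char *)data;
;	/* per sample: */
;	*(FLAC__int32 *)((char *)d + off) = d[0] - (FLAC__int32)(sum >> lp_quantization);
;	d++;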
| 1567 xor ecx, ecx |
| 1568 xor esi, esi |
| 1569 jmp ebp |
| 1570 |
| 1571 ;eax = -- |
| 1572 ;edx = -- |
| 1573 ;ecx = 0 |
| 1574 ;esi = 0 |
| 1575 ; |
| 1576 ;ebx = qlp_coeff[] |
| 1577 ;edi = data[] |
| 1578 ;ebp = @address |
| 1579 |
| 1580 mov eax, [ebx + 124] ; eax = qlp_coeff[31] |
| 1581 	imul	dword [edi - 128]		; edx:eax = qlp_coeff[31] * data[i-32] |
| 1582 	add	ecx, eax |
| 1583 	adc	esi, edx			; sum += qlp_coeff[31] * data[i-32] |
| 1584 |
| 1585 mov eax, [ebx + 120] ; eax = qlp_coeff[30] |
| 1586 	imul	dword [edi - 124]		; edx:eax = qlp_coeff[30] * data[i-31] |
| 1587 	add	ecx, eax |
| 1588 	adc	esi, edx			; sum += qlp_coeff[30] * data[i-31] |
| 1589 |
| 1590 mov eax, [ebx + 116] |
| 1591 imul dword [edi - 120] |
| 1592 add ecx, eax |
| 1593 adc esi, edx |
| 1594 |
| 1595 mov eax, [ebx + 112] |
| 1596 imul dword [edi - 116] |
| 1597 add ecx, eax |
| 1598 adc esi, edx |
| 1599 |
| 1600 mov eax, [ebx + 108] |
| 1601 imul dword [edi - 112] |
| 1602 add ecx, eax |
| 1603 adc esi, edx |
| 1604 |
| 1605 mov eax, [ebx + 104] |
| 1606 imul dword [edi - 108] |
| 1607 add ecx, eax |
| 1608 adc esi, edx |
| 1609 |
| 1610 mov eax, [ebx + 100] |
| 1611 imul dword [edi - 104] |
| 1612 add ecx, eax |
| 1613 adc esi, edx |
| 1614 |
| 1615 mov eax, [ebx + 96] |
| 1616 imul dword [edi - 100] |
| 1617 add ecx, eax |
| 1618 adc esi, edx |
| 1619 |
| 1620 mov eax, [ebx + 92] |
| 1621 imul dword [edi - 96] |
| 1622 add ecx, eax |
| 1623 adc esi, edx |
| 1624 |
| 1625 mov eax, [ebx + 88] |
| 1626 imul dword [edi - 92] |
| 1627 add ecx, eax |
| 1628 adc esi, edx |
| 1629 |
| 1630 mov eax, [ebx + 84] |
| 1631 imul dword [edi - 88] |
| 1632 add ecx, eax |
| 1633 adc esi, edx |
| 1634 |
| 1635 mov eax, [ebx + 80] |
| 1636 imul dword [edi - 84] |
| 1637 add ecx, eax |
| 1638 adc esi, edx |
| 1639 |
| 1640 mov eax, [ebx + 76] |
| 1641 imul dword [edi - 80] |
| 1642 add ecx, eax |
| 1643 adc esi, edx |
| 1644 |
| 1645 mov eax, [ebx + 72] |
| 1646 imul dword [edi - 76] |
| 1647 add ecx, eax |
| 1648 adc esi, edx |
| 1649 |
| 1650 mov eax, [ebx + 68] |
| 1651 imul dword [edi - 72] |
| 1652 add ecx, eax |
| 1653 adc esi, edx |
| 1654 |
| 1655 mov eax, [ebx + 64] |
| 1656 imul dword [edi - 68] |
| 1657 add ecx, eax |
| 1658 adc esi, edx |
| 1659 |
| 1660 mov eax, [ebx + 60] |
| 1661 imul dword [edi - 64] |
| 1662 add ecx, eax |
| 1663 adc esi, edx |
| 1664 |
| 1665 mov eax, [ebx + 56] |
| 1666 imul dword [edi - 60] |
| 1667 add ecx, eax |
| 1668 adc esi, edx |
| 1669 |
| 1670 mov eax, [ebx + 52] |
| 1671 imul dword [edi - 56] |
| 1672 add ecx, eax |
| 1673 adc esi, edx |
| 1674 |
| 1675 mov eax, [ebx + 48] |
| 1676 imul dword [edi - 52] |
| 1677 add ecx, eax |
| 1678 adc esi, edx |
| 1679 |
| 1680 mov eax, [ebx + 44] |
| 1681 imul dword [edi - 48] |
| 1682 add ecx, eax |
| 1683 adc esi, edx |
| 1684 |
| 1685 mov eax, [ebx + 40] |
| 1686 imul dword [edi - 44] |
| 1687 add ecx, eax |
| 1688 adc esi, edx |
| 1689 |
| 1690 mov eax, [ebx + 36] |
| 1691 imul dword [edi - 40] |
| 1692 add ecx, eax |
| 1693 adc esi, edx |
| 1694 |
| 1695 mov eax, [ebx + 32] |
| 1696 imul dword [edi - 36] |
| 1697 add ecx, eax |
| 1698 adc esi, edx |
| 1699 |
| 1700 mov eax, [ebx + 28] |
| 1701 imul dword [edi - 32] |
| 1702 add ecx, eax |
| 1703 adc esi, edx |
| 1704 |
| 1705 mov eax, [ebx + 24] |
| 1706 imul dword [edi - 28] |
| 1707 add ecx, eax |
| 1708 adc esi, edx |
| 1709 |
| 1710 mov eax, [ebx + 20] |
| 1711 imul dword [edi - 24] |
| 1712 add ecx, eax |
| 1713 adc esi, edx |
| 1714 |
| 1715 mov eax, [ebx + 16] |
| 1716 imul dword [edi - 20] |
| 1717 add ecx, eax |
| 1718 adc esi, edx |
| 1719 |
| 1720 mov eax, [ebx + 12] |
| 1721 imul dword [edi - 16] |
| 1722 add ecx, eax |
| 1723 adc esi, edx |
| 1724 |
| 1725 mov eax, [ebx + 8] |
| 1726 imul dword [edi - 12] |
| 1727 add ecx, eax |
| 1728 adc esi, edx |
| 1729 |
| 1730 mov eax, [ebx + 4] |
| 1731 imul dword [edi - 8] |
| 1732 add ecx, eax |
| 1733 adc esi, edx |
| 1734 |
| 1735 	mov	eax, [ebx]			; eax = qlp_coeff[ 0]	(NOTE: one byte missing from instruction) |
| 1736 	imul	dword [edi - 4]			; edx:eax = qlp_coeff[ 0] * data[i- 1] |
| 1737 	add	ecx, eax |
| 1738 	adc	esi, edx			; sum += qlp_coeff[ 0] * data[i- 1] |
| 1739 |
| 1740 .jumper_0: |
| 1741 mov edx, ecx |
| 1742 ;esi:edx = sum |
| 1743 mov ecx, [esp + 36] ; cl = lp_quantization |
| 1744 shrd edx, esi, cl ; edx = (sum >> lp_quantization) |
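; (shrd pulls bits from esi into the top of edx, so edx holds the low 32 bits
;  of the 64-bit value sum >> lp_quantization -- the (FLAC__int32) cast in the
;  reference code above, assuming the shifted sum fits in 32 bits.)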
| 1745 ;eax = -- |
| 1746 ;ecx = -- |
| 1747 ;edx = sum >> lp_q |
| 1748 ;esi = -- |
| 1749 	neg	edx				; edx = -(sum >> lp_quantization) |
| 1750 	mov	eax, [esp + 40]			; residual[] - data[] |
| 1751 	add	edx, [edi]			; edx = data[i] - (sum >> lp_quantization) |
| 1752 mov [edi + eax], edx |
| 1753 add edi, 4 |
| 1754 |
| 1755 dec dword [esp + 24] |
| 1756 jz short .end |
| 1757 xor ecx, ecx |
| 1758 xor esi, esi |
| 1759 jmp ebp |
| 1760 |
| 1761 .end: |
| 1762 pop edi |
| 1763 pop esi |
| 1764 pop ebx |
| 1765 pop ebp |
| 1766 ret |
| 1767 |
| 1768 ; ********************************************************************** |
| 1769 ; |
| 1770 ; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) |
| 1771 ; { |
| 1772 ; unsigned i, j; |
| 1773 ; FLAC__int64 sum; |
| 1774 ; |
| 1775 ; FLAC__ASSERT(order > 0); |
| 1776 ; |
| 1777 ; for(i = 0; i < data_len; i++) { |
| 1778 ; sum = 0; |
| 1779 ; for(j = 0; j < order; j++) |
| 1780 ; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; |
| 1781 ; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); |
| 1782 ; } |
| 1783 ; } |
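;
; Like the wide residual routine above, this one jumps into the middle of a
; fully unrolled 32-term block so that only `order` multiply/accumulate terms
; run per sample.  The control structure is the same idea as a fall-through
; switch in C; the 4-tap sketch below is illustrative only (not libFLAC code,
; and lpc_sum_upto4 is a made-up name):
;
;	static FLAC__int64 lpc_sum_upto4(const FLAC__int32 *qlp_coeff, const FLAC__int32 *data, unsigned order)
;	{
;		FLAC__int64 sum = 0;
;		switch(order) {	/* enter at the first needed term, then fall through */
;			case 4: sum += qlp_coeff[3] * (FLAC__int64)data[-4]; /* fall through */
;			case 3: sum += qlp_coeff[2] * (FLAC__int64)data[-3]; /* fall through */
;			case 2: sum += qlp_coeff[1] * (FLAC__int64)data[-2]; /* fall through */
;			case 1: sum += qlp_coeff[0] * (FLAC__int64)data[-1];
;		}
;		return sum;	/* data points at the current sample, as edi does in the asm */
;	}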
| 1784 ALIGN 16 |
| 1785 cident FLAC__lpc_restore_signal_wide_asm_ia32 |
| 1786 ;[esp + 40] data[] |
| 1787 ;[esp + 36] lp_quantization |
| 1788 ;[esp + 32] order |
| 1789 ;[esp + 28] qlp_coeff[] |
| 1790 ;[esp + 24] data_len |
| 1791 ;[esp + 20] residual[] |
| 1792 |
| 1793 ;ASSERT(order > 0) |
| 1794 ;ASSERT(order <= 32) |
| 1795 ;ASSERT(lp_quantization <= 31) |
| 1796 |
| 1797 push ebp |
| 1798 push ebx |
| 1799 push esi |
| 1800 push edi |
| 1801 |
| 1802 mov ebx, [esp + 24] ; ebx = data_len |
| 1803 test ebx, ebx |
| 1804 	jz	near .end			; do nothing if data_len == 0 |
| 1805 |
| 1806 .begin: |
| 1807 mov eax, [esp + 32] ; eax = order |
| 1808 cmp eax, 1 |
| 1809 jg short .x87_32 |
| 1810 |
| 1811 mov esi, [esp + 20] ; esi = residual[] |
| 1812 mov edi, [esp + 40] ; edi = data[] |
| 1813 mov ecx, [esp + 28] ; ecx = qlp_coeff[] |
| 1814 mov ebp, [ecx] ; ebp = qlp_coeff[0] |
| 1815 mov eax, [edi - 4] ; eax = data[-1] |
| 1816 mov ecx, [esp + 36] ; cl = lp_quantization |
| 1817 ALIGN 16 |
| 1818 .x87_1_loop_i: |
| 1819 	imul	ebp				; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] |
| 1820 shrd eax, edx, cl ; 0 <= lp_quantization <= 15 |
| 1821 ; |
| 1822 	add	eax, [esi]			; eax = residual[i] + (sum >> lp_quantization) |
| 1823 	mov	[edi], eax			; data[i] = eax (and eax is data[i-1] for the next iteration) |
| 1824 ; |
| 1825 add esi, 4 |
| 1826 add edi, 4 |
| 1827 dec ebx |
| 1828 jnz .x87_1_loop_i |
| 1829 jmp .end |
| 1830 |
| 1831 .mov_eip_to_eax: |
| 1832 	mov	eax, [esp]			; eax = return address pushed by the call below (the address of .get_eip0) |
| 1833 ret |
| 1834 |
| 1835 .x87_32: ; eax = order |
| 1836 neg eax |
| 1837 add eax, eax |
| 1838 lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] |
| 1839 call .mov_eip_to_eax |
| 1840 .get_eip0: |
| 1841 add ebp, eax |
| 1842 	inc	ebp				; compensate for the shorter opcode on the last iteration |
| 1843 |
| 1844 mov ebx, [esp + 28] ; ebx = qlp_coeff[] |
| 1845 	mov	edi, [esp + 40]			; edi = data[] |
| 1846 sub [esp + 20], edi ; residual[] -= data[] |
| 1847 |
| 1848 xor ecx, ecx |
| 1849 xor esi, esi |
| 1850 jmp ebp |
| 1851 |
| 1852 ;eax = -- |
| 1853 ;edx = -- |
| 1854 ;ecx = 0 |
| 1855 ;esi = 0 |
| 1856 ; |
| 1857 ;ebx = qlp_coeff[] |
| 1858 ;edi = data[] |
| 1859 ;ebp = @address |
| 1860 |
| 1861 mov eax, [ebx + 124] ; eax = qlp_coeff[31] |
| 1862 	imul	dword [edi - 128]		; edx:eax = qlp_coeff[31] * data[i-32] |
| 1863 	add	ecx, eax |
| 1864 	adc	esi, edx			; sum += qlp_coeff[31] * data[i-32] |
| 1865 |
| 1866 mov eax, [ebx + 120] ; eax = qlp_coeff[30] |
| 1867 	imul	dword [edi - 124]		; edx:eax = qlp_coeff[30] * data[i-31] |
| 1868 	add	ecx, eax |
| 1869 	adc	esi, edx			; sum += qlp_coeff[30] * data[i-31] |
| 1870 |
| 1871 mov eax, [ebx + 116] |
| 1872 imul dword [edi - 120] |
| 1873 add ecx, eax |
| 1874 adc esi, edx |
| 1875 |
| 1876 mov eax, [ebx + 112] |
| 1877 imul dword [edi - 116] |
| 1878 add ecx, eax |
| 1879 adc esi, edx |
| 1880 |
| 1881 mov eax, [ebx + 108] |
| 1882 imul dword [edi - 112] |
| 1883 add ecx, eax |
| 1884 adc esi, edx |
| 1885 |
| 1886 mov eax, [ebx + 104] |
| 1887 imul dword [edi - 108] |
| 1888 add ecx, eax |
| 1889 adc esi, edx |
| 1890 |
| 1891 mov eax, [ebx + 100] |
| 1892 imul dword [edi - 104] |
| 1893 add ecx, eax |
| 1894 adc esi, edx |
| 1895 |
| 1896 mov eax, [ebx + 96] |
| 1897 imul dword [edi - 100] |
| 1898 add ecx, eax |
| 1899 adc esi, edx |
| 1900 |
| 1901 mov eax, [ebx + 92] |
| 1902 imul dword [edi - 96] |
| 1903 add ecx, eax |
| 1904 adc esi, edx |
| 1905 |
| 1906 mov eax, [ebx + 88] |
| 1907 imul dword [edi - 92] |
| 1908 add ecx, eax |
| 1909 adc esi, edx |
| 1910 |
| 1911 mov eax, [ebx + 84] |
| 1912 imul dword [edi - 88] |
| 1913 add ecx, eax |
| 1914 adc esi, edx |
| 1915 |
| 1916 mov eax, [ebx + 80] |
| 1917 imul dword [edi - 84] |
| 1918 add ecx, eax |
| 1919 adc esi, edx |
| 1920 |
| 1921 mov eax, [ebx + 76] |
| 1922 imul dword [edi - 80] |
| 1923 add ecx, eax |
| 1924 adc esi, edx |
| 1925 |
| 1926 mov eax, [ebx + 72] |
| 1927 imul dword [edi - 76] |
| 1928 add ecx, eax |
| 1929 adc esi, edx |
| 1930 |
| 1931 mov eax, [ebx + 68] |
| 1932 imul dword [edi - 72] |
| 1933 add ecx, eax |
| 1934 adc esi, edx |
| 1935 |
| 1936 mov eax, [ebx + 64] |
| 1937 imul dword [edi - 68] |
| 1938 add ecx, eax |
| 1939 adc esi, edx |
| 1940 |
| 1941 mov eax, [ebx + 60] |
| 1942 imul dword [edi - 64] |
| 1943 add ecx, eax |
| 1944 adc esi, edx |
| 1945 |
| 1946 mov eax, [ebx + 56] |
| 1947 imul dword [edi - 60] |
| 1948 add ecx, eax |
| 1949 adc esi, edx |
| 1950 |
| 1951 mov eax, [ebx + 52] |
| 1952 imul dword [edi - 56] |
| 1953 add ecx, eax |
| 1954 adc esi, edx |
| 1955 |
| 1956 mov eax, [ebx + 48] |
| 1957 imul dword [edi - 52] |
| 1958 add ecx, eax |
| 1959 adc esi, edx |
| 1960 |
| 1961 mov eax, [ebx + 44] |
| 1962 imul dword [edi - 48] |
| 1963 add ecx, eax |
| 1964 adc esi, edx |
| 1965 |
| 1966 mov eax, [ebx + 40] |
| 1967 imul dword [edi - 44] |
| 1968 add ecx, eax |
| 1969 adc esi, edx |
| 1970 |
| 1971 mov eax, [ebx + 36] |
| 1972 imul dword [edi - 40] |
| 1973 add ecx, eax |
| 1974 adc esi, edx |
| 1975 |
| 1976 mov eax, [ebx + 32] |
| 1977 imul dword [edi - 36] |
| 1978 add ecx, eax |
| 1979 adc esi, edx |
| 1980 |
| 1981 mov eax, [ebx + 28] |
| 1982 imul dword [edi - 32] |
| 1983 add ecx, eax |
| 1984 adc esi, edx |
| 1985 |
| 1986 mov eax, [ebx + 24] |
| 1987 imul dword [edi - 28] |
| 1988 add ecx, eax |
| 1989 adc esi, edx |
| 1990 |
| 1991 mov eax, [ebx + 20] |
| 1992 imul dword [edi - 24] |
| 1993 add ecx, eax |
| 1994 adc esi, edx |
| 1995 |
| 1996 mov eax, [ebx + 16] |
| 1997 imul dword [edi - 20] |
| 1998 add ecx, eax |
| 1999 adc esi, edx |
| 2000 |
| 2001 mov eax, [ebx + 12] |
| 2002 imul dword [edi - 16] |
| 2003 add ecx, eax |
| 2004 adc esi, edx |
| 2005 |
| 2006 mov eax, [ebx + 8] |
| 2007 imul dword [edi - 12] |
| 2008 add ecx, eax |
| 2009 adc esi, edx |
| 2010 |
| 2011 mov eax, [ebx + 4] |
| 2012 imul dword [edi - 8] |
| 2013 add ecx, eax |
| 2014 adc esi, edx |
| 2015 |
| 2016 	mov	eax, [ebx]			; eax = qlp_coeff[ 0]	(NOTE: one byte missing from instruction) |
| 2017 	imul	dword [edi - 4]			; edx:eax = qlp_coeff[ 0] * data[i- 1] |
| 2018 	add	ecx, eax |
| 2019 	adc	esi, edx			; sum += qlp_coeff[ 0] * data[i- 1] |
| 2020 |
| 2021 .jumper_0: |
| 2022 mov edx, ecx |
| 2023 ;esi:edx = sum |
| 2024 mov ecx, [esp + 36] ; cl = lp_quantization |
| 2025 shrd edx, esi, cl ; edx = (sum >> lp_quantization) |
| 2026 ;eax = -- |
| 2027 ;ecx = -- |
| 2028 ;edx = sum >> lp_q |
| 2029 ;esi = -- |
| 2030 ; |
| 2031 mov eax, [esp + 20] ; residual[] - data[] |
| 2032 	add	edx, [edi + eax]		; edx = residual[i] + (sum >> lp_quantization) |
| 2033 	mov	[edi], edx			; data[i] = residual[i] + (sum >> lp_quantization) |
| 2034 add edi, 4 |
| 2035 |
| 2036 dec dword [esp + 24] |
| 2037 jz short .end |
| 2038 xor ecx, ecx |
| 2039 xor esi, esi |
| 2040 jmp ebp |
| 2041 |
| 2042 .end: |
| 2043 pop edi |
| 2044 pop esi |
| 2045 pop ebx |
| 2046 pop ebp |
| 2047 ret |
| 2048 |
| 2049 ; end |