OLD | NEW |
(Empty) | |
| 1 ; vim:filetype=nasm ts=8 |
| 2 |
| 3 ; libFLAC - Free Lossless Audio Codec library |
| 4 ; Copyright (C) 2001-2009 Josh Coalson |
| 5 ; Copyright (C) 2011-2014 Xiph.Org Foundation |
| 6 ; |
| 7 ; Redistribution and use in source and binary forms, with or without |
| 8 ; modification, are permitted provided that the following conditions |
| 9 ; are met: |
| 10 ; |
| 11 ; - Redistributions of source code must retain the above copyright |
| 12 ; notice, this list of conditions and the following disclaimer. |
| 13 ; |
| 14 ; - Redistributions in binary form must reproduce the above copyright |
| 15 ; notice, this list of conditions and the following disclaimer in the |
| 16 ; documentation and/or other materials provided with the distribution. |
| 17 ; |
| 18 ; - Neither the name of the Xiph.org Foundation nor the names of its |
| 19 ; contributors may be used to endorse or promote products derived from |
| 20 ; this software without specific prior written permission. |
| 21 ; |
| 22 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 23 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 24 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 25 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 26 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 27 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 28 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 29 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 30 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 31 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 32 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 33 |
| 34 %include "nasm.h" |
| 35 |
; Section and symbol-visibility directives.
; NOTE(review): data_section, cglobal and code_section are not defined in this
; file; presumably they are macros supplied by the included nasm.h - confirm.
| 36 data_section |
| 37 |
; Presumably declares the routine below as a global (exported) symbol; the
; matching definition is the cident line with the same name - TODO confirm.
| 38 cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov |
| 39 |
| 40 code_section |
| 41 |
| 42 ; ********************************************************************** |
| 43 ; |
| 44 ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]) |
| 45 ; { |
| 46 ; FLAC__int32 last_error_0 = data[-1]; |
| 47 ; FLAC__int32 last_error_1 = data[-1] - data[-2]; |
| 48 ; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]); |
| 49 ; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); |
| 50 ; FLAC__int32 error, save; |
| 51 ; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0; |
| 52 ; unsigned i, order; |
| 53 ; |
| 54 ; for(i = 0; i < data_len; i++) { |
| 55 ; error = data[i] ; total_error_0 += local_abs(error); save = error; |
| 56 ; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error; |
| 57 ; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error; |
| 58 ; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error; |
| 59 ; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save; |
| 60 ; } |
| 61 ; |
| 62 ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) |
| 63 ; order = 0; |
| 64 ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) |
| 65 ; order = 1; |
| 66 ; else if(total_error_2 < min(total_error_3, total_error_4)) |
| 67 ; order = 2; |
| 68 ; else if(total_error_3 < total_error_4) |
| 69 ; order = 3; |
| 70 ; else |
| 71 ; order = 4; |
| 72 ; |
| 73 ; residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0); |
| 74 ; residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0); |
| 75 ; residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0); |
| 76 ; residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0); |
| 77 ; residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0); |
| 78 ; |
| 79 ; return order; |
| 80 ; } |
; ----------------------------------------------------------------------
; FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
;
; 32-bit x86 (MMX + CMOV + x87) implementation of the C reference routine
; quoted in the comment block above.
;
; In (read from the caller's stack; offsets valid after the prologue below,
; i.e. four pushes plus a 16-byte local area):
;   [esp + 36]  data                      pointer to the first sample
;   [esp + 40]  data_len                  number of samples
;   [esp + 44]  residual_bits_per_sample  pointer to five 32-bit floats
; Out:
;   eax = selected order (0..4)
;   residual_bits_per_sample[0..4] written
; Memory read: data[-4] .. data[data_len - 1] when data_len != 0, so four
;   valid samples must precede data[0]. Nothing is read from data when
;   data_len == 0.
; Registers: ebp, ebx, esi, edi are saved and restored; eax, ecx, edx,
;   mm0-mm7 and the x87 register stack are used as scratch. emms is
;   executed after the MMX loop and before any x87 instruction. The x87
;   stack is left at the depth it had on entry.
; Arithmetic: the five error totals are accumulated with paddd, i.e. as
;   32-bit sums that wrap on overflow (the C reference uses FLAC__uint32).
; ----------------------------------------------------------------------
| 81 ALIGN 16 |
| 82 cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov |
| 83 |
| 84 ; esp + 36 == data[] |
| 85 ; esp + 40 == data_len |
| 86 ; esp + 44 == residual_bits_per_sample[] |
| 87 |
| 88 push ebp |
| 89 push ebx |
| 90 push esi |
| 91 push edi |
| 92 sub esp, byte 16 |
| 93 ; qword [esp] == temp space for loading FLAC__uint64s to FPU regs |
| 94 |
| 95 ; ebx == &data[i] |
| 96 ; ecx == loop counter (i) |
| 97 ; ebp == order |
| 98 ; mm0 == total_error_1:total_error_0 |
| 99 ; mm1 == total_error_2:total_error_3 |
| 100 ; mm2 == :total_error_4 |
| 101 ; mm3 == last_error_1:last_error_0 |
| 102 ; mm4 == last_error_2:last_error_3 |
| 103 |
; Register pairs above are written high_dword:low_dword.
; data_len == 0 is handled separately so that data[-4..-1] is never touched
; and no division by zero can occur in the x87 code.
| 104 mov ecx, [esp + 40] ; ecx = data_len |
| 105 test ecx, ecx |
| 106 jz near .data_len_is_0 |
| 107 |
; Seed last_error_0..3 from the four samples that precede data[0], then
; clear the three accumulator registers.
| 108 mov ebx, [esp + 36] ; ebx = data[] |
| 109 movd mm3, [ebx - 4] ; mm3 = 0:last_error_0 |
| 110 movd mm2, [ebx - 8] ; mm2 = 0:data[-2] |
| 111 movd mm1, [ebx - 12] ; mm1 = 0:data[-3] |
| 112 movd mm0, [ebx - 16] ; mm0 = 0:data[-4] |
| 113 movq mm5, mm3 ; mm5 = 0:last_error_0 |
| 114 psubd mm5, mm2 ; mm5 = 0:last_error_1 |
| 115 punpckldq mm3, mm5 ; mm3 = last_error_1:last_error_
0 |
| 116 psubd mm2, mm1 ; mm2 = 0:data[-2] - data[-3] |
| 117 psubd mm5, mm2 ; mm5 = 0:last_error_2 |
| 118 movq mm4, mm5 ; mm4 = 0:last_error_2 |
| 119 psubd mm4, mm2 ; mm4 = 0:last_error_2 - (data[-
2] - data[-3]) |
| 120 paddd mm4, mm1 ; mm4 = 0:last_error_2 - (data[-
2] - 2 * data[-3]) |
| 121 psubd mm4, mm0 ; mm4 = 0:last_error_3 |
| 122 punpckldq mm4, mm5 ; mm4 = last_error_2:last_error_
3 |
| 123 pxor mm0, mm0 ; mm0 = total_error_1:total_erro
r_0 |
| 124 pxor mm1, mm1 ; mm1 = total_error_2:total_erro
r_3 |
| 125 pxor mm2, mm2 ; mm2 = 0:total_error_4 |
| 126 |
; Main loop: one sample per iteration, ecx counts down from data_len to 0.
; Each pass forms error_0..error_4 for data[i], adds their absolute values
; to the running totals and stores error_0..error_3 as the new
; last_error_0..last_error_3. Lanes whose comment is blank after/before the
; colon hold values that are not used.
| 127 ALIGN 16 |
| 128 .loop: |
| 129 movd mm7, [ebx] ; mm7 = 0:error_0 |
| 130 add ebx, byte 4 |
| 131 movq mm6, mm7 ; mm6 = 0:error_0 |
| 132 psubd mm7, mm3 ; mm7 = :error_1 |
| 133 punpckldq mm6, mm7 ; mm6 = error_1:error_0 |
| 134 movq mm5, mm6 ; mm5 = error_1:error_0 |
| 135 movq mm7, mm6 ; mm7 = error_1:error_0 |
| 136 psubd mm5, mm3 ; mm5 = error_2: |
| 137 movq mm3, mm6 ; mm3 = error_1:error_0 |
; Branch-free abs on both lanes: mm6 becomes the per-lane sign mask
; (0 or -1), and (x xor mask) - mask yields |x|.
| 138 psrad mm6, 31 |
| 139 pxor mm7, mm6 |
| 140 psubd mm7, mm6 ; mm7 = abs(error_1):abs(error_0
) |
| 141 paddd mm0, mm7 ; mm0 = total_error_1:total_erro
r_0 |
| 142 movq mm6, mm5 ; mm6 = error_2: |
| 143 psubd mm5, mm4 ; mm5 = error_3: |
| 144 punpckhdq mm5, mm6 ; mm5 = error_2:error_3 |
| 145 movq mm7, mm5 ; mm7 = error_2:error_3 |
| 146 movq mm6, mm5 ; mm6 = error_2:error_3 |
| 147 psubd mm5, mm4 ; mm5 = :error_4 |
| 148 movq mm4, mm6 ; mm4 = error_2:error_3 |
; Same sign-mask abs sequence as above, for error_2/error_3.
| 149 psrad mm6, 31 |
| 150 pxor mm7, mm6 |
| 151 psubd mm7, mm6 ; mm7 = abs(error_2):abs(error_3
) |
| 152 paddd mm1, mm7 ; mm1 = total_error_2:total_erro
r_3 |
; And once more for error_4 (only the low lane is meaningful).
| 153 movq mm6, mm5 ; mm6 = :error_4 |
| 154 psrad mm5, 31 |
| 155 pxor mm6, mm5 |
| 156 psubd mm6, mm5 ; mm6 = :abs(error_4) |
| 157 paddd mm2, mm6 ; mm2 = :total_error_4 |
| 158 |
| 159 dec ecx |
| 160 jnz short .loop |
| 161 |
| 162 ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_erro
r_3), total_error_4)) |
| 163 ; order = 0; |
| 164 ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_err
or_4)) |
| 165 ; order = 1; |
| 166 ; else if(total_error_2 < min(total_error_3, total_error_4)) |
| 167 ; order = 2; |
| 168 ; else if(total_error_3 < total_error_4) |
| 169 ; order = 3; |
| 170 ; else |
| 171 ; order = 4; |
; Move the five totals from the MMX registers into general registers:
; eax/ecx/edx/esi/edi = total_error_0/1/2/3/4. ecx is free here because the
; loop counter has reached zero.
| 172 movq mm3, mm0 ; mm3 = total_error_1:total_erro
r_0 |
| 173 movd edi, mm2 ; edi = total_error_4 |
| 174 movd esi, mm1 ; esi = total_error_3 |
| 175 movd eax, mm0 ; eax = total_error_0 |
| 176 punpckhdq mm1, mm1 ; mm1 = total_error_2:total_erro
r_2 |
| 177 punpckhdq mm3, mm3 ; mm3 = total_error_1:total_erro
r_1 |
| 178 movd edx, mm1 ; edx = total_error_2 |
| 179 movd ecx, mm3 ; ecx = total_error_1 |
| 180 |
; Branch-free order selection. ebx steps through the candidate order
; (1, 2, 3, 4), eax holds the running minimum and ebp the best order so far
; (initially 0). CMOVcc does not modify the flags, so each cmovb/cmovbe pair
; tests the same cmp result. The comparisons are unsigned, and because the
; order is updated on "below or equal", a tie goes to the higher order -
; the same outcome as the strict "<" tests in the C reference.
| 181 xor ebx, ebx |
| 182 xor ebp, ebp |
| 183 inc ebx |
| 184 cmp ecx, eax |
| 185 cmovb eax, ecx ; eax = min(total_error_0, total
_error_1) |
| 186 cmovbe ebp, ebx |
| 187 inc ebx |
| 188 cmp edx, eax |
| 189 cmovb eax, edx ; eax = min(total_error_0, total
_error_1, total_error_2) |
| 190 cmovbe ebp, ebx |
| 191 inc ebx |
| 192 cmp esi, eax |
| 193 cmovb eax, esi ; eax = min(total_error_0, total
_error_1, total_error_2, total_error_3) |
| 194 cmovbe ebp, ebx |
| 195 inc ebx |
| 196 cmp edi, eax |
| 197 cmovb eax, edi ; eax = min(total_error_0, total
_error_1, total_error_2, total_error_3, total_error_4) |
| 198 cmovbe ebp, ebx |
; eax no longer holds total_error_0 (it became the minimum), so fetch it
; again from mm0 before emms hands the register file back to the x87 unit.
| 199 movd ebx, mm0 ; ebx = total_error_0 |
| 200 emms |
| 201 |
| 202 ; residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && tot
al_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_l
en) / M_LN2 : 0.0); |
| 203 ; residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && tot
al_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_l
en) / M_LN2 : 0.0); |
| 204 ; residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && tot
al_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_l
en) / M_LN2 : 0.0); |
| 205 ; residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && tot
al_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_l
en) / M_LN2 : 0.0); |
| 206 ; residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && tot
al_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_l
en) / M_LN2 : 0.0); |
; From here on eax stays 0 and serves two purposes: it is the high dword of
; the 64-bit temporary at [esp] (so the 32-bit total is loaded by fild as a
; non-negative value), and it is the bit pattern stored for a 0.0 result.
; data_len is pushed once and stays at the bottom of the x87 stack until
; .rbps_end. The five blocks below are identical apart from the source
; register and the destination offset; each leaves only data_len on the
; x87 stack. fyl2x computes ST1 * log2(ST0) and pops, which is why 1.0 is
; loaded first.
| 207 xor eax, eax |
| 208 fild dword [esp + 40] ; ST = data_len (NOTE: assumes d
ata_len is <2gigs) |
| 209 .rbps_0: |
| 210 test ebx, ebx |
| 211 jz .total_error_0_is_0 |
| 212 fld1 ; ST = 1.0 data_len |
| 213 mov [esp], ebx |
| 214 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_er
ror_0 |
; total_error_0 has been spilled, so ebx can now take the output pointer;
; it keeps that role for the remaining blocks.
| 215 mov ebx, [esp + 44] |
| 216 fild qword [esp] ; ST = total_error_0 1.0 data_le
n |
| 217 fdiv st2 ; ST = total_error_0/data_len 1.
0 data_len |
| 218 fldln2 ; ST = ln2 total_error_0/data_le
n 1.0 data_len |
| 219 fmulp st1 ; ST = ln2*total_error_0/data_le
n 1.0 data_len |
| 220 fyl2x ; ST = log2(ln2*total_error_0/da
ta_len) data_len |
| 221 fstp dword [ebx] ; residual_bits_per_sample[0] =
log2(ln2*total_error_0/data_len) ST = data_len |
| 222 jmp short .rbps_1 |
| 223 .total_error_0_is_0: |
| 224 mov ebx, [esp + 44] |
| 225 mov [ebx], eax ; residual_bits_per_sample[0] =
0.0 |
| 226 .rbps_1: |
| 227 test ecx, ecx |
| 228 jz .total_error_1_is_0 |
| 229 fld1 ; ST = 1.0 data_len |
| 230 mov [esp], ecx |
| 231 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_er
ror_1 |
| 232 fild qword [esp] ; ST = total_error_1 1.0 data_le
n |
| 233 fdiv st2 ; ST = total_error_1/data_len 1.
0 data_len |
| 234 fldln2 ; ST = ln2 total_error_1/data_le
n 1.0 data_len |
| 235 fmulp st1 ; ST = ln2*total_error_1/data_le
n 1.0 data_len |
| 236 fyl2x ; ST = log2(ln2*total_error_1/da
ta_len) data_len |
| 237 fstp dword [ebx + 4] ; residual_bits_per_sample[1] =
log2(ln2*total_error_1/data_len) ST = data_len |
| 238 jmp short .rbps_2 |
| 239 .total_error_1_is_0: |
| 240 mov [ebx + 4], eax ; residual_bits_per_sample[1] =
0.0 |
| 241 .rbps_2: |
| 242 test edx, edx |
| 243 jz .total_error_2_is_0 |
| 244 fld1 ; ST = 1.0 data_len |
| 245 mov [esp], edx |
| 246 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_er
ror_2 |
| 247 fild qword [esp] ; ST = total_error_2 1.0 data_le
n |
| 248 fdiv st2 ; ST = total_error_2/data_len 1.
0 data_len |
| 249 fldln2 ; ST = ln2 total_error_2/data_le
n 1.0 data_len |
| 250 fmulp st1 ; ST = ln2*total_error_2/data_le
n 1.0 data_len |
| 251 fyl2x ; ST = log2(ln2*total_error_2/da
ta_len) data_len |
| 252 fstp dword [ebx + 8] ; residual_bits_per_sample[2] =
log2(ln2*total_error_2/data_len) ST = data_len |
| 253 jmp short .rbps_3 |
| 254 .total_error_2_is_0: |
| 255 mov [ebx + 8], eax ; residual_bits_per_sample[2] =
0.0 |
| 256 .rbps_3: |
| 257 test esi, esi |
| 258 jz .total_error_3_is_0 |
| 259 fld1 ; ST = 1.0 data_len |
| 260 mov [esp], esi |
| 261 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_er
ror_3 |
| 262 fild qword [esp] ; ST = total_error_3 1.0 data_le
n |
| 263 fdiv st2 ; ST = total_error_3/data_len 1.
0 data_len |
| 264 fldln2 ; ST = ln2 total_error_3/data_le
n 1.0 data_len |
| 265 fmulp st1 ; ST = ln2*total_error_3/data_le
n 1.0 data_len |
| 266 fyl2x ; ST = log2(ln2*total_error_3/da
ta_len) data_len |
| 267 fstp dword [ebx + 12] ; residual_bits_per_sample[3] =
log2(ln2*total_error_3/data_len) ST = data_len |
| 268 jmp short .rbps_4 |
| 269 .total_error_3_is_0: |
| 270 mov [ebx + 12], eax ; residual_bits_per_sample[3] =
0.0 |
| 271 .rbps_4: |
| 272 test edi, edi |
| 273 jz .total_error_4_is_0 |
| 274 fld1 ; ST = 1.0 data_len |
| 275 mov [esp], edi |
| 276 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_er
ror_4 |
| 277 fild qword [esp] ; ST = total_error_4 1.0 data_le
n |
| 278 fdiv st2 ; ST = total_error_4/data_len 1.
0 data_len |
| 279 fldln2 ; ST = ln2 total_error_4/data_le
n 1.0 data_len |
| 280 fmulp st1 ; ST = ln2*total_error_4/data_le
n 1.0 data_len |
| 281 fyl2x ; ST = log2(ln2*total_error_4/da
ta_len) data_len |
| 282 fstp dword [ebx + 16] ; residual_bits_per_sample[4] =
log2(ln2*total_error_4/data_len) ST = data_len |
| 283 jmp short .rbps_end |
| 284 .total_error_4_is_0: |
| 285 mov [ebx + 16], eax ; residual_bits_per_sample[4] =
0.0 |
| 286 .rbps_end: |
; Discard the data_len value pushed before .rbps_0 so the x87 stack is
; balanced on return.
| 287 fstp st0 ; ST = [empty] |
| 288 jmp short .end |
| 289 .data_len_is_0: |
| 290 ; data_len == 0, so residual_bits_per_sample[*] = 0.0 |
; ebp = 0 doubles as the all-zero float bit pattern for the five stores and
; is then bumped to 4, the order the C reference selects when every total
; is zero. No MMX or x87 instruction has run on this path.
| 291 xor ebp, ebp |
| 292 mov edi, [esp + 44] |
| 293 mov [edi], ebp |
| 294 mov [edi + 4], ebp |
| 295 mov [edi + 8], ebp |
| 296 mov [edi + 12], ebp |
| 297 mov [edi + 16], ebp |
| 298 add ebp, byte 4 ; order = 4 |
| 299 |
; Common epilogue: release the 16-byte local area and restore the saved
; registers in the reverse of the push order.
| 300 .end: |
| 301 mov eax, ebp ; return order |
| 302 add esp, byte 16 |
| 303 pop edi |
| 304 pop esi |
| 305 pop ebx |
| 306 pop ebp |
| 307 ret |
| 308 |
| 309 ; end |
OLD | NEW |