OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * MIPS DSPr2 optimizations for libjpeg-turbo |
| 3 * |
| 4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. |
| 5 * All rights reserved. |
| 6 * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com) |
| 7 * Darko Laus (darko.laus@imgtec.com) |
| 8 * Copyright (C) 2015, D. R. Commander. All Rights Reserved. |
| 9 * This software is provided 'as-is', without any express or implied |
| 10 * warranty. In no event will the authors be held liable for any damages |
| 11 * arising from the use of this software. |
| 12 * |
| 13 * Permission is granted to anyone to use this software for any purpose, |
| 14 * including commercial applications, and to alter it and redistribute it |
| 15 * freely, subject to the following restrictions: |
| 16 * |
| 17 * 1. The origin of this software must not be misrepresented; you must not |
| 18 * claim that you wrote the original software. If you use this software |
| 19 * in a product, an acknowledgment in the product documentation would be |
| 20 * appreciated but is not required. |
| 21 * 2. Altered source versions must be plainly marked as such, and must not be |
| 22 * misrepresented as being the original software. |
| 23 * 3. This notice may not be removed or altered from any source distribution. |
| 24 */ |
| 25 |
| 26 #include "jsimd_mips_dspr2_asm.h" |
| 27 |
| 28 /*****************************************************************************/ |
| 29 LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2) |
| 30 /* |
| 31 * a0 - cinfo->image_width |
| 32 * a1 - input_buf |
| 33 * a2 - output_buf |
| 34 * a3 - output_row |
| 35 * 16(sp) - num_rows |
| 36 * 20(sp) - cinfo->num_components |
| 37 * |
| 38 * Null conversion for compression |
| 39 */ |
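| /* |
|  * For reference: the null conversion simply de-interleaves the input, i.e. |
|  *   output_buf[ci][output_row][col] = inptr[col * num_components + ci] |
|  * for each component ci. The first code path handles an image_width that is |
|  * not a multiple of 4 (residual columns first, then 4 columns per iteration); |
|  * the path at label 4 is the multiple-of-4 case. |
|  */ |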
| 40 |
| 41 SAVE_REGS_ON_STACK 8, s0, s1 |
| 42 |
| 43 lw t9, 24(sp) // t9 = num_rows |
| 44 lw s0, 28(sp) // s0 = cinfo->num_components |
| 45 andi t0, a0, 3 // t0 = cinfo->image_width & 3 |
| 46 beqz t0, 4f // no residual |
| 47 nop |
| 48 0: |
| 49 addiu t9, t9, -1 |
| 50 bltz t9, 7f |
| 51 li t1, 0 |
| 52 1: |
| 53 sll t3, t1, 2 |
| 54 lwx t5, t3(a2) // t5 = outptr = output_buf[ci] |
| 55 lw t2, 0(a1) // t2 = inptr = *input_buf |
| 56 sll t4, a3, 2 |
| 57 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] |
| 58 addu t2, t2, t1 |
| 59 addu s1, t5, a0 |
| 60 addu t6, t5, t0 |
| 61 2: |
| 62 lbu t3, 0(t2) |
| 63 addiu t5, t5, 1 |
| 64 sb t3, -1(t5) |
| 65 bne t6, t5, 2b |
| 66 addu t2, t2, s0 |
| 67 3: |
| 68 lbu t3, 0(t2) |
| 69 addu t4, t2, s0 |
| 70 addu t7, t4, s0 |
| 71 addu t8, t7, s0 |
| 72 addu t2, t8, s0 |
| 73 lbu t4, 0(t4) |
| 74 lbu t7, 0(t7) |
| 75 lbu t8, 0(t8) |
| 76 addiu t5, t5, 4 |
| 77 sb t3, -4(t5) |
| 78 sb t4, -3(t5) |
| 79 sb t7, -2(t5) |
| 80 bne s1, t5, 3b |
| 81 sb t8, -1(t5) |
| 82 addiu t1, t1, 1 |
| 83 bne t1, s0, 1b |
| 84 nop |
| 85 addiu a1, a1, 4 |
| 86 bgez t9, 0b |
| 87 addiu a3, a3, 1 |
| 88 b 7f |
| 89 nop |
| 90 4: |
| 91 addiu t9, t9, -1 |
| 92 bltz t9, 7f |
| 93 li t1, 0 |
| 94 5: |
| 95 sll t3, t1, 2 |
| 96 lwx t5, t3(a2) // t5 = outptr = output_buf[ci] |
| 97 lw t2, 0(a1) // t2 = inptr = *input_buf |
| 98 sll t4, a3, 2 |
| 99 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row] |
| 100 addu t2, t2, t1 |
| 101 addu s1, t5, a0 |
| 102 addu t6, t5, t0 |
| 103 6: |
| 104 lbu t3, 0(t2) |
| 105 addu t4, t2, s0 |
| 106 addu t7, t4, s0 |
| 107 addu t8, t7, s0 |
| 108 addu t2, t8, s0 |
| 109 lbu t4, 0(t4) |
| 110 lbu t7, 0(t7) |
| 111 lbu t8, 0(t8) |
| 112 addiu t5, t5, 4 |
| 113 sb t3, -4(t5) |
| 114 sb t4, -3(t5) |
| 115 sb t7, -2(t5) |
| 116 bne s1, t5, 6b |
| 117 sb t8, -1(t5) |
| 118 addiu t1, t1, 1 |
| 119 bne t1, s0, 5b |
| 120 nop |
| 121 addiu a1, a1, 4 |
| 122 bgez t9, 4b |
| 123 addiu a3, a3, 1 |
| 124 7: |
| 125 RESTORE_REGS_FROM_STACK 8, s0, s1 |
| 126 |
| 127 j ra |
| 128 nop |
| 129 |
| 130 END(jsimd_c_null_convert_mips_dspr2) |
| 131 |
| 132 /*****************************************************************************/ |
| 133 /* |
| 134 * jsimd_extrgb_ycc_convert_mips_dspr2 |
| 135 * jsimd_extbgr_ycc_convert_mips_dspr2 |
| 136 * jsimd_extrgbx_ycc_convert_mips_dspr2 |
| 137 * jsimd_extbgrx_ycc_convert_mips_dspr2 |
| 138 * jsimd_extxbgr_ycc_convert_mips_dspr2 |
| 139 * jsimd_extxrgb_ycc_convert_mips_dspr2 |
| 140 * |
| 141 * Colorspace conversion RGB -> YCbCr |
| 142 */ |
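| /* |
|  * For reference, the fixed-point code below implements libjpeg's standard |
|  * RGB -> YCbCr equations with SCALEBITS = 16, i.e. FIX(x) = x * 65536: |
|  *   Y  =  0.29900*R + 0.58700*G + 0.11400*B |
|  *   Cb = -0.16874*R - 0.33126*G + 0.50000*B + CENTERJSAMPLE |
|  *   Cr =  0.50000*R - 0.41869*G - 0.08131*B + CENTERJSAMPLE |
|  * Each accumulator is pre-loaded with the rounding constant (and, for Cb/Cr, |
|  * the +128 offset) and descaled with extr.w ..., 16. |
|  */ |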
| 143 |
| 144 .macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs |
| 145 |
| 146 .macro DO_RGB_TO_YCC r, \ |
| 147 g, \ |
| 148 b, \ |
| 149 inptr |
| 150 lbu \r, \r_offs(\inptr) |
| 151 lbu \g, \g_offs(\inptr) |
| 152 lbu \b, \b_offs(\inptr) |
| 153 addiu \inptr, \pixel_size |
| 154 .endm |
| 155 |
| 156 LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2) |
| 157 /* |
| 158 * a0 - cinfo->image_width |
| 159 * a1 - input_buf |
| 160 * a2 - output_buf |
| 161 * a3 - output_row |
| 162 * 16(sp) - num_rows |
| 163 */ |
| 164 |
| 165 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 166 |
| 167 lw t7, 48(sp) // t7 = num_rows |
| 168 li s0, 0x4c8b // FIX(0.29900) |
| 169 li s1, 0x9646 // FIX(0.58700) |
| 170 li s2, 0x1d2f // FIX(0.11400) |
| 171 li s3, 0xffffd4cd // -FIX(0.16874) |
| 172 li s4, 0xffffab33 // -FIX(0.33126) |
| 173 li s5, 0x8000 // FIX(0.50000) |
| 174 li s6, 0xffff94d1 // -FIX(0.41869) |
| 175 li s7, 0xffffeb2f // -FIX(0.08131) |
| 176 li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1 |
| 177 |
| 178 0: |
| 179 addiu t7, -1 // --num_rows |
| 180 lw t6, 0(a1) // t6 = input_buf[0] |
| 181 lw t0, 0(a2) |
| 182 lw t1, 4(a2) |
| 183 lw t2, 8(a2) |
| 184 sll t3, a3, 2 |
| 185 lwx t0, t3(t0) // t0 = output_buf[0][output_row] |
| 186 lwx t1, t3(t1) // t1 = output_buf[1][output_row] |
| 187 lwx t2, t3(t2) // t2 = output_buf[2][output_row] |
| 188 |
| 189 addu t9, t2, a0 // t9 = end address |
| 190 addiu a3, 1 |
| 191 |
| 192 1: |
| 193 DO_RGB_TO_YCC t3, t4, t5, t6 |
| 194 |
| 195 mtlo s5, $ac0 |
| 196 mtlo t8, $ac1 |
| 197 mtlo t8, $ac2 |
| 198 maddu $ac0, s2, t5 |
| 199 maddu $ac1, s5, t5 |
| 200 maddu $ac2, s5, t3 |
| 201 maddu $ac0, s0, t3 |
| 202 maddu $ac1, s3, t3 |
| 203 maddu $ac2, s6, t4 |
| 204 maddu $ac0, s1, t4 |
| 205 maddu $ac1, s4, t4 |
| 206 maddu $ac2, s7, t5 |
| 207 extr.w t3, $ac0, 16 |
| 208 extr.w t4, $ac1, 16 |
| 209 extr.w t5, $ac2, 16 |
| 210 sb t3, 0(t0) |
| 211 sb t4, 0(t1) |
| 212 sb t5, 0(t2) |
| 213 addiu t0, 1 |
| 214 addiu t2, 1 |
| 215 bne t2, t9, 1b |
| 216 addiu t1, 1 |
| 217 bgtz t7, 0b |
| 218 addiu a1, 4 |
| 219 |
| 220 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 221 |
| 222 j ra |
| 223 nop |
| 224 END(jsimd_\colorid\()_ycc_convert_mips_dspr2) |
| 225 |
| 226 .purgem DO_RGB_TO_YCC |
| 227 |
| 228 .endm |
| 229 |
| 230 /*------------------------------------------id -- pix R G B */ |
| 231 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 |
| 232 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 |
| 233 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 |
| 234 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 |
| 235 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 |
| 236 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 |
| 237 |
| 238 /*****************************************************************************/ |
| 239 /* |
| 240 * jsimd_ycc_extrgb_convert_mips_dspr2 |
| 241 * jsimd_ycc_extbgr_convert_mips_dspr2 |
| 242 * jsimd_ycc_extrgbx_convert_mips_dspr2 |
| 243 * jsimd_ycc_extbgrx_convert_mips_dspr2 |
| 244 * jsimd_ycc_extxbgr_convert_mips_dspr2 |
| 245 * jsimd_ycc_extxrgb_convert_mips_dspr2 |
| 246 * |
| 247 * Colorspace conversion YCbCr -> RGB |
| 248 */ |
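| /* |
|  * For reference, the code below implements libjpeg's YCbCr -> RGB equations |
|  * with SCALEBITS = 16 fixed-point constants: |
|  *   R = Y + 1.40200 * (Cr - CENTERJSAMPLE) |
|  *   G = Y - 0.34414 * (Cb - CENTERJSAMPLE) - 0.71414 * (Cr - CENTERJSAMPLE) |
|  *   B = Y + 1.77200 * (Cb - CENTERJSAMPLE) |
|  * R and G are clamped to [0, 255] as a packed pair using the saturating |
|  * shll_s.ph/shra.ph sequence; B is clamped separately with shll_s.w/sra. |
|  */ |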
| 249 |
| 250 .macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs |
| 251 |
| 252 .macro STORE_YCC_TO_RGB scratch0 \ |
| 253 scratch1 \ |
| 254 scratch2 \ |
| 255 outptr |
| 256 sb \scratch0, \r_offs(\outptr) |
| 257 sb \scratch1, \g_offs(\outptr) |
| 258 sb \scratch2, \b_offs(\outptr) |
| 259 .if (\pixel_size == 4) |
| 260 li t0, 0xFF |
| 261 sb t0, \a_offs(\outptr) |
| 262 .endif |
| 263 addiu \outptr, \pixel_size |
| 264 .endm |
| 265 |
| 266 LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2) |
| 267 /* |
| 268 * a0 - cinfo->image_width |
| 269 * a1 - input_buf |
| 270 * a2 - input_row |
| 271 * a3 - output_buf |
| 272 * 16(sp) - num_rows |
| 273 */ |
| 274 |
| 275 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 276 |
| 277 lw s1, 48(sp) |
| 278 li t3, 0x8000 |
| 279 li t4, 0x166e9 // FIX(1.40200) |
| 280 li t5, 0x1c5a2 // FIX(1.77200) |
| 281 li t6, 0xffff492e // -FIX(0.71414) |
| 282 li t7, 0xffffa7e6 // -FIX(0.34414) |
| 283 repl.ph t8, 128 |
| 284 |
| 285 0: |
| 286 lw s0, 0(a3) |
| 287 lw t0, 0(a1) |
| 288 lw t1, 4(a1) |
| 289 lw t2, 8(a1) |
| 290 sll s5, a2, 2 |
| 291 addiu s1, -1 |
| 292 lwx s2, s5(t0) |
| 293 lwx s3, s5(t1) |
| 294 lwx s4, s5(t2) |
| 295 addu t9, s2, a0 |
| 296 addiu a2, 1 |
| 297 |
| 298 1: |
| 299 lbu s7, 0(s4) // cr |
| 300 lbu s6, 0(s3) // cb |
| 301 lbu s5, 0(s2) // y |
| 302 addiu s2, 1 |
| 303 addiu s4, 1 |
| 304 addiu s7, -128 |
| 305 addiu s6, -128 |
| 306 mul t2, t7, s6 |
| 307 mul t0, t6, s7 // Crgtab[cr] |
| 308 sll s7, 15 |
| 309 mulq_rs.w t1, t4, s7 // Crrtab[cr] |
| 310 sll s6, 15 |
| 311 addu t2, t3 // Cbgtab[cb] |
| 312 addu t2, t0 |
| 313 |
| 314 mulq_rs.w t0, t5, s6 // Cbbtab[cb] |
| 315 sra t2, 16 |
| 316 addu t1, s5 |
| 317 addu t2, s5 // add y |
| 318 ins t2, t1, 16, 16 |
| 319 subu.ph t2, t2, t8 |
| 320 addu t0, s5 |
| 321 shll_s.ph t2, t2, 8 |
| 322 subu t0, 128 |
| 323 shra.ph t2, t2, 8 |
| 324 shll_s.w t0, t0, 24 |
| 325 addu.ph t2, t2, t8 // clip & store |
| 326 sra t0, t0, 24 |
| 327 sra t1, t2, 16 |
| 328 addiu t0, 128 |
| 329 |
| 330 STORE_YCC_TO_RGB t1, t2, t0, s0 |
| 331 |
| 332 bne s2, t9, 1b |
| 333 addiu s3, 1 |
| 334 bgtz s1, 0b |
| 335 addiu a3, 4 |
| 336 |
| 337 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 338 |
| 339 j ra |
| 340 nop |
| 341 END(jsimd_ycc_\colorid\()_convert_mips_dspr2) |
| 342 |
| 343 .purgem STORE_YCC_TO_RGB |
| 344 |
| 345 .endm |
| 346 |
| 347 /*------------------------------------------id -- pix R G B A */ |
| 348 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3 |
| 349 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3 |
| 350 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3 |
| 351 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3 |
| 352 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0 |
| 353 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0 |
| 354 |
| 355 /*****************************************************************************/ |
| 356 /* |
| 357 * jsimd_extrgb_gray_convert_mips_dspr2 |
| 358 * jsimd_extbgr_gray_convert_mips_dspr2 |
| 359 * jsimd_extrgbx_gray_convert_mips_dspr2 |
| 360 * jsimd_extbgrx_gray_convert_mips_dspr2 |
| 361 * jsimd_extxbgr_gray_convert_mips_dspr2 |
| 362 * jsimd_extxrgb_gray_convert_mips_dspr2 |
| 363 * |
| 364 * Colorspace conversion RGB -> GRAY |
| 365 */ |
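| /* |
|  * For reference, grayscale output is just the Y term of the RGB -> YCbCr |
|  * conversion above: |
|  *   Y = 0.29900*R + 0.58700*G + 0.11400*B    (FIX(x) = x * 65536) |
|  * The main loop computes four output pixels per iteration on the two DSP |
|  * accumulators; the loop at label 3 handles the remaining (image_width & 3) |
|  * pixels one at a time. |
|  */ |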
| 366 |
| 367 .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs |
| 368 |
| 369 .macro DO_RGB_TO_GRAY r, \ |
| 370 g, \ |
| 371 b, \ |
| 372 inptr |
| 373 lbu \r, \r_offs(\inptr) |
| 374 lbu \g, \g_offs(\inptr) |
| 375 lbu \b, \b_offs(\inptr) |
| 376 addiu \inptr, \pixel_size |
| 377 .endm |
| 378 |
| 379 LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2) |
| 380 /* |
| 381 * a0 - cinfo->image_width |
| 382 * a1 - input_buf |
| 383 * a2 - output_buf |
| 384 * a3 - output_row |
| 385 * 16(sp) - num_rows |
| 386 */ |
| 387 |
| 388 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 389 |
| 390 li s0, 0x4c8b // s0 = FIX(0.29900) |
| 391 li s1, 0x9646 // s1 = FIX(0.58700) |
| 392 li s2, 0x1d2f // s2 = FIX(0.11400) |
| 393 li s7, 0x8000 // s7 = FIX(0.50000) |
| 394 lw s6, 48(sp) |
| 395 andi t7, a0, 3 |
| 396 |
| 397 0: |
| 398 addiu s6, -1 // s6 = num_rows |
| 399 lw t0, 0(a1) |
| 400 lw t1, 0(a2) |
| 401 sll t3, a3, 2 |
| 402 lwx t1, t3(t1) |
| 403 addiu a3, 1 |
| 404 addu t9, t1, a0 |
| 405 subu t8, t9, t7 |
| 406 beq t1, t8, 2f |
| 407 nop |
| 408 |
| 409 1: |
| 410 DO_RGB_TO_GRAY t3, t4, t5, t0 |
| 411 DO_RGB_TO_GRAY s3, s4, s5, t0 |
| 412 |
| 413 mtlo s7, $ac0 |
| 414 maddu $ac0, s2, t5 |
| 415 maddu $ac0, s1, t4 |
| 416 maddu $ac0, s0, t3 |
| 417 mtlo s7, $ac1 |
| 418 maddu $ac1, s2, s5 |
| 419 maddu $ac1, s1, s4 |
| 420 maddu $ac1, s0, s3 |
| 421 extr.w t6, $ac0, 16 |
| 422 |
| 423 DO_RGB_TO_GRAY t3, t4, t5, t0 |
| 424 DO_RGB_TO_GRAY s3, s4, s5, t0 |
| 425 |
| 426 mtlo s7, $ac0 |
| 427 maddu $ac0, s2, t5 |
| 428 maddu $ac0, s1, t4 |
| 429 extr.w t2, $ac1, 16 |
| 430 maddu $ac0, s0, t3 |
| 431 mtlo s7, $ac1 |
| 432 maddu $ac1, s2, s5 |
| 433 maddu $ac1, s1, s4 |
| 434 maddu $ac1, s0, s3 |
| 435 extr.w t5, $ac0, 16 |
| 436 sb t6, 0(t1) |
| 437 sb t2, 1(t1) |
| 438 extr.w t3, $ac1, 16 |
| 439 addiu t1, 4 |
| 440 sb t5, -2(t1) |
| 441 sb t3, -1(t1) |
| 442 bne t1, t8, 1b |
| 443 nop |
| 444 |
| 445 2: |
| 446 beqz t7, 4f |
| 447 nop |
| 448 |
| 449 3: |
| 450 DO_RGB_TO_GRAY t3, t4, t5, t0 |
| 451 |
| 452 mtlo s7, $ac0 |
| 453 maddu $ac0, s2, t5 |
| 454 maddu $ac0, s1, t4 |
| 455 maddu $ac0, s0, t3 |
| 456 extr.w t6, $ac0, 16 |
| 457 sb t6, 0(t1) |
| 458 addiu t1, 1 |
| 459 bne t1, t9, 3b |
| 460 nop |
| 461 |
| 462 4: |
| 463 bgtz s6, 0b |
| 464 addiu a1, 4 |
| 465 |
| 466 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 467 |
| 468 j ra |
| 469 nop |
| 470 END(jsimd_\colorid\()_gray_convert_mips_dspr2) |
| 471 |
| 472 .purgem DO_RGB_TO_GRAY |
| 473 |
| 474 .endm |
| 475 |
| 476 /*------------------------------------------id -- pix R G B */ |
| 477 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2 |
| 478 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0 |
| 479 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2 |
| 480 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0 |
| 481 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1 |
| 482 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3 |
| 483 /*****************************************************************************/ |
| 484 /* |
| 485 * jsimd_h2v2_merged_upsample_mips_dspr2 |
| 486 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2 |
| 487 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2 |
| 488 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2 |
| 489 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2 |
| 490 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2 |
| 491 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2 |
| 492 * |
| 493 * Merged h2v2 upsample routines |
| 494 */ |
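| /* |
|  * For reference: h2v2 merged upsampling applies each (Cb, Cr) sample to a |
|  * 2x2 block of luma samples, so one pass over a chroma row produces two |
|  * output rows (output_buf[0] and output_buf[1]) from the two luma rows |
|  * input_buf[0][in_row_group_ctr*2] and input_buf[0][in_row_group_ctr*2 + 1]. |
|  * The chroma terms |
|  *   cred   =  1.40200 * (Cr - 128) |
|  *   cgreen = -0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) |
|  *   cblue  =  1.77200 * (Cb - 128) |
|  * are computed once per chroma sample and added to each of the four Y |
|  * values; results are clamped through the sample_range_limit table. |
|  */ |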
| 495 .macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ |
| 496 pixel_size, \ |
| 497 r1_offs, \ |
| 498 g1_offs, \ |
| 499 b1_offs, \ |
| 500 a1_offs, \ |
| 501 r2_offs, \ |
| 502 g2_offs, \ |
| 503 b2_offs, \ |
| 504 a2_offs |
| 505 |
| 506 .macro STORE_H2V2_2_PIXELS scratch0 \ |
| 507 scratch1 \ |
| 508 scratch2 \ |
| 509 scratch3 \ |
| 510 scratch4 \ |
| 511 scratch5 \ |
| 512 outptr |
| 513 sb \scratch0, \r1_offs(\outptr) |
| 514 sb \scratch1, \g1_offs(\outptr) |
| 515 sb \scratch2, \b1_offs(\outptr) |
| 516 sb \scratch3, \r2_offs(\outptr) |
| 517 sb \scratch4, \g2_offs(\outptr) |
| 518 sb \scratch5, \b2_offs(\outptr) |
| 519 .if (\pixel_size == 8) |
| 520 li \scratch0, 0xFF |
| 521 sb \scratch0, \a1_offs(\outptr) |
| 522 sb \scratch0, \a2_offs(\outptr) |
| 523 .endif |
| 524 addiu \outptr, \pixel_size |
| 525 .endm |
| 526 |
| 527 .macro STORE_H2V2_1_PIXEL scratch0 \ |
| 528 scratch1 \ |
| 529 scratch2 \ |
| 530 outptr |
| 531 sb \scratch0, \r1_offs(\outptr) |
| 532 sb \scratch1, \g1_offs(\outptr) |
| 533 sb \scratch2, \b1_offs(\outptr) |
| 534 |
| 535 .if (\pixel_size == 8) |
| 536 li t0, 0xFF |
| 537 sb t0, \a1_offs(\outptr) |
| 538 .endif |
| 539 .endm |
| 540 |
| 541 LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) |
| 542 /* |
| 543 * a0 - cinfo->output_width |
| 544 * a1 - input_buf |
| 545 * a2 - in_row_group_ctr |
| 546 * a3 - output_buf |
| 547 * 16(sp) - cinfo->sample_range_limit |
| 548 */ |
| 549 |
| 550 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra |
| 551 |
| 552 lw t9, 56(sp) // cinfo->sample_range_limit |
| 553 lw v0, 0(a1) |
| 554 lw v1, 4(a1) |
| 555 lw t0, 8(a1) |
| 556 sll t1, a2, 3 |
| 557 addiu t2, t1, 4 |
| 558 sll t3, a2, 2 |
| 559 lw t4, 0(a3) // t4 = output_buf[0] |
| 560 lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2] |
| 561 lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1] |
| 562 lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr] |
| 563 lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr] |
| 564 lw t7, 4(a3) // t7 = output_buf[1] |
| 565 li s1, 0xe6ea |
| 566 addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)] |
| 567 addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)] |
| 568 addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)] |
| 569 xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)] |
| 570 srl t3, a0, 1 |
| 571 blez t3, 2f |
| 572 addu t0, t5, t3 // t0 = end address |
| 573 1: |
| 574 lbu t3, 0(t5) |
| 575 lbu s3, 0(t6) |
| 576 addiu t5, t5, 1 |
| 577 addiu t3, t3, -128 // (cb - 128) |
| 578 addiu s3, s3, -128 // (cr - 128) |
| 579 mult $ac1, s1, t3 |
| 580 madd $ac1, s2, s3 |
| 581 sll s3, s3, 15 |
| 582 sll t3, t3, 15 |
| 583 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS |
| 584 extr_r.w s5, $ac1, 16 |
| 585 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS |
| 586 lbu v0, 0(t1) |
| 587 addiu t6, t6, 1 |
| 588 addiu t1, t1, 2 |
| 589 addu t3, v0, s4 // y+cred |
| 590 addu s3, v0, s5 // y+cgreen |
| 591 addu v1, v0, s6 // y+cblue |
| 592 addu t3, t9, t3 // y+cred |
| 593 addu s3, t9, s3 // y+cgreen |
| 594 addu v1, t9, v1 // y+cblue |
| 595 lbu AT, 0(t3) |
| 596 lbu s7, 0(s3) |
| 597 lbu ra, 0(v1) |
| 598 lbu v0, -1(t1) |
| 599 addu t3, v0, s4 // y+cred |
| 600 addu s3, v0, s5 // y+cgreen |
| 601 addu v1, v0, s6 // y+cblue |
| 602 addu t3, t9, t3 // y+cred |
| 603 addu s3, t9, s3 // y+cgreen |
| 604 addu v1, t9, v1 // y+cblue |
| 605 lbu t3, 0(t3) |
| 606 lbu s3, 0(s3) |
| 607 lbu v1, 0(v1) |
| 608 lbu v0, 0(t2) |
| 609 |
| 610 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 |
| 611 |
| 612 addu t3, v0, s4 // y+cred |
| 613 addu s3, v0, s5 // y+cgreen |
| 614 addu v1, v0, s6 // y+cblue |
| 615 addu t3, t9, t3 // y+cred |
| 616 addu s3, t9, s3 // y+cgreen |
| 617 addu v1, t9, v1 // y+cblue |
| 618 lbu AT, 0(t3) |
| 619 lbu s7, 0(s3) |
| 620 lbu ra, 0(v1) |
| 621 lbu v0, 1(t2) |
| 622 addiu t2, t2, 2 |
| 623 addu t3, v0, s4 // y+cred |
| 624 addu s3, v0, s5 // y+cgreen |
| 625 addu v1, v0, s6 // y+cblue |
| 626 addu t3, t9, t3 // y+cred |
| 627 addu s3, t9, s3 // y+cgreen |
| 628 addu v1, t9, v1 // y+cblue |
| 629 lbu t3, 0(t3) |
| 630 lbu s3, 0(s3) |
| 631 lbu v1, 0(v1) |
| 632 |
| 633 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 |
| 634 |
| 635 bne t0, t5, 1b |
| 636 nop |
| 637 2: |
| 638 andi t0, a0, 1 |
| 639 beqz t0, 4f |
| 640 lbu t3, 0(t5) |
| 641 lbu s3, 0(t6) |
| 642 addiu t3, t3, -128 // (cb - 128) |
| 643 addiu s3, s3, -128 // (cr - 128) |
| 644 mult $ac1, s1, t3 |
| 645 madd $ac1, s2, s3 |
| 646 sll s3, s3, 15 |
| 647 sll t3, t3, 15 |
| 648 lbu v0, 0(t1) |
| 649 extr_r.w s5, $ac1, 16 |
| 650 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS |
| 651 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS |
| 652 addu t3, v0, s4 // y+cred |
| 653 addu s3, v0, s5 // y+cgreen |
| 654 addu v1, v0, s6 // y+cblue |
| 655 addu t3, t9, t3 // y+cred |
| 656 addu s3, t9, s3 // y+cgreen |
| 657 addu v1, t9, v1 // y+cblue |
| 658 lbu t3, 0(t3) |
| 659 lbu s3, 0(s3) |
| 660 lbu v1, 0(v1) |
| 661 lbu v0, 0(t2) |
| 662 |
| 663 STORE_H2V2_1_PIXEL t3, s3, v1, t4 |
| 664 |
| 665 addu t3, v0, s4 // y+cred |
| 666 addu s3, v0, s5 // y+cgreen |
| 667 addu v1, v0, s6 // y+cblue |
| 668 addu t3, t9, t3 // y+cred |
| 669 addu s3, t9, s3 // y+cgreen |
| 670 addu v1, t9, v1 // y+cblue |
| 671 lbu t3, 0(t3) |
| 672 lbu s3, 0(s3) |
| 673 lbu v1, 0(v1) |
| 674 |
| 675 STORE_H2V2_1_PIXEL t3, s3, v1, t7 |
| 676 4: |
| 677 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra |
| 678 |
| 679 j ra |
| 680 nop |
| 681 |
| 682 END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2) |
| 683 |
| 684 .purgem STORE_H2V2_1_PIXEL |
| 685 .purgem STORE_H2V2_2_PIXELS |
| 686 .endm |
| 687 |
| 688 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ |
| 689 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 |
| 690 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 |
| 691 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 |
| 692 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 |
| 693 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 |
| 694 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 |
| 695 /*****************************************************************************/ |
| 696 /* |
| 697 * jsimd_h2v1_merged_upsample_mips_dspr2 |
| 698 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2 |
| 699 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2 |
| 700 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2 |
| 701 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2 |
| 702 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2 |
| 703 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2 |
| 704 * |
| 705 * Merged h2v1 upsample routines |
| 706 */ |
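| /* |
|  * For reference: h2v1 merged upsampling applies each (Cb, Cr) sample to two |
|  * horizontally adjacent luma samples, so each iteration of the main loop |
|  * emits two RGB pixels into a single output row. The chroma terms are |
|  * computed exactly as in the h2v2 routines above; an odd output_width leaves |
|  * one last pixel, handled by the code at label 3. |
|  */ |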
| 707 |
| 708 .macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \ |
| 709 pixel_size, \ |
| 710 r1_offs, \ |
| 711 g1_offs, \ |
| 712 b1_offs, \ |
| 713 a1_offs, \ |
| 714 r2_offs, \ |
| 715 g2_offs, \ |
| 716 b2_offs, \ |
| 717 a2_offs |
| 718 |
| 719 .macro STORE_H2V1_2_PIXELS scratch0 \ |
| 720 scratch1 \ |
| 721 scratch2 \ |
| 722 scratch3 \ |
| 723 scratch4 \ |
| 724 scratch5 \ |
| 725 outptr |
| 726 sb \scratch0, \r1_offs(\outptr) |
| 727 sb \scratch1, \g1_offs(\outptr) |
| 728 sb \scratch2, \b1_offs(\outptr) |
| 729 sb \scratch3, \r2_offs(\outptr) |
| 730 sb \scratch4, \g2_offs(\outptr) |
| 731 sb \scratch5, \b2_offs(\outptr) |
| 732 .if (\pixel_size == 8) |
| 733 li t0, 0xFF |
| 734 sb t0, \a1_offs(\outptr) |
| 735 sb t0, \a2_offs(\outptr) |
| 736 .endif |
| 737 addiu \outptr, \pixel_size |
| 738 .endm |
| 739 |
| 740 .macro STORE_H2V1_1_PIXEL scratch0 \ |
| 741 scratch1 \ |
| 742 scratch2 \ |
| 743 outptr |
| 744 sb \scratch0, \r1_offs(\outptr) |
| 745 sb \scratch1, \g1_offs(\outptr) |
| 746 sb \scratch2, \b1_offs(\outptr) |
| 747 .if (\pixel_size == 8) |
| 748 li t0, 0xFF |
| 749 sb t0, \a1_offs(\outptr) |
| 750 .endif |
| 751 .endm |
| 752 |
| 753 LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) |
| 754 /* |
| 755 * a0 - cinfo->output_width |
| 756 * a1 - input_buf |
| 757 * a2 - in_row_group_ctr |
| 758 * a3 - output_buf |
| 759 * 16(sp) - range_limit |
| 760 */ |
| 761 |
| 762 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra |
| 763 |
| 764 li t0, 0xe6ea |
| 765 lw t1, 0(a1) // t1 = input_buf[0] |
| 766 lw t2, 4(a1) // t2 = input_buf[1] |
| 767 lw t3, 8(a1) // t3 = input_buf[2] |
| 768 lw t8, 56(sp) // t8 = range_limit |
| 769 addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)] |
| 770 addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)] |
| 771 addiu s0, t0, 0x9916 // s0 = 0x8000 |
| 772 addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)] |
| 773 xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)] |
| 774 srl t0, a0, 1 |
| 775 sll t4, a2, 2 |
| 776 lwx s5, t4(t1) // s5 = inptr0 |
| 777 lwx s6, t4(t2) // s6 = inptr1 |
| 778 lwx s7, t4(t3) // s7 = inptr2 |
| 779 lw t7, 0(a3) // t7 = outptr |
| 780 blez t0, 2f |
| 781 addu t9, s6, t0 // t9 = end address |
| 782 1: |
| 783 lbu t2, 0(s6) // t2 = cb |
| 784 lbu t0, 0(s7) // t0 = cr |
| 785 lbu t1, 0(s5) // t1 = y |
| 786 addiu t2, t2, -128 // t2 = cb - 128 |
| 787 addiu t0, t0, -128 // t0 = cr - 128 |
| 788 mult $ac1, s4, t2 |
| 789 madd $ac1, s3, t0 |
| 790 sll t0, t0, 15 |
| 791 sll t2, t2, 15 |
| 792 mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS |
| 793 extr_r.w t5, $ac1, 16 |
| 794 mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS |
| 795 addiu s7, s7, 1 |
| 796 addiu s6, s6, 1 |
| 797 addu t2, t1, t0 // t2 = y + cred |
| 798 addu t3, t1, t5 // t3 = y + cgreen |
| 799 addu t4, t1, t6 // t4 = y + cblue |
| 800 addu t2, t8, t2 |
| 801 addu t3, t8, t3 |
| 802 addu t4, t8, t4 |
| 803 lbu t1, 1(s5) |
| 804 lbu v0, 0(t2) |
| 805 lbu v1, 0(t3) |
| 806 lbu ra, 0(t4) |
| 807 addu t2, t1, t0 |
| 808 addu t3, t1, t5 |
| 809 addu t4, t1, t6 |
| 810 addu t2, t8, t2 |
| 811 addu t3, t8, t3 |
| 812 addu t4, t8, t4 |
| 813 lbu t2, 0(t2) |
| 814 lbu t3, 0(t3) |
| 815 lbu t4, 0(t4) |
| 816 |
| 817 STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 |
| 818 |
| 819 bne t9, s6, 1b |
| 820 addiu s5, s5, 2 |
| 821 2: |
| 822 andi t0, a0, 1 |
| 823 beqz t0, 4f |
| 824 nop |
| 825 3: |
| 826 lbu t2, 0(s6) |
| 827 lbu t0, 0(s7) |
| 828 lbu t1, 0(s5) |
| 829 addiu t2, t2, -128 //(cb - 128) |
| 830 addiu t0, t0, -128 //(cr - 128) |
| 831 mul t3, s4, t2 |
| 832 mul t4, s3, t0 |
| 833 sll t0, t0, 15 |
| 834 sll t2, t2, 15 |
| 835 mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS |
| 836 mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS |
| 837 addu t3, t3, s0 |
| 838 addu t3, t4, t3 |
| 839 sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS |
| 840 addu t2, t1, t0 // y + cred |
| 841 addu t3, t1, t5 // y + cgreen |
| 842 addu t4, t1, t6 // y + cblue |
| 843 addu t2, t8, t2 |
| 844 addu t3, t8, t3 |
| 845 addu t4, t8, t4 |
| 846 lbu t2, 0(t2) |
| 847 lbu t3, 0(t3) |
| 848 lbu t4, 0(t4) |
| 849 |
| 850 STORE_H2V1_1_PIXEL t2, t3, t4, t7 |
| 851 4: |
| 852 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra |
| 853 |
| 854 j ra |
| 855 nop |
| 856 |
| 857 END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2) |
| 858 |
| 859 .purgem STORE_H2V1_1_PIXEL |
| 860 .purgem STORE_H2V1_2_PIXELS |
| 861 .endm |
| 862 |
| 863 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ |
| 864 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 |
| 865 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 |
| 866 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 |
| 867 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 |
| 868 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 |
| 869 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 |
| 870 /*****************************************************************************/ |
| 871 /* |
| 872 * jsimd_h2v2_fancy_upsample_mips_dspr2 |
| 873 * |
| 874 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. |
| 875 */ |
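| /* |
|  * For reference, this is the centered triangular filter used by libjpeg's |
|  * h2v2_fancy_upsample (jdsample.c). For each input column, |
|  *   thiscolsum = 3 * inptr0[col] + inptr1[col] |
|  * blends the nearer and farther source rows, and each pair of output |
|  * samples is then |
|  *   out[2*col]     = (3 * thiscolsum + lastcolsum + 8) >> 4 |
|  *   out[2*col + 1] = (3 * thiscolsum + nextcolsum + 7) >> 4 |
|  * which gives weights 9/16, 3/16, 3/16 and 1/16 on the four nearest input |
|  * samples. The first and last columns substitute their own column sum for |
|  * the missing neighbor. |
|  */ |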
| 876 LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2) |
| 877 /* |
| 878 * a0 - cinfo->max_v_samp_factor |
| 879 * a1 - downsampled_width |
| 880 * a2 - input_data |
| 881 * a3 - output_data_ptr |
| 882 */ |
| 883 |
| 884 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 |
| 885 |
| 886 li s4, 0 |
| 887 lw s2, 0(a3) // s2 = *output_data_ptr |
| 888 0: |
| 889 li t9, 2 |
| 890 lw s1, -4(a2) // s1 = inptr1 |
| 891 |
| 892 1: |
| 893 lw s0, 0(a2) // s0 = inptr0 |
| 894 lwx s3, s4(s2) |
| 895 addiu s5, a1, -2 // s5 = downsampled_width - 2 |
| 896 srl t4, s5, 1 |
| 897 sll t4, t4, 1 |
| 898 lbu t0, 0(s0) |
| 899 lbu t1, 1(s0) |
| 900 lbu t2, 0(s1) |
| 901 lbu t3, 1(s1) |
| 902 addiu s0, 2 |
| 903 addiu s1, 2 |
| 904 addu t8, s0, t4 // t8 = end address |
| 905 andi s5, s5, 1 // s5 = residual |
| 906 sll t4, t0, 1 |
| 907 sll t6, t1, 1 |
| 908 addu t0, t0, t4 // t0 = (*inptr0++) * 3 |
| 909 addu t1, t1, t6 // t1 = (*inptr0++) * 3 |
| 910 addu t7, t0, t2 // t7 = thiscolsum |
| 911 addu t6, t1, t3 // t6 = nextcolsum |
| 912 sll t0, t7, 2 // t0 = thiscolsum * 4 |
| 913 subu t1, t0, t7 // t1 = thiscolsum * 3 |
| 914 shra_r.w t0, t0, 4 |
| 915 addiu t1, 7 |
| 916 addu t1, t1, t6 |
| 917 srl t1, t1, 4 |
| 918 sb t0, 0(s3) |
| 919 sb t1, 1(s3) |
| 920 beq t8, s0, 22f // skip to final iteration if width == 3 |
| 921 addiu s3, 2 |
| 922 2: |
| 923 lh t0, 0(s0) // t0 = A3|A2 |
| 924 lh t2, 0(s1) // t2 = B3|B2 |
| 925 addiu s0, 2 |
| 926 addiu s1, 2 |
| 927 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 |
| 928 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 |
| 929 shll.ph t1, t0, 1 |
| 930 sll t3, t6, 1 |
| 931 addu.ph t0, t1, t0 // t0 = A3*3|A2*3 |
| 932 addu t3, t3, t6 // t3 = this * 3 |
| 933 addu.ph t0, t0, t2 // t0 = next2|next1 |
| 934 addu t1, t3, t7 |
| 935 andi t7, t0, 0xFFFF // t7 = next1 |
| 936 sll t2, t7, 1 |
| 937 addu t2, t7, t2 // t2 = next1*3 |
| 938 addu t4, t2, t6 |
| 939 srl t6, t0, 16 // t6 = next2 |
| 940 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 |
| 941 addu t0, t3, t7 |
| 942 addiu t0, 7 |
| 943 srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 |
| 944 shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4 |
| 945 addu t2, t2, t6 |
| 946 addiu t2, 7 |
| 947 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 |
| 948 sb t1, 0(s3) |
| 949 sb t0, 1(s3) |
| 950 sb t4, 2(s3) |
| 951 sb t2, 3(s3) |
| 952 bne t8, s0, 2b |
| 953 addiu s3, 4 |
| 954 22: |
| 955 beqz s5, 4f |
| 956 addu t8, s0, s5 |
| 957 3: |
| 958 lbu t0, 0(s0) |
| 959 lbu t2, 0(s1) |
| 960 addiu s0, 1 |
| 961 addiu s1, 1 |
| 962 sll t3, t6, 1 |
| 963 sll t1, t0, 1 |
| 964 addu t1, t0, t1 // t1 = inptr0 * 3 |
| 965 addu t3, t3, t6 // t3 = thiscolsum * 3 |
| 966 addu t5, t1, t2 |
| 967 addu t1, t3, t7 |
| 968 shra_r.w t1, t1, 4 |
| 969 addu t0, t3, t5 |
| 970 addiu t0, 7 |
| 971 srl t0, t0, 4 |
| 972 sb t1, 0(s3) |
| 973 sb t0, 1(s3) |
| 974 addiu s3, 2 |
| 975 move t7, t6 |
| 976 bne t8, s0, 3b |
| 977 move t6, t5 |
| 978 4: |
| 979 sll t0, t6, 2 // t0 = thiscolsum * 4 |
| 980 subu t1, t0, t6 // t1 = thiscolsum * 3 |
| 981 addu t1, t1, t7 |
| 982 addiu s4, 4 |
| 983 shra_r.w t1, t1, 4 |
| 984 addiu t0, 7 |
| 985 srl t0, t0, 4 |
| 986 sb t1, 0(s3) |
| 987 sb t0, 1(s3) |
| 988 addiu t9, -1 |
| 989 addiu s3, 2 |
| 990 bnez t9, 1b |
| 991 lw s1, 4(a2) |
| 992 srl t0, s4, 2 |
| 993 subu t0, a0, t0 |
| 994 bgtz t0, 0b |
| 995 addiu a2, 4 |
| 996 |
| 997 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 |
| 998 |
| 999 j ra |
| 1000 nop |
| 1001 END(jsimd_h2v2_fancy_upsample_mips_dspr2) |
| 1002 |
| 1003 /*****************************************************************************/ |
| 1004 LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2) |
| 1005 /* |
| 1006 * a0 - cinfo->max_v_samp_factor |
| 1007 * a1 - downsampled_width |
| 1008 * a2 - input_data |
| 1009 * a3 - output_data_ptr |
| 1010 */ |
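| /* |
|  * For reference: fancy h2v1 upsampling doubles each row horizontally with a |
|  * 3/4 + 1/4 triangular filter, |
|  *   out[2*col]     = (3 * in[col] + in[col-1] + 1) >> 2 |
|  *   out[2*col + 1] = (3 * in[col] + in[col+1] + 2) >> 2 |
|  * while the first and last output samples copy the edge input sample |
|  * unchanged. The main loop below expands four input samples into eight |
|  * output bytes per iteration using the packed .ph instructions. |
|  */ |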
| 1011 |
| 1012 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 |
| 1013 |
| 1014 .set at |
| 1015 |
| 1016 beqz a0, 3f |
| 1017 sll t0, a0, 2 |
| 1018 lw s1, 0(a3) |
| 1019 li s3, 0x10001 |
| 1020 addu s0, s1, t0 |
| 1021 0: |
| 1022 addiu t8, a1, -2 |
| 1023 srl t9, t8, 2 |
| 1024 lw t7, 0(a2) |
| 1025 lw s2, 0(s1) |
| 1026 lbu t0, 0(t7) |
| 1027 lbu t1, 1(t7) // t1 = inptr[1] |
| 1028 sll t2, t0, 1 |
| 1029 addu t2, t2, t0 // t2 = invalue*3 |
| 1030 addu t2, t2, t1 |
| 1031 shra_r.w t2, t2, 2 |
| 1032 sb t0, 0(s2) |
| 1033 sb t2, 1(s2) |
| 1034 beqz t9, 11f |
| 1035 addiu s2, 2 |
| 1036 1: |
| 1037 ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| |
| 1038 ulw t1, 1(t7) |
| 1039 ulh t2, 4(t7) // t2 = |0|0|P5|P4| |
| 1040 preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| |
| 1041 preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| |
| 1042 preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| |
| 1043 preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| |
| 1044 preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| |
| 1045 shll.ph t5, t4, 1 |
| 1046 shll.ph t6, t1, 1 |
| 1047 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| |
| 1048 addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| |
| 1049 addu.ph t4, t3, s3 |
| 1050 addu.ph t0, t0, s3 |
| 1051 addu.ph t4, t4, t5 |
| 1052 addu.ph t0, t0, t6 |
| 1053 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| |
| 1054 shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| |
| 1055 addu.ph t2, t2, t5 |
| 1056 addu.ph t3, t3, t6 |
| 1057 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| |
| 1058 shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| |
| 1059 shll.ph t2, t2, 8 |
| 1060 shll.ph t3, t3, 8 |
| 1061 or t2, t4, t2 |
| 1062 or t3, t3, t0 |
| 1063 addiu t9, -1 |
| 1064 usw t3, 0(s2) |
| 1065 usw t2, 4(s2) |
| 1066 addiu s2, 8 |
| 1067 bgtz t9, 1b |
| 1068 addiu t7, 4 |
| 1069 11: |
| 1070 andi t8, 3 |
| 1071 beqz t8, 22f |
| 1072 addiu t7, 1 |
| 1073 |
| 1074 2: |
| 1075 lbu t0, 0(t7) |
| 1076 addiu t7, 1 |
| 1077 sll t1, t0, 1 |
| 1078 addu t2, t0, t1 // t2 = invalue*3 |
| 1079 lbu t3, -2(t7) |
| 1080 lbu t4, 0(t7) |
| 1081 addiu t3, 1 |
| 1082 addiu t4, 2 |
| 1083 addu t3, t3, t2 |
| 1084 addu t4, t4, t2 |
| 1085 srl t3, 2 |
| 1086 srl t4, 2 |
| 1087 sb t3, 0(s2) |
| 1088 sb t4, 1(s2) |
| 1089 addiu t8, -1 |
| 1090 bgtz t8, 2b |
| 1091 addiu s2, 2 |
| 1092 |
| 1093 22: |
| 1094 lbu t0, 0(t7) |
| 1095 lbu t2, -1(t7) |
| 1096 sll t1, t0, 1 |
| 1097 addu t1, t1, t0 // t1 = invalue * 3 |
| 1098 addu t1, t1, t2 |
| 1099 addiu t1, 1 |
| 1100 srl t1, t1, 2 |
| 1101 sb t1, 0(s2) |
| 1102 sb t0, 1(s2) |
| 1103 addiu s1, 4 |
| 1104 bne s1, s0, 0b |
| 1105 addiu a2, 4 |
| 1106 3: |
| 1107 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 |
| 1108 |
| 1109 j ra |
| 1110 nop |
| 1111 END(jsimd_h2v1_fancy_upsample_mips_dspr2) |
| 1112 |
| 1113 /*****************************************************************************/ |
| 1114 LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2) |
| 1115 /* |
| 1116 * a0 - cinfo->image_width |
| 1117 * a1 - cinfo->max_v_samp_factor |
| 1118 * a2 - compptr->v_samp_factor |
| 1119 * a3 - compptr->width_in_blocks |
| 1120 * 16(sp) - input_data |
| 1121 * 20(sp) - output_data |
| 1122 */ |
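| /* |
|  * For reference: each output sample averages two adjacent input samples |
|  * with an alternating rounding bias, |
|  *   out[col] = (in[2*col] + in[2*col+1] + bias) >> 1,  bias = 0, 1, 0, 1, ... |
|  * The main loop produces four output samples per iteration with raddu.w.qb; |
|  * the trailing loops pad the output row with copies of the last value so |
|  * that width_in_blocks * DCTSIZE samples are always written. |
|  */ |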
| 1123 .set at |
| 1124 |
| 1125 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 |
| 1126 |
| 1127 beqz a2, 7f |
| 1128 lw s1, 44(sp) // s1 = output_data |
| 1129 lw s0, 40(sp) // s0 = input_data |
| 1130 srl s2, a0, 2 |
| 1131 andi t9, a0, 2 |
| 1132 srl t7, t9, 1 |
| 1133 addu s2, t7, s2 |
| 1134 sll t0, a3, 3 // t0 = width_in_blocks*DCT |
| 1135 srl t7, t0, 1 |
| 1136 subu s2, t7, s2 |
| 1137 0: |
| 1138 andi t6, a0, 1 // t6 = temp_index |
| 1139 addiu t6, -1 |
| 1140 lw t4, 0(s1) // t4 = outptr |
| 1141 lw t5, 0(s0) // t5 = inptr0 |
| 1142 li s3, 0 // s3 = bias |
| 1143 srl t7, a0, 1 // t7 = image_width1 |
| 1144 srl s4, t7, 2 |
| 1145 andi t8, t7, 3 |
| 1146 1: |
| 1147 ulhu t0, 0(t5) |
| 1148 ulhu t1, 2(t5) |
| 1149 ulhu t2, 4(t5) |
| 1150 ulhu t3, 6(t5) |
| 1151 raddu.w.qb t0, t0 |
| 1152 raddu.w.qb t1, t1 |
| 1153 raddu.w.qb t2, t2 |
| 1154 raddu.w.qb t3, t3 |
| 1155 shra.ph t0, t0, 1 |
| 1156 shra_r.ph t1, t1, 1 |
| 1157 shra.ph t2, t2, 1 |
| 1158 shra_r.ph t3, t3, 1 |
| 1159 sb t0, 0(t4) |
| 1160 sb t1, 1(t4) |
| 1161 sb t2, 2(t4) |
| 1162 sb t3, 3(t4) |
| 1163 addiu s4, -1 |
| 1164 addiu t4, 4 |
| 1165 bgtz s4, 1b |
| 1166 addiu t5, 8 |
| 1167 beqz t8, 3f |
| 1168 addu s4, t4, t8 |
| 1169 2: |
| 1170 ulhu t0, 0(t5) |
| 1171 raddu.w.qb t0, t0 |
| 1172 addqh.w t0, t0, s3 |
| 1173 xori s3, s3, 1 |
| 1174 sb t0, 0(t4) |
| 1175 addiu t4, 1 |
| 1176 bne t4, s4, 2b |
| 1177 addiu t5, 2 |
| 1178 3: |
| 1179 lbux t1, t6(t5) |
| 1180 sll t1, 1 |
| 1181 addqh.w t2, t1, s3 // t2 = pixval1 |
| 1182 xori s3, s3, 1 |
| 1183 addqh.w t3, t1, s3 // t3 = pixval2 |
| 1184 blez s2, 5f |
| 1185 append t3, t2, 8 |
| 1186 addu t5, t4, s2 // t5 = loop_end2 |
| 1187 4: |
| 1188 ush t3, 0(t4) |
| 1189 addiu s2, -1 |
| 1190 bgtz s2, 4b |
| 1191 addiu t4, 2 |
| 1192 5: |
| 1193 beqz t9, 6f |
| 1194 nop |
| 1195 sb t2, 0(t4) |
| 1196 6: |
| 1197 addiu s1, 4 |
| 1198 addiu a2, -1 |
| 1199 bnez a2, 0b |
| 1200 addiu s0, 4 |
| 1201 7: |
| 1202 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 |
| 1203 |
| 1204 j ra |
| 1205 nop |
| 1206 END(jsimd_h2v1_downsample_mips_dspr2) |
| 1207 |
| 1208 /*****************************************************************************/ |
| 1209 LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2) |
| 1210 |
| 1211 /* |
| 1212 * a0 - cinfo->image_width |
| 1213 * a1 - cinfo->max_v_samp_factor |
| 1214 * a2 - compptr->v_samp_factor |
| 1215 * a3 - compptr->width_in_blocks |
| 1216 * 16(sp) - input_data |
| 1217 * 20(sp) - output_data |
| 1218 */ |
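| /* |
|  * For reference: each output sample averages a 2x2 block of input samples |
|  * with an alternating rounding bias, |
|  *   out[col] = (in0[2*col] + in0[2*col+1] + in1[2*col] + in1[2*col+1] + bias) >> 2 |
|  * with bias = 1, 2, 1, 2, ... As in the h2v1 routine, the output row is |
|  * padded out to width_in_blocks * DCTSIZE samples by replicating the last |
|  * value. |
|  */ |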
| 1219 .set at |
| 1220 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 1221 |
| 1222 beqz a2, 8f |
| 1223 lw s1, 52(sp) // s1 = output_data |
| 1224 lw s0, 48(sp) // s0 = input_data |
| 1225 |
| 1226 andi t6, a0, 1 // t6 = temp_index |
| 1227 addiu t6, -1 |
| 1228 srl t7, a0, 1 // t7 = image_width1 |
| 1229 srl s4, t7, 2 |
| 1230 andi t8, t7, 3 |
| 1231 andi t9, a0, 2 |
| 1232 srl s2, a0, 2 |
| 1233 srl t7, t9, 1 |
| 1234 addu s2, t7, s2 |
| 1235 sll t0, a3, 3 // t0 = width_in_blocks*DCT |
| 1236 srl t7, t0, 1 |
| 1237 subu s2, t7, s2 |
| 1238 0: |
| 1239 lw t4, 0(s1) // t4 = outptr |
| 1240 lw t5, 0(s0) // t5 = inptr0 |
| 1241 lw s7, 4(s0) // s7 = inptr1 |
| 1242 li s6, 1 // s6 = bias |
| 1243 2: |
| 1244 ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| |
| 1245 ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| |
| 1246 ulw t2, 4(t5) |
| 1247 ulw t3, 4(s7) |
| 1248 precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2| |
| 1249 ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| |
| 1250 raddu.w.qb t1, t7 |
| 1251 raddu.w.qb t0, t0 |
| 1252 shra_r.w t1, t1, 2 |
| 1253 addiu t0, 1 |
| 1254 srl t0, 2 |
| 1255 precrq.ph.w t7, t2, t3 |
| 1256 ins t2, t3, 16, 16 |
| 1257 raddu.w.qb t7, t7 |
| 1258 raddu.w.qb t2, t2 |
| 1259 shra_r.w t7, t7, 2 |
| 1260 addiu t2, 1 |
| 1261 srl t2, 2 |
| 1262 sb t0, 0(t4) |
| 1263 sb t1, 1(t4) |
| 1264 sb t2, 2(t4) |
| 1265 sb t7, 3(t4) |
| 1266 addiu t4, 4 |
| 1267 addiu t5, 8 |
| 1268 addiu s4, s4, -1 |
| 1269 bgtz s4, 2b |
| 1270 addiu s7, 8 |
| 1271 beqz t8, 4f |
| 1272 addu t8, t4, t8 |
| 1273 3: |
| 1274 ulhu t0, 0(t5) |
| 1275 ulhu t1, 0(s7) |
| 1276 ins t0, t1, 16, 16 |
| 1277 raddu.w.qb t0, t0 |
| 1278 addu t0, t0, s6 |
| 1279 srl t0, 2 |
| 1280 xori s6, s6, 3 |
| 1281 sb t0, 0(t4) |
| 1282 addiu t5, 2 |
| 1283 addiu t4, 1 |
| 1284 bne t8, t4, 3b |
| 1285 addiu s7, 2 |
| 1286 4: |
| 1287 lbux t1, t6(t5) |
| 1288 sll t1, 1 |
| 1289 lbux t0, t6(s7) |
| 1290 sll t0, 1 |
| 1291 addu t1, t1, t0 |
| 1292 addu t3, t1, s6 |
| 1293 srl t0, t3, 2 // t0 = pixval1 |
| 1294 xori s6, s6, 3 |
| 1295 addu t2, t1, s6 |
| 1296 srl t1, t2, 2 // t1 = pixval2 |
| 1297 blez s2, 6f |
| 1298 append t1, t0, 8 |
| 1299 5: |
| 1300 ush t1, 0(t4) |
| 1301 addiu s2, -1 |
| 1302 bgtz s2, 5b |
| 1303 addiu t4, 2 |
| 1304 6: |
| 1305 beqz t9, 7f |
| 1306 nop |
| 1307 sb t0, 0(t4) |
| 1308 7: |
| 1309 addiu s1, 4 |
| 1310 addiu a2, -1 |
| 1311 bnez a2, 0b |
| 1312 addiu s0, 8 |
| 1313 8: |
| 1314 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 1315 |
| 1316 j ra |
| 1317 nop |
| 1318 END(jsimd_h2v2_downsample_mips_dspr2) |
| 1319 /*****************************************************************************/ |
| 1320 LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2) |
| 1321 /* |
| 1322 * a0 - input_data |
| 1323 * a1 - output_data |
| 1324 * a2 - compptr->v_samp_factor |
| 1325 * a3 - cinfo->max_v_samp_factor |
| 1326 * 16(sp) - cinfo->smoothing_factor |
| 1327 * 20(sp) - compptr->width_in_blocks |
| 1328 * 24(sp) - cinfo->image_width |
| 1329 */ |
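| /* |
|  * For reference, this follows libjpeg's h2v2_smooth_downsample (jcsample.c). |
|  * With SF = cinfo->smoothing_factor, each output sample is |
|  *   out = (membersum * (16384 - SF*80) + neighsum * (SF*16) + 32768) >> 16 |
|  * where membersum is the sum of the 2x2 block being downsampled and |
|  * neighsum is the sum of the twelve surrounding input samples, counting |
|  * edge-adjacent samples twice and corner samples once. The loop at label 0 |
|  * first pads the input rows on the right (out to 2 * output_cols columns) so |
|  * that the filter may read past image_width. |
|  */ |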
| 1330 |
| 1331 .set at |
| 1332 |
| 1333 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 1334 |
| 1335 lw s7, 52(sp) // compptr->width_in_blocks |
| 1336 lw s0, 56(sp) // cinfo->image_width |
| 1337 lw s6, 48(sp) // cinfo->smoothing_factor |
| 1338 sll s7, 3 // output_cols = width_in_blocks * DCTSIZE |
| 1339 sll v0, s7, 1 |
| 1340 subu v0, v0, s0 |
| 1341 blez v0, 2f |
| 1342 move v1, zero |
| 1343 addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2 |
| 1344 0: |
| 1345 addiu t1, a0, -4 |
| 1346 sll t2, v1, 2 |
| 1347 lwx t1, t2(t1) |
| 1348 move t3, v0 |
| 1349 addu t1, t1, s0 |
| 1350 lbu t2, -1(t1) |
| 1351 1: |
| 1352 addiu t3, t3, -1 |
| 1353 sb t2, 0(t1) |
| 1354 bgtz t3, 1b |
| 1355 addiu t1, t1, 1 |
| 1356 addiu v1, v1, 1 |
| 1357 bne v1, t0, 0b |
| 1358 nop |
| 1359 2: |
| 1360 li v0, 80 |
| 1361 mul v0, s6, v0 |
| 1362 li v1, 16384 |
| 1363 move t4, zero |
| 1364 move t5, zero |
| 1365 subu t6, v1, v0 // t6 = 16384 - tmp_smoot_f * 80 |
| 1366 sll t7, s6, 4 // t7 = tmp_smoot_f * 16 |
| 1367 3: |
| 1368 /* Special case for first column: pretend column -1 is same as column 0 */ |
| 1369 sll v0, t4, 2 |
| 1370 lwx t8, v0(a1) // outptr = output_data[outrow] |
| 1371 sll v1, t5, 2 |
| 1372 addiu t9, v1, 4 |
| 1373 addiu s0, v1, -4 |
| 1374 addiu s1, v1, 8 |
| 1375 lwx s2, v1(a0) // inptr0 = input_data[inrow] |
| 1376 lwx t9, t9(a0) // inptr1 = input_data[inrow+1] |
| 1377 lwx s0, s0(a0) // above_ptr = input_data[inrow-1] |
| 1378 lwx s1, s1(a0) // below_ptr = input_data[inrow+2] |
| 1379 lh v0, 0(s2) |
| 1380 lh v1, 0(t9) |
| 1381 lh t0, 0(s0) |
| 1382 lh t1, 0(s1) |
| 1383 ins v0, v1, 16, 16 |
| 1384 ins t0, t1, 16, 16 |
| 1385 raddu.w.qb t2, v0 |
| 1386 raddu.w.qb s3, t0 |
| 1387 lbu v0, 0(s2) |
| 1388 lbu v1, 2(s2) |
| 1389 lbu t0, 0(t9) |
| 1390 lbu t1, 2(t9) |
| 1391 addu v0, v0, v1 |
| 1392 mult $ac1, t2, t6 |
| 1393 addu t0, t0, t1 |
| 1394 lbu t2, 2(s0) |
| 1395 addu t0, t0, v0 |
| 1396 lbu t3, 2(s1) |
| 1397 addu s3, t0, s3 |
| 1398 lbu v0, 0(s0) |
| 1399 lbu t0, 0(s1) |
| 1400 sll s3, s3, 1 |
| 1401 addu v0, v0, t2 |
| 1402 addu t0, t0, t3 |
| 1403 addu t0, t0, v0 |
| 1404 addu s3, t0, s3 |
| 1405 madd $ac1, s3, t7 |
| 1406 extr_r.w v0, $ac1, 16 |
| 1407 addiu t8, t8, 1 |
| 1408 addiu s2, s2, 2 |
| 1409 addiu t9, t9, 2 |
| 1410 addiu s0, s0, 2 |
| 1411 addiu s1, s1, 2 |
| 1412 sb v0, -1(t8) |
| 1413 addiu s4, s7, -2 |
| 1414 and s4, s4, 3 |
| 1415 addu s5, s4, t8 // end address |
| 1416 4: |
| 1417 lh v0, 0(s2) |
| 1418 lh v1, 0(t9) |
| 1419 lh t0, 0(s0) |
| 1420 lh t1, 0(s1) |
| 1421 ins v0, v1, 16, 16 |
| 1422 ins t0, t1, 16, 16 |
| 1423 raddu.w.qb t2, v0 |
| 1424 raddu.w.qb s3, t0 |
| 1425 lbu v0, -1(s2) |
| 1426 lbu v1, 2(s2) |
| 1427 lbu t0, -1(t9) |
| 1428 lbu t1, 2(t9) |
| 1429 addu v0, v0, v1 |
| 1430 mult $ac1, t2, t6 |
| 1431 addu t0, t0, t1 |
| 1432 lbu t2, 2(s0) |
| 1433 addu t0, t0, v0 |
| 1434 lbu t3, 2(s1) |
| 1435 addu s3, t0, s3 |
| 1436 lbu v0, -1(s0) |
| 1437 lbu t0, -1(s1) |
| 1438 sll s3, s3, 1 |
| 1439 addu v0, v0, t2 |
| 1440 addu t0, t0, t3 |
| 1441 addu t0, t0, v0 |
| 1442 addu s3, t0, s3 |
| 1443 madd $ac1, s3, t7 |
| 1444 extr_r.w t2, $ac1, 16 |
| 1445 addiu t8, t8, 1 |
| 1446 addiu s2, s2, 2 |
| 1447 addiu t9, t9, 2 |
| 1448 addiu s0, s0, 2 |
| 1449 sb t2, -1(t8) |
| 1450 bne s5, t8, 4b |
| 1451 addiu s1, s1, 2 |
| 1452 addiu s5, s7, -2 |
| 1453 subu s5, s5, s4 |
| 1454 addu s5, s5, t8 // end address |
| 1455 5: |
| 1456 lh v0, 0(s2) |
| 1457 lh v1, 0(t9) |
| 1458 lh t0, 0(s0) |
| 1459 lh t1, 0(s1) |
| 1460 ins v0, v1, 16, 16 |
| 1461 ins t0, t1, 16, 16 |
| 1462 raddu.w.qb t2, v0 |
| 1463 raddu.w.qb s3, t0 |
| 1464 lbu v0, -1(s2) |
| 1465 lbu v1, 2(s2) |
| 1466 lbu t0, -1(t9) |
| 1467 lbu t1, 2(t9) |
| 1468 addu v0, v0, v1 |
| 1469 mult $ac1, t2, t6 |
| 1470 addu t0, t0, t1 |
| 1471 lbu t2, 2(s0) |
| 1472 addu t0, t0, v0 |
| 1473 lbu t3, 2(s1) |
| 1474 addu s3, t0, s3 |
| 1475 lbu v0, -1(s0) |
| 1476 lbu t0, -1(s1) |
| 1477 sll s3, s3, 1 |
| 1478 addu v0, v0, t2 |
| 1479 addu t0, t0, t3 |
| 1480 lh v1, 2(t9) |
| 1481 addu t0, t0, v0 |
| 1482 lh v0, 2(s2) |
| 1483 addu s3, t0, s3 |
| 1484 lh t0, 2(s0) |
| 1485 lh t1, 2(s1) |
| 1486 madd $ac1, s3, t7 |
| 1487 extr_r.w t2, $ac1, 16 |
| 1488 ins t0, t1, 16, 16 |
| 1489 ins v0, v1, 16, 16 |
| 1490 raddu.w.qb s3, t0 |
| 1491 lbu v1, 4(s2) |
| 1492 lbu t0, 1(t9) |
| 1493 lbu t1, 4(t9) |
| 1494 sb t2, 0(t8) |
| 1495 raddu.w.qb t3, v0 |
| 1496 lbu v0, 1(s2) |
| 1497 addu t0, t0, t1 |
| 1498 mult $ac1, t3, t6 |
| 1499 addu v0, v0, v1 |
| 1500 lbu t2, 4(s0) |
| 1501 addu t0, t0, v0 |
| 1502 lbu v0, 1(s0) |
| 1503 addu s3, t0, s3 |
| 1504 lbu t0, 1(s1) |
| 1505 lbu t3, 4(s1) |
| 1506 addu v0, v0, t2 |
| 1507 sll s3, s3, 1 |
| 1508 addu t0, t0, t3 |
| 1509 lh v1, 4(t9) |
| 1510 addu t0, t0, v0 |
| 1511 lh v0, 4(s2) |
| 1512 addu s3, t0, s3 |
| 1513 lh t0, 4(s0) |
| 1514 lh t1, 4(s1) |
| 1515 madd $ac1, s3, t7 |
| 1516 extr_r.w t2, $ac1, 16 |
| 1517 ins t0, t1, 16, 16 |
| 1518 ins v0, v1, 16, 16 |
| 1519 raddu.w.qb s3, t0 |
| 1520 lbu v1, 6(s2) |
| 1521 lbu t0, 3(t9) |
| 1522 lbu t1, 6(t9) |
| 1523 sb t2, 1(t8) |
| 1524 raddu.w.qb t3, v0 |
| 1525 lbu v0, 3(s2) |
| 1526 addu t0, t0, t1 |
| 1527 mult $ac1, t3, t6 |
| 1528 addu v0, v0, v1 |
| 1529 lbu t2, 6(s0) |
| 1530 addu t0, t0, v0 |
| 1531 lbu v0, 3(s0) |
| 1532 addu s3, t0, s3 |
| 1533 lbu t0, 3(s1) |
| 1534 lbu t3, 6(s1) |
| 1535 addu v0, v0, t2 |
| 1536 sll s3, s3, 1 |
| 1537 addu t0, t0, t3 |
| 1538 lh v1, 6(t9) |
| 1539 addu t0, t0, v0 |
| 1540 lh v0, 6(s2) |
| 1541 addu s3, t0, s3 |
| 1542 lh t0, 6(s0) |
| 1543 lh t1, 6(s1) |
| 1544 madd $ac1, s3, t7 |
| 1545 extr_r.w t3, $ac1, 16 |
| 1546 ins t0, t1, 16, 16 |
| 1547 ins v0, v1, 16, 16 |
| 1548 raddu.w.qb s3, t0 |
| 1549 lbu v1, 8(s2) |
| 1550 lbu t0, 5(t9) |
| 1551 lbu t1, 8(t9) |
| 1552 sb t3, 2(t8) |
| 1553 raddu.w.qb t2, v0 |
| 1554 lbu v0, 5(s2) |
| 1555 addu t0, t0, t1 |
| 1556 mult $ac1, t2, t6 |
| 1557 addu v0, v0, v1 |
| 1558 lbu t2, 8(s0) |
| 1559 addu t0, t0, v0 |
| 1560 lbu v0, 5(s0) |
| 1561 addu s3, t0, s3 |
| 1562 lbu t0, 5(s1) |
| 1563 lbu t3, 8(s1) |
| 1564 addu v0, v0, t2 |
| 1565 sll s3, s3, 1 |
| 1566 addu t0, t0, t3 |
| 1567 addiu t8, t8, 4 |
| 1568 addu t0, t0, v0 |
| 1569 addiu s2, s2, 8 |
| 1570 addu s3, t0, s3 |
| 1571 addiu t9, t9, 8 |
| 1572 madd $ac1, s3, t7 |
| 1573 extr_r.w t1, $ac1, 16 |
| 1574 addiu s0, s0, 8 |
| 1575 addiu s1, s1, 8 |
| 1576 bne s5, t8, 5b |
| 1577 sb t1, -1(t8) |
| 1578 /* Special case for last column */ |
| 1579 lh v0, 0(s2) |
| 1580 lh v1, 0(t9) |
| 1581 lh t0, 0(s0) |
| 1582 lh t1, 0(s1) |
| 1583 ins v0, v1, 16, 16 |
| 1584 ins t0, t1, 16, 16 |
| 1585 raddu.w.qb t2, v0 |
| 1586 raddu.w.qb s3, t0 |
| 1587 lbu v0, -1(s2) |
| 1588 lbu v1, 1(s2) |
| 1589 lbu t0, -1(t9) |
| 1590 lbu t1, 1(t9) |
| 1591 addu v0, v0, v1 |
| 1592 mult $ac1, t2, t6 |
| 1593 addu t0, t0, t1 |
| 1594 lbu t2, 1(s0) |
| 1595 addu t0, t0, v0 |
| 1596 lbu t3, 1(s1) |
| 1597 addu s3, t0, s3 |
| 1598 lbu v0, -1(s0) |
| 1599 lbu t0, -1(s1) |
| 1600 sll s3, s3, 1 |
| 1601 addu v0, v0, t2 |
| 1602 addu t0, t0, t3 |
| 1603 addu t0, t0, v0 |
| 1604 addu s3, t0, s3 |
| 1605 madd $ac1, s3, t7 |
| 1606 extr_r.w t0, $ac1, 16 |
| 1607 addiu t5, t5, 2 |
| 1608 sb t0, 0(t8) |
| 1609 addiu t4, t4, 1 |
| 1610 bne t4, a2, 3b |
| 1611 addiu t5, t5, 2 |
| 1612 |
| 1613 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 1614 |
| 1615 j ra |
| 1616 nop |
| 1617 |
| 1618 END(jsimd_h2v2_smooth_downsample_mips_dspr2) |
| 1619 |
| 1620 /*****************************************************************************/ |
| 1621 LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2) |
| 1622 /* |
| 1623 * a0 - upsample->h_expand[compptr->component_index] |
| 1624 * a1 - upsample->v_expand[compptr->component_index] |
| 1625 * a2 - input_data |
| 1626 * a3 - output_data_ptr |
| 1627 * 16(sp) - cinfo->output_width |
| 1628 * 20(sp) - cinfo->max_v_samp_factor |
| 1629 */ |
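| /* |
|  * For reference: generic integer upsampling replicates each input sample |
|  * h_expand times within a row and each row v_expand times vertically; the |
|  * row duplication at label 5 copies 16 bytes per iteration. |
|  */ |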
| 1630 .set at |
| 1631 |
| 1632 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 |
| 1633 |
| 1634 lw s0, 0(a3) // s0 = output_data |
| 1635 lw s1, 32(sp) // s1 = cinfo->output_width |
| 1636 lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor |
| 1637 li t6, 0 // t6 = inrow |
| 1638 beqz s2, 10f |
| 1639 li s3, 0 // s3 = outrow |
| 1640 0: |
| 1641 addu t0, a2, t6 |
| 1642 addu t7, s0, s3 |
| 1643 lw t3, 0(t0) // t3 = inptr |
| 1644 lw t8, 0(t7) // t8 = outptr |
| 1645 beqz s1, 4f |
| 1646 addu t5, t8, s1 // t5 = outend |
| 1647 1: |
| 1648 lb t2, 0(t3) // t2 = invalue = *inptr++ |
| 1649 addiu t3, 1 |
| 1650 beqz a0, 3f |
| 1651 move t0, a0 // t0 = h_expand |
| 1652 2: |
| 1653 sb t2, 0(t8) |
| 1654 addiu t0, -1 |
| 1655 bgtz t0, 2b |
| 1656 addiu t8, 1 |
| 1657 3: |
| 1658 bgt t5, t8, 1b |
| 1659 nop |
| 1660 4: |
| 1661 addiu t9, a1, -1 // t9 = v_expand - 1 |
| 1662 blez t9, 9f |
| 1663 nop |
| 1664 5: |
| 1665 lw t3, 0(s0) |
| 1666 lw t4, 4(s0) |
| 1667 subu t0, s1, 0xF |
| 1668 blez t0, 7f |
| 1669 addu t5, t3, s1 // t5 = end address |
| 1670 andi t7, s1, 0xF // t7 = residual |
| 1671 subu t8, t5, t7 |
| 1672 6: |
| 1673 ulw t0, 0(t3) |
| 1674 ulw t1, 4(t3) |
| 1675 ulw t2, 8(t3) |
| 1676 usw t0, 0(t4) |
| 1677 ulw t0, 12(t3) |
| 1678 usw t1, 4(t4) |
| 1679 usw t2, 8(t4) |
| 1680 usw t0, 12(t4) |
| 1681 addiu t3, 16 |
| 1682 bne t3, t8, 6b |
| 1683 addiu t4, 16 |
| 1684 beqz t7, 8f |
| 1685 nop |
| 1686 7: |
| 1687 lbu t0, 0(t3) |
| 1688 sb t0, 0(t4) |
| 1689 addiu t3, 1 |
| 1690 bne t3, t5, 7b |
| 1691 addiu t4, 1 |
| 1692 8: |
| 1693 addiu t9, -1 |
| 1694 bgtz t9, 5b |
| 1695 addiu s0, 8 |
| 1696 9: |
| 1697 addu s3, s3, a1 |
| 1698 bne s3, s2, 0b |
| 1699 addiu t6, 1 |
| 1700 10: |
| 1701 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 |
| 1702 |
| 1703 j ra |
| 1704 nop |
| 1705 END(jsimd_int_upsample_mips_dspr2) |
| 1706 |
| 1707 /*****************************************************************************/ |
| 1708 LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2) |
| 1709 /* |
| 1710 * a0 - cinfo->max_v_samp_factor |
| 1711 * a1 - cinfo->output_width |
| 1712 * a2 - input_data |
| 1713 * a3 - output_data_ptr |
| 1714 */ |
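| /* |
|  * For reference: plain h2v1 upsampling just doubles every input sample |
|  * horizontally (no filtering). The main loop expands 8 input bytes into |
|  * 16 output bytes per iteration with the ins/usw sequence; an output_width |
|  * that is not a multiple of 16 is finished by the short loop at label 2. |
|  */ |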
| 1715 lw t7, 0(a3) // t7 = output_data |
| 1716 andi t8, a1, 0xf // t8 = residual |
| 1717 sll t0, a0, 2 |
| 1718 blez a0, 4f |
| 1719 addu t9, t7, t0 // t9 = output_data end address |
| 1720 0: |
| 1721 lw t5, 0(t7) // t5 = outptr |
| 1722 lw t6, 0(a2) // t6 = inptr |
| 1723 addu t3, t5, a1 // t3 = outptr + output_width (end address) |
| 1724 subu t3, t8 // t3 = end address - residual |
| 1725 beq t5, t3, 2f |
| 1726 move t4, t8 |
| 1727 1: |
| 1728 ulw t0, 0(t6) // t0 = |P3|P2|P1|P0| |
| 1729 ulw t2, 4(t6) // t2 = |P7|P6|P5|P4| |
| 1730 srl t1, t0, 16 // t1 = |X|X|P3|P2| |
| 1731 ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0| |
| 1732 ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2| |
| 1733 ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0| |
| 1734 ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2| |
| 1735 usw t0, 0(t5) |
| 1736 usw t1, 4(t5) |
| 1737 srl t0, t2, 16 // t0 = |X|X|P7|P6| |
| 1738 ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4| |
| 1739 ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6| |
| 1740 ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4| |
| 1741 ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6| |
| 1742 usw t2, 8(t5) |
| 1743 usw t0, 12(t5) |
| 1744 addiu t5, 16 |
| 1745 bne t5, t3, 1b |
| 1746 addiu t6, 8 |
| 1747 beqz t8, 3f |
| 1748 move t4, t8 |
| 1749 2: |
| 1750 lbu t1, 0(t6) |
| 1751 sb t1, 0(t5) |
| 1752 sb t1, 1(t5) |
| 1753 addiu t4, -2 |
| 1754 addiu t6, 1 |
| 1755 bgtz t4, 2b |
| 1756 addiu t5, 2 |
| 1757 3: |
| 1758 addiu t7, 4 |
| 1759 bne t9, t7, 0b |
| 1760 addiu a2, 4 |
| 1761 4: |
| 1762 j ra |
| 1763 nop |
| 1764 END(jsimd_h2v1_upsample_mips_dspr2) |
| 1765 |
| 1766 /*****************************************************************************/ |
| 1767 LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2) |
| 1768 /* |
| 1769 * a0 - cinfo->max_v_samp_factor |
| 1770 * a1 - cinfo->output_width |
| 1771 * a2 - input_data |
| 1772 * a3 - output_data_ptr |
| 1773 */ |
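| /* |
|  * For reference: plain h2v2 upsampling doubles each input sample both |
|  * horizontally and vertically: the first loops expand one input row into |
|  * output row N exactly as in the h2v1 routine above, and the loops at |
|  * labels 4/5 then copy output row N into output row N+1. |
|  */ |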
| 1774 lw t7, 0(a3) |
| 1775 blez a0, 7f |
| 1776 andi t9, a1, 0xf // t9 = residual |
| 1777 0: |
| 1778 lw t6, 0(a2) // t6 = inptr |
| 1779 lw t5, 0(t7) // t5 = outptr |
| 1780 addu t8, t5, a1 // t8 = outptr end address |
| 1781 subu t8, t9 // t8 = end address - residual |
| 1782 beq t5, t8, 2f |
| 1783 move t4, t9 |
| 1784 1: |
| 1785 ulw t0, 0(t6) |
| 1786 srl t1, t0, 16 |
| 1787 ins t0, t0, 16, 16 |
| 1788 ins t0, t0, 8, 16 |
| 1789 ins t1, t1, 16, 16 |
| 1790 ins t1, t1, 8, 16 |
| 1791 ulw t2, 4(t6) |
| 1792 usw t0, 0(t5) |
| 1793 usw t1, 4(t5) |
| 1794 srl t3, t2, 16 |
| 1795 ins t2, t2, 16, 16 |
| 1796 ins t2, t2, 8, 16 |
| 1797 ins t3, t3, 16, 16 |
| 1798 ins t3, t3, 8, 16 |
| 1799 usw t2, 8(t5) |
| 1800 usw t3, 12(t5) |
| 1801 addiu t5, 16 |
| 1802 bne t5, t8, 1b |
| 1803 addiu t6, 8 |
| 1804 beqz t9, 3f |
| 1805 move t4, t9 |
| 1806 2: |
| 1807 lbu t0, 0(t6) |
| 1808 sb t0, 0(t5) |
| 1809 sb t0, 1(t5) |
| 1810 addiu t4, -2 |
| 1811 addiu t6, 1 |
| 1812 bgtz t4, 2b |
| 1813 addiu t5, 2 |
| 1814 3: |
| 1815 lw t6, 0(t7) // t6 = outptr[0] |
| 1816 lw t5, 4(t7) // t5 = outptr[1] |
| 1817 addu t4, t6, a1 // t4 = new end address |
| 1818 beq a1, t9, 5f |
| 1819 subu t8, t4, t9 |
| 1820 4: |
| 1821 ulw t0, 0(t6) |
| 1822 ulw t1, 4(t6) |
| 1823 ulw t2, 8(t6) |
| 1824 usw t0, 0(t5) |
| 1825 ulw t0, 12(t6) |
| 1826 usw t1, 4(t5) |
| 1827 usw t2, 8(t5) |
| 1828 usw t0, 12(t5) |
| 1829 addiu t6, 16 |
| 1830 bne t6, t8, 4b |
| 1831 addiu t5, 16 |
| 1832 beqz t9, 6f |
| 1833 nop |
| 1834 5: |
| 1835 lbu t0, 0(t6) |
| 1836 sb t0, 0(t5) |
| 1837 addiu t6, 1 |
| 1838 bne t6, t4, 5b |
| 1839 addiu t5, 1 |
| 1840 6: |
| 1841 addiu t7, 8 |
| 1842 addiu a0, -2 |
| 1843 bgtz a0, 0b |
| 1844 addiu a2, 4 |
| 1845 7: |
| 1846 j ra |
| 1847 nop |
| 1848 END(jsimd_h2v2_upsample_mips_dspr2) |
| 1849 |
| 1850 /*****************************************************************************/ |
| 1851 LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2) |
| 1852 /* |
| 1853 * a0 - coef_block |
| 1854 * a1 - compptr->dcttable |
| 1855 * a2 - output |
| 1856 * a3 - range_limit |
| 1857 */ |
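| /* |
|  * For reference: this is libjpeg's accurate integer inverse DCT |
|  * (jpeg_idct_islow, jidctint.c), based on the Loeffler, Ligtenberg and |
|  * Moshytz 8-point factorization. Constants use CONST_BITS = 13, i.e. |
|  * FIX_x = round(x * 8192). The column pass stores results descaled by |
|  * CONST_BITS - PASS1_BITS = 11 in a 64-entry workspace allocated on the |
|  * stack (256 bytes); the row pass descales again and clamps through the |
|  * range_limit table. A column or row whose AC terms are all zero takes a |
|  * short path that just replicates its scaled DC value. |
|  */ |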
| 1858 |
| 1859 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 1860 |
| 1861 addiu sp, sp, -256 |
| 1862 move v0, sp |
| 1863 addiu v1, zero, 8 // v1 = DCTSIZE = 8 |
| 1864 1: |
| 1865 lh s4, 32(a0) // s4 = inptr[16] |
| 1866 lh s5, 64(a0) // s5 = inptr[32] |
| 1867 lh s6, 96(a0) // s6 = inptr[48] |
| 1868 lh t1, 112(a0) // t1 = inptr[56] |
| 1869 lh t7, 16(a0) // t7 = inptr[8] |
| 1870 lh t5, 80(a0) // t5 = inptr[40] |
| 1871 lh t3, 48(a0) // t3 = inptr[24] |
| 1872 or s4, s4, t1 |
| 1873 or s4, s4, t3 |
| 1874 or s4, s4, t5 |
| 1875 or s4, s4, t7 |
| 1876 or s4, s4, s5 |
| 1877 or s4, s4, s6 |
| 1878 bnez s4, 2f |
| 1879 addiu v1, v1, -1 |
| 1880 lh s5, 0(a1) // quantptr[DCTSIZE*0] |
| 1881 lh s6, 0(a0) // inptr[DCTSIZE*0] |
| 1882 mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) |
| 1883 sll s5, s5, 2 |
| 1884 sw s5, 0(v0) |
| 1885 sw s5, 32(v0) |
| 1886 sw s5, 64(v0) |
| 1887 sw s5, 96(v0) |
| 1888 sw s5, 128(v0) |
| 1889 sw s5, 160(v0) |
| 1890 sw s5, 192(v0) |
| 1891 b 3f |
| 1892 sw s5, 224(v0) |
| 1893 2: |
| 1894 lh t0, 112(a1) |
| 1895 lh t2, 48(a1) |
| 1896 lh t4, 80(a1) |
| 1897 lh t6, 16(a1) |
| 1898 mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7]) |
| 1899 mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3]) |
| 1900 mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5]) |
| 1901 mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1]) |
| 1902 lh t4, 32(a1) |
| 1903 lh t5, 32(a0) |
| 1904 lh t6, 96(a1) |
| 1905 lh t7, 96(a0) |
| 1906 addu s0, t0, t1 // z3 = tmp0 + tmp2 |
| 1907 addu s1, t1, t2 // z2 = tmp1 + tmp2 |
| 1908 addu s2, t2, t3 // z4 = tmp1 + tmp3 |
| 1909 addu s3, s0, s2 // z3 + z4 |
| 1910 addiu t9, zero, 9633 // FIX_1_175875602 |
| 1911 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) |
| 1912 addu t8, t0, t3 // z1 = tmp0 + tmp3 |
| 1913 addiu t9, zero, 2446 // FIX_0_298631336 |
| 1914 mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) |
| 1915 addiu t9, zero, 16819 // FIX_2_053119869 |
| 1916 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) |
| 1917 addiu t9, zero, 25172 // FIX_3_072711026 |
| 1918 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) |
| 1919 addiu t9, zero, 12299 // FIX_1_501321110 |
| 1920 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) |
| 1921 addiu t9, zero, 16069 // FIX_1_961570560 |
| 1922 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) |
| 1923 addiu t9, zero, 3196 // FIX_0_390180644 |
| 1924 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) |
| 1925 addiu t9, zero, 7373 // FIX_0_899976223 |
| 1926 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) |
| 1927 addiu t9, zero, 20995 // FIX_2_562915447 |
| 1928 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) |
| 1929 subu s0, s3, s0 // z3 += z5 |
| 1930 addu t0, t0, s0 // tmp0 += z3 |
| 1931 addu t1, t1, s0 // tmp2 += z3 |
| 1932 subu s2, s3, s2 // z4 += z5 |
| 1933 addu t2, t2, s2 // tmp1 += z4 |
| 1934 addu t3, t3, s2 // tmp3 += z4 |
| 1935 subu t0, t0, t8 // tmp0 += z1 |
| 1936 subu t1, t1, s1 // tmp2 += z2 |
| 1937 subu t2, t2, s1 // tmp1 += z2 |
| 1938 subu t3, t3, t8 // tmp3 += z1 |
| 1939 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2]) |
| 1940 addiu t9, zero, 6270 // FIX_0_765366865 |
| 1941 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6]) |
| 1942 lh t4, 0(a1) |
| 1943 lh t5, 0(a0) |
| 1944 lh t6, 64(a1) |
| 1945 lh t7, 64(a0) |
| 1946 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) |
| 1947 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0]) |
| 1948 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4]) |
| 1949 addiu t9, zero, 4433 // FIX_0_541196100 |
| 1950 addu s3, s0, s1 // z2 + z3 |
| 1951 mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) |
| 1952 addiu t9, zero, 15137 // FIX_1_847759065 |
| 1953 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) |
| 1954 addu t4, t5, t6 |
| 1955 subu t5, t5, t6 |
| 1956 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS |
| 1957 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS |
| 1958 addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) |
| 1959 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) |
| 1960 addu s0, t4, t7 |
| 1961 subu s1, t4, t7 |
| 1962 addu s2, t5, t6 |
| 1963 subu s3, t5, t6 |
| 1964 addu t4, s0, t3 |
| 1965 subu s0, s0, t3 |
| 1966 addu t3, s2, t1 |
| 1967 subu s2, s2, t1 |
| 1968 addu t1, s3, t2 |
| 1969 subu s3, s3, t2 |
| 1970 addu t2, s1, t0 |
| 1971 subu s1, s1, t0 |
| 1972 shra_r.w t4, t4, 11 |
| 1973 shra_r.w t3, t3, 11 |
| 1974 shra_r.w t1, t1, 11 |
| 1975 shra_r.w t2, t2, 11 |
| 1976 shra_r.w s1, s1, 11 |
| 1977 shra_r.w s3, s3, 11 |
| 1978 shra_r.w s2, s2, 11 |
| 1979 shra_r.w s0, s0, 11 |
| 1980 sw t4, 0(v0) |
| 1981 sw t3, 32(v0) |
| 1982 sw t1, 64(v0) |
| 1983 sw t2, 96(v0) |
| 1984 sw s1, 128(v0) |
| 1985 sw s3, 160(v0) |
| 1986 sw s2, 192(v0) |
| 1987 sw s0, 224(v0) |
| 1988 3: |
| 1989 addiu a1, a1, 2 |
| 1990 addiu a0, a0, 2 |
| 1991 bgtz v1, 1b |
| 1992 addiu v0, v0, 4 |
| 1993 move v0, sp |
| 1994 addiu v1, zero, 8 |
| 1995 4: |
| 1996 lw t0, 8(v0) // z2 = (JLONG) wsptr[2] |
| 1997 lw t1, 24(v0) // z3 = (JLONG) wsptr[6] |
| 1998 lw t2, 0(v0) // (JLONG) wsptr[0] |
| 1999 lw t3, 16(v0) // (JLONG) wsptr[4] |
| 2000 lw s4, 4(v0) // (JLONG) wsptr[1] |
| 2001 lw s5, 12(v0) // (JLONG) wsptr[3] |
| 2002 lw s6, 20(v0) // (JLONG) wsptr[5] |
| 2003 lw s7, 28(v0) // (JLONG) wsptr[7] |
| 2004 or s4, s4, t0 |
| 2005 or s4, s4, t1 |
| 2006 or s4, s4, t3 |
| 2007 or s4, s4, s7 |
| 2008 or s4, s4, s5 |
| 2009 or s4, s4, s6 |
| 2010 bnez s4, 5f |
| 2011 addiu v1, v1, -1 |
| 2012 shra_r.w s5, t2, 5 |
| 2013 andi s5, s5, 0x3ff |
| 2014 lbux s5, s5(a3) |
| 2015 lw s1, 0(a2) |
| 2016 replv.qb s5, s5 |
| 2017 usw s5, 0(s1) |
| 2018 usw s5, 4(s1) |
| 2019 b 6f |
| 2020 nop |
| 2021 5: |
| 2022 addu t4, t0, t1 // z2 + z3 |
| 2023 addiu t8, zero, 4433 // FIX_0_541196100 |
| 2024 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) |
| 2025 addiu t8, zero, 15137 // FIX_1_847759065 |
| 2026 mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) |
| 2027 addiu t8, zero, 6270 // FIX_0_765366865 |
| 2028 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) |
| 2029 addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4] |
| 2030 subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4] |
| 2031 sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS |
| 2032 sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS |
| 2033 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065) |
| 2034 subu t3, t2, t1 // tmp12 = tmp1 - tmp2 |
| 2035 addu t2, t2, t1 // tmp11 = tmp1 + tmp2 |
| 2036 addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) |
| 2037 subu t1, t4, t5 // tmp13 = tmp0 - tmp3 |
| 2038 addu t0, t4, t5 // tmp10 = tmp0 + tmp3 |
| 2039 lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7] |
| 2040 lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3] |
| 2041 lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5] |
| 2042 lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1] |
| 2043 addu s0, t4, t6 // z3 = tmp0 + tmp2 |
| 2044 addiu t8, zero, 9633 // FIX_1_175875602 |
| 2045 addu s1, t5, t7 // z4 = tmp1 + tmp3 |
| 2046 addu s2, s0, s1 // z3 + z4 |
| 2047 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) |
| 2048 addu s3, t4, t7 // z1 = tmp0 + tmp3 |
| 2049 addu t9, t5, t6 // z2 = tmp1 + tmp2 |
| 2050 addiu t8, zero, 16069 // FIX_1_961570560 |
| 2051 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) |
| 2052 addiu t8, zero, 3196 // FIX_0_390180644 |
| 2053 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) |
| 2054 addiu t8, zero, 2446 // FIX_0_298631336 |
| 2055 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) |
| 2056 addiu t8, zero, 7373 // FIX_0_899976223 |
| 2057 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) |
| 2058 addiu t8, zero, 16819 // FIX_2_053119869 |
| 2059 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) |
| 2060 addiu t8, zero, 20995 // FIX_2_562915447 |
| 2061 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) |
| 2062 addiu t8, zero, 25172 // FIX_3_072711026 |
| 2063 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) |
| 2064 addiu t8, zero, 12299 // FIX_1_501321110 |
| 2065 mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) |
| 2066 subu s0, s2, s0 // z3 += z5 |
| 2067 subu s1, s2, s1 // z4 += z5 |
| 2068 addu t4, t4, s0 |
| 2069 subu t4, t4, s3 // tmp0 |
| 2070 addu t5, t5, s1 |
| 2071 subu t5, t5, t9 // tmp1 |
| 2072 addu t6, t6, s0 |
| 2073 subu t6, t6, t9 // tmp2 |
| 2074 addu t7, t7, s1 |
| 2075 subu t7, t7, s3 // tmp3 |
| 2076 addu s0, t0, t7 |
| 2077 subu t0, t0, t7 |
| 2078 addu t7, t2, t6 |
| 2079 subu t2, t2, t6 |
| 2080 addu t6, t3, t5 |
| 2081 subu t3, t3, t5 |
| 2082 addu t5, t1, t4 |
| 2083 subu t1, t1, t4 |
| 2084 shra_r.w s0, s0, 18 |
| 2085 shra_r.w t7, t7, 18 |
| 2086 shra_r.w t6, t6, 18 |
| 2087 shra_r.w t5, t5, 18 |
| 2088 shra_r.w t1, t1, 18 |
| 2089 shra_r.w t3, t3, 18 |
| 2090 shra_r.w t2, t2, 18 |
| 2091 shra_r.w t0, t0, 18 |
| 2092 andi s0, s0, 0x3ff |
| 2093 andi t7, t7, 0x3ff |
| 2094 andi t6, t6, 0x3ff |
| 2095 andi t5, t5, 0x3ff |
| 2096 andi t1, t1, 0x3ff |
| 2097 andi t3, t3, 0x3ff |
| 2098 andi t2, t2, 0x3ff |
| 2099 andi t0, t0, 0x3ff |
| 2100 lw s1, 0(a2) |
| 2101 lbux s0, s0(a3) |
| 2102 lbux t7, t7(a3) |
| 2103 lbux t6, t6(a3) |
| 2104 lbux t5, t5(a3) |
| 2105 lbux t1, t1(a3) |
| 2106 lbux t3, t3(a3) |
| 2107 lbux t2, t2(a3) |
| 2108 lbux t0, t0(a3) |
| 2109 sb s0, 0(s1) |
| 2110 sb t7, 1(s1) |
| 2111 sb t6, 2(s1) |
| 2112 sb t5, 3(s1) |
| 2113 sb t1, 4(s1) |
| 2114 sb t3, 5(s1) |
| 2115 sb t2, 6(s1) |
| 2116 sb t0, 7(s1) |
| 2117 6: |
| 2118 addiu v0, v0, 32 |
| 2119 bgtz v1, 4b |
| 2120 addiu a2, a2, 4 |
| 2121 addiu sp, sp, 256 |
| 2122 |
| 2123 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 2124 |
| 2125 j ra |
| 2126 nop |
| 2127 |
| 2128 END(jsimd_idct_islow_mips_dspr2) |
| 2129 |
| 2130 /*****************************************************************************/ |
| 2131 LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2) |
| 2132 /* |
| 2133 * a0 - inptr |
| 2134 * a1 - quantptr |
| 2135 * a2 - wsptr |
| 2136 * a3 - mips_idct_ifast_coefs |
| 2137 */ |
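| /* |
| * Column pass of the "ifast" (AAN) inverse DCT. Each iteration of the loop |
| * below dequantizes and transforms two columns at once (packed 16-bit |
| * halfwords) and stores eight intermediate rows to wsptr. When all AC terms |
| * of a column pair are zero, the shortcut path simply replicates the |
| * dequantized DC value down the column. The FIX(...) scale factors are |
| * fetched from the mips_idct_ifast_coefs table passed in a3. |
| */ |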
| 2138 |
| 2139 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 2140 |
| 2141 addiu t9, a0, 16 // end address |
| 2142 or AT, a3, zero |
| 2143 |
| 2144 0: |
| 2145 lw s0, 0(a1) // quantptr[DCTSIZE*0] |
| 2146 lw t0, 0(a0) // inptr[DCTSIZE*0] |
| 2147 lw t1, 16(a0) // inptr[DCTSIZE*1] |
| 2148 muleq_s.w.phl v0, t0, s0 // tmp0 ... |
| 2149 lw t2, 32(a0) // inptr[DCTSIZE*2] |
| 2150 lw t3, 48(a0) // inptr[DCTSIZE*3] |
| 2151 lw t4, 64(a0) // inptr[DCTSIZE*4] |
| 2152 lw t5, 80(a0) // inptr[DCTSIZE*5] |
| 2153 muleq_s.w.phr t0, t0, s0 // ... tmp0 ... |
| 2154 lw t6, 96(a0) // inptr[DCTSIZE*6] |
| 2155 lw t7, 112(a0) // inptr[DCTSIZE*7] |
| 2156 or s4, t1, t2 |
| 2157 or s5, t3, t4 |
| 2158 bnez s4, 1f |
| 2159 ins t0, v0, 16, 16 // ... tmp0 |
| 2160 bnez s5, 1f |
| 2161 or s6, t5, t6 |
| 2162 or s6, s6, t7 |
| 2163 bnez s6, 1f |
| 2164 sw t0, 0(a2) // wsptr[DCTSIZE*0] |
| 2165 sw t0, 16(a2) // wsptr[DCTSIZE*1] |
| 2166 sw t0, 32(a2) // wsptr[DCTSIZE*2] |
| 2167 sw t0, 48(a2) // wsptr[DCTSIZE*3] |
| 2168 sw t0, 64(a2) // wsptr[DCTSIZE*4] |
| 2169 sw t0, 80(a2) // wsptr[DCTSIZE*5] |
| 2170 sw t0, 96(a2) // wsptr[DCTSIZE*6] |
| 2171 sw t0, 112(a2) // wsptr[DCTSIZE*7] |
| 2172 addiu a0, a0, 4 |
| 2173 b 2f |
| 2174 addiu a1, a1, 4 |
| 2175 |
| 2176 1: |
| 2177 lw s1, 32(a1) // quantptr[DCTSIZE*2] |
| 2178 lw s2, 64(a1) // quantptr[DCTSIZE*4] |
| 2179 muleq_s.w.phl v0, t2, s1 // tmp1 ... |
| 2180 muleq_s.w.phr t2, t2, s1 // ... tmp1 ... |
| 2181 lw s0, 16(a1) // quantptr[DCTSIZE*1] |
| 2182 lw s1, 48(a1) // quantptr[DCTSIZE*3] |
| 2183 lw s3, 96(a1) // quantptr[DCTSIZE*6] |
| 2184 muleq_s.w.phl v1, t4, s2 // tmp2 ... |
| 2185 muleq_s.w.phr t4, t4, s2 // ... tmp2 ... |
| 2186 lw s2, 80(a1) // quantptr[DCTSIZE*5] |
| 2187 lw t8, 4(AT) // FIX(1.414213562) |
| 2188 ins t2, v0, 16, 16 // ... tmp1 |
| 2189 muleq_s.w.phl v0, t6, s3 // tmp3 ... |
| 2190 muleq_s.w.phr t6, t6, s3 // ... tmp3 ... |
| 2191 ins t4, v1, 16, 16 // ... tmp2 |
| 2192 addq.ph s4, t0, t4 // tmp10 |
| 2193 subq.ph s5, t0, t4 // tmp11 |
| 2194 ins t6, v0, 16, 16 // ... tmp3 |
| 2195 subq.ph s6, t2, t6 // tmp12 ... |
| 2196 addq.ph s7, t2, t6 // tmp13 |
| 2197 mulq_s.ph s6, s6, t8 // ... tmp12 ... |
| 2198 addq.ph t0, s4, s7 // tmp0 |
| 2199 subq.ph t6, s4, s7 // tmp3 |
| 2200 muleq_s.w.phl v0, t1, s0 // tmp4 ... |
| 2201 muleq_s.w.phr t1, t1, s0 // ... tmp4 ... |
| 2202 shll_s.ph s6, s6, 1 // x2 |
| 2203 lw s3, 112(a1) // quantptr[DCTSIZE*7] |
| 2204 subq.ph s6, s6, s7 // ... tmp12 |
| 2205 muleq_s.w.phl v1, t7, s3 // tmp7 ... |
| 2206 muleq_s.w.phr t7, t7, s3 // ... tmp7 ... |
| 2207 ins t1, v0, 16, 16 // ... tmp4 |
| 2208 addq.ph t2, s5, s6 // tmp1 |
| 2209 subq.ph t4, s5, s6 // tmp2 |
| 2210 muleq_s.w.phl v0, t5, s2 // tmp6 ... |
| 2211 muleq_s.w.phr t5, t5, s2 // ... tmp6 ... |
| 2212 ins t7, v1, 16, 16 // ... tmp7 |
| 2213 addq.ph s5, t1, t7 // z11 |
| 2214 subq.ph s6, t1, t7 // z12 |
| 2215 muleq_s.w.phl v1, t3, s1 // tmp5 ... |
| 2216 muleq_s.w.phr t3, t3, s1 // ... tmp5 ... |
| 2217 ins t5, v0, 16, 16 // ... tmp6 |
| 2218 ins t3, v1, 16, 16 // ... tmp5 |
| 2219 addq.ph s7, t5, t3 // z13 |
| 2220 subq.ph v0, t5, t3 // z10 |
| 2221 addq.ph t7, s5, s7 // tmp7 |
| 2222 subq.ph s5, s5, s7 // tmp11 ... |
| 2223 addq.ph v1, v0, s6 // z5 ... |
| 2224 mulq_s.ph s5, s5, t8 // ... tmp11 |
| 2225 lw t8, 8(AT) // FIX(1.847759065) |
| 2226 lw s4, 0(AT) // FIX(1.082392200) |
| 2227 addq.ph s0, t0, t7 |
| 2228 subq.ph s1, t0, t7 |
| 2229 mulq_s.ph v1, v1, t8 // ... z5 |
| 2230 shll_s.ph s5, s5, 1 // x2 |
| 2231 lw t8, 12(AT) // FIX(-2.613125930) |
| 2232 sw s0, 0(a2) // wsptr[DCTSIZE*0] |
| 2233 shll_s.ph v0, v0, 1 // x4 |
| 2234 mulq_s.ph v0, v0, t8 // tmp12 ... |
| 2235 mulq_s.ph s4, s6, s4 // tmp10 ... |
| 2236 shll_s.ph v1, v1, 1 // x2 |
| 2237 addiu a0, a0, 4 |
| 2238 addiu a1, a1, 4 |
| 2239 sw s1, 112(a2) // wsptr[DCTSIZE*7] |
| 2240 shll_s.ph s6, v0, 1 // x4 |
| 2241 shll_s.ph s4, s4, 1 // x2 |
| 2242 addq.ph s6, s6, v1 // ... tmp12 |
| 2243 subq.ph t5, s6, t7 // tmp6 |
| 2244 subq.ph s4, s4, v1 // ... tmp10 |
| 2245 subq.ph t3, s5, t5 // tmp5 |
| 2246 addq.ph s2, t2, t5 |
| 2247 addq.ph t1, s4, t3 // tmp4 |
| 2248 subq.ph s3, t2, t5 |
| 2249 sw s2, 16(a2) // wsptr[DCTSIZE*1] |
| 2250 sw s3, 96(a2) // wsptr[DCTSIZE*6] |
| 2251 addq.ph v0, t4, t3 |
| 2252 subq.ph v1, t4, t3 |
| 2253 sw v0, 32(a2) // wsptr[DCTSIZE*2] |
| 2254 sw v1, 80(a2) // wsptr[DCTSIZE*5] |
| 2255 addq.ph v0, t6, t1 |
| 2256 subq.ph v1, t6, t1 |
| 2257 sw v0, 64(a2) // wsptr[DCTSIZE*4] |
| 2258 sw v1, 48(a2) // wsptr[DCTSIZE*3] |
| 2259 |
| 2260 2: |
| 2261 bne a0, t9, 0b |
| 2262 addiu a2, a2, 4 |
| 2263 |
| 2264 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 2265 |
| 2266 j ra |
| 2267 nop |
| 2268 |
| 2269 END(jsimd_idct_ifast_cols_mips_dspr2) |
| 2270 |
| 2271 /*****************************************************************************/ |
| 2272 LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2) |
| 2273 /* |
| 2274 * a0 - wsptr |
| 2275 * a1 - output_buf |
| 2276 * a2 - output_col |
| 2277 * a3 - mips_idct_ifast_coefs |
| 2278 */ |
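| /* |
| * Row pass of the "ifast" (AAN) inverse DCT. Two wsptr rows are processed per |
| * iteration using packed halfwords; the results are saturated, biased by 0x80 |
| * per byte (the 0x80808080 constant kept in s8) and written as eight bytes to |
| * output_buf[row] + output_col. A row pair whose AC terms are all zero takes |
| * a shortcut that replicates the DC bytes across both rows. |
| */ |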
| 2279 |
| 2280 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 |
| 2281 |
| 2282 addiu t9, a0, 128 // end address |
| 2283 lui s8, 0x8080 |
| 2284 ori s8, s8, 0x8080 |
| 2285 |
| 2286 0: |
| 2287 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) |
| 2288 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a |
| 2289 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A |
| 2290 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c |
| 2291 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C |
| 2292 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e |
| 2293 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E |
| 2294 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g |
| 2295 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G |
| 2296 precrq.ph.w t1, s0, t0 // B b |
| 2297 ins t0, s0, 16, 16 // A a |
| 2298 bnez t1, 1f |
| 2299 or s0, t2, s2 |
| 2300 bnez s0, 1f |
| 2301 or s0, t4, s4 |
| 2302 bnez s0, 1f |
| 2303 or s0, t6, s6 |
| 2304 bnez s0, 1f |
| 2305 shll_s.ph s0, t0, 2 // A a |
| 2306 lw a3, 0(a1) |
| 2307 lw AT, 4(a1) |
| 2308 precrq.ph.w t0, s0, s0 // A A |
| 2309 ins s0, s0, 16, 16 // a a |
| 2310 addu a3, a3, a2 |
| 2311 addu AT, AT, a2 |
| 2312 precrq.qb.ph t0, t0, t0 // A A A A |
| 2313 precrq.qb.ph s0, s0, s0 // a a a a |
| 2314 addu.qb s0, s0, s8 |
| 2315 addu.qb t0, t0, s8 |
| 2316 sw s0, 0(a3) |
| 2317 sw s0, 4(a3) |
| 2318 sw t0, 0(AT) |
| 2319 sw t0, 4(AT) |
| 2320 addiu a0, a0, 32 |
| 2321 bne a0, t9, 0b |
| 2322 addiu a1, a1, 8 |
| 2323 b 2f |
| 2324 nop |
| 2325 |
| 2326 1: |
| 2327 precrq.ph.w t3, s2, t2 |
| 2328 ins t2, s2, 16, 16 |
| 2329 precrq.ph.w t5, s4, t4 |
| 2330 ins t4, s4, 16, 16 |
| 2331 precrq.ph.w t7, s6, t6 |
| 2332 ins t6, s6, 16, 16 |
| 2333 lw t8, 4(AT) // FIX(1.414213562) |
| 2334 addq.ph s4, t0, t4 // tmp10 |
| 2335 subq.ph s5, t0, t4 // tmp11 |
| 2336 subq.ph s6, t2, t6 // tmp12 ... |
| 2337 addq.ph s7, t2, t6 // tmp13 |
| 2338 mulq_s.ph s6, s6, t8 // ... tmp12 ... |
| 2339 addq.ph t0, s4, s7 // tmp0 |
| 2340 subq.ph t6, s4, s7 // tmp3 |
| 2341 shll_s.ph s6, s6, 1 // x2 |
| 2342 subq.ph s6, s6, s7 // ... tmp12 |
| 2343 addq.ph t2, s5, s6 // tmp1 |
| 2344 subq.ph t4, s5, s6 // tmp2 |
| 2345 addq.ph s5, t1, t7 // z11 |
| 2346 subq.ph s6, t1, t7 // z12 |
| 2347 addq.ph s7, t5, t3 // z13 |
| 2348 subq.ph v0, t5, t3 // z10 |
| 2349 addq.ph t7, s5, s7 // tmp7 |
| 2350 subq.ph s5, s5, s7 // tmp11 ... |
| 2351 addq.ph v1, v0, s6 // z5 ... |
| 2352 mulq_s.ph s5, s5, t8 // ... tmp11 |
| 2353 lw t8, 8(AT) // FIX(1.847759065) |
| 2354 lw s4, 0(AT) // FIX(1.082392200) |
| 2355 addq.ph s0, t0, t7 // tmp0 + tmp7 |
| 2356 subq.ph s7, t0, t7 // tmp0 - tmp7 |
| 2357 mulq_s.ph v1, v1, t8 // ... z5 |
| 2358 lw a3, 0(a1) |
| 2359 lw t8, 12(AT) // FIX(-2.613125930) |
| 2360 shll_s.ph s5, s5, 1 // x2 |
| 2361 addu a3, a3, a2 |
| 2362 shll_s.ph v0, v0, 1 // x4 |
| 2363 mulq_s.ph v0, v0, t8 // tmp12 ... |
| 2364 mulq_s.ph s4, s6, s4 // tmp10 ... |
| 2365 shll_s.ph v1, v1, 1 // x2 |
| 2366 addiu a0, a0, 32 |
| 2367 addiu a1, a1, 8 |
| 2368 shll_s.ph s6, v0, 1 // x4 |
| 2369 shll_s.ph s4, s4, 1 // x2 |
| 2370 addq.ph s6, s6, v1 // ... tmp12 |
| 2371 shll_s.ph s0, s0, 2 |
| 2372 subq.ph t5, s6, t7 // tmp6 |
| 2373 subq.ph s4, s4, v1 // ... tmp10 |
| 2374 subq.ph t3, s5, t5 // tmp5 |
| 2375 shll_s.ph s7, s7, 2 |
| 2376 addq.ph t1, s4, t3 // tmp4 |
| 2377 addq.ph s1, t2, t5 // tmp1 + tmp6 |
| 2378 subq.ph s6, t2, t5 // tmp1 - tmp6 |
| 2379 addq.ph s2, t4, t3 // tmp2 + tmp5 |
| 2380 subq.ph s5, t4, t3 // tmp2 - tmp5 |
| 2381 addq.ph s4, t6, t1 // tmp3 + tmp4 |
| 2382 subq.ph s3, t6, t1 // tmp3 - tmp4 |
| 2383 shll_s.ph s1, s1, 2 |
| 2384 shll_s.ph s2, s2, 2 |
| 2385 shll_s.ph s3, s3, 2 |
| 2386 shll_s.ph s4, s4, 2 |
| 2387 shll_s.ph s5, s5, 2 |
| 2388 shll_s.ph s6, s6, 2 |
| 2389 precrq.ph.w t0, s1, s0 // B A |
| 2390 ins s0, s1, 16, 16 // b a |
| 2391 precrq.ph.w t2, s3, s2 // D C |
| 2392 ins s2, s3, 16, 16 // d c |
| 2393 precrq.ph.w t4, s5, s4 // F E |
| 2394 ins s4, s5, 16, 16 // f e |
| 2395 precrq.ph.w t6, s7, s6 // H G |
| 2396 ins s6, s7, 16, 16 // h g |
| 2397 precrq.qb.ph t0, t2, t0 // D C B A |
| 2398 precrq.qb.ph s0, s2, s0 // d c b a |
| 2399 precrq.qb.ph t4, t6, t4 // H G F E |
| 2400 precrq.qb.ph s4, s6, s4 // h g f e |
| 2401 addu.qb s0, s0, s8 |
| 2402 addu.qb s4, s4, s8 |
| 2403 sw s0, 0(a3) // outptr[0/1/2/3] d c b a |
| 2404 sw s4, 4(a3) // outptr[4/5/6/7] h g f e |
| 2405 lw a3, -4(a1) |
| 2406 addu.qb t0, t0, s8 |
| 2407 addu a3, a3, a2 |
| 2408 addu.qb t4, t4, s8 |
| 2409 sw t0, 0(a3) // outptr[0/1/2/3] D C B A |
| 2410 bne a0, t9, 0b |
| 2411 sw t4, 4(a3) // outptr[4/5/6/7] H G F E |
| 2412 |
| 2413 2: |
| 2414 |
| 2415 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 |
| 2416 |
| 2417 j ra |
| 2418 nop |
| 2419 |
| 2420 END(jsimd_idct_ifast_rows_mips_dspr2) |
| 2421 |
| 2422 /*****************************************************************************/ |
| 2423 LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2) |
| 2424 /* |
| 2425 * a0 - data |
| 2426 */ |
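| /* |
| * Accurate (islow) forward DCT, mirroring the two-pass structure of the C |
| * jpeg_fdct_islow(): the loop at label 1 transforms the eight rows using |
| * packed-halfword dot products (dpa.w.ph) with the fixed-point constants |
| * preloaded into t0-t9, and the loop at label 2 transforms the eight columns, |
| * rounding the accumulators with extr_r.w/shra_r.w. |
| */ |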
| 2427 |
| 2428 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 |
| 2429 |
| 2430 lui t0, 6437 |
| 2431 ori t0, 2260 |
| 2432 lui t1, 9633 |
| 2433 ori t1, 11363 |
| 2434 lui t2, 0xd39e |
| 2435 ori t2, 0xe6dc |
| 2436 lui t3, 0xf72d |
| 2437 ori t3, 9633 |
| 2438 lui t4, 2261 |
| 2439 ori t4, 9633 |
| 2440 lui t5, 0xd39e |
| 2441 ori t5, 6437 |
| 2442 lui t6, 9633 |
| 2443 ori t6, 0xd39d |
| 2444 lui t7, 0xe6dc |
| 2445 ori t7, 2260 |
| 2446 lui t8, 4433 |
| 2447 ori t8, 10703 |
| 2448 lui t9, 0xd630 |
| 2449 ori t9, 4433 |
| 2450 li s8, 8 |
| 2451 move a1, a0 |
| 2452 1: |
| 2453 lw s0, 0(a1) // tmp0 = 1|0 |
| 2454 lw s1, 4(a1) // tmp1 = 3|2 |
| 2455 lw s2, 8(a1) // tmp2 = 5|4 |
| 2456 lw s3, 12(a1) // tmp3 = 7|6 |
| 2457 packrl.ph s1, s1, s1 // tmp1 = 2|3 |
| 2458 packrl.ph s3, s3, s3 // tmp3 = 6|7 |
| 2459 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 |
| 2460 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 |
| 2461 mult $0, $0 // ac0 = 0 |
| 2462 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 |
| 2463 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 |
| 2464 mult $ac1, $0, $0 // ac1 = 0 |
| 2465 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 |
| 2466 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 |
| 2467 mult $ac2, $0, $0 // ac2 = 0 |
| 2468 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 |
| 2469 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 |
| 2470 mult $ac3, $0, $0 // ac3 = 0 |
| 2471 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 |
| 2472 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 |
| 2473 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 |
| 2474 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 |
| 2475 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 |
| 2476 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 |
| 2477 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 |
| 2478 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 |
| 2479 addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 |
| 2480 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 |
| 2481 sh s0, 2(a1) |
| 2482 sh s1, 6(a1) |
| 2483 sh s2, 10(a1) |
| 2484 sh s3, 14(a1) |
| 2485 mult $0, $0 // ac0 = 0 |
| 2486 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 |
| 2487 mult $ac1, $0, $0 // ac1 = 0 |
| 2488 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 |
| 2489 sra s4, s5, 16 // tmp4 = t11 |
| 2490 addiu a1, a1, 16 |
| 2491 addiu s8, s8, -1 |
| 2492 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 |
| 2493 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 |
| 2494 addu s2, s5, s4 // tmp2 = t10 + t11 |
| 2495 subu s3, s5, s4 // tmp3 = t10 - t11 |
| 2496 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 |
| 2497 sll s3, s3, 2 // tmp3 = (t10 - t11) << 2 |
| 2498 sh s2, -16(a1) |
| 2499 sh s3, -8(a1) |
| 2500 sh s0, -12(a1) |
| 2501 bgtz s8, 1b |
| 2502 sh s1, -4(a1) |
| 2503 li t0, 2260 |
| 2504 li t1, 11363 |
| 2505 li t2, 9633 |
| 2506 li t3, 6436 |
| 2507 li t4, 6437 |
| 2508 li t5, 2261 |
| 2509 li t6, 11362 |
| 2510 li t7, 2259 |
| 2511 li t8, 4433 |
| 2512 li t9, 10703 |
| 2513 li a1, 10704 |
| 2514 li s8, 8 |
| 2515 |
| 2516 2: |
| 2517 lh a2, 0(a0) // 0 |
| 2518 lh a3, 16(a0) // 8 |
| 2519 lh v0, 32(a0) // 16 |
| 2520 lh v1, 48(a0) // 24 |
| 2521 lh s4, 64(a0) // 32 |
| 2522 lh s5, 80(a0) // 40 |
| 2523 lh s6, 96(a0) // 48 |
| 2524 lh s7, 112(a0) // 56 |
| 2525 addu s2, v0, s5 // tmp2 = 16 + 40 |
| 2526 subu s5, v0, s5 // tmp5 = 16 - 40 |
| 2527 addu s3, v1, s4 // tmp3 = 24 + 32 |
| 2528 subu s4, v1, s4 // tmp4 = 24 - 32 |
| 2529 addu s0, a2, s7 // tmp0 = 0 + 56 |
| 2530 subu s7, a2, s7 // tmp7 = 0 - 56 |
| 2531 addu s1, a3, s6 // tmp1 = 8 + 48 |
| 2532 subu s6, a3, s6 // tmp6 = 8 - 48 |
| 2533 addu a2, s0, s3 // tmp10 = tmp0 + tmp3 |
| 2534 subu v1, s0, s3 // tmp13 = tmp0 - tmp3 |
| 2535 addu a3, s1, s2 // tmp11 = tmp1 + tmp2 |
| 2536 subu v0, s1, s2 // tmp12 = tmp1 - tmp2 |
| 2537 mult s7, t1 // ac0 = tmp7 * c1 |
| 2538 madd s4, t0 // ac0 += tmp4 * c0 |
| 2539 madd s5, t4 // ac0 += tmp5 * c4 |
| 2540 madd s6, t2 // ac0 += tmp6 * c2 |
| 2541 mult $ac1, s7, t2 // ac1 = tmp7 * c2 |
| 2542 msub $ac1, s4, t3 // ac1 -= tmp4 * c3 |
| 2543 msub $ac1, s5, t6 // ac1 -= tmp5 * c6 |
| 2544 msub $ac1, s6, t7 // ac1 -= tmp6 * c7 |
| 2545 mult $ac2, s7, t4 // ac2 = tmp7 * c4 |
| 2546 madd $ac2, s4, t2 // ac2 += tmp4 * c2 |
| 2547 madd $ac2, s5, t5 // ac2 += tmp5 * c5 |
| 2548 msub $ac2, s6, t6 // ac2 -= tmp6 * c6 |
| 2549 mult $ac3, s7, t0 // ac3 = tmp7 * c0 |
| 2550 msub $ac3, s4, t1 // ac3 -= tmp4 * c1 |
| 2551 madd $ac3, s5, t2 // ac3 += tmp5 * c2 |
| 2552 msub $ac3, s6, t3 // ac3 -= tmp6 * c3 |
| 2553 extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 |
| 2554 extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 |
| 2555 extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 |
| 2556 extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 |
| 2557 addiu s8, s8, -1 |
| 2558 addu s4, a2, a3 // tmp4 = tmp10 + tmp11 |
| 2559 subu s5, a2, a3 // tmp5 = tmp10 - tmp11 |
| 2560 sh s0, 16(a0) |
| 2561 sh s1, 48(a0) |
| 2562 sh s2, 80(a0) |
| 2563 sh s3, 112(a0) |
| 2564 mult v0, t8 // ac0 = tmp12 * c8 |
| 2565 madd v1, t9 // ac0 += tmp13 * c9 |
| 2566 mult $ac1, v1, t8 // ac1 = tmp13 * c8 |
| 2567 msub $ac1, v0, a1 // ac1 -= tmp12 * c10 |
| 2568 addiu a0, a0, 2 |
| 2569 extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15 |
| 2570 extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15 |
| 2571 shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2 |
| 2572 shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2 |
| 2573 sh s4, -2(a0) |
| 2574 sh s5, 62(a0) |
| 2575 sh s6, 30(a0) |
| 2576 bgtz s8, 2b |
| 2577 sh s7, 94(a0) |
| 2578 |
| 2579 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 |
| 2580 |
| 2581 jr ra |
| 2582 nop |
| 2583 |
| 2584 END(jsimd_fdct_islow_mips_dspr2) |
| 2585 |
| 2586 /*****************************************************************************/ |
| 2587 LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2) |
| 2588 /* |
| 2589 * a0 - data |
| 2590 */ |
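| /* |
| * Fast (AAN) forward DCT. The loop at label 0 transforms the eight rows two |
| * coefficients at a time using packed-halfword arithmetic; the loop at |
| * label 1 then transforms the eight columns. a1-a3 and s1 hold the AAN |
| * multipliers in 8-bit fixed point, replicated into both halfwords. |
| */ |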
| 2591 .set at |
| 2592 SAVE_REGS_ON_STACK 8, s0, s1 |
| 2593 li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff) |
| 2594 li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff) |
| 2595 li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff) |
| 2596 li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff) |
| 2597 |
| 2598 move v0, a0 |
| 2599 addiu v1, v0, 128 // end address |
| 2600 |
| 2601 0: |
| 2602 lw t0, 0(v0) // tmp0 = 1|0 |
| 2603 lw t1, 4(v0) // tmp1 = 3|2 |
| 2604 lw t2, 8(v0) // tmp2 = 5|4 |
| 2605 lw t3, 12(v0) // tmp3 = 7|6 |
| 2606 packrl.ph t1, t1, t1 // tmp1 = 2|3 |
| 2607 packrl.ph t3, t3, t3 // tmp3 = 6|7 |
| 2608 subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4 |
| 2609 subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7 |
| 2610 addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3 |
| 2611 addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0 |
| 2612 addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10 |
| 2613 subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13 |
| 2614 sra t4, t8, 16 // tmp4 = t11 |
| 2615 mult $0, $0 // ac0 = 0 |
| 2616 dpa.w.ph $ac0, t9, s1 |
| 2617 mult $ac1, $0, $0 // ac1 = 0 |
| 2618 dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98 |
| 2619 dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98 |
| 2620 mult $ac2, $0, $0 // ac2 = 0 |
| 2621 dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139 |
| 2622 mult $ac3, $0, $0 // ac3 = 0 |
| 2623 dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334 |
| 2624 precrq.ph.w t0, t5, t7 // t0 = t5|t6 |
| 2625 addq.ph t2, t8, t4 // tmp2 = t10 + t11 |
| 2626 subq.ph t3, t8, t4 // tmp3 = t10 - t11 |
| 2627 extr.w t4, $ac0, 8 // t4 = z1 = MULTIPLY(t12+t13, 181) |
| 2628 mult $0, $0 // ac0 = 0 |
| 2629 dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181 |
| 2630 extr.w t0, $ac1, 8 // t0 = z5 |
| 2631 extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139) |
| 2632 extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334) |
| 2633 extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181) |
| 2634 add t6, t1, t0 // t6 = z2 |
| 2635 add t7, t7, t0 // t7 = z4 |
| 2636 subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3 |
| 2637 addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3 |
| 2638 addq.ph t1, t0, t6 // t1 = z13 + z2 |
| 2639 subq.ph t6, t0, t6 // t6 = z13 - z2 |
| 2640 addq.ph t0, t8, t7 // t0 = z11 + z4 |
| 2641 subq.ph t7, t8, t7 // t7 = z11 - z4 |
| 2642 addq.ph t5, t4, t9 |
| 2643 subq.ph t4, t9, t4 |
| 2644 sh t2, 0(v0) |
| 2645 sh t5, 4(v0) |
| 2646 sh t3, 8(v0) |
| 2647 sh t4, 12(v0) |
| 2648 sh t1, 10(v0) |
| 2649 sh t6, 6(v0) |
| 2650 sh t0, 2(v0) |
| 2651 sh t7, 14(v0) |
| 2652 addiu v0, 16 |
| 2653 bne v1, v0, 0b |
| 2654 nop |
| 2655 move v0, a0 |
| 2656 addiu v1, v0, 16 |
| 2657 |
| 2658 1: |
| 2659 lh t0, 0(v0) // 0 |
| 2660 lh t1, 16(v0) // 8 |
| 2661 lh t2, 32(v0) // 16 |
| 2662 lh t3, 48(v0) // 24 |
| 2663 lh t4, 64(v0) // 32 |
| 2664 lh t5, 80(v0) // 40 |
| 2665 lh t6, 96(v0) // 48 |
| 2666 lh t7, 112(v0) // 56 |
| 2667 add t8, t0, t7 // t8 = tmp0 |
| 2668 sub t7, t0, t7 // t7 = tmp7 |
| 2669 add t0, t1, t6 // t0 = tmp1 |
| 2670 sub t1, t1, t6 // t1 = tmp6 |
| 2671 add t6, t2, t5 // t6 = tmp2 |
| 2672 sub t5, t2, t5 // t5 = tmp5 |
| 2673 add t2, t3, t4 // t2 = tmp3 |
| 2674 sub t3, t3, t4 // t3 = tmp4 |
| 2675 add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3 |
| 2676 sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3 |
| 2677 sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2 |
| 2678 ins t8, s0, 16, 16 // t8 = tmp12|tmp13 |
| 2679 add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2 |
| 2680 mult $0, $0 // ac0 = 0 |
| 2681 dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181 |
| 2682 add s0, t4, t2 // t8 = tmp10+tmp11 |
| 2683 sub t4, t4, t2 // t4 = tmp10-tmp11 |
| 2684 sh s0, 0(v0) |
| 2685 sh t4, 64(v0) |
| 2686 extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781) |
| 2687 addq.ph t4, t8, t2 // t4 = tmp13 + z1 |
| 2688 subq.ph t8, t8, t2 // t8 = tmp13 - z1 |
| 2689 sh t4, 32(v0) |
| 2690 sh t8, 96(v0) |
| 2691 add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5 |
| 2692 add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6 |
| 2693 add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7 |
| 2694 andi t4, a1, 0xffff |
| 2695 mul s0, t1, t4 |
| 2696 sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965) |
| 2697 ins t1, t3, 16, 16 // t1 = tmp10|tmp12 |
| 2698 mult $0, $0 // ac0 = 0 |
| 2699 mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98 |
| 2700 extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433) |
| 2701 add t2, t7, t8 // t2 = tmp7 + z5 |
| 2702 sub t7, t7, t8 // t7 = tmp7 - z5 |
| 2703 andi t4, a2, 0xffff |
| 2704 mul t8, t3, t4 |
| 2705 sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100) |
| 2706 andi t4, s1, 0xffff |
| 2707 mul t6, t0, t4 |
| 2708 sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781) |
| 2709 add t0, t6, t8 // t0 = z3 + z2 |
| 2710 sub t1, t6, t8 // t1 = z3 - z2 |
| 2711 add t3, t6, s0 // t3 = z3 + z4 |
| 2712 sub t4, t6, s0 // t4 = z3 - z4 |
| 2713 sub t5, t2, t1 // t5 = dataptr[5] |
| 2714 sub t6, t7, t0 // t6 = dataptr[3] |
| 2715 add t3, t2, t3 // t3 = dataptr[1] |
| 2716 add t4, t7, t4 // t4 = dataptr[7] |
| 2717 sh t5, 80(v0) |
| 2718 sh t6, 48(v0) |
| 2719 sh t3, 16(v0) |
| 2720 sh t4, 112(v0) |
| 2721 addiu v0, 2 |
| 2722 bne v0, v1, 1b |
| 2723 nop |
| 2724 |
| 2725 RESTORE_REGS_FROM_STACK 8, s0, s1 |
| 2726 |
| 2727 j ra |
| 2728 nop |
| 2729 END(jsimd_fdct_ifast_mips_dspr2) |
| 2730 |
| 2731 /*****************************************************************************/ |
| 2732 LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2) |
| 2733 /* |
| 2734 * a0 - coef_block |
| 2735 * a1 - divisors |
| 2736 * a2 - workspace |
| 2737 */ |
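| /* |
| * Integer quantization of one 64-coefficient block. Each workspace value is |
| * made positive (sign = x >> 15; x *= 2*sign + 1), the correction term is |
| * added, the result is multiplied by the precomputed reciprocal, shifted |
| * right by (scale + 16) and the sign is restored. The divisors table is four |
| * consecutive 64-entry halfword arrays: reciprocals at offset 0, corrections |
| * at +128 bytes and shift counts at +384 bytes, as read by the loads below. |
| */ |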
| 2738 |
| 2739 .set at |
| 2740 |
| 2741 SAVE_REGS_ON_STACK 16, s0, s1, s2 |
| 2742 |
| 2743 addiu v0, a2, 124 // v0 = workspace_end |
| 2744 lh t0, 0(a2) |
| 2745 lh t1, 0(a1) |
| 2746 lh t2, 128(a1) |
| 2747 sra t3, t0, 15 |
| 2748 sll t3, t3, 1 |
| 2749 addiu t3, t3, 1 |
| 2750 mul t0, t0, t3 |
| 2751 lh t4, 384(a1) |
| 2752 lh t5, 130(a1) |
| 2753 lh t6, 2(a2) |
| 2754 lh t7, 2(a1) |
| 2755 lh t8, 386(a1) |
| 2756 |
| 2757 1: |
| 2758 andi t1, 0xffff |
| 2759 add t9, t0, t2 |
| 2760 andi t9, 0xffff |
| 2761 mul v1, t9, t1 |
| 2762 sra s0, t6, 15 |
| 2763 sll s0, s0, 1 |
| 2764 addiu s0, s0, 1 |
| 2765 addiu t9, t4, 16 |
| 2766 srav v1, v1, t9 |
| 2767 mul v1, v1, t3 |
| 2768 mul t6, t6, s0 |
| 2769 andi t7, 0xffff |
| 2770 addiu a2, a2, 4 |
| 2771 addiu a1, a1, 4 |
| 2772 add s1, t6, t5 |
| 2773 andi s1, 0xffff |
| 2774 sh v1, 0(a0) |
| 2775 |
| 2776 mul s2, s1, t7 |
| 2777 addiu s1, t8, 16 |
| 2778 srav s2, s2, s1 |
| 2779 mul s2, s2, s0 |
| 2780 lh t0, 0(a2) |
| 2781 lh t1, 0(a1) |
| 2782 sra t3, t0, 15 |
| 2783 sll t3, t3, 1 |
| 2784 addiu t3, t3, 1 |
| 2785 mul t0, t0, t3 |
| 2786 lh t2, 128(a1) |
| 2787 lh t4, 384(a1) |
| 2788 lh t5, 130(a1) |
| 2789 lh t8, 386(a1) |
| 2790 lh t6, 2(a2) |
| 2791 lh t7, 2(a1) |
| 2792 sh s2, 2(a0) |
| 2793 lh t0, 0(a2) |
| 2794 sra t3, t0, 15 |
| 2795 sll t3, t3, 1 |
| 2796 addiu t3, t3, 1 |
| 2797 mul t0, t0, t3 |
| 2798 bne a2, v0, 1b |
| 2799 addiu a0, a0, 4 |
| 2800 |
| 2801 andi t1, 0xffff |
| 2802 add t9, t0, t2 |
| 2803 andi t9, 0xffff |
| 2804 mul v1, t9, t1 |
| 2805 sra s0, t6, 15 |
| 2806 sll s0, s0, 1 |
| 2807 addiu s0, s0, 1 |
| 2808 addiu t9, t4, 16 |
| 2809 srav v1, v1, t9 |
| 2810 mul v1, v1, t3 |
| 2811 mul t6, t6, s0 |
| 2812 andi t7, 0xffff |
| 2813 sh v1, 0(a0) |
| 2814 add s1, t6, t5 |
| 2815 andi s1, 0xffff |
| 2816 mul s2, s1, t7 |
| 2817 addiu s1, t8, 16 |
| 2818 addiu a2, a2, 4 |
| 2819 addiu a1, a1, 4 |
| 2820 srav s2, s2, s1 |
| 2821 mul s2, s2, s0 |
| 2822 sh s2, 2(a0) |
| 2823 |
| 2824 RESTORE_REGS_FROM_STACK 16, s0, s1, s2 |
| 2825 |
| 2826 j ra |
| 2827 nop |
| 2828 |
| 2829 END(jsimd_quantize_mips_dspr2) |
| 2830 |
| 2831 /*****************************************************************************/ |
| 2832 LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2) |
| 2833 /* |
| 2834 * a0 - coef_block |
| 2835 * a1 - divisors |
| 2836 * a2 - workspace |
| 2837 */ |
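| /* |
| * Floating-point quantization: each output coefficient is computed as |
| * (int)(workspace[i] * divisors[i] + 16384.5) - 16384, so the truncation in |
| * trunc.w.s rounds the scaled value to the nearest integer. Eight |
| * coefficients are produced per loop iteration. |
| */ |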
| 2838 |
| 2839 .set at |
| 2840 |
| 2841 li t1, 0x46800100 // 16384.5, as an IEEE-754 single-precision bit pattern |
| 2842 mtc1 t1, f0 |
| 2843 li t0, 63 |
| 2844 0: |
| 2845 lwc1 f2, 0(a2) |
| 2846 lwc1 f10, 0(a1) |
| 2847 lwc1 f4, 4(a2) |
| 2848 lwc1 f12, 4(a1) |
| 2849 lwc1 f6, 8(a2) |
| 2850 lwc1 f14, 8(a1) |
| 2851 lwc1 f8, 12(a2) |
| 2852 lwc1 f16, 12(a1) |
| 2853 madd.s f2, f0, f2, f10 |
| 2854 madd.s f4, f0, f4, f12 |
| 2855 madd.s f6, f0, f6, f14 |
| 2856 madd.s f8, f0, f8, f16 |
| 2857 lwc1 f10, 16(a1) |
| 2858 lwc1 f12, 20(a1) |
| 2859 trunc.w.s f2, f2 |
| 2860 trunc.w.s f4, f4 |
| 2861 trunc.w.s f6, f6 |
| 2862 trunc.w.s f8, f8 |
| 2863 lwc1 f14, 24(a1) |
| 2864 lwc1 f16, 28(a1) |
| 2865 mfc1 t1, f2 |
| 2866 mfc1 t2, f4 |
| 2867 mfc1 t3, f6 |
| 2868 mfc1 t4, f8 |
| 2869 lwc1 f2, 16(a2) |
| 2870 lwc1 f4, 20(a2) |
| 2871 lwc1 f6, 24(a2) |
| 2872 lwc1 f8, 28(a2) |
| 2873 madd.s f2, f0, f2, f10 |
| 2874 madd.s f4, f0, f4, f12 |
| 2875 madd.s f6, f0, f6, f14 |
| 2876 madd.s f8, f0, f8, f16 |
| 2877 addiu t1, t1, -16384 |
| 2878 addiu t2, t2, -16384 |
| 2879 addiu t3, t3, -16384 |
| 2880 addiu t4, t4, -16384 |
| 2881 trunc.w.s f2, f2 |
| 2882 trunc.w.s f4, f4 |
| 2883 trunc.w.s f6, f6 |
| 2884 trunc.w.s f8, f8 |
| 2885 sh t1, 0(a0) |
| 2886 sh t2, 2(a0) |
| 2887 sh t3, 4(a0) |
| 2888 sh t4, 6(a0) |
| 2889 mfc1 t1, f2 |
| 2890 mfc1 t2, f4 |
| 2891 mfc1 t3, f6 |
| 2892 mfc1 t4, f8 |
| 2893 addiu t0, t0, -8 |
| 2894 addiu a2, a2, 32 |
| 2895 addiu a1, a1, 32 |
| 2896 addiu t1, t1, -16384 |
| 2897 addiu t2, t2, -16384 |
| 2898 addiu t3, t3, -16384 |
| 2899 addiu t4, t4, -16384 |
| 2900 sh t1, 8(a0) |
| 2901 sh t2, 10(a0) |
| 2902 sh t3, 12(a0) |
| 2903 sh t4, 14(a0) |
| 2904 bgez t0, 0b |
| 2905 addiu a0, a0, 16 |
| 2906 |
| 2907 j ra |
| 2908 nop |
| 2909 |
| 2910 END(jsimd_quantize_float_mips_dspr2) |
| 2911 /*****************************************************************************/ |
| 2912 LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2) |
| 2913 /* |
| 2914 * a0 - compptr->dct_table |
| 2915 * a1 - coef_block |
| 2916 * a2 - output_buf |
| 2917 * a3 - output_col |
| 2918 */ |
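| /* |
| * 2x2 reduced-size inverse DCT: only rows 0, 1, 3, 5, 7 and columns |
| * 0, 1, 3, 5, 7 of the coefficient block contribute. Pass 1 accumulates the |
| * column results into a small stack workspace with dpa.w.ph and the packed |
| * constants in s0/s1; pass 2 combines the workspace values with the |
| * multipliers in s2-s5, descales, saturates with shll_s.w/sra and adds the |
| * +128 level shift before storing two pixels to each of the two output rows. |
| */ |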
| 2919 .set at |
| 2920 |
| 2921 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 |
| 2922 |
| 2923 addiu sp, sp, -40 |
| 2924 move v0, sp |
| 2925 addiu s2, zero, 29692 // FIX_3_624509785 |
| 2926 addiu s3, zero, -10426 // -FIX_1_272758580 |
| 2927 addiu s4, zero, 6967 // FIX_0_850430095 |
| 2928 addiu s5, zero, -5906 // -FIX_0_720959822 |
| 2929 lh t0, 0(a1) // t0 = inptr[DCTSIZE*0] |
| 2930 lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0] |
| 2931 lh t1, 48(a1) // t1 = inptr[DCTSIZE*3] |
| 2932 lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3] |
| 2933 mul t4, t5, t0 |
| 2934 lh t0, 16(a1) // t0 = inptr[DCTSIZE*1] |
| 2935 lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1] |
| 2936 mul t6, t6, t1 |
| 2937 mul t5, t5, t0 |
| 2938 lh t2, 80(a1) // t2 = inptr[DCTSIZE*5] |
| 2939 lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5] |
| 2940 lh t3, 112(a1) // t3 = inptr[DCTSIZE*7] |
| 2941 lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7] |
| 2942 mul t7, t7, t2 |
| 2943 mult zero, zero |
| 2944 mul t8, t8, t3 |
| 2945 li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff) |
| 2946 li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff) |
| 2947 ins t6, t5, 16, 16 // t6 = t5|t6 |
| 2948 sll t4, t4, 15 |
| 2949 dpa.w.ph $ac0, t6, s0 |
| 2950 lh t1, 2(a1) |
| 2951 lh t6, 2(a0) |
| 2952 ins t8, t7, 16, 16 // t8 = t7|t8 |
| 2953 dpa.w.ph $ac0, t8, s1 |
| 2954 mflo t0, $ac0 |
| 2955 mul t5, t6, t1 |
| 2956 lh t1, 18(a1) |
| 2957 lh t6, 18(a0) |
| 2958 lh t2, 50(a1) |
| 2959 lh t7, 50(a0) |
| 2960 mul t6, t6, t1 |
| 2961 subu t8, t4, t0 |
| 2962 mul t7, t7, t2 |
| 2963 addu t0, t4, t0 |
| 2964 shra_r.w t0, t0, 13 |
| 2965 lh t1, 82(a1) |
| 2966 lh t2, 82(a0) |
| 2967 lh t3, 114(a1) |
| 2968 lh t4, 114(a0) |
| 2969 shra_r.w t8, t8, 13 |
| 2970 mul t1, t1, t2 |
| 2971 mul t3, t3, t4 |
| 2972 sw t0, 0(v0) |
| 2973 sw t8, 20(v0) |
| 2974 sll t4, t5, 15 |
| 2975 ins t7, t6, 16, 16 |
| 2976 mult zero, zero |
| 2977 dpa.w.ph $ac0, t7, s0 |
| 2978 ins t3, t1, 16, 16 |
| 2979 lh t1, 6(a1) |
| 2980 lh t6, 6(a0) |
| 2981 dpa.w.ph $ac0, t3, s1 |
| 2982 mflo t0, $ac0 |
| 2983 mul t5, t6, t1 |
| 2984 lh t1, 22(a1) |
| 2985 lh t6, 22(a0) |
| 2986 lh t2, 54(a1) |
| 2987 lh t7, 54(a0) |
| 2988 mul t6, t6, t1 |
| 2989 subu t8, t4, t0 |
| 2990 mul t7, t7, t2 |
| 2991 addu t0, t4, t0 |
| 2992 shra_r.w t0, t0, 13 |
| 2993 lh t1, 86(a1) |
| 2994 lh t2, 86(a0) |
| 2995 lh t3, 118(a1) |
| 2996 lh t4, 118(a0) |
| 2997 shra_r.w t8, t8, 13 |
| 2998 mul t1, t1, t2 |
| 2999 mul t3, t3, t4 |
| 3000 sw t0, 4(v0) |
| 3001 sw t8, 24(v0) |
| 3002 sll t4, t5, 15 |
| 3003 ins t7, t6, 16, 16 |
| 3004 mult zero, zero |
| 3005 dpa.w.ph $ac0, t7, s0 |
| 3006 ins t3, t1, 16, 16 |
| 3007 lh t1, 10(a1) |
| 3008 lh t6, 10(a0) |
| 3009 dpa.w.ph $ac0, t3, s1 |
| 3010 mflo t0, $ac0 |
| 3011 mul t5, t6, t1 |
| 3012 lh t1, 26(a1) |
| 3013 lh t6, 26(a0) |
| 3014 lh t2, 58(a1) |
| 3015 lh t7, 58(a0) |
| 3016 mul t6, t6, t1 |
| 3017 subu t8, t4, t0 |
| 3018 mul t7, t7, t2 |
| 3019 addu t0, t4, t0 |
| 3020 shra_r.w t0, t0, 13 |
| 3021 lh t1, 90(a1) |
| 3022 lh t2, 90(a0) |
| 3023 lh t3, 122(a1) |
| 3024 lh t4, 122(a0) |
| 3025 shra_r.w t8, t8, 13 |
| 3026 mul t1, t1, t2 |
| 3027 mul t3, t3, t4 |
| 3028 sw t0, 8(v0) |
| 3029 sw t8, 28(v0) |
| 3030 sll t4, t5, 15 |
| 3031 ins t7, t6, 16, 16 |
| 3032 mult zero, zero |
| 3033 dpa.w.ph $ac0, t7, s0 |
| 3034 ins t3, t1, 16, 16 |
| 3035 lh t1, 14(a1) |
| 3036 lh t6, 14(a0) |
| 3037 dpa.w.ph $ac0, t3, s1 |
| 3038 mflo t0, $ac0 |
| 3039 mul t5, t6, t1 |
| 3040 lh t1, 30(a1) |
| 3041 lh t6, 30(a0) |
| 3042 lh t2, 62(a1) |
| 3043 lh t7, 62(a0) |
| 3044 mul t6, t6, t1 |
| 3045 subu t8, t4, t0 |
| 3046 mul t7, t7, t2 |
| 3047 addu t0, t4, t0 |
| 3048 shra_r.w t0, t0, 13 |
| 3049 lh t1, 94(a1) |
| 3050 lh t2, 94(a0) |
| 3051 lh t3, 126(a1) |
| 3052 lh t4, 126(a0) |
| 3053 shra_r.w t8, t8, 13 |
| 3054 mul t1, t1, t2 |
| 3055 mul t3, t3, t4 |
| 3056 sw t0, 12(v0) |
| 3057 sw t8, 32(v0) |
| 3058 sll t4, t5, 15 |
| 3059 ins t7, t6, 16, 16 |
| 3060 mult zero, zero |
| 3061 dpa.w.ph $ac0, t7, s0 |
| 3062 ins t3, t1, 16, 16 |
| 3063 dpa.w.ph $ac0, t3, s1 |
| 3064 mflo t0, $ac0 |
| 3065 lw t9, 0(a2) |
| 3066 lw t3, 0(v0) |
| 3067 lw t7, 4(v0) |
| 3068 lw t1, 8(v0) |
| 3069 addu t9, t9, a3 |
| 3070 sll t3, t3, 15 |
| 3071 subu t8, t4, t0 |
| 3072 addu t0, t4, t0 |
| 3073 shra_r.w t0, t0, 13 |
| 3074 shra_r.w t8, t8, 13 |
| 3075 sw t0, 16(v0) |
| 3076 sw t8, 36(v0) |
| 3077 lw t5, 12(v0) |
| 3078 lw t6, 16(v0) |
| 3079 mult t7, s2 |
| 3080 madd t1, s3 |
| 3081 madd t5, s4 |
| 3082 madd t6, s5 |
| 3083 lw t5, 24(v0) |
| 3084 lw t7, 28(v0) |
| 3085 mflo t0, $ac0 |
| 3086 lw t8, 32(v0) |
| 3087 lw t2, 36(v0) |
| 3088 mult $ac1, t5, s2 |
| 3089 madd $ac1, t7, s3 |
| 3090 madd $ac1, t8, s4 |
| 3091 madd $ac1, t2, s5 |
| 3092 addu t1, t3, t0 |
| 3093 subu t6, t3, t0 |
| 3094 shra_r.w t1, t1, 20 |
| 3095 shra_r.w t6, t6, 20 |
| 3096 mflo t4, $ac1 |
| 3097 shll_s.w t1, t1, 24 |
| 3098 shll_s.w t6, t6, 24 |
| 3099 sra t1, t1, 24 |
| 3100 sra t6, t6, 24 |
| 3101 addiu t1, t1, 128 |
| 3102 addiu t6, t6, 128 |
| 3103 lw t0, 20(v0) |
| 3104 sb t1, 0(t9) |
| 3105 sb t6, 1(t9) |
| 3106 sll t0, t0, 15 |
| 3107 lw t9, 4(a2) |
| 3108 addu t1, t0, t4 |
| 3109 subu t6, t0, t4 |
| 3110 addu t9, t9, a3 |
| 3111 shra_r.w t1, t1, 20 |
| 3112 shra_r.w t6, t6, 20 |
| 3113 shll_s.w t1, t1, 24 |
| 3114 shll_s.w t6, t6, 24 |
| 3115 sra t1, t1, 24 |
| 3116 sra t6, t6, 24 |
| 3117 addiu t1, t1, 128 |
| 3118 addiu t6, t6, 128 |
| 3119 sb t1, 0(t9) |
| 3120 sb t6, 1(t9) |
| 3121 addiu sp, sp, 40 |
| 3122 |
| 3123 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 |
| 3124 |
| 3125 j ra |
| 3126 nop |
| 3127 |
| 3128 END(jsimd_idct_2x2_mips_dspr2) |
| 3129 |
| 3130 /*****************************************************************************/ |
| 3131 LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2) |
| 3132 /* |
| 3133 * a0 - compptr->dct_table |
| 3134 * a1 - coef_block |
| 3135 * a2 - output_buf |
| 3136 * a3 - output_col |
| 3137 * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes |
| 3138 */ |
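| /* |
| * 4x4 reduced-size inverse DCT. Pass 1 walks the input columns (column 4 is |
| * not used) and stores four results per column into the caller-supplied |
| * workspace; pass 2 processes the four workspace rows, descales by 19 bits, |
| * saturates, adds the +128 level shift and writes four pixels to each of the |
| * four output rows at output_buf[ctr] + output_col. |
| */ |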
| 3139 |
| 3140 .set at |
| 3141 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 3142 |
| 3143 lw v1, 48(sp) |
| 3144 move t0, a1 |
| 3145 move t1, v1 |
| 3146 li t9, 4 |
| 3147 li s0, 0x2e75f93e // (FIX_1_451774981 << 16) | (-FIX_0_211164243 & 0xffff) |
| 3148 li s1, 0x21f9ba79 // (FIX_1_061594337 << 16) | (-FIX_2_172734803 & 0xffff) |
| 3149 li s2, 0xecc2efb0 // (-FIX_0_601344887 << 16) | (-FIX_0_509795579 & 0xffff) |
| 3150 li s3, 0x52031ccd // (FIX_2_562915447 << 16) | (FIX_0_899976223 & 0xffff) |
| 3151 |
| 3152 0: |
| 3153 lh s6, 32(t0) // inptr[DCTSIZE*2] |
| 3154 lh t6, 32(a0) // quantptr[DCTSIZE*2] |
| 3155 lh s7, 96(t0) // inptr[DCTSIZE*6] |
| 3156 lh t7, 96(a0) // quantptr[DCTSIZE*6] |
| 3157 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) |
| 3158 lh s4, 0(t0) // inptr[DCTSIZE*0] |
| 3159 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) |
| 3160 lh s5, 0(a0) // quantptr[0] |
| 3161 li s6, 15137 |
| 3162 li s7, 6270 |
| 3163 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) |
| 3164 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) |
| 3165 lh t5, 112(t0) // inptr[DCTSIZE*7] |
| 3166 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) |
| 3167 lh s4, 112(a0) // quantptr[DCTSIZE*7] |
| 3168 lh v0, 80(t0) // inptr[DCTSIZE*5] |
| 3169 lh s5, 80(a0) // quantptr[DCTSIZE*5] |
| 3170 lh s6, 48(a0) // quantptr[DCTSIZE*3] |
| 3171 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) |
| 3172 lh s7, 16(a0) // quantptr[DCTSIZE*1] |
| 3173 lh t8, 16(t0) // inptr[DCTSIZE*1] |
| 3174 subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) |
| 3175 lh t7, 48(t0) // inptr[DCTSIZE*3] |
| 3176 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) |
| 3177 mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) |
| 3178 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) |
| 3179 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) |
| 3180 addu t3, t2, t6 // tmp10 = tmp0 + z2 |
| 3181 subu t4, t2, t6 // tmp10 = tmp0 - z2 |
| 3182 mult $ac0, zero, zero |
| 3183 mult $ac1, zero, zero |
| 3184 ins t5, v0, 16, 16 |
| 3185 ins t7, t8, 16, 16 |
| 3186 addiu t9, t9, -1 |
| 3187 dpa.w.ph $ac0, t5, s0 |
| 3188 dpa.w.ph $ac0, t7, s1 |
| 3189 dpa.w.ph $ac1, t5, s2 |
| 3190 dpa.w.ph $ac1, t7, s3 |
| 3191 mflo s4, $ac0 |
| 3192 mflo s5, $ac1 |
| 3193 addiu a0, a0, 2 |
| 3194 addiu t1, t1, 4 |
| 3195 addiu t0, t0, 2 |
| 3196 addu t6, t4, s4 |
| 3197 subu t5, t4, s4 |
| 3198 addu s6, t3, s5 |
| 3199 subu s7, t3, s5 |
| 3200 shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12) |
| 3201 shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12) |
| 3202 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) |
| 3203 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) |
| 3204 sw t6, 28(t1) |
| 3205 sw t5, 60(t1) |
| 3206 sw s6, -4(t1) |
| 3207 bgtz t9, 0b |
| 3208 sw s7, 92(t1) |
| 3209 // second loop: process the remaining three columns (column 4 is skipped) |
| 3210 li t9, 3 |
| 3211 1: |
| 3212 lh s6, 34(t0) // inptr[DCTSIZE*2] |
| 3213 lh t6, 34(a0) // quantptr[DCTSIZE*2] |
| 3214 lh s7, 98(t0) // inptr[DCTSIZE*6] |
| 3215 lh t7, 98(a0) // quantptr[DCTSIZE*6] |
| 3216 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) |
| 3217 lh s4, 2(t0) // inptr[DCTSIZE*0] |
| 3218 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) |
| 3219 lh s5, 2(a0) // quantptr[DCTSIZE*0] |
| 3220 li s6, 15137 |
| 3221 li s7, 6270 |
| 3222 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0]) |
| 3223 mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2]) |
| 3224 lh t5, 114(t0) // inptr[DCTSIZE*7] |
| 3225 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6]) |
| 3226 lh s4, 114(a0) // quantptr[DCTSIZE*7] |
| 3227 lh s5, 82(a0) // quantptr[DCTSIZE*5] |
| 3228 lh t6, 82(t0) // inptr[DCTSIZE*5] |
| 3229 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1) |
| 3230 lh s6, 50(a0) // quantptr[DCTSIZE*3] |
| 3231 lh t8, 18(t0) // inptr[DCTSIZE*1] |
| 3232 subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6) |
| 3233 lh t7, 50(t0) // inptr[DCTSIZE*3] |
| 3234 lh s7, 18(a0) // quantptr[DCTSIZE*1] |
| 3235 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7]) |
| 3236 mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5]) |
| 3237 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3]) |
| 3238 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1]) |
| 3239 addu t3, t2, v0 // tmp10 = tmp0 + z2 |
| 3240 subu t4, t2, v0 // tmp10 = tmp0 - z2 |
| 3241 mult $ac0, zero, zero |
| 3242 mult $ac1, zero, zero |
| 3243 ins t5, t6, 16, 16 |
| 3244 ins t7, t8, 16, 16 |
| 3245 dpa.w.ph $ac0, t5, s0 |
| 3246 dpa.w.ph $ac0, t7, s1 |
| 3247 dpa.w.ph $ac1, t5, s2 |
| 3248 dpa.w.ph $ac1, t7, s3 |
| 3249 mflo t5, $ac0 |
| 3250 mflo t6, $ac1 |
| 3251 addiu t9, t9, -1 |
| 3252 addiu t0, t0, 2 |
| 3253 addiu a0, a0, 2 |
| 3254 addiu t1, t1, 4 |
| 3255 addu s5, t4, t5 |
| 3256 subu s4, t4, t5 |
| 3257 addu s6, t3, t6 |
| 3258 subu s7, t3, t6 |
| 3259 shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12) |
| 3260 shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12) |
| 3261 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12) |
| 3262 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12) |
| 3263 sw s5, 32(t1) |
| 3264 sw s4, 64(t1) |
| 3265 sw s6, 0(t1) |
| 3266 bgtz t9, 1b |
| 3267 sw s7, 96(t1) |
| 3268 move t1, v1 |
| 3269 li s4, 15137 |
| 3270 lw s6, 8(t1) // wsptr[2] |
| 3271 li s5, 6270 |
| 3272 lw s7, 24(t1) // wsptr[6] |
| 3273 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) |
| 3274 lw t2, 0(t1) // wsptr[0] |
| 3275 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) |
| 3276 lh t5, 28(t1) // wsptr[7] |
| 3277 lh t6, 20(t1) // wsptr[5] |
| 3278 lh t7, 12(t1) // wsptr[3] |
| 3279 lh t8, 4(t1) // wsptr[1] |
| 3280 ins t5, t6, 16, 16 |
| 3281 ins t7, t8, 16, 16 |
| 3282 mult $ac0, zero, zero |
| 3283 dpa.w.ph $ac0, t5, s0 |
| 3284 dpa.w.ph $ac0, t7, s1 |
| 3285 mult $ac1, zero, zero |
| 3286 dpa.w.ph $ac1, t5, s2 |
| 3287 dpa.w.ph $ac1, t7, s3 |
| 3288 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) |
| 3289 mflo s6, $ac0 |
| 3290 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865) |
| 3291 subu s4, s4, s5 |
| 3292 addu t3, t2, s4 // tmp10 = tmp0 + z2 |
| 3293 mflo s7, $ac1 |
| 3294 subu t4, t2, s4 // tmp10 = tmp0 - z2 |
| 3295 addu t7, t4, s6 |
| 3296 subu t8, t4, s6 |
| 3297 addu t5, t3, s7 |
| 3298 subu t6, t3, s7 |
| 3299 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) |
| 3300 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) |
| 3301 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) |
| 3302 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) |
| 3303 sll s4, t9, 2 |
| 3304 lw v0, 0(a2) // output_buf[ctr] |
| 3305 shll_s.w t5, t5, 24 |
| 3306 shll_s.w t6, t6, 24 |
| 3307 shll_s.w t7, t7, 24 |
| 3308 shll_s.w t8, t8, 24 |
| 3309 sra t5, t5, 24 |
| 3310 sra t6, t6, 24 |
| 3311 sra t7, t7, 24 |
| 3312 sra t8, t8, 24 |
| 3313 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col |
| 3314 addiu t5, t5, 128 |
| 3315 addiu t6, t6, 128 |
| 3316 addiu t7, t7, 128 |
| 3317 addiu t8, t8, 128 |
| 3318 sb t5, 0(v0) |
| 3319 sb t7, 1(v0) |
| 3320 sb t8, 2(v0) |
| 3321 sb t6, 3(v0) |
| 3322 // second output row |
| 3323 li s4, 15137 |
| 3324 lw s6, 40(t1) // wsptr[2] |
| 3325 li s5, 6270 |
| 3326 lw s7, 56(t1) // wsptr[6] |
| 3327 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) |
| 3328 lw t2, 32(t1) // wsptr[0] |
| 3329 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) |
| 3330 lh t5, 60(t1) // wsptr[7] |
| 3331 lh t6, 52(t1) // wsptr[5] |
| 3332 lh t7, 44(t1) // wsptr[3] |
| 3333 lh t8, 36(t1) // wsptr[1] |
| 3334 ins t5, t6, 16, 16 |
| 3335 ins t7, t8, 16, 16 |
| 3336 mult $ac0, zero, zero |
| 3337 dpa.w.ph $ac0, t5, s0 |
| 3338 dpa.w.ph $ac0, t7, s1 |
| 3339 mult $ac1, zero, zero |
| 3340 dpa.w.ph $ac1, t5, s2 |
| 3341 dpa.w.ph $ac1, t7, s3 |
| 3342 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) |
| 3343 mflo s6, $ac0 |
| 3344 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865) |
| 3345 subu s4, s4, s5 |
| 3346 addu t3, t2, s4 // tmp10 = tmp0 + z2 |
| 3347 mflo s7, $ac1 |
| 3348 subu t4, t2, s4 // tmp10 = tmp0 - z2 |
| 3349 addu t7, t4, s6 |
| 3350 subu t8, t4, s6 |
| 3351 addu t5, t3, s7 |
| 3352 subu t6, t3, s7 |
| 3353 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1) |
| 3354 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1) |
| 3355 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1) |
| 3356 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1) |
| 3357 sll s4, t9, 2 |
| 3358 lw v0, 4(a2) // output_buf[ctr] |
| 3359 shll_s.w t5, t5, 24 |
| 3360 shll_s.w t6, t6, 24 |
| 3361 shll_s.w t7, t7, 24 |
| 3362 shll_s.w t8, t8, 24 |
| 3363 sra t5, t5, 24 |
| 3364 sra t6, t6, 24 |
| 3365 sra t7, t7, 24 |
| 3366 sra t8, t8, 24 |
| 3367 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col |
| 3368 addiu t5, t5, 128 |
| 3369 addiu t6, t6, 128 |
| 3370 addiu t7, t7, 128 |
| 3371 addiu t8, t8, 128 |
| 3372 sb t5, 0(v0) |
| 3373 sb t7, 1(v0) |
| 3374 sb t8, 2(v0) |
| 3375 sb t6, 3(v0) |
| 3376 // third output row |
| 3377 li s4, 15137 |
| 3378 lw s6, 72(t1) // wsptr[2] |
| 3379 li s5, 6270 |
| 3380 lw s7, 88(t1) // wsptr[6] |
| 3381 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) |
| 3382 lw t2, 64(t1) // wsptr[0] |
| 3383 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865) |
| 3384 lh t5, 92(t1) // wsptr[7] |
| 3385 lh t6, 84(t1) // wsptr[5] |
| 3386 lh t7, 76(t1) // wsptr[3] |
| 3387 lh t8, 68(t1) // wsptr[1] |
| 3388 ins t5, t6, 16, 16 |
| 3389 ins t7, t8, 16, 16 |
| 3390 mult $ac0, zero, zero |
| 3391 dpa.w.ph $ac0, t5, s0 |
| 3392 dpa.w.ph $ac0, t7, s1 |
| 3393 mult $ac1, zero, zero |
| 3394 dpa.w.ph $ac1, t5, s2 |
| 3395 dpa.w.ph $ac1, t7, s3 |
| 3396 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) |
| 3397 mflo s6, $ac0 |
| 3398 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865) |
| 3399 subu s4, s4, s5 |
| 3400 addu t3, t2, s4 // tmp10 = tmp0 + z2 |
| 3401 mflo s7, $ac1 |
| 3402 subu t4, t2, s4 // tmp10 = tmp0 - z2 |
| 3403 addu t7, t4, s6 |
| 3404 subu t8, t4, s6 |
| 3405 addu t5, t3, s7 |
| 3406 subu t6, t3, s7 |
| 3407 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) |
| 3408 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) |
| 3409 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) |
| 3410 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) |
| 3411 sll s4, t9, 2 |
| 3412 lw v0, 8(a2) // output_buf[ctr] |
| 3413 shll_s.w t5, t5, 24 |
| 3414 shll_s.w t6, t6, 24 |
| 3415 shll_s.w t7, t7, 24 |
| 3416 shll_s.w t8, t8, 24 |
| 3417 sra t5, t5, 24 |
| 3418 sra t6, t6, 24 |
| 3419 sra t7, t7, 24 |
| 3420 sra t8, t8, 24 |
| 3421 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col |
| 3422 addiu t5, t5, 128 |
| 3423 addiu t6, t6, 128 |
| 3424 addiu t7, t7, 128 |
| 3425 addiu t8, t8, 128 |
| 3426 sb t5, 0(v0) |
| 3427 sb t7, 1(v0) |
| 3428 sb t8, 2(v0) |
| 3429 sb t6, 3(v0) |
| 3430 li s4, 15137 |
| 3431 lw s6, 104(t1) // wsptr[2] |
| 3432 li s5, 6270 |
| 3433 lw s7, 120(t1) // wsptr[6] |
| 3434 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065) |
| 3435 lw t2, 96(t1) // wsptr[0] |
| 3436 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865) |
| 3437 lh t5, 124(t1) // wsptr[7] |
| 3438 lh t6, 116(t1) // wsptr[5] |
| 3439 lh t7, 108(t1) // wsptr[3] |
| 3440 lh t8, 100(t1) // wsptr[1] |
| 3441 ins t5, t6, 16, 16 |
| 3442 ins t7, t8, 16, 16 |
| 3443 mult $ac0, zero, zero |
| 3444 dpa.w.ph $ac0, t5, s0 |
| 3445 dpa.w.ph $ac0, t7, s1 |
| 3446 mult $ac1, zero, zero |
| 3447 dpa.w.ph $ac1, t5, s2 |
| 3448 dpa.w.ph $ac1, t7, s3 |
| 3449 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1) |
| 3450 mflo s6, $ac0 |
| 3451 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865) |
| 3452 subu s4, s4, s5 |
| 3453 addu t3, t2, s4 // tmp10 = tmp0 + z2; |
| 3454 mflo s7, $ac1 |
| 3455 subu t4, t2, s4 // tmp10 = tmp0 - z2; |
| 3456 addu t7, t4, s6 |
| 3457 subu t8, t4, s6 |
| 3458 addu t5, t3, s7 |
| 3459 subu t6, t3, s7 |
| 3460 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19) |
| 3461 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19) |
| 3462 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19) |
| 3463 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19) |
| 3464 sll s4, t9, 2 |
| 3465 lw v0, 12(a2) // output_buf[ctr] |
| 3466 shll_s.w t5, t5, 24 |
| 3467 shll_s.w t6, t6, 24 |
| 3468 shll_s.w t7, t7, 24 |
| 3469 shll_s.w t8, t8, 24 |
| 3470 sra t5, t5, 24 |
| 3471 sra t6, t6, 24 |
| 3472 sra t7, t7, 24 |
| 3473 sra t8, t8, 24 |
| 3474 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col |
| 3475 addiu t5, t5, 128 |
| 3476 addiu t6, t6, 128 |
| 3477 addiu t7, t7, 128 |
| 3478 addiu t8, t8, 128 |
| 3479 sb t5, 0(v0) |
| 3480 sb t7, 1(v0) |
| 3481 sb t8, 2(v0) |
| 3482 sb t6, 3(v0) |
| 3483 |
| 3484 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 3485 |
| 3486 j ra |
| 3487 nop |
| 3488 END(jsimd_idct_4x4_mips_dspr2) |
| 3489 |
| 3490 /*****************************************************************************/ |
| 3491 LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2) |
| 3492 /* |
| 3493 * a0 - compptr->dct_table |
| 3494 * a1 - coef_block |
| 3495 * a2 - output_buf |
| 3496 * a3 - output_col |
| 3497 */ |
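| /* |
| * 6x6 reduced-size inverse DCT. Pass 1 (label 1) transforms the six input |
| * columns into a 144-byte workspace on the stack; pass 2 (label 2) transforms |
| * the six workspace rows, saturates with shll_s.w/sra, adds the +128 level |
| * shift and stores six pixels per output row at output_buf[ctr] + output_col. |
| */ |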
| 3498 .set at |
| 3499 |
| 3500 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 3501 |
| 3502 addiu sp, sp, -144 |
| 3503 move v0, sp |
| 3504 addiu v1, v0, 24 |
| 3505 addiu t9, zero, 5793 // FIX(0.707106781) |
| 3506 addiu s0, zero, 10033 // FIX(1.224744871) |
| 3507 addiu s1, zero, 2998 // FIX(0.366025404) |
| 3508 |
| 3509 1: |
| 3510 lh s2, 0(a0) // q0 = quantptr[ 0] |
| 3511 lh s3, 32(a0) // q1 = quantptr[16] |
| 3512 lh s4, 64(a0) // q2 = quantptr[32] |
| 3513 lh t2, 64(a1) // tmp2 = inptr[32] |
| 3514 lh t1, 32(a1) // tmp1 = inptr[16] |
| 3515 lh t0, 0(a1) // tmp0 = inptr[ 0] |
| 3516 mul t2, t2, s4 // tmp2 = tmp2 * q2 |
| 3517 mul t1, t1, s3 // tmp1 = tmp1 * q1 |
| 3518 mul t0, t0, s2 // tmp0 = tmp0 * q0 |
| 3519 lh t6, 16(a1) // z1 = inptr[ 8] |
| 3520 lh t8, 80(a1) // z3 = inptr[40] |
| 3521 lh t7, 48(a1) // z2 = inptr[24] |
| 3522 lh s2, 16(a0) // q0 = quantptr[ 8] |
| 3523 lh s4, 80(a0) // q2 = quantptr[40] |
| 3524 lh s3, 48(a0) // q1 = quantptr[24] |
| 3525 mul t2, t2, t9 // tmp2 = tmp2 * 5793 |
| 3526 mul t1, t1, s0 // tmp1 = tmp1 * 10033 |
| 3527 sll t0, t0, 13 // tmp0 = tmp0 << 13 |
| 3528 mul t6, t6, s2 // z1 = z1 * q0 |
| 3529 mul t8, t8, s4 // z3 = z3 * q2 |
| 3530 mul t7, t7, s3 // z2 = z2 * q1 |
| 3531 addu t3, t0, t2 // tmp10 = tmp0 + tmp2 |
| 3532 sll t2, t2, 1 // tmp2 = tmp2 << 1 |
| 3533 subu t4, t0, t2 // tmp11 = tmp0 - (tmp2 << 1) |
| 3534 subu t5, t3, t1 // tmp12 = tmp10 - tmp1 |
| 3535 addu t3, t3, t1 // tmp10 = tmp10 + tmp1 |
| 3536 addu t1, t6, t8 // tmp1 = z1 + z3 |
| 3537 mul t1, t1, s1 // tmp1 = tmp1 * 2998 |
| 3538 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 |
| 3539 subu t2, t6, t8 // tmp2 = z1 - z3 |
| 3540 subu t2, t2, t7 // tmp2 = tmp2 - z2 |
| 3541 sll t2, t2, 2 // tmp2 = tmp2 << 2 |
| 3542 addu t0, t6, t7 // tmp0 = z1 + z2 |
| 3543 sll t0, t0, 13 // tmp0 = tmp0 << 13 |
| 3544 subu s2, t8, t7 // q0 = z3 - z2 |
| 3545 sll s2, s2, 13 // q0 = q0 << 13 |
| 3546 addu t0, t0, t1 // tmp0 = tmp0 + tmp1 |
| 3547 addu t1, s2, t1 // tmp1 = q0 + tmp1 |
| 3548 addu s2, t4, t2 // q0 = tmp11 + tmp2 |
| 3549 subu s3, t4, t2 // q1 = tmp11 - tmp2 |
| 3550 addu t6, t3, t0 // z1 = tmp10 + tmp0 |
| 3551 subu t7, t3, t0 // z2 = tmp10 - tmp0 |
| 3552 addu t4, t5, t1 // tmp11 = tmp12 + tmp1 |
| 3553 subu t5, t5, t1 // tmp12 = tmp12 - tmp1 |
| 3554 shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 |
| 3555 shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 |
| 3556 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 |
| 3557 shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 |
| 3558 sw s2, 24(v0) |
| 3559 sw s3, 96(v0) |
| 3560 sw t6, 0(v0) |
| 3561 sw t7, 120(v0) |
| 3562 sw t4, 48(v0) |
| 3563 sw t5, 72(v0) |
| 3564 addiu v0, v0, 4 |
| 3565 addiu a1, a1, 2 |
| 3566 bne v0, v1, 1b |
| 3567 addiu a0, a0, 2 |
| 3568 |
| 3569 /* Pass 2: process 6 rows from work array, store into output array. */ |
| 3570 move v0, sp |
| 3571 addiu v1, v0, 144 |
| 3572 |
| 3573 2: |
| 3574 lw t0, 0(v0) |
| 3575 lw t2, 16(v0) |
| 3576 lw s5, 0(a2) |
| 3577 addiu t0, t0, 16 |
| 3578 sll t0, t0, 13 |
| 3579 mul t3, t2, t9 |
| 3580 lw t6, 4(v0) |
| 3581 lw t8, 20(v0) |
| 3582 lw t7, 12(v0) |
| 3583 addu s5, s5, a3 |
| 3584 addu s6, t6, t8 |
| 3585 mul s6, s6, s1 |
| 3586 addu t1, t0, t3 |
| 3587 subu t4, t0, t3 |
| 3588 subu t4, t4, t3 |
| 3589 lw t3, 8(v0) |
| 3590 mul t0, t3, s0 |
| 3591 addu s7, t6, t7 |
| 3592 sll s7, s7, 13 |
| 3593 addu s7, s6, s7 |
| 3594 subu t2, t8, t7 |
| 3595 sll t2, t2, 13 |
| 3596 addu t2, s6, t2 |
| 3597 subu s6, t6, t7 |
| 3598 subu s6, s6, t8 |
| 3599 sll s6, s6, 13 |
| 3600 addu t3, t1, t0 |
| 3601 subu t5, t1, t0 |
| 3602 addu t6, t3, s7 |
| 3603 subu t3, t3, s7 |
| 3604 addu t7, t4, s6 |
| 3605 subu t4, t4, s6 |
| 3606 addu t8, t5, t2 |
| 3607 subu t5, t5, t2 |
| 3608 shll_s.w t6, t6, 6 |
| 3609 shll_s.w t3, t3, 6 |
| 3610 shll_s.w t7, t7, 6 |
| 3611 shll_s.w t4, t4, 6 |
| 3612 shll_s.w t8, t8, 6 |
| 3613 shll_s.w t5, t5, 6 |
| 3614 sra t6, t6, 24 |
| 3615 addiu t6, t6, 128 |
| 3616 sra t3, t3, 24 |
| 3617 addiu t3, t3, 128 |
| 3618 sb t6, 0(s5) |
| 3619 sra t7, t7, 24 |
| 3620 addiu t7, t7, 128 |
| 3621 sb t3, 5(s5) |
| 3622 sra t4, t4, 24 |
| 3623 addiu t4, t4, 128 |
| 3624 sb t7, 1(s5) |
| 3625 sra t8, t8, 24 |
| 3626 addiu t8, t8, 128 |
| 3627 sb t4, 4(s5) |
| 3628 addiu v0, v0, 24 |
| 3629 sra t5, t5, 24 |
| 3630 addiu t5, t5, 128 |
| 3631 sb t8, 2(s5) |
| 3632 addiu a2, a2, 4 |
| 3633 bne v0, v1, 2b |
| 3634 sb t5, 3(s5) |
| 3635 |
| 3636 addiu sp, sp, 144 |
| 3637 |
| 3638 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 |
| 3639 |
| 3640 j ra |
| 3641 nop |
| 3642 |
| 3643 END(jsimd_idct_6x6_mips_dspr2) |
| 3644 |
| 3645 /*****************************************************************************/ |
| 3646 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2) |
| 3647 /* |
| 3648 * a0 - compptr->dct_table |
| 3649 * a1 - coef_block |
| 3650 * a2 - workspace |
| 3651 */ |
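| /* |
| * Pass 1 of the 12x12 scaled inverse DCT: each of the eight input columns is |
| * dequantized and expanded into twelve intermediate values, which are written |
| * to the workspace for the subsequent row pass (pass 2). |
| */ |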
| 3652 |
| 3653 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 |
| 3654 |
| 3655 li a3, 8 |
| 3656 |
| 3657 1: |
| 3658 // odd part |
| 3659 lh t0, 48(a1) |
| 3660 lh t1, 48(a0) |
| 3661 lh t2, 16(a1) |
| 3662 lh t3, 16(a0) |
| 3663 lh t4, 80(a1) |
| 3664 lh t5, 80(a0) |
| 3665 lh t6, 112(a1) |
| 3666 lh t7, 112(a0) |
| 3667 mul t0, t0, t1 // z2 |
| 3668 mul t1, t2, t3 // z1 |
| 3669 mul t2, t4, t5 // z3 |
| 3670 mul t3, t6, t7 // z4 |
| 3671 li t4, 10703 // FIX(1.306562965) |
| 3672 li t5, 4433 // FIX_0_541196100 |
| 3673 li t6, 7053 // FIX(0.860918669) |
| 3674 mul t4, t0, t4 // tmp11
| 3675 mul t5, t0, t5 // -tmp14
| 3676 addu t7, t1, t2 // tmp10
| 3677 addu t8, t7, t3 // tmp10 + z4
| 3678 mul t6, t6, t8 // tmp15 |
| 3679 li t8, 2139 // FIX(0.261052384) |
| 3680 mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) |
| 3681 li t7, 2295 // FIX(0.280143716) |
| 3682 mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) |
| 3683 addu t9, t2, t3 // z3 + z4 |
| 3684 li s0, 8565 // FIX(1.045510580) |
| 3685 mul t9, t9, s0 // -tmp13 |
| 3686 li s0, 12112 // FIX(1.478575242) |
| 3687 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
| 3688 li s1, 12998 // FIX(1.586706681) |
| 3689 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) |
| 3690 li s2, 5540 // FIX(0.676326758) |
| 3691 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) |
| 3692 li s3, 16244 // FIX(1.982889723) |
| 3693 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) |
| 3694 subu t1, t1, t3 // z1 -= z4
| 3695 subu t0, t0, t2 // z2 -= z3
| 3696 addu t2, t0, t1 // z1 + z2
| 3697 li t3, 4433 // FIX_0_541196100 |
| 3698 mul t2, t2, t3 // z3 |
| 3699 li t3, 6270 // FIX_0_765366865 |
| 3700 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) |
| 3701 li t3, 15137 // FIX_1_847759065
| 3702 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) |
| 3703 addu t8, t6, t8 // tmp12 |
| 3704 addu t3, t8, t4 // tmp12 + tmp11 |
| 3705 addu t3, t3, t7 // tmp10 |
| 3706 subu t8, t8, t9 // tmp12 + tmp13 |
| 3707 addu s0, t5, s0 |
| 3708 subu t8, t8, s0 // tmp12 |
| 3709 subu t9, t6, t9 |
| 3710 subu s1, s1, t4 |
| 3711 addu t9, t9, s1 // tmp13 |
| 3712 subu t6, t6, t5 |
| 3713 subu t6, t6, s2 |
| 3714 subu t6, t6, s3 // tmp15 |
| 3715 // even part start |
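|  // (The even-part loads are issued here, before the odd part is
|  // finished, presumably to overlap memory accesses with the remaining
|  // arithmetic.)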
| 3716 lh t4, 64(a1) |
| 3717 lh t5, 64(a0) |
| 3718 lh t7, 32(a1) |
| 3719 lh s0, 32(a0) |
| 3720 lh s1, 0(a1) |
| 3721 lh s2, 0(a0) |
| 3722 lh s3, 96(a1) |
| 3723 lh v0, 96(a0) |
| 3724 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4]) |
| 3725 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2]) |
| 3726 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0]) |
| 3727 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6]) |
| 3728 // odd part end |
| 3729 addu t1, t2, t1 // tmp11 |
| 3730 subu t0, t2, t0 // tmp14 |
| 3731 // update counter and pointers |
| 3732 addiu a3, a3, -1 |
| 3733 addiu a0, a0, 2 |
| 3734 addiu a1, a1, 2 |
| 3735 // even part rest |
| 3736 li s1, 10033 // FIX(1.224744871)
| 3737 li s2, 11190 // FIX(1.366025404)
| 3738 mul t4, t4, s1 // z4 |
| 3739 mul s1, t5, s2 // z4 |
| 3740 sll t5, t5, 13 // z1 |
| 3741 sll t7, t7, 13 |
| 3742 addiu t7, t7, 1024 // z3 |
| 3743 sll s0, s0, 13 // z2 |
| 3744 addu s2, t7, t4 // tmp10 |
| 3745 subu t4, t7, t4 // tmp11 |
| 3746 subu s3, t5, s0 // tmp12 |
| 3747 addu t2, t7, s3 // tmp21 |
| 3748 subu s3, t7, s3 // tmp24 |
| 3749 addu t7, s1, s0 // tmp12 |
| 3750 addu v0, s2, t7 // tmp20 |
| 3751 subu s2, s2, t7 // tmp25 |
| 3752 subu s1, s1, t5 // z4 - z1 |
| 3753 subu s1, s1, s0 // tmp12 |
| 3754 addu s0, t4, s1 // tmp22 |
| 3755 subu t4, t4, s1 // tmp23 |
| 3756 // final output stage |
| 3757 addu t5, v0, t3 |
| 3758 subu v0, v0, t3 |
| 3759 addu t3, t2, t1 |
| 3760 subu t2, t2, t1 |
| 3761 addu t1, s0, t8 |
| 3762 subu s0, s0, t8 |
| 3763 addu t8, t4, t9 |
| 3764 subu t4, t4, t9 |
| 3765 addu t9, s3, t0 |
| 3766 subu s3, s3, t0 |
| 3767 addu t0, s2, t6 |
| 3768 subu s2, s2, t6 |
| 3769 sra t5, t5, 11 |
| 3770 sra t3, t3, 11 |
| 3771 sra t1, t1, 11 |
| 3772 sra t8, t8, 11 |
| 3773 sra t9, t9, 11 |
| 3774 sra t0, t0, 11 |
| 3775 sra s2, s2, 11 |
| 3776 sra s3, s3, 11 |
| 3777 sra t4, t4, 11 |
| 3778 sra s0, s0, 11 |
| 3779 sra t2, t2, 11 |
| 3780 sra v0, v0, 11 |
| 3781 sw t5, 0(a2) |
| 3782 sw t3, 32(a2) |
| 3783 sw t1, 64(a2) |
| 3784 sw t8, 96(a2) |
| 3785 sw t9, 128(a2) |
| 3786 sw t0, 160(a2) |
| 3787 sw s2, 192(a2) |
| 3788 sw s3, 224(a2) |
| 3789 sw t4, 256(a2) |
| 3790 sw s0, 288(a2) |
| 3791 sw t2, 320(a2) |
| 3792 sw v0, 352(a2) |
| 3793 bgtz a3, 1b |
| 3794 addiu a2, a2, 4 |
| 3795 |
| 3796 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 |
| 3797 |
| 3798 j ra |
| 3799 nop |
| 3800 |
| 3801 END(jsimd_idct_12x12_pass1_mips_dspr2) |
| 3802 |
| 3803 /*****************************************************************************/ |
| 3804 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) |
| 3805 /* |
| 3806 * a0 - workspace |
| 3807 * a1 - output |
| 3808 */ |
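| /*
|  * Note: reads the 12x8 int workspace produced by pass 1 one row at a
|  * time, applies the row transform, then saturates (shll_s.w), descales
|  * by a net 18 bits and recenters each result by adding 128
|  * (CENTERJSAMPLE) before writing 12 bytes to the corresponding output
|  * row.
|  */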
| 3809 |
| 3810 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 |
| 3811 |
| 3812 li a3, 12 // loop over the 12 rows of the workspace
| 3813 |
| 3814 1: |
| 3815 // Odd part |
| 3816 lw t0, 12(a0) |
| 3817 lw t1, 4(a0) |
| 3818 lw t2, 20(a0) |
| 3819 lw t3, 28(a0) |
| 3820 li t4, 10703 // FIX(1.306562965) |
| 3821 li t5, 4433 // FIX_0_541196100 |
| 3822 mul t4, t0, t4 // tmp11 |
| 3823 mul t5, t0, t5 // -tmp14 |
| 3824 addu t6, t1, t2 // tmp10 |
| 3825 li t7, 2139 // FIX(0.261052384) |
| 3826 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) |
| 3827 addu t6, t6, t3 // tmp10 + z4 |
| 3828 li t8, 7053 // FIX(0.860918669) |
| 3829 mul t6, t6, t8 // tmp15 |
| 3830 li t8, 2295 // FIX(0.280143716) |
| 3831 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) |
| 3832 addu t9, t2, t3 // z3 + z4 |
| 3833 li s0, 8565 // FIX(1.045510580) |
| 3834 mul t9, t9, s0 // -tmp13 |
| 3835 li s0, 12112 // FIX(1.478575242) |
| 3836 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) |
| 3837 li s1, 12998 // FIX(1.586706681) |
| 3838 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) |
| 3839 li s2, 5540 // FIX(0.676326758) |
| 3840 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) |
| 3841 li s3, 16244 // FIX(1.982889723) |
| 3842 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) |
| 3843 subu t1, t1, t3 // z1 -= z4 |
| 3844 subu t0, t0, t2 // z2 -= z3 |
| 3845 addu t2, t1, t0 // z1 + z2 |
| 3846 li t3, 4433 // FIX_0_541196100 |
| 3847 mul t2, t2, t3 // z3 |
| 3848 li t3, 6270 // FIX_0_765366865 |
| 3849 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) |
| 3850 li t3, 15137 // FIX_1_847759065 |
| 3851 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) |
| 3852 addu t3, t6, t7 // tmp12 |
| 3853 addu t7, t3, t4 |
| 3854 addu t7, t7, t8 // tmp10 |
| 3855 subu t3, t3, t9 |
| 3856 subu t3, t3, t5 |
| 3857 subu t3, t3, s0 // tmp12 |
| 3858 subu t9, t6, t9 |
| 3859 subu t9, t9, t4 |
| 3860 addu t9, t9, s1 // tmp13 |
| 3861 subu t6, t6, t5 |
| 3862 subu t6, t6, s2 |
| 3863 subu t6, t6, s3 // tmp15 |
| 3864 addu t1, t2, t1 // tmp11 |
| 3865 subu t0, t2, t0 // tmp14 |
| 3866 // even part |
| 3867 lw t2, 16(a0) // z4 |
| 3868 lw t4, 8(a0) // z1 |
| 3869 lw t5, 0(a0) // z3 |
| 3870 lw t8, 24(a0) // z2 |
| 3871 li s0, 10033 // FIX(1.224744871) |
| 3872 li s1, 11190 // FIX(1.366025404) |
| 3873 mul t2, t2, s0 // z4 |
| 3874 mul s0, t4, s1 // z4 |
| 3875 addiu t5, t5, 0x10 // add rounding fudge factor for the final descale
| 3876 sll t5, t5, 13 // z3 |
| 3877 sll t4, t4, 13 // z1 |
| 3878 sll t8, t8, 13 // z2 |
| 3879 subu s1, t4, t8 // tmp12 |
| 3880 addu s2, t5, t2 // tmp10 |
| 3881 subu t2, t5, t2 // tmp11 |
| 3882 addu s3, t5, s1 // tmp21 |
| 3883 subu s1, t5, s1 // tmp24 |
| 3884 addu t5, s0, t8 // tmp12 |
| 3885 addu v0, s2, t5 // tmp20 |
| 3886 subu t5, s2, t5 // tmp25 |
| 3887 subu t4, s0, t4 |
| 3888 subu t4, t4, t8 // tmp12 |
| 3889 addu t8, t2, t4 // tmp22 |
| 3890 subu t2, t2, t4 // tmp23 |
| 3891 // update counter and pointers
| 3892 addiu a3, a3, -1 |
| 3893 addiu a0, a0, 32 |
| 3894 // Final stage |
| 3895 addu t4, v0, t7 |
| 3896 subu v0, v0, t7 |
| 3897 addu t7, s3, t1 |
| 3898 subu s3, s3, t1 |
| 3899 addu t1, t8, t3 |
| 3900 subu t8, t8, t3 |
| 3901 addu t3, t2, t9 |
| 3902 subu t2, t2, t9 |
| 3903 addu t9, s1, t0 |
| 3904 subu s1, s1, t0 |
| 3905 addu t0, t5, t6 |
| 3906 subu t5, t5, t6 |
| 3907 sll t4, t4, 4 |
| 3908 sll t7, t7, 4 |
| 3909 sll t1, t1, 4 |
| 3910 sll t3, t3, 4 |
| 3911 sll t9, t9, 4 |
| 3912 sll t0, t0, 4 |
| 3913 sll t5, t5, 4 |
| 3914 sll s1, s1, 4 |
| 3915 sll t2, t2, 4 |
| 3916 sll t8, t8, 4 |
| 3917 sll s3, s3, 4 |
| 3918 sll v0, v0, 4 |
| 3919 shll_s.w t4, t4, 2 |
| 3920 shll_s.w t7, t7, 2 |
| 3921 shll_s.w t1, t1, 2 |
| 3922 shll_s.w t3, t3, 2 |
| 3923 shll_s.w t9, t9, 2 |
| 3924 shll_s.w t0, t0, 2 |
| 3925 shll_s.w t5, t5, 2 |
| 3926 shll_s.w s1, s1, 2 |
| 3927 shll_s.w t2, t2, 2 |
| 3928 shll_s.w t8, t8, 2 |
| 3929 shll_s.w s3, s3, 2 |
| 3930 shll_s.w v0, v0, 2 |
| 3931 srl t4, t4, 24 |
| 3932 srl t7, t7, 24 |
| 3933 srl t1, t1, 24 |
| 3934 srl t3, t3, 24 |
| 3935 srl t9, t9, 24 |
| 3936 srl t0, t0, 24 |
| 3937 srl t5, t5, 24 |
| 3938 srl s1, s1, 24 |
| 3939 srl t2, t2, 24 |
| 3940 srl t8, t8, 24 |
| 3941 srl s3, s3, 24 |
| 3942 srl v0, v0, 24 |
| 3943 lw t6, 0(a1) |
| 3944 addiu t4, t4, 0x80 |
| 3945 addiu t7, t7, 0x80 |
| 3946 addiu t1, t1, 0x80 |
| 3947 addiu t3, t3, 0x80 |
| 3948 addiu t9, t9, 0x80 |
| 3949 addiu t0, t0, 0x80 |
| 3950 addiu t5, t5, 0x80 |
| 3951 addiu s1, s1, 0x80 |
| 3952 addiu t2, t2, 0x80 |
| 3953 addiu t8, t8, 0x80 |
| 3954 addiu s3, s3, 0x80 |
| 3955 addiu v0, v0, 0x80 |
| 3956 sb t4, 0(t6) |
| 3957 sb t7, 1(t6) |
| 3958 sb t1, 2(t6) |
| 3959 sb t3, 3(t6) |
| 3960 sb t9, 4(t6) |
| 3961 sb t0, 5(t6) |
| 3962 sb t5, 6(t6) |
| 3963 sb s1, 7(t6) |
| 3964 sb t2, 8(t6) |
| 3965 sb t8, 9(t6) |
| 3966 sb s3, 10(t6) |
| 3967 sb v0, 11(t6) |
| 3968 bgtz a3, 1b |
| 3969 addiu a1, a1, 4 |
| 3970 |
| 3971 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 |
| 3972 |
| 3973 jr ra |
| 3974 nop |
| 3975 |
| 3976 END(jsimd_idct_12x12_pass2_mips_dspr2) |
| 3977 |
| 3978 /*****************************************************************************/ |
| 3979 LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2) |
| 3980 /* |
| 3981 * a0 - sample_data |
| 3982 * a1 - start_col |
| 3983 * a2 - workspace |
| 3984 */ |
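| /*
|  * Roughly equivalent scalar code (a sketch; names follow the usual
|  * libjpeg conventions, with CENTERJSAMPLE == 128):
|  *
|  *   for (row = 0; row < DCTSIZE; row++)
|  *     for (col = 0; col < DCTSIZE; col++)
|  *       workspace[row * DCTSIZE + col] =
|  *         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
|  *
|  * The DSPr2 version converts four samples at a time: preceu.ph.qbr/qbl
|  * zero-extend bytes to halfwords and addu.ph adds 0xff80 (-128) to each
|  * halfword before the packed results are stored.
|  */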
| 3985 |
| 3986 lw t0, 0(a0) |
| 3987 li t7, 0xff80ff80 |
| 3988 addu t0, t0, a1 |
| 3989 ulw t1, 0(t0) |
| 3990 ulw t2, 4(t0) |
| 3991 preceu.ph.qbr t3, t1 |
| 3992 preceu.ph.qbl t4, t1 |
| 3993 lw t0, 4(a0) |
| 3994 preceu.ph.qbr t5, t2 |
| 3995 preceu.ph.qbl t6, t2 |
| 3996 addu t0, t0, a1 |
| 3997 addu.ph t3, t3, t7 |
| 3998 addu.ph t4, t4, t7 |
| 3999 ulw t1, 0(t0) |
| 4000 ulw t2, 4(t0) |
| 4001 addu.ph t5, t5, t7 |
| 4002 addu.ph t6, t6, t7 |
| 4003 usw t3, 0(a2) |
| 4004 usw t4, 4(a2) |
| 4005 preceu.ph.qbr t3, t1 |
| 4006 preceu.ph.qbl t4, t1 |
| 4007 usw t5, 8(a2) |
| 4008 usw t6, 12(a2) |
| 4009 |
| 4010 lw t0, 8(a0) |
| 4011 preceu.ph.qbr t5, t2 |
| 4012 preceu.ph.qbl t6, t2 |
| 4013 addu t0, t0, a1 |
| 4014 addu.ph t3, t3, t7 |
| 4015 addu.ph t4, t4, t7 |
| 4016 ulw t1, 0(t0) |
| 4017 ulw t2, 4(t0) |
| 4018 addu.ph t5, t5, t7 |
| 4019 addu.ph t6, t6, t7 |
| 4020 usw t3, 16(a2) |
| 4021 usw t4, 20(a2) |
| 4022 preceu.ph.qbr t3, t1 |
| 4023 preceu.ph.qbl t4, t1 |
| 4024 usw t5, 24(a2) |
| 4025 usw t6, 28(a2) |
| 4026 |
| 4027 lw t0, 12(a0) |
| 4028 preceu.ph.qbr t5, t2 |
| 4029 preceu.ph.qbl t6, t2 |
| 4030 addu t0, t0, a1 |
| 4031 addu.ph t3, t3, t7 |
| 4032 addu.ph t4, t4, t7 |
| 4033 ulw t1, 0(t0) |
| 4034 ulw t2, 4(t0) |
| 4035 addu.ph t5, t5, t7 |
| 4036 addu.ph t6, t6, t7 |
| 4037 usw t3, 32(a2) |
| 4038 usw t4, 36(a2) |
| 4039 preceu.ph.qbr t3, t1 |
| 4040 preceu.ph.qbl t4, t1 |
| 4041 usw t5, 40(a2) |
| 4042 usw t6, 44(a2) |
| 4043 |
| 4044 lw t0, 16(a0) |
| 4045 preceu.ph.qbr t5, t2 |
| 4046 preceu.ph.qbl t6, t2 |
| 4047 addu t0, t0, a1 |
| 4048 addu.ph t3, t3, t7 |
| 4049 addu.ph t4, t4, t7 |
| 4050 ulw t1, 0(t0) |
| 4051 ulw t2, 4(t0) |
| 4052 addu.ph t5, t5, t7 |
| 4053 addu.ph t6, t6, t7 |
| 4054 usw t3, 48(a2) |
| 4055 usw t4, 52(a2) |
| 4056 preceu.ph.qbr t3, t1 |
| 4057 preceu.ph.qbl t4, t1 |
| 4058 usw t5, 56(a2) |
| 4059 usw t6, 60(a2) |
| 4060 |
| 4061 lw t0, 20(a0) |
| 4062 preceu.ph.qbr t5, t2 |
| 4063 preceu.ph.qbl t6, t2 |
| 4064 addu t0, t0, a1 |
| 4065 addu.ph t3, t3, t7 |
| 4066 addu.ph t4, t4, t7 |
| 4067 ulw t1, 0(t0) |
| 4068 ulw t2, 4(t0) |
| 4069 addu.ph t5, t5, t7 |
| 4070 addu.ph t6, t6, t7 |
| 4071 usw t3, 64(a2) |
| 4072 usw t4, 68(a2) |
| 4073 preceu.ph.qbr t3, t1 |
| 4074 preceu.ph.qbl t4, t1 |
| 4075 usw t5, 72(a2) |
| 4076 usw t6, 76(a2) |
| 4077 |
| 4078 lw t0, 24(a0) |
| 4079 preceu.ph.qbr t5, t2 |
| 4080 preceu.ph.qbl t6, t2 |
| 4081 addu t0, t0, a1 |
| 4082 addu.ph t3, t3, t7 |
| 4083 addu.ph t4, t4, t7 |
| 4084 ulw t1, 0(t0) |
| 4085 ulw t2, 4(t0) |
| 4086 addu.ph t5, t5, t7 |
| 4087 addu.ph t6, t6, t7 |
| 4088 usw t3, 80(a2) |
| 4089 usw t4, 84(a2) |
| 4090 preceu.ph.qbr t3, t1 |
| 4091 preceu.ph.qbl t4, t1 |
| 4092 usw t5, 88(a2) |
| 4093 usw t6, 92(a2) |
| 4094 |
| 4095 lw t0, 28(a0) |
| 4096 preceu.ph.qbr t5, t2 |
| 4097 preceu.ph.qbl t6, t2 |
| 4098 addu t0, t0, a1 |
| 4099 addu.ph t3, t3, t7 |
| 4100 addu.ph t4, t4, t7 |
| 4101 ulw t1, 0(t0) |
| 4102 ulw t2, 4(t0) |
| 4103 addu.ph t5, t5, t7 |
| 4104 addu.ph t6, t6, t7 |
| 4105 usw t3, 96(a2) |
| 4106 usw t4, 100(a2) |
| 4107 preceu.ph.qbr t3, t1 |
| 4108 preceu.ph.qbl t4, t1 |
| 4109 usw t5, 104(a2) |
| 4110 usw t6, 108(a2) |
| 4111 preceu.ph.qbr t5, t2 |
| 4112 preceu.ph.qbl t6, t2 |
| 4113 addu.ph t3, t3, t7 |
| 4114 addu.ph t4, t4, t7 |
| 4115 addu.ph t5, t5, t7 |
| 4116 addu.ph t6, t6, t7 |
| 4117 usw t3, 112(a2) |
| 4118 usw t4, 116(a2) |
| 4119 usw t5, 120(a2) |
| 4120 usw t6, 124(a2) |
| 4121 |
| 4122 j ra |
| 4123 nop |
| 4124 |
| 4125 END(jsimd_convsamp_mips_dspr2) |
| 4126 |
| 4127 /*****************************************************************************/ |
| 4128 LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2) |
| 4129 /* |
| 4130 * a0 - sample_data |
| 4131 * a1 - start_col |
| 4132 * a2 - workspace |
| 4133 */ |
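| /*
|  * Roughly equivalent scalar code (a sketch; names follow the usual
|  * libjpeg conventions, with CENTERJSAMPLE == 128):
|  *
|  *   for (row = 0; row < DCTSIZE; row++)
|  *     for (col = 0; col < DCTSIZE; col++)
|  *       workspace[row * DCTSIZE + col] =
|  *         (FAST_FLOAT)(sample_data[row][start_col + col] - CENTERJSAMPLE);
|  *
|  * Each unrolled block below loads the eight samples of one row,
|  * subtracts 128, converts them to single precision (mtc1 + cvt.s.w)
|  * and stores the eight results.
|  */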
| 4134 |
| 4135 .set at |
| 4136 |
| 4137 lw t0, 0(a0) |
| 4138 addu t0, t0, a1 |
| 4139 lbu t1, 0(t0) |
| 4140 lbu t2, 1(t0) |
| 4141 lbu t3, 2(t0) |
| 4142 lbu t4, 3(t0) |
| 4143 lbu t5, 4(t0) |
| 4144 lbu t6, 5(t0) |
| 4145 lbu t7, 6(t0) |
| 4146 lbu t8, 7(t0) |
| 4147 addiu t1, t1, -128 |
| 4148 addiu t2, t2, -128 |
| 4149 addiu t3, t3, -128 |
| 4150 addiu t4, t4, -128 |
| 4151 addiu t5, t5, -128 |
| 4152 addiu t6, t6, -128 |
| 4153 addiu t7, t7, -128 |
| 4154 addiu t8, t8, -128 |
| 4155 mtc1 t1, f2 |
| 4156 mtc1 t2, f4 |
| 4157 mtc1 t3, f6 |
| 4158 mtc1 t4, f8 |
| 4159 mtc1 t5, f10 |
| 4160 mtc1 t6, f12 |
| 4161 mtc1 t7, f14 |
| 4162 mtc1 t8, f16 |
| 4163 cvt.s.w f2, f2 |
| 4164 cvt.s.w f4, f4 |
| 4165 cvt.s.w f6, f6 |
| 4166 cvt.s.w f8, f8 |
| 4167 cvt.s.w f10, f10 |
| 4168 cvt.s.w f12, f12 |
| 4169 cvt.s.w f14, f14 |
| 4170 cvt.s.w f16, f16 |
| 4171 lw t0, 4(a0) |
| 4172 swc1 f2, 0(a2) |
| 4173 swc1 f4, 4(a2) |
| 4174 swc1 f6, 8(a2) |
| 4175 addu t0, t0, a1 |
| 4176 swc1 f8, 12(a2) |
| 4177 swc1 f10, 16(a2) |
| 4178 swc1 f12, 20(a2) |
| 4179 swc1 f14, 24(a2) |
| 4180 swc1 f16, 28(a2) |
| 4181 // element row 1
| 4182 lbu t1, 0(t0) |
| 4183 lbu t2, 1(t0) |
| 4184 lbu t3, 2(t0) |
| 4185 lbu t4, 3(t0) |
| 4186 lbu t5, 4(t0) |
| 4187 lbu t6, 5(t0) |
| 4188 lbu t7, 6(t0) |
| 4189 lbu t8, 7(t0) |
| 4190 addiu t1, t1, -128 |
| 4191 addiu t2, t2, -128 |
| 4192 addiu t3, t3, -128 |
| 4193 addiu t4, t4, -128 |
| 4194 addiu t5, t5, -128 |
| 4195 addiu t6, t6, -128 |
| 4196 addiu t7, t7, -128 |
| 4197 addiu t8, t8, -128 |
| 4198 mtc1 t1, f2 |
| 4199 mtc1 t2, f4 |
| 4200 mtc1 t3, f6 |
| 4201 mtc1 t4, f8 |
| 4202 mtc1 t5, f10 |
| 4203 mtc1 t6, f12 |
| 4204 mtc1 t7, f14 |
| 4205 mtc1 t8, f16 |
| 4206 cvt.s.w f2, f2 |
| 4207 cvt.s.w f4, f4 |
| 4208 cvt.s.w f6, f6 |
| 4209 cvt.s.w f8, f8 |
| 4210 cvt.s.w f10, f10 |
| 4211 cvt.s.w f12, f12 |
| 4212 cvt.s.w f14, f14 |
| 4213 cvt.s.w f16, f16 |
| 4214 lw t0, 8(a0) |
| 4215 swc1 f2, 32(a2) |
| 4216 swc1 f4, 36(a2) |
| 4217 swc1 f6, 40(a2) |
| 4218 addu t0, t0, a1 |
| 4219 swc1 f8, 44(a2) |
| 4220 swc1 f10, 48(a2) |
| 4221 swc1 f12, 52(a2) |
| 4222 swc1 f14, 56(a2) |
| 4223 swc1 f16, 60(a2) |
| 4224 // element row 2
| 4225 lbu t1, 0(t0) |
| 4226 lbu t2, 1(t0) |
| 4227 lbu t3, 2(t0) |
| 4228 lbu t4, 3(t0) |
| 4229 lbu t5, 4(t0) |
| 4230 lbu t6, 5(t0) |
| 4231 lbu t7, 6(t0) |
| 4232 lbu t8, 7(t0) |
| 4233 addiu t1, t1, -128 |
| 4234 addiu t2, t2, -128 |
| 4235 addiu t3, t3, -128 |
| 4236 addiu t4, t4, -128 |
| 4237 addiu t5, t5, -128 |
| 4238 addiu t6, t6, -128 |
| 4239 addiu t7, t7, -128 |
| 4240 addiu t8, t8, -128 |
| 4241 mtc1 t1, f2 |
| 4242 mtc1 t2, f4 |
| 4243 mtc1 t3, f6 |
| 4244 mtc1 t4, f8 |
| 4245 mtc1 t5, f10 |
| 4246 mtc1 t6, f12 |
| 4247 mtc1 t7, f14 |
| 4248 mtc1 t8, f16 |
| 4249 cvt.s.w f2, f2 |
| 4250 cvt.s.w f4, f4 |
| 4251 cvt.s.w f6, f6 |
| 4252 cvt.s.w f8, f8 |
| 4253 cvt.s.w f10, f10 |
| 4254 cvt.s.w f12, f12 |
| 4255 cvt.s.w f14, f14 |
| 4256 cvt.s.w f16, f16 |
| 4257 lw t0, 12(a0) |
| 4258 swc1 f2, 64(a2) |
| 4259 swc1 f4, 68(a2) |
| 4260 swc1 f6, 72(a2) |
| 4261 addu t0, t0, a1 |
| 4262 swc1 f8, 76(a2) |
| 4263 swc1 f10, 80(a2) |
| 4264 swc1 f12, 84(a2) |
| 4265 swc1 f14, 88(a2) |
| 4266 swc1 f16, 92(a2) |
| 4267 // element row 3
| 4268 lbu t1, 0(t0) |
| 4269 lbu t2, 1(t0) |
| 4270 lbu t3, 2(t0) |
| 4271 lbu t4, 3(t0) |
| 4272 lbu t5, 4(t0) |
| 4273 lbu t6, 5(t0) |
| 4274 lbu t7, 6(t0) |
| 4275 lbu t8, 7(t0) |
| 4276 addiu t1, t1, -128 |
| 4277 addiu t2, t2, -128 |
| 4278 addiu t3, t3, -128 |
| 4279 addiu t4, t4, -128 |
| 4280 addiu t5, t5, -128 |
| 4281 addiu t6, t6, -128 |
| 4282 addiu t7, t7, -128 |
| 4283 addiu t8, t8, -128 |
| 4284 mtc1 t1, f2 |
| 4285 mtc1 t2, f4 |
| 4286 mtc1 t3, f6 |
| 4287 mtc1 t4, f8 |
| 4288 mtc1 t5, f10 |
| 4289 mtc1 t6, f12 |
| 4290 mtc1 t7, f14 |
| 4291 mtc1 t8, f16 |
| 4292 cvt.s.w f2, f2 |
| 4293 cvt.s.w f4, f4 |
| 4294 cvt.s.w f6, f6 |
| 4295 cvt.s.w f8, f8 |
| 4296 cvt.s.w f10, f10 |
| 4297 cvt.s.w f12, f12 |
| 4298 cvt.s.w f14, f14 |
| 4299 cvt.s.w f16, f16 |
| 4300 lw t0, 16(a0) |
| 4301 swc1 f2, 96(a2) |
| 4302 swc1 f4, 100(a2) |
| 4303 swc1 f6, 104(a2) |
| 4304 addu t0, t0, a1 |
| 4305 swc1 f8, 108(a2) |
| 4306 swc1 f10, 112(a2) |
| 4307 swc1 f12, 116(a2) |
| 4308 swc1 f14, 120(a2) |
| 4309 swc1 f16, 124(a2) |
| 4310 // element row 4
| 4311 lbu t1, 0(t0) |
| 4312 lbu t2, 1(t0) |
| 4313 lbu t3, 2(t0) |
| 4314 lbu t4, 3(t0) |
| 4315 lbu t5, 4(t0) |
| 4316 lbu t6, 5(t0) |
| 4317 lbu t7, 6(t0) |
| 4318 lbu t8, 7(t0) |
| 4319 addiu t1, t1, -128 |
| 4320 addiu t2, t2, -128 |
| 4321 addiu t3, t3, -128 |
| 4322 addiu t4, t4, -128 |
| 4323 addiu t5, t5, -128 |
| 4324 addiu t6, t6, -128 |
| 4325 addiu t7, t7, -128 |
| 4326 addiu t8, t8, -128 |
| 4327 mtc1 t1, f2 |
| 4328 mtc1 t2, f4 |
| 4329 mtc1 t3, f6 |
| 4330 mtc1 t4, f8 |
| 4331 mtc1 t5, f10 |
| 4332 mtc1 t6, f12 |
| 4333 mtc1 t7, f14 |
| 4334 mtc1 t8, f16 |
| 4335 cvt.s.w f2, f2 |
| 4336 cvt.s.w f4, f4 |
| 4337 cvt.s.w f6, f6 |
| 4338 cvt.s.w f8, f8 |
| 4339 cvt.s.w f10, f10 |
| 4340 cvt.s.w f12, f12 |
| 4341 cvt.s.w f14, f14 |
| 4342 cvt.s.w f16, f16 |
| 4343 lw t0, 20(a0) |
| 4344 swc1 f2, 128(a2) |
| 4345 swc1 f4, 132(a2) |
| 4346 swc1 f6, 136(a2) |
| 4347 addu t0, t0, a1 |
| 4348 swc1 f8, 140(a2) |
| 4349 swc1 f10, 144(a2) |
| 4350 swc1 f12, 148(a2) |
| 4351 swc1 f14, 152(a2) |
| 4352 swc1 f16, 156(a2) |
| 4353 // element row 5
| 4354 lbu t1, 0(t0) |
| 4355 lbu t2, 1(t0) |
| 4356 lbu t3, 2(t0) |
| 4357 lbu t4, 3(t0) |
| 4358 lbu t5, 4(t0) |
| 4359 lbu t6, 5(t0) |
| 4360 lbu t7, 6(t0) |
| 4361 lbu t8, 7(t0) |
| 4362 addiu t1, t1, -128 |
| 4363 addiu t2, t2, -128 |
| 4364 addiu t3, t3, -128 |
| 4365 addiu t4, t4, -128 |
| 4366 addiu t5, t5, -128 |
| 4367 addiu t6, t6, -128 |
| 4368 addiu t7, t7, -128 |
| 4369 addiu t8, t8, -128 |
| 4370 mtc1 t1, f2 |
| 4371 mtc1 t2, f4 |
| 4372 mtc1 t3, f6 |
| 4373 mtc1 t4, f8 |
| 4374 mtc1 t5, f10 |
| 4375 mtc1 t6, f12 |
| 4376 mtc1 t7, f14 |
| 4377 mtc1 t8, f16 |
| 4378 cvt.s.w f2, f2 |
| 4379 cvt.s.w f4, f4 |
| 4380 cvt.s.w f6, f6 |
| 4381 cvt.s.w f8, f8 |
| 4382 cvt.s.w f10, f10 |
| 4383 cvt.s.w f12, f12 |
| 4384 cvt.s.w f14, f14 |
| 4385 cvt.s.w f16, f16 |
| 4386 lw t0, 24(a0) |
| 4387 swc1 f2, 160(a2) |
| 4388 swc1 f4, 164(a2) |
| 4389 swc1 f6, 168(a2) |
| 4390 addu t0, t0, a1 |
| 4391 swc1 f8, 172(a2) |
| 4392 swc1 f10, 176(a2) |
| 4393 swc1 f12, 180(a2) |
| 4394 swc1 f14, 184(a2) |
| 4395 swc1 f16, 188(a2) |
| 4396 // element row 6
| 4397 lbu t1, 0(t0) |
| 4398 lbu t2, 1(t0) |
| 4399 lbu t3, 2(t0) |
| 4400 lbu t4, 3(t0) |
| 4401 lbu t5, 4(t0) |
| 4402 lbu t6, 5(t0) |
| 4403 lbu t7, 6(t0) |
| 4404 lbu t8, 7(t0) |
| 4405 addiu t1, t1, -128 |
| 4406 addiu t2, t2, -128 |
| 4407 addiu t3, t3, -128 |
| 4408 addiu t4, t4, -128 |
| 4409 addiu t5, t5, -128 |
| 4410 addiu t6, t6, -128 |
| 4411 addiu t7, t7, -128 |
| 4412 addiu t8, t8, -128 |
| 4413 mtc1 t1, f2 |
| 4414 mtc1 t2, f4 |
| 4415 mtc1 t3, f6 |
| 4416 mtc1 t4, f8 |
| 4417 mtc1 t5, f10 |
| 4418 mtc1 t6, f12 |
| 4419 mtc1 t7, f14 |
| 4420 mtc1 t8, f16 |
| 4421 cvt.s.w f2, f2 |
| 4422 cvt.s.w f4, f4 |
| 4423 cvt.s.w f6, f6 |
| 4424 cvt.s.w f8, f8 |
| 4425 cvt.s.w f10, f10 |
| 4426 cvt.s.w f12, f12 |
| 4427 cvt.s.w f14, f14 |
| 4428 cvt.s.w f16, f16 |
| 4429 lw t0, 28(a0) |
| 4430 swc1 f2, 192(a2) |
| 4431 swc1 f4, 196(a2) |
| 4432 swc1 f6, 200(a2) |
| 4433 addu t0, t0, a1 |
| 4434 swc1 f8, 204(a2) |
| 4435 swc1 f10, 208(a2) |
| 4436 swc1 f12, 212(a2) |
| 4437 swc1 f14, 216(a2) |
| 4438 swc1 f16, 220(a2) |
| 4439 // element row 7
| 4440 lbu t1, 0(t0) |
| 4441 lbu t2, 1(t0) |
| 4442 lbu t3, 2(t0) |
| 4443 lbu t4, 3(t0) |
| 4444 lbu t5, 4(t0) |
| 4445 lbu t6, 5(t0) |
| 4446 lbu t7, 6(t0) |
| 4447 lbu t8, 7(t0) |
| 4448 addiu t1, t1, -128 |
| 4449 addiu t2, t2, -128 |
| 4450 addiu t3, t3, -128 |
| 4451 addiu t4, t4, -128 |
| 4452 addiu t5, t5, -128 |
| 4453 addiu t6, t6, -128 |
| 4454 addiu t7, t7, -128 |
| 4455 addiu t8, t8, -128 |
| 4456 mtc1 t1, f2 |
| 4457 mtc1 t2, f4 |
| 4458 mtc1 t3, f6 |
| 4459 mtc1 t4, f8 |
| 4460 mtc1 t5, f10 |
| 4461 mtc1 t6, f12 |
| 4462 mtc1 t7, f14 |
| 4463 mtc1 t8, f16 |
| 4464 cvt.s.w f2, f2 |
| 4465 cvt.s.w f4, f4 |
| 4466 cvt.s.w f6, f6 |
| 4467 cvt.s.w f8, f8 |
| 4468 cvt.s.w f10, f10 |
| 4469 cvt.s.w f12, f12 |
| 4470 cvt.s.w f14, f14 |
| 4471 cvt.s.w f16, f16 |
| 4472 swc1 f2, 224(a2) |
| 4473 swc1 f4, 228(a2) |
| 4474 swc1 f6, 232(a2) |
| 4475 swc1 f8, 236(a2) |
| 4476 swc1 f10, 240(a2) |
| 4477 swc1 f12, 244(a2) |
| 4478 swc1 f14, 248(a2) |
| 4479 swc1 f16, 252(a2) |
| 4480 |
| 4481 j ra |
| 4482 nop |
| 4483 |
| 4484 END(jsimd_convsamp_float_mips_dspr2) |
| 4485 |
| 4486 /*****************************************************************************/ |
| 4487 |