Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(449)

Side by Side Diff: simd/jsimd_mips_dspr2.S

Issue 1934113002: Update libjpeg_turbo to 1.4.90 from https://github.com/libjpeg-turbo/ (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * MIPS DSPr2 optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
5 * All rights reserved.
6 * Authors: Teodora Novkovic (teodora.novkovic@imgtec.com)
7 * Darko Laus (darko.laus@imgtec.com)
8 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the authors be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute it
15 * freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must not
18 * claim that you wrote the original software. If you use this software
19 * in a product, an acknowledgment in the product documentation would be
20 * appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must not be
22 * misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source distribution.
24 */
25
26 #include "jsimd_mips_dspr2_asm.h"
27
28 /*****************************************************************************/
29 LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
30 /*
31 * a0 - cinfo->image_width
32 * a1 - input_buf
33 * a2 - output_buf
34 * a3 - output_row
35 * 16(sp) - num_rows
36 * 20(sp) - cinfo->num_components
37 *
38 * Null conversion for compression
 *
 * For each of num_rows input rows and each component ci, copies every
 * num_components-th byte of the input row (starting at byte ci) into
 * output_buf[ci][output_row].
 *
 * Labels 0-3 handle an image_width with a residual (image_width & 3 != 0):
 * a byte-at-a-time loop (2) followed by a 4-samples-per-pass loop (3).
 * Labels 4-6 handle an image_width that is a multiple of 4.
 *
 * The stack arguments are read at 24(sp)/28(sp) below, i.e. 8 bytes above
 * their entry offsets; presumably SAVE_REGS_ON_STACK 8 lowers sp by 8
 * (the macro is defined outside this file - confirm).
 *
 * NOTE(review): loops 3 and 6 test their exit condition only after the
 * first pass, so they always run at least once; with image_width < 4 the
 * end pointer would be stepped over. Confirm callers guarantee
 * image_width >= 4.
39 */
40
41 SAVE_REGS_ON_STACK 8, s0, s1
42
43 lw t9, 24(sp) // t9 = num_rows
44 lw s0, 28(sp) // s0 = cinfo->num_components
45 andi t0, a0, 3 // t0 = cinfo->image_width & 3
46 beqz t0, 4f // no residual
47 nop
48 0:
49 addiu t9, t9, -1 // --num_rows
50 bltz t9, 7f // all rows done
51 li t1, 0 // t1 = ci = 0 (delay slot)
52 1:
53 sll t3, t1, 2 // t3 = ci * 4
54 lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
55 lw t2, 0(a1) // t2 = inptr = *input_buf
56 sll t4, a3, 2 // t4 = output_row * 4
57 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
58 addu t2, t2, t1 // inptr += ci
59 addu s1, t5, a0 // s1 = outptr + image_width (end of row)
60 addu t6, t5, t0 // t6 = outptr + residual (end of byte loop)
61 2:
62 lbu t3, 0(t2) // residual samples, one per pass
63 addiu t5, t5, 1
64 sb t3, -1(t5)
65 bne t6, t5, 2b
66 addu t2, t2, s0 // inptr += num_components (delay slot)
67 3:
68 lbu t3, 0(t2) // four samples per pass
69 addu t4, t2, s0 // t4 = inptr + num_components
70 addu t7, t4, s0 // t7 = inptr + 2 * num_components
71 addu t8, t7, s0 // t8 = inptr + 3 * num_components
72 addu t2, t8, s0 // inptr += 4 * num_components
73 lbu t4, 0(t4)
74 lbu t7, 0(t7)
75 lbu t8, 0(t8)
76 addiu t5, t5, 4
77 sb t3, -4(t5)
78 sb t4, -3(t5)
79 sb t7, -2(t5)
80 bne s1, t5, 3b
81 sb t8, -1(t5) // (delay slot)
82 addiu t1, t1, 1 // ++ci
83 bne t1, s0, 1b // next component
84 nop
85 addiu a1, a1, 4 // ++input_buf (next input row)
86 bgez t9, 0b // t9 >= 0 here; the exit is the bltz at label 0
87 addiu a3, a3, 1 // ++output_row (delay slot)
88 b 7f
89 nop
90 4:
91 addiu t9, t9, -1 // --num_rows
92 bltz t9, 7f // all rows done
93 li t1, 0 // t1 = ci = 0 (delay slot)
94 5:
95 sll t3, t1, 2 // t3 = ci * 4
96 lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
97 lw t2, 0(a1) // t2 = inptr = *input_buf
98 sll t4, a3, 2 // t4 = output_row * 4
99 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
100 addu t2, t2, t1 // inptr += ci
101 addu s1, t5, a0 // s1 = outptr + image_width (end of row)
102 addu t6, t5, t0 // t6 is not read on this path (t0 == 0 here)
103 6:
104 lbu t3, 0(t2) // four samples per pass
105 addu t4, t2, s0 // t4 = inptr + num_components
106 addu t7, t4, s0 // t7 = inptr + 2 * num_components
107 addu t8, t7, s0 // t8 = inptr + 3 * num_components
108 addu t2, t8, s0 // inptr += 4 * num_components
109 lbu t4, 0(t4)
110 lbu t7, 0(t7)
111 lbu t8, 0(t8)
112 addiu t5, t5, 4
113 sb t3, -4(t5)
114 sb t4, -3(t5)
115 sb t7, -2(t5)
116 bne s1, t5, 6b
117 sb t8, -1(t5) // (delay slot)
118 addiu t1, t1, 1 // ++ci
119 bne t1, s0, 5b // next component
120 nop
121 addiu a1, a1, 4 // ++input_buf (next input row)
122 bgez t9, 4b // t9 >= 0 here; the exit is the bltz at label 4
123 addiu a3, a3, 1 // ++output_row (delay slot)
124 7:
125 RESTORE_REGS_FROM_STACK 8, s0, s1
126
127 j ra
128 nop
129
130 END(jsimd_c_null_convert_mips_dspr2)
131
132 /*****************************************************************************/
133 /*
134 * jsimd_extrgb_ycc_convert_mips_dspr2
135 * jsimd_extbgr_ycc_convert_mips_dspr2
136 * jsimd_extrgbx_ycc_convert_mips_dspr2
137 * jsimd_extbgrx_ycc_convert_mips_dspr2
138 * jsimd_extxbgr_ycc_convert_mips_dspr2
139 * jsimd_extxrgb_ycc_convert_mips_dspr2
140 *
141 * Colorspace conversion RGB -> YCbCr
142 */
143
144 .macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
145
/*
 * DO_RGB_TO_YCC: loads the r, g and b bytes of the pixel at inptr (using the
 * per-colorspace byte offsets of the enclosing macro) and advances inptr by
 * one pixel.
 */
146 .macro DO_RGB_TO_YCC r, \
147 g, \
148 b, \
149 inptr
150 lbu \r, \r_offs(\inptr)
151 lbu \g, \g_offs(\inptr)
152 lbu \b, \b_offs(\inptr)
153 addiu \inptr, \pixel_size
154 .endm
155
156 LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
157 /*
158 * a0 - cinfo->image_width
159 * a1 - input_buf
160 * a2 - output_buf
161 * a3 - output_row
162 * 16(sp) - num_rows
 *
 * Converts num_rows rows of packed pixels into three planar rows
 * (output_buf[0..2][output_row]), one pixel per pass of loop 1.
 * Each output sample is a fixed-point sum held in a DSP accumulator and
 * shifted right by 16. Only the low byte of each extract is stored, so
 * only bits 16..23 of the accumulator's LO half contribute.
 *
 * num_rows is read at 48(sp), i.e. 32 bytes above its entry offset;
 * presumably SAVE_REGS_ON_STACK 32 lowers sp by 32 (macro defined outside
 * this file - confirm).
163 */
164
165 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
166
167 lw t7, 48(sp) // t7 = num_rows
168 li s0, 0x4c8b // FIX(0.29900)
169 li s1, 0x9646 // FIX(0.58700)
170 li s2, 0x1d2f // FIX(0.11400)
171 li s3, 0xffffd4cd // -FIX(0.16874)
172 li s4, 0xffffab33 // -FIX(0.33126)
173 li s5, 0x8000 // FIX(0.50000)
174 li s6, 0xffff94d1 // -FIX(0.41869)
175 li s7, 0xffffeb2f // -FIX(0.08131)
176 li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
177
178 0:
179 addiu t7, -1 // --num_rows
180 lw t6, 0(a1) // t6 = input_buf[0]
181 lw t0, 0(a2) // t0 = output_buf[0]
182 lw t1, 4(a2) // t1 = output_buf[1]
183 lw t2, 8(a2) // t2 = output_buf[2]
184 sll t3, a3, 2 // t3 = output_row * 4
185 lwx t0, t3(t0) // t0 = output_buf[0][output_row]
186 lwx t1, t3(t1) // t1 = output_buf[1][output_row]
187 lwx t2, t3(t2) // t2 = output_buf[2][output_row]
188
189 addu t9, t2, a0 // t9 = end address
190 addiu a3, 1 // ++output_row
191
192 1:
193 DO_RGB_TO_YCC t3, t4, t5, t6
194
195 mtlo s5, $ac0 // ac0 (Y) starts at ONE_HALF
196 mtlo t8, $ac1 // ac1 (Cb) starts at CBCR_OFFSET + ONE_HALF-1
197 mtlo t8, $ac2 // ac2 (Cr) starts at CBCR_OFFSET + ONE_HALF-1
198 maddu $ac0, s2, t5 // Y += FIX(0.11400) * b
199 maddu $ac1, s5, t5 // Cb += FIX(0.50000) * b
200 maddu $ac2, s5, t3 // Cr += FIX(0.50000) * r
201 maddu $ac0, s0, t3 // Y += FIX(0.29900) * r
202 maddu $ac1, s3, t3 // Cb += -FIX(0.16874) * r
203 maddu $ac2, s6, t4 // Cr += -FIX(0.41869) * g
204 maddu $ac0, s1, t4 // Y += FIX(0.58700) * g
205 maddu $ac1, s4, t4 // Cb += -FIX(0.33126) * g
206 maddu $ac2, s7, t5 // Cr += -FIX(0.08131) * b
207 extr.w t3, $ac0, 16 // t3 = Y sum >> 16
208 extr.w t4, $ac1, 16 // t4 = Cb sum >> 16
209 extr.w t5, $ac2, 16 // t5 = Cr sum >> 16
210 sb t3, 0(t0) // store to the output_buf[0] row
211 sb t4, 0(t1) // store to the output_buf[1] row
212 sb t5, 0(t2) // store to the output_buf[2] row
213 addiu t0, 1
214 addiu t2, 1
215 bne t2, t9, 1b // until the output_buf[2] pointer hits the end address
216 addiu t1, 1 // (delay slot)
217 bgtz t7, 0b // next row while num_rows > 0
218 addiu a1, 4 // ++input_buf (delay slot)
219
220 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
221
222 j ra
223 nop
224 END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
225
226 .purgem DO_RGB_TO_YCC
227
228 .endm
229
230 /*------------------------------------------id -- pix R G B */
231 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
232 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
233 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
234 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
235 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
236 GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
237
238 /*****************************************************************************/
239 /*
240 * jsimd_ycc_extrgb_convert_mips_dspr2
241 * jsimd_ycc_extbgr_convert_mips_dspr2
242 * jsimd_ycc_extrgbx_convert_mips_dspr2
243 * jsimd_ycc_extbgrx_convert_mips_dspr2
244 * jsimd_ycc_extxbgr_convert_mips_dspr2
245 * jsimd_ycc_extxrgb_convert_mips_dspr2
246 *
247 * Colorspace conversion YCbCr -> RGB
248 */
249
250 .macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
251
/*
 * STORE_YCC_TO_RGB: stores the low bytes of scratch0/1/2 as the R/G/B bytes
 * of the pixel at outptr, writes 0xFF to the fourth byte when pixel_size is
 * 4 (clobbering t0), then advances outptr by one pixel.
 */
252 .macro STORE_YCC_TO_RGB scratch0 \
253 scratch1 \
254 scratch2 \
255 outptr
256 sb \scratch0, \r_offs(\outptr)
257 sb \scratch1, \g_offs(\outptr)
258 sb \scratch2, \b_offs(\outptr)
259 .if (\pixel_size == 4)
260 li t0, 0xFF
261 sb t0, \a_offs(\outptr)
262 .endif
263 addiu \outptr, \pixel_size
264 .endm
265
266 LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
267 /*
268 * a0 - cinfo->image_width
269 * a1 - input_buf
270 * a2 - input_row
271 * a3 - output_buf
272 * 16(sp) - num_rows
 *
 * Converts num_rows rows of planar Y/Cb/Cr samples
 * (input_buf[0..2][input_row]) into packed pixels, one pixel per pass of
 * loop 1. Red and green are clipped together as two halfwords of t2;
 * blue is clipped separately in t0.
 *
 * num_rows is read at 48(sp), i.e. 32 bytes above its entry offset;
 * presumably SAVE_REGS_ON_STACK 32 lowers sp by 32 (macro defined outside
 * this file - confirm).
273 */
274
275 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
276
277 lw s1, 48(sp) // s1 = num_rows
278 li t3, 0x8000 // rounding term added before the >> 16 of green
279 li t4, 0x166e9 // FIX(1.40200)
280 li t5, 0x1c5a2 // FIX(1.77200)
281 li t6, 0xffff492e // -FIX(0.71414)
282 li t7, 0xffffa7e6 // -FIX(0.34414)
283 repl.ph t8, 128 // t8 = 128 in both halfwords
284
285 0:
286 lw s0, 0(a3) // s0 = outptr = *output_buf
287 lw t0, 0(a1) // t0 = input_buf[0]
288 lw t1, 4(a1) // t1 = input_buf[1]
289 lw t2, 8(a1) // t2 = input_buf[2]
290 sll s5, a2, 2 // s5 = input_row * 4
291 addiu s1, -1 // --num_rows
292 lwx s2, s5(t0) // s2 = input_buf[0][input_row] (y)
293 lwx s3, s5(t1) // s3 = input_buf[1][input_row] (cb)
294 lwx s4, s5(t2) // s4 = input_buf[2][input_row] (cr)
295 addu t9, s2, a0 // t9 = end address (on the y pointer)
296 addiu a2, 1 // ++input_row
297
298 1:
299 lbu s7, 0(s4) // cr
300 lbu s6, 0(s3) // cb
301 lbu s5, 0(s2) // y
302 addiu s2, 1
303 addiu s4, 1
304 addiu s7, -128 // cr - 128
305 addiu s6, -128 // cb - 128
306 mul t2, t7, s6 // t2 = -FIX(0.34414) * (cb - 128)
307 mul t0, t6, s7 // Crgtab[cr]
308 sll s7, 15
309 mulq_rs.w t1, t4, s7 // Crrtab[cr]
310 sll s6, 15
311 addu t2, t3 // Cbgtab[cb]
312 addu t2, t0 // t2 = green chroma sum (with rounding term)
313
314 mulq_rs.w t0, t5, s6 // Cbbtab[cb]
315 sra t2, 16
316 addu t1, s5 // t1 = y + red term
317 addu t2, s5 // add y
318 ins t2, t1, 16, 16 // t2 = red (upper halfword) | green (lower)
319 subu.ph t2, t2, t8 // re-centre both halfwords by -128
320 addu t0, s5 // t0 = y + blue term
321 shll_s.ph t2, t2, 8 // saturating shift left ...
322 subu t0, 128
323 shra.ph t2, t2, 8 // ... then arithmetic shift back: clamps red/green
324 shll_s.w t0, t0, 24 // same clamp for blue, done on the full word
325 addu.ph t2, t2, t8 // clip & store
326 sra t0, t0, 24
327 sra t1, t2, 16 // t1 = red
328 addiu t0, 128 // t0 = blue
329
330 STORE_YCC_TO_RGB t1, t2, t0, s0
331
332 bne s2, t9, 1b // until the y pointer hits the end address
333 addiu s3, 1 // ++cb pointer (delay slot)
334 bgtz s1, 0b // next row while num_rows > 0
335 addiu a3, 4 // ++output_buf (delay slot)
336
337 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
338
339 j ra
340 nop
341 END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
342
343 .purgem STORE_YCC_TO_RGB
344
345 .endm
346
347 /*------------------------------------------id -- pix R G B A */
348 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2, 3
349 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0, 3
350 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
351 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
352 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
353 GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
354
355 /*****************************************************************************/
356 /*
357 * jsimd_extrgb_gray_convert_mips_dspr2
358 * jsimd_extbgr_gray_convert_mips_dspr2
359 * jsimd_extrgbx_gray_convert_mips_dspr2
360 * jsimd_extbgrx_gray_convert_mips_dspr2
361 * jsimd_extxbgr_gray_convert_mips_dspr2
362 * jsimd_extxrgb_gray_convert_mips_dspr2
363 *
364 * Colorspace conversion RGB -> GRAY
365 */
366
367 .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
368
/*
 * DO_RGB_TO_GRAY: loads the r, g and b bytes of the pixel at inptr (using
 * the per-colorspace byte offsets of the enclosing macro) and advances
 * inptr by one pixel.
 */
369 .macro DO_RGB_TO_GRAY r, \
370 g, \
371 b, \
372 inptr
373 lbu \r, \r_offs(\inptr)
374 lbu \g, \g_offs(\inptr)
375 lbu \b, \b_offs(\inptr)
376 addiu \inptr, \pixel_size
377 .endm
378
379 LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
380 /*
381 * a0 - cinfo->image_width
382 * a1 - input_buf
383 * a2 - output_buf
384 * a3 - output_row
385 * 16(sp) - num_rows
 *
 * Converts num_rows rows of packed pixels into one gray row each
 * (output_buf[0][output_row]). Loop 1 handles four pixels per pass using
 * accumulators ac0 and ac1 alternately; loop 3 handles the remaining
 * image_width & 3 pixels one at a time.
 *
 * num_rows is read at 48(sp), i.e. 32 bytes above its entry offset;
 * presumably SAVE_REGS_ON_STACK 32 lowers sp by 32 (macro defined outside
 * this file - confirm).
386 */
387
388 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
389
390 li s0, 0x4c8b // s0 = FIX(0.29900)
391 li s1, 0x9646 // s1 = FIX(0.58700)
392 li s2, 0x1d2f // s2 = FIX(0.11400)
393 li s7, 0x8000 // s7 = FIX(0.50000)
394 lw s6, 48(sp) // s6 = num_rows
395 andi t7, a0, 3 // t7 = image_width & 3 (residual pixels)
396
397 0:
398 addiu s6, -1 // --num_rows (s6 = rows remaining)
399 lw t0, 0(a1) // t0 = inptr = *input_buf
400 lw t1, 0(a2) // t1 = output_buf[0]
401 sll t3, a3, 2 // t3 = output_row * 4
402 lwx t1, t3(t1) // t1 = outptr = output_buf[0][output_row]
403 addiu a3, 1 // ++output_row
404 addu t9, t1, a0 // t9 = end of row
405 subu t8, t9, t7 // t8 = end of the 4-pixel groups
406 beq t1, t8, 2f // no full group of 4: residual only
407 nop
408
409 1:
410 DO_RGB_TO_GRAY t3, t4, t5, t0
411 DO_RGB_TO_GRAY s3, s4, s5, t0
412
413 mtlo s7, $ac0 // pixel 0: start at the rounding term
414 maddu $ac0, s2, t5 // += FIX(0.11400) * b
415 maddu $ac0, s1, t4 // += FIX(0.58700) * g
416 maddu $ac0, s0, t3 // += FIX(0.29900) * r
417 mtlo s7, $ac1 // pixel 1
418 maddu $ac1, s2, s5
419 maddu $ac1, s1, s4
420 maddu $ac1, s0, s3
421 extr.w t6, $ac0, 16 // t6 = gray of pixel 0
422
423 DO_RGB_TO_GRAY t3, t4, t5, t0
424 DO_RGB_TO_GRAY s3, s4, s5, t0
425
426 mtlo s7, $ac0 // pixel 2
427 maddu $ac0, s2, t5
428 maddu $ac0, s1, t4
429 extr.w t2, $ac1, 16 // t2 = gray of pixel 1
430 maddu $ac0, s0, t3
431 mtlo s7, $ac1 // pixel 3
432 maddu $ac1, s2, s5
433 maddu $ac1, s1, s4
434 maddu $ac1, s0, s3
435 extr.w t5, $ac0, 16 // t5 = gray of pixel 2
436 sb t6, 0(t1)
437 sb t2, 1(t1)
438 extr.w t3, $ac1, 16 // t3 = gray of pixel 3
439 addiu t1, 4
440 sb t5, -2(t1)
441 sb t3, -1(t1)
442 bne t1, t8, 1b
443 nop
444
445 2:
446 beqz t7, 4f // no residual pixels
447 nop
448
449 3:
450 DO_RGB_TO_GRAY t3, t4, t5, t0
451
452 mtlo s7, $ac0 // one residual pixel per pass
453 maddu $ac0, s2, t5
454 maddu $ac0, s1, t4
455 maddu $ac0, s0, t3
456 extr.w t6, $ac0, 16
457 sb t6, 0(t1)
458 addiu t1, 1
459 bne t1, t9, 3b
460 nop
461
462 4:
463 bgtz s6, 0b // next row while rows remain
464 addiu a1, 4 // ++input_buf (delay slot)
465
466 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
467
468 j ra
469 nop
470 END(jsimd_\colorid\()_gray_convert_mips_dspr2)
471
472 .purgem DO_RGB_TO_GRAY
473
474 .endm
475
476 /*------------------------------------------id -- pix R G B */
477 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb, 3, 0, 1, 2
478 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr, 3, 2, 1, 0
479 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
480 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
481 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
482 GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
483 /*****************************************************************************/
484 /*
485 * jsimd_h2v2_merged_upsample_mips_dspr2
486 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
487 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
488 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
489 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
490 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
491 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
492 *
493 * Merged h2v2 upsample routines
494 */
495 .macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
496 pixel_size, \
497 r1_offs, \
498 g1_offs, \
499 b1_offs, \
500 a1_offs, \
501 r2_offs, \
502 g2_offs, \
503 b2_offs, \
504 a2_offs
505
/*
 * STORE_H2V2_2_PIXELS: stores two adjacent pixels at outptr (scratch0-2 are
 * R/G/B of the first, scratch3-5 of the second), writes 0xFF to both fourth
 * bytes when pixel_size is 8 (clobbering scratch0), then advances outptr by
 * pixel_size. Here pixel_size is the byte size of the two-pixel pair.
 */
506 .macro STORE_H2V2_2_PIXELS scratch0 \
507 scratch1 \
508 scratch2 \
509 scratch3 \
510 scratch4 \
511 scratch5 \
512 outptr
513 sb \scratch0, \r1_offs(\outptr)
514 sb \scratch1, \g1_offs(\outptr)
515 sb \scratch2, \b1_offs(\outptr)
516 sb \scratch3, \r2_offs(\outptr)
517 sb \scratch4, \g2_offs(\outptr)
518 sb \scratch5, \b2_offs(\outptr)
519 .if (\pixel_size == 8)
520 li \scratch0, 0xFF
521 sb \scratch0, \a1_offs(\outptr)
522 sb \scratch0, \a2_offs(\outptr)
523 .endif
524 addiu \outptr, \pixel_size
525 .endm
526
/*
 * STORE_H2V2_1_PIXEL: stores a single pixel at outptr and, when pixel_size
 * is 8, 0xFF to its fourth byte (clobbering t0). Does not advance outptr.
 */
527 .macro STORE_H2V2_1_PIXEL scratch0 \
528 scratch1 \
529 scratch2 \
530 outptr
531 sb \scratch0, \r1_offs(\outptr)
532 sb \scratch1, \g1_offs(\outptr)
533 sb \scratch2, \b1_offs(\outptr)
534
535 .if (\pixel_size == 8)
536 li t0, 0xFF
537 sb t0, \a1_offs(\outptr)
538 .endif
539 .endm
540
541 LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
542 /*
543 * a0 - cinfo->output_width
544 * a1 - input_buf
545 * a2 - in_row_group_ctr
546 * a3 - output_buf
547 * 16(sp) - cinfo->sample_range_limit
 *
 * Produces two output rows (output_buf[0] and output_buf[1]) from two luma
 * rows and one cb/cr row. Each pass of loop 1 computes the chroma terms for
 * one cb/cr pair and applies them to a 2x2 group of y samples; every
 * y + chroma sum is looked up as a byte index from sample_range_limit.
 * The code after label 2 handles the last column of an odd output_width.
 *
 * AT and ra are used as scratch registers; ra is in the
 * SAVE_REGS_ON_STACK / RESTORE_REGS_FROM_STACK lists.
 *
 * sample_range_limit is read at 56(sp), i.e. 40 bytes above its entry
 * offset; presumably SAVE_REGS_ON_STACK 40 lowers sp by 40 (macro defined
 * outside this file - confirm).
548 */
549
550 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
551
552 lw t9, 56(sp) // cinfo->sample_range_limit
553 lw v0, 0(a1) // v0 = input_buf[0]
554 lw v1, 4(a1) // v1 = input_buf[1]
555 lw t0, 8(a1) // t0 = input_buf[2]
556 sll t1, a2, 3 // t1 = in_row_group_ctr * 2 * 4
557 addiu t2, t1, 4 // t2 = (in_row_group_ctr * 2 + 1) * 4
558 sll t3, a2, 2 // t3 = in_row_group_ctr * 4
559 lw t4, 0(a3) // t4 = output_buf[0]
560 lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
561 lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
562 lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
563 lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
564 lw t7, 4(a3) // t7 = output_buf[1]
565 li s1, 0xe6ea
566 addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
567 addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
568 addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
569 xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
570 srl t3, a0, 1 // t3 = output_width / 2 (column pairs)
571 blez t3, 2f // no full column pair
572 addu t0, t5, t3 // t0 = end address
573 1:
574 lbu t3, 0(t5) // t3 = cb
575 lbu s3, 0(t6) // s3 = cr
576 addiu t5, t5, 1
577 addiu t3, t3, -128 // (cb - 128)
578 addiu s3, s3, -128 // (cr - 128)
579 mult $ac1, s1, t3 // ac1 = -FIX(0.34414) * (cb - 128)
580 madd $ac1, s2, s3 // ac1 += -FIX(0.71414) * (cr - 128)
581 sll s3, s3, 15
582 sll t3, t3, 15
583 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
584 extr_r.w s5, $ac1, 16 // s5 = cgreen = ac1 >> 16 with rounding
585 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
586 lbu v0, 0(t1) // v0 = y, row 0, first column
587 addiu t6, t6, 1
588 addiu t1, t1, 2
589 addu t3, v0, s4 // y+cred
590 addu s3, v0, s5 // y+cgreen
591 addu v1, v0, s6 // y+cblue
592 addu t3, t9, t3 // y+cred
593 addu s3, t9, s3 // y+cgreen
594 addu v1, t9, v1 // y+cblue
595 lbu AT, 0(t3) // AT = range_limit[y + cred]
596 lbu s7, 0(s3) // s7 = range_limit[y + cgreen]
597 lbu ra, 0(v1) // ra = range_limit[y + cblue]
598 lbu v0, -1(t1) // v0 = y, row 0, second column
599 addu t3, v0, s4 // y+cred
600 addu s3, v0, s5 // y+cgreen
601 addu v1, v0, s6 // y+cblue
602 addu t3, t9, t3 // y+cred
603 addu s3, t9, s3 // y+cgreen
604 addu v1, t9, v1 // y+cblue
605 lbu t3, 0(t3)
606 lbu s3, 0(s3)
607 lbu v1, 0(v1)
608 lbu v0, 0(t2) // v0 = y, row 1, first column
609
610 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
611
612 addu t3, v0, s4 // y+cred
613 addu s3, v0, s5 // y+cgreen
614 addu v1, v0, s6 // y+cblue
615 addu t3, t9, t3 // y+cred
616 addu s3, t9, s3 // y+cgreen
617 addu v1, t9, v1 // y+cblue
618 lbu AT, 0(t3)
619 lbu s7, 0(s3)
620 lbu ra, 0(v1)
621 lbu v0, 1(t2) // v0 = y, row 1, second column
622 addiu t2, t2, 2
623 addu t3, v0, s4 // y+cred
624 addu s3, v0, s5 // y+cgreen
625 addu v1, v0, s6 // y+cblue
626 addu t3, t9, t3 // y+cred
627 addu s3, t9, s3 // y+cgreen
628 addu v1, t9, v1 // y+cblue
629 lbu t3, 0(t3)
630 lbu s3, 0(s3)
631 lbu v1, 0(v1)
632
633 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
634
635 bne t0, t5, 1b // until the cb pointer hits the end address
636 nop
637 2:
638 andi t0, a0, 1 // odd output_width?
639 beqz t0, 4f
640 lbu t3, 0(t5) // t3 = cb (delay slot: loaded on both paths)
641 lbu s3, 0(t6) // s3 = cr
642 addiu t3, t3, -128 // (cb - 128)
643 addiu s3, s3, -128 // (cr - 128)
644 mult $ac1, s1, t3 // ac1 = -FIX(0.34414) * (cb - 128)
645 madd $ac1, s2, s3 // ac1 += -FIX(0.71414) * (cr - 128)
646 sll s3, s3, 15
647 sll t3, t3, 15
648 lbu v0, 0(t1) // v0 = y, row 0, last column
649 extr_r.w s5, $ac1, 16 // s5 = cgreen = ac1 >> 16 with rounding
650 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
651 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
652 addu t3, v0, s4 // y+cred
653 addu s3, v0, s5 // y+cgreen
654 addu v1, v0, s6 // y+cblue
655 addu t3, t9, t3 // y+cred
656 addu s3, t9, s3 // y+cgreen
657 addu v1, t9, v1 // y+cblue
658 lbu t3, 0(t3)
659 lbu s3, 0(s3)
660 lbu v1, 0(v1)
661 lbu v0, 0(t2) // v0 = y, row 1, last column
662
663 STORE_H2V2_1_PIXEL t3, s3, v1, t4
664
665 addu t3, v0, s4 // y+cred
666 addu s3, v0, s5 // y+cgreen
667 addu v1, v0, s6 // y+cblue
668 addu t3, t9, t3 // y+cred
669 addu s3, t9, s3 // y+cgreen
670 addu v1, t9, v1 // y+cblue
671 lbu t3, 0(t3)
672 lbu s3, 0(s3)
673 lbu v1, 0(v1)
674
675 STORE_H2V2_1_PIXEL t3, s3, v1, t7
676 4:
677 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
678
679 j ra
680 nop
681
682 END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
683
684 .purgem STORE_H2V2_1_PIXEL
685 .purgem STORE_H2V2_2_PIXELS
686 .endm
687
688 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
689 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
690 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
691 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
692 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
693 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
694 GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
695 /*****************************************************************************/
696 /*
697 * jsimd_h2v1_merged_upsample_mips_dspr2
698 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
699 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
700 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
701 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
702 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
703 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
704 *
705 * Merged h2v1 upsample routines
706 */
707
708 .macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid, \
709 pixel_size, \
710 r1_offs, \
711 g1_offs, \
712 b1_offs, \
713 a1_offs, \
714 r2_offs, \
715 g2_offs, \
716 b2_offs, \
717 a2_offs
718
/*
 * STORE_H2V1_2_PIXELS: stores two adjacent pixels at outptr (scratch0-2 are
 * R/G/B of the first, scratch3-5 of the second), writes 0xFF to both fourth
 * bytes when pixel_size is 8 (clobbering t0), then advances outptr by
 * pixel_size. Here pixel_size is the byte size of the two-pixel pair.
 */
719 .macro STORE_H2V1_2_PIXELS scratch0 \
720 scratch1 \
721 scratch2 \
722 scratch3 \
723 scratch4 \
724 scratch5 \
725 outptr
726 sb \scratch0, \r1_offs(\outptr)
727 sb \scratch1, \g1_offs(\outptr)
728 sb \scratch2, \b1_offs(\outptr)
729 sb \scratch3, \r2_offs(\outptr)
730 sb \scratch4, \g2_offs(\outptr)
731 sb \scratch5, \b2_offs(\outptr)
732 .if (\pixel_size == 8)
733 li t0, 0xFF
734 sb t0, \a1_offs(\outptr)
735 sb t0, \a2_offs(\outptr)
736 .endif
737 addiu \outptr, \pixel_size
738 .endm
739
/*
 * STORE_H2V1_1_PIXEL: stores a single pixel at outptr and, when pixel_size
 * is 8, 0xFF to its fourth byte (clobbering t0). Does not advance outptr.
 */
740 .macro STORE_H2V1_1_PIXEL scratch0 \
741 scratch1 \
742 scratch2 \
743 outptr
744 sb \scratch0, \r1_offs(\outptr)
745 sb \scratch1, \g1_offs(\outptr)
746 sb \scratch2, \b1_offs(\outptr)
747 .if (\pixel_size == 8)
748 li t0, 0xFF
749 sb t0, \a1_offs(\outptr)
750 .endif
751 .endm
752
753 LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
754 /*
755 * a0 - cinfo->output_width
756 * a1 - input_buf
757 * a2 - in_row_group_ctr
758 * a3 - output_buf
759 * 16(sp) - range_limit
 *
 * Produces one output row (output_buf[0]) from one luma row and one cb/cr
 * row. Each pass of loop 1 computes the chroma terms for one cb/cr pair
 * and applies them to two adjacent y samples; every y + chroma sum is
 * looked up as a byte index from range_limit. Label 3 handles the last
 * column of an odd output_width.
 *
 * ra is used as a scratch register; it is in the SAVE_REGS_ON_STACK /
 * RESTORE_REGS_FROM_STACK lists.
 *
 * range_limit is read at 56(sp), i.e. 40 bytes above its entry offset;
 * presumably SAVE_REGS_ON_STACK 40 lowers sp by 40 (macro defined outside
 * this file - confirm).
760 */
761
762 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
763
764 li t0, 0xe6ea
765 lw t1, 0(a1) // t1 = input_buf[0]
766 lw t2, 4(a1) // t2 = input_buf[1]
767 lw t3, 8(a1) // t3 = input_buf[2]
768 lw t8, 56(sp) // t8 = range_limit
769 addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
770 addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
771 addiu s0, t0, 0x9916 // s0 = 0x8000
772 addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
773 xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
774 srl t0, a0, 1 // t0 = output_width / 2 (column pairs)
775 sll t4, a2, 2 // t4 = in_row_group_ctr * 4
776 lwx s5, t4(t1) // s5 = inptr0
777 lwx s6, t4(t2) // s6 = inptr1
778 lwx s7, t4(t3) // s7 = inptr2
779 lw t7, 0(a3) // t7 = outptr
780 blez t0, 2f // no full column pair
781 addu t9, s6, t0 // t9 = end address
782 1:
783 lbu t2, 0(s6) // t2 = cb
784 lbu t0, 0(s7) // t0 = cr
785 lbu t1, 0(s5) // t1 = y
786 addiu t2, t2, -128 // t2 = cb - 128
787 addiu t0, t0, -128 // t0 = cr - 128
788 mult $ac1, s4, t2 // ac1 = -FIX(0.34414) * (cb - 128)
789 madd $ac1, s3, t0 // ac1 += -FIX(0.71414) * (cr - 128)
790 sll t0, t0, 15
791 sll t2, t2, 15
792 mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
793 extr_r.w t5, $ac1, 16 // t5 = cgreen = ac1 >> 16 with rounding
794 mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
795 addiu s7, s7, 1
796 addiu s6, s6, 1
797 addu t2, t1, t0 // t2 = y + cred
798 addu t3, t1, t5 // t3 = y + cgreen
799 addu t4, t1, t6 // t4 = y + cblue
800 addu t2, t8, t2
801 addu t3, t8, t3
802 addu t4, t8, t4
803 lbu t1, 1(s5) // t1 = y of the second column
804 lbu v0, 0(t2) // v0 = range_limit[y + cred]
805 lbu v1, 0(t3) // v1 = range_limit[y + cgreen]
806 lbu ra, 0(t4) // ra = range_limit[y + cblue]
807 addu t2, t1, t0
808 addu t3, t1, t5
809 addu t4, t1, t6
810 addu t2, t8, t2
811 addu t3, t8, t3
812 addu t4, t8, t4
813 lbu t2, 0(t2)
814 lbu t3, 0(t3)
815 lbu t4, 0(t4)
816
817 STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
818
819 bne t9, s6, 1b // until the cb pointer hits the end address
820 addiu s5, s5, 2 // inptr0 += 2 (delay slot)
821 2:
822 andi t0, a0, 1 // odd output_width?
823 beqz t0, 4f
824 nop
825 3:
826 lbu t2, 0(s6) // t2 = cb
827 lbu t0, 0(s7) // t0 = cr
828 lbu t1, 0(s5) // t1 = y
829 addiu t2, t2, -128 //(cb - 128)
830 addiu t0, t0, -128 //(cr - 128)
831 mul t3, s4, t2 // t3 = -FIX(0.34414) * (cb - 128)
832 mul t4, s3, t0 // t4 = -FIX(0.71414) * (cr - 128)
833 sll t0, t0, 15
834 sll t2, t2, 15
835 mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
836 mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
837 addu t3, t3, s0 // add the rounding term held in s0
838 addu t3, t4, t3
839 sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
840 addu t2, t1, t0 // y + cred
841 addu t3, t1, t5 // y + cgreen
842 addu t4, t1, t6 // y + cblue
843 addu t2, t8, t2
844 addu t3, t8, t3
845 addu t4, t8, t4
846 lbu t2, 0(t2)
847 lbu t3, 0(t3)
848 lbu t4, 0(t4)
849
850 STORE_H2V1_1_PIXEL t2, t3, t4, t7
851 4:
852 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
853
854 j ra
855 nop
856
857 END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
858
859 .purgem STORE_H2V1_1_PIXEL
860 .purgem STORE_H2V1_2_PIXELS
861 .endm
862
863 /*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
864 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
865 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
866 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
867 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
868 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
869 GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
870 /*****************************************************************************/
871 /*
872 * jsimd_h2v2_fancy_upsample_mips_dspr2
873 *
874 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
875 */
876 LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
877 /*
878 * a0 - cinfo->max_v_samp_factor
879 * a1 - downsampled_width
880 * a2 - input_data
881 * a3 - output_data_ptr
 *
 * For every input row, writes two output rows: the first blends the row
 * with the row above (input_data[-1]), the second with the row below
 * (input_data[1]). Per column a "colsum" = 3 * inptr0 + inptr1 is formed,
 * and each colsum yields two output samples:
 *   (3 * this + last + 8) >> 4  and  (3 * this + next + 7) >> 4.
 * The first and last columns use (4 * this + 8) >> 4 and
 * (4 * this + 7) >> 4 respectively for their outer sample.
 *
 * Throughout loops 2 and 3, t7 holds the previous colsum and t6 the
 * current one.
882 */
883
884 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
885
886 li s4, 0 // s4 = byte offset into the output row-pointer array
887 lw s2, 0(a3) // s2 = *output_data_ptr
888 0:
889 li t9, 2 // t9 = output rows still to write for this input row
890 lw s1, -4(a2) // s1 = inptr1
891
892 1:
893 lw s0, 0(a2) // s0 = inptr0
894 lwx s3, s4(s2) // s3 = outptr = output row pointer at offset s4
895 addiu s5, a1, -2 // s5 = downsampled_width - 2
896 srl t4, s5, 1
897 sll t4, t4, 1 // t4 = (downsampled_width - 2) rounded down to even
898 lbu t0, 0(s0)
899 lbu t1, 1(s0)
900 lbu t2, 0(s1)
901 lbu t3, 1(s1)
902 addiu s0, 2
903 addiu s1, 2
904 addu t8, s0, t4 // t8 = end address
905 andi s5, s5, 1 // s5 = residual
906 sll t4, t0, 1
907 sll t6, t1, 1
908 addu t0, t0, t4 // t0 = (*inptr0++) * 3
909 addu t1, t1, t6 // t1 = (*inptr0++) * 3
910 addu t7, t0, t2 // t7 = thiscolsum
911 addu t6, t1, t3 // t6 = nextcolsum
912 sll t0, t7, 2 // t0 = thiscolsum * 4
913 subu t1, t0, t7 // t1 = thiscolsum * 3
914 shra_r.w t0, t0, 4 // t0 = (thiscolsum * 4 + 8) >> 4
915 addiu t1, 7
916 addu t1, t1, t6
917 srl t1, t1, 4 // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
918 sb t0, 0(s3)
919 sb t1, 1(s3)
920 beq t8, s0, 22f // skip loop 2 if downsampled_width - 2 < 2 (width 2 or 3)
921 addiu s3, 2 // (delay slot)
922 2:
923 lh t0, 0(s0) // t0 = A3|A2
924 lh t2, 0(s1) // t2 = B3|B2
925 addiu s0, 2
926 addiu s1, 2
927 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
928 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
929 shll.ph t1, t0, 1
930 sll t3, t6, 1
931 addu.ph t0, t1, t0 // t0 = A3*3|A2*3
932 addu t3, t3, t6 // t3 = this * 3
933 addu.ph t0, t0, t2 // t0 = next2|next1
934 addu t1, t3, t7 // t1 = this * 3 + last
935 andi t7, t0, 0xFFFF // t7 = next1
936 sll t2, t7, 1
937 addu t2, t7, t2 // t2 = next1*3
938 addu t4, t2, t6 // t4 = next1 * 3 + this
939 srl t6, t0, 16 // t6 = next2
940 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
941 addu t0, t3, t7
942 addiu t0, 7
943 srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
944 shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
945 addu t2, t2, t6
946 addiu t2, 7
947 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
948 sb t1, 0(s3)
949 sb t0, 1(s3)
950 sb t4, 2(s3)
951 sb t2, 3(s3)
952 bne t8, s0, 2b
953 addiu s3, 4 // (delay slot)
954 22:
955 beqz s5, 4f // no residual column
956 addu t8, s0, s5 // t8 = end address of the residual loop (delay slot)
957 3:
958 lbu t0, 0(s0)
959 lbu t2, 0(s1)
960 addiu s0, 1
961 addiu s1, 1
962 sll t3, t6, 1
963 sll t1, t0, 1
964 addu t1, t0, t1 // t1 = inptr0 * 3
965 addu t3, t3, t6 // t3 = thiscolsum * 3
966 addu t5, t1, t2 // t5 = nextcolsum
967 addu t1, t3, t7 // t1 = thiscolsum * 3 + lastcolsum
968 shra_r.w t1, t1, 4
969 addu t0, t3, t5
970 addiu t0, 7
971 srl t0, t0, 4 // t0 = (thiscolsum * 3 + nextcolsum + 7) >> 4
972 sb t1, 0(s3)
973 sb t0, 1(s3)
974 addiu s3, 2
975 move t7, t6 // lastcolsum = thiscolsum
976 bne t8, s0, 3b
977 move t6, t5 // thiscolsum = nextcolsum (delay slot)
978 4:
979 sll t0, t6, 2 // t0 = thiscolsum * 4
980 subu t1, t0, t6 // t1 = thiscolsum * 3
981 addu t1, t1, t7 // t1 = thiscolsum * 3 + lastcolsum
982 addiu s4, 4 // advance to the next output row pointer
983 shra_r.w t1, t1, 4
984 addiu t0, 7
985 srl t0, t0, 4 // t0 = (thiscolsum * 4 + 7) >> 4
986 sb t1, 0(s3)
987 sb t0, 1(s3)
988 addiu t9, -1
989 addiu s3, 2
990 bnez t9, 1b // second pass uses the row below
991 lw s1, 4(a2) // s1 = inptr1 = input_data[1] (delay slot)
992 srl t0, s4, 2 // t0 = output rows written so far
993 subu t0, a0, t0
994 bgtz t0, 0b // loop while fewer than max_v_samp_factor rows written
995 addiu a2, 4 // ++input_data (delay slot)
996
997 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
998
999 j ra
1000 nop
1001 END(jsimd_h2v2_fancy_upsample_mips_dspr2)
1002
1003 /*****************************************************************************/
1004 LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
1005 /*
1006 * a0 - cinfo->max_v_samp_factor
1007 * a1 - downsampled_width
1008 * a2 - input_data
1009 * a3 - output_data_ptr
 *
 * Writes max_v_samp_factor output rows, each twice as wide as its input
 * row. Every interior input sample yields two outputs:
 *   (3 * this + previous + 1) >> 2  and  (3 * this + next + 2) >> 2.
 * The first and last input samples are copied unchanged for their outer
 * output sample.
 *
 * Loop 1 handles four input samples per pass with packed-halfword
 * arithmetic, loop 2 handles the remaining (downsampled_width - 2) & 3
 * samples one at a time, and label 22 handles the last column.
1010 */
1011
1012 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
1013
1014 .set at
1015
1016 beqz a0, 3f // nothing to do for zero rows
1017 sll t0, a0, 2 // t0 = max_v_samp_factor * 4 (delay slot)
1018 lw s1, 0(a3) // s1 = *output_data_ptr (output row-pointer array)
1019 li s3, 0x10001 // s3 = 1 in both halfwords
1020 addu s0, s1, t0 // s0 = end of the output row-pointer array
1021 0:
1022 addiu t8, a1, -2 // t8 = downsampled_width - 2
1023 srl t9, t8, 2 // t9 = number of 4-sample passes
1024 lw t7, 0(a2) // t7 = inptr = *input_data
1025 lw s2, 0(s1) // s2 = outptr
1026 lbu t0, 0(t7)
1027 lbu t1, 1(t7) // t1 = inptr[1]
1028 sll t2, t0, 1
1029 addu t2, t2, t0 // t2 = invalue*3
1030 addu t2, t2, t1
1031 shra_r.w t2, t2, 2 // t2 = (invalue*3 + inptr[1] + 2) >> 2
1032 sb t0, 0(s2) // first output sample is the first input sample
1033 sb t2, 1(s2)
1034 beqz t9, 11f // fewer than 4 interior samples
1035 addiu s2, 2 // (delay slot)
1036 1:
1037 ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
1038 ulw t1, 1(t7)
1039 ulh t2, 4(t7) // t2 = |0|0|P5|P4|
1040 preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
1041 preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
1042 preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
1043 preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
1044 preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
1045 shll.ph t5, t4, 1
1046 shll.ph t6, t1, 1
1047 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
1048 addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
1049 addu.ph t4, t3, s3
1050 addu.ph t0, t0, s3
1051 addu.ph t4, t4, t5
1052 addu.ph t0, t0, t6
1053 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
1054 shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
1055 addu.ph t2, t2, t5
1056 addu.ph t3, t3, t6
1057 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
1058 shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
1059 shll.ph t2, t2, 8
1060 shll.ph t3, t3, 8
1061 or t2, t4, t2
1062 or t3, t3, t0
1063 addiu t9, -1
1064 usw t3, 0(s2)
1065 usw t2, 4(s2)
1066 addiu s2, 8
1067 bgtz t9, 1b
1068 addiu t7, 4 // inptr += 4 (delay slot)
1069 11:
1070 andi t8, 3 // t8 = remaining interior samples
1071 beqz t8, 22f
1072 addiu t7, 1 // step inptr past the previous sample (delay slot)
1073
1074 2:
1075 lbu t0, 0(t7)
1076 addiu t7, 1
1077 sll t1, t0, 1
1078 addu t2, t0, t1 // t2 = invalue * 3
1079 lbu t3, -2(t7) // t3 = previous sample
1080 lbu t4, 0(t7) // t4 = next sample
1081 addiu t3, 1
1082 addiu t4, 2
1083 addu t3, t3, t2
1084 addu t4, t4, t2
1085 srl t3, 2 // t3 = (invalue*3 + previous + 1) >> 2
1086 srl t4, 2 // t4 = (invalue*3 + next + 2) >> 2
1087 sb t3, 0(s2)
1088 sb t4, 1(s2)
1089 addiu t8, -1
1090 bgtz t8, 2b
1091 addiu s2, 2 // (delay slot)
1092
1093 22:
1094 lbu t0, 0(t7)
1095 lbu t2, -1(t7)
1096 sll t1, t0, 1
1097 addu t1, t1, t0 // t1 = invalue * 3
1098 addu t1, t1, t2
1099 addiu t1, 1
1100 srl t1, t1, 2 // t1 = (invalue*3 + previous + 1) >> 2
1101 sb t1, 0(s2)
1102 sb t0, 1(s2) // last output sample is the last input sample
1103 addiu s1, 4 // next output row pointer
1104 bne s1, s0, 0b
1105 addiu a2, 4 // ++input_data (delay slot)
1106 3:
1107 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1108
1109 j ra
1110 nop
1111 END(jsimd_h2v1_fancy_upsample_mips_dspr2)
1112
1113 /*****************************************************************************/
1114 LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
1115 /*
1116 * a0 - cinfo->image_width
1117 * a1 - cinfo->max_v_samp_factor
1118 * a2 - compptr->v_samp_factor
1119 * a3 - compptr->width_in_blocks
1120 * 16(sp) - input_data
1121 * 20(sp) - output_data
 *
 * For each of v_samp_factor rows, averages horizontal pairs of input
 * samples into one output sample, with a rounding bias that alternates
 * 0, 1, 0, 1, ... Loop 1 produces four outputs per pass, loop 2 the
 * remaining (image_width / 2) & 3 outputs, and labels 3-5 fill the rest of
 * the output row from the last input sample.
 *
 * a1 is not read in this routine.
 *
 * The stack arguments are read at 40(sp)/44(sp), i.e. 24 bytes above their
 * entry offsets; presumably SAVE_REGS_ON_STACK 24 lowers sp by 24 (macro
 * defined outside this file - confirm).
 *
 * NOTE(review): s2 (the pad-store count) is computed once before label 0
 * but counted down to zero by loop 4, so rows after the first skip the
 * padding stores. Confirm whether v_samp_factor > 1 can reach this code.
 * NOTE(review): loop 1 tests s4 only after the first pass, so it runs once
 * even when image_width < 8 - confirm callers guarantee a minimum width.
1122 */
1123 .set at
1124
1125 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
1126
1127 beqz a2, 7f // nothing to do for zero rows
1128 lw s1, 44(sp) // s1 = output_data
1129 lw s0, 40(sp) // s0 = input_data
1130 srl s2, a0, 2 // s2 = image_width / 4
1131 andi t9, a0, 2 // t9 = image_width & 2
1132 srl t7, t9, 1
1133 addu s2, t7, s2 // s2 = image_width / 4 + ((image_width & 2) >> 1)
1134 sll t0, a3, 3 // t0 = width_in_blocks*DCT
1135 srl t7, t0, 1 // t7 = width_in_blocks * 4
1136 subu s2, t7, s2 // s2 = number of halfword pad stores in loop 4
1137 0:
1138 andi t6, a0, 1 // t6 = temp_index
1139 addiu t6, -1 // t6 = 0 for odd image_width, -1 for even
1140 lw t4, 0(s1) // t4 = outptr
1141 lw t5, 0(s0) // t5 = inptr0
1142 li s3, 0 // s3 = bias
1143 srl t7, a0, 1 // t7 = image_width1
1144 srl s4, t7, 2 // s4 = number of 4-output passes
1145 andi t8, t7, 3 // t8 = leftover outputs for loop 2
1146 1:
1147 ulhu t0, 0(t5) // each ulhu fetches one input pair
1148 ulhu t1, 2(t5)
1149 ulhu t2, 4(t5)
1150 ulhu t3, 6(t5)
1151 raddu.w.qb t0, t0 // sum of the two bytes of the pair
1152 raddu.w.qb t1, t1
1153 raddu.w.qb t2, t2
1154 raddu.w.qb t3, t3
1155 shra.ph t0, t0, 1 // sum >> 1 (bias 0)
1156 shra_r.ph t1, t1, 1 // (sum + 1) >> 1 (bias 1)
1157 shra.ph t2, t2, 1 // sum >> 1 (bias 0)
1158 shra_r.ph t3, t3, 1 // (sum + 1) >> 1 (bias 1)
1159 sb t0, 0(t4)
1160 sb t1, 1(t4)
1161 sb t2, 2(t4)
1162 sb t3, 3(t4)
1163 addiu s4, -1
1164 addiu t4, 4
1165 bgtz s4, 1b
1166 addiu t5, 8 // inptr0 += 8 (delay slot)
1167 beqz t8, 3f // no leftover outputs
1168 addu s4, t4, t8 // s4 = end address of loop 2 (delay slot)
1169 2:
1170 ulhu t0, 0(t5)
1171 raddu.w.qb t0, t0
1172 addqh.w t0, t0, s3 // t0 = (sum + bias) >> 1
1173 xori s3, s3, 1 // toggle bias
1174 sb t0, 0(t4)
1175 addiu t4, 1
1176 bne t4, s4, 2b
1177 addiu t5, 2 // inptr0 += 2 (delay slot)
1178 3:
1179 lbux t1, t6(t5) // t1 = inptr0[temp_index]
1180 sll t1, 1 // doubled, i.e. a pair of two equal samples
1181 addqh.w t2, t1, s3 // t2 = pixval1
1182 xori s3, s3, 1
1183 addqh.w t3, t1, s3 // t3 = pixval2
1184 blez s2, 5f // no pad stores to do
1185 append t3, t2, 8 // t3 = (pixval2 << 8) | pixval1 (delay slot)
1186 addu t5, t4, s2 // t5 = loop_end2 (not read by loop 4)
1187 4:
1188 ush t3, 0(t4) // store two pad samples
1189 addiu s2, -1
1190 bgtz s2, 4b
1191 addiu t4, 2 // (delay slot)
1192 5:
1193 beqz t9, 6f // skip unless image_width & 2 is set
1194 nop
1195 sb t2, 0(t4) // one more pad sample
1196 6:
1197 addiu s1, 4 // ++output_data
1198 addiu a2, -1 // --v_samp_factor
1199 bnez a2, 0b
1200 addiu s0, 4 // ++input_data (delay slot)
1201 7:
1202 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
1203
1204 j ra
1205 nop
1206 END(jsimd_h2v1_downsample_mips_dspr2)
1207
1208 /*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
/*
 * 2:1 horizontal and 2:1 vertical downsampling.  Each output sample is the
 * sum of a 2x2 block of input samples plus a rounding bias that alternates
 * 1, 2, 1, 2, ..., shifted right by 2.  After image_width / 2 samples have
 * been produced, the rest of the output row (up to width_in_blocks * 8
 * samples) is padded with values derived from the last input column.
 *
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor (unused)
 * a2     - compptr->v_samp_factor   (number of output rows)
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * Row-invariant values (computed once, never modified inside the row loop):
 *   s3 = (image_width / 2) / 4   iteration count of the 4-sample loop
 *   s5 = (image_width / 2) & 3   samples left for the 1-sample loop
 *   v0 = number of padding halfwords per row
 *   t6 = index of the last valid input column relative to the input pointer
 *   t9 = non-zero if one extra padding byte is needed per row
 * The loops below consume s4, t8 and s2, so those working copies are
 * reloaded from s3, s5 and v0 at the top of every row.
 */
    .set at
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz        a2, 8f          // no rows to produce
     lw         s1, 52(sp)      // s1 = output_data
    lw          s0, 48(sp)      // s0 = input_data

    andi        t6, a0, 1
    addiu       t6, -1          // t6 = temp_index (0 if width is odd, else -1)
    srl         t7, a0, 1       // t7 = image_width1 = image_width / 2
    srl         s3, t7, 2       // s3 = image_width1 / 4
    andi        s5, t7, 3       // s5 = image_width1 & 3
    andi        t9, a0, 2
    srl         v0, a0, 2
    srl         t7, t9, 1
    addu        v0, t7, v0
    sll         t0, a3, 3       // t0 = width_in_blocks * DCTSIZE
    srl         t7, t0, 1
    subu        v0, t7, v0      // v0 = padding halfwords per row
0:
    /* Per-row setup: reload the counters that the loops below consume. */
    move        s4, s3          // s4 = 4-sample iterations for this row
    move        t8, s5          // t8 = leftover samples for this row
    move        s2, v0          // s2 = padding halfwords for this row
    lw          t4, 0(s1)       // t4 = outptr
    lw          t5, 0(s0)       // t5 = inptr0
    lw          s7, 4(s0)       // s7 = inptr1
    li          s6, 1           // s6 = bias
2:
    /* Four output samples per iteration (eight input columns, two rows). */
    ulw         t0, 0(t5)       // t0 = |P3|P2|P1|P0|
    ulw         t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
    ulw         t2, 4(t5)
    ulw         t3, 4(s7)
    precrq.ph.w t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
    ins         t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb  t1, t7          // t1 = P2 + P3 + Q2 + Q3
    raddu.w.qb  t0, t0          // t0 = P0 + P1 + Q0 + Q1
    shra_r.w    t1, t1, 2       // (sum + 2) >> 2
    addiu       t0, 1
    srl         t0, 2           // (sum + 1) >> 2
    precrq.ph.w t7, t2, t3
    ins         t2, t3, 16, 16
    raddu.w.qb  t7, t7
    raddu.w.qb  t2, t2
    shra_r.w    t7, t7, 2
    addiu       t2, 1
    srl         t2, 2
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t7, 3(t4)
    addiu       t4, 4
    addiu       t5, 8
    addiu       s4, s4, -1
    bgtz        s4, 2b
     addiu      s7, 8
    beqz        t8, 4f
     addu       t8, t4, t8      // t8 = end address of the 1-sample loop
3:
    /* One output sample per iteration for the leftover samples. */
    ulhu        t0, 0(t5)
    ulhu        t1, 0(s7)
    ins         t0, t1, 16, 16
    raddu.w.qb  t0, t0
    addu        t0, t0, s6
    srl         t0, 2
    xori        s6, s6, 3       // toggle bias between 1 and 2
    sb          t0, 0(t4)
    addiu       t5, 2
    addiu       t4, 1
    bne         t8, t4, 3b
     addiu      s7, 2
4:
    /* Build the two padding values from the last input column. */
    lbux        t1, t6(t5)
    sll         t1, 1
    lbux        t0, t6(s7)
    sll         t0, 1
    addu        t1, t1, t0
    addu        t3, t1, s6
    srl         t0, t3, 2       // t0 = pixval1
    xori        s6, s6, 3
    addu        t2, t1, s6
    srl         t1, t2, 2       // t1 = pixval2
    blez        s2, 6f
     append     t1, t0, 8       // t1 = (pixval2 << 8) | pixval1
5:
    /* Pad two samples at a time. */
    ush         t1, 0(t4)
    addiu       s2, -1
    bgtz        s2, 5b
     addiu      t4, 2
6:
    beqz        t9, 7f
     nop
    sb          t0, 0(t4)       // one final padding sample
7:
    addiu       s1, 4           // next output row pointer
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 8           // advance input by two row pointers
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop
END(jsimd_h2v2_downsample_mips_dspr2)
1319 /*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*
 * 2:1 horizontal and 2:1 vertical downsampling with smoothing.  For every
 * output sample a weighted sum of the 2x2 "member" samples and of the
 * surrounding neighbour samples is accumulated in $ac1 and extracted with
 * rounding (extr_r.w ..., 16):
 *     out = (membersum * t6 + neighsum * t7 + 32768) >> 16
 * with t6 = 16384 - smoothing_factor * 80 and t7 = smoothing_factor * 16.
 *
 * a0     - input_data
 * a1     - output_data
 * a2     - compptr->v_samp_factor   (number of output rows)
 * a3     - cinfo->max_v_samp_factor
 * 16(sp) - cinfo->smoothing_factor
 * 20(sp) - compptr->width_in_blocks
 * 24(sp) - cinfo->image_width
 *
 * Loop registers: t4 = outrow, t5 = inrow (advanced by 2 per output row),
 * t8 = outptr, s2 = inptr0, t9 = inptr1, s0 = above_ptr, s1 = below_ptr.
 */

    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s7, 52(sp)      // compptr->width_in_blocks
    lw          s0, 56(sp)      // cinfo->image_width
    lw          s6, 48(sp)      // cinfo->smoothing_factor
    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
    sll         v0, s7, 1
    subu        v0, v0, s0      // v0 = output_cols * 2 - image_width
    blez        v0, 2f          // no right-edge padding needed
     move       v1, zero        // v1 = row counter
    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
0:
    /* Right-edge expansion of rows input_data[-1 .. max_v_samp_factor]:
     * replicate the last real sample v0 times. */
    addiu       t1, a0, -4
    sll         t2, v1, 2
    lwx         t1, t2(t1)      // t1 = input_data[v1 - 1]
    move        t3, v0
    addu        t1, t1, s0      // t1 = row + image_width
    lbu         t2, -1(t1)      // t2 = last real sample of the row
1:
    addiu       t3, t3, -1
    sb          t2, 0(t1)
    bgtz        t3, 1b
     addiu      t1, t1, 1
    addiu       v1, v1, 1
    bne         v1, t0, 0b
     nop
2:
    li          v0, 80
    mul         v0, s6, v0
    li          v1, 16384
    move        t4, zero        // t4 = outrow
    move        t5, zero        // t5 = inrow
    subu        t6, v1, v0      // t6 = 16384 - smoothing_factor * 80
    sll         t7, s6, 4       // t7 = smoothing_factor * 16
3:
    /* Special case for first column: pretend column -1 is same as column 0 */
    sll         v0, t4, 2
    lwx         t8, v0(a1)      // outptr = output_data[outrow]
    sll         v1, t5, 2
    addiu       t9, v1, 4
    addiu       s0, v1, -4
    addiu       s1, v1, 8
    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum
    raddu.w.qb  s3, t0          // s3 = above[0..1] + below[0..1]
    lbu         v0, 0(s2)
    lbu         v1, 2(s2)
    lbu         t0, 0(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6    // ac1 = membersum * t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, 0(s0)
    lbu         t0, 0(s1)
    sll         s3, s3, 1       // edge neighbours count twice
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7    // ac1 += neighsum * t7
    extr_r.w    v0, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    addiu       s1, s1, 2
    sb          v0, -1(t8)
    addiu       s4, s7, -2
    and         s4, s4, 3       // s4 = (output_cols - 2) & 3
    addu        s5, s4, t8      // end address of the one-sample loop
4:
    /* One output sample per iteration. */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    sb          t2, -1(t8)
    bne         s5, t8, 4b
     addiu      s1, s1, 2
    addiu       s5, s7, -2
    subu        s5, s5, s4
    addu        s5, s5, t8      // end address of the four-sample loop
5:
    /* Four output samples per iteration (unrolled, loads interleaved). */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    lh          v1, 2(t9)
    addu        t0, t0, v0
    lh          v0, 2(s2)
    addu        s3, t0, s3
    lh          t0, 2(s0)
    lh          t1, 2(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16    // sample 0
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 4(s2)
    lbu         t0, 1(t9)
    lbu         t1, 4(t9)
    sb          t2, 0(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 1(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 4(s0)
    addu        t0, t0, v0
    lbu         v0, 1(s0)
    addu        s3, t0, s3
    lbu         t0, 1(s1)
    lbu         t3, 4(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 4(t9)
    addu        t0, t0, v0
    lh          v0, 4(s2)
    addu        s3, t0, s3
    lh          t0, 4(s0)
    lh          t1, 4(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16    // sample 1
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 6(s2)
    lbu         t0, 3(t9)
    lbu         t1, 6(t9)
    sb          t2, 1(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 3(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 6(s0)
    addu        t0, t0, v0
    lbu         v0, 3(s0)
    addu        s3, t0, s3
    lbu         t0, 3(s1)
    lbu         t3, 6(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 6(t9)
    addu        t0, t0, v0
    lh          v0, 6(s2)
    addu        s3, t0, s3
    lh          t0, 6(s0)
    lh          t1, 6(s1)
    madd        $ac1, s3, t7
    extr_r.w    t3, $ac1, 16    // sample 2
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 8(s2)
    lbu         t0, 5(t9)
    lbu         t1, 8(t9)
    sb          t3, 2(t8)
    raddu.w.qb  t2, v0
    lbu         v0, 5(s2)
    addu        t0, t0, t1
    mult        $ac1, t2, t6
    addu        v0, v0, v1
    lbu         t2, 8(s0)
    addu        t0, t0, v0
    lbu         v0, 5(s0)
    addu        s3, t0, s3
    lbu         t0, 5(s1)
    lbu         t3, 8(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    addiu       t8, t8, 4
    addu        t0, t0, v0
    addiu       s2, s2, 8
    addu        s3, t0, s3
    addiu       t9, t9, 8
    madd        $ac1, s3, t7
    extr_r.w    t1, $ac1, 16    // sample 3
    addiu       s0, s0, 8
    addiu       s1, s1, 8
    bne         s5, t8, 5b
     sb         t1, -1(t8)
    /* Special case for last column */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 1(s2)
    lbu         t0, -1(t9)
    lbu         t1, 1(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 1(s0)
    addu        t0, t0, v0
    lbu         t3, 1(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t0, $ac1, 16
    sb          t0, 0(t8)
    addiu       t4, t4, 1       // outrow++
    bne         t4, a2, 3b
     addiu      t5, t5, 2       // inrow += 2 (two input rows per output row)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_h2v2_smooth_downsample_mips_dspr2)
1619
1620 /*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
/*
 * Integer-factor upsampling.  For each input row, one output row is built
 * by storing every input sample h_expand times; that output row is then
 * copied into the following v_expand - 1 output rows.  The outer loop runs
 * while outrow < max_v_samp_factor.
 *
 * a0     - upsample->h_expand[compptr->component_index]
 * a1     - upsample->v_expand[compptr->component_index]
 * a2     - input_data
 * a3     - output_data_ptr
 * 16(sp) - cinfo->output_width
 * 20(sp) - cinfo->max_v_samp_factor
 *
 * t6 (inrow) and s3 (outrow) are row indices; they are scaled by 4 (the
 * stride between row pointers) before being added to the pointer arrays.
 * s0 always holds the base of output_data and is never advanced.
 * v0/v1 are used as scratch.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw          s0, 0(a3)       // s0 = output_data
    lw          s1, 32(sp)      // s1 = cinfo->output_width
    lw          s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
    li          t6, 0           // t6 = inrow
    blez        s2, 10f         // no output rows requested
     li         s3, 0           // s3 = outrow
    beqz        s1, 10f         // zero-width rows: nothing to write
     nop
0:
    /* Generate one output row with horizontal expansion. */
    sll         t0, t6, 2
    sll         t7, s3, 2
    addu        t0, a2, t0      // t0 = &input_data[inrow]
    addu        t7, s0, t7      // t7 = &output_data[outrow]
    lw          t3, 0(t0)       // t3 = inptr
    lw          t8, 0(t7)       // t8 = outptr
    addu        t5, t8, s1      // t5 = outend
1:
    lbu         t2, 0(t3)       // t2 = invalue = *inptr++
    addiu       t3, 1
    beqz        a0, 3f
     move       t0, a0          // t0 = h_expand
2:
    sb          t2, 0(t8)
    addiu       t0, -1
    bgtz        t0, 2b
     addiu      t8, 1
3:
    bgt         t5, t8, 1b
     nop

    /* Duplicate the row just generated into the next v_expand - 1 rows. */
    addiu       t9, a1, -1      // t9 = v_expand - 1
    blez        t9, 9f
     nop
    lw          v1, 0(t7)       // v1 = output_data[outrow] (copy source)
    addiu       v0, t7, 4       // v0 = &output_data[outrow + 1]
5:
    move        t3, v1          // t3 = source pointer
    lw          t4, 0(v0)       // t4 = destination row
    subu        t0, s1, 0xF
    blez        t0, 7f          // fewer than 16 bytes: byte loop only
     addu       t5, t3, s1      // t5 = end address
    andi        t7, s1, 0xF     // t7 = residual
    subu        t8, t5, t7      // t8 = end of the 16-byte loop
6:
    ulw         t0, 0(t3)
    ulw         t1, 4(t3)
    ulw         t2, 8(t3)
    usw         t0, 0(t4)
    ulw         t0, 12(t3)
    usw         t1, 4(t4)
    usw         t2, 8(t4)
    usw         t0, 12(t4)
    addiu       t3, 16
    bne         t3, t8, 6b
     addiu      t4, 16
    beqz        t7, 8f
     nop
7:
    lbu         t0, 0(t3)
    sb          t0, 0(t4)
    addiu       t3, 1
    bne         t3, t5, 7b
     addiu      t4, 1
8:
    addiu       t9, -1
    bgtz        t9, 5b
     addiu      v0, 4           // next destination row pointer
9:
    addu        s3, s3, a1      // outrow += v_expand
    slt         t0, s3, s2
    bnez        t0, 0b          // while (outrow < max_v_samp_factor)
     addiu      t6, 1           // inrow++
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j           ra
     nop
END(jsimd_int_upsample_mips_dspr2)
1706
1707 /*****************************************************************************/
1708 LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
1709 /*
 * 2:1 horizontal upsampling: every input sample is stored twice in the
 * output row.  Each row is handled 16 output bytes (8 input bytes) at a
 * time, then a tail loop writes the remaining (output_width & 0xf) bytes
 * two at a time.  Uses only t0-t9, so nothing is saved on the stack.
 *
1710 * a0 - cinfo->max_v_samp_factor
1711 * a1 - cinfo->output_width
1712 * a2 - input_data
1713 * a3 - output_data_ptr
1714 */
1715 lw t7, 0(a3) // t7 = output_data
1716 andi t8, a1, 0xf // t8 = residual
1717 sll t0, a0, 2 // t0 = max_v_samp_factor * 4 (row pointers are 4 bytes apart)
1718 blez a0, 4f
1719 addu t9, t7, t0 // t9 = output_data end address
1720 0:
// Row loop: t7 walks the output row pointers, a2 the input row pointers.
1721 lw t5, 0(t7) // t5 = outptr
1722 lw t6, 0(a2) // t6 = inptr
1723 addu t3, t5, a1 // t3 = outptr + output_width (end address)
1724 subu t3, t8 // t3 = end address - residual
1725 beq t5, t3, 2f
1726 move t4, t8
1727 1:
// Main loop: 8 input samples -> 16 output samples per iteration.
1728 ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
1729 ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
1730 srl t1, t0, 16 // t1 = |X|X|P3|P2|
1731 ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
1732 ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
1733 ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
1734 ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
1735 usw t0, 0(t5)
1736 usw t1, 4(t5)
1737 srl t0, t2, 16 // t0 = |X|X|P7|P6|
1738 ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
1739 ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
1740 ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
1741 ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
1742 usw t2, 8(t5)
1743 usw t0, 12(t5)
1744 addiu t5, 16
1745 bne t5, t3, 1b
1746 addiu t6, 8
1747 beqz t8, 3f
1748 move t4, t8
1749 2:
// Tail loop: one input sample -> two output bytes; t4 counts remaining bytes.
1750 lbu t1, 0(t6)
1751 sb t1, 0(t5)
1752 sb t1, 1(t5)
1753 addiu t4, -2
1754 addiu t6, 1
1755 bgtz t4, 2b
1756 addiu t5, 2
1757 3:
1758 addiu t7, 4 // next output row pointer
1759 bne t9, t7, 0b
1760 addiu a2, 4 // next input row pointer (branch delay slot)
1761 4:
1762 j ra
1763 nop
1764 END(jsimd_h2v1_upsample_mips_dspr2)
1765
1766 /*****************************************************************************/
1767 LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
1768 /*
 * 2:1 horizontal and 2:1 vertical upsampling.  For each input row, the
 * first output row of a pair is built by storing every input sample twice
 * (labels 1/2), then that output row is copied into the second output row
 * of the pair (labels 4/5).  Uses only t0-t9; a0 and a2 are modified.
 *
1769 * a0 - cinfo->max_v_samp_factor
1770 * a1 - cinfo->output_width
1771 * a2 - input_data
1772 * a3 - output_data_ptr
1773 */
1774 lw t7, 0(a3) // t7 = output_data
1775 blez a0, 7f
1776 andi t9, a1, 0xf // t9 = residual
1777 0:
// Loop over pairs of output rows.
1778 lw t6, 0(a2) // t6 = inptr
1779 lw t5, 0(t7) // t5 = outptr
1780 addu t8, t5, a1 // t8 = outptr end address
1781 subu t8, t9 // t8 = end address - residual
1782 beq t5, t8, 2f
1783 move t4, t9
1784 1:
// Main loop: 8 input samples -> 16 output samples per iteration.
1785 ulw t0, 0(t6)
1786 srl t1, t0, 16
1787 ins t0, t0, 16, 16
1788 ins t0, t0, 8, 16
1789 ins t1, t1, 16, 16
1790 ins t1, t1, 8, 16
1791 ulw t2, 4(t6)
1792 usw t0, 0(t5)
1793 usw t1, 4(t5)
1794 srl t3, t2, 16
1795 ins t2, t2, 16, 16
1796 ins t2, t2, 8, 16
1797 ins t3, t3, 16, 16
1798 ins t3, t3, 8, 16
1799 usw t2, 8(t5)
1800 usw t3, 12(t5)
1801 addiu t5, 16
1802 bne t5, t8, 1b
1803 addiu t6, 8
1804 beqz t9, 3f
1805 move t4, t9
1806 2:
// Tail loop: one input sample -> two output bytes; t4 counts remaining bytes.
1807 lbu t0, 0(t6)
1808 sb t0, 0(t5)
1809 sb t0, 1(t5)
1810 addiu t4, -2
1811 addiu t6, 1
1812 bgtz t4, 2b
1813 addiu t5, 2
1814 3:
// Copy the finished output row into the second row of the pair.
1815 lw t6, 0(t7) // t6 = outptr[0]
1816 lw t5, 4(t7) // t5 = outptr[1]
1817 addu t4, t6, a1 // t4 = new end address
1818 beq a1, t9, 5f // output_width < 16: byte copy only
1819 subu t8, t4, t9 // t8 = end of the 16-byte copy loop
1820 4:
// 16 bytes per iteration.
1821 ulw t0, 0(t6)
1822 ulw t1, 4(t6)
1823 ulw t2, 8(t6)
1824 usw t0, 0(t5)
1825 ulw t0, 12(t6)
1826 usw t1, 4(t5)
1827 usw t2, 8(t5)
1828 usw t0, 12(t5)
1829 addiu t6, 16
1830 bne t6, t8, 4b
1831 addiu t5, 16
1832 beqz t9, 6f
1833 nop
1834 5:
// Byte copy of the residual.
1835 lbu t0, 0(t6)
1836 sb t0, 0(t5)
1837 addiu t6, 1
1838 bne t6, t4, 5b
1839 addiu t5, 1
1840 6:
1841 addiu t7, 8 // skip the two output row pointers just filled
1842 addiu a0, -2 // two output rows done
1843 bgtz a0, 0b
1844 addiu a2, 4 // next input row pointer (branch delay slot)
1845 7:
1846 j ra
1847 nop
1848 END(jsimd_h2v2_upsample_mips_dspr2)
1849
1850 /*****************************************************************************/
1851 LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
1852 /*
 * Two-pass 8x8 inverse DCT ("islow" variant).
 * Pass 1 (labels 1-3) walks the 8 columns of coef_block, dequantizes with
 * the 16-bit entries of the table at a1 and writes 32-bit results into a
 * 256-byte workspace allocated on the stack.
 * Pass 2 (labels 4-6) walks the 8 workspace rows, and for each row loads an
 * output pointer from the array at a2, maps every result through the
 * range_limit table (index masked with 0x3ff) and stores 8 bytes.
 * Both passes have a shortcut for the case where all AC terms are zero.
 *
1853 * a0 - coef_block
1854 * a1 - compptr->dcttable
1855 * a2 - output
1856 * a3 - range_limit
1857 */
1858
1859 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1860
1861 addiu sp, sp, -256 // workspace: 8 x 8 words
1862 move v0, sp // v0 = wsptr
1863 addiu v1, zero, 8 // v1 = DCTSIZE = 8
1864 1:
// Pass 1: one column per iteration.
1865 lh s4, 32(a0) // s4 = inptr[16]
1866 lh s5, 64(a0) // s5 = inptr[32]
1867 lh s6, 96(a0) // s6 = inptr[48]
1868 lh t1, 112(a0) // t1 = inptr[56]
1869 lh t7, 16(a0) // t7 = inptr[8]
1870 lh t5, 80(a0) // t5 = inptr[40]
1871 lh t3, 48(a0) // t3 = inptr[24]
1872 or s4, s4, t1
1873 or s4, s4, t3
1874 or s4, s4, t5
1875 or s4, s4, t7
1876 or s4, s4, s5
1877 or s4, s4, s6
1878 bnez s4, 2f // any non-zero AC term -> full column transform
1879 addiu v1, v1, -1
// All AC terms are zero: the whole column equals the scaled DC term.
1880 lh s5, 0(a1) // quantptr[DCTSIZE*0]
1881 lh s6, 0(a0) // inptr[DCTSIZE*0]
1882 mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
1883 sll s5, s5, 2
1884 sw s5, 0(v0)
1885 sw s5, 32(v0)
1886 sw s5, 64(v0)
1887 sw s5, 96(v0)
1888 sw s5, 128(v0)
1889 sw s5, 160(v0)
1890 sw s5, 192(v0)
1891 b 3f
1892 sw s5, 224(v0)
1893 2:
// Odd part (rows 7, 3, 5, 1) is computed first; t1/t3/t5/t7 were loaded above.
1894 lh t0, 112(a1)
1895 lh t2, 48(a1)
1896 lh t4, 80(a1)
1897 lh t6, 16(a1)
1898 mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
1899 mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
1900 mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
1901 mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
1902 lh t4, 32(a1)
1903 lh t5, 32(a0)
1904 lh t6, 96(a1)
1905 lh t7, 96(a0)
1906 addu s0, t0, t1 // z3 = tmp0 + tmp2
1907 addu s1, t1, t2 // z2 = tmp1 + tmp2
1908 addu s2, t2, t3 // z4 = tmp1 + tmp3
1909 addu s3, s0, s2 // z3 + z4
1910 addiu t9, zero, 9633 // FIX_1_175875602
1911 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1912 addu t8, t0, t3 // z1 = tmp0 + tmp3
1913 addiu t9, zero, 2446 // FIX_0_298631336
1914 mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1915 addiu t9, zero, 16819 // FIX_2_053119869
1916 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1917 addiu t9, zero, 25172 // FIX_3_072711026
1918 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1919 addiu t9, zero, 12299 // FIX_1_501321110
1920 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1921 addiu t9, zero, 16069 // FIX_1_961570560
1922 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1923 addiu t9, zero, 3196 // FIX_0_390180644
1924 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1925 addiu t9, zero, 7373 // FIX_0_899976223
1926 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1927 addiu t9, zero, 20995 // FIX_2_562915447
1928 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
// The products above use positive constants, so the negated terms are
// applied with subu below.
1929 subu s0, s3, s0 // z3 += z5
1930 addu t0, t0, s0 // tmp0 += z3
1931 addu t1, t1, s0 // tmp2 += z3
1932 subu s2, s3, s2 // z4 += z5
1933 addu t2, t2, s2 // tmp1 += z4
1934 addu t3, t3, s2 // tmp3 += z4
1935 subu t0, t0, t8 // tmp0 += z1
1936 subu t1, t1, s1 // tmp2 += z2
1937 subu t2, t2, s1 // tmp1 += z2
1938 subu t3, t3, t8 // tmp3 += z1
// Even part (rows 2, 6, 0, 4).
1939 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
1940 addiu t9, zero, 6270 // FIX_0_765366865
1941 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
1942 lh t4, 0(a1)
1943 lh t5, 0(a0)
1944 lh t6, 64(a1)
1945 lh t7, 64(a0)
1946 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
1947 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
1948 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
1949 addiu t9, zero, 4433 // FIX_0_541196100
1950 addu s3, s0, s1 // z2 + z3
1951 mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1952 addiu t9, zero, 15137 // FIX_1_847759065
1953 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
1954 addu t4, t5, t6
1955 subu t5, t5, t6
1956 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
1957 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
1958 addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1959 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
1960 addu s0, t4, t7 // tmp10 = tmp0 + tmp3
1961 subu s1, t4, t7 // tmp13 = tmp0 - tmp3
1962 addu s2, t5, t6 // tmp11 = tmp1 + tmp2
1963 subu s3, t5, t6 // tmp12 = tmp1 - tmp2
// Combine even and odd parts into the eight column outputs.
1964 addu t4, s0, t3
1965 subu s0, s0, t3
1966 addu t3, s2, t1
1967 subu s2, s2, t1
1968 addu t1, s3, t2
1969 subu s3, s3, t2
1970 addu t2, s1, t0
1971 subu s1, s1, t0
// Descale: rounding arithmetic shift right by 11.
1972 shra_r.w t4, t4, 11
1973 shra_r.w t3, t3, 11
1974 shra_r.w t1, t1, 11
1975 shra_r.w t2, t2, 11
1976 shra_r.w s1, s1, 11
1977 shra_r.w s3, s3, 11
1978 shra_r.w s2, s2, 11
1979 shra_r.w s0, s0, 11
1980 sw t4, 0(v0)
1981 sw t3, 32(v0)
1982 sw t1, 64(v0)
1983 sw t2, 96(v0)
1984 sw s1, 128(v0)
1985 sw s3, 160(v0)
1986 sw s2, 192(v0)
1987 sw s0, 224(v0)
1988 3:
1989 addiu a1, a1, 2 // next quantization table column
1990 addiu a0, a0, 2 // next coefficient column
1991 bgtz v1, 1b
1992 addiu v0, v0, 4 // next workspace column (branch delay slot)
1993 move v0, sp // v0 = wsptr, back at the start of the workspace
1994 addiu v1, zero, 8
1995 4:
// Pass 2: one workspace row (32 bytes) per iteration.
1996 lw t0, 8(v0) // z2 = (JLONG) wsptr[2]
1997 lw t1, 24(v0) // z3 = (JLONG) wsptr[6]
1998 lw t2, 0(v0) // (JLONG) wsptr[0]
1999 lw t3, 16(v0) // (JLONG) wsptr[4]
2000 lw s4, 4(v0) // (JLONG) wsptr[1]
2001 lw s5, 12(v0) // (JLONG) wsptr[3]
2002 lw s6, 20(v0) // (JLONG) wsptr[5]
2003 lw s7, 28(v0) // (JLONG) wsptr[7]
2004 or s4, s4, t0
2005 or s4, s4, t1
2006 or s4, s4, t3
2007 or s4, s4, s7
2008 or s4, s4, s5
2009 or s4, s4, s6
2010 bnez s4, 5f // any non-zero AC term -> full row transform
2011 addiu v1, v1, -1
// All AC terms are zero: replicate the range-limited DC value to 8 bytes.
2012 shra_r.w s5, t2, 5
2013 andi s5, s5, 0x3ff
2014 lbux s5, s5(a3)
2015 lw s1, 0(a2) // s1 = output pointer for this row
2016 replv.qb s5, s5
2017 usw s5, 0(s1)
2018 usw s5, 4(s1)
2019 b 6f
2020 nop
2021 5:
// Even part.
2022 addu t4, t0, t1 // z2 + z3
2023 addiu t8, zero, 4433 // FIX_0_541196100
2024 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2025 addiu t8, zero, 15137 // FIX_1_847759065
2026 mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
2027 addiu t8, zero, 6270 // FIX_0_765366865
2028 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
2029 addu t4, t2, t3 // (JLONG) wsptr[0] + (JLONG) wsptr[4]
2030 subu t2, t2, t3 // (JLONG) wsptr[0] - (JLONG) wsptr[4]
2031 sll t4, t4, 13 // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
2032 sll t2, t2, 13 // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
2033 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
2034 subu t3, t2, t1 // tmp12 = tmp1 - tmp2
2035 addu t2, t2, t1 // tmp11 = tmp1 + tmp2
2036 addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2037 subu t1, t4, t5 // tmp13 = tmp0 - tmp3
2038 addu t0, t4, t5 // tmp10 = tmp0 + tmp3
// Odd part.
2039 lw t4, 28(v0) // tmp0 = (JLONG) wsptr[7]
2040 lw t6, 12(v0) // tmp2 = (JLONG) wsptr[3]
2041 lw t5, 20(v0) // tmp1 = (JLONG) wsptr[5]
2042 lw t7, 4(v0) // tmp3 = (JLONG) wsptr[1]
2043 addu s0, t4, t6 // z3 = tmp0 + tmp2
2044 addiu t8, zero, 9633 // FIX_1_175875602
2045 addu s1, t5, t7 // z4 = tmp1 + tmp3
2046 addu s2, s0, s1 // z3 + z4
2047 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2048 addu s3, t4, t7 // z1 = tmp0 + tmp3
2049 addu t9, t5, t6 // z2 = tmp1 + tmp2
2050 addiu t8, zero, 16069 // FIX_1_961570560
2051 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
2052 addiu t8, zero, 3196 // FIX_0_390180644
2053 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
2054 addiu t8, zero, 2446 // FIX_0_298631336
2055 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2056 addiu t8, zero, 7373 // FIX_0_899976223
2057 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
2058 addiu t8, zero, 16819 // FIX_2_053119869
2059 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2060 addiu t8, zero, 20995 // FIX_2_562915447
2061 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
2062 addiu t8, zero, 25172 // FIX_3_072711026
2063 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2064 addiu t8, zero, 12299 // FIX_1_501321110
2065 mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2066 subu s0, s2, s0 // z3 += z5
2067 subu s1, s2, s1 // z4 += z5
2068 addu t4, t4, s0
2069 subu t4, t4, s3 // tmp0
2070 addu t5, t5, s1
2071 subu t5, t5, t9 // tmp1
2072 addu t6, t6, s0
2073 subu t6, t6, t9 // tmp2
2074 addu t7, t7, s1
2075 subu t7, t7, s3 // tmp3
// Combine even and odd parts into the eight row outputs.
2076 addu s0, t0, t7
2077 subu t0, t0, t7
2078 addu t7, t2, t6
2079 subu t2, t2, t6
2080 addu t6, t3, t5
2081 subu t3, t3, t5
2082 addu t5, t1, t4
2083 subu t1, t1, t4
// Descale: rounding arithmetic shift right by 18, then mask to a 10-bit
// index into the range_limit table.
2084 shra_r.w s0, s0, 18
2085 shra_r.w t7, t7, 18
2086 shra_r.w t6, t6, 18
2087 shra_r.w t5, t5, 18
2088 shra_r.w t1, t1, 18
2089 shra_r.w t3, t3, 18
2090 shra_r.w t2, t2, 18
2091 shra_r.w t0, t0, 18
2092 andi s0, s0, 0x3ff
2093 andi t7, t7, 0x3ff
2094 andi t6, t6, 0x3ff
2095 andi t5, t5, 0x3ff
2096 andi t1, t1, 0x3ff
2097 andi t3, t3, 0x3ff
2098 andi t2, t2, 0x3ff
2099 andi t0, t0, 0x3ff
2100 lw s1, 0(a2) // s1 = output pointer for this row
2101 lbux s0, s0(a3)
2102 lbux t7, t7(a3)
2103 lbux t6, t6(a3)
2104 lbux t5, t5(a3)
2105 lbux t1, t1(a3)
2106 lbux t3, t3(a3)
2107 lbux t2, t2(a3)
2108 lbux t0, t0(a3)
2109 sb s0, 0(s1)
2110 sb t7, 1(s1)
2111 sb t6, 2(s1)
2112 sb t5, 3(s1)
2113 sb t1, 4(s1)
2114 sb t3, 5(s1)
2115 sb t2, 6(s1)
2116 sb t0, 7(s1)
2117 6:
2118 addiu v0, v0, 32 // next workspace row
2119 bgtz v1, 4b
2120 addiu a2, a2, 4 // next output pointer (branch delay slot)
2121 addiu sp, sp, 256 // release the workspace
2122
2123 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2124
2125 j ra
2126 nop
2127
2128 END(jsimd_idct_islow_mips_dspr2)
2129
2130 /*****************************************************************************/
2131 LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
2132 /*
 * Column pass of the "ifast" inverse DCT.  Each loaded 32-bit word holds two
 * 16-bit values, so every loop iteration handles two adjacent columns with
 * paired-halfword (.ph) instructions; the loop runs until a0 has advanced by
 * 16 bytes (four iterations).  Results are stored to wsptr as packed
 * halfword pairs, one 16-byte row apart.
 *
2133 * a0 - inptr
2134 * a1 - quantptr
2135 * a2 - wsptr
2136 * a3 - mips_idct_ifast_coefs
2137 */
2138
2139 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2140
2141 addiu t9, a0, 16 // end address
2142 or AT, a3, zero // AT = mips_idct_ifast_coefs (constant table base)
2143
2144 0:
// Dequantization pattern: muleq_s.w.phl / .phr multiply the left / right
// halfword pairs, and the following "ins ..., 16, 16" merges the left
// product's low 16 bits into the upper half of the right product.
2145 lw s0, 0(a1) // quantptr[DCTSIZE*0]
2146 lw t0, 0(a0) // inptr[DCTSIZE*0]
2147 lw t1, 16(a0) // inptr[DCTSIZE*1]
2148 muleq_s.w.phl v0, t0, s0 // tmp0 ...
2149 lw t2, 32(a0) // inptr[DCTSIZE*2]
2150 lw t3, 48(a0) // inptr[DCTSIZE*3]
2151 lw t4, 64(a0) // inptr[DCTSIZE*4]
2152 lw t5, 80(a0) // inptr[DCTSIZE*5]
2153 muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
2154 lw t6, 96(a0) // inptr[DCTSIZE*6]
2155 lw t7, 112(a0) // inptr[DCTSIZE*7]
2156 or s4, t1, t2
2157 or s5, t3, t4
2158 bnez s4, 1f // rows 1-2 non-zero -> full transform
2159 ins t0, v0, 16, 16 // ... tmp0
2160 bnez s5, 1f // rows 3-4 non-zero -> full transform
2161 or s6, t5, t6
2162 or s6, s6, t7
2163 bnez s6, 1f // rows 5-7 non-zero -> full transform
// All AC terms of both columns are zero: every row gets the dequantized DC.
// (The first store sits in the branch delay slot; if the branch is taken it
// is overwritten by the store at label 1's end.)
2164 sw t0, 0(a2) // wsptr[DCTSIZE*0]
2165 sw t0, 16(a2) // wsptr[DCTSIZE*1]
2166 sw t0, 32(a2) // wsptr[DCTSIZE*2]
2167 sw t0, 48(a2) // wsptr[DCTSIZE*3]
2168 sw t0, 64(a2) // wsptr[DCTSIZE*4]
2169 sw t0, 80(a2) // wsptr[DCTSIZE*5]
2170 sw t0, 96(a2) // wsptr[DCTSIZE*6]
2171 sw t0, 112(a2) // wsptr[DCTSIZE*7]
2172 addiu a0, a0, 4
2173 b 2f
2174 addiu a1, a1, 4
2175
2176 1:
// Full transform: even part (rows 0, 2, 4, 6) first, then odd part.
2177 lw s1, 32(a1) // quantptr[DCTSIZE*2]
2178 lw s2, 64(a1) // quantptr[DCTSIZE*4]
2179 muleq_s.w.phl v0, t2, s1 // tmp1 ...
2180 muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
2181 lw s0, 16(a1) // quantptr[DCTSIZE*1]
2182 lw s1, 48(a1) // quantptr[DCTSIZE*3]
2183 lw s3, 96(a1) // quantptr[DCTSIZE*6]
2184 muleq_s.w.phl v1, t4, s2 // tmp2 ...
2185 muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
2186 lw s2, 80(a1) // quantptr[DCTSIZE*5]
2187 lw t8, 4(AT) // FIX(1.414213562)
2188 ins t2, v0, 16, 16 // ... tmp1
2189 muleq_s.w.phl v0, t6, s3 // tmp3 ...
2190 muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
2191 ins t4, v1, 16, 16 // ... tmp2
2192 addq.ph s4, t0, t4 // tmp10
2193 subq.ph s5, t0, t4 // tmp11
2194 ins t6, v0, 16, 16 // ... tmp3
2195 subq.ph s6, t2, t6 // tmp12 ...
2196 addq.ph s7, t2, t6 // tmp13
2197 mulq_s.ph s6, s6, t8 // ... tmp12 ...
2198 addq.ph t0, s4, s7 // tmp0
2199 subq.ph t6, s4, s7 // tmp3
2200 muleq_s.w.phl v0, t1, s0 // tmp4 ...
2201 muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
2202 shll_s.ph s6, s6, 1 // x2
2203 lw s3, 112(a1) // quantptr[DCTSIZE*7]
2204 subq.ph s6, s6, s7 // ... tmp12
2205 muleq_s.w.phl v1, t7, s3 // tmp7 ...
2206 muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
2207 ins t1, v0, 16, 16 // ... tmp4
2208 addq.ph t2, s5, s6 // tmp1
2209 subq.ph t4, s5, s6 // tmp2
2210 muleq_s.w.phl v0, t5, s2 // tmp6 ...
2211 muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
2212 ins t7, v1, 16, 16 // ... tmp7
2213 addq.ph s5, t1, t7 // z11
2214 subq.ph s6, t1, t7 // z12
2215 muleq_s.w.phl v1, t3, s1 // tmp5 ...
2216 muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
2217 ins t5, v0, 16, 16 // ... tmp6
2218 ins t3, v1, 16, 16 // ... tmp5
2219 addq.ph s7, t5, t3 // z13
2220 subq.ph v0, t5, t3 // z10
2221 addq.ph t7, s5, s7 // tmp7
2222 subq.ph s5, s5, s7 // tmp11 ...
2223 addq.ph v1, v0, s6 // z5 ...
2224 mulq_s.ph s5, s5, t8 // ... tmp11
2225 lw t8, 8(AT) // FIX(1.847759065)
2226 lw s4, 0(AT) // FIX(1.082392200)
2227 addq.ph s0, t0, t7 // tmp0 + tmp7
2228 subq.ph s1, t0, t7 // tmp0 - tmp7
2229 mulq_s.ph v1, v1, t8 // ... z5
2230 shll_s.ph s5, s5, 1 // x2
2231 lw t8, 12(AT) // FIX(-2.613125930)
2232 sw s0, 0(a2) // wsptr[DCTSIZE*0]
2233 shll_s.ph v0, v0, 1 // x4
2234 mulq_s.ph v0, v0, t8 // tmp12 ...
2235 mulq_s.ph s4, s6, s4 // tmp10 ...
2236 shll_s.ph v1, v1, 1 // x2
2237 addiu a0, a0, 4 // next pair of input columns
2238 addiu a1, a1, 4 // next pair of quantization columns
2239 sw s1, 112(a2) // wsptr[DCTSIZE*7]
2240 shll_s.ph s6, v0, 1 // x4
2241 shll_s.ph s4, s4, 1 // x2
2242 addq.ph s6, s6, v1 // ... tmp12
2243 subq.ph t5, s6, t7 // tmp6
2244 subq.ph s4, s4, v1 // ... tmp10
2245 subq.ph t3, s5, t5 // tmp5
2246 addq.ph s2, t2, t5 // tmp1 + tmp6
2247 addq.ph t1, s4, t3 // tmp4
2248 subq.ph s3, t2, t5 // tmp1 - tmp6
2249 sw s2, 16(a2) // wsptr[DCTSIZE*1]
2250 sw s3, 96(a2) // wsptr[DCTSIZE*6]
2251 addq.ph v0, t4, t3 // tmp2 + tmp5
2252 subq.ph v1, t4, t3 // tmp2 - tmp5
2253 sw v0, 32(a2) // wsptr[DCTSIZE*2]
2254 sw v1, 80(a2) // wsptr[DCTSIZE*5]
2255 addq.ph v0, t6, t1 // tmp3 + tmp4
2256 subq.ph v1, t6, t1 // tmp3 - tmp4
2257 sw v0, 64(a2) // wsptr[DCTSIZE*4]
2258 sw v1, 48(a2) // wsptr[DCTSIZE*3]
2259
2260 2:
2261 bne a0, t9, 0b
2262 addiu a2, a2, 4 // next pair of workspace columns (branch delay slot)
2263
2264 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2265
2266 j ra
2267 nop
2268
2269 END(jsimd_idct_ifast_cols_mips_dspr2)
2270
2271 /*****************************************************************************/
2272 LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
2273 /*
 * Row pass of the "ifast" inverse DCT.  Each iteration consumes two
 * workspace rows (32 bytes of packed halfwords), transposes them into
 * per-column halfword pairs, runs the transform on both rows at once, and
 * stores 8 output bytes to each of two rows taken from output_buf (offset by
 * output_col).  0x80 is added to every output byte (s8 = 0x80808080).
 * a3 is saved on the stack by the prologue because it is reused as a
 * scratch pointer; the coefficient table base is reloaded into AT from
 * 36(sp) at the top of every iteration.
 *
2274 * a0 - wsptr
2275 * a1 - output_buf
2276 * a2 - output_col
2277 * a3 - mips_idct_ifast_coefs
2278 */
2279
2280 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2281
2282 addiu t9, a0, 128 // end address
2283 lui s8, 0x8080
2284 ori s8, s8, 0x8080 // s8 = 0x80808080 (per-byte offset added before storing)
2285
2286 0:
2287 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
2288 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
2289 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
2290 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
2291 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
2292 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
2293 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
2294 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
2295 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
2296 precrq.ph.w t1, s0, t0 // B b
2297 ins t0, s0, 16, 16 // A a
// If everything except column 0 is zero in both rows, take the DC shortcut.
2298 bnez t1, 1f
2299 or s0, t2, s2
2300 bnez s0, 1f
2301 or s0, t4, s4
2302 bnez s0, 1f
2303 or s0, t6, s6
2304 bnez s0, 1f
2305 shll_s.ph s0, t0, 2 // A a
// DC-only rows: replicate each row's scaled DC byte across its 8 outputs.
2306 lw a3, 0(a1) // a3 = first output row pointer
2307 lw AT, 4(a1) // AT = second output row pointer
2308 precrq.ph.w t0, s0, s0 // A A
2309 ins s0, s0, 16, 16 // a a
2310 addu a3, a3, a2
2311 addu AT, AT, a2
2312 precrq.qb.ph t0, t0, t0 // A A A A
2313 precrq.qb.ph s0, s0, s0 // a a a a
2314 addu.qb s0, s0, s8
2315 addu.qb t0, t0, s8
2316 sw s0, 0(a3)
2317 sw s0, 4(a3)
2318 sw t0, 0(AT)
2319 sw t0, 4(AT)
2320 addiu a0, a0, 32 // next two workspace rows
2321 bne a0, t9, 0b
2322 addiu a1, a1, 8 // next two output row pointers (branch delay slot)
2323 b 2f
2324 nop
2325
2326 1:
// Finish the transpose: tN = even-column pair, tN+1 = odd-column pair.
2327 precrq.ph.w t3, s2, t2
2328 ins t2, s2, 16, 16
2329 precrq.ph.w t5, s4, t4
2330 ins t4, s4, 16, 16
2331 precrq.ph.w t7, s6, t6
2332 ins t6, s6, 16, 16
2333 lw t8, 4(AT) // FIX(1.414213562)
2334 addq.ph s4, t0, t4 // tmp10
2335 subq.ph s5, t0, t4 // tmp11
2336 subq.ph s6, t2, t6 // tmp12 ...
2337 addq.ph s7, t2, t6 // tmp13
2338 mulq_s.ph s6, s6, t8 // ... tmp12 ...
2339 addq.ph t0, s4, s7 // tmp0
2340 subq.ph t6, s4, s7 // tmp3
2341 shll_s.ph s6, s6, 1 // x2
2342 subq.ph s6, s6, s7 // ... tmp12
2343 addq.ph t2, s5, s6 // tmp1
2344 subq.ph t4, s5, s6 // tmp2
2345 addq.ph s5, t1, t7 // z11
2346 subq.ph s6, t1, t7 // z12
2347 addq.ph s7, t5, t3 // z13
2348 subq.ph v0, t5, t3 // z10
2349 addq.ph t7, s5, s7 // tmp7
2350 subq.ph s5, s5, s7 // tmp11 ...
2351 addq.ph v1, v0, s6 // z5 ...
2352 mulq_s.ph s5, s5, t8 // ... tmp11
2353 lw t8, 8(AT) // FIX(1.847759065)
2354 lw s4, 0(AT) // FIX(1.082392200)
2355 addq.ph s0, t0, t7 // tmp0 + tmp7
2356 subq.ph s7, t0, t7 // tmp0 - tmp7
2357 mulq_s.ph v1, v1, t8 // ... z5
2358 lw a3, 0(a1) // a3 = first output row pointer
2359 lw t8, 12(AT) // FIX(-2.613125930)
2360 shll_s.ph s5, s5, 1 // x2
2361 addu a3, a3, a2 // a3 += output_col
2362 shll_s.ph v0, v0, 1 // x4
2363 mulq_s.ph v0, v0, t8 // tmp12 ...
2364 mulq_s.ph s4, s6, s4 // tmp10 ...
2365 shll_s.ph v1, v1, 1 // x2
2366 addiu a0, a0, 32 // next two workspace rows
2367 addiu a1, a1, 8 // next two output row pointers
2368 shll_s.ph s6, v0, 1 // x4
2369 shll_s.ph s4, s4, 1 // x2
2370 addq.ph s6, s6, v1 // ... tmp12
2371 shll_s.ph s0, s0, 2
2372 subq.ph t5, s6, t7 // tmp6
2373 subq.ph s4, s4, v1 // ... tmp10
2374 subq.ph t3, s5, t5 // tmp5
2375 shll_s.ph s7, s7, 2
2376 addq.ph t1, s4, t3 // tmp4
2377 addq.ph s1, t2, t5 // tmp1 + tmp6
2378 subq.ph s6, t2, t5 // tmp1 - tmp6
2379 addq.ph s2, t4, t3 // tmp2 + tmp5
2380 subq.ph s5, t4, t3 // tmp2 - tmp5
2381 addq.ph s4, t6, t1 // tmp3 + tmp4
2382 subq.ph s3, t6, t1 // tmp3 - tmp4
2383 shll_s.ph s1, s1, 2
2384 shll_s.ph s2, s2, 2
2385 shll_s.ph s3, s3, 2
2386 shll_s.ph s4, s4, 2
2387 shll_s.ph s5, s5, 2
2388 shll_s.ph s6, s6, 2
// Transpose back and keep the upper byte of every halfword.
2389 precrq.ph.w t0, s1, s0 // B A
2390 ins s0, s1, 16, 16 // b a
2391 precrq.ph.w t2, s3, s2 // D C
2392 ins s2, s3, 16, 16 // d c
2393 precrq.ph.w t4, s5, s4 // F E
2394 ins s4, s5, 16, 16 // f e
2395 precrq.ph.w t6, s7, s6 // H G
2396 ins s6, s7, 16, 16 // h g
2397 precrq.qb.ph t0, t2, t0 // D C B A
2398 precrq.qb.ph s0, s2, s0 // d c b a
2399 precrq.qb.ph t4, t6, t4 // H G F E
2400 precrq.qb.ph s4, s6, s4 // h g f e
2401 addu.qb s0, s0, s8
2402 addu.qb s4, s4, s8
2403 sw s0, 0(a3) // outptr[0/1/2/3] d c b a
2404 sw s4, 4(a3) // outptr[4/5/6/7] h g f e
2405 lw a3, -4(a1) // second output row pointer (a1 was already advanced by 8)
2406 addu.qb t0, t0, s8
2407 addu a3, a3, a2
2408 addu.qb t4, t4, s8
2409 sw t0, 0(a3) // outptr[0/1/2/3] D C B A
2410 bne a0, t9, 0b
2411 sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2412
2413 2:
2414
2415 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2416
2417 j ra
2418 nop
2419
2420 END(jsimd_idct_ifast_rows_mips_dspr2)
2421
2422 /*****************************************************************************/
2423 LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
2424 /*
 * In-place forward DCT (islow variant) over 64 halfwords at a0, in two passes:
 *   loop 1: 8 iterations, each reads/writes 8 consecutive halfwords
 *           (a1 advances 16 bytes per iteration);
 *   loop 2: 8 iterations, each reads/writes 8 halfwords spaced 16 bytes
 *           apart (a0 advances 2 bytes per iteration).
 * NOTE(review): the instruction after each branch is written as if it
 * executes in the branch delay slot (.set noreorder) - confirm in
 * jsimd_mips_dspr2_asm.h.
 *
2425 * a0 - data
2426 */
2427
2428 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2429
2430 lui t0, 6437 // t0..t9: pairs of 16-bit constants for dpa.w.ph (hi|lo)
2431 ori t0, 2260 // t0 = 6437 | 2260
2432 lui t1, 9633
2433 ori t1, 11363 // t1 = 9633 | 11363
2434 lui t2, 0xd39e
2435 ori t2, 0xe6dc // t2 = -11362 | -6436
2436 lui t3, 0xf72d
2437 ori t3, 9633 // t3 = -2259 | 9633
2438 lui t4, 2261
2439 ori t4, 9633 // t4 = 2261 | 9633
2440 lui t5, 0xd39e
2441 ori t5, 6437 // t5 = -11362 | 6437
2442 lui t6, 9633
2443 ori t6, 0xd39d // t6 = 9633 | -11363
2444 lui t7, 0xe6dc
2445 ori t7, 2260 // t7 = -6436 | 2260
2446 lui t8, 4433
2447 ori t8, 10703 // t8 = 4433 | 10703
2448 lui t9, 0xd630
2449 ori t9, 4433 // t9 = -10704 | 4433
2450 li s8, 8 // loop 1 iteration count
2451 move a1, a0 // a1 = running pointer for loop 1; a0 is kept for loop 2
2452 1:
2453 lw s0, 0(a1) // tmp0 = 1|0
2454 lw s1, 4(a1) // tmp1 = 3|2
2455 lw s2, 8(a1) // tmp2 = 5|4
2456 lw s3, 12(a1) // tmp3 = 7|6
2457 packrl.ph s1, s1, s1 // tmp1 = 2|3
2458 packrl.ph s3, s3, s3 // tmp3 = 6|7
2459 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
2460 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
2461 mult $0, $0 // ac0 = 0
2462 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
2463 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
2464 mult $ac1, $0, $0 // ac1 = 0
2465 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
2466 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
2467 mult $ac2, $0, $0 // ac2 = 0
2468 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
2469 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
2470 mult $ac3, $0, $0 // ac3 = 0
2471 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
2472 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
2473 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
2474 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
2475 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2476 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2477 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
2478 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
2479 addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
2480 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
2481 sh s0, 2(a1) // element 1 of this group
2482 sh s1, 6(a1) // element 3
2483 sh s2, 10(a1) // element 5
2484 sh s3, 14(a1) // element 7
2485 mult $0, $0 // ac0 = 0
2486 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
2487 mult $ac1, $0, $0 // ac1 = 0
2488 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
2489 sra s4, s5, 16 // tmp4 = t11
2490 addiu a1, a1, 16 // advance to the next group; stores below use negative offsets
2491 addiu s8, s8, -1
2492 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2493 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2494 addu s2, s5, s4 // tmp2 = t10 + t11
2495 subu s3, s5, s4 // tmp3 = t10 - t11
2496 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
2497 sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
2498 sh s2, -16(a1) // element 0 (only the low halfword of s2 is stored)
2499 sh s3, -8(a1) // element 4
2500 sh s0, -12(a1) // element 2
2501 bgtz s8, 1b
2502 sh s1, -4(a1) // element 6
2503 li t0, 2260 // loop 2 uses scalar constants c0..c10 (sign applied via madd/msub)
2504 li t1, 11363 // c1
2505 li t2, 9633 // c2
2506 li t3, 6436 // c3
2507 li t4, 6437 // c4
2508 li t5, 2261 // c5
2509 li t6, 11362 // c6
2510 li t7, 2259 // c7
2511 li t8, 4433 // c8
2512 li t9, 10703 // c9
2513 li a1, 10704 // c10
2514 li s8, 8 // loop 2 iteration count
2515
2516 2:
2517 lh a2, 0(a0) // 0
2518 lh a3, 16(a0) // 8
2519 lh v0, 32(a0) // 16
2520 lh v1, 48(a0) // 24
2521 lh s4, 64(a0) // 32
2522 lh s5, 80(a0) // 40
2523 lh s6, 96(a0) // 48
2524 lh s7, 112(a0) // 56
2525 addu s2, v0, s5 // tmp2 = 16 + 40
2526 subu s5, v0, s5 // tmp5 = 16 - 40
2527 addu s3, v1, s4 // tmp3 = 24 + 32
2528 subu s4, v1, s4 // tmp4 = 24 - 32
2529 addu s0, a2, s7 // tmp0 = 0 + 56
2530 subu s7, a2, s7 // tmp7 = 0 - 56
2531 addu s1, a3, s6 // tmp1 = 8 + 48
2532 subu s6, a3, s6 // tmp6 = 8 - 48
2533 addu a2, s0, s3 // tmp10 = tmp0 + tmp3
2534 subu v1, s0, s3 // tmp13 = tmp0 - tmp3
2535 addu a3, s1, s2 // tmp11 = tmp1 + tmp2
2536 subu v0, s1, s2 // tmp12 = tmp1 - tmp2
2537 mult s7, t1 // ac0 = tmp7 * c1
2538 madd s4, t0 // ac0 += tmp4 * c0
2539 madd s5, t4 // ac0 += tmp5 * c4
2540 madd s6, t2 // ac0 += tmp6 * c2
2541 mult $ac1, s7, t2 // ac1 = tmp7 * c2
2542 msub $ac1, s4, t3 // ac1 -= tmp4 * c3
2543 msub $ac1, s5, t6 // ac1 -= tmp5 * c6
2544 msub $ac1, s6, t7 // ac1 -= tmp6 * c7
2545 mult $ac2, s7, t4 // ac2 = tmp7 * c4
2546 madd $ac2, s4, t2 // ac2 += tmp4 * c2
2547 madd $ac2, s5, t5 // ac2 += tmp5 * c5
2548 msub $ac2, s6, t6 // ac2 -= tmp6 * c6
2549 mult $ac3, s7, t0 // ac3 = tmp7 * c0
2550 msub $ac3, s4, t1 // ac3 -= tmp4 * c1
2551 madd $ac3, s5, t2 // ac3 += tmp5 * c2
2552 msub $ac3, s6, t3 // ac3 -= tmp6 * c3
2553 extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
2554 extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
2555 extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
2556 extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
2557 addiu s8, s8, -1
2558 addu s4, a2, a3 // tmp4 = tmp10 + tmp11
2559 subu s5, a2, a3 // tmp5 = tmp10 - tmp11
2560 sh s0, 16(a0) // element 8
2561 sh s1, 48(a0) // element 24
2562 sh s2, 80(a0) // element 40
2563 sh s3, 112(a0) // element 56
2564 mult v0, t8 // ac0 = tmp12 * c8
2565 madd v1, t9 // ac0 += tmp13 * c9
2566 mult $ac1, v1, t8 // ac1 = tmp13 * c8
2567 msub $ac1, v0, a1 // ac1 -= tmp12 * c10
2568 addiu a0, a0, 2 // advance to the next halfword; stores below are offset by -2
2569 extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
2570 extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
2571 shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
2572 shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
2573 sh s4, -2(a0) // element 0
2574 sh s5, 62(a0) // element 32
2575 sh s6, 30(a0) // element 16
2576 bgtz s8, 2b
2577 sh s7, 94(a0) // element 48
2578
2579 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2580
2581 jr ra
2582 nop
2583
2584 END(jsimd_fdct_islow_mips_dspr2)
2585
2586 /*****************************************************************************/
2587 LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
2588 /*
 * In-place forward DCT (ifast variant) over 64 halfwords at a0.
 *   loop 0: walks v0 from data to data+128 in 16-byte steps, transforming
 *           8 consecutive halfwords per iteration;
 *   loop 1: walks v0 over the first 16 bytes in 2-byte steps, transforming
 *           8 halfwords spaced 16 bytes apart per iteration.
 * Products with the 8-bit-scaled constants are brought back with >> 8
 * (extr.w ..., 8 / sra ..., 8), without rounding.
 *
2589 * a0 - data
2590 */
2591 .set at
2592 SAVE_REGS_ON_STACK 8, s0, s1
2593 li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2594 li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2595 li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2596 li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2597
2598 move v0, a0
2599 addiu v1, v0, 128 // end address
2600
2601 0:
2602 lw t0, 0(v0) // tmp0 = 1|0
2603 lw t1, 4(v0) // tmp1 = 3|2
2604 lw t2, 8(v0) // tmp2 = 5|4
2605 lw t3, 12(v0) // tmp3 = 7|6
2606 packrl.ph t1, t1, t1 // tmp1 = 2|3
2607 packrl.ph t3, t3, t3 // tmp3 = 6|7
2608 subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
2609 subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
2610 addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
2611 addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
2612 addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
2613 subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
2614 sra t4, t8, 16 // tmp4 = t11
2615 mult $0, $0 // ac0 = 0
2616 dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
2617 mult $ac1, $0, $0 // ac1 = 0
2618 dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
2619 dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98 (dpsx subtracts the dot product)
2620 mult $ac2, $0, $0 // ac2 = 0
2621 dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
2622 mult $ac3, $0, $0 // ac3 = 0
2623 dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
2624 precrq.ph.w t0, t5, t7 // t0 = t6|t5 (upper halves of t5 and t7)
2625 addq.ph t2, t8, t4 // tmp2 = t10 + t11
2626 subq.ph t3, t8, t4 // tmp3 = t10 - t11
2627 extr.w t4, $ac0, 8 // t4 = z1 = ac0 >> 8
2628 mult $0, $0 // ac0 = 0
2629 dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
2630 extr.w t0, $ac1, 8 // t0 = z5
2631 extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
2632 extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
2633 extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
2634 add t6, t1, t0 // t6 = z2
2635 add t7, t7, t0 // t7 = z4
2636 subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
2637 addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
2638 addq.ph t1, t0, t6 // t1 = z13 + z2
2639 subq.ph t6, t0, t6 // t6 = z13 - z2
2640 addq.ph t0, t8, t7 // t0 = z11 + z4
2641 subq.ph t7, t8, t7 // t7 = z11 - z4
2642 addq.ph t5, t4, t9 // t5 (low half) = t13 + z1
2643 subq.ph t4, t9, t4 // t4 (low half) = t13 - z1
2644 sh t2, 0(v0) // element 0
2645 sh t5, 4(v0) // element 2
2646 sh t3, 8(v0) // element 4
2647 sh t4, 12(v0) // element 6
2648 sh t1, 10(v0) // element 5
2649 sh t6, 6(v0) // element 3
2650 sh t0, 2(v0) // element 1
2651 sh t7, 14(v0) // element 7
2652 addiu v0, 16
2653 bne v1, v0, 0b
2654 nop
2655 move v0, a0
2656 addiu v1, v0, 16 // end address for loop 1 (8 halfword steps)
2657
2658 1:
2659 lh t0, 0(v0) // 0
2660 lh t1, 16(v0) // 8
2661 lh t2, 32(v0) // 16
2662 lh t3, 48(v0) // 24
2663 lh t4, 64(v0) // 32
2664 lh t5, 80(v0) // 40
2665 lh t6, 96(v0) // 48
2666 lh t7, 112(v0) // 56
2667 add t8, t0, t7 // t8 = tmp0
2668 sub t7, t0, t7 // t7 = tmp7
2669 add t0, t1, t6 // t0 = tmp1
2670 sub t1, t1, t6 // t1 = tmp6
2671 add t6, t2, t5 // t6 = tmp2
2672 sub t5, t2, t5 // t5 = tmp5
2673 add t2, t3, t4 // t2 = tmp3
2674 sub t3, t3, t4 // t3 = tmp4
2675 add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
2676 sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
2677 sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
2678 ins t8, s0, 16, 16 // t8 = tmp12|tmp13
2679 add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
2680 mult $0, $0 // ac0 = 0
2681 dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
2682 add s0, t4, t2 // s0 = tmp10+tmp11
2683 sub t4, t4, t2 // t4 = tmp10-tmp11
2684 sh s0, 0(v0) // element 0
2685 sh t4, 64(v0) // element 32
2686 extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
2687 addq.ph t4, t8, t2 // t4 (low half) = tmp13 + z1
2688 subq.ph t8, t8, t2 // t8 (low half) = tmp13 - z1
2689 sh t4, 32(v0) // element 16
2690 sh t8, 96(v0) // element 48
2691 add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
2692 add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
2693 add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
2694 andi t4, a1, 0xffff // t4 = 334 (low half of the packed constant)
2695 mul s0, t1, t4
2696 sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2697 ins t1, t3, 16, 16 // t1 = tmp10|tmp12
2698 mult $0, $0 // ac0 = 0
2699 mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
2700 extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
2701 add t2, t7, t8 // t2 = tmp7 + z5
2702 sub t7, t7, t8 // t7 = tmp7 - z5
2703 andi t4, a2, 0xffff // t4 = 139
2704 mul t8, t3, t4
2705 sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2706 andi t4, s1, 0xffff // t4 = 181
2707 mul t6, t0, t4
2708 sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2709 add t0, t6, t8 // t0 = z3 + z2
2710 sub t1, t6, t8 // t1 = z3 - z2
2711 add t3, t6, s0 // t3 = z3 + z4
2712 sub t4, t6, s0 // t4 = z3 - z4
2713 sub t5, t2, t1 // t5 = dataptr[5]
2714 sub t6, t7, t0 // t6 = dataptr[3]
2715 add t3, t2, t3 // t3 = dataptr[1]
2716 add t4, t7, t4 // t4 = dataptr[7]
2717 sh t5, 80(v0) // element 40
2718 sh t6, 48(v0) // element 24
2719 sh t3, 16(v0) // element 8
2720 sh t4, 112(v0) // element 56
2721 addiu v0, 2
2722 bne v0, v1, 1b
2723 nop
2724
2725 RESTORE_REGS_FROM_STACK 8, s0, s1
2726
2727 j ra
2728 nop
2729 END(jsimd_fdct_ifast_mips_dspr2)
2730
2731 /*****************************************************************************/
2732 LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
2733 /*
 * Quantizes 64 halfword coefficients, two per iteration (31 loop
 * iterations plus an unrolled tail for the last pair).
 * For each element i the code computes
 *   sign = (workspace[i] < 0) ? -1 : 1
 *   coef_block[i] = sign * ((((|workspace[i]| + divisors[i + 64]) & 0xffff)
 *                            * (divisors[i] & 0xffff))
 *                           >> (divisors[i + 192] + 16))
 * where the divisors offsets are in halfwords (byte offsets 0, 128, 384).
 * NOTE(review): the three tables are presumably reciprocal, correction
 * and shift - confirm against the C code that fills `divisors`.
 *
 * The loop previously loaded workspace[i] and derived sign/|value| twice
 * per iteration; the first copy was overwritten before being read and has
 * been removed. Results are unchanged.
 *
2734 * a0 - coef_block
2735 * a1 - divisors
2736 * a2 - workspace
2737 */
2738
2739 .set at
2740
2741 SAVE_REGS_ON_STACK 16, s0, s1, s2
2742
2743 addiu v0, a2, 124 // v0 = workspace_end
2744 lh t0, 0(a2) // even element: workspace[i]
2745 lh t1, 0(a1) // divisors[i]
2746 lh t2, 128(a1) // divisors[i + 64]
2747 sra t3, t0, 15 // t3 = 0 or -1 (sign of t0)
2748 sll t3, t3, 1 // t3 = 0 or -2
2749 addiu t3, t3, 1 // t3 = +1 or -1
2750 mul t0, t0, t3 // t0 = |workspace[i]|
2751 lh t4, 384(a1) // divisors[i + 192]
2752 lh t5, 130(a1) // odd element: divisors[i + 1 + 64]
2753 lh t6, 2(a2) // workspace[i + 1]
2754 lh t7, 2(a1) // divisors[i + 1]
2755 lh t8, 386(a1) // divisors[i + 1 + 192]
2756
2757 1:
2758 andi t1, 0xffff // treat divisors[i] as unsigned 16-bit
2759 add t9, t0, t2
2760 andi t9, 0xffff
2761 mul v1, t9, t1
2762 sra s0, t6, 15 // s0 = sign (+1/-1) of the odd element
2763 sll s0, s0, 1
2764 addiu s0, s0, 1
2765 addiu t9, t4, 16 // shift amount = divisors[i + 192] + 16
2766 srav v1, v1, t9
2767 mul v1, v1, t3 // re-apply the sign
2768 mul t6, t6, s0 // t6 = |workspace[i + 1]|
2769 andi t7, 0xffff
2770 addiu a2, a2, 4
2771 addiu a1, a1, 4
2772 add s1, t6, t5
2773 andi s1, 0xffff
2774 sh v1, 0(a0)
2775
2776 mul s2, s1, t7
2777 addiu s1, t8, 16
2778 srav s2, s2, s1
2779 mul s2,s2, s0
2781 lh t1, 0(a1) // preload operands for the next pair
2786 lh t2, 128(a1)
2787 lh t4, 384(a1)
2788 lh t5, 130(a1)
2789 lh t8, 386(a1)
2790 lh t6, 2(a2)
2791 lh t7, 2(a1)
2792 sh s2, 2(a0)
2793 lh t0, 0(a2) // next even element; sign and |value| are derived once, here
2794 sra t3, t0, 15
2795 sll t3, t3, 1
2796 addiu t3, t3, 1
2797 mul t0, t0,t3
2798 bne a2, v0, 1b
2799 addiu a0, a0, 4
2800
2801 andi t1, 0xffff // tail: final pair, same computation as the loop body
2802 add t9, t0, t2
2803 andi t9, 0xffff
2804 mul v1, t9, t1
2805 sra s0, t6, 15
2806 sll s0, s0, 1
2807 addiu s0, s0, 1
2808 addiu t9, t4, 16
2809 srav v1, v1, t9
2810 mul v1, v1, t3
2811 mul t6, t6, s0
2812 andi t7, 0xffff
2813 sh v1, 0(a0)
2814 add s1, t6, t5
2815 andi s1, 0xffff
2816 mul s2, s1, t7
2817 addiu s1, t8, 16
2818 addiu a2, a2, 4 // a2/a1 are not read again in this function
2819 addiu a1, a1, 4
2820 srav s2, s2, s1
2821 mul s2, s2, s0
2822 sh s2, 2(a0)
2823
2824 RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2825
2826 j ra
2827 nop
2828
2829 END(jsimd_quantize_mips_dspr2)
2830
2831 /*****************************************************************************/
2832 LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
2833 /*
 * Quantizes 64 single-precision values, 8 per loop iteration:
 *   coef_block[i] = (short)(trunc(workspace[i] * divisors[i] + 16384.5)
 *                           - 16384)
 * Adding 16384.5 before truncating and subtracting 16384 afterwards
 * rounds to nearest while the intermediate stays positive (for inputs
 * whose product is above -16384.5).
 * The loads, FPU operations and stores of the two groups of four are
 * interleaved, so statement order matters.
 *
2834 * a0 - coef_block
2835 * a1 - divisors
2836 * a2 - workspace
2837 */
2838
2839 .set at
2840
2841 li t1, 0x46800100 //integer representation 16384.5
2842 mtc1 t1, f0 // f0 = 16384.5f (bias added by every madd.s)
2843 li t0, 63 // loop counter: decremented by 8, loop runs while >= 0 (8 passes)
2844 0:
2845 lwc1 f2, 0(a2)
2846 lwc1 f10, 0(a1)
2847 lwc1 f4, 4(a2)
2848 lwc1 f12, 4(a1)
2849 lwc1 f6, 8(a2)
2850 lwc1 f14, 8(a1)
2851 lwc1 f8, 12(a2)
2852 lwc1 f16, 12(a1)
2853 madd.s f2, f0, f2, f10 // f2 = f2 * f10 + f0
2854 madd.s f4, f0, f4, f12
2855 madd.s f6, f0, f6, f14
2856 madd.s f8, f0, f8, f16
2857 lwc1 f10, 16(a1) // start loading divisors for elements 4..7
2858 lwc1 f12, 20(a1)
2859 trunc.w.s f2, f2 // float -> int32, truncating toward zero
2860 trunc.w.s f4, f4
2861 trunc.w.s f6, f6
2862 trunc.w.s f8, f8
2863 lwc1 f14, 24(a1)
2864 lwc1 f16, 28(a1)
2865 mfc1 t1, f2
2866 mfc1 t2, f4
2867 mfc1 t3, f6
2868 mfc1 t4, f8
2869 lwc1 f2, 16(a2) // workspace elements 4..7
2870 lwc1 f4, 20(a2)
2871 lwc1 f6, 24(a2)
2872 lwc1 f8, 28(a2)
2873 madd.s f2, f0, f2, f10
2874 madd.s f4, f0, f4, f12
2875 madd.s f6, f0, f6, f14
2876 madd.s f8, f0, f8, f16
2877 addiu t1, t1, -16384 // remove the integer part of the bias (elements 0..3)
2878 addiu t2, t2, -16384
2879 addiu t3, t3, -16384
2880 addiu t4, t4, -16384
2881 trunc.w.s f2, f2
2882 trunc.w.s f4, f4
2883 trunc.w.s f6, f6
2884 trunc.w.s f8, f8
2885 sh t1, 0(a0)
2886 sh t2, 2(a0)
2887 sh t3, 4(a0)
2888 sh t4, 6(a0)
2889 mfc1 t1, f2
2890 mfc1 t2, f4
2891 mfc1 t3, f6
2892 mfc1 t4, f8
2893 addiu t0, t0, -8
2894 addiu a2, a2, 32
2895 addiu a1, a1, 32
2896 addiu t1, t1, -16384 // remove the integer part of the bias (elements 4..7)
2897 addiu t2, t2, -16384
2898 addiu t3, t3, -16384
2899 addiu t4, t4, -16384
2900 sh t1, 8(a0)
2901 sh t2, 10(a0)
2902 sh t3, 12(a0)
2903 sh t4, 14(a0)
2904 bgez t0, 0b
2905 addiu a0, a0, 16 // NOTE(review): presumably in the branch delay slot - confirm noreorder
2906
2907 j ra
2908 nop
2909
2910 END(jsimd_quantize_float_mips_dspr2)
2911 /*****************************************************************************/
2912 LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
2913 /*
 * Reduced-size inverse DCT producing a 2x2 block of output bytes.
 * Pass 1 dequantizes and transforms columns 0, 1, 3, 5 and 7 (byte
 * offsets 0, 2, 6, 10, 14), using rows 0, 1, 3, 5 and 7 of each, and
 * stores two 32-bit results per column into a 40-byte scratch area on
 * the stack: words 0..4 (offsets 0..16) hold the first output row,
 * words 5..9 (offsets 20..36) hold the second.
 * Pass 2 combines each scratch row into two bytes written to
 * output_buf[0] + output_col and output_buf[1] + output_col.
 *
2914 * a0 - compptr->dct_table
2915 * a1 - coef_block
2916 * a2 - output_buf
2917 * a3 - output_col
2918 */
2919 .set at
2920
2921 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2922
2923 addiu sp, sp, -40 // 10-word scratch area, released before the register restore
2924 move v0, sp
2925 addiu s2, zero, 29692 // scalar copies of the four constants, used by pass 2
2926 addiu s3, zero, -10426
2927 addiu s4, zero, 6967
2928 addiu s5, zero, -5906
2929 lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2930 lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2931 lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2932 lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2933 mul t4, t5, t0
2934 lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2935 lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2936 mul t6, t6, t1
2937 mul t5, t5, t0
2938 lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2939 lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2940 lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2941 lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2942 mul t7, t7, t2
2943 mult zero, zero // ac0 = 0
2944 mul t8, t8, t3
2945 li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2946 li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2947 ins t6, t5, 16, 16 // t6 = t5|t6
2948 sll t4, t4, 15 // tmp10 = dequantized row 0 << 15
2949 dpa.w.ph $ac0, t6, s0 // ac0 += row1*29692 + row3*(-10426)
2950 lh t1, 2(a1) // column 1, row 0 (loads interleaved with column 0 math)
2951 lh t6, 2(a0)
2952 ins t8, t7, 16, 16 // t8 = t7|t8
2953 dpa.w.ph $ac0, t8, s1 // ac0 += row5*6967 + row7*(-5906)
2954 mflo t0, $ac0 // t0 = tmp0 (odd-row sum) for column 0
2955 mul t5, t6, t1
2956 lh t1, 18(a1)
2957 lh t6, 18(a0)
2958 lh t2, 50(a1)
2959 lh t7, 50(a0)
2960 mul t6, t6, t1
2961 subu t8, t4, t0 // tmp10 - tmp0
2962 mul t7, t7, t2
2963 addu t0, t4, t0 // tmp10 + tmp0
2964 shra_r.w t0, t0, 13 // rounded >> 13
2965 lh t1, 82(a1)
2966 lh t2, 82(a0)
2967 lh t3, 114(a1)
2968 lh t4, 114(a0)
2969 shra_r.w t8, t8, 13
2970 mul t1, t1, t2
2971 mul t3, t3, t4
2972 sw t0, 0(v0) // column 0 -> scratch word 0 (first row)
2973 sw t8, 20(v0) // column 0 -> scratch word 5 (second row)
2974 sll t4, t5, 15
2975 ins t7, t6, 16, 16
2976 mult zero, zero
2977 dpa.w.ph $ac0, t7, s0
2978 ins t3, t1, 16, 16
2979 lh t1, 6(a1) // column 3, row 0
2980 lh t6, 6(a0)
2981 dpa.w.ph $ac0, t3, s1
2982 mflo t0, $ac0
2983 mul t5, t6, t1
2984 lh t1, 22(a1)
2985 lh t6, 22(a0)
2986 lh t2, 54(a1)
2987 lh t7, 54(a0)
2988 mul t6, t6, t1
2989 subu t8, t4, t0
2990 mul t7, t7, t2
2991 addu t0, t4, t0
2992 shra_r.w t0, t0, 13
2993 lh t1, 86(a1)
2994 lh t2, 86(a0)
2995 lh t3, 118(a1)
2996 lh t4, 118(a0)
2997 shra_r.w t8, t8, 13
2998 mul t1, t1, t2
2999 mul t3, t3, t4
3000 sw t0, 4(v0) // column 1 -> scratch words 1 and 6
3001 sw t8, 24(v0)
3002 sll t4, t5, 15
3003 ins t7, t6, 16, 16
3004 mult zero, zero
3005 dpa.w.ph $ac0, t7, s0
3006 ins t3, t1, 16, 16
3007 lh t1, 10(a1) // column 5, row 0
3008 lh t6, 10(a0)
3009 dpa.w.ph $ac0, t3, s1
3010 mflo t0, $ac0
3011 mul t5, t6, t1
3012 lh t1, 26(a1)
3013 lh t6, 26(a0)
3014 lh t2, 58(a1)
3015 lh t7, 58(a0)
3016 mul t6, t6, t1
3017 subu t8, t4, t0
3018 mul t7, t7, t2
3019 addu t0, t4, t0
3020 shra_r.w t0, t0, 13
3021 lh t1, 90(a1)
3022 lh t2, 90(a0)
3023 lh t3, 122(a1)
3024 lh t4, 122(a0)
3025 shra_r.w t8, t8, 13
3026 mul t1, t1, t2
3027 mul t3, t3, t4
3028 sw t0, 8(v0) // column 3 -> scratch words 2 and 7
3029 sw t8, 28(v0)
3030 sll t4, t5, 15
3031 ins t7, t6, 16, 16
3032 mult zero, zero
3033 dpa.w.ph $ac0, t7, s0
3034 ins t3, t1, 16, 16
3035 lh t1, 14(a1) // column 7, row 0
3036 lh t6, 14(a0)
3037 dpa.w.ph $ac0, t3, s1
3038 mflo t0, $ac0
3039 mul t5, t6, t1
3040 lh t1, 30(a1)
3041 lh t6, 30(a0)
3042 lh t2, 62(a1)
3043 lh t7, 62(a0)
3044 mul t6, t6, t1
3045 subu t8, t4, t0
3046 mul t7, t7, t2
3047 addu t0, t4, t0
3048 shra_r.w t0, t0, 13
3049 lh t1, 94(a1)
3050 lh t2, 94(a0)
3051 lh t3, 126(a1)
3052 lh t4, 126(a0)
3053 shra_r.w t8, t8, 13
3054 mul t1, t1, t2
3055 mul t3, t3, t4
3056 sw t0, 12(v0) // column 5 -> scratch words 3 and 8
3057 sw t8, 32(v0)
3058 sll t4, t5, 15
3059 ins t7, t6, 16, 16
3060 mult zero, zero
3061 dpa.w.ph $ac0, t7, s0
3062 ins t3, t1, 16, 16
3063 dpa.w.ph $ac0, t3, s1
3064 mflo t0, $ac0
3065 lw t9, 0(a2) // pass 2 begins: t9 = output_buf[0]
3066 lw t3, 0(v0) // scratch word 0 (column 0, first row)
3067 lw t7, 4(v0)
3068 lw t1, 8(v0)
3069 addu t9, t9, a3 // outptr = output_buf[0] + output_col
3070 sll t3, t3, 15
3071 subu t8, t4, t0
3072 addu t0, t4, t0
3073 shra_r.w t0, t0, 13
3074 shra_r.w t8, t8, 13
3075 sw t0, 16(v0) // column 7 -> scratch words 4 and 9
3076 sw t8, 36(v0)
3077 lw t5, 12(v0)
3078 lw t6, 16(v0)
3079 mult t7, s2 // ac0 = ws[1]*29692
3080 madd t1, s3 // ac0 += ws[2]*(-10426)
3081 madd t5, s4 // ac0 += ws[3]*6967
3082 madd t6, s5 // ac0 += ws[4]*(-5906)
3083 lw t5, 24(v0) // second scratch row, words 6..9
3084 lw t7, 28(v0)
3085 mflo t0, $ac0
3086 lw t8, 32(v0)
3087 lw t2, 36(v0)
3088 mult $ac1, t5, s2
3089 madd $ac1, t7, s3
3090 madd $ac1, t8, s4
3091 madd $ac1, t2, s5
3092 addu t1, t3, t0
3093 subu t6, t3, t0
3094 shra_r.w t1, t1, 20 // rounded >> 20
3095 shra_r.w t6, t6, 20
3096 mflo t4, $ac1
3097 shll_s.w t1, t1, 24 // saturating << 24 then >> 24: clamp to signed 8-bit
3098 shll_s.w t6, t6, 24
3099 sra t1, t1, 24
3100 sra t6, t6, 24
3101 addiu t1, t1, 128 // re-centre to 0..255
3102 addiu t6, t6, 128
3103 lw t0, 20(v0) // scratch word 5 (column 0, second row)
3104 sb t1, 0(t9)
3105 sb t6, 1(t9)
3106 sll t0, t0, 15
3107 lw t9, 4(a2) // t9 = output_buf[1]
3108 addu t1, t0, t4
3109 subu t6, t0, t4
3110 addu t9, t9, a3
3111 shra_r.w t1, t1, 20
3112 shra_r.w t6, t6, 20
3113 shll_s.w t1, t1, 24
3114 shll_s.w t6, t6, 24
3115 sra t1, t1, 24
3116 sra t6, t6, 24
3117 addiu t1, t1, 128
3118 addiu t6, t6, 128
3119 sb t1, 0(t9)
3120 sb t6, 1(t9)
3121 addiu sp, sp, 40 // release the scratch area
3122
3123 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3124
3125 j ra
3126 nop
3127
3128 END(jsimd_idct_2x2_mips_dspr2)
3129
3130 /*****************************************************************************/
3131 LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
3132 /*
 * Reduced-size inverse DCT producing a 4x4 block of output bytes.
 * Pass 1 (loops 0 and 1) dequantizes and transforms columns 0..3 and
 * 5..7 (column 4 is not processed), using rows 0, 1, 2, 3, 5, 6, 7, and
 * writes four 32-bit results per column into the caller's workspace, 32
 * bytes apart.
 * Pass 2 is unrolled four times: each copy reads one 32-byte workspace
 * row and stores four bytes to output_buf[k] + output_col.
 * NOTE(review): pass 2 reads the odd workspace entries with lh, i.e. only
 * 16 bits of each 32-bit word (the low half on a little-endian target) -
 * confirm this file is built little-endian only.
 *
3133 * a0 - compptr->dct_table
3134 * a1 - coef_block
3135 * a2 - output_buf
3136 * a3 - output_col
3137 * 16(sp) - workspace[DCTSIZE*4]; // buffers data between passes
3138 */
3139
3140 .set at
3141 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3142
3143 lw v1, 48(sp) // workspace pointer: caller's 16(sp), now 32 bytes further up
3144 move t0, a1
3145 move t1, v1
3146 li t9, 4 // loop 0 handles 4 columns
3147 li s0, 0x2e75f93e // 11893 | -1730 (applied to row5|row7 for temp1)
3148 li s1, 0x21f9ba79 // 8697 | -17799 (applied to row1|row3 for temp1)
3149 li s2, 0xecc2efb0 // -4926 | -4176 (applied to row5|row7 for temp2)
3150 li s3, 0x52031ccd // 20995 | 7373 (applied to row1|row3 for temp2)
3151
3152 0:
3153 lh s6, 32(t0) // inptr[DCTSIZE*2]
3154 lh t6, 32(a0) // quantptr[DCTSIZE*2]
3155 lh s7, 96(t0) // inptr[DCTSIZE*6]
3156 lh t7, 96(a0) // quantptr[DCTSIZE*6]
3157 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3158 lh s4, 0(t0) // inptr[DCTSIZE*0]
3159 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3160 lh s5, 0(a0) // quantptr[0]
3161 li s6, 15137
3162 li s7, 6270
3163 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3164 mul t6, s6, t6 // t6 = z2 * 15137 (FIX_1_847759065)
3165 lh t5, 112(t0) // inptr[DCTSIZE*7]
3166 mul t7, s7, t7 // t7 = z3 * 6270 (FIX_0_765366865)
3167 lh s4, 112(a0) // quantptr[DCTSIZE*7]
3168 lh v0, 80(t0) // inptr[DCTSIZE*5]
3169 lh s5, 80(a0) // quantptr[DCTSIZE*5]
3170 lh s6, 48(a0) // quantptr[DCTSIZE*3]
3171 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3172 lh s7, 16(a0) // quantptr[DCTSIZE*1]
3173 lh t8, 16(t0) // inptr[DCTSIZE*1]
3174 subu t6, t6, t7 // tmp2 = z2 * 15137 - z3 * 6270
3175 lh t7, 48(t0) // inptr[DCTSIZE*3]
3176 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3177 mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3178 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3179 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3180 addu t3, t2, t6 // tmp10 = tmp0 + tmp2
3181 subu t4, t2, t6 // tmp12 = tmp0 - tmp2
3182 mult $ac0, zero, zero
3183 mult $ac1, zero, zero
3184 ins t5, v0, 16, 16 // t5 = row5|row7
3185 ins t7, t8, 16, 16 // t7 = row1|row3
3186 addiu t9, t9, -1
3187 dpa.w.ph $ac0, t5, s0
3188 dpa.w.ph $ac0, t7, s1
3189 dpa.w.ph $ac1, t5, s2
3190 dpa.w.ph $ac1, t7, s3
3191 mflo s4, $ac0 // s4 = temp1
3192 mflo s5, $ac1 // s5 = temp2
3193 addiu a0, a0, 2
3194 addiu t1, t1, 4 // workspace pointer already advanced; stores below are offset by -4
3195 addiu t0, t0, 2
3196 addu t6, t4, s4
3197 subu t5, t4, s4
3198 addu s6, t3, s5
3199 subu s7, t3, s5
3200 shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
3201 shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
3202 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3203 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3204 sw t6, 28(t1) // workspace row 1
3205 sw t5, 60(t1) // workspace row 2
3206 sw s6, -4(t1) // workspace row 0
3207 bgtz t9, 0b
3208 sw s7, 92(t1) // workspace row 3
3209 // second loop: three iterations for columns 5, 6, 7 (all offsets are +2 bytes / +4 in workspace, skipping column 4)
3210 li t9, 3
3211 1:
3212 lh s6, 34(t0) // inptr[DCTSIZE*2]
3213 lh t6, 34(a0) // quantptr[DCTSIZE*2]
3214 lh s7, 98(t0) // inptr[DCTSIZE*6]
3215 lh t7, 98(a0) // quantptr[DCTSIZE*6]
3216 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3217 lh s4, 2(t0) // inptr[DCTSIZE*0]
3218 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3219 lh s5, 2(a0) // quantptr[DCTSIZE*0]
3220 li s6, 15137
3221 li s7, 6270
3222 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3223 mul v0, s6, t6 // v0 = z2 * 15137 (FIX_1_847759065)
3224 lh t5, 114(t0) // inptr[DCTSIZE*7]
3225 mul t7, s7, t7 // t7 = z3 * 6270 (FIX_0_765366865)
3226 lh s4, 114(a0) // quantptr[DCTSIZE*7]
3227 lh s5, 82(a0) // quantptr[DCTSIZE*5]
3228 lh t6, 82(t0) // inptr[DCTSIZE*5]
3229 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3230 lh s6, 50(a0) // quantptr[DCTSIZE*3]
3231 lh t8, 18(t0) // inptr[DCTSIZE*1]
3232 subu v0, v0, t7 // tmp2 = z2 * 15137 - z3 * 6270
3233 lh t7, 50(t0) // inptr[DCTSIZE*3]
3234 lh s7, 18(a0) // quantptr[DCTSIZE*1]
3235 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3236 mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3237 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3238 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3239 addu t3, t2, v0 // tmp10 = tmp0 + tmp2
3240 subu t4, t2, v0 // tmp12 = tmp0 - tmp2
3241 mult $ac0, zero, zero
3242 mult $ac1, zero, zero
3243 ins t5, t6, 16, 16 // t5 = row5|row7
3244 ins t7, t8, 16, 16 // t7 = row1|row3
3245 dpa.w.ph $ac0, t5, s0
3246 dpa.w.ph $ac0, t7, s1
3247 dpa.w.ph $ac1, t5, s2
3248 dpa.w.ph $ac1, t7, s3
3249 mflo t5, $ac0 // t5 = temp1
3250 mflo t6, $ac1 // t6 = temp2
3251 addiu t9, t9, -1
3252 addiu t0, t0, 2
3253 addiu a0, a0, 2
3254 addiu t1, t1, 4
3255 addu s5, t4, t5
3256 subu s4, t4, t5
3257 addu s6, t3, t6
3258 subu s7, t3, t6
3259 shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3260 shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3261 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3262 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3263 sw s5, 32(t1) // workspace row 1
3264 sw s4, 64(t1) // workspace row 2
3265 sw s6, 0(t1) // workspace row 0
3266 bgtz t9, 1b
3267 sw s7, 96(t1) // workspace row 3
3268 move t1, v1 // pass 2: rewind to the start of the workspace (output row 0)
3269 li s4, 15137
3270 lw s6, 8(t1) // wsptr[2]
3271 li s5, 6270
3272 lw s7, 24(t1) // wsptr[6]
3273 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3274 lw t2, 0(t1) // wsptr[0]
3275 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
3276 lh t5, 28(t1) // wsptr[7]
3277 lh t6, 20(t1) // wsptr[5]
3278 lh t7, 12(t1) // wsptr[3]
3279 lh t8, 4(t1) // wsptr[1]
3280 ins t5, t6, 16, 16
3281 ins t7, t8, 16, 16
3282 mult $ac0, zero, zero
3283 dpa.w.ph $ac0, t5, s0
3284 dpa.w.ph $ac0, t7, s1
3285 mult $ac1, zero, zero
3286 dpa.w.ph $ac1, t5, s2
3287 dpa.w.ph $ac1, t7, s3
3288 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3289 mflo s6, $ac0
3290 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3291 subu s4, s4, s5
3292 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3293 mflo s7, $ac1
3294 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3295 addu t7, t4, s6
3296 subu t8, t4, s6
3297 addu t5, t3, s7
3298 subu t6, t3, s7
3299 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3300 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3301 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3302 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3303 sll s4, t9, 2 // NOTE(review): s4 is reloaded below before being read - appears dead
3304 lw v0, 0(a2) // output_buf[ctr]
3305 shll_s.w t5, t5, 24 // saturating << 24 then >> 24: clamp to signed 8-bit
3306 shll_s.w t6, t6, 24
3307 shll_s.w t7, t7, 24
3308 shll_s.w t8, t8, 24
3309 sra t5, t5, 24
3310 sra t6, t6, 24
3311 sra t7, t7, 24
3312 sra t8, t8, 24
3313 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3314 addiu t5, t5, 128 // re-centre to 0..255
3315 addiu t6, t6, 128
3316 addiu t7, t7, 128
3317 addiu t8, t8, 128
3318 sb t5, 0(v0)
3319 sb t7, 1(v0)
3320 sb t8, 2(v0)
3321 sb t6, 3(v0)
3322 // 2 (second output row: workspace offsets +32, output_buf[1])
3323 li s4, 15137
3324 lw s6, 40(t1) // wsptr[2]
3325 li s5, 6270
3326 lw s7, 56(t1) // wsptr[6]
3327 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3328 lw t2, 32(t1) // wsptr[0]
3329 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
3330 lh t5, 60(t1) // wsptr[7]
3331 lh t6, 52(t1) // wsptr[5]
3332 lh t7, 44(t1) // wsptr[3]
3333 lh t8, 36(t1) // wsptr[1]
3334 ins t5, t6, 16, 16
3335 ins t7, t8, 16, 16
3336 mult $ac0, zero, zero
3337 dpa.w.ph $ac0, t5, s0
3338 dpa.w.ph $ac0, t7, s1
3339 mult $ac1, zero, zero
3340 dpa.w.ph $ac1, t5, s2
3341 dpa.w.ph $ac1, t7, s3
3342 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3343 mflo s6, $ac0
3344 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3345 subu s4, s4, s5
3346 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3347 mflo s7, $ac1
3348 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3349 addu t7, t4, s6
3350 subu t8, t4, s6
3351 addu t5, t3, s7
3352 subu t6, t3, s7
3353 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+ 1)
3354 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+ 1)
3355 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+ 1)
3356 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+ 1)
3357 sll s4, t9, 2 // NOTE(review): appears dead, as above
3358 lw v0, 4(a2) // output_buf[ctr]
3359 shll_s.w t5, t5, 24
3360 shll_s.w t6, t6, 24
3361 shll_s.w t7, t7, 24
3362 shll_s.w t8, t8, 24
3363 sra t5, t5, 24
3364 sra t6, t6, 24
3365 sra t7, t7, 24
3366 sra t8, t8, 24
3367 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3368 addiu t5, t5, 128
3369 addiu t6, t6, 128
3370 addiu t7, t7, 128
3371 addiu t8, t8, 128
3372 sb t5, 0(v0)
3373 sb t7, 1(v0)
3374 sb t8, 2(v0)
3375 sb t6, 3(v0)
3376 // 3 (third output row: workspace offsets +64, output_buf[2])
3377 li s4, 15137
3378 lw s6, 72(t1) // wsptr[2]
3379 li s5, 6270
3380 lw s7, 88(t1) // wsptr[6]
3381 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3382 lw t2, 64(t1) // wsptr[0]
3383 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
3384 lh t5, 92(t1) // wsptr[7]
3385 lh t6, 84(t1) // wsptr[5]
3386 lh t7, 76(t1) // wsptr[3]
3387 lh t8, 68(t1) // wsptr[1]
3388 ins t5, t6, 16, 16
3389 ins t7, t8, 16, 16
3390 mult $ac0, zero, zero
3391 dpa.w.ph $ac0, t5, s0
3392 dpa.w.ph $ac0, t7, s1
3393 mult $ac1, zero, zero
3394 dpa.w.ph $ac1, t5, s2
3395 dpa.w.ph $ac1, t7, s3
3396 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3397 mflo s6, $ac0
3398 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3399 subu s4, s4, s5
3400 addu t3, t2, s4 // tmp10 = tmp0 + tmp2
3401 mflo s7, $ac1
3402 subu t4, t2, s4 // tmp12 = tmp0 - tmp2
3403 addu t7, t4, s6
3404 subu t8, t4, s6
3405 addu t5, t3, s7
3406 subu t6, t3, s7
3407 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3408 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3409 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3410 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3411 sll s4, t9, 2 // NOTE(review): appears dead, as above
3412 lw v0, 8(a2) // output_buf[ctr]
3413 shll_s.w t5, t5, 24
3414 shll_s.w t6, t6, 24
3415 shll_s.w t7, t7, 24
3416 shll_s.w t8, t8, 24
3417 sra t5, t5, 24
3418 sra t6, t6, 24
3419 sra t7, t7, 24
3420 sra t8, t8, 24
3421 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3422 addiu t5, t5, 128
3423 addiu t6, t6, 128
3424 addiu t7, t7, 128
3425 addiu t8, t8, 128
3426 sb t5, 0(v0)
3427 sb t7, 1(v0)
3428 sb t8, 2(v0)
3429 sb t6, 3(v0)
3430 li s4, 15137 // fourth output row: workspace offsets +96, output_buf[3]
3431 lw s6, 104(t1) // wsptr[2]
3432 li s5, 6270
3433 lw s7, 120(t1) // wsptr[6]
3434 mul s4, s4, s6 // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
3435 lw t2, 96(t1) // wsptr[0]
3436 mul s5, s5, s7 // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
3437 lh t5, 124(t1) // wsptr[7]
3438 lh t6, 116(t1) // wsptr[5]
3439 lh t7, 108(t1) // wsptr[3]
3440 lh t8, 100(t1) // wsptr[1]
3441 ins t5, t6, 16, 16
3442 ins t7, t8, 16, 16
3443 mult $ac0, zero, zero
3444 dpa.w.ph $ac0, t5, s0
3445 dpa.w.ph $ac0, t7, s1
3446 mult $ac1, zero, zero
3447 dpa.w.ph $ac1, t5, s2
3448 dpa.w.ph $ac1, t7, s3
3449 sll t2, t2, 14 // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
3450 mflo s6, $ac0
3451 // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3452 subu s4, s4, s5
3453 addu t3, t2, s4 // tmp10 = tmp0 + tmp2;
3454 mflo s7, $ac1
3455 subu t4, t2, s4 // tmp12 = tmp0 - tmp2;
3456 addu t7, t4, s6
3457 subu t8, t4, s6
3458 addu t5, t3, s7
3459 subu t6, t3, s7
3460 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3461 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3462 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3463 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3464 sll s4, t9, 2 // NOTE(review): result is not read before the register restore - appears dead
3465 lw v0, 12(a2) // output_buf[ctr]
3466 shll_s.w t5, t5, 24
3467 shll_s.w t6, t6, 24
3468 shll_s.w t7, t7, 24
3469 shll_s.w t8, t8, 24
3470 sra t5, t5, 24
3471 sra t6, t6, 24
3472 sra t7, t7, 24
3473 sra t8, t8, 24
3474 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3475 addiu t5, t5, 128
3476 addiu t6, t6, 128
3477 addiu t7, t7, 128
3478 addiu t8, t8, 128
3479 sb t5, 0(v0)
3480 sb t7, 1(v0)
3481 sb t8, 2(v0)
3482 sb t6, 3(v0)
3483
3484 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3485
3486 j ra
3487 nop
3488 END(jsimd_idct_4x4_mips_dspr2)
3489
3490 /*****************************************************************************/
3491 LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
3492 /*
3493 * a0 - compptr->dct_table
3494 * a1 - coef_block
3495 * a2 - output_buf
3496 * a3 - output_col
 *
 * Reduced-size 6x6 inverse DCT, done in two passes:
 *   Pass 1 walks 6 columns of coef_block, multiplies each 16-bit
 *          coefficient by the matching 16-bit dct_table entry and stores
 *          6 results per column into a 6x6 int work array on the stack.
 *   Pass 2 walks the 6 rows of the work array and stores 6 bytes per row
 *          at output_buf[row] + output_col.
 * a2 is read as a table of 4-byte row pointers (lw + "addiu a2, a2, 4").
 * Callee-saved s0-s7 are saved/restored by the *_REGS_* macros; v0, v1 and
 * t0-t9 are used as scratch.
 * NOTE(review): instructions written directly after a branch are presumably
 * delay-slot instructions (noreorder set by LEAF_MIPS_DSPR2) - confirm in
 * jsimd_mips_dspr2_asm.h.
3497 */
3498 .set at
3499
3500 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3501
3502 addiu sp, sp, -144 // reserve the 6x6 int work array (6 * 6 * 4 bytes)
3503 move v0, sp // v0 = work array pointer for the current column
3504 addiu v1, v0, 24 // v1 = pass 1 end marker (6 columns * 4 bytes)
3505 addiu t9, zero, 5793 // 5793 = round(0.707106781 * 2^13)
3506 addiu s0, zero, 10033 // 10033 = round(1.224744871 * 2^13)
3507 addiu s1, zero, 2998 // 2998 = round(0.366025404 * 2^13)
3508
3509 1: // Pass 1: one column of coef_block per iteration
3510 lh s2, 0(a0) // q0 = quantptr[ 0]
3511 lh s3, 32(a0) // q1 = quantptr[16]
3512 lh s4, 64(a0) // q2 = quantptr[32]
3513 lh t2, 64(a1) // tmp2 = inptr[32]
3514 lh t1, 32(a1) // tmp1 = inptr[16]
3515 lh t0, 0(a1) // tmp0 = inptr[ 0]
3516 mul t2, t2, s4 // tmp2 = tmp2 * q2
3517 mul t1, t1, s3 // tmp1 = tmp1 * q1
3518 mul t0, t0, s2 // tmp0 = tmp0 * q0
3519 lh t6, 16(a1) // z1 = inptr[ 8]
3520 lh t8, 80(a1) // z3 = inptr[40]
3521 lh t7, 48(a1) // z2 = inptr[24]
3522 lh s2, 16(a0) // q0 = quantptr[ 8]
3523 lh s4, 80(a0) // q2 = quantptr[40]
3524 lh s3, 48(a0) // q1 = quantptr[24]
3525 mul t2, t2, t9 // tmp2 = tmp2 * 5793
3526 mul t1, t1, s0 // tmp1 = tmp1 * 10033
3527 sll t0, t0, 13 // tmp0 = tmp0 << 13
3528 mul t6, t6, s2 // z1 = z1 * q0
3529 mul t8, t8, s4 // z3 = z3 * q2
3530 mul t7, t7, s3 // z2 = z2 * q1
3531 addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3532 sll t2, t2, 1 // tmp2 = tmp2 << 1 (i.e. 2 * tmp2)
3533 subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3534 subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3535 addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3536 addu t1, t6, t8 // tmp1 = z1 + z3
3537 mul t1, t1, s1 // tmp1 = tmp1 * 2998
3538 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3539 subu t2, t6, t8 // tmp2 = z1 - z3
3540 subu t2, t2, t7 // tmp2 = tmp2 - z2
3541 sll t2, t2, 2 // tmp2 = tmp2 << 2
3542 addu t0, t6, t7 // tmp0 = z1 + z2
3543 sll t0, t0, 13 // tmp0 = tmp0 << 13
3544 subu s2, t8, t7 // q0 = z3 - z2
3545 sll s2, s2, 13 // q0 = q0 << 13
3546 addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3547 addu t1, s2, t1 // tmp1 = q0 + tmp1
3548 addu s2, t4, t2 // q0 = tmp11 + tmp2
3549 subu s3, t4, t2 // q1 = tmp11 - tmp2
3550 addu t6, t3, t0 // z1 = tmp10 + tmp0
3551 subu t7, t3, t0 // z2 = tmp10 - tmp0
3552 addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3553 subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3554 shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3555 shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3556 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3557 shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3558 sw s2, 24(v0) // work row 1 (row stride is 24 bytes)
3559 sw s3, 96(v0) // work row 4
3560 sw t6, 0(v0) // work row 0
3561 sw t7, 120(v0) // work row 5
3562 sw t4, 48(v0) // work row 2
3563 sw t5, 72(v0) // work row 3
3564 addiu v0, v0, 4 // next work array column
3565 addiu a1, a1, 2 // next coef_block column
3566 bne v0, v1, 1b // loop until all 6 columns are done
3567 addiu a0, a0, 2 // next dct_table column (written after the branch)
3568
3569 /* Pass 2: process 6 rows from work array, store into output array. */
3570 move v0, sp // v0 = current work array row
3571 addiu v1, v0, 144 // v1 = end of the work array (6 rows * 24 bytes)
3572
3573 2:
3574 lw t0, 0(v0) // wsptr[0]
3575 lw t2, 16(v0) // wsptr[4]
3576 lw s5, 0(a2) // output_buf[ctr]
3577 addiu t0, t0, 16 // rounding bias: 16 << 13 is half of the net >> 18 below
3578 sll t0, t0, 13
3579 mul t3, t2, t9 // wsptr[4] * 5793
3580 lw t6, 4(v0) // z1 = wsptr[1]
3581 lw t8, 20(v0) // z3 = wsptr[5]
3582 lw t7, 12(v0) // z2 = wsptr[3]
3583 addu s5, s5, a3 // outptr = output_buf[ctr] + output_col
3584 addu s6, t6, t8 // z1 + z3
3585 mul s6, s6, s1 // (z1 + z3) * 2998
3586 addu t1, t0, t3
3587 subu t4, t0, t3
3588 subu t4, t4, t3 // t4 = t0 - 2 * t3
3589 lw t3, 8(v0) // wsptr[2]
3590 mul t0, t3, s0 // wsptr[2] * 10033
3591 addu s7, t6, t7 // z1 + z2
3592 sll s7, s7, 13
3593 addu s7, s6, s7 // odd term paired with outputs 0 and 5
3594 subu t2, t8, t7 // z3 - z2
3595 sll t2, t2, 13
3596 addu t2, s6, t2 // odd term paired with outputs 2 and 3
3597 subu s6, t6, t7
3598 subu s6, s6, t8 // z1 - z2 - z3
3599 sll s6, s6, 13 // odd term paired with outputs 1 and 4
3600 addu t3, t1, t0
3601 subu t5, t1, t0
3602 addu t6, t3, s7 // output 0 (unscaled)
3603 subu t3, t3, s7 // output 5 (unscaled)
3604 addu t7, t4, s6 // output 1 (unscaled)
3605 subu t4, t4, s6 // output 4 (unscaled)
3606 addu t8, t5, t2 // output 2 (unscaled)
3607 subu t5, t5, t2 // output 3 (unscaled)
3608 shll_s.w t6, t6, 6 // saturating << 6; with the >> 24 below: clamped >> 18
3609 shll_s.w t3, t3, 6
3610 shll_s.w t7, t7, 6
3611 shll_s.w t4, t4, 6
3612 shll_s.w t8, t8, 6
3613 shll_s.w t5, t5, 6
3614 sra t6, t6, 24 // now in [-128, 127]
3615 addiu t6, t6, 128 // recentre to [0, 255]
3616 sra t3, t3, 24
3617 addiu t3, t3, 128
3618 sb t6, 0(s5)
3619 sra t7, t7, 24
3620 addiu t7, t7, 128
3621 sb t3, 5(s5)
3622 sra t4, t4, 24
3623 addiu t4, t4, 128
3624 sb t7, 1(s5)
3625 sra t8, t8, 24
3626 addiu t8, t8, 128
3627 sb t4, 4(s5)
3628 addiu v0, v0, 24 // next work array row
3629 sra t5, t5, 24
3630 addiu t5, t5, 128
3631 sb t8, 2(s5)
3632 addiu a2, a2, 4 // next output_buf row pointer
3633 bne v0, v1, 2b // loop until all 6 rows are done
3634 sb t5, 3(s5) // last store of the row (written after the branch)
3635
3636 addiu sp, sp, 144 // release the work array
3637
3638 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3639
3640 j ra
3641 nop
3642
3643 END(jsimd_idct_6x6_mips_dspr2)
3644
3645 /*****************************************************************************/
3646 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
3647 /*
3648 * a0 - compptr->dct_table
3649 * a1 - coef_block
3650 * a2 - workspace
 *
 * Pass 1 of the 12x12 inverse DCT. Loops over 8 columns (a3 counts down
 * from 8). For each column it reads the 16-bit coefficients at rows 0..7
 * of coef_block, multiplies each by the matching 16-bit dct_table entry,
 * and stores 12 ints into workspace: row r of the current column goes to
 * byte offset r * 32, and a2 advances by 4 bytes per column.
 * Results are scaled down with "sra ..., 11"; the 1024 added to z3 in the
 * even part is the rounding term for that shift.
 * Callee-saved s0-s3 are saved/restored by the *_REGS_* macros; v0, a3 and
 * t0-t9 are used as scratch.
3651 */
3652
3653 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3654
3655 li a3, 8 // column counter
3656
3657 1:
3658 // odd part
3659 lh t0, 48(a1) // inptr[DCTSIZE*3]
3660 lh t1, 48(a0) // quantptr[DCTSIZE*3]
3661 lh t2, 16(a1) // inptr[DCTSIZE*1]
3662 lh t3, 16(a0) // quantptr[DCTSIZE*1]
3663 lh t4, 80(a1) // inptr[DCTSIZE*5]
3664 lh t5, 80(a0) // quantptr[DCTSIZE*5]
3665 lh t6, 112(a1) // inptr[DCTSIZE*7]
3666 lh t7, 112(a0) // quantptr[DCTSIZE*7]
3667 mul t0, t0, t1 // z2
3668 mul t1, t2, t3 // z1
3669 mul t2, t4, t5 // z3
3670 mul t3, t6, t7 // z4
3671 li t4, 10703 // FIX(1.306562965)
3672 li t5, 4433 // FIX_0_541196100
3673 li t6, 7053 // FIX(0.860918669)
3674 mul t4, t0,t4 // tmp11
3675 mul t5, t0,t5 // -tmp14
3676 addu t7, t1,t2 // tmp10
3677 addu t8, t7,t3 // tmp10 + z4
3678 mul t6, t6, t8 // tmp15
3679 li t8, 2139 // FIX(0.261052384)
3680 mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3681 li t7, 2295 // FIX(0.280143716)
3682 mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3683 addu t9, t2, t3 // z3 + z4
3684 li s0, 8565 // FIX(1.045510580)
3685 mul t9, t9, s0 // -tmp13
3686 li s0, 12112 // FIX(1.478575242)
3687 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3688 li s1, 12998 // FIX(1.586706681)
3689 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3690 li s2, 5540 // FIX(0.676326758)
3691 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3692 li s3, 16244 // FIX(1.982889723)
3693 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3694 subu t1, t1, t3 // z1-=z4
3695 subu t0, t0, t2 // z2-=z3
3696 addu t2, t0, t1 // z1+z2
3697 li t3, 4433 // FIX_0_541196100
3698 mul t2, t2, t3 // z3
3699 li t3, 6270 // FIX_0_765366865
3700 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3701 li t3, 15137 // FIX_1_847759065
3702 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3703 addu t8, t6, t8 // tmp12
3704 addu t3, t8, t4 // tmp12 + tmp11
3705 addu t3, t3, t7 // tmp10
3706 subu t8, t8, t9 // tmp12 + tmp13
3707 addu s0, t5, s0
3708 subu t8, t8, s0 // tmp12
3709 subu t9, t6, t9
3710 subu s1, s1, t4
3711 addu t9, t9, s1 // tmp13
3712 subu t6, t6, t5
3713 subu t6, t6, s2
3714 subu t6, t6, s3 // tmp15
3715 // even part start
3716 lh t4, 64(a1)
3717 lh t5, 64(a0)
3718 lh t7, 32(a1)
3719 lh s0, 32(a0)
3720 lh s1, 0(a1)
3721 lh s2, 0(a0)
3722 lh s3, 96(a1)
3723 lh v0, 96(a0)
3724 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
3725 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
3726 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
3727 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
3728 // odd part end
3729 addu t1, t2, t1 // tmp11
3730 subu t0, t2, t0 // tmp14
3731 // update counter and pointers
3732 addiu a3, a3, -1
3733 addiu a0, a0, 2
3734 addiu a1, a1, 2
3735 // even part rest
3736 li s1, 10033 // FIX(1.224744871)
3737 li s2, 11190 // FIX(1.366025404)
3738 mul t4, t4, s1 // z4
3739 mul s1, t5, s2 // z4
3740 sll t5, t5, 13 // z1
3741 sll t7, t7, 13
3742 addiu t7, t7, 1024 // z3
3743 sll s0, s0, 13 // z2
3744 addu s2, t7, t4 // tmp10
3745 subu t4, t7, t4 // tmp11
3746 subu s3, t5, s0 // tmp12
3747 addu t2, t7, s3 // tmp21
3748 subu s3, t7, s3 // tmp24
3749 addu t7, s1, s0 // tmp12
3750 addu v0, s2, t7 // tmp20
3751 subu s2, s2, t7 // tmp25
3752 subu s1, s1, t5 // z4 - z1
3753 subu s1, s1, s0 // tmp12
3754 addu s0, t4, s1 // tmp22
3755 subu t4, t4, s1 // tmp23
3756 // final output stage
3757 addu t5, v0, t3 // tmp20 + tmp10
3758 subu v0, v0, t3 // tmp20 - tmp10
3759 addu t3, t2, t1 // tmp21 + tmp11
3760 subu t2, t2, t1 // tmp21 - tmp11
3761 addu t1, s0, t8 // tmp22 + tmp12
3762 subu s0, s0, t8 // tmp22 - tmp12
3763 addu t8, t4, t9 // tmp23 + tmp13
3764 subu t4, t4, t9 // tmp23 - tmp13
3765 addu t9, s3, t0 // tmp24 + tmp14
3766 subu s3, s3, t0 // tmp24 - tmp14
3767 addu t0, s2, t6 // tmp25 + tmp15
3768 subu s2, s2, t6 // tmp25 - tmp15
3769 sra t5, t5, 11 // descale by 11 bits (rounding term was added to z3)
3770 sra t3, t3, 11
3771 sra t1, t1, 11
3772 sra t8, t8, 11
3773 sra t9, t9, 11
3774 sra t0, t0, 11
3775 sra s2, s2, 11
3776 sra s3, s3, 11
3777 sra t4, t4, 11
3778 sra s0, s0, 11
3779 sra t2, t2, 11
3780 sra v0, v0, 11
3781 sw t5, 0(a2) // workspace row 0
3782 sw t3, 32(a2) // workspace row 1
3783 sw t1, 64(a2) // workspace row 2
3784 sw t8, 96(a2) // workspace row 3
3785 sw t9, 128(a2) // workspace row 4
3786 sw t0, 160(a2) // workspace row 5
3787 sw s2, 192(a2) // workspace row 6
3788 sw s3, 224(a2) // workspace row 7
3789 sw t4, 256(a2) // workspace row 8
3790 sw s0, 288(a2) // workspace row 9
3791 sw t2, 320(a2) // workspace row 10
3792 sw v0, 352(a2) // workspace row 11
3793 bgtz a3, 1b // loop while columns remain
3794 addiu a2, a2, 4 // next workspace column (written after the branch)
3795
3796 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3797
3798 j ra
3799 nop
3800
3801 END(jsimd_idct_12x12_pass1_mips_dspr2)
3802
3803 /*****************************************************************************/
3804 LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
3805 /*
3806 * a0 - workspace
3807 * a1 - output
 *
 * Pass 2 of the 12x12 inverse DCT. Loops over 12 workspace rows (a3 counts
 * down from 12). Each row is 8 ints (a0 advances by 32 bytes per row) and
 * produces 12 output bytes.
 * a1 is read as a table of 4-byte destination pointers, one per row
 * (lw t6, 0(a1), then "addiu a1, a1, 4"); the 12 bytes are stored at
 * offsets 0..11 from that pointer.
 * Callee-saved s0-s3 are saved/restored by the *_REGS_* macros; v0, a3 and
 * t0-t9 are used as scratch.
3808 */
3809
3810 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3811
3812 li a3, 12 // row counter
3813
3814 1:
3815 // Odd part
3816 lw t0, 12(a0) // z2 = wsptr[3]
3817 lw t1, 4(a0) // z1 = wsptr[1]
3818 lw t2, 20(a0) // z3 = wsptr[5]
3819 lw t3, 28(a0) // z4 = wsptr[7]
3820 li t4, 10703 // FIX(1.306562965)
3821 li t5, 4433 // FIX_0_541196100
3822 mul t4, t0, t4 // tmp11
3823 mul t5, t0, t5 // -tmp14
3824 addu t6, t1, t2 // tmp10
3825 li t7, 2139 // FIX(0.261052384)
3826 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
3827 addu t6, t6, t3 // tmp10 + z4
3828 li t8, 7053 // FIX(0.860918669)
3829 mul t6, t6, t8 // tmp15
3830 li t8, 2295 // FIX(0.280143716)
3831 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
3832 addu t9, t2, t3 // z3 + z4
3833 li s0, 8565 // FIX(1.045510580)
3834 mul t9, t9, s0 // -tmp13
3835 li s0, 12112 // FIX(1.478575242)
3836 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3837 li s1, 12998 // FIX(1.586706681)
3838 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3839 li s2, 5540 // FIX(0.676326758)
3840 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3841 li s3, 16244 // FIX(1.982889723)
3842 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3843 subu t1, t1, t3 // z1 -= z4
3844 subu t0, t0, t2 // z2 -= z3
3845 addu t2, t1, t0 // z1 + z2
3846 li t3, 4433 // FIX_0_541196100
3847 mul t2, t2, t3 // z3
3848 li t3, 6270 // FIX_0_765366865
3849 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3850 li t3, 15137 // FIX_1_847759065
3851 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3852 addu t3, t6, t7 // tmp12
3853 addu t7, t3, t4
3854 addu t7, t7, t8 // tmp10
3855 subu t3, t3, t9
3856 subu t3, t3, t5
3857 subu t3, t3, s0 // tmp12
3858 subu t9, t6, t9
3859 subu t9, t9, t4
3860 addu t9, t9, s1 // tmp13
3861 subu t6, t6, t5
3862 subu t6, t6, s2
3863 subu t6, t6, s3 // tmp15
3864 addu t1, t2, t1 // tmp11
3865 subu t0, t2, t0 // tmp14
3866 // even part
3867 lw t2, 16(a0) // z4
3868 lw t4, 8(a0) // z1
3869 lw t5, 0(a0) // z3
3870 lw t8, 24(a0) // z2
3871 li s0, 10033 // FIX(1.224744871)
3872 li s1, 11190 // FIX(1.366025404)
3873 mul t2, t2, s0 // z4
3874 mul s0, t4, s1 // z4
3875 addiu t5, t5, 0x10 // rounding bias: 16 << 13 is half of the net >> 18 below
3876 sll t5, t5, 13 // z3
3877 sll t4, t4, 13 // z1
3878 sll t8, t8, 13 // z2
3879 subu s1, t4, t8 // tmp12
3880 addu s2, t5, t2 // tmp10
3881 subu t2, t5, t2 // tmp11
3882 addu s3, t5, s1 // tmp21
3883 subu s1, t5, s1 // tmp24
3884 addu t5, s0, t8 // tmp12
3885 addu v0, s2, t5 // tmp20
3886 subu t5, s2, t5 // tmp25
3887 subu t4, s0, t4
3888 subu t4, t4, t8 // tmp12
3889 addu t8, t2, t4 // tmp22
3890 subu t2, t2, t4 // tmp23
3891 // increment counter and pointers
3892 addiu a3, a3, -1
3893 addiu a0, a0, 32
3894 // Final stage
3895 addu t4, v0, t7 // tmp20 + tmp10 -> output 0
3896 subu v0, v0, t7 // tmp20 - tmp10 -> output 11
3897 addu t7, s3, t1 // tmp21 + tmp11 -> output 1
3898 subu s3, s3, t1 // tmp21 - tmp11 -> output 10
3899 addu t1, t8, t3 // tmp22 + tmp12 -> output 2
3900 subu t8, t8, t3 // tmp22 - tmp12 -> output 9
3901 addu t3, t2, t9 // tmp23 + tmp13 -> output 3
3902 subu t2, t2, t9 // tmp23 - tmp13 -> output 8
3903 addu t9, s1, t0 // tmp24 + tmp14 -> output 4
3904 subu s1, s1, t0 // tmp24 - tmp14 -> output 7
3905 addu t0, t5, t6 // tmp25 + tmp15 -> output 5
3906 subu t5, t5, t6 // tmp25 - tmp15 -> output 6
3907 sll t4, t4, 4 // plain << 4 (no saturation on these 4 bits)
3908 sll t7, t7, 4
3909 sll t1, t1, 4
3910 sll t3, t3, 4
3911 sll t9, t9, 4
3912 sll t0, t0, 4
3913 sll t5, t5, 4
3914 sll s1, s1, 4
3915 sll t2, t2, 4
3916 sll t8, t8, 4
3917 sll s3, s3, 4
3918 sll v0, v0, 4
3919 shll_s.w t4, t4, 2 // saturating << 2 (6 bits of left shift in total)
3920 shll_s.w t7, t7, 2
3921 shll_s.w t1, t1, 2
3922 shll_s.w t3, t3, 2
3923 shll_s.w t9, t9, 2
3924 shll_s.w t0, t0, 2
3925 shll_s.w t5, t5, 2
3926 shll_s.w s1, s1, 2
3927 shll_s.w t2, t2, 2
3928 shll_s.w t8, t8, 2
3929 shll_s.w s3, s3, 2
3930 shll_s.w v0, v0, 2
3931 srl t4, t4, 24 // keep the top byte; only the low 8 bits are stored below
3932 srl t7, t7, 24
3933 srl t1, t1, 24
3934 srl t3, t3, 24
3935 srl t9, t9, 24
3936 srl t0, t0, 24
3937 srl t5, t5, 24
3938 srl s1, s1, 24
3939 srl t2, t2, 24
3940 srl t8, t8, 24
3941 srl s3, s3, 24
3942 srl v0, v0, 24
3943 lw t6, 0(a1) // destination pointer for this row
3944 addiu t4, t4, 0x80 // + 128
3945 addiu t7, t7, 0x80
3946 addiu t1, t1, 0x80
3947 addiu t3, t3, 0x80
3948 addiu t9, t9, 0x80
3949 addiu t0, t0, 0x80
3950 addiu t5, t5, 0x80
3951 addiu s1, s1, 0x80
3952 addiu t2, t2, 0x80
3953 addiu t8, t8, 0x80
3954 addiu s3, s3, 0x80
3955 addiu v0, v0, 0x80
3956 sb t4, 0(t6)
3957 sb t7, 1(t6)
3958 sb t1, 2(t6)
3959 sb t3, 3(t6)
3960 sb t9, 4(t6)
3961 sb t0, 5(t6)
3962 sb t5, 6(t6)
3963 sb s1, 7(t6)
3964 sb t2, 8(t6)
3965 sb t8, 9(t6)
3966 sb s3, 10(t6)
3967 sb v0, 11(t6)
3968 bgtz a3, 1b // loop while rows remain
3969 addiu a1, a1, 4 // next destination pointer (written after the branch)
3970
3971 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3972
3973 jr ra
3974 nop
3975
3976 END(jsimd_idct_12x12_pass2_mips_dspr2)
3977
3978 /*****************************************************************************/
3979 LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
3980 /*
3981 * a0 - sample_data
3982 * a1 - start_col
3983 * a2 - workspace
 *
 * Converts an 8x8 block of unsigned byte samples into 16-bit values
 * centred on zero. For each of the 8 row pointers sample_data[0..7]
 * (4 bytes apart in a0) it loads 8 bytes from row + start_col, widens
 * each byte to an unsigned halfword, adds 0xff80 to every halfword
 * (i.e. subtracts 128 modulo 2^16) and stores the 8 halfwords to
 * workspace, 16 bytes per row (128 bytes in total).
 * The code is fully unrolled and interleaved: the loads for row n+1 are
 * issued between the stores for row n.
 * Only t0-t7 are used; no stack frame is needed.
3984 */
3985
3986 lw t0, 0(a0) // sample_data[0]
3987 li t7, 0xff80ff80 // -128 in each 16-bit lane
3988 addu t0, t0, a1 // + start_col
3989 ulw t1, 0(t0) // row bytes 0..3 (unaligned load)
3990 ulw t2, 4(t0) // row bytes 4..7
3991 preceu.ph.qbr t3, t1 // zero-extend the two right-hand bytes to halfwords
3992 preceu.ph.qbl t4, t1 // zero-extend the two left-hand bytes to halfwords
3993 lw t0, 4(a0) // sample_data[1]
3994 preceu.ph.qbr t5, t2
3995 preceu.ph.qbl t6, t2
3996 addu t0, t0, a1
3997 addu.ph t3, t3, t7 // subtract 128 from each halfword
3998 addu.ph t4, t4, t7
3999 ulw t1, 0(t0) // row 1 bytes 0..3
4000 ulw t2, 4(t0) // row 1 bytes 4..7
4001 addu.ph t5, t5, t7
4002 addu.ph t6, t6, t7
4003 usw t3, 0(a2) // row 0 -> workspace bytes 0..15
4004 usw t4, 4(a2)
4005 preceu.ph.qbr t3, t1
4006 preceu.ph.qbl t4, t1
4007 usw t5, 8(a2)
4008 usw t6, 12(a2)
4009
4010 lw t0, 8(a0) // sample_data[2]
4011 preceu.ph.qbr t5, t2
4012 preceu.ph.qbl t6, t2
4013 addu t0, t0, a1
4014 addu.ph t3, t3, t7
4015 addu.ph t4, t4, t7
4016 ulw t1, 0(t0)
4017 ulw t2, 4(t0)
4018 addu.ph t5, t5, t7
4019 addu.ph t6, t6, t7
4020 usw t3, 16(a2) // row 1 -> workspace bytes 16..31
4021 usw t4, 20(a2)
4022 preceu.ph.qbr t3, t1
4023 preceu.ph.qbl t4, t1
4024 usw t5, 24(a2)
4025 usw t6, 28(a2)
4026
4027 lw t0, 12(a0) // sample_data[3]
4028 preceu.ph.qbr t5, t2
4029 preceu.ph.qbl t6, t2
4030 addu t0, t0, a1
4031 addu.ph t3, t3, t7
4032 addu.ph t4, t4, t7
4033 ulw t1, 0(t0)
4034 ulw t2, 4(t0)
4035 addu.ph t5, t5, t7
4036 addu.ph t6, t6, t7
4037 usw t3, 32(a2) // row 2 -> workspace bytes 32..47
4038 usw t4, 36(a2)
4039 preceu.ph.qbr t3, t1
4040 preceu.ph.qbl t4, t1
4041 usw t5, 40(a2)
4042 usw t6, 44(a2)
4043
4044 lw t0, 16(a0) // sample_data[4]
4045 preceu.ph.qbr t5, t2
4046 preceu.ph.qbl t6, t2
4047 addu t0, t0, a1
4048 addu.ph t3, t3, t7
4049 addu.ph t4, t4, t7
4050 ulw t1, 0(t0)
4051 ulw t2, 4(t0)
4052 addu.ph t5, t5, t7
4053 addu.ph t6, t6, t7
4054 usw t3, 48(a2) // row 3 -> workspace bytes 48..63
4055 usw t4, 52(a2)
4056 preceu.ph.qbr t3, t1
4057 preceu.ph.qbl t4, t1
4058 usw t5, 56(a2)
4059 usw t6, 60(a2)
4060
4061 lw t0, 20(a0) // sample_data[5]
4062 preceu.ph.qbr t5, t2
4063 preceu.ph.qbl t6, t2
4064 addu t0, t0, a1
4065 addu.ph t3, t3, t7
4066 addu.ph t4, t4, t7
4067 ulw t1, 0(t0)
4068 ulw t2, 4(t0)
4069 addu.ph t5, t5, t7
4070 addu.ph t6, t6, t7
4071 usw t3, 64(a2) // row 4 -> workspace bytes 64..79
4072 usw t4, 68(a2)
4073 preceu.ph.qbr t3, t1
4074 preceu.ph.qbl t4, t1
4075 usw t5, 72(a2)
4076 usw t6, 76(a2)
4077
4078 lw t0, 24(a0) // sample_data[6]
4079 preceu.ph.qbr t5, t2
4080 preceu.ph.qbl t6, t2
4081 addu t0, t0, a1
4082 addu.ph t3, t3, t7
4083 addu.ph t4, t4, t7
4084 ulw t1, 0(t0)
4085 ulw t2, 4(t0)
4086 addu.ph t5, t5, t7
4087 addu.ph t6, t6, t7
4088 usw t3, 80(a2) // row 5 -> workspace bytes 80..95
4089 usw t4, 84(a2)
4090 preceu.ph.qbr t3, t1
4091 preceu.ph.qbl t4, t1
4092 usw t5, 88(a2)
4093 usw t6, 92(a2)
4094
4095 lw t0, 28(a0) // sample_data[7]
4096 preceu.ph.qbr t5, t2
4097 preceu.ph.qbl t6, t2
4098 addu t0, t0, a1
4099 addu.ph t3, t3, t7
4100 addu.ph t4, t4, t7
4101 ulw t1, 0(t0)
4102 ulw t2, 4(t0)
4103 addu.ph t5, t5, t7
4104 addu.ph t6, t6, t7
4105 usw t3, 96(a2) // row 6 -> workspace bytes 96..111
4106 usw t4, 100(a2)
4107 preceu.ph.qbr t3, t1
4108 preceu.ph.qbl t4, t1
4109 usw t5, 104(a2)
4110 usw t6, 108(a2)
4111 preceu.ph.qbr t5, t2
4112 preceu.ph.qbl t6, t2
4113 addu.ph t3, t3, t7
4114 addu.ph t4, t4, t7
4115 addu.ph t5, t5, t7
4116 addu.ph t6, t6, t7
4117 usw t3, 112(a2) // row 7 -> workspace bytes 112..127
4118 usw t4, 116(a2)
4119 usw t5, 120(a2)
4120 usw t6, 124(a2)
4121
4122 j ra
4123 nop
4124
4125 END(jsimd_convsamp_mips_dspr2)
4126
4127 /*****************************************************************************/
4128 LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
4129 /*
4130 * a0 - sample_data
4131 * a1 - start_col
4132 * a2 - workspace
 *
 * Converts an 8x8 block of unsigned byte samples into single-precision
 * floats centred on zero. For each of the 8 row pointers sample_data[0..7]
 * (4 bytes apart in a0) it loads 8 bytes from row + start_col, subtracts
 * 128 from each, moves the integers to the FPU, converts them with
 * cvt.s.w and stores 8 floats to workspace, 32 bytes per row (256 bytes
 * in total).
 * The code is fully unrolled; the pointer for row n+1 is loaded while the
 * results of row n are being stored.
 * Uses t0-t8 and the even FPU registers f2..f16; no stack frame is needed.
4133 */
4134
4135 .set at
4136
4137 lw t0, 0(a0) // sample_data[0]
4138 addu t0, t0, a1 // + start_col
4139 lbu t1, 0(t0) // 8 unsigned samples -> t1..t8
4140 lbu t2, 1(t0)
4141 lbu t3, 2(t0)
4142 lbu t4, 3(t0)
4143 lbu t5, 4(t0)
4144 lbu t6, 5(t0)
4145 lbu t7, 6(t0)
4146 lbu t8, 7(t0)
4147 addiu t1, t1, -128 // recentre each sample to [-128, 127]
4148 addiu t2, t2, -128
4149 addiu t3, t3, -128
4150 addiu t4, t4, -128
4151 addiu t5, t5, -128
4152 addiu t6, t6, -128
4153 addiu t7, t7, -128
4154 addiu t8, t8, -128
4155 mtc1 t1, f2 // move the integers into FPU registers f2..f16
4156 mtc1 t2, f4
4157 mtc1 t3, f6
4158 mtc1 t4, f8
4159 mtc1 t5, f10
4160 mtc1 t6, f12
4161 mtc1 t7, f14
4162 mtc1 t8, f16
4163 cvt.s.w f2, f2 // 32-bit integer -> single-precision float
4164 cvt.s.w f4, f4
4165 cvt.s.w f6, f6
4166 cvt.s.w f8, f8
4167 cvt.s.w f10, f10
4168 cvt.s.w f12, f12
4169 cvt.s.w f14, f14
4170 cvt.s.w f16, f16
4171 lw t0, 4(a0) // sample_data[1], fetched early for the next row
4172 swc1 f2, 0(a2) // row 0 -> workspace bytes 0..31
4173 swc1 f4, 4(a2)
4174 swc1 f6, 8(a2)
4175 addu t0, t0, a1
4176 swc1 f8, 12(a2)
4177 swc1 f10, 16(a2)
4178 swc1 f12, 20(a2)
4179 swc1 f14, 24(a2)
4180 swc1 f16, 28(a2)
4181 // row 1: same sequence, results go to workspace bytes 32..63
4182 lbu t1, 0(t0)
4183 lbu t2, 1(t0)
4184 lbu t3, 2(t0)
4185 lbu t4, 3(t0)
4186 lbu t5, 4(t0)
4187 lbu t6, 5(t0)
4188 lbu t7, 6(t0)
4189 lbu t8, 7(t0)
4190 addiu t1, t1, -128
4191 addiu t2, t2, -128
4192 addiu t3, t3, -128
4193 addiu t4, t4, -128
4194 addiu t5, t5, -128
4195 addiu t6, t6, -128
4196 addiu t7, t7, -128
4197 addiu t8, t8, -128
4198 mtc1 t1, f2
4199 mtc1 t2, f4
4200 mtc1 t3, f6
4201 mtc1 t4, f8
4202 mtc1 t5, f10
4203 mtc1 t6, f12
4204 mtc1 t7, f14
4205 mtc1 t8, f16
4206 cvt.s.w f2, f2
4207 cvt.s.w f4, f4
4208 cvt.s.w f6, f6
4209 cvt.s.w f8, f8
4210 cvt.s.w f10, f10
4211 cvt.s.w f12, f12
4212 cvt.s.w f14, f14
4213 cvt.s.w f16, f16
4214 lw t0, 8(a0) // sample_data[2]
4215 swc1 f2, 32(a2)
4216 swc1 f4, 36(a2)
4217 swc1 f6, 40(a2)
4218 addu t0, t0, a1
4219 swc1 f8, 44(a2)
4220 swc1 f10, 48(a2)
4221 swc1 f12, 52(a2)
4222 swc1 f14, 56(a2)
4223 swc1 f16, 60(a2)
4224 // row 2: results go to workspace bytes 64..95
4225 lbu t1, 0(t0)
4226 lbu t2, 1(t0)
4227 lbu t3, 2(t0)
4228 lbu t4, 3(t0)
4229 lbu t5, 4(t0)
4230 lbu t6, 5(t0)
4231 lbu t7, 6(t0)
4232 lbu t8, 7(t0)
4233 addiu t1, t1, -128
4234 addiu t2, t2, -128
4235 addiu t3, t3, -128
4236 addiu t4, t4, -128
4237 addiu t5, t5, -128
4238 addiu t6, t6, -128
4239 addiu t7, t7, -128
4240 addiu t8, t8, -128
4241 mtc1 t1, f2
4242 mtc1 t2, f4
4243 mtc1 t3, f6
4244 mtc1 t4, f8
4245 mtc1 t5, f10
4246 mtc1 t6, f12
4247 mtc1 t7, f14
4248 mtc1 t8, f16
4249 cvt.s.w f2, f2
4250 cvt.s.w f4, f4
4251 cvt.s.w f6, f6
4252 cvt.s.w f8, f8
4253 cvt.s.w f10, f10
4254 cvt.s.w f12, f12
4255 cvt.s.w f14, f14
4256 cvt.s.w f16, f16
4257 lw t0, 12(a0) // sample_data[3]
4258 swc1 f2, 64(a2)
4259 swc1 f4, 68(a2)
4260 swc1 f6, 72(a2)
4261 addu t0, t0, a1
4262 swc1 f8, 76(a2)
4263 swc1 f10, 80(a2)
4264 swc1 f12, 84(a2)
4265 swc1 f14, 88(a2)
4266 swc1 f16, 92(a2)
4267 // row 3: results go to workspace bytes 96..127
4268 lbu t1, 0(t0)
4269 lbu t2, 1(t0)
4270 lbu t3, 2(t0)
4271 lbu t4, 3(t0)
4272 lbu t5, 4(t0)
4273 lbu t6, 5(t0)
4274 lbu t7, 6(t0)
4275 lbu t8, 7(t0)
4276 addiu t1, t1, -128
4277 addiu t2, t2, -128
4278 addiu t3, t3, -128
4279 addiu t4, t4, -128
4280 addiu t5, t5, -128
4281 addiu t6, t6, -128
4282 addiu t7, t7, -128
4283 addiu t8, t8, -128
4284 mtc1 t1, f2
4285 mtc1 t2, f4
4286 mtc1 t3, f6
4287 mtc1 t4, f8
4288 mtc1 t5, f10
4289 mtc1 t6, f12
4290 mtc1 t7, f14
4291 mtc1 t8, f16
4292 cvt.s.w f2, f2
4293 cvt.s.w f4, f4
4294 cvt.s.w f6, f6
4295 cvt.s.w f8, f8
4296 cvt.s.w f10, f10
4297 cvt.s.w f12, f12
4298 cvt.s.w f14, f14
4299 cvt.s.w f16, f16
4300 lw t0, 16(a0) // sample_data[4]
4301 swc1 f2, 96(a2)
4302 swc1 f4, 100(a2)
4303 swc1 f6, 104(a2)
4304 addu t0, t0, a1
4305 swc1 f8, 108(a2)
4306 swc1 f10, 112(a2)
4307 swc1 f12, 116(a2)
4308 swc1 f14, 120(a2)
4309 swc1 f16, 124(a2)
4310 // row 4: results go to workspace bytes 128..159
4311 lbu t1, 0(t0)
4312 lbu t2, 1(t0)
4313 lbu t3, 2(t0)
4314 lbu t4, 3(t0)
4315 lbu t5, 4(t0)
4316 lbu t6, 5(t0)
4317 lbu t7, 6(t0)
4318 lbu t8, 7(t0)
4319 addiu t1, t1, -128
4320 addiu t2, t2, -128
4321 addiu t3, t3, -128
4322 addiu t4, t4, -128
4323 addiu t5, t5, -128
4324 addiu t6, t6, -128
4325 addiu t7, t7, -128
4326 addiu t8, t8, -128
4327 mtc1 t1, f2
4328 mtc1 t2, f4
4329 mtc1 t3, f6
4330 mtc1 t4, f8
4331 mtc1 t5, f10
4332 mtc1 t6, f12
4333 mtc1 t7, f14
4334 mtc1 t8, f16
4335 cvt.s.w f2, f2
4336 cvt.s.w f4, f4
4337 cvt.s.w f6, f6
4338 cvt.s.w f8, f8
4339 cvt.s.w f10, f10
4340 cvt.s.w f12, f12
4341 cvt.s.w f14, f14
4342 cvt.s.w f16, f16
4343 lw t0, 20(a0) // sample_data[5]
4344 swc1 f2, 128(a2)
4345 swc1 f4, 132(a2)
4346 swc1 f6, 136(a2)
4347 addu t0, t0, a1
4348 swc1 f8, 140(a2)
4349 swc1 f10, 144(a2)
4350 swc1 f12, 148(a2)
4351 swc1 f14, 152(a2)
4352 swc1 f16, 156(a2)
4353 // row 5: results go to workspace bytes 160..191
4354 lbu t1, 0(t0)
4355 lbu t2, 1(t0)
4356 lbu t3, 2(t0)
4357 lbu t4, 3(t0)
4358 lbu t5, 4(t0)
4359 lbu t6, 5(t0)
4360 lbu t7, 6(t0)
4361 lbu t8, 7(t0)
4362 addiu t1, t1, -128
4363 addiu t2, t2, -128
4364 addiu t3, t3, -128
4365 addiu t4, t4, -128
4366 addiu t5, t5, -128
4367 addiu t6, t6, -128
4368 addiu t7, t7, -128
4369 addiu t8, t8, -128
4370 mtc1 t1, f2
4371 mtc1 t2, f4
4372 mtc1 t3, f6
4373 mtc1 t4, f8
4374 mtc1 t5, f10
4375 mtc1 t6, f12
4376 mtc1 t7, f14
4377 mtc1 t8, f16
4378 cvt.s.w f2, f2
4379 cvt.s.w f4, f4
4380 cvt.s.w f6, f6
4381 cvt.s.w f8, f8
4382 cvt.s.w f10, f10
4383 cvt.s.w f12, f12
4384 cvt.s.w f14, f14
4385 cvt.s.w f16, f16
4386 lw t0, 24(a0) // sample_data[6]
4387 swc1 f2, 160(a2)
4388 swc1 f4, 164(a2)
4389 swc1 f6, 168(a2)
4390 addu t0, t0, a1
4391 swc1 f8, 172(a2)
4392 swc1 f10, 176(a2)
4393 swc1 f12, 180(a2)
4394 swc1 f14, 184(a2)
4395 swc1 f16, 188(a2)
4396 // row 6: results go to workspace bytes 192..223
4397 lbu t1, 0(t0)
4398 lbu t2, 1(t0)
4399 lbu t3, 2(t0)
4400 lbu t4, 3(t0)
4401 lbu t5, 4(t0)
4402 lbu t6, 5(t0)
4403 lbu t7, 6(t0)
4404 lbu t8, 7(t0)
4405 addiu t1, t1, -128
4406 addiu t2, t2, -128
4407 addiu t3, t3, -128
4408 addiu t4, t4, -128
4409 addiu t5, t5, -128
4410 addiu t6, t6, -128
4411 addiu t7, t7, -128
4412 addiu t8, t8, -128
4413 mtc1 t1, f2
4414 mtc1 t2, f4
4415 mtc1 t3, f6
4416 mtc1 t4, f8
4417 mtc1 t5, f10
4418 mtc1 t6, f12
4419 mtc1 t7, f14
4420 mtc1 t8, f16
4421 cvt.s.w f2, f2
4422 cvt.s.w f4, f4
4423 cvt.s.w f6, f6
4424 cvt.s.w f8, f8
4425 cvt.s.w f10, f10
4426 cvt.s.w f12, f12
4427 cvt.s.w f14, f14
4428 cvt.s.w f16, f16
4429 lw t0, 28(a0) // sample_data[7]
4430 swc1 f2, 192(a2)
4431 swc1 f4, 196(a2)
4432 swc1 f6, 200(a2)
4433 addu t0, t0, a1
4434 swc1 f8, 204(a2)
4435 swc1 f10, 208(a2)
4436 swc1 f12, 212(a2)
4437 swc1 f14, 216(a2)
4438 swc1 f16, 220(a2)
4439 // row 7: results go to workspace bytes 224..255
4440 lbu t1, 0(t0)
4441 lbu t2, 1(t0)
4442 lbu t3, 2(t0)
4443 lbu t4, 3(t0)
4444 lbu t5, 4(t0)
4445 lbu t6, 5(t0)
4446 lbu t7, 6(t0)
4447 lbu t8, 7(t0)
4448 addiu t1, t1, -128
4449 addiu t2, t2, -128
4450 addiu t3, t3, -128
4451 addiu t4, t4, -128
4452 addiu t5, t5, -128
4453 addiu t6, t6, -128
4454 addiu t7, t7, -128
4455 addiu t8, t8, -128
4456 mtc1 t1, f2
4457 mtc1 t2, f4
4458 mtc1 t3, f6
4459 mtc1 t4, f8
4460 mtc1 t5, f10
4461 mtc1 t6, f12
4462 mtc1 t7, f14
4463 mtc1 t8, f16
4464 cvt.s.w f2, f2
4465 cvt.s.w f4, f4
4466 cvt.s.w f6, f6
4467 cvt.s.w f8, f8
4468 cvt.s.w f10, f10
4469 cvt.s.w f12, f12
4470 cvt.s.w f14, f14
4471 cvt.s.w f16, f16
4472 swc1 f2, 224(a2)
4473 swc1 f4, 228(a2)
4474 swc1 f6, 232(a2)
4475 swc1 f8, 236(a2)
4476 swc1 f10, 240(a2)
4477 swc1 f12, 244(a2)
4478 swc1 f14, 248(a2)
4479 swc1 f16, 252(a2)
4480
4481 j ra
4482 nop
4483
4484 END(jsimd_convsamp_float_mips_dspr2)
4485
4486 /*****************************************************************************/
4487
OLDNEW
« simd/jccolext-sse2-64.asm ('K') | « simd/jsimd_mips.c ('k') | simd/jsimd_mips_dspr2_asm.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698