Chromium Code Reviews

Side by Side Diff: source/row_mips.cc

Issue 2595333002: Libyuv MIPS DSPR2 optimizations. (Closed)
Patch Set: Created 3 years, 12 months ago
1 /*
2  * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
3  *
4  * Use of this source code is governed by a BSD-style license
5  * that can be found in the LICENSE file in the root of the source
6  * tree. An additional intellectual property rights grant can be found
7  * in the file PATENTS. All contributing project authors may
8  * be found in the AUTHORS file in the root of the source tree.
9  */
10
(...skipping 567 matching lines...)
578 " addiu %[dst_v], %[dst_v], 1 \n" 578 " addiu %[dst_v], %[dst_v], 1 \n"
579 579
580 "3: \n" 580 "3: \n"
581 ".set pop \n" 581 ".set pop \n"
582 : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v), 582 : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
583 [x] "=&r"(x), [y] "=&r"(y) 583 [x] "=&r"(x), [y] "=&r"(y)
584 : [width] "r"(width) 584 : [width] "r"(width)
585 : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9"); 585 : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
586 } 586 }
587 587
OLD (removed by this patch; old lines 588-763). The old helper hard-coded
BT.601 coefficients with repl.ph immediates; the TODO below is resolved by
the NEW version, which reads its coefficients from YuvConstants:

588 // Convert (4 Y and 2 VU) I422 and arrange RGB values into
589 // t5 = | 0 | B0 | 0 | b0 |
590 // t4 = | 0 | B1 | 0 | b1 |
591 // t9 = | 0 | G0 | 0 | g0 |
592 // t8 = | 0 | G1 | 0 | g1 |
593 // t2 = | 0 | R0 | 0 | r0 |
594 // t1 = | 0 | R1 | 0 | r1 |
595 #define YUVTORGB \
596   "lw $t0, 0(%[y_buf]) \n" \
597   "lhu $t1, 0(%[u_buf]) \n" \
598   "lhu $t2, 0(%[v_buf]) \n" \
599   "preceu.ph.qbr $t1, $t1 \n" \
600   "preceu.ph.qbr $t2, $t2 \n" \
601   "preceu.ph.qbra $t3, $t0 \n" \
602   "preceu.ph.qbla $t0, $t0 \n" \
603   "subu.ph $t1, $t1, $s5 \n" \
604   "subu.ph $t2, $t2, $s5 \n" \
605   "subu.ph $t3, $t3, $s4 \n" \
606   "subu.ph $t0, $t0, $s4 \n" \
607   "mul.ph $t3, $t3, $s0 \n" \
608   "mul.ph $t0, $t0, $s0 \n" \
609   "shll.ph $t4, $t1, 0x7 \n" \
610   "subu.ph $t4, $t4, $t1 \n" \
611   "mul.ph $t6, $t1, $s1 \n" \
612   "mul.ph $t1, $t2, $s2 \n" \
613   "addq_s.ph $t5, $t4, $t3 \n" \
614   "addq_s.ph $t4, $t4, $t0 \n" \
615   "shra.ph $t5, $t5, 6 \n" \
616   "shra.ph $t4, $t4, 6 \n" \
617   "addiu %[u_buf], 2 \n" \
618   "addiu %[v_buf], 2 \n" \
619   "addu.ph $t6, $t6, $t1 \n" \
620   "mul.ph $t1, $t2, $s3 \n" \
621   "addu.ph $t9, $t6, $t3 \n" \
622   "addu.ph $t8, $t6, $t0 \n" \
623   "shra.ph $t9, $t9, 6 \n" \
624   "shra.ph $t8, $t8, 6 \n" \
625   "addu.ph $t2, $t1, $t3 \n" \
626   "addu.ph $t1, $t1, $t0 \n" \
627   "shra.ph $t2, $t2, 6 \n" \
628   "shra.ph $t1, $t1, 6 \n" \
629   "subu.ph $t5, $t5, $s5 \n" \
630   "subu.ph $t4, $t4, $s5 \n" \
631   "subu.ph $t9, $t9, $s5 \n" \
632   "subu.ph $t8, $t8, $s5 \n" \
633   "subu.ph $t2, $t2, $s5 \n" \
634   "subu.ph $t1, $t1, $s5 \n" \
635   "shll_s.ph $t5, $t5, 8 \n" \
636   "shll_s.ph $t4, $t4, 8 \n" \
637   "shll_s.ph $t9, $t9, 8 \n" \
638   "shll_s.ph $t8, $t8, 8 \n" \
639   "shll_s.ph $t2, $t2, 8 \n" \
640   "shll_s.ph $t1, $t1, 8 \n" \
641   "shra.ph $t5, $t5, 8 \n" \
642   "shra.ph $t4, $t4, 8 \n" \
643   "shra.ph $t9, $t9, 8 \n" \
644   "shra.ph $t8, $t8, 8 \n" \
645   "shra.ph $t2, $t2, 8 \n" \
646   "shra.ph $t1, $t1, 8 \n" \
647   "addu.ph $t5, $t5, $s5 \n" \
648   "addu.ph $t4, $t4, $s5 \n" \
649   "addu.ph $t9, $t9, $s5 \n" \
650   "addu.ph $t8, $t8, $s5 \n" \
651   "addu.ph $t2, $t2, $s5 \n" \
652   "addu.ph $t1, $t1, $s5 \n"
653
654 // TODO(fbarchard): accept yuv conversion constants.
655 void I422ToARGBRow_DSPR2(const uint8* y_buf,
656                          const uint8* u_buf,
657                          const uint8* v_buf,
658                          uint8* rgb_buf,
659                          const struct YuvConstants* yuvconstants,
660                          int width) {
661   __asm__ __volatile__(
662     ".set push \n"
663     ".set noreorder \n"
664     "beqz %[width], 2f \n"
665     " repl.ph $s0, 74 \n"   // |YG|YG| = |74|74|
666     "repl.ph $s1, -25 \n"   // |UG|UG| = |-25|-25|
667     "repl.ph $s2, -52 \n"   // |VG|VG| = |-52|-52|
668     "repl.ph $s3, 102 \n"   // |VR|VR| = |102|102|
669     "repl.ph $s4, 16 \n"    // |0|16|0|16|
670     "repl.ph $s5, 128 \n"   // |128|128| // clipping
671     "lui $s6, 0xff00 \n"
672     "ori $s6, 0xff00 \n"    // |ff|00|ff|00|
673
674     "1: \n" YUVTORGB
675     // Arranging into argb format
676     "precr.qb.ph $t4, $t8, $t4 \n"   // |G1|g1|B1|b1|
677     "precr.qb.ph $t5, $t9, $t5 \n"   // |G0|g0|B0|b0|
678     "addiu %[width], -4 \n"
679     "precrq.qb.ph $t8, $t4, $t5 \n"  // |G1|B1|G0|B0|
680     "precr.qb.ph $t9, $t4, $t5 \n"   // |g1|b1|g0|b0|
681     "precr.qb.ph $t2, $t1, $t2 \n"   // |R1|r1|R0|r0|
682
683     "addiu %[y_buf], 4 \n"
684     "preceu.ph.qbla $t1, $t2 \n"     // |0 |R1|0 |R0|
685     "preceu.ph.qbra $t2, $t2 \n"     // |0 |r1|0 |r0|
686     "or $t1, $t1, $s6 \n"            // |ff|R1|ff|R0|
687     "or $t2, $t2, $s6 \n"            // |ff|r1|ff|r0|
688     "precrq.ph.w $t0, $t2, $t9 \n"   // |ff|r1|g1|b1|
689     "precrq.ph.w $t3, $t1, $t8 \n"   // |ff|R1|G1|B1|
690     "sll $t9, $t9, 16 \n"
691     "sll $t8, $t8, 16 \n"
692     "packrl.ph $t2, $t2, $t9 \n"     // |ff|r0|g0|b0|
693     "packrl.ph $t1, $t1, $t8 \n"     // |ff|R0|G0|B0|
694     // Store results.
695     "sw $t2, 0(%[rgb_buf]) \n"
696     "sw $t0, 4(%[rgb_buf]) \n"
697     "sw $t1, 8(%[rgb_buf]) \n"
698     "sw $t3, 12(%[rgb_buf]) \n"
699     "bnez %[width], 1b \n"
700     " addiu %[rgb_buf], 16 \n"
701     "2: \n"
702     ".set pop \n"
703     : [y_buf] "+r"(y_buf), [u_buf] "+r"(u_buf), [v_buf] "+r"(v_buf),
704       [width] "+r"(width), [rgb_buf] "+r"(rgb_buf)
705     :
706     : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
707       "s2", "s3", "s4", "s5", "s6");
708 }
709
710 // Bilinear filter 8x2 -> 8x1
711 void InterpolateRow_DSPR2(uint8* dst_ptr,
712                           const uint8* src_ptr,
713                           ptrdiff_t src_stride,
714                           int dst_width,
715                           int source_y_fraction) {
716   int y0_fraction = 256 - source_y_fraction;
717   const uint8* src_ptr1 = src_ptr + src_stride;
718
719   __asm__ __volatile__(
720     ".set push \n"
721     ".set noreorder \n"
722
723     "replv.ph $t0, %[y0_fraction] \n"
724     "replv.ph $t1, %[source_y_fraction] \n"
725
726     "1: \n"
727     "lw $t2, 0(%[src_ptr]) \n"
728     "lw $t3, 0(%[src_ptr1]) \n"
729     "lw $t4, 4(%[src_ptr]) \n"
730     "lw $t5, 4(%[src_ptr1]) \n"
731     "muleu_s.ph.qbl $t6, $t2, $t0 \n"
732     "muleu_s.ph.qbr $t7, $t2, $t0 \n"
733     "muleu_s.ph.qbl $t8, $t3, $t1 \n"
734     "muleu_s.ph.qbr $t9, $t3, $t1 \n"
735     "muleu_s.ph.qbl $t2, $t4, $t0 \n"
736     "muleu_s.ph.qbr $t3, $t4, $t0 \n"
737     "muleu_s.ph.qbl $t4, $t5, $t1 \n"
738     "muleu_s.ph.qbr $t5, $t5, $t1 \n"
739     "addq.ph $t6, $t6, $t8 \n"
740     "addq.ph $t7, $t7, $t9 \n"
741     "addq.ph $t2, $t2, $t4 \n"
742     "addq.ph $t3, $t3, $t5 \n"
743     "shra.ph $t6, $t6, 8 \n"
744     "shra.ph $t7, $t7, 8 \n"
745     "shra.ph $t2, $t2, 8 \n"
746     "shra.ph $t3, $t3, 8 \n"
747     "precr.qb.ph $t6, $t6, $t7 \n"
748     "precr.qb.ph $t2, $t2, $t3 \n"
749     "addiu %[src_ptr], %[src_ptr], 8 \n"
750     "addiu %[src_ptr1], %[src_ptr1], 8 \n"
751     "addiu %[dst_width], %[dst_width], -8 \n"
752     "sw $t6, 0(%[dst_ptr]) \n"
753     "sw $t2, 4(%[dst_ptr]) \n"
754     "bgtz %[dst_width], 1b \n"
755     " addiu %[dst_ptr], %[dst_ptr], 8 \n"
756
757     ".set pop \n"
758     : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
759       [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
760     : [source_y_fraction] "r"(source_y_fraction),
761       [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
762     : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
763 }

NEW (lines 588-675). The rewritten I422ToARGBRow_DSPR2 resolves the old
TODO: it reads its coefficients from YuvConstants, duplicates each 16-bit
constant into both halfwords of a word, and negates the UB and VR terms
per halfword (~x + 0x00010001) so the whole color computation can use
packed saturating adds:

588 void I422ToARGBRow_DSPR2(const uint8* src_y,
589                          const uint8* src_u,
590                          const uint8* src_v,
591                          uint8* rgb_buf,
592                          const struct YuvConstants* yuvconstants,
593                          int width) {
594   int x;
595   uint32 tmp_ub = yuvconstants->kUVToB[0];
596   uint32 tmp_ug = yuvconstants->kUVToG[0];
597   uint32 tmp_vg = yuvconstants->kUVToG[1];
598   uint32 tmp_vr = yuvconstants->kUVToR[1];
599   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
600   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
601   uint32 tmp_br = yuvconstants->kUVBiasR[0];
602   uint32 yg = yuvconstants->kYToRgb[0];
603   uint32 tmp_yg;
604   uint32 tmp_mask = 0x7fff7fff;
605   tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
606   tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
607   tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
608   tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
609   tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
610   tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
611   tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
612   tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
613   yg = yg * 0x0101;
614
615   for (x = 0; x < width - 1; x += 2) {
616     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
617     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
618     __asm__ __volatile__ (
619       ".set push \n"
620       ".set noreorder \n"
621       "lbu %[tmp_t7], 0(%[src_y]) \n"
622       "lbu %[tmp_t1], 1(%[src_y]) \n"
623       "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
624       "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
625       "lbu %[tmp_t2], 0(%[src_u]) \n"
626       "lbu %[tmp_t3], 0(%[src_v]) \n"
627       "replv.ph %[tmp_t2], %[tmp_t2] \n"
628       "replv.ph %[tmp_t3], %[tmp_t3] \n"
629       "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
630       "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
631       "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
632       "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
633       "srl %[tmp_t7], %[tmp_t7], 16 \n"
634       "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
635       "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
636       "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
637       "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
638       "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
639       "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
640       "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
641       "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
642       "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
643       "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
644       "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
645       "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
646       "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
647       "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
648       "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
649       "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
650       "precrq.ph.w %[tmp_t9], %[tmp_t8], %[tmp_t7] \n"
651       "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
652       "precr.qb.ph %[tmp_t8], %[tmp_t9], %[tmp_t7] \n"
653       "precrq.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
654       "sw %[tmp_t8], 0(%[rgb_buf]) \n"
655       "sw %[tmp_t7], 4(%[rgb_buf]) \n"
656       ".set pop \n"
657       :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
658        [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
659        [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
660        [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
661        [tmp_t9] "=&r" (tmp_t9)
662       :[src_y] "r" (src_y), [src_u] "r" (src_u), [src_v] "r" (src_v),
663        [tmp_ub] "r" (tmp_ub), [tmp_ug] "r" (tmp_ug), [yg] "r" (yg),
664        [tmp_vg] "r" (tmp_vg), [tmp_vr] "r" (tmp_vr),
665        [tmp_bb] "r" (tmp_bb), [tmp_bg] "r" (tmp_bg),
666        [tmp_br] "r" (tmp_br), [tmp_yg] "r" (tmp_yg),
667        [rgb_buf] "r" (rgb_buf), [tmp_mask] "r" (tmp_mask)
668     );
669     src_y += 2;
670     src_u += 1;
671     src_v += 1;
672     rgb_buf += 8;  // Advance 2 pixels.
673   }
674 }
675
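For reference, per pixel the kernel above computes the same fixed-point math as libyuv's scalar C path. This is a sketch, not part of the patch; uint8/uint32 are libyuv's typedefs, ub..yg are the raw 16-bit constants read from YuvConstants before the packing above, and intermediate per-halfword saturation is ignored:

// Scalar sketch of one pixel of I422ToARGBRow_DSPR2.
static __inline uint8 ClampByte(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                           int ub, int ug, int vg, int vr,
                           int bb, int bg, int br, int yg,
                           uint8* b, uint8* g, uint8* r) {
  // "yg = yg * 0x0101" plus the mul/srl-16 pair amounts to
  // (y * 0x0101 * yg) >> 16.
  int y1 = (int)(((uint32)y * 0x0101 * (uint32)yg) >> 16);
  *b = ClampByte((y1 + bb - u * ub) >> 6);             // tmp_ub holds -ub
  *g = ClampByte((y1 + bg - (u * ug + v * vg)) >> 6);  // subq_s.ph
  *r = ClampByte((y1 + br - v * vr) >> 6);             // tmp_vr holds -vr
}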
676 // Bilinear filter 8x2 -> 8x1
677 void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
678 ptrdiff_t src_stride, int dst_width,
679 int source_y_fraction) {
680 int y0_fraction = 256 - source_y_fraction;
681 const uint8* src_ptr1 = src_ptr + src_stride;
682
683 __asm__ __volatile__ (
684 ".set push \n"
685 ".set noreorder \n"
686
687 "replv.ph $t0, %[y0_fraction] \n"
688 "replv.ph $t1, %[source_y_fraction] \n"
689
690 "1: \n"
691 "lw $t2, 0(%[src_ptr]) \n"
692 "lw $t3, 0(%[src_ptr1]) \n"
693 "lw $t4, 4(%[src_ptr]) \n"
694 "lw $t5, 4(%[src_ptr1]) \n"
695 "muleu_s.ph.qbl $t6, $t2, $t0 \n"
696 "muleu_s.ph.qbr $t7, $t2, $t0 \n"
697 "muleu_s.ph.qbl $t8, $t3, $t1 \n"
698 "muleu_s.ph.qbr $t9, $t3, $t1 \n"
699 "muleu_s.ph.qbl $t2, $t4, $t0 \n"
700 "muleu_s.ph.qbr $t3, $t4, $t0 \n"
701 "muleu_s.ph.qbl $t4, $t5, $t1 \n"
702 "muleu_s.ph.qbr $t5, $t5, $t1 \n"
703 "addq.ph $t6, $t6, $t8 \n"
704 "addq.ph $t7, $t7, $t9 \n"
705 "addq.ph $t2, $t2, $t4 \n"
706 "addq.ph $t3, $t3, $t5 \n"
707 "shra_r.ph $t6, $t6, 8 \n"
708 "shra_r.ph $t7, $t7, 8 \n"
709 "shra_r.ph $t2, $t2, 8 \n"
710 "shra_r.ph $t3, $t3, 8 \n"
711 "precr.qb.ph $t6, $t6, $t7 \n"
712 "precr.qb.ph $t2, $t2, $t3 \n"
713 "addiu %[src_ptr], %[src_ptr], 8 \n"
714 "addiu %[src_ptr1], %[src_ptr1], 8 \n"
715 "addiu %[dst_width], %[dst_width], -8 \n"
716 "sw $t6, 0(%[dst_ptr]) \n"
717 "sw $t2, 4(%[dst_ptr]) \n"
718 "bgtz %[dst_width], 1b \n"
719 " addiu %[dst_ptr], %[dst_ptr], 8 \n"
720
721 ".set pop \n"
722 : [dst_ptr] "+r" (dst_ptr),
723 [src_ptr1] "+r" (src_ptr1),
724 [src_ptr] "+r" (src_ptr),
725 [dst_width] "+r" (dst_width)
726 : [source_y_fraction] "r" (source_y_fraction),
727 [y0_fraction] "r" (y0_fraction),
728 [src_stride] "r" (src_stride)
729 : "t0", "t1", "t2", "t3", "t4", "t5",
730 "t6", "t7", "t8", "t9"
731 );
732 }
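For reference, per byte this is the usual two-row blend; the shra_r.ph instructions supply the +128 rounding term. A sketch, not part of the patch:

// Scalar sketch of InterpolateRow_DSPR2 (the DSPR2 loop handles 8 bytes
// per iteration, so dst_width is assumed to be a multiple of 8).
void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  int y0_fraction = 256 - source_y_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * y0_fraction +
                          src_ptr1[i] * source_y_fraction + 128) >> 8);
  }
}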
734 void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
735 int x;
736 uint32 tmp_mask = 0xff;
737 uint32 tmp_t1;
738 for (x = 0; x < (width - 1); ++x) {
739 __asm__ __volatile__ (
740 ".set push \n"
741 ".set noreorder \n"
742 "ulw %[tmp_t1], 0(%[src_rgb24]) \n"
743 "addiu %[dst_argb], %[dst_argb], 4 \n"
744 "addiu %[src_rgb24], %[src_rgb24], 3 \n"
745 "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
746 "sw %[tmp_t1], -4(%[dst_argb]) \n"
747 ".set pop \n"
748 :[src_rgb24] "+r" (src_rgb24), [dst_argb] "+r" (dst_argb),
749 [tmp_t1] "=&r" (tmp_t1)
750 :[tmp_mask] "r" (tmp_mask)
751 : "memory"
752 );
753 }
754 uint8 b = src_rgb24[0];
755 uint8 g = src_rgb24[1];
756 uint8 r = src_rgb24[2];
757 dst_argb[0] = b;
758 dst_argb[1] = g;
759 dst_argb[2] = r;
760 dst_argb[3] = 255u;
761 }
762
763 void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
764 int x;
765 uint32 tmp_mask = 0xff;
766 uint32 tmp_t1, tmp_t2;
767 for (x = 0; x < (width-1); ++x) {
768 __asm__ __volatile__ (
769 ".set push \n"
770 ".set noreorder \n"
771 "ulw %[tmp_t1], 0(%[src_raw]) \n"
772 "addiu %[dst_argb], %[dst_argb], 4 \n"
773 "addiu %[src_raw], %[src_raw], 3 \n"
774 "srl %[tmp_t2], %[tmp_t1], 16 \n"
775 "ins %[tmp_t1], %[tmp_mask], 24, 8 \n"
776 "ins %[tmp_t1], %[tmp_t1], 16, 8 \n"
777 "ins %[tmp_t1], %[tmp_t2], 0, 8 \n"
778 "sw %[tmp_t1], -4(%[dst_argb]) \n"
779 ".set pop \n"
780 :[src_raw] "+r" (src_raw), [dst_argb] "+r" (dst_argb),
781 [tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2)
782 :[tmp_mask] "r" (tmp_mask)
783 : "memory"
784 );
785 }
786 uint8 r = src_raw[0];
787 uint8 g = src_raw[1];
788 uint8 b = src_raw[2];
789 dst_argb[0] = b;
790 dst_argb[1] = g;
791 dst_argb[2] = r;
792 dst_argb[3] = 255u;
793 }
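Both loops above use an unaligned word load (ulw) that reads one byte past the current 3-byte pixel, which is why they run width - 1 times and finish the last pixel in plain C. Per pixel they reduce to the following sketch (not part of the patch):

// Scalar sketch of RGB24ToARGBRow / RAWToARGBRow per pixel.
// RGB24 is stored B,G,R in memory; RAW is stored R,G,B.
static void Rgb24PixelToArgb(const uint8* src, uint8* dst) {
  dst[0] = src[0];  // B
  dst[1] = src[1];  // G
  dst[2] = src[2];  // R
  dst[3] = 255u;    // A, inserted by the "ins ..., 24, 8" above
}
static void RawPixelToArgb(const uint8* src, uint8* dst) {
  dst[0] = src[2];  // B (the srl/ins pair swaps B and R)
  dst[1] = src[1];  // G
  dst[2] = src[0];  // R
  dst[3] = 255u;
}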
794
795 void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, uint8* dst_argb,
796 int width) {
797 int x;
798 uint32 tmp_mask = 0xff;
799 uint32 tmp_t1, tmp_t2, tmp_t3;
800 for (x = 0; x < width; ++x) {
801 __asm__ __volatile__ (
802 ".set push \n"
803 ".set noreorder \n"
804 "lhu %[tmp_t1], 0(%[src_rgb565]) \n"
805 "addiu %[dst_argb], %[dst_argb], 4 \n"
806 "addiu %[src_rgb565], %[src_rgb565], 2 \n"
807 "sll %[tmp_t2], %[tmp_t1], 8 \n"
808 "ins %[tmp_t2], %[tmp_mask], 24,8 \n"
809 "ins %[tmp_t2], %[tmp_t1], 3, 16 \n"
810 "ins %[tmp_t2], %[tmp_t1], 5, 11 \n"
811 "srl %[tmp_t3], %[tmp_t1], 9 \n"
812 "ins %[tmp_t2], %[tmp_t3], 8, 2 \n"
813 "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
814 "srl %[tmp_t3], %[tmp_t1], 2 \n"
815 "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
816 "sw %[tmp_t2], -4(%[dst_argb]) \n"
817 ".set pop \n"
818 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
819 [tmp_t3] "=&r" (tmp_t3), [src_rgb565] "+r" (src_rgb565),
820 [dst_argb] "+r" (dst_argb)
821 :[tmp_mask] "r" (tmp_mask)
822 );
823 }
824 }
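The sll/ins/srl sequence above expands each 5- or 6-bit field to 8 bits by replicating its top bits into the low bits. Per pixel (a sketch, not part of the patch):

// Scalar sketch of RGB565ToARGBRow_DSPR2 per pixel.
static void Rgb565PixelToArgb(uint16 v, uint8* dst) {
  uint8 b = (uint8)(v & 0x1f);
  uint8 g = (uint8)((v >> 5) & 0x3f);
  uint8 r = (uint8)(v >> 11);
  dst[0] = (uint8)((b << 3) | (b >> 2));  // 5 -> 8 bits
  dst[1] = (uint8)((g << 2) | (g >> 4));  // 6 -> 8 bits
  dst[2] = (uint8)((r << 3) | (r >> 2));
  dst[3] = 255u;
}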
825
826 void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555, uint8* dst_argb,
827 int width) {
828 int x;
829 uint32 tmp_t1, tmp_t2, tmp_t3;
830 for (x = 0; x < width; ++x) {
831 __asm__ __volatile__ (
832 ".set push \n"
833 ".set noreorder \n"
834 "lh %[tmp_t1], 0(%[src_argb1555]) \n"
835 "addiu %[dst_argb], %[dst_argb], 4 \n"
836 "addiu %[src_argb1555], %[src_argb1555], 2 \n"
837 "sll %[tmp_t2], %[tmp_t1], 9 \n"
838 "ins %[tmp_t2], %[tmp_t1], 4, 15 \n"
839 "ins %[tmp_t2], %[tmp_t1], 6, 10 \n"
840 "srl %[tmp_t3], %[tmp_t1], 7 \n"
841 "ins %[tmp_t2], %[tmp_t3], 8, 3 \n"
842 "ins %[tmp_t2], %[tmp_t1], 3, 5 \n"
843 "srl %[tmp_t3], %[tmp_t1], 2 \n"
844 "ins %[tmp_t2], %[tmp_t3], 0, 3 \n"
845 "sw %[tmp_t2], -4(%[dst_argb]) \n"
846 ".set pop \n"
847 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
848 [tmp_t3] "=&r" (tmp_t3), [src_argb1555] "+r" (src_argb1555),
849 [dst_argb] "+r" (dst_argb)
850 :
851 );
852 }
853 }
854
855 void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444, uint8* dst_argb,
856 int width) {
857 int x;
858 uint32 tmp_t1;
859 for (x = 0; x < width; ++x) {
860 __asm__ __volatile__ (
861 ".set push \n"
862 ".set noreorder \n"
863 "lh %[tmp_t1], 0(%[src_argb4444]) \n"
864 "addiu %[dst_argb], %[dst_argb], 4 \n"
865 "addiu %[src_argb4444], %[src_argb4444], 2 \n"
866 "ins %[tmp_t1], %[tmp_t1], 16, 16 \n"
867 "ins %[tmp_t1], %[tmp_t1], 12, 16 \n"
868 "ins %[tmp_t1], %[tmp_t1], 8, 12 \n"
869 "ins %[tmp_t1], %[tmp_t1], 4, 8 \n"
870 "sw %[tmp_t1], -4(%[dst_argb]) \n"
871 ".set pop \n"
872 :[src_argb4444] "+r" (src_argb4444), [dst_argb] "+r" (dst_argb),
873 [tmp_t1] "=&r" (tmp_t1)
874 );
875 }
876 }
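The same bit-replication idea covers the remaining 16-bit formats; for ARGB4444 the four ins instructions duplicate every nibble, which per channel is a multiply by 0x11. A sketch (not part of the patch):

// Scalar sketch of ARGB1555ToARGBRow / ARGB4444ToARGBRow per pixel.
static void Argb1555PixelToArgb(uint16 v, uint8* dst) {
  uint8 b = (uint8)(v & 0x1f);
  uint8 g = (uint8)((v >> 5) & 0x1f);
  uint8 r = (uint8)((v >> 10) & 0x1f);
  uint8 a = (uint8)(v >> 15);
  dst[0] = (uint8)((b << 3) | (b >> 2));
  dst[1] = (uint8)((g << 3) | (g >> 2));
  dst[2] = (uint8)((r << 3) | (r >> 2));
  dst[3] = (uint8)(a ? 255 : 0);
}
static void Argb4444PixelToArgb(uint16 v, uint8* dst) {
  dst[0] = (uint8)((v & 0x0f) * 0x11);          // B: nibble replicated
  dst[1] = (uint8)(((v >> 4) & 0x0f) * 0x11);   // G
  dst[2] = (uint8)(((v >> 8) & 0x0f) * 0x11);   // R
  dst[3] = (uint8)(((v >> 12) & 0x0f) * 0x11);  // A
}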
877
878 void I444ToARGBRow_DSPR2(const uint8* y_buf,
879                          const uint8* u_buf,
880                          const uint8* v_buf,
881                          uint8* rgb_buf,
882                          const struct YuvConstants* yuvconstants,
883                          int width) {
884   int x;
885   uint32 tmp_ub = yuvconstants->kUVToB[0];
886   uint32 tmp_ug = yuvconstants->kUVToG[0];
887   uint32 tmp_vg = yuvconstants->kUVToG[1];
888   uint32 tmp_vr = yuvconstants->kUVToR[1];
889   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
890   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
891   uint32 tmp_br = yuvconstants->kUVBiasR[0];
892   uint32 yg = yuvconstants->kYToRgb[0];
893   uint32 tmp_mask = 0x7fff7fff;
894   uint32 tmp_yg;
895
896   tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
897   tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
898   tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
899   tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
900   tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
901   tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
902   tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
903   tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
904   yg = yg * 0x0101;
905
906   for (x = 0; x < width - 1; x += 2) {
907     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
908     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
909     __asm__ __volatile__ (
910       ".set push \n"
911       ".set noreorder \n"
912       "lbu %[tmp_t7], 0(%[y_buf]) \n"
913       "lbu %[tmp_t1], 1(%[y_buf]) \n"
914       "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
915       "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
916       "lh %[tmp_t2], 0(%[u_buf]) \n"
917       "lh %[tmp_t3], 0(%[v_buf]) \n"
918       "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
919       "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
920       "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
921       "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
922       "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
923       "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
924       "srl %[tmp_t7], %[tmp_t7], 16 \n"
925       "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
926       "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
927       "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
928       "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
929       "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
930       "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
931       "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
932       "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
933       "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
934       "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
935       "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
936       "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
937       "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
938       "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
939       "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
940       "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
941       "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
942       "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
943       "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
944       "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
945       "sw %[tmp_t8], 0(%[rgb_buf]) \n"
946       "sw %[tmp_t7], 4(%[rgb_buf]) \n"
947       ".set pop \n"
948       :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
949        [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
950        [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
951        [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
952        [tmp_t9] "=&r" (tmp_t9)
953       :[y_buf] "r" (y_buf), [yg] "r" (yg),
954        [u_buf] "r" (u_buf), [v_buf] "r" (v_buf),
955        [tmp_ub] "r" (tmp_ub), [tmp_ug] "r" (tmp_ug),
956        [tmp_vg] "r" (tmp_vg), [tmp_vr] "r" (tmp_vr),
957        [tmp_bb] "r" (tmp_bb), [tmp_bg] "r" (tmp_bg),
958        [tmp_br] "r" (tmp_br), [tmp_yg] "r" (tmp_yg),
959        [rgb_buf] "r" (rgb_buf), [tmp_mask] "r" (tmp_mask)
960     );
961     y_buf += 2;
962     u_buf += 2;
963     v_buf += 2;
964     rgb_buf += 8;  // Advance 2 pixels.
965   }
966 }
967
968 void I422ToARGB4444Row_DSPR2(const uint8* src_y,
969                              const uint8* src_u,
970                              const uint8* src_v,
971                              uint8* dst_argb4444,
972                              const struct YuvConstants* yuvconstants,
973                              int width) {
974   int x;
975   uint32 tmp_ub = yuvconstants->kUVToB[0];
976   uint32 tmp_ug = yuvconstants->kUVToG[0];
977   uint32 tmp_vg = yuvconstants->kUVToG[1];
978   uint32 tmp_vr = yuvconstants->kUVToR[1];
979   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
980   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
981   uint32 tmp_br = yuvconstants->kUVBiasR[0];
982   uint32 yg = yuvconstants->kYToRgb[0];
983   uint32 tmp_yg;
984   uint32 tmp_mask = 0x7fff7fff;
985   tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
986   tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
987   tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
988   tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
989   tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
990   tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
991   tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
992   tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
993 yg = yg * 0x0101;
994
995 for (x = 0; x < width - 1; x += 2) {
996 uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
997 uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
998 __asm__ __volatile__ (
999 ".set push \n"
1000 ".set noreorder \n"
1001 "lbu %[tmp_t7], 0(%[src_y]) \n"
1002 "lbu %[tmp_t1], 1(%[src_y]) \n"
1003 "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
1004 "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
1005 "lbu %[tmp_t2], 0(%[src_u]) \n"
1006 "lbu %[tmp_t3], 0(%[src_v]) \n"
1007 "replv.ph %[tmp_t2], %[tmp_t2] \n"
1008 "replv.ph %[tmp_t3], %[tmp_t3] \n"
1009 "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
1010 "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
1011 "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
1012 "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
1013 "srl %[tmp_t7], %[tmp_t7], 16 \n"
1014 "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
1015 "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
1016 "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
1017 "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
1018 "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
1019 "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
1020 "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
1021 "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
1022 "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
1023 "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
1024 "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
1025 "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
1026 "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
1027 "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
1028 "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
1029 "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
1030 "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
1031 "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
1032 "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
1033 "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
1034 "shrl.qb %[tmp_t1], %[tmp_t8], 4 \n"
1035 "shrl.qb %[tmp_t2], %[tmp_t7], 4 \n"
1036 "shrl.ph %[tmp_t8], %[tmp_t1], 4 \n"
1037 "shrl.ph %[tmp_t7], %[tmp_t2], 4 \n"
1038 "or %[tmp_t8], %[tmp_t8], %[tmp_t1] \n"
1039 "or %[tmp_t7], %[tmp_t7], %[tmp_t2] \n"
1040 "precr.qb.ph %[tmp_t8], %[tmp_t7], %[tmp_t8] \n"
1041 "sw %[tmp_t8], 0(%[dst_argb4444]) \n"
1042 ".set pop \n"
1043 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1044 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1045 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1046 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1047 [tmp_t9] "=&r" (tmp_t9)
1048 :[dst_argb4444] "r" (dst_argb4444), [yg] "r" (yg),
1049 [src_u] "r" (src_u), [src_v] "r" (src_v),
1050 [src_y] "r" (src_y),
1051 [tmp_ub] "r" (tmp_ub), [tmp_ug] "r" (tmp_ug),
1052 [tmp_vg] "r" (tmp_vg), [tmp_vr] "r" (tmp_vr),
1053 [tmp_bb] "r" (tmp_bb), [tmp_bg] "r" (tmp_bg),
1054 [tmp_br] "r" (tmp_br), [tmp_yg] "r" (tmp_yg),
1055 [tmp_mask] "r" (tmp_mask)
1056 );
1057 src_y += 2;
1058 src_u += 1;
1059 src_v += 1;
1060 dst_argb4444 += 4; // Advance 2 pixels.
1061 }
1062 }
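After the usual YUV-to-ARGB math, the shrl.qb/shrl.ph/or/precr.qb.ph tail above keeps only the top nibble of each 8-bit channel (alpha is forced opaque by tmp_mask). Per pixel the packing is roughly this sketch, not part of the patch:

// Scalar sketch of the ARGB4444 packing step in I422ToARGB4444Row_DSPR2.
static uint16 PackArgb4444(uint8 b, uint8 g, uint8 r, uint8 a) {
  return (uint16)(((a >> 4) << 12) | ((r >> 4) << 8) |
                  ((g >> 4) << 4) | (b >> 4));
}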
1063
1064 void I422ToARGB1555Row_DSPR2(const uint8* src_y,
1065 const uint8* src_u,
1066 const uint8* src_v,
1067 uint8* dst_argb1555,
1068 const struct YuvConstants* yuvconstants,
1069 int width) {
1070 int x;
1071 uint32 tmp_ub = yuvconstants->kUVToB[0];
1072 uint32 tmp_ug = yuvconstants->kUVToG[0];
1073 uint32 tmp_vg = yuvconstants->kUVToG[1];
1074 uint32 tmp_vr = yuvconstants->kUVToR[1];
1075 uint32 tmp_bb = yuvconstants->kUVBiasB[0];
1076 uint32 tmp_bg = yuvconstants->kUVBiasG[0];
1077 uint32 tmp_br = yuvconstants->kUVBiasR[0];
1078 uint32 yg = yuvconstants->kYToRgb[0];
1079 uint32 tmp_yg;
1080 uint32 tmp_mask = 0x80008000;
1081   tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
1082   tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
1083   tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
1084   tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
1085   tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
1086   tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
1087   tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
1088   tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
1089 yg = yg * 0x0101;
1090
1091 for (x = 0; x < width - 1; x += 2) {
1092 uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1093 uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
1094 __asm__ __volatile__ (
1095 ".set push \n"
1096 ".set noreorder \n"
1097 "lbu %[tmp_t7], 0(%[src_y]) \n"
1098 "lbu %[tmp_t1], 1(%[src_y]) \n"
1099 "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
1100 "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
1101 "lbu %[tmp_t2], 0(%[src_u]) \n"
1102 "lbu %[tmp_t3], 0(%[src_v]) \n"
1103 "replv.ph %[tmp_t2], %[tmp_t2] \n"
1104 "replv.ph %[tmp_t3], %[tmp_t3] \n"
1105 "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
1106 "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
1107 "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
1108 "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
1109 "srl %[tmp_t7], %[tmp_t7], 16 \n"
1110 "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
1111 "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
1112 "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
1113 "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
1114 "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
1115 "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
1116 "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
1117 "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
1118 "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
1119 "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
1120 "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
1121 "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
1122 "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
1123 "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
1124 "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
1125 "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
1126 "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
1127 "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
1128 "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
1129 "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
1130 "ins %[tmp_t3], %[tmp_t8], 7, 24 \n"
1131 "ins %[tmp_t3], %[tmp_t8], 10, 16 \n"
1132 "ins %[tmp_t3], %[tmp_t8], 13, 8 \n"
1133 "ins %[tmp_t4], %[tmp_t7], 7, 24 \n"
1134 "ins %[tmp_t4], %[tmp_t7], 10, 16 \n"
1135 "ins %[tmp_t4], %[tmp_t7], 13, 8 \n"
1136 "precrq.ph.w %[tmp_t8], %[tmp_t4], %[tmp_t3] \n"
1137 "or %[tmp_t8], %[tmp_t8], %[tmp_mask]\n"
1138 "sw %[tmp_t8], 0(%[dst_argb1555]) \n"
1139 ".set pop \n"
1140 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1141 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1142 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1143 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1144 [tmp_t9] "=&r" (tmp_t9)
1145 :[dst_argb1555] "r" (dst_argb1555), [yg] "r" (yg),
1146 [src_u] "r" (src_u), [src_v] "r" (src_v),
1147 [src_y] "r" (src_y),
1148 [tmp_ub] "r" (tmp_ub), [tmp_ug] "r" (tmp_ug),
1149 [tmp_vg] "r" (tmp_vg), [tmp_vr] "r" (tmp_vr),
1150 [tmp_bb] "r" (tmp_bb), [tmp_bg] "r" (tmp_bg),
1151 [tmp_br] "r" (tmp_br), [tmp_yg] "r" (tmp_yg),
1152 [tmp_mask] "r" (tmp_mask)
1153 );
1154 src_y += 2;
1155 src_u += 1;
1156 src_v += 1;
1157 dst_argb1555 += 4; // Advance 2 pixels.
1158 }
1159 }
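Here the ins/precrq.ph.w tail keeps the top 5 bits of each channel and the final or with tmp_mask (0x80008000) sets the alpha bit of both pixels. Per pixel the result is roughly this sketch, not part of the patch:

// Scalar sketch of the ARGB1555 packing step in I422ToARGB1555Row_DSPR2.
static uint16 PackArgb1555(uint8 b, uint8 g, uint8 r) {
  return (uint16)(0x8000 | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3));
}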
1160
1161 void NV12ToARGBRow_DSPR2(const uint8* src_y,
1162 const uint8* src_uv,
1163 uint8* rgb_buf,
1164 const struct YuvConstants* yuvconstants,
1165 int width) {
1166 int x;
1167 uint32 tmp_ub = yuvconstants->kUVToB[0];
1168 uint32 tmp_ug = yuvconstants->kUVToG[0];
1169 uint32 tmp_vg = yuvconstants->kUVToG[1];
1170 uint32 tmp_vr = yuvconstants->kUVToR[1];
1171 uint32 tmp_bb = yuvconstants->kUVBiasB[0];
1172 uint32 tmp_bg = yuvconstants->kUVBiasG[0];
1173 uint32 tmp_br = yuvconstants->kUVBiasR[0];
1174 uint32 yg = yuvconstants->kYToRgb[0];
1175 uint32 tmp_mask = 0x7fff7fff;
1176 uint32 tmp_yg;
1177   tmp_bb = ((uint32)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
1178   tmp_bg = ((uint32)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
1179   tmp_br = ((uint32)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
1180   tmp_yg = ((uint32)(yg & 0xffff) << 16) | (yg & 0xffff);
1181   tmp_ub = ~(((uint32)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
1182   tmp_ug = ((uint32)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
1183   tmp_vg = ((uint32)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
1184   tmp_vr = ~(((uint32)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
1185 yg = yg * 0x0101;
1186
1187 for (x = 0; x < width - 1; x += 2) {
1188 uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1189 uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
1190 __asm__ __volatile__ (
1191 ".set push \n"
1192 ".set noreorder \n"
1193 "lbu %[tmp_t7], 0(%[src_y]) \n"
1194 "lbu %[tmp_t1], 1(%[src_y]) \n"
1195 "mul %[tmp_t7], %[tmp_t7], %[yg] \n"
1196 "mul %[tmp_t1], %[tmp_t1], %[yg] \n"
1197 "lbu %[tmp_t2], 0(%[src_uv]) \n"
1198 "lbu %[tmp_t3], 1(%[src_uv]) \n"
1199 "replv.ph %[tmp_t2], %[tmp_t2] \n"
1200 "replv.ph %[tmp_t3], %[tmp_t3] \n"
1201 "mul.ph %[tmp_t4], %[tmp_t2], %[tmp_ub] \n"
1202 "mul.ph %[tmp_t5], %[tmp_t2], %[tmp_ug] \n"
1203 "mul.ph %[tmp_t6], %[tmp_t3], %[tmp_vr] \n"
1204 "mul.ph %[tmp_t3], %[tmp_t3], %[tmp_vg] \n"
1205 "srl %[tmp_t7], %[tmp_t7], 16 \n"
1206 "ins %[tmp_t1], %[tmp_t7], 0, 16 \n"
1207 "addq_s.ph %[tmp_t7], %[tmp_t1], %[tmp_bb] \n"
1208 "addq_s.ph %[tmp_t8], %[tmp_t1], %[tmp_bg] \n"
1209 "addq_s.ph %[tmp_t9], %[tmp_t1], %[tmp_br] \n"
1210 "addq_s.ph %[tmp_t5], %[tmp_t5], %[tmp_t3] \n"
1211 "addq_s.ph %[tmp_t7], %[tmp_t7], %[tmp_t4] \n"
1212 "subq_s.ph %[tmp_t8], %[tmp_t8], %[tmp_t5] \n"
1213 "addq_s.ph %[tmp_t9], %[tmp_t9], %[tmp_t6] \n"
1214 "shra.ph %[tmp_t7], %[tmp_t7], 6 \n"
1215 "shra.ph %[tmp_t8], %[tmp_t8], 6 \n"
1216 "shra.ph %[tmp_t9], %[tmp_t9], 6 \n"
1217 "shll_s.ph %[tmp_t7], %[tmp_t7], 7 \n"
1218 "shll_s.ph %[tmp_t8], %[tmp_t8], 7 \n"
1219 "shll_s.ph %[tmp_t9], %[tmp_t9], 7 \n"
1220 "precrqu_s.qb.ph %[tmp_t8], %[tmp_mask], %[tmp_t8] \n"
1221 "precrqu_s.qb.ph %[tmp_t7], %[tmp_t9], %[tmp_t7] \n"
1222 "precrq.ph.w %[tmp_t2], %[tmp_t8], %[tmp_t7] \n"
1223 "ins %[tmp_t7], %[tmp_t8], 16, 16 \n"
1224 "precr.qb.ph %[tmp_t8], %[tmp_t2], %[tmp_t7] \n"
1225 "precrq.qb.ph %[tmp_t7], %[tmp_t2], %[tmp_t7] \n"
1226 "sw %[tmp_t8], 0(%[rgb_buf]) \n"
1227 "sw %[tmp_t7], 4(%[rgb_buf]) \n"
1228 ".set pop \n"
1229 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1230 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1231 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1232 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1233 [tmp_t9] "=&r" (tmp_t9)
1234 :[src_y] "r" (src_y), [src_uv] "r" (src_uv), [yg] "r" (yg),
1235 [tmp_ub] "r" (tmp_ub), [tmp_ug] "r" (tmp_ug),
1236 [tmp_vg] "r" (tmp_vg), [tmp_vr] "r" (tmp_vr),
1237 [tmp_bb] "r" (tmp_bb), [tmp_bg] "r" (tmp_bg),
1238 [tmp_br] "r" (tmp_br), [tmp_yg] "r" (tmp_yg),
1239 [rgb_buf] "r" (rgb_buf), [tmp_mask] "r" (tmp_mask)
1240 );
1241
1242 src_y += 2;
1243 src_uv += 2;
1244 rgb_buf += 8; // Advance 2 pixels.
1245 }
1246 }
1247
1248 void BGRAToUVRow_DSPR2(const uint8* src_rgb0, int src_stride_rgb,
1249 uint8* dst_u, uint8* dst_v, int width) {
1250 const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1251 int x;
1252 int const1 = 0xffda0000;
1253 int const2 = 0x0070ffb6;
1254 int const3 = 0x00700000;
1255 int const4 = 0xffeeffa2;
1256 int const5 = 0x100;
1257 for (x = 0; x < width - 1; x += 2) {
1258 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1259 int tmp_t6, tmp_t7, tmp_t8;
1260 __asm__ __volatile__ (
1261 ".set push \n"
1262 ".set noreorder \n"
1263 "lw %[tmp_t1], 0(%[src_rgb0]) \n"
1264 "lw %[tmp_t2], 4(%[src_rgb0]) \n"
1265 "lw %[tmp_t3], 0(%[src_rgb1]) \n"
1266 "lw %[tmp_t4], 4(%[src_rgb1]) \n"
1267 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1268 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1269 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1270 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1271 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1272 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1273 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1274 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1275 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
1276 "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
1277 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
1278 "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
1279 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
1280 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
1281 "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
1282 "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
1283 "mult $ac0, %[const5], %[const5] \n"
1284 "mult $ac1, %[const5], %[const5] \n"
1285 "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
1286 "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
1287 "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
1288 "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
1289 "extr_r.w %[tmp_t7], $ac0, 9 \n"
1290 "extr_r.w %[tmp_t8], $ac1, 9 \n"
1291 "addiu %[dst_u], %[dst_u], 1 \n"
1292 "addiu %[dst_v], %[dst_v], 1 \n"
1293 "addiu %[src_rgb0], %[src_rgb0], 8 \n"
1294 "addiu %[src_rgb1], %[src_rgb1], 8 \n"
1295 "sb %[tmp_t7], -1(%[dst_u]) \n"
1296 "sb %[tmp_t8], -1(%[dst_v]) \n"
1297 ".set pop \n"
1298 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1299 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1300 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1301 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1302 [src_rgb0] "+r" (src_rgb0), [src_rgb1] "+r" (src_rgb1),
1303 [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v)
1304 :[const1] "r" (const1), [const2] "r" (const2),
1305 [const3] "r" (const3), [const4] "r" (const4),
1306 [const5] "r" (const5)
1307 :"hi", "lo" ,"$ac1lo", "$ac1hi"
1308 );
1309 }
1310 }
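The halfword constants above are libyuv's usual BT.601 U/V coefficients (const1/const2 pack -38, 0, 112, -74 and const3/const4 pack 112, 0, -18, -94), applied to a 2x2 average; the mult of const5 by itself seeds each accumulator so that, with the doubled dpaq_s.w.ph products and the rounding extr_r.w by 9, the result effectively matches the familiar (... + 0x8080) >> 8 form. A scalar sketch (not part of the patch):

// Scalar sketch of one U/V sample from BGRAToUVRow_DSPR2: average a
// 2x2 block of pixels, then apply the standard libyuv coefficients.
static int RGBToUSketch(int r, int g, int b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
static int RGBToVSketch(int r, int g, int b) {
  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
// r, g, b here are 2x2 averages, e.g. b = (b00 + b01 + b10 + b11) >> 2,
// which is what the addu.ph/shrl.ph-by-2 prologue computes in packed form.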
1311
1312 void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1313 int x;
1314 int const1 = 0x00420000;
1315 int const2 = 0x00190081;
1316 int const5 = 0x40;
1317 for (x = 0; x < width; x+=4) {
1318 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1319 int tmp_t6, tmp_t7, tmp_t8;
1320 __asm__ __volatile__ (
1321 ".set push \n"
1322 ".set noreorder \n"
1323 "lw %[tmp_t1], 0(%[src_argb0]) \n"
1324 "lw %[tmp_t2], 4(%[src_argb0]) \n"
1325 "lw %[tmp_t3], 8(%[src_argb0]) \n"
1326 "lw %[tmp_t4], 12(%[src_argb0]) \n"
1327 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1328 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1329 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1330 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1331 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1332 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1333 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1334 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1335 "mult $ac0, %[const5], %[const5] \n"
1336 "mult $ac1, %[const5], %[const5] \n"
1337 "mult $ac2, %[const5], %[const5] \n"
1338 "mult $ac3, %[const5], %[const5] \n"
1339 "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
1340 "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
1341 "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
1342 "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
1343 "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
1344 "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
1345 "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
1346 "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
1347 "extr_r.w %[tmp_t1], $ac0, 8 \n"
1348 "extr_r.w %[tmp_t2], $ac1, 8 \n"
1349 "extr_r.w %[tmp_t3], $ac2, 8 \n"
1350 "extr_r.w %[tmp_t4], $ac3, 8 \n"
1351 "addiu %[src_argb0],%[src_argb0], 16 \n"
1352 "addiu %[dst_y], %[dst_y], 4 \n"
1353 "sb %[tmp_t1], -4(%[dst_y]) \n"
1354 "sb %[tmp_t2], -3(%[dst_y]) \n"
1355 "sb %[tmp_t3], -2(%[dst_y]) \n"
1356 "sb %[tmp_t4], -1(%[dst_y]) \n"
1357 ".set pop \n"
1358 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1359 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1360 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1361 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1362 [src_argb0] "+r" (src_argb0), [dst_y] "+r" (dst_y)
1363 :[const1] "r" (const1), [const2] "r" (const2),
1364 [const5] "r" (const5)
1365 :"hi", "lo" , "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi",
1366 "$ac3lo", "$ac3hi"
1367 );
1368 }
1369 }
1370
1371 void ABGRToUVRow_DSPR2(const uint8* src_rgb0, int src_stride_rgb,
1372 uint8* dst_u, uint8* dst_v, int width) {
1373 const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1374 int x;
1375 int const1 = 0xffb6ffda;
1376 int const2 = 0x00000070;
1377 int const3 = 0xffa20070;
1378 int const4 = 0x0000ffee;
1379 int const5 = 0x100;
1380
1381 for (x = 0; x < width - 1; x += 2) {
1382 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1383 int tmp_t6, tmp_t7, tmp_t8;
1384 __asm__ __volatile__ (
1385 ".set push \n"
1386 ".set noreorder \n"
1387 "lw %[tmp_t1], 0(%[src_rgb0]) \n"
1388 "lw %[tmp_t2], 4(%[src_rgb0]) \n"
1389 "lw %[tmp_t3], 0(%[src_rgb1]) \n"
1390 "lw %[tmp_t4], 4(%[src_rgb1]) \n"
1391 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1392 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1393 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1394 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1395 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1396 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1397 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1398 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1399 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
1400 "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
1401 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
1402 "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
1403 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
1404 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
1405 "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
1406 "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
1407 "mult $ac0, %[const5], %[const5] \n"
1408 "mult $ac1, %[const5], %[const5] \n"
1409 "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
1410 "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
1411 "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
1412 "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
1413 "extr_r.w %[tmp_t7], $ac0, 9 \n"
1414 "extr_r.w %[tmp_t8], $ac1, 9 \n"
1415 "addiu %[dst_u], %[dst_u], 1 \n"
1416 "addiu %[dst_v], %[dst_v], 1 \n"
1417 "addiu %[src_rgb0], %[src_rgb0], 8 \n"
1418 "addiu %[src_rgb1], %[src_rgb1], 8 \n"
1419 "sb %[tmp_t7], -1(%[dst_u]) \n"
1420 "sb %[tmp_t8], -1(%[dst_v]) \n"
1421 ".set pop \n"
1422 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1423 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1424 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1425 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1426 [src_rgb0] "+r" (src_rgb0), [src_rgb1] "+r" (src_rgb1),
1427 [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v)
1428 :[const1] "r" (const1), [const2] "r" (const2),
1429 [const3] "r" (const3), [const4] "r" (const4),
1430 [const5] "r" (const5)
1431 :"hi", "lo" ,"$ac1lo", "$ac1hi"
1432 );
1433 }
1434 }
1435
1436 void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1437 int x;
1438 int const1 = 0x00810019;
1439 int const2 = 0x00000042;
1440 int const5 = 0x40;
1441 for (x = 0; x < width; x+=4) {
1442 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1443 int tmp_t6, tmp_t7, tmp_t8;
1444 __asm__ __volatile__ (
1445 ".set push \n"
1446 ".set noreorder \n"
1447 "lw %[tmp_t1], 0(%[src_argb0]) \n"
1448 "lw %[tmp_t2], 4(%[src_argb0]) \n"
1449 "lw %[tmp_t3], 8(%[src_argb0]) \n"
1450 "lw %[tmp_t4], 12(%[src_argb0]) \n"
1451 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1452 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1453 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1454 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1455 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1456 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1457 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1458 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1459 "mult $ac0, %[const5], %[const5] \n"
1460 "mult $ac1, %[const5], %[const5] \n"
1461 "mult $ac2, %[const5], %[const5] \n"
1462 "mult $ac3, %[const5], %[const5] \n"
1463 "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
1464 "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
1465 "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
1466 "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
1467 "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
1468 "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
1469 "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
1470 "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
1471 "extr_r.w %[tmp_t1], $ac0, 8 \n"
1472 "extr_r.w %[tmp_t2], $ac1, 8 \n"
1473 "extr_r.w %[tmp_t3], $ac2, 8 \n"
1474 "extr_r.w %[tmp_t4], $ac3, 8 \n"
1475 "addiu %[dst_y], %[dst_y], 4 \n"
1476 "addiu %[src_argb0],%[src_argb0], 16 \n"
1477 "sb %[tmp_t1], -4(%[dst_y]) \n"
1478 "sb %[tmp_t2], -3(%[dst_y]) \n"
1479 "sb %[tmp_t3], -2(%[dst_y]) \n"
1480 "sb %[tmp_t4], -1(%[dst_y]) \n"
1481 ".set pop \n"
1482 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1483 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1484 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1485 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1486 [src_argb0] "+r" (src_argb0), [dst_y] "+r" (dst_y)
1487 :[const1] "r" (const1), [const2] "r" (const2),
1488 [const5] "r" (const5)
1489 :"hi", "lo" , "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi",
1490 "$ac3lo", "$ac3hi"
1491 );
1492 }
1493 }
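const1/const2 above pack the standard 25/129/66 luma weights; "mult $ac0, 0x40, 0x40" preloads each accumulator with 0x1000 (the 16 << 8 luma offset) and extr_r.w by 8 adds the rounding 128, i.e. the familiar 0x1080 bias. A scalar sketch (not part of the patch):

// Scalar sketch of one Y output of ARGBToYRow_DSPR2 (ARGB bytes are
// B, G, R, A in memory).
static int ARGBPixelToY(const uint8* argb) {
  return (66 * argb[2] + 129 * argb[1] + 25 * argb[0] + 0x1080) >> 8;
}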
1494
1495 void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1496 int x;
1497 int const1 = 0x00810042;
1498 int const2 = 0x00000019;
1499 int const5 = 0x40;
1500 for (x = 0; x < width; x+=4) {
1501 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1502 int tmp_t6, tmp_t7, tmp_t8;
1503 __asm__ __volatile__ (
1504 ".set push \n"
1505 ".set noreorder \n"
1506 "lw %[tmp_t1], 0(%[src_argb0]) \n"
1507 "lw %[tmp_t2], 4(%[src_argb0]) \n"
1508 "lw %[tmp_t3], 8(%[src_argb0]) \n"
1509 "lw %[tmp_t4], 12(%[src_argb0]) \n"
1510 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1511 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1512 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1513 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1514 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1515 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1516 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1517 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1518 "mult $ac0, %[const5], %[const5] \n"
1519 "mult $ac1, %[const5], %[const5] \n"
1520 "mult $ac2, %[const5], %[const5] \n"
1521 "mult $ac3, %[const5], %[const5] \n"
1522 "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
1523 "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
1524 "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
1525 "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
1526 "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
1527 "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
1528 "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
1529 "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
1530 "extr_r.w %[tmp_t1], $ac0, 8 \n"
1531 "extr_r.w %[tmp_t2], $ac1, 8 \n"
1532 "extr_r.w %[tmp_t3], $ac2, 8 \n"
1533 "extr_r.w %[tmp_t4], $ac3, 8 \n"
1534 "addiu %[src_argb0],%[src_argb0], 16 \n"
1535 "addiu %[dst_y], %[dst_y], 4 \n"
1536 "sb %[tmp_t1], -4(%[dst_y]) \n"
1537 "sb %[tmp_t2], -3(%[dst_y]) \n"
1538 "sb %[tmp_t3], -2(%[dst_y]) \n"
1539 "sb %[tmp_t4], -1(%[dst_y]) \n"
1540 ".set pop \n"
1541 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1542 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1543 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1544 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1545 [src_argb0] "+r" (src_argb0), [dst_y] "+r" (dst_y)
1546 :[const1] "r" (const1), [const2] "r" (const2),
1547 [const5] "r" (const5)
1548 :"hi", "lo" , "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi",
1549 "$ac3lo", "$ac3hi"
1550 );
1551 }
1552 }
1553
1554 void RGBAToUVRow_DSPR2(const uint8* src_rgb0, int src_stride_rgb,
1555 uint8* dst_u, uint8* dst_v, int width) {
1556 const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1557 int x;
1558 int const1 = 0xffb60070;
1559 int const2 = 0x0000ffda;
1560 int const3 = 0xffa2ffee;
1561 int const4 = 0x00000070;
1562 int const5 = 0x100;
1563
1564 for (x = 0; x < width - 1; x += 2) {
1565 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1566 int tmp_t6, tmp_t7, tmp_t8;
1567 __asm__ __volatile__ (
1568 ".set push \n"
1569 ".set noreorder \n"
1570 "ulw %[tmp_t1], 0+1(%[src_rgb0]) \n"
1571 "ulw %[tmp_t2], 4+1(%[src_rgb0]) \n"
1572 "ulw %[tmp_t3], 0+1(%[src_rgb1]) \n"
1573 "ulw %[tmp_t4], 4+1(%[src_rgb1]) \n"
1574 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1575 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1576 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1577 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1578 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1579 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1580 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1581 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1582 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
1583 "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
1584 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
1585 "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
1586 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
1587 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
1588 "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
1589 "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
1590 "mult $ac0, %[const5], %[const5] \n"
1591 "mult $ac1, %[const5], %[const5] \n"
1592 "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
1593 "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
1594 "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
1595 "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
1596 "extr_r.w %[tmp_t7], $ac0, 9 \n"
1597 "extr_r.w %[tmp_t8], $ac1, 9 \n"
1598 "addiu %[src_rgb0], %[src_rgb0], 8 \n"
1599 "addiu %[src_rgb1], %[src_rgb1], 8 \n"
1600 "addiu %[dst_u], %[dst_u], 1 \n"
1601 "addiu %[dst_v], %[dst_v], 1 \n"
1602 "sb %[tmp_t7], -1(%[dst_u]) \n"
1603 "sb %[tmp_t8], -1(%[dst_v]) \n"
1604 ".set pop \n"
1605 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1606 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1607 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1608 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1609 [src_rgb0] "+r" (src_rgb0), [src_rgb1] "+r" (src_rgb1),
1610 [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v)
1611 :[const1] "r" (const1), [const2] "r" (const2),
1612 [const3] "r" (const3), [const4] "r" (const4),
1613 [const5] "r" (const5)
1614 :"hi", "lo" ,"$ac1lo", "$ac1hi"
1615 );
1616 }
1617 }
1618
1619 void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1620 int x;
1621 int const1 = 0x00420081;
1622 int const2 = 0x00190000;
1623 int const5 = 0x40;
1624 for (x = 0; x < width; x+=4) {
1625 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1626 int tmp_t6, tmp_t7, tmp_t8;
1627 __asm__ __volatile__ (
1628 ".set push \n"
1629 ".set noreorder \n"
1630 "lw %[tmp_t1], 0(%[src_argb0]) \n"
1631 "lw %[tmp_t2], 4(%[src_argb0]) \n"
1632 "lw %[tmp_t3], 8(%[src_argb0]) \n"
1633 "lw %[tmp_t4], 12(%[src_argb0]) \n"
1634 "preceu.ph.qbl %[tmp_t5], %[tmp_t1] \n"
1635 "preceu.ph.qbr %[tmp_t1], %[tmp_t1] \n"
1636 "preceu.ph.qbl %[tmp_t6], %[tmp_t2] \n"
1637 "preceu.ph.qbr %[tmp_t2], %[tmp_t2] \n"
1638 "preceu.ph.qbl %[tmp_t7], %[tmp_t3] \n"
1639 "preceu.ph.qbr %[tmp_t3], %[tmp_t3] \n"
1640 "preceu.ph.qbl %[tmp_t8], %[tmp_t4] \n"
1641 "preceu.ph.qbr %[tmp_t4], %[tmp_t4] \n"
1642 "mult $ac0, %[const5], %[const5] \n"
1643 "mult $ac1, %[const5], %[const5] \n"
1644 "mult $ac2, %[const5], %[const5] \n"
1645 "mult $ac3, %[const5], %[const5] \n"
1646 "dpa.w.ph $ac0, %[tmp_t5], %[const1] \n"
1647 "dpa.w.ph $ac1, %[tmp_t6], %[const1] \n"
1648 "dpa.w.ph $ac2, %[tmp_t7], %[const1] \n"
1649 "dpa.w.ph $ac3, %[tmp_t8], %[const1] \n"
1650 "dpa.w.ph $ac0, %[tmp_t1], %[const2] \n"
1651 "dpa.w.ph $ac1, %[tmp_t2], %[const2] \n"
1652 "dpa.w.ph $ac2, %[tmp_t3], %[const2] \n"
1653 "dpa.w.ph $ac3, %[tmp_t4], %[const2] \n"
1654 "extr_r.w %[tmp_t1], $ac0, 8 \n"
1655 "extr_r.w %[tmp_t2], $ac1, 8 \n"
1656 "extr_r.w %[tmp_t3], $ac2, 8 \n"
1657 "extr_r.w %[tmp_t4], $ac3, 8 \n"
1658 "addiu %[dst_y], %[dst_y], 4 \n"
1659 "addiu %[src_argb0],%[src_argb0], 16 \n"
1660 "sb %[tmp_t1], -4(%[dst_y]) \n"
1661 "sb %[tmp_t2], -3(%[dst_y]) \n"
1662 "sb %[tmp_t3], -2(%[dst_y]) \n"
1663 "sb %[tmp_t4], -1(%[dst_y]) \n"
1664 ".set pop \n"
1665 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1666 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1667 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1668 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1669 [src_argb0] "+r" (src_argb0), [dst_y] "+r" (dst_y)
1670 :[const1] "r" (const1), [const2] "r" (const2),
1671 [const5] "r" (const5)
1672 :"hi", "lo" , "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi",
1673 "$ac3lo", "$ac3hi"
1674 );
1675 }
1676 }
1677
1678 void ARGBToUVRow_DSPR2(const uint8* src_rgb0, int src_stride_rgb,
1679 uint8* dst_u, uint8* dst_v, int width) {
1680 const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1681 int x;
1682 int const1 = 0xffb60070;
1683 int const2 = 0x0000ffda;
1684 int const3 = 0xffa2ffee;
1685 int const4 = 0x00000070;
1686 int const5 = 0x100;
1687
1688 for (x = 0; x < width - 1; x += 2) {
1689 int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1690 int tmp_t6, tmp_t7, tmp_t8;
1691 __asm__ __volatile__ (
1692 ".set push \n"
1693 ".set noreorder \n"
1694 "lw %[tmp_t1], 0(%[src_rgb0]) \n"
1695 "lw %[tmp_t2], 4(%[src_rgb0]) \n"
1696 "lw %[tmp_t3], 0(%[src_rgb1]) \n"
1697 "lw %[tmp_t4], 4(%[src_rgb1]) \n"
1698 "preceu.ph.qbr %[tmp_t5], %[tmp_t1] \n"
1699 "preceu.ph.qbl %[tmp_t1], %[tmp_t1] \n"
1700 "preceu.ph.qbr %[tmp_t6], %[tmp_t2] \n"
1701 "preceu.ph.qbl %[tmp_t2], %[tmp_t2] \n"
1702 "preceu.ph.qbr %[tmp_t7], %[tmp_t3] \n"
1703 "preceu.ph.qbl %[tmp_t3], %[tmp_t3] \n"
1704 "preceu.ph.qbr %[tmp_t8], %[tmp_t4] \n"
1705 "preceu.ph.qbl %[tmp_t4], %[tmp_t4] \n"
1706 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t6] \n"
1707 "addu.ph %[tmp_t7], %[tmp_t7], %[tmp_t8] \n"
1708 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t2] \n"
1709 "addu.ph %[tmp_t3], %[tmp_t3], %[tmp_t4] \n"
1710 "addu.ph %[tmp_t5], %[tmp_t5], %[tmp_t7] \n"
1711 "addu.ph %[tmp_t1], %[tmp_t1], %[tmp_t3] \n"
1712 "shrl.ph %[tmp_t5], %[tmp_t5], 2 \n"
1713 "shrl.ph %[tmp_t1], %[tmp_t1], 2 \n"
1714 "mult $ac0, %[const5], %[const5] \n"
1715 "mult $ac1, %[const5], %[const5] \n"
1716 "dpaq_s.w.ph $ac0, %[tmp_t5], %[const1] \n"
1717 "dpaq_s.w.ph $ac1, %[tmp_t5], %[const3] \n"
1718 "dpaq_s.w.ph $ac0, %[tmp_t1], %[const2] \n"
1719 "dpaq_s.w.ph $ac1, %[tmp_t1], %[const4] \n"
1720 "extr_r.w %[tmp_t7], $ac0, 9 \n"
1721 "extr_r.w %[tmp_t8], $ac1, 9 \n"
1722 "addiu %[src_rgb0], %[src_rgb0], 8 \n"
1723 "addiu %[src_rgb1], %[src_rgb1], 8 \n"
1724 "addiu %[dst_u], %[dst_u], 1 \n"
1725 "addiu %[dst_v], %[dst_v], 1 \n"
1726 "sb %[tmp_t7], -1(%[dst_u]) \n"
1727 "sb %[tmp_t8], -1(%[dst_v]) \n"
1728 ".set pop \n"
1729 :[tmp_t1] "=&r" (tmp_t1), [tmp_t2] "=&r" (tmp_t2),
1730 [tmp_t3] "=&r" (tmp_t3), [tmp_t4] "=&r" (tmp_t4),
1731 [tmp_t5] "=&r" (tmp_t5), [tmp_t6] "=&r" (tmp_t6),
1732 [tmp_t7] "=&r" (tmp_t7), [tmp_t8] "=&r" (tmp_t8),
1733 [src_rgb0] "+r" (src_rgb0), [src_rgb1] "+r" (src_rgb1),
1734 [dst_u] "+r" (dst_u), [dst_v] "+r" (dst_v)
1735 :[const1] "r" (const1), [const2] "r" (const2),
1736 [const3] "r" (const3), [const4] "r" (const4),
1737 [const5] "r" (const5)
1738 :"hi", "lo" ,"$ac1lo", "$ac1hi"
1739 );
1740 }
1741 }
1742
1743 #endif  // __mips_dsp_rev >= 2
1744
1745 #endif  // defined(__mips__)
1746
1747 #ifdef __cplusplus
1748 }  // extern "C"
1749 }  // namespace libyuv
1750 #endif