OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * Copyright 2016 The LibYuv Project Authors. All rights reserved. | |
3 * | |
4 * Use of this source code is governed by a BSD-style license | |
5 * that can be found in the LICENSE file in the root of the source | |
6 * tree. An additional intellectual property rights grant can be found | |
7 * in the file PATENTS. All contributing project authors may | |
8 * be found in the AUTHORS file in the root of the source tree. | |
9 */ | |
10 | |
11 #include "libyuv/scale_row.h" | |
12 | |
13 // This module is for GCC MSA | |
14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | |
15 #include "libyuv/macros_msa.h" | |
16 | |
17 #ifdef __cplusplus | |
18 namespace libyuv { | |
19 extern "C" { | |
20 #endif | |
21 | |
22 void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, | |
fbarchard1
2016/11/30 01:15:48
This function, and Linear, arent really used in pr
manojkumar.bhosale
2016/12/01 13:06:06
Done. The performance is slightly degraded (2.6x t
| |
23 ptrdiff_t src_stride, | |
24 uint8_t* dst_argb, | |
25 int dst_width) { | |
26 int x; | |
27 v16u8 src0, src1, src2, src3, dst0, dst1; | |
28 | |
29 for (x = 0; x < dst_width; x += 8) { | |
30 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); | |
31 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); | |
32 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); | |
33 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); | |
34 dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); | |
35 dst1 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); | |
36 ST_UB2(dst0, dst1, dst_argb, 16); | |
37 src_argb += 64; | |
38 dst_argb += 32; | |
39 } | |
40 } | |
41 | |
42 void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, | |
43 ptrdiff_t src_stride, | |
44 uint8_t* dst_argb, | |
45 int dst_width) { | |
46 int x; | |
47 v16u8 src0, src1, vec0, vec1, dst0; | |
48 | |
49 for (x = 0; x < dst_width; x += 4) { | |
50 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); | |
51 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); | |
52 vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); | |
53 vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); | |
54 dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); | |
55 ST_UB(dst0, dst_argb); | |
56 src_argb += 32; | |
57 dst_argb += 16; | |
58 } | |
59 } | |
60 | |
61 void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, | |
62 ptrdiff_t src_stride, | |
63 uint8_t* dst_argb, | |
64 int dst_width) { | |
65 int x; | |
66 const uint8_t* nxt_argb = src_argb + src_stride; | |
fbarchard1
2016/11/30 01:15:48
nit re nxt_argb
I havent used this variable name
manojkumar.bhosale
2016/12/01 13:06:06
Done. Also, we avoided using stride in addressing
| |
67 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; | |
68 v8u16 reg0, reg1, reg2, reg3; | |
69 | |
70 for (x = 0; x < dst_width; x += 4) { | |
71 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); | |
fbarchard1
2016/11/30 01:15:48
This is pretty large/slow?
This is an important fu
manojkumar.bhosale
2016/12/01 13:06:06
Modified this function to replace interleave & pac
| |
72 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); | |
73 src2 = (v16u8)__msa_ld_b((v16i8*)nxt_argb, 0); | |
74 src3 = (v16u8)__msa_ld_b((v16i8*)nxt_argb, 16); | |
75 vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); | |
76 vec1 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); | |
77 vec2 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); | |
78 vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); | |
79 reg0 = (v8u16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); | |
80 reg1 = (v8u16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); | |
81 reg2 = (v8u16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); | |
82 reg3 = (v8u16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); | |
83 reg0 = (v8u16)__msa_hadd_u_h((v16u8)reg0, (v16u8)reg0); | |
84 reg1 = (v8u16)__msa_hadd_u_h((v16u8)reg1, (v16u8)reg1); | |
85 reg2 = (v8u16)__msa_hadd_u_h((v16u8)reg2, (v16u8)reg2); | |
86 reg3 = (v8u16)__msa_hadd_u_h((v16u8)reg3, (v16u8)reg3); | |
87 reg0 += reg2; | |
88 reg1 += reg3; | |
89 reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); | |
90 reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); | |
fbarchard1
2016/11/30 01:15:48
Does __msa_srari_h() do rounding?
Can you refer me
manojkumar.bhosale
2016/12/01 13:06:06
Please refer to the MSA instruction set at,
https:
fbarchard1
2016/12/06 00:45:19
Acknowledged.
srari is Immediate Shift Right Arith
| |
91 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); | |
92 ST_UB(dst0, dst_argb); | |
93 src_argb += 32; | |
94 nxt_argb += 32; | |
95 dst_argb += 16; | |
96 } | |
97 } | |
98 | |
99 void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, | |
100 ptrdiff_t src_stride, | |
101 int32_t src_stepx, | |
102 uint8_t* dst_argb, | |
103 int dst_width) { | |
104 int x; | |
105 int32_t stepx = src_stepx * 4; | |
106 int32_t data0, data1, data2, data3; | |
107 | |
108 for (x = 0; x < dst_width; x += 4) { | |
109 data0 = LW(src_argb); | |
110 data1 = LW(src_argb + stepx); | |
111 data2 = LW(src_argb + stepx * 2); | |
112 data3 = LW(src_argb + stepx * 3); | |
113 SW(data0, dst_argb); | |
114 SW(data1, dst_argb + 4); | |
115 SW(data2, dst_argb + 8); | |
116 SW(data3, dst_argb + 12); | |
117 src_argb += stepx * 4; | |
118 dst_argb += 16; | |
119 } | |
120 } | |
121 | |
122 void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, | |
123 ptrdiff_t src_stride, | |
124 int src_stepx, | |
125 uint8* dst_argb, | |
126 int dst_width) { | |
127 int x; | |
128 const uint8* nxt_argb = src_argb + src_stride; | |
129 int32_t stepx = src_stepx * 4; | |
130 int64_t data0, data1, data2, data3; | |
131 v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; | |
132 v16u8 vec0, vec1, vec2, vec3; | |
133 v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; | |
134 v16u8 dst0; | |
135 | |
136 for (x = 0; x < dst_width; x += 4) { | |
137 data0 = LD(src_argb); | |
138 data1 = LD(src_argb + stepx); | |
139 data2 = LD(src_argb + stepx * 2); | |
140 data3 = LD(src_argb + stepx * 3); | |
141 src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); | |
142 src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); | |
143 src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); | |
144 src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); | |
145 data0 = LD(nxt_argb); | |
146 data1 = LD(nxt_argb + stepx); | |
147 data2 = LD(nxt_argb + stepx * 2); | |
148 data3 = LD(nxt_argb + stepx * 3); | |
149 src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); | |
150 src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); | |
151 src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); | |
152 src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); | |
153 vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); | |
154 vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); | |
155 vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); | |
156 vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); | |
157 reg0 = __msa_hadd_u_h(vec0, vec0); | |
158 reg1 = __msa_hadd_u_h(vec1, vec1); | |
159 reg2 = __msa_hadd_u_h(vec2, vec2); | |
160 reg3 = __msa_hadd_u_h(vec3, vec3); | |
161 reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); | |
162 reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); | |
163 reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); | |
164 reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); | |
165 reg4 += reg6; | |
166 reg5 += reg7; | |
167 reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); | |
168 reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); | |
169 dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); | |
170 ST_UB(dst0, dst_argb); | |
171 src_argb += stepx * 4; | |
172 nxt_argb += stepx * 4; | |
173 dst_argb += 16; | |
174 } | |
175 } | |
176 | |
177 #ifdef __cplusplus | |
178 } // extern "C" | |
179 } // namespace libyuv | |
180 #endif | |
181 | |
182 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) | |
OLD | NEW |