OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <algorithm> | |
6 #include "skia/ext/convolver.h" | |
7 #include "skia/ext/convolver_mips_dspr2.h" | |
8 #include "third_party/skia/include/core/SkTypes.h" | |
9 | |
10 namespace skia { | |
11 // Convolves horizontally along a single row. The row data is given in | |
12 // |src_data| and continues for the num_values() of the filter. | |
13 template<bool has_alpha> | |
14 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data, | |
15 const ConvolutionFilter1D& filter, | |
16 unsigned char* out_row) { | |
17 #if SIMD_MIPS_DSPR2 | |
18 int row_to_filter = 0; | |
19 int num_values = filter.num_values(); | |
20 if (has_alpha) { | |
Justin Novosad
2013/05/23 15:32:03
Since this branch is outside of the loop, we're no
Teodora Novkovic
2013/05/24 16:02:20
Done.
| |
21 for (int out_x = 0; out_x < num_values; out_x++) { | |
22 // Get the filter that determines the current output pixel. | |
23 int filter_offset, filter_length; | |
24 const ConvolutionFilter1D::Fixed* filter_values = | |
25 filter.FilterForValue(out_x, &filter_offset, &filter_length); | |
26 int filter_x = 0; | |
27 | |
28 __asm__ __volatile__ ( | |
29 ".set push \n" | |
30 ".set noreorder \n" | |
31 | |
32 "beqz %[filter_len], 3f \n" | |
33 " sll $t0, %[filter_offset], 2 \n" | |
34 "addu %[rtf], %[src_data], $t0 \n" | |
35 "mtlo $0, $ac0 \n" | |
36 "mtlo $0, $ac1 \n" | |
37 "mtlo $0, $ac2 \n" | |
38 "mtlo $0, $ac3 \n" | |
39 "srl $t7, %[filter_len], 2 \n" | |
40 "beqz $t7, 2f \n" | |
41 " li %[fx], 0 \n" | |
42 | |
43 "11: \n" | |
44 "addu $t4, %[filter_val], %[fx] \n" | |
45 "sll $t5, %[fx], 1 \n" | |
46 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| | |
47 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| | |
48 "addu $t0, %[rtf], $t5 \n" | |
49 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| | |
50 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| | |
51 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| | |
52 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| | |
53 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| | |
54 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| | |
55 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a1|0|a0| | |
56 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| | |
57 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| | |
58 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| | |
59 "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cur*a1)+(cur*a0) | |
60 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) | |
61 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) | |
62 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) | |
63 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| | |
64 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| | |
65 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a3|0|a2| | |
66 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| | |
67 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| | |
68 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| | |
69 "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cur*a3)+(cur*a2) | |
70 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) | |
71 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) | |
72 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) | |
73 "addiu $t7, $t7, -1 \n" | |
74 "bgtz $t7, 11b \n" | |
75 " addiu %[fx], %[fx], 8 \n" | |
76 | |
77 "2: \n" | |
78 "andi $t7, %[filter_len], 0x3 \n" // residual | |
79 "beqz $t7, 3f \n" | |
80 " nop \n" | |
81 | |
82 "21: \n" | |
83 "sll $t1, %[fx], 1 \n" | |
84 "addu $t2, %[filter_val], %[fx] \n" | |
85 "addu $t0, %[rtf], $t1 \n" | |
86 "lh $t6, 0($t2) \n" // t6 = filter_val[fx] | |
87 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] | |
88 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] | |
89 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] | |
90 "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 2] | |
91 "maddu $ac3, $t6, $t1 \n" | |
92 "maddu $ac2, $t6, $t2 \n" | |
93 "maddu $ac1, $t6, $t3 \n" | |
94 "maddu $ac0, $t6, $t4 \n" | |
95 "addiu $t7, $t7, -1 \n" | |
96 "bgtz $t7, 21b \n" | |
97 " addiu %[fx], %[fx], 2 \n" | |
98 | |
99 "3: \n" | |
100 "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits | |
101 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits | |
102 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits | |
103 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits | |
104 "sll $t5, %[out_x], 2 \n" | |
105 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | | |
106 "addu $t5, %[out_row], $t5 \n" | |
107 "append $t2, $t3, 16 \n" | |
108 "append $t0, $t1, 16 \n" | |
109 "subu.ph $t1, $t0, $t6 \n" | |
110 "shll_s.ph $t1, $t1, 8 \n" | |
111 "shra.ph $t1, $t1, 8 \n" | |
112 "addu.ph $t1, $t1, $t6 \n" | |
113 "subu.ph $t3, $t2, $t6 \n" | |
114 "shll_s.ph $t3, $t3, 8 \n" | |
115 "shra.ph $t3, $t3, 8 \n" | |
116 "addu.ph $t3, $t3, $t6 \n" | |
117 "precr.qb.ph $t0, $t1, $t3 \n" | |
118 "usw $t0, 0($t5) \n" | |
119 | |
120 ".set pop \n" | |
121 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), | |
122 [rtf] "+r" (row_to_filter) | |
123 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), | |
124 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), | |
125 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) | |
126 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", | |
127 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" | |
128 ); | |
129 } | |
130 } else { | |
131 for (int out_x = 0; out_x < num_values; out_x++) { | |
132 // Get the filter that determines the current output pixel. | |
133 int filter_offset, filter_length; | |
134 const ConvolutionFilter1D::Fixed* filter_values = | |
135 filter.FilterForValue(out_x, &filter_offset, &filter_length); | |
136 int filter_x = 0; | |
137 __asm__ __volatile__ ( | |
138 ".set push \n" | |
139 ".set noreorder \n" | |
140 | |
141 "beqz %[filter_len], 3f \n" | |
142 " sll $t0, %[filter_offset], 2 \n" | |
143 "addu %[rtf], %[src_data], $t0 \n" | |
144 "mtlo $0, $ac1 \n" | |
145 "mtlo $0, $ac2 \n" | |
146 "mtlo $0, $ac3 \n" | |
147 "srl $t7, %[filter_len], 2 \n" | |
148 "beqz $t7, 2f \n" | |
149 " li %[fx], 0 \n" | |
150 | |
151 "11: \n" | |
152 "addu $t4, %[filter_val], %[fx] \n" | |
153 "sll $t5, %[fx], 1 \n" | |
154 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| | |
155 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| | |
156 "addu $t0, %[rtf], $t5 \n" | |
157 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| | |
158 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| | |
159 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| | |
160 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| | |
161 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| | |
162 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| | |
163 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| | |
164 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| | |
165 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| | |
166 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) | |
167 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) | |
168 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) | |
169 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| | |
170 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| | |
171 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| | |
172 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| | |
173 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| | |
174 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) | |
175 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) | |
176 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) | |
177 "addiu $t7, $t7, -1 \n" | |
178 "bgtz $t7, 11b \n" | |
179 " addiu %[fx], %[fx], 8 \n" | |
180 | |
181 "2: \n" | |
182 "andi $t7, %[filter_len], 0x3 \n" // residual | |
183 "beqz $t7, 3f \n" | |
184 " nop \n" | |
185 | |
186 "21: \n" | |
187 "sll $t1, %[fx], 1 \n" | |
188 "addu $t2, %[filter_val], %[fx] \n" | |
189 "addu $t0, %[rtf], $t1 \n" | |
190 "lh $t6, 0($t2) \n" // t6 = filter_val[fx] | |
191 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] | |
192 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] | |
193 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] | |
194 "maddu $ac3, $t6, $t1 \n" | |
195 "maddu $ac2, $t6, $t2 \n" | |
196 "maddu $ac1, $t6, $t3 \n" | |
197 "addiu $t7, $t7, -1 \n" | |
198 "bgtz $t7, 21b \n" | |
199 " addiu %[fx], %[fx], 2 \n" | |
200 | |
201 "3: \n" | |
202 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits | |
203 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits | |
204 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits | |
205 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | | |
206 "sll $t8, %[out_x], 2 \n" | |
207 "addu $t8, %[out_row], $t8 \n" | |
208 "append $t2, $t3, 16 \n" | |
209 "andi $t1, 0xFFFF \n" | |
210 "subu.ph $t5, $t1, $t6 \n" | |
211 "shll_s.ph $t5, $t5, 8 \n" | |
212 "shra.ph $t5, $t5, 8 \n" | |
213 "addu.ph $t5, $t5, $t6 \n" | |
214 "subu.ph $t4, $t2, $t6 \n" | |
215 "shll_s.ph $t4, $t4, 8 \n" | |
216 "shra.ph $t4, $t4, 8 \n" | |
217 "addu.ph $t4, $t4, $t6 \n" | |
218 "precr.qb.ph $t0, $t5, $t4 \n" | |
219 "usw $t0, 0($t8) \n" | |
220 | |
221 ".set pop \n" | |
222 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), | |
223 [rtf] "+r" (row_to_filter) | |
224 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), | |
225 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), | |
226 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) | |
227 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", | |
228 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" | |
229 ); | |
230 } | |
231 } | |
232 #endif | |
233 } | |
234 template<bool has_alpha> | |
235 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val, | |
236 int filter_length, | |
237 unsigned char* const* source_data_rows, | |
238 int pixel_width, | |
239 unsigned char* out_row) { | |
240 #if SIMD_MIPS_DSPR2 | |
241 // We go through each column in the output and do a vertical convolution, | |
242 // generating one output pixel each time. | |
243 int byte_offset; | |
244 int cnt; | |
245 int filter_y; | |
246 if (has_alpha) { | |
Justin Novosad
2013/05/23 15:32:03
Same here.
Teodora Novkovic
2013/05/24 16:02:20
Done.
| |
247 for (int out_x = 0; out_x < pixel_width; out_x++) { | |
248 __asm__ __volatile__ ( | |
249 ".set push \n" | |
250 ".set noreorder \n" | |
251 | |
252 "beqz %[filter_len], 3f \n" | |
253 " sll %[offset], %[out_x], 2 \n" | |
254 "mtlo $0, $ac0 \n" | |
255 "mtlo $0, $ac1 \n" | |
256 "mtlo $0, $ac2 \n" | |
257 "mtlo $0, $ac3 \n" | |
258 "srl %[cnt], %[filter_len], 2 \n" | |
259 "beqz %[cnt], 2f \n" | |
260 " li %[fy], 0 \n" | |
261 | |
262 "11: \n" | |
263 "sll $t1, %[fy], 1 \n" | |
264 "addu $t0, %[src_data_rows], $t1 \n" | |
265 "lw $t1, 0($t0) \n" | |
266 "lw $t2, 4($t0) \n" | |
267 "lw $t3, 8($t0) \n" | |
268 "lw $t4, 12($t0) \n" | |
269 "addu $t1, $t1, %[offset] \n" | |
270 "addu $t2, $t2, %[offset] \n" | |
271 "addu $t3, $t3, %[offset] \n" | |
272 "addu $t4, $t4, %[offset] \n" | |
273 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| | |
274 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| | |
275 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| | |
276 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| | |
277 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| | |
278 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| | |
279 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a1|0|a0| | |
280 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| | |
281 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| | |
282 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| | |
283 "addu $t6, %[filter_val], %[fy] \n" | |
284 "ulw $t7, 0($t6) \n" // t7 = |cur_1|cur_0| | |
285 "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2| | |
286 "dpa.w.ph $ac0, $t5, $t7 \n" // (cur*r1)+(cur*r0) | |
287 "dpa.w.ph $ac1, $t1, $t7 \n" // (cur*g1)+(cur*g0) | |
288 "dpa.w.ph $ac2, $t2, $t7 \n" // (cur*b1)+(cur*b0) | |
289 "dpa.w.ph $ac3, $t0, $t7 \n" // (cur*a1)+(cur*a0) | |
290 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| | |
291 "precr.qb.ph $t7, $t4, $t3 \n" // t7 = |b3|r3|b2|r2| | |
292 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a3|0|a2| | |
293 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| | |
294 "preceu.ph.qbla $t2, $t7 \n" // t2 = |0|b3|0|b2| | |
295 "preceu.ph.qbra $t5, $t7 \n" // t5 = |0|r3|0|r2| | |
296 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2) | |
297 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2) | |
298 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2) | |
299 "dpa.w.ph $ac3, $t0, $t6 \n" // (cur*a3)+(cur*a2) | |
300 "addiu %[cnt], %[cnt], -1 \n" | |
301 "bgtz %[cnt], 11b \n" | |
302 " addiu %[fy], %[fy], 8 \n" | |
303 | |
304 "2: \n" | |
305 "andi %[cnt], %[filter_len], 0x3 \n" // residual | |
306 "beqz %[cnt], 3f \n" | |
307 " nop \n" | |
308 | |
309 "21: \n" | |
310 "addu $t0, %[filter_val], %[fy] \n" | |
311 "lh $t4, 0($t0) \n" // t4=filter_val[fx] | |
312 "sll $t1, %[fy], 1 \n" | |
313 "addu $t0, %[src_data_rows], $t1 \n" | |
314 "lw $t1, 0($t0) \n" | |
315 "addu $t0, $t1, %[offset] \n" | |
316 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] | |
317 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] | |
318 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] | |
319 "lbu $t0, 3($t0) \n" // t4 = row[fx*4 + 2] | |
320 "maddu $ac0, $t4, $t1 \n" | |
321 "maddu $ac1, $t4, $t2 \n" | |
322 "maddu $ac2, $t4, $t3 \n" | |
323 "maddu $ac3, $t4, $t0 \n" | |
324 "addiu %[cnt], %[cnt], -1 \n" | |
325 "bgtz %[cnt], 21b \n" | |
326 " addiu %[fy], %[fy], 2 \n" | |
327 | |
328 "3: \n" | |
329 "extrv.w $t3, $ac0, %[kShiftBits] \n" // a >> kShiftBits | |
330 "extrv.w $t2, $ac1, %[kShiftBits] \n" // b >> kShiftBits | |
331 "extrv.w $t1, $ac2, %[kShiftBits] \n" // g >> kShiftBits | |
332 "extrv.w $t0, $ac3, %[kShiftBits] \n" // r >> kShiftBits | |
333 "repl.ph $t4, 128 \n" // t4 = | 128 | 128 | | |
334 "addu $t5, %[out_row], %[offset] \n" | |
335 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| | |
336 "append $t0, $t1, 16 \n" // t0 = |0|a|0|b| | |
337 "subu.ph $t1, $t0, $t4 \n" | |
338 "shll_s.ph $t1, $t1, 8 \n" | |
339 "shra.ph $t1, $t1, 8 \n" | |
340 "addu.ph $t1, $t1, $t4 \n" // Clamp(a)|Clamp(b) | |
341 "subu.ph $t2, $t2, $t4 \n" | |
342 "shll_s.ph $t2, $t2, 8 \n" | |
343 "shra.ph $t2, $t2, 8 \n" | |
344 "addu.ph $t2, $t2, $t4 \n" // Clamp(g)|Clamp(r) | |
345 "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b) | |
346 "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r | |
347 "pick.ph $t0, $t2, $t3 \n" | |
348 "andi $t3, $t0, 0xFF \n" | |
349 "srl $t4, $t0, 16 \n" | |
350 "cmp.lt.ph $t3, $t4 \n" | |
351 "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch | |
352 "srl $t3, $t1, 16 \n" // t1 = ClampTo8(a) | |
353 "cmp.lt.ph $t3, $t0 \n" | |
354 "pick.ph $t0, $t0, $t3 \n" | |
355 "ins $t1, $t0, 16, 8 \n" | |
356 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| | |
357 "usw $t0, 0($t5) \n" | |
358 | |
359 ".set pop \n" | |
360 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), | |
361 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), | |
362 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) | |
363 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), | |
364 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) | |
365 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", | |
366 "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory" | |
367 ); | |
368 } | |
369 } else { | |
370 for (int out_x = 0; out_x < pixel_width; out_x++) { | |
371 __asm__ __volatile__ ( | |
372 ".set push \n" | |
373 ".set noreorder \n" | |
374 | |
375 "beqz %[filter_len], 3f \n" | |
376 " sll %[offset], %[out_x], 2 \n" | |
377 "mtlo $0, $ac0 \n" | |
378 "mtlo $0, $ac1 \n" | |
379 "mtlo $0, $ac2 \n" | |
380 "srl %[cnt], %[filter_len], 2 \n" | |
381 "beqz %[cnt], 2f \n" | |
382 " li %[fy], 0 \n" | |
383 | |
384 "11: \n" | |
385 "sll $t1, %[fy], 1 \n" | |
386 "addu $t0, %[src_data_rows], $t1 \n" | |
387 "lw $t1, 0($t0) \n" | |
388 "lw $t2, 4($t0) \n" | |
389 "lw $t3, 8($t0) \n" | |
390 "lw $t4, 12($t0) \n" | |
391 "addu $t1, $t1, %[offset] \n" | |
392 "addu $t2, $t2, %[offset] \n" | |
393 "addu $t3, $t3, %[offset] \n" | |
394 "addu $t4, $t4, %[offset] \n" | |
395 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| | |
396 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| | |
397 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| | |
398 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| | |
399 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| | |
400 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| | |
401 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| | |
402 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| | |
403 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| | |
404 "addu $t6, %[filter_val], %[fy] \n" | |
405 "ulw $t0, 0($t6) \n" // t0 = |cur_1|cur_0| | |
406 "ulw $t6, 4($t6) \n" // t6 = |cur_1|cur_0| | |
407 "dpa.w.ph $ac0, $t5, $t0 \n" // (cur*r1)+(cur*r0) | |
408 "dpa.w.ph $ac1, $t1, $t0 \n" // (cur*g1)+(cur*g0) | |
409 "dpa.w.ph $ac2, $t2, $t0 \n" // (cur*b1)+(cur*b0) | |
410 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| | |
411 "precr.qb.ph $t0, $t4, $t3 \n" // t0 = |b3|r3|b2|r2| | |
412 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| | |
413 "preceu.ph.qbla $t2, $t0 \n" // t2 = |0|b3|0|b2| | |
414 "preceu.ph.qbra $t5, $t0 \n" // t5 = |0|r3|0|r2| | |
415 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r1)+(cur*r0) | |
416 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g1)+(cur*g0) | |
417 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b1)+(cur*b0) | |
418 "addiu %[cnt], %[cnt], -1 \n" | |
419 "bgtz %[cnt], 11b \n" | |
420 " addiu %[fy], %[fy], 8 \n" | |
421 | |
422 "2: \n" | |
423 "andi %[cnt], %[filter_len], 0x3 \n" // residual | |
424 "beqz %[cnt], 3f \n" | |
425 " nop \n" | |
426 | |
427 "21: \n" | |
428 "addu $t0, %[filter_val], %[fy] \n" | |
429 "lh $t4, 0($t0) \n" // filter_val[fx] | |
430 "sll $t1, %[fy], 1 \n" | |
431 "addu $t0, %[src_data_rows], $t1 \n" | |
432 "lw $t1, 0($t0) \n" | |
433 "addu $t0, $t1, %[offset] \n" | |
434 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] | |
435 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] | |
436 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] | |
437 "maddu $ac0, $t4, $t1 \n" | |
438 "maddu $ac1, $t4, $t2 \n" | |
439 "maddu $ac2, $t4, $t3 \n" | |
440 "addiu %[cnt], %[cnt], -1 \n" | |
441 "bgtz %[cnt], 21b \n" | |
442 " addiu %[fy], %[fy], 2 \n" | |
443 | |
444 "3: \n" | |
445 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits | |
446 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits | |
447 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits | |
448 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | | |
449 "addu $t5, %[out_row], %[offset] \n" | |
450 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| | |
451 "andi $t1, $t1, 0xFFFF \n" | |
452 "subu.ph $t1, $t1, $t6 \n" | |
453 "shll_s.ph $t1, $t1, 8 \n" | |
454 "shra.ph $t1, $t1, 8 \n" | |
455 "addu.ph $t1, $t1, $t6 \n" // Clamp(a)|Clamp(b) | |
456 "subu.ph $t2, $t2, $t6 \n" | |
457 "shll_s.ph $t2, $t2, 8 \n" | |
458 "shra.ph $t2, $t2, 8 \n" | |
459 "addu.ph $t2, $t2, $t6 \n" // Clamp(g)|Clamp(r) | |
460 "li $t0, 0xFF \n" | |
461 "ins $t1, $t0, 16, 8 \n" | |
462 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| | |
463 "usw $t0, 0($t5) \n" | |
464 | |
465 ".set pop \n" | |
466 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), | |
467 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), | |
468 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) | |
469 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), | |
470 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) | |
471 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", | |
472 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory" | |
473 ); | |
474 } | |
475 } | |
476 #endif | |
477 } | |
478 | |
479 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val, | |
480 int filter_length, | |
481 unsigned char* const* source_data_rows, | |
482 int pixel_width, | |
483 unsigned char* out_row, | |
484 bool has_alpha) { | |
485 if (has_alpha) { | |
Justin Novosad
2013/05/23 15:32:03
This is the conditional branch I was talking about
Teodora Novkovic
2013/05/24 16:02:20
Done.
| |
486 ConvolveVertically_mips_dspr2<true>(filter_val, | |
487 filter_length, | |
488 source_data_rows, | |
489 pixel_width, | |
490 out_row); | |
491 } else { | |
492 ConvolveVertically_mips_dspr2<false>(filter_val, | |
493 filter_length, | |
494 source_data_rows, | |
495 pixel_width, | |
496 out_row); | |
497 } | |
498 } | |
499 | |
500 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data, | |
501 const ConvolutionFilter1D& filter, | |
502 unsigned char* out_row, | |
503 bool has_alpha) { | |
504 if (has_alpha) { | |
505 ConvolveHorizontally_mips_dspr2<true>(src_data, | |
506 filter, | |
507 out_row); | |
508 } else { | |
509 ConvolveHorizontally_mips_dspr2<false>(src_data, | |
510 filter, | |
511 out_row); | |
512 } | |
513 } | |
514 } // namespace skia | |
OLD | NEW |