Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(811)

Side by Side Diff: skia/ext/convolver_mips_dspr2.cc

Issue 14929006: [MIPS] Added MIPS DSPr2 optimization for BGRAConvolve2D routine (Closed)
Patch Set: Add a non-templated parameter "has_alpha" in ConvolveHorizontally_SSE2 routine Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « skia/ext/convolver_mips_dspr2.h ('k') | skia/skia.gyp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #include <algorithm>
Stephen White 2013/05/17 22:14:40 This file needs a license header.
2 #include "skia/ext/convolver.h"
3 #include "skia/ext/convolver_mips_dspr2.h"
4 #include "third_party/skia/include/core/SkTypes.h"
5
6 namespace skia {
7 // Convolves horizontally along a single row. The row data is given in
8 // |src_data| and continues for the num_values() of the filter.
//
// For every out_x in [0, filter.num_values()) the filter returned by
// FilterForValue(out_x, ...) is applied to the 4-byte pixels that start at
// src_data + filter_offset * 4. Each channel sum is shifted right by
// kShiftBits, clamped to [0, 255] and the packed pixel is stored at
// out_row + out_x * 4.
//
// has_alpha == true  : all four bytes of a pixel are filtered.
// has_alpha == false : only bytes 0..2 are filtered; byte 3 of every output
//                      pixel is written as 0.
//
// The body is compiled only when SIMD_MIPS_DSPR2 is set; otherwise the
// function does nothing. Channel names (r/g/b/a) in the register comments
// simply label bytes 0/1/2/3 of a pixel as loaded by lw.
9 template<bool has_alpha>
10 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,
11 const ConvolutionFilter1D& filter,
12 unsigned char* out_row) {
13 #if SIMD_MIPS_DSPR2
14 int row_to_filter = 0;
15 int num_values = filter.num_values();
16 if(has_alpha) {
17 for (int out_x = 0; out_x < num_values; out_x++) {
18 // Get the filter that determines the current output pixel.
19 int filter_offset, filter_length;
20 const ConvolutionFilter1D::Fixed* filter_values =
21 filter.FilterForValue(out_x, &filter_offset, &filter_length);
22 int filter_x = 0;
23
24 __asm__ __volatile__ (
25 ".set push \n"
26 ".set noreorder \n"
27
28 "mtlo $0, $ac0 \n" // Clear the accumulators before the
29 "mtlo $0, $ac1 \n" // empty-filter branch so that a zero-length
30 "mtlo $0, $ac2 \n" // filter stores zeros rather than the sums
31 "mtlo $0, $ac3 \n" // left over from the previous pixel.
32 "beqz %[filter_len], 3f \n"
33 " sll $t0, %[filter_offset], 2 \n" // delay slot: offset in bytes
34 "addu %[rtf], %[src_data], $t0 \n" // rtf = first pixel under the filter
35 "srl $t7, %[filter_len], 2 \n" // t7 = number of 4-tap groups
36 "beqz $t7, 2f \n"
37 " li %[fx], 0 \n" // delay slot: fx = byte offset into filter_val
38
39 "11: \n" // main loop: four taps per iteration
40 "addu $t4, %[filter_val], %[fx] \n"
41 "sll $t5, %[fx], 1 \n" // pixel byte offset = 2 * coefficient byte offset
42 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]|
43 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]|
44 "addu $t0, %[rtf], $t5 \n"
45 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0|
46 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1|
47 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2|
48 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3|
49 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0|
50 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0|
51 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a1|0|a0|
52 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0|
53 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0|
54 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0|
55 "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cur*a1)+(cur*a0)
56 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0)
57 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0)
58 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0)
59 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2|
60 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2|
61 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a3|0|a2|
62 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2|
63 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2|
64 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2|
65 "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cur*a3)+(cur*a2)
66 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2)
67 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2)
68 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2)
69 "addiu $t7, $t7, -1 \n"
70 "bgtz $t7, 11b \n"
71 " addiu %[fx], %[fx], 8 \n" // delay slot: advance by four coefficients
72
73 "2: \n"
74 "andi $t7, %[filter_len], 0x3 \n" // residual
75 "beqz $t7, 3f \n"
76 " nop \n"
77
78 "21: \n" // residual loop: one tap per iteration
79 "sll $t1, %[fx], 1 \n"
80 "addu $t2, %[filter_val], %[fx] \n"
81 "addu $t0, %[rtf], $t1 \n"
82 "lh $t6, 0($t2) \n" // t6 = filter_val[fx]
83 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0]
84 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1]
85 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2]
86 "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 3]
87 "maddu $ac3, $t6, $t1 \n"
88 "maddu $ac2, $t6, $t2 \n"
89 "maddu $ac1, $t6, $t3 \n"
90 "maddu $ac0, $t6, $t4 \n"
91 "addiu $t7, $t7, -1 \n"
92 "bgtz $t7, 21b \n"
93 " addiu %[fx], %[fx], 2 \n" // delay slot: advance by one coefficient
94
95 "3: \n"
96 "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits
97 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits
98 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits
99 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits
100 "sll $t5, %[out_x], 2 \n"
101 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 |
102 "addu $t5, %[out_row], $t5 \n" // t5 = &out_row[out_x * 4]
103 "append $t2, $t3, 16 \n" // t2 = |g|r| (16 bits each)
104 "append $t0, $t1, 16 \n" // t0 = |a|b| (16 bits each)
105 "subu.ph $t1, $t0, $t6 \n" // Clamp to [0, 255]: bias by -128,
106 "shll_s.ph $t1, $t1, 8 \n" // saturating shift left by 8,
107 "shra.ph $t1, $t1, 8 \n" // arithmetic shift back,
108 "addu.ph $t1, $t1, $t6 \n" // remove the bias. t1 = |Clamp(a)|Clamp(b)|
109 "subu.ph $t3, $t2, $t6 \n"
110 "shll_s.ph $t3, $t3, 8 \n"
111 "shra.ph $t3, $t3, 8 \n"
112 "addu.ph $t3, $t3, $t6 \n" // t3 = |Clamp(g)|Clamp(r)|
113 "precr.qb.ph $t0, $t1, $t3 \n" // t0 = |a|b|g|r|
114 "usw $t0, 0($t5) \n"
115
116 ".set pop \n"
117 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
118 [rtf] "+r" (row_to_filter)
119 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
120 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
121 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
122 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
123 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "memory"
124 );
125 }
126 } else {
127 for (int out_x = 0; out_x < num_values; out_x++) {
128 // Get the filter that determines the current output pixel.
129 int filter_offset, filter_length;
130 const ConvolutionFilter1D::Fixed* filter_values =
131 filter.FilterForValue(out_x, &filter_offset, &filter_length);
132 int filter_x = 0;
133 __asm__ __volatile__ (
134 ".set push \n"
135 ".set noreorder \n"
136
137 "mtlo $0, $ac1 \n" // Clear the accumulators before the
138 "mtlo $0, $ac2 \n" // empty-filter branch (see the has_alpha
139 "mtlo $0, $ac3 \n" // path above).
140 "beqz %[filter_len], 3f \n"
141 " sll $t0, %[filter_offset], 2 \n" // delay slot: offset in bytes
142 "addu %[rtf], %[src_data], $t0 \n" // rtf = first pixel under the filter
143 "srl $t7, %[filter_len], 2 \n" // t7 = number of 4-tap groups
144 "beqz $t7, 2f \n"
145 " li %[fx], 0 \n" // delay slot: fx = byte offset into filter_val
146
147 "11: \n" // main loop: four taps per iteration
148 "addu $t4, %[filter_val], %[fx] \n"
149 "sll $t5, %[fx], 1 \n" // pixel byte offset = 2 * coefficient byte offset
150 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]|
151 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]|
152 "addu $t0, %[rtf], $t5 \n"
153 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0|
154 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1|
155 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2|
156 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3|
157 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0|
158 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0|
159 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0|
160 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0|
161 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0|
162 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0)
163 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0)
164 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0)
165 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2|
166 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2|
167 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2|
168 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2|
169 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2|
170 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2)
171 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2)
172 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2)
173 "addiu $t7, $t7, -1 \n"
174 "bgtz $t7, 11b \n"
175 " addiu %[fx], %[fx], 8 \n" // delay slot: advance by four coefficients
176
177 "2: \n"
178 "andi $t7, %[filter_len], 0x3 \n" // residual
179 "beqz $t7, 3f \n"
180 " nop \n"
181
182 "21: \n" // residual loop: one tap per iteration
183 "sll $t1, %[fx], 1 \n"
184 "addu $t2, %[filter_val], %[fx] \n"
185 "addu $t0, %[rtf], $t1 \n"
186 "lh $t6, 0($t2) \n" // t6 = filter_val[fx]
187 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0]
188 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1]
189 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2]
190 "maddu $ac3, $t6, $t1 \n"
191 "maddu $ac2, $t6, $t2 \n"
192 "maddu $ac1, $t6, $t3 \n"
193 "addiu $t7, $t7, -1 \n"
194 "bgtz $t7, 21b \n"
195 " addiu %[fx], %[fx], 2 \n" // delay slot: advance by one coefficient
196
197 "3: \n"
198 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits
199 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits
200 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits
201 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 |
202 "sll $t8, %[out_x], 2 \n"
203 "addu $t8, %[out_row], $t8 \n" // t8 = &out_row[out_x * 4]
204 "append $t2, $t3, 16 \n" // t2 = |g|r| (16 bits each)
205 "andi $t1, 0xFFFF \n" // t1 = |0|b|
206 "subu.ph $t5, $t1, $t6 \n" // Clamp to [0, 255]: bias by -128,
207 "shll_s.ph $t5, $t5, 8 \n" // saturating shift left by 8,
208 "shra.ph $t5, $t5, 8 \n" // arithmetic shift back,
209 "addu.ph $t5, $t5, $t6 \n" // remove the bias. t5 = |0|Clamp(b)|
210 "subu.ph $t4, $t2, $t6 \n"
211 "shll_s.ph $t4, $t4, 8 \n"
212 "shra.ph $t4, $t4, 8 \n"
213 "addu.ph $t4, $t4, $t6 \n" // t4 = |Clamp(g)|Clamp(r)|
214 "precr.qb.ph $t0, $t5, $t4 \n" // t0 = |0|b|g|r|
215 "usw $t0, 0($t8) \n"
216
217 ".set pop \n"
218 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
219 [rtf] "+r" (row_to_filter)
220 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
221 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
222 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
223 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
224 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "memory"
225 );
226 }
227 }
228 #endif
229 }
// Convolves vertically: for every out_x in [0, pixel_width) the 4-byte pixel
// at byte offset out_x * 4 of each of the |filter_length| rows in
// |source_data_rows| is weighted by the matching entry of |filter_val|. Each
// channel sum is shifted right by kShiftBits, clamped to [0, 255] and the
// packed pixel is stored at out_row + out_x * 4.
//
// has_alpha == true  : byte 3 is filtered as well and then raised, if needed,
//                      to the largest of the three clamped colour bytes.
// has_alpha == false : byte 3 of every output pixel is written as 0xFF.
//
// Row pointers are fetched with 32-bit loads at a stride of 4 bytes, so the
// code relies on 4-byte pointers. The body is compiled only when
// SIMD_MIPS_DSPR2 is set; otherwise the function does nothing. Channel names
// (r/g/b/a) in the register comments simply label bytes 0/1/2/3 of a pixel.
230 template<bool has_alpha>
231 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,
232 int filter_length,
233 unsigned char* const* source_data_rows,
234 int pixel_width,
235 unsigned char* out_row) {
236 #if SIMD_MIPS_DSPR2
237 // We go through each column in the output and do a vertical convolution,
238 // generating one output pixel each time.
// The three scratch variables below are "+r" (read-write) asm operands, so
// they are given a defined value before the first asm statement reads them.
239 int byte_offset = 0;
240 int cnt = 0;
241 int filter_y = 0;
242 if(has_alpha) {
243 for (int out_x = 0; out_x < pixel_width; out_x++) {
244 __asm__ __volatile__ (
245 ".set push \n"
246 ".set noreorder \n"
247
248 "mtlo $0, $ac0 \n" // Clear the accumulators before the
249 "mtlo $0, $ac1 \n" // empty-filter branch so that a zero-length
250 "mtlo $0, $ac2 \n" // filter stores zeros rather than the sums
251 "mtlo $0, $ac3 \n" // left over from the previous pixel.
252 "beqz %[filter_len], 3f \n"
253 " sll %[offset], %[out_x], 2 \n" // delay slot: byte offset of the column
254 "srl %[cnt], %[filter_len], 2 \n" // cnt = number of 4-tap groups
255 "beqz %[cnt], 2f \n"
256 " li %[fy], 0 \n" // delay slot: fy = byte offset into filter_val
257
258 "11: \n" // main loop: four rows per iteration
259 "sll $t1, %[fy], 1 \n" // row pointer offset = 2 * coefficient offset
260 "addu $t0, %[src_data_rows], $t1 \n"
261 "lw $t1, 0($t0) \n" // t1..t4 = four consecutive row pointers
262 "lw $t2, 4($t0) \n"
263 "lw $t3, 8($t0) \n"
264 "lw $t4, 12($t0) \n"
265 "addu $t1, $t1, %[offset] \n"
266 "addu $t2, $t2, %[offset] \n"
267 "addu $t3, $t3, %[offset] \n"
268 "addu $t4, $t4, %[offset] \n"
269 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0|
270 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1|
271 "lw $t3, 0($t3) \n" // t3 = |a2|b2|g2|r2|
272 "lw $t4, 0($t4) \n" // t4 = |a3|b3|g3|r3|
273 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0|
274 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0|
275 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a1|0|a0|
276 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0|
277 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0|
278 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0|
279 "addu $t6, %[filter_val], %[fy] \n"
280 "ulw $t7, 0($t6) \n" // t7 = |cur_1|cur_0|
281 "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2|
282 "dpa.w.ph $ac0, $t5, $t7 \n" // (cur*r1)+(cur*r0)
283 "dpa.w.ph $ac1, $t1, $t7 \n" // (cur*g1)+(cur*g0)
284 "dpa.w.ph $ac2, $t2, $t7 \n" // (cur*b1)+(cur*b0)
285 "dpa.w.ph $ac3, $t0, $t7 \n" // (cur*a1)+(cur*a0)
286 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2|
287 "precr.qb.ph $t7, $t4, $t3 \n" // t7 = |b3|r3|b2|r2|
288 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a3|0|a2|
289 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2|
290 "preceu.ph.qbla $t2, $t7 \n" // t2 = |0|b3|0|b2|
291 "preceu.ph.qbra $t5, $t7 \n" // t5 = |0|r3|0|r2|
292 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2)
293 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2)
294 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2)
295 "dpa.w.ph $ac3, $t0, $t6 \n" // (cur*a3)+(cur*a2)
296 "addiu %[cnt], %[cnt], -1 \n"
297 "bgtz %[cnt], 11b \n"
298 " addiu %[fy], %[fy], 8 \n" // delay slot: advance by four coefficients
299
300 "2: \n"
301 "andi %[cnt], %[filter_len], 0x3 \n" // residual
302 "beqz %[cnt], 3f \n"
303 " nop \n"
304
305 "21: \n" // residual loop: one row per iteration
306 "addu $t0, %[filter_val], %[fy] \n"
307 "lh $t4, 0($t0) \n" // t4 = current filter coefficient
308 "sll $t1, %[fy], 1 \n"
309 "addu $t0, %[src_data_rows], $t1 \n"
310 "lw $t1, 0($t0) \n" // t1 = current row pointer
311 "addu $t0, $t1, %[offset] \n"
312 "lbu $t1, 0($t0) \n" // t1 = byte 0 of the pixel
313 "lbu $t2, 1($t0) \n" // t2 = byte 1 of the pixel
314 "lbu $t3, 2($t0) \n" // t3 = byte 2 of the pixel
315 "lbu $t0, 3($t0) \n" // t0 = byte 3 of the pixel
316 "maddu $ac0, $t4, $t1 \n"
317 "maddu $ac1, $t4, $t2 \n"
318 "maddu $ac2, $t4, $t3 \n"
319 "maddu $ac3, $t4, $t0 \n"
320 "addiu %[cnt], %[cnt], -1 \n"
321 "bgtz %[cnt], 21b \n"
322 " addiu %[fy], %[fy], 2 \n" // delay slot: advance by one coefficient
323
324 "3: \n"
325 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits
326 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits
327 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits
328 "extrv.w $t0, $ac3, %[kShiftBits] \n" // a >> kShiftBits
329 "repl.ph $t4, 128 \n" // t4 = | 128 | 128 |
330 "addu $t5, %[out_row], %[offset] \n" // t5 = &out_row[out_x * 4]
331 "append $t2, $t3, 16 \n" // t2 = |g|r| (16 bits each)
332 "append $t0, $t1, 16 \n" // t0 = |a|b| (16 bits each)
333 "subu.ph $t1, $t0, $t4 \n" // Clamp to [0, 255]: bias by -128,
334 "shll_s.ph $t1, $t1, 8 \n" // saturating shift left by 8,
335 "shra.ph $t1, $t1, 8 \n" // arithmetic shift back,
336 "addu.ph $t1, $t1, $t4 \n" // Clamp(a)|Clamp(b)
337 "subu.ph $t2, $t2, $t4 \n"
338 "shll_s.ph $t2, $t2, 8 \n"
339 "shra.ph $t2, $t2, 8 \n"
340 "addu.ph $t2, $t2, $t4 \n" // Clamp(g)|Clamp(r)
341 "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b)
342 "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r
343 "pick.ph $t0, $t2, $t3 \n" // t0 = |g|max(b, r)|
344 "andi $t3, $t0, 0xFF \n"
345 "srl $t4, $t0, 16 \n"
346 "cmp.lt.ph $t3, $t4 \n"
347 "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch
348 "srl $t3, $t1, 16 \n" // t3 = ClampTo8(a)
349 "cmp.lt.ph $t3, $t0 \n"
350 "pick.ph $t0, $t0, $t3 \n" // t0 = max(a, max_color_ch)
351 "ins $t1, $t0, 16, 8 \n" // replace the alpha byte in t1
352 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r|
353 "usw $t0, 0($t5) \n"
354
355 ".set pop \n"
356 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
357 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
358 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
359 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
360 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
361 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
362 "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory"
363 );
364 }
365 } else {
366 for (int out_x = 0; out_x < pixel_width; out_x++) {
367 __asm__ __volatile__ (
368 ".set push \n"
369 ".set noreorder \n"
370
371 "mtlo $0, $ac0 \n" // Clear the accumulators before the
372 "mtlo $0, $ac1 \n" // empty-filter branch (see the has_alpha
373 "mtlo $0, $ac2 \n" // path above).
374 "beqz %[filter_len], 3f \n"
375 " sll %[offset], %[out_x], 2 \n" // delay slot: byte offset of the column
376 "srl %[cnt], %[filter_len], 2 \n" // cnt = number of 4-tap groups
377 "beqz %[cnt], 2f \n"
378 " li %[fy], 0 \n" // delay slot: fy = byte offset into filter_val
379
380 "11: \n" // main loop: four rows per iteration
381 "sll $t1, %[fy], 1 \n" // row pointer offset = 2 * coefficient offset
382 "addu $t0, %[src_data_rows], $t1 \n"
383 "lw $t1, 0($t0) \n" // t1..t4 = four consecutive row pointers
384 "lw $t2, 4($t0) \n"
385 "lw $t3, 8($t0) \n"
386 "lw $t4, 12($t0) \n"
387 "addu $t1, $t1, %[offset] \n"
388 "addu $t2, $t2, %[offset] \n"
389 "addu $t3, $t3, %[offset] \n"
390 "addu $t4, $t4, %[offset] \n"
391 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0|
392 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1|
393 "lw $t3, 0($t3) \n" // t3 = |a2|b2|g2|r2|
394 "lw $t4, 0($t4) \n" // t4 = |a3|b3|g3|r3|
395 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0|
396 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0|
397 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0|
398 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0|
399 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0|
400 "addu $t6, %[filter_val], %[fy] \n"
401 "ulw $t0, 0($t6) \n" // t0 = |cur_1|cur_0|
402 "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2|
403 "dpa.w.ph $ac0, $t5, $t0 \n" // (cur*r1)+(cur*r0)
404 "dpa.w.ph $ac1, $t1, $t0 \n" // (cur*g1)+(cur*g0)
405 "dpa.w.ph $ac2, $t2, $t0 \n" // (cur*b1)+(cur*b0)
406 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2|
407 "precr.qb.ph $t0, $t4, $t3 \n" // t0 = |b3|r3|b2|r2|
408 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2|
409 "preceu.ph.qbla $t2, $t0 \n" // t2 = |0|b3|0|b2|
410 "preceu.ph.qbra $t5, $t0 \n" // t5 = |0|r3|0|r2|
411 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2)
412 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2)
413 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2)
414 "addiu %[cnt], %[cnt], -1 \n"
415 "bgtz %[cnt], 11b \n"
416 " addiu %[fy], %[fy], 8 \n" // delay slot: advance by four coefficients
417
418 "2: \n"
419 "andi %[cnt], %[filter_len], 0x3 \n" // residual
420 "beqz %[cnt], 3f \n"
421 " nop \n"
422
423 "21: \n" // residual loop: one row per iteration
424 "addu $t0, %[filter_val], %[fy] \n"
425 "lh $t4, 0($t0) \n" // t4 = current filter coefficient
426 "sll $t1, %[fy], 1 \n"
427 "addu $t0, %[src_data_rows], $t1 \n"
428 "lw $t1, 0($t0) \n" // t1 = current row pointer
429 "addu $t0, $t1, %[offset] \n"
430 "lbu $t1, 0($t0) \n" // t1 = byte 0 of the pixel
431 "lbu $t2, 1($t0) \n" // t2 = byte 1 of the pixel
432 "lbu $t3, 2($t0) \n" // t3 = byte 2 of the pixel
433 "maddu $ac0, $t4, $t1 \n"
434 "maddu $ac1, $t4, $t2 \n"
435 "maddu $ac2, $t4, $t3 \n"
436 "addiu %[cnt], %[cnt], -1 \n"
437 "bgtz %[cnt], 21b \n"
438 " addiu %[fy], %[fy], 2 \n" // delay slot: advance by one coefficient
439
440 "3: \n"
441 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits
442 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits
443 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits
444 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 |
445 "addu $t5, %[out_row], %[offset] \n" // t5 = &out_row[out_x * 4]
446 "append $t2, $t3, 16 \n" // t2 = |g|r| (16 bits each)
447 "andi $t1, $t1, 0xFFFF \n" // t1 = |0|b|
448 "subu.ph $t1, $t1, $t6 \n" // Clamp to [0, 255]: bias by -128,
449 "shll_s.ph $t1, $t1, 8 \n" // saturating shift left by 8,
450 "shra.ph $t1, $t1, 8 \n" // arithmetic shift back,
451 "addu.ph $t1, $t1, $t6 \n" // 0|Clamp(b)
452 "subu.ph $t2, $t2, $t6 \n"
453 "shll_s.ph $t2, $t2, 8 \n"
454 "shra.ph $t2, $t2, 8 \n"
455 "addu.ph $t2, $t2, $t6 \n" // Clamp(g)|Clamp(r)
456 "li $t0, 0xFF \n"
457 "ins $t1, $t0, 16, 8 \n" // force the alpha byte to 0xFF
458 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |0xFF|b|g|r|
459 "usw $t0, 0($t5) \n"
460
461 ".set pop \n"
462 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
463 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
464 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
465 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
466 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
467 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
468 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"
469 );
470 }
471 }
472 #endif
473 }
474
475 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,
476 int filter_length,
477 unsigned char* const* source_data_rows,
478 int pixel_width,
479 unsigned char* out_row,
480 bool has_alpha) {
481 // Dispatch on the runtime |has_alpha| flag to the matching template
482 // instantiation; the has_alpha == false case is handled first.
483 if (!has_alpha) {
484 ConvolveVertically_mips_dspr2<false>(filter_val, filter_length,
485 source_data_rows, pixel_width,
486 out_row);
487 return;
488 }
489 // has_alpha is true from here on.
490 ConvolveVertically_mips_dspr2<true>(filter_val,
491 filter_length,
492 source_data_rows, pixel_width,
493 out_row);
494 }
495
496 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,
497 const ConvolutionFilter1D& filter,
498 unsigned char* out_row,
499 bool has_alpha) {
500 // Dispatch on the runtime |has_alpha| flag to the matching template
501 // instantiation; the has_alpha == false case is handled first.
502 if (!has_alpha) {
503 ConvolveHorizontally_mips_dspr2<false>(src_data, filter, out_row);
504 return;
505 }
506 // has_alpha is true from here on.
507 ConvolveHorizontally_mips_dspr2<true>(src_data,
508 filter, out_row);
509 }
510 } // namespace skia
OLDNEW
« no previous file with comments | « skia/ext/convolver_mips_dspr2.h ('k') | skia/skia.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698