OLD | NEW |
(Empty) | |
| 1 #include <algorithm> |
| 2 #include "skia/ext/convolver.h" |
| 3 #include "skia/ext/convolver_mips_dspr2.h" |
| 4 #include "third_party/skia/include/core/SkTypes.h" |
| 5 |
| 6 namespace skia { |
| 7 // Convolves horizontally along a single row. The row data is given in |
| 8 // |src_data| and continues for the num_values() of the filter. |
| 9 template<bool has_alpha> |
| 10 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data, |
| 11 const ConvolutionFilter1D& filter, |
| 12 unsigned char* out_row) { |
| 13 #if SIMD_MIPS_DSPR2 |
| 14 int row_to_filter = 0; |
| 15 int num_values = filter.num_values(); |
| 16 if(has_alpha) { |
| 17 for (int out_x = 0; out_x < num_values; out_x++) { |
| 18 // Get the filter that determines the current output pixel. |
| 19 int filter_offset, filter_length; |
| 20 const ConvolutionFilter1D::Fixed* filter_values = |
| 21 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 22 int filter_x = 0; |
| 23 |
| 24 __asm__ __volatile__ ( |
| 25 ".set push \n" |
| 26 ".set noreorder \n" |
| 27 |
| 28 "beqz %[filter_len], 3f \n" |
| 29 " sll $t0, %[filter_offset], 2 \n" |
| 30 "addu %[rtf], %[src_data], $t0 \n" |
| 31 "mtlo $0, $ac0 \n" |
| 32 "mtlo $0, $ac1 \n" |
| 33 "mtlo $0, $ac2 \n" |
| 34 "mtlo $0, $ac3 \n" |
| 35 "srl $t7, %[filter_len], 2 \n" |
| 36 "beqz $t7, 2f \n" |
| 37 " li %[fx], 0 \n" |
| 38 |
| 39 "11: \n" |
| 40 "addu $t4, %[filter_val], %[fx] \n" |
| 41 "sll $t5, %[fx], 1 \n" |
| 42 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| |
| 43 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| |
| 44 "addu $t0, %[rtf], $t5 \n" |
| 45 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| |
| 46 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| |
| 47 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| |
| 48 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| |
| 49 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| |
| 50 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| |
| 51 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a1|0|a0| |
| 52 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| |
| 53 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| |
| 54 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| |
| 55 "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cur*a1)+(cur*a0) |
| 56 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) |
| 57 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) |
| 58 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) |
| 59 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| |
| 60 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| |
| 61 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a3|0|a2| |
| 62 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| |
| 63 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| |
| 64 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| |
| 65 "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cur*a3)+(cur*a2) |
| 66 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) |
| 67 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) |
| 68 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) |
| 69 "addiu $t7, $t7, -1 \n" |
| 70 "bgtz $t7, 11b \n" |
| 71 " addiu %[fx], %[fx], 8 \n" |
| 72 |
| 73 "2: \n" |
| 74 "andi $t7, %[filter_len], 0x3 \n" // residual |
| 75 "beqz $t7, 3f \n" |
| 76 " nop \n" |
| 77 |
| 78 "21: \n" |
| 79 "sll $t1, %[fx], 1 \n" |
| 80 "addu $t2, %[filter_val], %[fx] \n" |
| 81 "addu $t0, %[rtf], $t1 \n" |
| 82 "lh $t6, 0($t2) \n" // t6 = filter_val[fx] |
| 83 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] |
| 84 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] |
| 85 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] |
| 86 "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 2] |
| 87 "maddu $ac3, $t6, $t1 \n" |
| 88 "maddu $ac2, $t6, $t2 \n" |
| 89 "maddu $ac1, $t6, $t3 \n" |
| 90 "maddu $ac0, $t6, $t4 \n" |
| 91 "addiu $t7, $t7, -1 \n" |
| 92 "bgtz $t7, 21b \n" |
| 93 " addiu %[fx], %[fx], 2 \n" |
| 94 |
| 95 "3: \n" |
| 96 "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits |
| 97 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits |
| 98 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits |
| 99 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits |
| 100 "sll $t5, %[out_x], 2 \n" |
| 101 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | |
| 102 "addu $t5, %[out_row], $t5 \n" |
| 103 "append $t2, $t3, 16 \n" |
| 104 "append $t0, $t1, 16 \n" |
| 105 "subu.ph $t1, $t0, $t6 \n" |
| 106 "shll_s.ph $t1, $t1, 8 \n" |
| 107 "shra.ph $t1, $t1, 8 \n" |
| 108 "addu.ph $t1, $t1, $t6 \n" |
| 109 "subu.ph $t3, $t2, $t6 \n" |
| 110 "shll_s.ph $t3, $t3, 8 \n" |
| 111 "shra.ph $t3, $t3, 8 \n" |
| 112 "addu.ph $t3, $t3, $t6 \n" |
| 113 "precr.qb.ph $t0, $t1, $t3 \n" |
| 114 "usw $t0, 0($t5) \n" |
| 115 |
| 116 ".set pop \n" |
| 117 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), |
| 118 [rtf] "+r" (row_to_filter) |
| 119 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), |
| 120 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), |
| 121 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) |
| 122 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", |
| 123 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" |
| 124 ); |
| 125 } |
| 126 } else { |
| 127 for (int out_x = 0; out_x < num_values; out_x++) { |
| 128 // Get the filter that determines the current output pixel. |
| 129 int filter_offset, filter_length; |
| 130 const ConvolutionFilter1D::Fixed* filter_values = |
| 131 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 132 int filter_x = 0; |
| 133 __asm__ __volatile__ ( |
| 134 ".set push \n" |
| 135 ".set noreorder \n" |
| 136 |
| 137 "beqz %[filter_len], 3f \n" |
| 138 " sll $t0, %[filter_offset], 2 \n" |
| 139 "addu %[rtf], %[src_data], $t0 \n" |
| 140 "mtlo $0, $ac1 \n" |
| 141 "mtlo $0, $ac2 \n" |
| 142 "mtlo $0, $ac3 \n" |
| 143 "srl $t7, %[filter_len], 2 \n" |
| 144 "beqz $t7, 2f \n" |
| 145 " li %[fx], 0 \n" |
| 146 |
| 147 "11: \n" |
| 148 "addu $t4, %[filter_val], %[fx] \n" |
| 149 "sll $t5, %[fx], 1 \n" |
| 150 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| |
| 151 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| |
| 152 "addu $t0, %[rtf], $t5 \n" |
| 153 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| |
| 154 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| |
| 155 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| |
| 156 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| |
| 157 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| |
| 158 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| |
| 159 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| |
| 160 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| |
| 161 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| |
| 162 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) |
| 163 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) |
| 164 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) |
| 165 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| |
| 166 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| |
| 167 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| |
| 168 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| |
| 169 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| |
| 170 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) |
| 171 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) |
| 172 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) |
| 173 "addiu $t7, $t7, -1 \n" |
| 174 "bgtz $t7, 11b \n" |
| 175 " addiu %[fx], %[fx], 8 \n" |
| 176 |
| 177 "2: \n" |
| 178 "andi $t7, %[filter_len], 0x3 \n" // residual |
| 179 "beqz $t7, 3f \n" |
| 180 " nop \n" |
| 181 |
| 182 "21: \n" |
| 183 "sll $t1, %[fx], 1 \n" |
| 184 "addu $t2, %[filter_val], %[fx] \n" |
| 185 "addu $t0, %[rtf], $t1 \n" |
| 186 "lh $t6, 0($t2) \n" // t6 = filter_val[fx] |
| 187 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] |
| 188 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] |
| 189 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] |
| 190 "maddu $ac3, $t6, $t1 \n" |
| 191 "maddu $ac2, $t6, $t2 \n" |
| 192 "maddu $ac1, $t6, $t3 \n" |
| 193 "addiu $t7, $t7, -1 \n" |
| 194 "bgtz $t7, 21b \n" |
| 195 " addiu %[fx], %[fx], 2 \n" |
| 196 |
| 197 "3: \n" |
| 198 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits |
| 199 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits |
| 200 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits |
| 201 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | |
| 202 "sll $t8, %[out_x], 2 \n" |
| 203 "addu $t8, %[out_row], $t8 \n" |
| 204 "append $t2, $t3, 16 \n" |
| 205 "andi $t1, 0xFFFF \n" |
| 206 "subu.ph $t5, $t1, $t6 \n" |
| 207 "shll_s.ph $t5, $t5, 8 \n" |
| 208 "shra.ph $t5, $t5, 8 \n" |
| 209 "addu.ph $t5, $t5, $t6 \n" |
| 210 "subu.ph $t4, $t2, $t6 \n" |
| 211 "shll_s.ph $t4, $t4, 8 \n" |
| 212 "shra.ph $t4, $t4, 8 \n" |
| 213 "addu.ph $t4, $t4, $t6 \n" |
| 214 "precr.qb.ph $t0, $t5, $t4 \n" |
| 215 "usw $t0, 0($t8) \n" |
| 216 |
| 217 ".set pop \n" |
| 218 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), |
| 219 [rtf] "+r" (row_to_filter) |
| 220 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), |
| 221 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), |
| 222 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) |
| 223 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", |
| 224 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" |
| 225 ); |
| 226 } |
| 227 } |
| 228 #endif |
| 229 } |
| 230 template<bool has_alpha> |
| 231 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val, |
| 232 int filter_length, |
| 233 unsigned char* const* source_data_rows, |
| 234 int pixel_width, |
| 235 unsigned char* out_row) { |
| 236 #if SIMD_MIPS_DSPR2 |
| 237 // We go through each column in the output and do a vertical convolution, |
| 238 // generating one output pixel each time. |
| 239 int byte_offset; |
| 240 int cnt; |
| 241 int filter_y; |
| 242 if(has_alpha) { |
| 243 for (int out_x = 0; out_x < pixel_width; out_x++) { |
| 244 __asm__ __volatile__ ( |
| 245 ".set push \n" |
| 246 ".set noreorder \n" |
| 247 |
| 248 "beqz %[filter_len], 3f \n" |
| 249 " sll %[offset], %[out_x], 2 \n" |
| 250 "mtlo $0, $ac0 \n" |
| 251 "mtlo $0, $ac1 \n" |
| 252 "mtlo $0, $ac2 \n" |
| 253 "mtlo $0, $ac3 \n" |
| 254 "srl %[cnt], %[filter_len], 2 \n" |
| 255 "beqz %[cnt], 2f \n" |
| 256 " li %[fy], 0 \n" |
| 257 |
| 258 "11: \n" |
| 259 "sll $t1, %[fy], 1 \n" |
| 260 "addu $t0, %[src_data_rows], $t1 \n" |
| 261 "lw $t1, 0($t0) \n" |
| 262 "lw $t2, 4($t0) \n" |
| 263 "lw $t3, 8($t0) \n" |
| 264 "lw $t4, 12($t0) \n" |
| 265 "addu $t1, $t1, %[offset] \n" |
| 266 "addu $t2, $t2, %[offset] \n" |
| 267 "addu $t3, $t3, %[offset] \n" |
| 268 "addu $t4, $t4, %[offset] \n" |
| 269 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| |
| 270 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| |
| 271 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| |
| 272 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| |
| 273 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| |
| 274 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| |
| 275 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a1|0|a0| |
| 276 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| |
| 277 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| |
| 278 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| |
| 279 "addu $t6, %[filter_val], %[fy] \n" |
| 280 "ulw $t7, 0($t6) \n" // t7 = |cur_1|cur_0| |
| 281 "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2| |
| 282 "dpa.w.ph $ac0, $t5, $t7 \n" // (cur*r1)+(cur*r0) |
| 283 "dpa.w.ph $ac1, $t1, $t7 \n" // (cur*g1)+(cur*g0) |
| 284 "dpa.w.ph $ac2, $t2, $t7 \n" // (cur*b1)+(cur*b0) |
| 285 "dpa.w.ph $ac3, $t0, $t7 \n" // (cur*a1)+(cur*a0) |
| 286 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| |
| 287 "precr.qb.ph $t7, $t4, $t3 \n" // t7 = |b3|r3|b2|r2| |
| 288 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a3|0|a2| |
| 289 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| |
| 290 "preceu.ph.qbla $t2, $t7 \n" // t2 = |0|b3|0|b2| |
| 291 "preceu.ph.qbra $t5, $t7 \n" // t5 = |0|r3|0|r2| |
| 292 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2) |
| 293 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2) |
| 294 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2) |
| 295 "dpa.w.ph $ac3, $t0, $t6 \n" // (cur*a3)+(cur*a2) |
| 296 "addiu %[cnt], %[cnt], -1 \n" |
| 297 "bgtz %[cnt], 11b \n" |
| 298 " addiu %[fy], %[fy], 8 \n" |
| 299 |
| 300 "2: \n" |
| 301 "andi %[cnt], %[filter_len], 0x3 \n" // residual |
| 302 "beqz %[cnt], 3f \n" |
| 303 " nop \n" |
| 304 |
| 305 "21: \n" |
| 306 "addu $t0, %[filter_val], %[fy] \n" |
| 307 "lh $t4, 0($t0) \n" // t4=filter_val[fx] |
| 308 "sll $t1, %[fy], 1 \n" |
| 309 "addu $t0, %[src_data_rows], $t1 \n" |
| 310 "lw $t1, 0($t0) \n" |
| 311 "addu $t0, $t1, %[offset] \n" |
| 312 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] |
| 313 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] |
| 314 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] |
| 315 "lbu $t0, 3($t0) \n" // t4 = row[fx*4 + 2] |
| 316 "maddu $ac0, $t4, $t1 \n" |
| 317 "maddu $ac1, $t4, $t2 \n" |
| 318 "maddu $ac2, $t4, $t3 \n" |
| 319 "maddu $ac3, $t4, $t0 \n" |
| 320 "addiu %[cnt], %[cnt], -1 \n" |
| 321 "bgtz %[cnt], 21b \n" |
| 322 " addiu %[fy], %[fy], 2 \n" |
| 323 |
| 324 "3: \n" |
| 325 "extrv.w $t3, $ac0, %[kShiftBits] \n" // a >> kShiftBits |
| 326 "extrv.w $t2, $ac1, %[kShiftBits] \n" // b >> kShiftBits |
| 327 "extrv.w $t1, $ac2, %[kShiftBits] \n" // g >> kShiftBits |
| 328 "extrv.w $t0, $ac3, %[kShiftBits] \n" // r >> kShiftBits |
| 329 "repl.ph $t4, 128 \n" // t4 = | 128 | 128 | |
| 330 "addu $t5, %[out_row], %[offset] \n" |
| 331 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| |
| 332 "append $t0, $t1, 16 \n" // t0 = |0|a|0|b| |
| 333 "subu.ph $t1, $t0, $t4 \n" |
| 334 "shll_s.ph $t1, $t1, 8 \n" |
| 335 "shra.ph $t1, $t1, 8 \n" |
| 336 "addu.ph $t1, $t1, $t4 \n" // Clamp(a)|Clamp(b) |
| 337 "subu.ph $t2, $t2, $t4 \n" |
| 338 "shll_s.ph $t2, $t2, 8 \n" |
| 339 "shra.ph $t2, $t2, 8 \n" |
| 340 "addu.ph $t2, $t2, $t4 \n" // Clamp(g)|Clamp(r) |
| 341 "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b) |
| 342 "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r |
| 343 "pick.ph $t0, $t2, $t3 \n" |
| 344 "andi $t3, $t0, 0xFF \n" |
| 345 "srl $t4, $t0, 16 \n" |
| 346 "cmp.lt.ph $t3, $t4 \n" |
| 347 "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch |
| 348 "srl $t3, $t1, 16 \n" // t1 = ClampTo8(a) |
| 349 "cmp.lt.ph $t3, $t0 \n" |
| 350 "pick.ph $t0, $t0, $t3 \n" |
| 351 "ins $t1, $t0, 16, 8 \n" |
| 352 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| |
| 353 "usw $t0, 0($t5) \n" |
| 354 |
| 355 ".set pop \n" |
| 356 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), |
| 357 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), |
| 358 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) |
| 359 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), |
| 360 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) |
| 361 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", |
| 362 "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory" |
| 363 ); |
| 364 } |
| 365 } else { |
| 366 for (int out_x = 0; out_x < pixel_width; out_x++) { |
| 367 __asm__ __volatile__ ( |
| 368 ".set push \n" |
| 369 ".set noreorder \n" |
| 370 |
| 371 "beqz %[filter_len], 3f \n" |
| 372 " sll %[offset], %[out_x], 2 \n" |
| 373 "mtlo $0, $ac0 \n" |
| 374 "mtlo $0, $ac1 \n" |
| 375 "mtlo $0, $ac2 \n" |
| 376 "srl %[cnt], %[filter_len], 2 \n" |
| 377 "beqz %[cnt], 2f \n" |
| 378 " li %[fy], 0 \n" |
| 379 |
| 380 "11: \n" |
| 381 "sll $t1, %[fy], 1 \n" |
| 382 "addu $t0, %[src_data_rows], $t1 \n" |
| 383 "lw $t1, 0($t0) \n" |
| 384 "lw $t2, 4($t0) \n" |
| 385 "lw $t3, 8($t0) \n" |
| 386 "lw $t4, 12($t0) \n" |
| 387 "addu $t1, $t1, %[offset] \n" |
| 388 "addu $t2, $t2, %[offset] \n" |
| 389 "addu $t3, $t3, %[offset] \n" |
| 390 "addu $t4, $t4, %[offset] \n" |
| 391 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| |
| 392 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| |
| 393 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| |
| 394 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| |
| 395 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| |
| 396 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| |
| 397 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| |
| 398 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| |
| 399 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| |
| 400 "addu $t6, %[filter_val], %[fy] \n" |
| 401 "ulw $t0, 0($t6) \n" // t0 = |cur_1|cur_0| |
| 402 "ulw $t6, 4($t6) \n" // t6 = |cur_1|cur_0| |
| 403 "dpa.w.ph $ac0, $t5, $t0 \n" // (cur*r1)+(cur*r0) |
| 404 "dpa.w.ph $ac1, $t1, $t0 \n" // (cur*g1)+(cur*g0) |
| 405 "dpa.w.ph $ac2, $t2, $t0 \n" // (cur*b1)+(cur*b0) |
| 406 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| |
| 407 "precr.qb.ph $t0, $t4, $t3 \n" // t0 = |b3|r3|b2|r2| |
| 408 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| |
| 409 "preceu.ph.qbla $t2, $t0 \n" // t2 = |0|b3|0|b2| |
| 410 "preceu.ph.qbra $t5, $t0 \n" // t5 = |0|r3|0|r2| |
| 411 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r1)+(cur*r0) |
| 412 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g1)+(cur*g0) |
| 413 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b1)+(cur*b0) |
| 414 "addiu %[cnt], %[cnt], -1 \n" |
| 415 "bgtz %[cnt], 11b \n" |
| 416 " addiu %[fy], %[fy], 8 \n" |
| 417 |
| 418 "2: \n" |
| 419 "andi %[cnt], %[filter_len], 0x3 \n" // residual |
| 420 "beqz %[cnt], 3f \n" |
| 421 " nop \n" |
| 422 |
| 423 "21: \n" |
| 424 "addu $t0, %[filter_val], %[fy] \n" |
| 425 "lh $t4, 0($t0) \n" // filter_val[fx] |
| 426 "sll $t1, %[fy], 1 \n" |
| 427 "addu $t0, %[src_data_rows], $t1 \n" |
| 428 "lw $t1, 0($t0) \n" |
| 429 "addu $t0, $t1, %[offset] \n" |
| 430 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] |
| 431 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] |
| 432 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] |
| 433 "maddu $ac0, $t4, $t1 \n" |
| 434 "maddu $ac1, $t4, $t2 \n" |
| 435 "maddu $ac2, $t4, $t3 \n" |
| 436 "addiu %[cnt], %[cnt], -1 \n" |
| 437 "bgtz %[cnt], 21b \n" |
| 438 " addiu %[fy], %[fy], 2 \n" |
| 439 |
| 440 "3: \n" |
| 441 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits |
| 442 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits |
| 443 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits |
| 444 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | |
| 445 "addu $t5, %[out_row], %[offset] \n" |
| 446 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| |
| 447 "andi $t1, $t1, 0xFFFF \n" |
| 448 "subu.ph $t1, $t1, $t6 \n" |
| 449 "shll_s.ph $t1, $t1, 8 \n" |
| 450 "shra.ph $t1, $t1, 8 \n" |
| 451 "addu.ph $t1, $t1, $t6 \n" // Clamp(a)|Clamp(b) |
| 452 "subu.ph $t2, $t2, $t6 \n" |
| 453 "shll_s.ph $t2, $t2, 8 \n" |
| 454 "shra.ph $t2, $t2, 8 \n" |
| 455 "addu.ph $t2, $t2, $t6 \n" // Clamp(g)|Clamp(r) |
| 456 "li $t0, 0xFF \n" |
| 457 "ins $t1, $t0, 16, 8 \n" |
| 458 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| |
| 459 "usw $t0, 0($t5) \n" |
| 460 |
| 461 ".set pop \n" |
| 462 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), |
| 463 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), |
| 464 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) |
| 465 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), |
| 466 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) |
| 467 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", |
| 468 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory" |
| 469 ); |
| 470 } |
| 471 } |
| 472 #endif |
| 473 } |
| 474 |
| 475 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val, |
| 476 int filter_length, |
| 477 unsigned char* const* source_data_rows, |
| 478 int pixel_width, |
| 479 unsigned char* out_row, |
| 480 bool has_alpha) { |
| 481 if (has_alpha) { |
| 482 ConvolveVertically_mips_dspr2<true>(filter_val, |
| 483 filter_length, |
| 484 source_data_rows, |
| 485 pixel_width, |
| 486 out_row); |
| 487 } else { |
| 488 ConvolveVertically_mips_dspr2<false>(filter_val, |
| 489 filter_length, |
| 490 source_data_rows, |
| 491 pixel_width, |
| 492 out_row); |
| 493 } |
| 494 } |
| 495 |
| 496 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data, |
| 497 const ConvolutionFilter1D& filter, |
| 498 unsigned char* out_row, |
| 499 bool has_alpha) { |
| 500 if (has_alpha) { |
| 501 ConvolveHorizontally_mips_dspr2<true>(src_data, |
| 502 filter, |
| 503 out_row); |
| 504 } else { |
| 505 ConvolveHorizontally_mips_dspr2<false>(src_data, |
| 506 filter, |
| 507 out_row); |
| 508 } |
| 509 } |
| 510 } // namespace skia |
OLD | NEW |