skia/ext/convolver_mips_dspr2.cc - Issue 15742005: (Patch by Teodora Novkovic <teodora.petrovic@gmail.com>, originally reviewed at https://codereview…

Side by Side Diff: skia/ext/convolver_mips_dspr2.cc

Issue 15742005: (Patch by Teodora Novkovic <teodora.petrovic@gmail.com>, originally reviewed at https://codereview… (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include <algorithm>

	6 #include "skia/ext/convolver.h"

	7 #include "skia/ext/convolver_mips_dspr2.h"

	8 #include "third_party/skia/include/core/SkTypes.h"

	9

	10 namespace skia {

	11 // Convolves horizontally along a single row. The row data is given in

	12 // \|src_data\| and continues for the num_values() of the filter.
	// One 4-byte output pixel is stored to \|out_row\| for every out_x in
	// [0, filter.num_values()). For each output pixel, FilterForValue() supplies
	// the 16-bit coefficients plus the first source pixel (filter_offset) and the
	// tap count (filter_length); taps are consumed four at a time, then one at a
	// time for the remaining (filter_length & 3) taps.
	// With has_alpha all four byte channels are accumulated. Without it only the
	// three low bytes are accumulated and the most significant byte of each
	// stored output word comes out as 0.
	// The body is compiled only when SIMD_MIPS_DSPR2 is non-zero; otherwise the
	// function does nothing.

	13 template<bool has_alpha>

	14 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,

	15 const ConvolutionFilter1D& filter,

	16 unsigned char* out_row) {

	17 #if SIMD_MIPS_DSPR2

	// row_to_filter is an int, but the asm uses it (operand rtf) to hold the
	// address src_data + filter_offset * 4, i.e. the first source pixel of the
	// current filter window.
	18 int row_to_filter = 0;

	19 int num_values = filter.num_values();

	20 if (has_alpha) {
	Justin Novosad 2013/05/23 15:32:03 Since this branch is outside of the loop, we're no Since this branch is outside of the loop, we're not getting any performance benefit from having has_alpha as a template argument. There is going to be a run-time conditional branch on has_alpha for each call to this method. The only difference is whether the branch happens here or in the caller. Unless there is tangible evidence (benchmarks?) that prove this has a measurable impact on performance, this code should all go into the non-template overload below. On the other hand, if I am wrong and there is a case for this template, this function should be in the unnamed namespace (or static). Teodora Novkovic 2013/05/24 16:02:20 Done. Show quoted text On 2013/05/23 15:32:03, junov wrote: > Since this branch is outside of the loop, we're not getting any performance > benefit from having has_alpha as a template argument. There is going to be a > run-time conditional branch on has_alpha for each call to this method. The only > difference is whether the branch happens here or in the caller. Unless there is > tangible evidence (benchmarks?) that prove this has a measurable impact on > performance, this code should all go into the non-template overload below. On > the other hand, if I am wrong and there is a case for this template, this > function should be in the unnamed namespace (or static). Done.
	21 for (int out_x = 0; out_x < num_values; out_x++) {

	22 // Get the filter that determines the current output pixel.

	23 int filter_offset, filter_length;

	24 const ConvolutionFilter1D::Fixed* filter_values =

	25 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	// filter_x (operand fx) is a byte offset into filter_values: it advances by
	// 8 per four taps and by 2 per single tap.
	26 int filter_x = 0;

	27

	// Accumulator assignment in this branch: ac0 = a, ac1 = b, ac2 = g, ac3 = r.
	// Under ".set noreorder" the instruction written (indented) after each branch
	// is its delay slot and executes whether or not the branch is taken.
	28 __asm__ __volatile__ (

	29 ".set push \n"

	30 ".set noreorder \n"

	31

	// NOTE(review): a zero filter_len jumps straight to label 3, before the
	// accumulators are cleared below, so the extracted values would be whatever
	// the accumulators already held - confirm zero-length filters cannot occur.
	32 "beqz %[filter_len], 3f \n"

	33 " sll $t0, %[filter_offset], 2 \n"

	34 "addu %[rtf], %[src_data], $t0 \n"

	// NOTE(review): mtlo writes only the LO half of each accumulator; nothing
	// here resets HI, while extrv.w at label 3 shifts across the 64-bit
	// accumulator - confirm HI is known to be zero at this point.
	35 "mtlo $0, $ac0 \n"

	36 "mtlo $0, $ac1 \n"

	37 "mtlo $0, $ac2 \n"

	38 "mtlo $0, $ac3 \n"

	// t7 = number of complete groups of four taps.
	39 "srl $t7, %[filter_len], 2 \n"

	40 "beqz $t7, 2f \n"

	41 " li %[fx], 0 \n"

	42

	// Main loop: four taps (four source pixels, four coefficients) per pass.
	43 "11: \n"

	44 "addu $t4, %[filter_val], %[fx] \n"

	// fx counts coefficient bytes (2 per tap); pixels are 4 bytes, so fx * 2.
	45 "sll $t5, %[fx], 1 \n"

	46 "ulw $t6, 0($t4) \n" // t6 = \|cur[1]\|cur[0]\|

	47 "ulw $t8, 4($t4) \n" // t8 = \|cur[3]\|cur[2]\|

	48 "addu $t0, %[rtf], $t5 \n"

	49 "lw $t1, 0($t0) \n" // t1 = \|a0\|b0\|g0\|r0\|

	50 "lw $t2, 4($t0) \n" // t2 = \|a1\|b1\|g1\|r1\|

	51 "lw $t3, 8($t0) \n" // t3 = \|a2\|b2\|g2\|r2\|

	52 "lw $t4, 12($t0) \n" // t4 = \|a3\|b3\|g3\|r3\|

	53 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = \|a1\|g1\|a0\|g0\|

	54 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = \|b1\|r1\|b0\|r0\|

	55 "preceu.ph.qbla $t1, $t0 \n" // t1 = \|0\|a1\|0\|a0\|

	56 "preceu.ph.qbra $t2, $t0 \n" // t2 = \|0\|g1\|0\|g0\|

	57 "preceu.ph.qbla $t0, $t5 \n" // t0 = \|0\|b1\|0\|b0\|

	58 "preceu.ph.qbra $t5, $t5 \n" // t5 = \|0\|r1\|0\|r0\|

	59 "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cura1)+(cura0)

	60 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(curb1)+(curb0)

	61 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(curg1)+(curg0)

	62 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(curr1)+(curr0)

	63 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = \|a3\|g3\|a2\|g2\|

	64 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = \|b3\|r3\|b2\|r2\|

	65 "preceu.ph.qbla $t1, $t0 \n" // t1 = \|0\|a3\|0\|a2\|

	66 "preceu.ph.qbra $t2, $t0 \n" // t2 = \|0\|g3\|0\|g2\|

	67 "preceu.ph.qbla $t0, $t5 \n" // t0 = \|0\|b3\|0\|b2\|

	68 "preceu.ph.qbra $t5, $t5 \n" // t5 = \|0\|r3\|0\|r2\|

	69 "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cura3)+(cura2)

	70 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(curb3)+(curb2)

	71 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(curg3)+(curg2)

	72 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(curr3)+(curr2)

	73 "addiu $t7, $t7, -1 \n"

	74 "bgtz $t7, 11b \n"

	75 " addiu %[fx], %[fx], 8 \n"

	76

	77 "2: \n"

	78 "andi $t7, %[filter_len], 0x3 \n" // residual

	79 "beqz $t7, 3f \n"

	80 " nop \n"

	81

	// Residual loop: one tap per pass, one byte channel at a time.
	// NOTE(review): lh sign-extends the coefficient but maddu is an unsigned
	// multiply-add - confirm the result is as intended for negative coefficients.
	82 "21: \n"

	83 "sll $t1, %[fx], 1 \n"

	84 "addu $t2, %[filter_val], %[fx] \n"

	85 "addu $t0, %[rtf], $t1 \n"

	86 "lh $t6, 0($t2) \n" // t6 = filter_val[fx]

	87 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0]

	88 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1]

	89 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2]

	90 "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 3]

	91 "maddu $ac3, $t6, $t1 \n"

	92 "maddu $ac2, $t6, $t2 \n"

	93 "maddu $ac1, $t6, $t3 \n"

	94 "maddu $ac0, $t6, $t4 \n"

	95 "addiu $t7, $t7, -1 \n"

	96 "bgtz $t7, 21b \n"

	97 " addiu %[fx], %[fx], 2 \n"

	98

	// Scale the sums down by kShiftBits, pack them in pairs as 16-bit lanes,
	// clamp each lane to [0, 255] and store the four bytes at out_row + out_x * 4.
	99 "3: \n"

	100 "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits

	101 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits

	102 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits

	103 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits

	104 "sll $t5, %[out_x], 2 \n"

	105 "repl.ph $t6, 128 \n" // t6 = \| 128 \| 128 \|

	106 "addu $t5, %[out_row], $t5 \n"

	107 "append $t2, $t3, 16 \n" // t2 = \|g\|r\| (16-bit lanes)

	108 "append $t0, $t1, 16 \n" // t0 = \|a\|b\| (16-bit lanes)

	// Clamp idiom: subtract 128, saturating shift left by 8, arithmetic shift
	// right by 8, add 128 back. Lanes below 0 end at 0, lanes above 255 at 255.
	109 "subu.ph $t1, $t0, $t6 \n"

	110 "shll_s.ph $t1, $t1, 8 \n"

	111 "shra.ph $t1, $t1, 8 \n"

	112 "addu.ph $t1, $t1, $t6 \n" // t1 = Clamp(a)\|Clamp(b)

	113 "subu.ph $t3, $t2, $t6 \n"

	114 "shll_s.ph $t3, $t3, 8 \n"

	115 "shra.ph $t3, $t3, 8 \n"

	116 "addu.ph $t3, $t3, $t6 \n" // t3 = Clamp(g)\|Clamp(r)

	117 "precr.qb.ph $t0, $t1, $t3 \n" // t0 = \|a\|b\|g\|r\|

	118 "usw $t0, 0($t5) \n"

	119

	120 ".set pop \n"

	121 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),

	122 [rtf] "+r" (row_to_filter)

	123 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),

	124 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),

	125 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)

	// NOTE(review): the asm stores through out_row (usw) but this clobber list
	// has no "memory" entry, unlike the vertical routine - confirm intentional.
	126 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",

	127 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"

	128 );

	129 }

	130 } else {

	131 for (int out_x = 0; out_x < num_values; out_x++) {

	132 // Get the filter that determines the current output pixel.

	133 int filter_offset, filter_length;

	134 const ConvolutionFilter1D::Fixed* filter_values =

	135 filter.FilterForValue(out_x, &filter_offset, &filter_length);

	136 int filter_x = 0;

	// Same structure as the has_alpha branch, but only ac1 = b, ac2 = g and
	// ac3 = r are cleared, accumulated and extracted; the top byte of each source
	// pixel is ignored. The review notes on the has_alpha branch (zero-length
	// filter, HI halves, maddu, missing "memory" clobber) apply here as well.
	137 __asm__ __volatile__ (

	138 ".set push \n"

	139 ".set noreorder \n"

	140

	141 "beqz %[filter_len], 3f \n"

	142 " sll $t0, %[filter_offset], 2 \n"

	143 "addu %[rtf], %[src_data], $t0 \n"

	144 "mtlo $0, $ac1 \n"

	145 "mtlo $0, $ac2 \n"

	146 "mtlo $0, $ac3 \n"

	147 "srl $t7, %[filter_len], 2 \n"

	148 "beqz $t7, 2f \n"

	149 " li %[fx], 0 \n"

	150

	151 "11: \n"

	152 "addu $t4, %[filter_val], %[fx] \n"

	153 "sll $t5, %[fx], 1 \n"

	154 "ulw $t6, 0($t4) \n" // t6 = \|cur[1]\|cur[0]\|

	155 "ulw $t8, 4($t4) \n" // t8 = \|cur[3]\|cur[2]\|

	156 "addu $t0, %[rtf], $t5 \n"

	157 "lw $t1, 0($t0) \n" // t1 = \|a0\|b0\|g0\|r0\|

	158 "lw $t2, 4($t0) \n" // t2 = \|a1\|b1\|g1\|r1\|

	159 "lw $t3, 8($t0) \n" // t3 = \|a2\|b2\|g2\|r2\|

	160 "lw $t4, 12($t0) \n" // t4 = \|a3\|b3\|g3\|r3\|

	161 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = \|a1\|g1\|a0\|g0\|

	162 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = \|b1\|r1\|b0\|r0\|

	163 "preceu.ph.qbra $t2, $t0 \n" // t2 = \|0\|g1\|0\|g0\|

	164 "preceu.ph.qbla $t0, $t5 \n" // t0 = \|0\|b1\|0\|b0\|

	165 "preceu.ph.qbra $t5, $t5 \n" // t5 = \|0\|r1\|0\|r0\|

	166 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(curb1)+(curb0)

	167 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(curg1)+(curg0)

	168 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(curr1)+(curr0)

	169 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = \|a3\|g3\|a2\|g2\|

	170 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = \|b3\|r3\|b2\|r2\|

	171 "preceu.ph.qbra $t2, $t0 \n" // t2 = \|0\|g3\|0\|g2\|

	172 "preceu.ph.qbla $t0, $t5 \n" // t0 = \|0\|b3\|0\|b2\|

	173 "preceu.ph.qbra $t5, $t5 \n" // t5 = \|0\|r3\|0\|r2\|

	174 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(curb3)+(curb2)

	175 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(curg3)+(curg2)

	176 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(curr3)+(curr2)

	177 "addiu $t7, $t7, -1 \n"

	178 "bgtz $t7, 11b \n"

	179 " addiu %[fx], %[fx], 8 \n"

	180

	181 "2: \n"

	182 "andi $t7, %[filter_len], 0x3 \n" // residual

	183 "beqz $t7, 3f \n"

	184 " nop \n"

	185

	186 "21: \n"

	187 "sll $t1, %[fx], 1 \n"

	188 "addu $t2, %[filter_val], %[fx] \n"

	189 "addu $t0, %[rtf], $t1 \n"

	190 "lh $t6, 0($t2) \n" // t6 = filter_val[fx]

	191 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0]

	192 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1]

	193 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2]

	194 "maddu $ac3, $t6, $t1 \n"

	195 "maddu $ac2, $t6, $t2 \n"

	196 "maddu $ac1, $t6, $t3 \n"

	197 "addiu $t7, $t7, -1 \n"

	198 "bgtz $t7, 21b \n"

	199 " addiu %[fx], %[fx], 2 \n"

	200

	201 "3: \n"

	202 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits

	203 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits

	204 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits

	205 "repl.ph $t6, 128 \n" // t6 = \| 128 \| 128 \|

	206 "sll $t8, %[out_x], 2 \n"

	207 "addu $t8, %[out_row], $t8 \n"

	208 "append $t2, $t3, 16 \n" // t2 = \|g\|r\| (16-bit lanes)

	209 "andi $t1, 0xFFFF \n" // t1 = \|0\|b\| (16-bit lanes)

	210 "subu.ph $t5, $t1, $t6 \n"

	211 "shll_s.ph $t5, $t5, 8 \n"

	212 "shra.ph $t5, $t5, 8 \n"

	213 "addu.ph $t5, $t5, $t6 \n" // t5 = 0\|Clamp(b)

	214 "subu.ph $t4, $t2, $t6 \n"

	215 "shll_s.ph $t4, $t4, 8 \n"

	216 "shra.ph $t4, $t4, 8 \n"

	217 "addu.ph $t4, $t4, $t6 \n" // t4 = Clamp(g)\|Clamp(r)

	218 "precr.qb.ph $t0, $t5, $t4 \n" // t0 = \|0\|b\|g\|r\|

	219 "usw $t0, 0($t8) \n"

	220

	221 ".set pop \n"

	222 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),

	223 [rtf] "+r" (row_to_filter)

	224 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),

	225 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),

	226 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)

	227 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",

	228 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"

	229 );

	230 }

	231 }

	232 #endif

	233 }

	// Convolves vertically: for each of the pixel_width output pixels, combines
	// the 4-byte pixel at byte offset out_x * 4 in each of the filter_length rows
	// pointed to by source_data_rows, weighted by the 16-bit coefficients in
	// filter_val, and stores one 4-byte pixel at the same offset in out_row.
	// Rows are consumed four at a time, then one at a time for the remaining
	// (filter_length & 3) rows.
	// With has_alpha the fourth byte is convolved too and then raised to at least
	// the largest of the three clamped colour bytes; without it the fourth byte
	// is written as 0xFF.
	// The row-pointer table is read with 4-byte loads, so pointers are taken to
	// be 32 bits wide. The body is compiled only when SIMD_MIPS_DSPR2 is non-zero.
	234 template<bool has_alpha>

	235 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,

	236 int filter_length,

	237 unsigned char* const* source_data_rows,

	238 int pixel_width,

	239 unsigned char* out_row) {

	240 #if SIMD_MIPS_DSPR2

	241 // We go through each column in the output and do a vertical convolution,

	242 // generating one output pixel each time.

	// Scratch values owned by the asm: byte_offset = out_x * 4, cnt = loop
	// counter, filter_y = byte offset into filter_val (2 bytes per row).
	// NOTE(review): they are uninitialised here yet bound as "+r" (read-write)
	// operands; the asm writes each one before reading it - confirm whether
	// an output-only constraint was intended.
	243 int byte_offset;

	244 int cnt;

	245 int filter_y;

	246 if (has_alpha) {
	Justin Novosad 2013/05/23 15:32:03 Same here. Same here. Teodora Novkovic 2013/05/24 16:02:20 Done. Show quoted text On 2013/05/23 15:32:03, junov wrote: > Same here. Done.
	247 for (int out_x = 0; out_x < pixel_width; out_x++) {

	// Accumulator assignment in this branch: ac0 = r, ac1 = g, ac2 = b, ac3 = a.
	// Under ".set noreorder" the instruction written (indented) after each branch
	// is its delay slot and executes whether or not the branch is taken.
	248 __asm__ __volatile__ (

	249 ".set push \n"

	250 ".set noreorder \n"

	251

	// NOTE(review): a zero filter_len jumps to label 3 before the accumulators
	// are cleared - confirm zero-length filters cannot reach this code.
	252 "beqz %[filter_len], 3f \n"

	253 " sll %[offset], %[out_x], 2 \n"

	// NOTE(review): mtlo writes only the LO half of each accumulator; HI is not
	// reset here while extrv.w shifts across the 64-bit value - confirm.
	254 "mtlo $0, $ac0 \n"

	255 "mtlo $0, $ac1 \n"

	256 "mtlo $0, $ac2 \n"

	257 "mtlo $0, $ac3 \n"

	258 "srl %[cnt], %[filter_len], 2 \n"

	259 "beqz %[cnt], 2f \n"

	260 " li %[fy], 0 \n"

	261

	// Main loop: four source rows per pass.
	262 "11: \n"

	// fy counts coefficient bytes (2 per row); row pointers are 4 bytes: fy * 2.
	263 "sll $t1, %[fy], 1 \n"

	264 "addu $t0, %[src_data_rows], $t1 \n"

	265 "lw $t1, 0($t0) \n" // t1..t4 = four consecutive row pointers

	266 "lw $t2, 4($t0) \n"

	267 "lw $t3, 8($t0) \n"

	268 "lw $t4, 12($t0) \n"

	269 "addu $t1, $t1, %[offset] \n"

	270 "addu $t2, $t2, %[offset] \n"

	271 "addu $t3, $t3, %[offset] \n"

	272 "addu $t4, $t4, %[offset] \n"

	273 "lw $t1, 0($t1) \n" // t1 = \|a0\|b0\|g0\|r0\|

	274 "lw $t2, 0($t2) \n" // t2 = \|a1\|b1\|g1\|r1\|

	275 "lw $t3, 0($t3) \n" // t3 = \|a2\|b2\|g2\|r2\|

	276 "lw $t4, 0($t4) \n" // t4 = \|a3\|b3\|g3\|r3\|

	277 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = \|a1\|g1\|a0\|g0\|

	278 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = \|b1\|r1\|b0\|r0\|

	279 "preceu.ph.qbla $t0, $t5 \n" // t0 = \|0\|a1\|0\|a0\|

	280 "preceu.ph.qbra $t1, $t5 \n" // t1 = \|0\|g1\|0\|g0\|

	281 "preceu.ph.qbla $t2, $t6 \n" // t2 = \|0\|b1\|0\|b0\|

	282 "preceu.ph.qbra $t5, $t6 \n" // t5 = \|0\|r1\|0\|r0\|

	283 "addu $t6, %[filter_val], %[fy] \n"

	284 "ulw $t7, 0($t6) \n" // t7 = \|cur_1\|cur_0\|

	285 "ulw $t6, 4($t6) \n" // t6 = \|cur_3\|cur_2\|

	286 "dpa.w.ph $ac0, $t5, $t7 \n" // (curr1)+(curr0)

	287 "dpa.w.ph $ac1, $t1, $t7 \n" // (curg1)+(curg0)

	288 "dpa.w.ph $ac2, $t2, $t7 \n" // (curb1)+(curb0)

	289 "dpa.w.ph $ac3, $t0, $t7 \n" // (cura1)+(cura0)

	290 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = \|a3\|g3\|a2\|g2\|

	291 "precr.qb.ph $t7, $t4, $t3 \n" // t7 = \|b3\|r3\|b2\|r2\|

	292 "preceu.ph.qbla $t0, $t5 \n" // t0 = \|0\|a3\|0\|a2\|

	293 "preceu.ph.qbra $t1, $t5 \n" // t1 = \|0\|g3\|0\|g2\|

	294 "preceu.ph.qbla $t2, $t7 \n" // t2 = \|0\|b3\|0\|b2\|

	295 "preceu.ph.qbra $t5, $t7 \n" // t5 = \|0\|r3\|0\|r2\|

	296 "dpa.w.ph $ac0, $t5, $t6 \n" // (curr3)+(curr2)

	297 "dpa.w.ph $ac1, $t1, $t6 \n" // (curg3)+(curg2)

	298 "dpa.w.ph $ac2, $t2, $t6 \n" // (curb3)+(curb2)

	299 "dpa.w.ph $ac3, $t0, $t6 \n" // (cura3)+(cura2)

	300 "addiu %[cnt], %[cnt], -1 \n"

	301 "bgtz %[cnt], 11b \n"

	302 " addiu %[fy], %[fy], 8 \n"

	303

	304 "2: \n"

	305 "andi %[cnt], %[filter_len], 0x3 \n" // residual

	306 "beqz %[cnt], 3f \n"

	307 " nop \n"

	308

	// Residual loop: one source row per pass, one byte channel at a time.
	// NOTE(review): lh sign-extends the coefficient but maddu is an unsigned
	// multiply-add - confirm the result is as intended for negative coefficients.
	309 "21: \n"

	310 "addu $t0, %[filter_val], %[fy] \n"

	311 "lh $t4, 0($t0) \n" // t4 = coefficient at byte offset fy

	312 "sll $t1, %[fy], 1 \n"

	313 "addu $t0, %[src_data_rows], $t1 \n"

	314 "lw $t1, 0($t0) \n" // t1 = pointer to the current row

	315 "addu $t0, $t1, %[offset] \n"

	316 "lbu $t1, 0($t0) \n" // t1 = row[offset + 0]

	317 "lbu $t2, 1($t0) \n" // t2 = row[offset + 1]

	318 "lbu $t3, 2($t0) \n" // t3 = row[offset + 2]

	319 "lbu $t0, 3($t0) \n" // t0 = row[offset + 3]

	320 "maddu $ac0, $t4, $t1 \n"

	321 "maddu $ac1, $t4, $t2 \n"

	322 "maddu $ac2, $t4, $t3 \n"

	323 "maddu $ac3, $t4, $t0 \n"

	324 "addiu %[cnt], %[cnt], -1 \n"

	325 "bgtz %[cnt], 21b \n"

	326 " addiu %[fy], %[fy], 2 \n"

	327

	// Scale, pack in pairs as 16-bit lanes, clamp to [0, 255], adjust alpha and
	// store the four bytes at out_row + offset.
	328 "3: \n"

	329 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits

	330 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits

	331 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits

	332 "extrv.w $t0, $ac3, %[kShiftBits] \n" // a >> kShiftBits

	333 "repl.ph $t4, 128 \n" // t4 = \| 128 \| 128 \|

	334 "addu $t5, %[out_row], %[offset] \n"

	335 "append $t2, $t3, 16 \n" // t2 = \|0\|g\|0\|r\|

	336 "append $t0, $t1, 16 \n" // t0 = \|0\|a\|0\|b\|

	// Clamp idiom: subtract 128, saturating shift left by 8, arithmetic shift
	// right by 8, add 128 back. Lanes below 0 end at 0, lanes above 255 at 255.
	337 "subu.ph $t1, $t0, $t4 \n"

	338 "shll_s.ph $t1, $t1, 8 \n"

	339 "shra.ph $t1, $t1, 8 \n"

	340 "addu.ph $t1, $t1, $t4 \n" // Clamp(a)\|Clamp(b)

	341 "subu.ph $t2, $t2, $t4 \n"

	342 "shll_s.ph $t2, $t2, 8 \n"

	343 "shra.ph $t2, $t2, 8 \n"

	344 "addu.ph $t2, $t2, $t4 \n" // Clamp(g)\|Clamp(r)

	// Find the largest colour byte, then make alpha at least that large.
	345 "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b)

	346 "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r

	347 "pick.ph $t0, $t2, $t3 \n" // t0 = \|g\|max(b, r)\|

	348 "andi $t3, $t0, 0xFF \n"

	349 "srl $t4, $t0, 16 \n"

	350 "cmp.lt.ph $t3, $t4 \n"

	351 "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch

	352 "srl $t3, $t1, 16 \n" // t3 = ClampTo8(a)

	353 "cmp.lt.ph $t3, $t0 \n"

	354 "pick.ph $t0, $t0, $t3 \n" // t0 = max(a, max_color_ch)

	355 "ins $t1, $t0, 16, 8 \n" // replace the alpha byte in t1

	356 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = \|a\|b\|g\|r\|

	357 "usw $t0, 0($t5) \n"

	358

	359 ".set pop \n"

	// NOTE(review): pixel_width is bound as an operand but does not appear in
	// the asm template above.
	360 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),

	361 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),

	362 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)

	363 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),

	364 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)

	365 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",

	366 "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory"

	367 );

	368 }

	369 } else {

	370 for (int out_x = 0; out_x < pixel_width; out_x++) {

	// Same structure as the has_alpha branch, but only ac0 = r, ac1 = g and
	// ac2 = b are used; the fourth byte of each source pixel is ignored and the
	// fourth output byte is forced to 0xFF. The review notes on the has_alpha
	// branch (zero-length filter, HI halves, maddu, unused pixel_width operand)
	// apply here as well.
	371 __asm__ __volatile__ (

	372 ".set push \n"

	373 ".set noreorder \n"

	374

	375 "beqz %[filter_len], 3f \n"

	376 " sll %[offset], %[out_x], 2 \n"

	377 "mtlo $0, $ac0 \n"

	378 "mtlo $0, $ac1 \n"

	379 "mtlo $0, $ac2 \n"

	380 "srl %[cnt], %[filter_len], 2 \n"

	381 "beqz %[cnt], 2f \n"

	382 " li %[fy], 0 \n"

	383

	384 "11: \n"

	385 "sll $t1, %[fy], 1 \n"

	386 "addu $t0, %[src_data_rows], $t1 \n"

	387 "lw $t1, 0($t0) \n" // t1..t4 = four consecutive row pointers

	388 "lw $t2, 4($t0) \n"

	389 "lw $t3, 8($t0) \n"

	390 "lw $t4, 12($t0) \n"

	391 "addu $t1, $t1, %[offset] \n"

	392 "addu $t2, $t2, %[offset] \n"

	393 "addu $t3, $t3, %[offset] \n"

	394 "addu $t4, $t4, %[offset] \n"

	395 "lw $t1, 0($t1) \n" // t1 = \|a0\|b0\|g0\|r0\|

	396 "lw $t2, 0($t2) \n" // t2 = \|a1\|b1\|g1\|r1\|

	397 "lw $t3, 0($t3) \n" // t3 = \|a2\|b2\|g2\|r2\|

	398 "lw $t4, 0($t4) \n" // t4 = \|a3\|b3\|g3\|r3\|

	399 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = \|a1\|g1\|a0\|g0\|

	400 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = \|b1\|r1\|b0\|r0\|

	401 "preceu.ph.qbra $t1, $t5 \n" // t1 = \|0\|g1\|0\|g0\|

	402 "preceu.ph.qbla $t2, $t6 \n" // t2 = \|0\|b1\|0\|b0\|

	403 "preceu.ph.qbra $t5, $t6 \n" // t5 = \|0\|r1\|0\|r0\|

	404 "addu $t6, %[filter_val], %[fy] \n"

	405 "ulw $t0, 0($t6) \n" // t0 = \|cur_1\|cur_0\|

	406 "ulw $t6, 4($t6) \n" // t6 = \|cur_3\|cur_2\|

	407 "dpa.w.ph $ac0, $t5, $t0 \n" // (curr1)+(curr0)

	408 "dpa.w.ph $ac1, $t1, $t0 \n" // (curg1)+(curg0)

	409 "dpa.w.ph $ac2, $t2, $t0 \n" // (curb1)+(curb0)

	410 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = \|a3\|g3\|a2\|g2\|

	411 "precr.qb.ph $t0, $t4, $t3 \n" // t0 = \|b3\|r3\|b2\|r2\|

	412 "preceu.ph.qbra $t1, $t5 \n" // t1 = \|0\|g3\|0\|g2\|

	413 "preceu.ph.qbla $t2, $t0 \n" // t2 = \|0\|b3\|0\|b2\|

	414 "preceu.ph.qbra $t5, $t0 \n" // t5 = \|0\|r3\|0\|r2\|

	415 "dpa.w.ph $ac0, $t5, $t6 \n" // (curr3)+(curr2)

	416 "dpa.w.ph $ac1, $t1, $t6 \n" // (curg3)+(curg2)

	417 "dpa.w.ph $ac2, $t2, $t6 \n" // (curb3)+(curb2)

	418 "addiu %[cnt], %[cnt], -1 \n"

	419 "bgtz %[cnt], 11b \n"

	420 " addiu %[fy], %[fy], 8 \n"

	421

	422 "2: \n"

	423 "andi %[cnt], %[filter_len], 0x3 \n" // residual

	424 "beqz %[cnt], 3f \n"

	425 " nop \n"

	426

	427 "21: \n"

	428 "addu $t0, %[filter_val], %[fy] \n"

	429 "lh $t4, 0($t0) \n" // t4 = coefficient at byte offset fy

	430 "sll $t1, %[fy], 1 \n"

	431 "addu $t0, %[src_data_rows], $t1 \n"

	432 "lw $t1, 0($t0) \n" // t1 = pointer to the current row

	433 "addu $t0, $t1, %[offset] \n"

	434 "lbu $t1, 0($t0) \n" // t1 = row[offset + 0]

	435 "lbu $t2, 1($t0) \n" // t2 = row[offset + 1]

	436 "lbu $t3, 2($t0) \n" // t3 = row[offset + 2]

	437 "maddu $ac0, $t4, $t1 \n"

	438 "maddu $ac1, $t4, $t2 \n"

	439 "maddu $ac2, $t4, $t3 \n"

	440 "addiu %[cnt], %[cnt], -1 \n"

	441 "bgtz %[cnt], 21b \n"

	442 " addiu %[fy], %[fy], 2 \n"

	443

	444 "3: \n"

	445 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits

	446 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits

	447 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits

	448 "repl.ph $t6, 128 \n" // t6 = \| 128 \| 128 \|

	449 "addu $t5, %[out_row], %[offset] \n"

	450 "append $t2, $t3, 16 \n" // t2 = \|0\|g\|0\|r\|

	451 "andi $t1, $t1, 0xFFFF \n" // t1 = \|0\|0\|0\|b\|

	452 "subu.ph $t1, $t1, $t6 \n"

	453 "shll_s.ph $t1, $t1, 8 \n"

	454 "shra.ph $t1, $t1, 8 \n"

	455 "addu.ph $t1, $t1, $t6 \n" // 0\|Clamp(b)

	456 "subu.ph $t2, $t2, $t6 \n"

	457 "shll_s.ph $t2, $t2, 8 \n"

	458 "shra.ph $t2, $t2, 8 \n"

	459 "addu.ph $t2, $t2, $t6 \n" // Clamp(g)\|Clamp(r)

	460 "li $t0, 0xFF \n"

	461 "ins $t1, $t0, 16, 8 \n" // force the alpha byte to 0xFF

	462 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = \|a\|b\|g\|r\|

	463 "usw $t0, 0($t5) \n"

	464

	465 ".set pop \n"

	466 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),

	467 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),

	468 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)

	469 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),

	470 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)

	471 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",

	472 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"

	473 );

	474 }

	475 }

	476 #endif

	477 }

	478

	// Non-template entry point for the vertical convolution. Picks the <true> or
	// <false> instantiation of the ConvolveVertically_mips_dspr2 template from
	// the runtime has_alpha flag and forwards the other five arguments unchanged.
	479 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,

	480 int filter_length,

	481 unsigned char* const* source_data_rows,

	482 int pixel_width,

	483 unsigned char* out_row,

	484 bool has_alpha) {

	485 if (has_alpha) {
	Justin Novosad 2013/05/23 15:32:03 This is the conditional branch I was talking about This is the conditional branch I was talking about in previous comment. It executes once per call, so there should be no win from templating, only extra complexity in the code. Teodora Novkovic 2013/05/24 16:02:20 Done. Show quoted text On 2013/05/23 15:32:03, junov wrote: > This is the conditional branch I was talking about in previous comment. It > executes once per call, so there should be no win from templating, only extra > complexity in the code. Done.
	486 ConvolveVertically_mips_dspr2<true>(filter_val,

	487 filter_length,

	488 source_data_rows,

	489 pixel_width,

	490 out_row);

	491 } else {

	492 ConvolveVertically_mips_dspr2<false>(filter_val,

	493 filter_length,

	494 source_data_rows,

	495 pixel_width,

	496 out_row);

	497 }

	498 }

	499

	// Non-template entry point for the horizontal convolution. Picks the <true>
	// or <false> instantiation of the ConvolveHorizontally_mips_dspr2 template
	// from the runtime has_alpha flag and forwards src_data, filter and out_row
	// unchanged.
	500 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,

	501 const ConvolutionFilter1D& filter,

	502 unsigned char* out_row,

	503 bool has_alpha) {

	504 if (has_alpha) {

	505 ConvolveHorizontally_mips_dspr2<true>(src_data,

	506 filter,

	507 out_row);

	508 } else {

	509 ConvolveHorizontally_mips_dspr2<false>(src_data,

	510 filter,

	511 out_row);

	512 }

	513 }

	514 } // namespace skia

OLD	NEW

« skia/ext/convolver_SSE2.cc ('K') | « skia/ext/convolver_mips_dspr2.h ('k') | skia/skia.gyp » ('j') | skia/skia.gyp » ('J')