OLD | NEW |
(Empty) | |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. |
| 2 // |
| 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- |
| 9 // |
| 10 // MIPS version of dsp functions |
| 11 // |
| 12 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) |
| 13 // Jovan Zelincevic (jovan.zelincevic@imgtec.com) |
| 14 |
| 15 #include "./dsp.h" |
| 16 |
| 17 #if defined(WEBP_USE_MIPS_DSP_R2) |
| 18 |
| 19 #include "./mips_macro.h" |
| 20 |
// Fixed-point constants of the VP8 inverse transform, used through MUL().
// NOTE(review): presumably sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) in 16-bit
// fixed point, as in the reference transform — confirm against the C version.
// kC1 has (1 << 16) folded in so that MUL(x, kC1) yields x + ((x*20091) >> 16).
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

// 16.16 fixed-point multiply: keep the high 16 bits of the 32-bit product.
#define MUL(a, b) (((a) * (b)) >> 16)
| 25 |
// Inverse transform of a DC-only coefficient block: adds the rounded DC
// value ((in[0] + 4) >> 3) to every pixel of the 4x4 block at 'dst', with
// saturation. Destination rows are BPS bytes apart.
static void TransformDC(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;

  __asm__ volatile (
    // Load the four 4-byte destination rows at dst + {0,1,2,3} * BPS.
    LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    "lh               %[temp5],  0(%[in])               \n\t"
    "addiu            %[temp5],  %[temp5],  4           \n\t"  // rounding bias
    "ins              %[temp5],  %[temp5],  16, 16      \n\t"  // duplicate DC in both halves
    "shra.ph          %[temp5],  %[temp5],  3           \n\t"  // (DC + 4) >> 3 per half
    // Widen the destination bytes to halfwords, add the DC pair to each,
    // saturate back to bytes and store the four rows.
    CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
                            temp3, temp1, temp2, temp3, temp4)
    STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
                     temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_10()
    : [in]"r"(in), [dst]"r"(dst)
    : "memory"
  );
}
| 49 |
// Inverse transform for a sparse coefficient block where only in[0] (DC),
// in[1] (first AC of the top row) and in[4] (first AC of the left column)
// are non-zero. The row/column contributions c1/d1 and c4/d4 are computed
// in C with MUL() and combined per-pixel in DSP-r2 paired-halfword asm,
// then added to the 4x4 block at 'dst' with saturation.
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int a = in[0] + 4;            // biased DC (rounding for the >>3)
  int c4 = MUL(in[4], kC2);
  const int d4 = MUL(in[4], kC1);
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    "ins              %[c4],      %[d4],     16,       16    \n\t"  // pack (d4, c4)
    "replv.ph         %[temp1],   %[a]                       \n\t"
    "replv.ph         %[temp4],   %[d1]                      \n\t"
    ADD_SUB_HALVES(temp2, temp3, temp1, c4)                         // a +/- {c4,d4}
    "replv.ph         %[temp5],   %[c1]                      \n\t"
    SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
                   temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
    // Load destination rows, widen to halfwords, add the reconstructed
    // residuals, saturate and store back.
    LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
                            temp11, temp17, temp3, temp5, temp11, temp12)
    PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
                          temp4, temp7, temp6, temp10, temp9)
    STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
                     temp17, temp12, temp18, temp1, temp8, temp2, temp4,
                     temp7, temp6, dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18(),
      [c4]"+&r"(c4)
    : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
    : "memory"
  );
}
| 85 |
// Full 4x4 inverse transform of the 16 coefficients at 'in': vertical pass,
// then horizontal pass (with the +4 rounding bias), then the result is added
// to the 4x4 pixel block at 'dst' with saturation. The MUL_SHIFT_SUM macro
// consumes the kC1/kC2 fixed-point constants (hence the "hi"/"lo" clobbers).
static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;

  __asm__ volatile (
    // vertical pass: rows of 'in' are 8 bytes (4 coefficients) apart;
    // butterflies via ADD_SUB_HALVES, rotations via MUL_SHIFT_SUM.
    "ulw              %[temp1],   0(%[in])                 \n\t"
    "ulw              %[temp2],   16(%[in])                \n\t"
    LOAD_IN_X2(temp5, temp6, 24, 26)
    ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    LOAD_IN_X2(temp1, temp2, 8, 10)
    MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
                  temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
                  temp13, temp11, temp14, temp12)
    INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    "ulw              %[temp17],  4(%[in])                 \n\t"
    "ulw              %[temp18],  20(%[in])                \n\t"
    ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    LOAD_IN_X2(temp17, temp18, 12, 14)
    LOAD_IN_X2(temp9, temp10, 28, 30)
    MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
                  temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
                  temp15, temp4, temp16, temp17)
    INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    ADD_SUB_HALVES(temp3, temp4, temp7, temp12)

    // horizontal pass: add the rounding bias of 4, redo the butterflies,
    // then shift right and add the result to the destination pixels.
    SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    "repl.ph          %[temp2],   0x4                      \n\t"  // rounding bias
    INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
                  temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
                  temp6, temp17, temp8, temp18)
    MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
                  temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
                  temp18, temp12, temp17, temp16)
    INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
                   temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
                   temp6)
    PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
                          temp16, temp11, temp10, temp15, temp14)
    LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
                        0, 0, 0, 0,
                        0, 1, 2, 3,
                        BPS)
    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
                            temp11, temp10, temp11, temp14, temp15)
    STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
                     temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
                     dst, 0, 1, 2, 3, BPS)

    OUTPUT_EARLY_CLOBBER_REGS_18()
    : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
    : "memory", "hi", "lo"
  );
}
| 152 |
| 153 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { |
| 154 TransformOne(in, dst); |
| 155 if (do_two) { |
| 156 TransformOne(in + 16, dst + 4); |
| 157 } |
| 158 } |
| 159 |
// Complex in-loop filter for macroblock edges. Walks 'size' pixel lines
// along the edge ('vstride' bytes apart). For each line it loads the eight
// pixels p3..q3 straddling the edge ('hstride' bytes apart, with 'p'
// pointing at q0), applies the thresh2/ithresh smoothness tests, then:
//   - high edge variance (label after beqz): 2-point filter, only the two
//     pixels p0/q0 next to the edge are rewritten;
//   - otherwise (label 4): strong 6-point filter, p2..q2 are rewritten.
// Label 3 skips to the next pixel line. VP8kclip1 is the clipping LUT.
// NOTE(review): register roles mirror the generic C DoFilter2/DoFilter6 —
// verify against the reference implementation before modifying.
static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  const int thresh2 = 2 * thresh + 1;   // filter-limit form of 'thresh'
  int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14, temp15;

  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    // per-pixel-line loop ----------------------------------------------
  "1:                                                    \n\t"
    "negu      %[temp1],  %[hstride]                     \n\t"
    "addiu     %[size],   %[size],        -1             \n\t"
    "sll       %[temp2],  %[hstride],     1              \n\t"
    "sll       %[temp3],  %[temp1],       1              \n\t"
    "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
    "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
    "lbu       %[temp7],  0(%[p])                        \n\t"  // q0
    "sll       %[temp6],  %[temp3],       1              \n\t"
    "lbux      %[temp8],  %[temp5](%[p])                 \n\t"  // p2
    "lbux      %[temp9],  %[temp3](%[p])                 \n\t"  // p1
    "lbux      %[temp10], %[temp1](%[p])                 \n\t"  // p0
    "lbux      %[temp11], %[temp6](%[p])                 \n\t"  // p3
    "lbux      %[temp12], %[hstride](%[p])               \n\t"  // q1
    "lbux      %[temp13], %[temp2](%[p])                 \n\t"  // q2
    "lbux      %[temp14], %[temp4](%[p])                 \n\t"  // q3
    "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"  // p0 - q0
    "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"  // p1 - q1
    "absq_s.w  %[temp3],  %[temp1]                       \n\t"
    "absq_s.w  %[temp4],  %[temp2]                       \n\t"
    "negu      %[temp1],  %[temp1]                       \n\t"
    "sll       %[temp3],  %[temp3],       2              \n\t"
    "addu      %[temp15], %[temp3],       %[temp4]       \n\t"  // 4*|p0-q0| + |p1-q1|
    "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
    "sll       %[temp6],  %[temp1],       1              \n\t"
    "bgtz      %[temp3],  3f                             \n\t"  // above limit: skip
    " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
    // inner-threshold tests on all neighboring deltas vs. ithresh -------
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "bgtz      %[temp4],  3f                             \n\t"
    " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
    "absq_s.w  %[temp3],  %[temp5]                       \n\t"
    "absq_s.w  %[temp5],  %[temp5]                       \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"  // hev(p1)
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
    "absq_s.w  %[temp3],  %[temp3]                       \n\t"
    "sra       %[temp4],  %[temp2],       24             \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
    "absq_s.w  %[temp3],  %[temp15]                      \n\t"
    "absq_s.w  %[temp15], %[temp15]                      \n\t"
    "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
    "bgtz      %[temp3],  3f                             \n\t"
    " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"  // hev(q1)
    "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"  // 3*(q0-p0)
    "or        %[temp2],  %[temp5],       %[temp15]      \n\t"  // any high edge variance?
    "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
    "beqz      %[temp2],  4f                             \n\t"  // no hev: strong filter
    " shra_r.w %[temp1],  %[temp5],       3              \n\t"
    // high edge variance: 2-point filter on p0/q0 only -----------------
    "addiu     %[temp2],  %[temp5],       3              \n\t"
    "sra       %[temp2],  %[temp2],       3              \n\t"
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
    "subu      %[temp3],  %[p],           %[hstride]     \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"  // q0 adjusted
    "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"  // p0 adjusted
    "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
    "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
    "sb        %[temp2],  0(%[temp3])                    \n\t"
    "j         3f                                        \n\t"
    " sb       %[temp1],  0(%[p])                        \n\t"
    // low variance: strong 6-point filter, p2..q2 all rewritten --------
  "4:                                                    \n\t"
    "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
    "subu      %[temp14], %[p],           %[hstride]     \n\t"
    "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
    "sra       %[temp6],  %[temp5],       24             \n\t"
    "sll       %[temp1],  %[temp6],       3              \n\t"
    "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
    "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"  // 9*w
    "sll       %[temp3],  %[temp2],       1              \n\t"  // 18*w
    "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"  // 27*w
    "addiu     %[temp2],  %[temp2],       63             \n\t"
    "addiu     %[temp3],  %[temp3],       63             \n\t"
    "addiu     %[temp4],  %[temp4],       63             \n\t"
    "sra       %[temp2],  %[temp2],       7              \n\t"  // (9*w  + 63) >> 7
    "sra       %[temp3],  %[temp3],       7              \n\t"  // (18*w + 63) >> 7
    "sra       %[temp4],  %[temp4],       7              \n\t"  // (27*w + 63) >> 7
    "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
    "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
    "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
    "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
    "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
    "addu      %[temp10], %[p],           %[hstride]     \n\t"
    "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
    "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
    "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
    "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
    "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
    "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
    "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
    "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
    "sb        %[temp2],  0(%[temp15])                   \n\t"  // p2
    "sb        %[temp3],  0(%[temp11])                   \n\t"  // p1
    "sb        %[temp4],  0(%[temp14])                   \n\t"  // p0
    "sb        %[temp5],  0(%[p])                        \n\t"  // q0
    "sb        %[temp6],  0(%[temp10])                   \n\t"  // q1
    "sb        %[temp8],  0(%[temp12])                   \n\t"  // q2
    // advance along the edge -------------------------------------------
  "3:                                                    \n\t"
    "bgtz      %[size],   1b                             \n\t"
    " addu     %[p],      %[p],           %[vstride]     \n\t"
    ".set      pop                                       \n\t"
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
      [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
      [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
      [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
      [size]"+&r"(size), [p]"+&r"(p)
    : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
      [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
      [VP8kclip1]"r"(VP8kclip1)
    : "memory"
  );
}
| 297 |
// Complex in-loop filter for inner edges. Same walk as FilterLoop26 ('p'
// points at q0, pixels across the edge are 'hstride' apart, lines along the
// edge 'vstride' apart), but after the threshold tests it applies either:
//   - 4-point filter (no high edge variance): p1, p0, q0, q1 are rewritten;
//   - 2-point filter (label 1, high edge variance): only p0/q0.
// Label 0 advances to the next pixel line; label 3 is the early exit for
// non-positive 'size'. VP8kclip1 is the clipping LUT.
// NOTE(review): mirrors the generic C DoFilter2/DoFilter4 — verify against
// the reference implementation before modifying.
static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                     int hstride, int vstride, int size,
                                     int thresh, int ithresh, int hev_thresh) {
  int p0, q0, p1, q1, p2, q2, p3, q3;
  int step1, step2, temp1, temp2, temp3, temp4;
  uint8_t* pTemp0;
  uint8_t* pTemp1;
  const int thresh2 = 2 * thresh + 1;   // filter-limit form of 'thresh'

  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "bltz      %[size],   3f                             \n\t"  // nothing to do
    " nop                                                \n\t"
    // per-pixel-line loop ----------------------------------------------
  "2:                                                    \n\t"
    "negu      %[step1],  %[hstride]                     \n\t"
    "lbu       %[q0],     0(%[p])                        \n\t"
    "lbux      %[p0],     %[step1](%[p])                 \n\t"
    "subu      %[step1],  %[step1],       %[hstride]     \n\t"
    "lbux      %[q1],     %[hstride](%[p])               \n\t"
    "subu      %[temp1],  %[p0],          %[q0]          \n\t"
    "lbux      %[p1],     %[step1](%[p])                 \n\t"
    "addu      %[step2],  %[hstride],     %[hstride]     \n\t"
    "absq_s.w  %[temp2],  %[temp1]                       \n\t"
    "subu      %[temp3],  %[p1],          %[q1]          \n\t"
    "absq_s.w  %[temp4],  %[temp3]                       \n\t"
    "sll       %[temp2],  %[temp2],       2              \n\t"
    "addu      %[temp2],  %[temp2],       %[temp4]       \n\t"  // 4*|p0-q0| + |p1-q1|
    "subu      %[temp4],  %[temp2],       %[thresh2]     \n\t"
    "subu      %[step1],  %[step1],       %[hstride]     \n\t"
    "bgtz      %[temp4],  0f                             \n\t"  // above limit: skip
    " lbux     %[p2],     %[step1](%[p])                 \n\t"
    "subu      %[step1],  %[step1],       %[hstride]     \n\t"
    "lbux      %[q2],     %[step2](%[p])                 \n\t"
    "lbux      %[p3],     %[step1](%[p])                 \n\t"
    // inner-threshold tests on all neighboring deltas vs. ithresh -------
    "subu      %[temp4],  %[p2],          %[p1]          \n\t"
    "addu      %[step2],  %[step2],       %[hstride]     \n\t"
    "subu      %[temp2],  %[p3],          %[p2]          \n\t"
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "absq_s.w  %[temp2],  %[temp2]                       \n\t"
    "lbux      %[q3],     %[step2](%[p])                 \n\t"
    "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "negu      %[temp1],  %[temp1]                       \n\t"  // q0 - p0
    "bgtz      %[temp4],  0f                             \n\t"
    " subu     %[temp2],  %[temp2],       %[ithresh]     \n\t"
    "subu      %[p3],     %[p1],          %[p0]          \n\t"
    "bgtz      %[temp2],  0f                             \n\t"
    " absq_s.w %[p3],     %[p3]                          \n\t"
    "subu      %[temp4],  %[q3],          %[q2]          \n\t"
    "subu      %[pTemp0], %[p],           %[hstride]     \n\t"  // &p0
    "absq_s.w  %[temp4],  %[temp4]                       \n\t"
    "subu      %[temp2],  %[p3],          %[ithresh]     \n\t"
    "sll       %[step1],  %[temp1],       1              \n\t"
    "bgtz      %[temp2],  0f                             \n\t"
    " subu     %[temp4],  %[temp4],       %[ithresh]     \n\t"
    "subu      %[temp2],  %[q2],          %[q1]          \n\t"
    "bgtz      %[temp4],  0f                             \n\t"
    " absq_s.w %[temp2],  %[temp2]                       \n\t"
    "subu      %[q3],     %[q1],          %[q0]          \n\t"
    "absq_s.w  %[q3],     %[q3]                          \n\t"
    "subu      %[temp2],  %[temp2],       %[ithresh]     \n\t"
    "addu      %[temp1],  %[temp1],       %[step1]       \n\t"  // 3*(q0-p0)
    "bgtz      %[temp2],  0f                             \n\t"
    " subu     %[temp4],  %[q3],          %[ithresh]     \n\t"
    "slt       %[p3],     %[hev_thresh],  %[p3]          \n\t"  // hev(p1)
    "bgtz      %[temp4],  0f                             \n\t"
    " slt      %[q3],     %[hev_thresh],  %[q3]          \n\t"  // hev(q1)
    "or        %[q3],     %[q3],          %[p3]          \n\t"
    "bgtz      %[q3],     1f                             \n\t"  // hev: 2-point filter
    " shra_r.w %[temp2],  %[temp1],       3              \n\t"
    // no high edge variance: 4-point filter, p1/p0/q0/q1 updated -------
    "addiu     %[temp1],  %[temp1],       3              \n\t"
    "sra       %[temp1],  %[temp1],       3              \n\t"
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
    "addu      %[pTemp1], %[p],           %[hstride]     \n\t"  // &q1
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "addiu     %[step1],  %[temp2],       1              \n\t"
    "sra       %[step1],  %[step1],       1              \n\t"  // (a + 1) >> 1 for p1/q1
    "addu      %[p0],     %[p0],          %[temp1]       \n\t"
    "addu      %[p1],     %[p1],          %[step1]       \n\t"
    "subu      %[q0],     %[q0],          %[temp2]       \n\t"
    "subu      %[q1],     %[q1],          %[step1]       \n\t"
    "lbux      %[temp2],  %[p0](%[VP8kclip1])            \n\t"
    "lbux      %[temp3],  %[q0](%[VP8kclip1])            \n\t"
    "lbux      %[temp4],  %[q1](%[VP8kclip1])            \n\t"
    "sb        %[temp2],  0(%[pTemp0])                   \n\t"
    "lbux      %[temp1],  %[p1](%[VP8kclip1])            \n\t"
    "subu      %[pTemp0], %[pTemp0],      %[hstride]     \n\t"  // &p1
    "sb        %[temp3],  0(%[p])                        \n\t"
    "sb        %[temp4],  0(%[pTemp1])                   \n\t"
    "j         0f                                        \n\t"
    " sb       %[temp1],  0(%[pTemp0])                   \n\t"
    // high edge variance: 2-point filter on p0/q0 only -----------------
  "1:                                                    \n\t"
    "shll_s.w  %[temp3],  %[temp3],       24             \n\t"  // clamp(p1 - q1)
    "sra       %[temp3],  %[temp3],       24             \n\t"
    "addu      %[temp1],  %[temp1],       %[temp3]       \n\t"  // a = 3*(q0-p0) + clamp(p1-q1)
    "shra_r.w  %[temp2],  %[temp1],       3              \n\t"
    "addiu     %[temp1],  %[temp1],       3              \n\t"
    "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
    "sra       %[temp1],  %[temp1],       3              \n\t"
    "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
    "sra       %[temp2],  %[temp2],       27             \n\t"
    "sra       %[temp1],  %[temp1],       27             \n\t"
    "addu      %[p0],     %[p0],          %[temp1]       \n\t"
    "subu      %[q0],     %[q0],          %[temp2]       \n\t"
    "lbux      %[temp1],  %[p0](%[VP8kclip1])            \n\t"
    "lbux      %[temp2],  %[q0](%[VP8kclip1])            \n\t"
    "sb        %[temp2],  0(%[p])                        \n\t"
    "sb        %[temp1],  0(%[pTemp0])                   \n\t"
    // advance along the edge -------------------------------------------
  "0:                                                    \n\t"
    "subu      %[size],   %[size],        1              \n\t"
    "bgtz      %[size],   2b                             \n\t"
    " addu     %[p],      %[p],           %[vstride]     \n\t"
  "3:                                                    \n\t"
    ".set      pop                                       \n\t"
    : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
      [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
      [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
      [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
      [size]"+&r"(size)
    : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
      [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
      [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
| 426 |
// on macroblock edges
// Vertical filter on the top edge of a 16-wide macroblock: 16 columns,
// samples within a column are 'stride' bytes apart.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
| 432 |
// Horizontal filter on the left edge of a 16-high macroblock: 16 rows
// ('stride' bytes apart), samples within a row are adjacent.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
| 437 |
// 8-pixels wide variant, for chroma filtering
// Vertical filter on the top edge of both 8x8 chroma planes 'u' and 'v'.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
| 444 |
// Horizontal filter on the left edge of both 8x8 chroma planes 'u' and 'v'.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
| 450 |
| 451 // on three inner edges |
| 452 static void VFilter16i(uint8_t* p, int stride, |
| 453 int thresh, int ithresh, int hev_thresh) { |
| 454 int k; |
| 455 for (k = 3; k > 0; --k) { |
| 456 p += 4 * stride; |
| 457 FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh); |
| 458 } |
| 459 } |
| 460 |
| 461 static void HFilter16i(uint8_t* p, int stride, |
| 462 int thresh, int ithresh, int hev_thresh) { |
| 463 int k; |
| 464 for (k = 3; k > 0; --k) { |
| 465 p += 4; |
| 466 FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh); |
| 467 } |
| 468 } |
| 469 |
// Filters the single inner horizontal edge (below row 4) of both 8x8
// chroma planes.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
| 475 |
// Filters the single inner vertical edge (at column 4) of both 8x8
// chroma planes.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
| 481 |
| 482 #undef MUL |
| 483 |
| 484 //------------------------------------------------------------------------------ |
| 485 // Simple In-loop filtering (Paragraph 15.2) |
| 486 |
// Simple filter on a horizontal (macroblock-top) edge of a 16-wide block:
// for each of the 16 columns, the two pixels straddling the edge (p0 at
// p - stride, q0 at p) are adjusted when 4*|p0-q0| + |p1-q1| <= thresh2.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  uint8_t* p1 = p - stride;   // points at p0, the row above the edge
  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "li        %[i], 16                                  \n\t"  // 16 columns
  "0:                                                    \n\t"
    "negu      %[temp4],  %[stride]                      \n\t"
    "sll       %[temp5],  %[temp4],       1              \n\t"
    "lbu       %[temp2],  0(%[p])                        \n\t"  // q0
    "lbux      %[temp3],  %[stride](%[p])                \n\t"  // q1
    "lbux      %[temp1],  %[temp4](%[p])                 \n\t"  // p0
    "lbux      %[temp0],  %[temp5](%[p])                 \n\t"  // p1
    "subu      %[temp7],  %[temp1],       %[temp2]       \n\t"  // p0 - q0
    "subu      %[temp6],  %[temp0],       %[temp3]       \n\t"  // p1 - q1
    "absq_s.w  %[temp4],  %[temp7]                       \n\t"
    "absq_s.w  %[temp5],  %[temp6]                       \n\t"
    "sll       %[temp4],  %[temp4],       2              \n\t"
    "subu      %[temp5],  %[temp5],       %[thresh2]     \n\t"
    "addu      %[temp5],  %[temp4],       %[temp5]       \n\t"
    "negu      %[temp8],  %[temp7]                       \n\t"  // q0 - p0
    "bgtz      %[temp5],  1f                             \n\t"  // over limit: skip column
    " addiu    %[i],      %[i],           -1             \n\t"
    "sll       %[temp4],  %[temp8],       1              \n\t"
    "shll_s.w  %[temp5],  %[temp6],       24             \n\t"  // clamp(p1 - q1)
    "addu      %[temp3],  %[temp4],       %[temp8]       \n\t"  // 3*(q0 - p0)
    "sra       %[temp5],  %[temp5],       24             \n\t"
    "addu      %[temp3],  %[temp3],       %[temp5]       \n\t"  // a = 3*(q0-p0) + clamp(p1-q1)
    "addiu     %[temp7],  %[temp3],       3              \n\t"
    "sra       %[temp7],  %[temp7],       3              \n\t"  // (a + 3) >> 3
    "shra_r.w  %[temp8],  %[temp3],       3              \n\t"  // (a + 4) >> 3
    "shll_s.w  %[temp0],  %[temp7],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp4],  %[temp8],       27             \n\t"
    "sra       %[temp0],  %[temp0],       27             \n\t"
    "sra       %[temp4],  %[temp4],       27             \n\t"
    "addu      %[temp7],  %[temp1],       %[temp0]       \n\t"  // new p0
    "subu      %[temp2],  %[temp2],       %[temp4]       \n\t"  // new q0
    "lbux      %[temp3],  %[temp7](%[VP8kclip1])         \n\t"  // clip to [0,255]
    "lbux      %[temp4],  %[temp2](%[VP8kclip1])         \n\t"
    "sb        %[temp3],  0(%[p1])                       \n\t"
    "sb        %[temp4],  0(%[p])                        \n\t"
  "1:                                                    \n\t"
    "addiu     %[p1],     %[p1],          1              \n\t"  // next column
    "bgtz      %[i],      0b                             \n\t"
    " addiu    %[p],      %[p],           1              \n\t"
    " .set     pop                                       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
| 544 |
// Loads one byte each from four independent (offset, row) positions of SRC:
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
// The offsets must be textual constants: they are stringized straight into
// the 'lbu' addressing mode (BPS via XSTR).
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \
  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \
  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \
  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "])  \n\t"  \

// Simple filter on a vertical (macroblock-left) edge of a 16-high block:
// for each of the 16 rows ('stride' apart), the two pixels straddling the
// edge (p0 at p[-1], q0 at p[0]) are adjusted when
// 4*|p0-q0| + |p1-q1| <= thresh2.
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  const int thresh2 = 2 * thresh + 1;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  __asm__ volatile (
    ".set      push                                      \n\t"
    ".set      noreorder                                 \n\t"
    "li        %[i], 16                                  \n\t"  // 16 rows
  "0:                                                    \n\t"
    // p1 = p[-2], p0 = p[-1], q0 = p[0], q1 = p[1]
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
    "subu      %[temp7],  %[temp1],       %[temp2]       \n\t"  // p0 - q0
    "subu      %[temp6],  %[temp0],       %[temp3]       \n\t"  // p1 - q1
    "absq_s.w  %[temp4],  %[temp7]                       \n\t"
    "absq_s.w  %[temp5],  %[temp6]                       \n\t"
    "sll       %[temp4],  %[temp4],       2              \n\t"
    "addu      %[temp5],  %[temp4],       %[temp5]       \n\t"
    "subu      %[temp5],  %[temp5],       %[thresh2]     \n\t"
    "negu      %[temp8],  %[temp7]                       \n\t"  // q0 - p0
    "bgtz      %[temp5],  1f                             \n\t"  // over limit: skip row
    " addiu    %[i],      %[i],           -1             \n\t"
    "sll       %[temp4],  %[temp8],       1              \n\t"
    "shll_s.w  %[temp5],  %[temp6],       24             \n\t"  // clamp(p1 - q1)
    "addu      %[temp3],  %[temp4],       %[temp8]       \n\t"  // 3*(q0 - p0)
    "sra       %[temp5],  %[temp5],       24             \n\t"
    "addu      %[temp3],  %[temp3],       %[temp5]       \n\t"  // a = 3*(q0-p0) + clamp(p1-q1)
    "addiu     %[temp7],  %[temp3],       3              \n\t"
    "sra       %[temp7],  %[temp7],       3              \n\t"  // (a + 3) >> 3
    "shra_r.w  %[temp8],  %[temp3],       3              \n\t"  // (a + 4) >> 3
    "shll_s.w  %[temp0],  %[temp7],       27             \n\t"  // saturate to 5 bits
    "shll_s.w  %[temp4],  %[temp8],       27             \n\t"
    "sra       %[temp0],  %[temp0],       27             \n\t"
    "sra       %[temp4],  %[temp4],       27             \n\t"
    "addu      %[temp7],  %[temp1],       %[temp0]       \n\t"  // new p0
    "subu      %[temp2],  %[temp2],       %[temp4]       \n\t"  // new q0
    "lbux      %[temp3],  %[temp7](%[VP8kclip1])         \n\t"  // clip to [0,255]
    "lbux      %[temp4],  %[temp2](%[VP8kclip1])         \n\t"
    "sb        %[temp3],  -1(%[p])                       \n\t"
    "sb        %[temp4],  0(%[p])                        \n\t"
  "1:                                                    \n\t"
    "bgtz      %[i],      0b                             \n\t"
    " addu     %[p],      %[p],           %[stride]      \n\t"  // next row
    ".set      pop                                       \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [p]"+&r"(p), [i]"=&r"(i)
    : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
    : "memory"
  );
}
| 606 |
| 607 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) { |
| 608 int k; |
| 609 for (k = 3; k > 0; --k) { |
| 610 p += 4 * stride; |
| 611 SimpleVFilter16(p, stride, thresh); |
| 612 } |
| 613 } |
| 614 |
| 615 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { |
| 616 int k; |
| 617 for (k = 3; k > 0; --k) { |
| 618 p += 4; |
| 619 SimpleHFilter16(p, stride, thresh); |
| 620 } |
| 621 } |
| 622 |
// Stores two unaligned 4-byte words (8 bytes total):
// DST[A * BPS]     = TEMP0 (4 bytes)
// DST[B + C * BPS] = TEMP1 (4 bytes)
// A, B, C must be textual constants (stringized into the address).
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                      \
  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])   \n\t"   \
  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
| 628 |
// VE4: vertical 4x4 intra prediction. The row above 'dst' (plus its two
// right neighbors) is smoothed with the (a + 2b + c + 2) >> 2 kernel and
// the resulting 4 bytes are replicated into all four destination rows.
static void VE4(uint8_t* dst) {    // vertical
  const uint8_t* top = dst - BPS;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  __asm__ volatile (
    "ulw             %[temp0],   -1(%[top])              \n\t"  // top[-1..2]
    "ulh             %[temp1],   3(%[top])               \n\t"  // top[3..4]
    "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"  // widen bytes to halves
    "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
    "packrl.ph       %[temp5],   %[temp3],   %[temp2]    \n\t"  // middle samples
    "packrl.ph       %[temp6],   %[temp4],   %[temp3]    \n\t"
    "shll.ph         %[temp5],   %[temp5],   1           \n\t"  // 2 * middle
    "shll.ph         %[temp6],   %[temp6],   1           \n\t"
    "addq.ph         %[temp2],   %[temp5],   %[temp2]    \n\t"
    "addq.ph         %[temp6],   %[temp6],   %[temp4]    \n\t"
    "addq.ph         %[temp2],   %[temp2],   %[temp3]    \n\t"
    "addq.ph         %[temp6],   %[temp6],   %[temp3]    \n\t"
    "shra_r.ph       %[temp2],   %[temp2],   2           \n\t"  // (sum + 2) >> 2
    "shra_r.ph       %[temp6],   %[temp6],   2           \n\t"
    "precr.qb.ph     %[temp4],   %[temp6],   %[temp2]    \n\t"  // repack to 4 bytes
    // replicate the smoothed row into all four destination rows.
    STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
    STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6)
    : [top]"r"(top), [dst]"r"(dst)
    : "memory"
  );
}
| 658 |
// DC4: 4x4 DC intra prediction. Fills the block with the rounded average
// of the 4 pixels above and the 4 pixels to the left: (sum + 4) >> 3.
static void DC4(uint8_t* dst) {   // DC
  int temp0, temp1, temp2, temp3, temp4;
  __asm__ volatile (
    "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"  // 4 top pixels
    // 4 left pixels, one per row.
    LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    "ins          %[temp1],   %[temp2],  8,  8         \n\t"  // pack left pixels
    "ins          %[temp1],   %[temp3],  16, 8         \n\t"
    "ins          %[temp1],   %[temp4],  24, 8         \n\t"
    "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"  // sum of top bytes
    "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"  // sum of left bytes
    "addu         %[temp0],   %[temp0],   %[temp1]     \n\t"
    "shra_r.w     %[temp0],   %[temp0],   3            \n\t"  // (sum + 4) >> 3
    "replv.qb     %[temp0],   %[temp0]                 \n\t"  // broadcast to 4 bytes
    STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 680 |
// RD4: down-right 4x4 intra prediction. Each down-right diagonal of the
// block gets one value, derived from the left column, the above-left
// corner and the top row with the (a + 2b + c + 2) >> 2 smoothing kernel.
static void RD4(uint8_t* dst) {   // Down-right
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // Left column (one pixel per row) and the 4 pixels above-left.
    LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    "ulw              %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
    // Build pairs of vertically adjacent left pixels.
    "ins              %[temp1],   %[temp0], 16, 16               \n\t"
    "preceu.ph.qbr    %[temp5],   %[temp7]                       \n\t"
    "ins              %[temp2],   %[temp1], 16, 16               \n\t"
    "preceu.ph.qbl    %[temp4],   %[temp7]                       \n\t"
    "ins              %[temp3],   %[temp2], 16, 16               \n\t"
    "shll.ph          %[temp2],   %[temp2], 1                    \n\t"
    "addq.ph          %[temp3],   %[temp3], %[temp1]             \n\t"
    "packrl.ph        %[temp6],   %[temp5], %[temp1]             \n\t"
    "addq.ph          %[temp3],   %[temp3], %[temp2]             \n\t"
    "addq.ph          %[temp1],   %[temp1], %[temp5]             \n\t"
    "shll.ph          %[temp6],   %[temp6], 1                    \n\t"
    "addq.ph          %[temp1],   %[temp1], %[temp6]             \n\t"
    "packrl.ph        %[temp0],   %[temp4], %[temp5]             \n\t"
    "addq.ph          %[temp8],   %[temp5], %[temp4]             \n\t"
    "shra_r.ph        %[temp3],   %[temp3], 2                    \n\t"  // (sum + 2) >> 2
    "shll.ph          %[temp0],   %[temp0], 1                    \n\t"
    "shra_r.ph        %[temp1],   %[temp1], 2                    \n\t"
    "addq.ph          %[temp8],   %[temp0], %[temp8]             \n\t"
    "lbu              %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
    "precrq.ph.w      %[temp7],   %[temp7], %[temp7]             \n\t"
    "shra_r.ph        %[temp8],   %[temp8], 2                    \n\t"
    "ins              %[temp7],   %[temp5], 0, 8                 \n\t"
    "precr.qb.ph      %[temp2],   %[temp1], %[temp3]             \n\t"
    "raddu.w.qb       %[temp4],   %[temp7]                       \n\t"
    "precr.qb.ph      %[temp6],   %[temp8], %[temp1]             \n\t"
    "shra_r.w         %[temp4],   %[temp4], 2                    \n\t"
    // Bottom two rows, then shift bytes in ('prepend') for the top rows.
    STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
    "prepend          %[temp2],   %[temp8], 8                    \n\t"
    "prepend          %[temp6],   %[temp4], 8                    \n\t"
    STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 724 |
// Loads two unaligned 4-byte words (8 bytes total):
// TEMP0 = SRC[A * BPS]     (4 bytes)
// TEMP1 = SRC[B + C * BPS] (4 bytes)
// A, B, C must be textual constants (stringized into the address).
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                       \
  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])   \n\t"   \
  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
| 730 |
// LD4: down-left 4x4 intra prediction. Each down-left diagonal of the
// block gets one value, derived from the 8 pixels above (top[-1..4] and
// top[4..7], loaded relative to dst) with the (a + 2b + c + 2) >> 2
// smoothing kernel.
static void LD4(uint8_t* dst) {   // Down-Left
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    // temp0 = 4 bytes at dst - BPS - 1, temp1 = 4 bytes at dst + 4 - BPS.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    "preceu.ph.qbl   %[temp2],    %[temp0]               \n\t"  // widen bytes to halves
    "preceu.ph.qbr   %[temp3],    %[temp0]               \n\t"
    "preceu.ph.qbr   %[temp4],    %[temp1]               \n\t"
    "preceu.ph.qbl   %[temp5],    %[temp1]               \n\t"
    "packrl.ph       %[temp6],    %[temp2],   %[temp3]   \n\t"  // middle samples
    "packrl.ph       %[temp7],    %[temp4],   %[temp2]   \n\t"
    "packrl.ph       %[temp8],    %[temp5],   %[temp4]   \n\t"
    "shll.ph         %[temp6],    %[temp6],   1          \n\t"  // 2 * middle
    "addq.ph         %[temp9],    %[temp2],   %[temp6]   \n\t"
    "shll.ph         %[temp7],    %[temp7],   1          \n\t"
    "addq.ph         %[temp9],    %[temp9],   %[temp3]   \n\t"
    "shll.ph         %[temp8],    %[temp8],   1          \n\t"
    "shra_r.ph       %[temp9],    %[temp9],   2          \n\t"  // (sum + 2) >> 2
    "addq.ph         %[temp3],    %[temp4],   %[temp7]   \n\t"
    "addq.ph         %[temp0],    %[temp5],   %[temp8]   \n\t"
    "addq.ph         %[temp3],    %[temp3],   %[temp2]   \n\t"
    "addq.ph         %[temp0],    %[temp0],   %[temp4]   \n\t"
    "shra_r.ph       %[temp3],    %[temp3],   2          \n\t"
    "shra_r.ph       %[temp0],    %[temp0],   2          \n\t"
    // Last diagonal value from the rightmost top pixel: (2*t + sum + 2) >> 2.
    "srl             %[temp1],    %[temp1],   24         \n\t"
    "sll             %[temp1],    %[temp1],   1          \n\t"
    "raddu.w.qb      %[temp5],    %[temp5]               \n\t"
    "precr.qb.ph     %[temp9],    %[temp3],   %[temp9]   \n\t"  // repack to bytes
    "precr.qb.ph     %[temp3],    %[temp0],   %[temp3]   \n\t"
    "addu            %[temp1],    %[temp1],   %[temp5]   \n\t"
    "shra_r.w        %[temp1],    %[temp1],   2          \n\t"
    // Rows 0 and 2, then shift bytes in ('prepend') for rows 1 and 3.
    STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
    "prepend         %[temp9],    %[temp0],   8          \n\t"
    "prepend         %[temp3],    %[temp1],   8          \n\t"
    STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 774 |
| 775 //------------------------------------------------------------------------------ |
| 776 // Chroma |
| 777 |
// DC8uv -- DC intra predictor for an 8x8 chroma block.
// Averages the 8 pixels above the block and the 8 pixels to its left
// ((sum + 8) >> 4 via the rounding shift shra_r.w 4), replicates the
// average into every byte lane, and stores 8 identical rows.
static void DC8uv(uint8_t* dst) {     // DC
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    // temp0/temp1 = the 8 top pixels; temp2..temp9 = the 8 left pixels.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // raddu.w.qb sums the four bytes of each word; the addu tree then
    // reduces all 16 samples into temp0.
    "raddu.w.qb      %[temp0],   %[temp0]              \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]              \n\t"
    "addu            %[temp2],   %[temp2],   %[temp3]  \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]  \n\t"
    "addu            %[temp8],   %[temp8],   %[temp9]  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]  \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp2]  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp6]  \n\t"
    // Rounded average of 16 samples, broadcast to all four byte lanes.
    "shra_r.w        %[temp0],   %[temp0],   4         \n\t"
    "replv.qb        %[temp0],   %[temp0]              \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 814 |
// DC8uvNoLeft -- 8x8 chroma DC predictor when no left samples exist.
// Averages only the 8 pixels above the block ((sum + 4) >> 3 via
// shra_r.w 3), replicates the value into all byte lanes and stores 8 rows.
static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  int temp0, temp1;
  __asm__ volatile (
    // temp0/temp1 = the 8 top pixels (two unaligned word loads).
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    "raddu.w.qb      %[temp0],   %[temp0]              \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]              \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]  \n\t"
    // Rounded average of 8 samples, broadcast to every byte.
    "shra_r.w        %[temp0],   %[temp0],   3         \n\t"
    "replv.qb        %[temp0],   %[temp0]              \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 837 |
// DC8uvNoTop -- 8x8 chroma DC predictor when no top samples exist.
// Averages only the 8 pixels to the left of the block ((sum + 4) >> 3 via
// shra_r.w 3), replicates the value into all byte lanes and stores 8 rows.
static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // temp2..temp8, temp1 = the 8 left-column pixels (offset -1 per row).
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // Reduction tree summing the eight samples into temp0.
    "addu            %[temp2],   %[temp2],   %[temp3]  \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]  \n\t"
    "addu            %[temp8],   %[temp8],   %[temp1]  \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]  \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]  \n\t"
    "addu            %[temp0],   %[temp6],   %[temp2]  \n\t"
    // Rounded average of 8 samples, broadcast to every byte.
    "shra_r.w        %[temp0],   %[temp0],   3         \n\t"
    "replv.qb        %[temp0],   %[temp0]              \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [dst]"r"(dst)
    : "memory"
  );
}
| 868 |
| 869 #undef LOAD_8_BYTES |
| 870 #undef STORE_8_BYTES |
| 871 #undef LOAD_4_BYTES |
| 872 |
// CLIPPING(SIZE) -- add-and-clamp kernel used by the TrueMotion predictors.
// Zero-extends SIZE bytes held in temp0 (and temp1 when SIZE == 8) to
// halfwords, adds the per-row offset in dst_1 (same value in both halfword
// lanes), then clamps every result to [0, 255]: shll_s.ph saturates to the
// signed 16-bit range and precrqu_s.qb.ph packs the high bytes with unsigned
// saturation. SIZE is resolved at assembly time by the '.if' directives, so
// only the 4- or 8-byte instruction sequence is actually emitted.
#define CLIPPING(SIZE)                                                         \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
".endif                                                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
".endif                                                  \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
".endif                                                  \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
".endif                                                  \n\t"
| 896 |
| 897 |
// CLIP_8B_TO_DST(DST, TOP, SIZE) -- one row of TrueMotion prediction:
//   DST[x] = clamp(TOP[x] + DST[-1] - top[-1]),  x = 0 .. SIZE-1
// dst_1 is the row's left pixel replicated into both halfword lanes;
// 'subu.ph' folds in -top_1 before CLIPPING adds the combined offset.
// NOTE: top_1 (top[-1] replicated the same way) is not a macro parameter --
// it is captured from the enclosing CLIP_TO_DST expansion's scope.
// The assembler-level '.if' directives pick the 4-, 8- or 16-byte path.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
  ".if " #SIZE " < 8                                     \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],   %[top_1]    \n\t"                 \
    CLIPPING(4)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
  ".else                                                 \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],   %[top_1]    \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
  ".endif                                                \n\t"                 \
  : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),             \
    [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                   \
  : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                        \
  : "memory"                                                                   \
  );                                                                           \
} while (0)
| 928 |
// CLIP_TO_DST(DST, SIZE) -- full TrueMotion block: applies CLIP_8B_TO_DST
// to each of the SIZE rows, advancing DST by one stride (BPS) per row.
// top_1 (top[-1] replicated into both halfword lanes) is computed once here
// and used by every CLIP_8B_TO_DST expansion below.
#define CLIP_TO_DST(DST, SIZE) do {                                            \
  int y;                                                                       \
  const uint8_t* top = (DST) - BPS;                                            \
  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
| 938 |
// TRUE_MOTION(DST, SIZE) -- generates a TrueMotion##SIZE() predictor whose
// whole body is a CLIP_TO_DST() expansion over a SIZE x SIZE block.
#define TRUE_MOTION(DST, SIZE)                                                 \
static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
  CLIP_TO_DST((DST), (SIZE));                                                  \
}

// Instantiate the 4x4, 8x8 and 16x16 TrueMotion predictors.
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)
| 947 |
| 948 #undef TRUE_MOTION |
| 949 #undef CLIP_TO_DST |
| 950 #undef CLIP_8B_TO_DST |
| 951 #undef CLIPPING |
| 952 |
| 953 //------------------------------------------------------------------------------ |
| 954 // Entry point |
| 955 |
extern void VP8DspInitMIPSdspR2(void);

// Installs the MIPS DSP r2 implementations into the shared VP8 decoder
// function-pointer tables. Called once from the dispatch layer when this
// CPU extension is available.
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
  // Inverse transforms.
  VP8TransformDC = TransformDC;
  VP8TransformAC3 = TransformAC3;
  VP8Transform = TransformTwo;

  // Loop filters: normal (edge + inner) and simple variants.
  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
  VP8HFilter8 = HFilter8;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8i = HFilter8i;
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  // Intra predictors. Only a subset of table slots is overridden here;
  // NOTE(review): unset indices presumably keep the generic implementations
  // installed earlier -- confirm against the table setup in dsp.c.
  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TrueMotion4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[6] = LD4;

  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TrueMotion8;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;

  VP8PredLuma16[1] = TrueMotion16;
}
| 989 |
| 990 #else // !WEBP_USE_MIPS_DSP_R2 |
| 991 |
| 992 WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2) |
| 993 |
| 994 #endif // WEBP_USE_MIPS_DSP_R2 |
OLD | NEW |