| OLD | NEW |
| (Empty) |
| 1 // Copyright 2014 Google Inc. All Rights Reserved. | |
| 2 // | |
| 3 // Use of this source code is governed by a BSD-style license | |
| 4 // that can be found in the COPYING file in the root of the source | |
| 5 // tree. An additional intellectual property rights grant can be found | |
| 6 // in the file PATENTS. All contributing project authors may | |
| 7 // be found in the AUTHORS file in the root of the source tree. | |
| 8 // ----------------------------------------------------------------------------- | |
| 9 // | |
| 10 // MIPS version of lossless functions | |
| 11 // | |
| 12 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) | |
| 13 // Jovan Zelincevic (jovan.zelincevic@imgtec.com) | |
| 14 | |
| 15 #include "./dsp.h" | |
| 16 #include "./lossless.h" | |
| 17 | |
| 18 #if defined(WEBP_USE_MIPS32) | |
| 19 | |
| 20 #include <assert.h> | |
| 21 #include <math.h> | |
| 22 #include <stdlib.h> | |
| 23 #include <string.h> | |
| 24 | |
| 25 #define APPROX_LOG_WITH_CORRECTION_MAX 65536 | |
| 26 #define APPROX_LOG_MAX 4096 | |
| 27 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086 | |
| 28 | |
// Slow path for VP8LFastSLog2: returns v * log2(v) as a float.
// Only called for v >= LOG_LOOKUP_IDX_MAX (smaller values are served from a
// lookup table at the call site).
static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    // After the asm: y == 2^log_cnt and temp == v >> log_cnt, so temp < 256
    // and is a valid index into kLog2Table.
    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    // v * log2(v) ~ v * (log2(temp) + log_cnt) + v * log2(1 + frac),
    // where the last term is approximated by 'correction' (already scaled
    // by v, since v * (v % y) / v == (v % y)).
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    // For large v the linear correction is no longer accurate enough;
    // fall back to libm.  log2(v) = log(v) * (1 / log(2)).
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}
| 64 | |
// Slow path for VP8LFastLog2: returns log2(v) as a float.
// Only called for v >= LOG_LOOKUP_IDX_MAX (smaller values are served from a
// lookup table at the call site).
static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    // After the asm: y == 2^log_cnt and temp == v >> log_cnt, so temp < 256
    // and is a valid index into kLog2Table (same scheme as FastSLog2Slow).
    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.

      // correction ~ LOG_2_RECIPROCAL * (v % 2^log_cnt), with 23/16 as the
      // rational approximation of 1/log(2).
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    // Out of range of the correction scheme; fall back to libm.
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}
| 97 | |
// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
// NOTE(review): 'length' is consumed two entries per iteration, so it is
// presumably even and >= 4 -- confirm against callers.
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  // The 64-bit sum is accumulated in the hi/lo register pair via madd and
  // read back with mfhi/mflo into temp0 (hi) and temp1 (lo).
  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pop], %[LoopEnd], 2f \n\t"
    "1: \n\t"
    "lw %[temp0], 0(%[pop]) \n\t"
    "lw %[temp1], 4(%[pop]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addiu %[pop], %[pop], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp1] \n\t"
    "bne %[pop], %[LoopEnd], 1b \n\t"
    "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  // Reassemble the 64-bit accumulator.  Cast each half through uint32_t:
  // OR-ing the plain int 'temp1' would sign-extend it to 64 bits and set all
  // upper bits whenever the low word is >= 0x80000000, corrupting the result.
  return (double)(((uint64_t)(uint32_t)temp0 << 32) | (uint32_t)temp1);
}
| 138 | |
// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
// NOTE(review): 'length' is consumed two entries per iteration, so it is
// presumably even and >= 4 -- confirm against callers.
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  // The 64-bit sum is accumulated in the hi/lo register pair via madd and
  // read back with mfhi/mflo into temp0 (hi) and temp1 (lo).
  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pX], %[LoopEnd], 2f \n\t"
    "1: \n\t"
    "lw %[temp0], 0(%[pX]) \n\t"
    "lw %[temp1], 0(%[pY]) \n\t"
    "lw %[temp2], 4(%[pX]) \n\t"
    "lw %[temp3], 4(%[pY]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addu %[temp0], %[temp0], %[temp1] \n\t"
    "addu %[temp2], %[temp2], %[temp3] \n\t"
    "addiu %[pX], %[pX], 8 \n\t"
    "addiu %[pY], %[pY], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp2] \n\t"
    "bne %[pX], %[LoopEnd], 1b \n\t"
    "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  // Reassemble the 64-bit accumulator.  Cast each half through uint32_t:
  // OR-ing the plain int 'temp1' would sign-extend it to 64 bits and set all
  // upper bits whenever the low word is >= 0x80000000, corrupting the result.
  return (double)(((uint64_t)(uint32_t)temp0 << 32) | (uint32_t)temp1);
}
| 191 | |
// Folds the current run ('streak' consecutive equal values) into 'stats':
//   temp0 (in): 1 if the run's value is non-zero, 0 otherwise; selects the
//               row streaks[temp0][] and the counter counts[temp0].
//   if streak > 3:  streaks[temp0][1] += streak;  ++counts[temp0];
//   else:           streaks[temp0][0] += streak;
// Expects 'pstreaks', 'pcnts', 'streak' and temp0..temp3 in scope;
// temp0..temp3 are clobbered.
#define HUFFMAN_COST_PASS \
  __asm__ volatile( \
    "sll %[temp1], %[temp0], 3 \n\t" \
    "addiu %[temp3], %[streak], -3 \n\t" \
    "addu %[temp2], %[pstreaks], %[temp1] \n\t" \
    "blez %[temp3], 1f \n\t" \
    "srl %[temp1], %[temp1], 1 \n\t" \
    "addu %[temp3], %[pcnts], %[temp1] \n\t" \
    "lw %[temp0], 4(%[temp2]) \n\t" \
    "lw %[temp1], 0(%[temp3]) \n\t" \
    "addu %[temp0], %[temp0], %[streak] \n\t" \
    "addiu %[temp1], %[temp1], 1 \n\t" \
    "sw %[temp0], 4(%[temp2]) \n\t" \
    "sw %[temp1], 0(%[temp3]) \n\t" \
    "b 2f \n\t" \
    "1: \n\t" \
    "lw %[temp0], 0(%[temp2]) \n\t" \
    "addu %[temp0], %[temp0], %[streak] \n\t" \
    "sw %[temp0], 0(%[temp2]) \n\t" \
    "2: \n\t" \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
      [streak]"r"(streak) \
    : "memory" \
  );
| 218 | |
| 219 // Returns the various RLE counts | |
| 220 static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) { | |
| 221 int i; | |
| 222 int streak = 0; | |
| 223 VP8LStreaks stats; | |
| 224 int* const pstreaks = &stats.streaks[0][0]; | |
| 225 int* const pcnts = &stats.counts[0]; | |
| 226 int temp0, temp1, temp2, temp3; | |
| 227 memset(&stats, 0, sizeof(stats)); | |
| 228 for (i = 0; i < length - 1; ++i) { | |
| 229 ++streak; | |
| 230 if (population[i] == population[i + 1]) { | |
| 231 continue; | |
| 232 } | |
| 233 temp0 = (population[i] != 0); | |
| 234 HUFFMAN_COST_PASS | |
| 235 streak = 0; | |
| 236 } | |
| 237 ++streak; | |
| 238 temp0 = (population[i] != 0); | |
| 239 HUFFMAN_COST_PASS | |
| 240 | |
| 241 return stats; | |
| 242 } | |
| 243 | |
| 244 static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X, | |
| 245 const uint32_t* Y, int length) { | |
| 246 int i; | |
| 247 int streak = 0; | |
| 248 VP8LStreaks stats; | |
| 249 int* const pstreaks = &stats.streaks[0][0]; | |
| 250 int* const pcnts = &stats.counts[0]; | |
| 251 int temp0, temp1, temp2, temp3; | |
| 252 memset(&stats, 0, sizeof(stats)); | |
| 253 for (i = 0; i < length - 1; ++i) { | |
| 254 const uint32_t xy = X[i] + Y[i]; | |
| 255 const uint32_t xy_next = X[i + 1] + Y[i + 1]; | |
| 256 ++streak; | |
| 257 if (xy == xy_next) { | |
| 258 continue; | |
| 259 } | |
| 260 temp0 = (xy != 0); | |
| 261 HUFFMAN_COST_PASS | |
| 262 streak = 0; | |
| 263 } | |
| 264 { | |
| 265 const uint32_t xy = X[i] + Y[i]; | |
| 266 ++streak; | |
| 267 temp0 = (xy != 0); | |
| 268 HUFFMAN_COST_PASS | |
| 269 } | |
| 270 | |
| 271 return stats; | |
| 272 } | |
| 273 | |
// Opens the asm statement and the vector-add loop; paired with one
// ADD_TO_OUT body and one of the ASM_END_* closers below.
#define ASM_START \
  __asm__ volatile( \
    ".set push \n\t" \
    ".set at \n\t" \
    ".set macro \n\t" \
    "1: \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
//     if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
// Adds 4 uint32 elements per iteration (offsets A..D, i.e. 16 bytes) and
// loops until P0 reaches LoopEnd.  When E == 0, P1 aliases P2 (in-place
// accumulate) and only P0/P2 advance.
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
  "ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
  "ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
  "ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
  "ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
  "ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
  "ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
  "ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
  "ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
  "addu %[temp4], %[temp4], %[temp0] \n\t" \
  "addu %[temp5], %[temp5], %[temp1] \n\t" \
  "addu %[temp6], %[temp6], %[temp2] \n\t" \
  "addu %[temp7], %[temp7], %[temp3] \n\t" \
  "addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
  ".if " #E " == 1 \n\t" \
  "addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
  ".endif \n\t" \
  "usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
  "usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
  "usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
  "usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
  "addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
  "bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
  ".set pop \n\t" \

// Output operands shared by both closers: the eight temporaries plus the
// advanced source/destination pointers.
#define ASM_END_COMMON_0 \
  : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
    [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
    [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
    [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
    [pa]"+r"(pa), [pout]"+r"(pout)

// Input operands and clobbers; "at" is clobbered because ulw/usw expand to
// sequences using the assembler temporary register.
#define ASM_END_COMMON_1 \
  : [LoopEnd]"r"(LoopEnd) \
  : "memory", "at" \
  );

// Closer for the three-pointer form (pa, pb, pout).
#define ASM_END_0 \
  ASM_END_COMMON_0 \
  , [pb]"+r"(pb) \
  ASM_END_COMMON_1

// Closer for the two-pointer, in-place form (pa, pout).
#define ASM_END_1 \
  ASM_END_COMMON_0 \
  ASM_END_COMMON_1

// OUT[i] = A[i] + B[i] for SIZE elements (SIZE must be a multiple of 4),
// then a scalar loop for EXTRA_SIZE trailing elements.
#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \
  const uint32_t* pa = (const uint32_t*)(A); \
  const uint32_t* pb = (const uint32_t*)(B); \
  uint32_t* pout = (uint32_t*)(OUT); \
  const uint32_t* const LoopEnd = pa + (SIZE); \
  assert((SIZE) % 4 == 0); \
  ASM_START \
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \
  ASM_END_0 \
  if ((EXTRA_SIZE) > 0) { \
    const int last = (EXTRA_SIZE); \
    int i; \
    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
  } \
} while (0)

// OUT[i] += A[i] for SIZE elements (SIZE must be a multiple of 4),
// then a scalar loop for EXTRA_SIZE trailing elements.
#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \
  const uint32_t* pa = (const uint32_t*)(A); \
  uint32_t* pout = (uint32_t*)(OUT); \
  const uint32_t* const LoopEnd = pa + (SIZE); \
  assert((SIZE) % 4 == 0); \
  ASM_START \
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \
  ASM_END_1 \
  if ((EXTRA_SIZE) > 0) { \
    const int last = (EXTRA_SIZE); \
    int i; \
    for (i = 0; i < last; ++i) pout[i] += pa[i]; \
  } \
} while (0)
| 363 | |
// out = a + b, element-wise over all histogram arrays.  'out' may alias 'b',
// in which case the in-place (ADD_VECTOR_EQ) variant is used.
// Both histograms must use the same palette_code_bits_ (asserted).
static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  // literal_ has a variable-size color-cache tail; the fixed-size front is
  // processed by the asm loop and the tail by the macros' scalar loop.
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}
| 389 | |
| 390 #undef ADD_VECTOR_EQ | |
| 391 #undef ADD_VECTOR | |
| 392 #undef ASM_END_1 | |
| 393 #undef ASM_END_0 | |
| 394 #undef ASM_END_COMMON_1 | |
| 395 #undef ASM_END_COMMON_0 | |
| 396 #undef ADD_TO_OUT | |
| 397 #undef ASM_START | |
| 398 | |
| 399 #endif // WEBP_USE_MIPS32 | |
| 400 | |
| 401 //------------------------------------------------------------------------------ | |
| 402 // Entry point | |
| 403 | |
extern void VP8LDspInitMIPS32(void);

// Installs the MIPS32-optimized implementations into the VP8L dispatch
// pointers.  A no-op when WEBP_USE_MIPS32 is not defined.
void VP8LDspInitMIPS32(void) {
#if defined(WEBP_USE_MIPS32)
  // Histogram / cost-estimation hooks (order of assignment is irrelevant).
  VP8LHistogramAdd = HistogramAdd;
  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
  VP8LHuffmanCostCount = HuffmanCostCount;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LExtraCost = ExtraCost;
  // Logarithm slow paths.
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LFastSLog2Slow = FastSLog2Slow;
#endif  // WEBP_USE_MIPS32
}
| OLD | NEW |