// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "./dsp.h"
#include "./lossless.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#define APPROX_LOG_WITH_CORRECTION_MAX  65536
#define APPROX_LOG_MAX                   4096
#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086

static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}

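// Rough worked example of the approximation above (illustrative numbers, not
// taken from a trace): for v = 1500, clz gives 21, so log_cnt = 3, y = 8,
// temp = v >> 3 = 187 and v & (y - 1) = 4. The result is
// 1500 * (kLog2Table[187] + 3) + ((23 * 4) >> 4) ~= 15825.4, close to the
// exact 1500 * log2(1500) ~= 15826.1.
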
static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.

      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}

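// Rough worked example of the corrected path above (illustrative numbers):
// for v = 5000, clz gives 19, so log_cnt = 5, y = 32, temp = v >> 5 = 156 and
// v & (y - 1) = 8. Since v >= APPROX_LOG_MAX, the correction
// (23 * 8) >> 4 = 11 is added, giving kLog2Table[156] + 5 + 11.0 / 5000
// ~= 12.2876, close to the exact log2(5000) ~= 12.2877.
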
// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pop], %[LoopEnd], 2f \n\t"
    "1: \n\t"
    "lw %[temp0], 0(%[pop]) \n\t"
    "lw %[temp1], 4(%[pop]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addiu %[pop], %[pop], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp1] \n\t"
    "bne %[pop], %[LoopEnd], 1b \n\t"
    "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  return (double)((int64_t)temp0 << 32 | temp1);
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pX], %[LoopEnd], 2f \n\t"
    "1: \n\t"
    "lw %[temp0], 0(%[pX]) \n\t"
    "lw %[temp1], 0(%[pY]) \n\t"
    "lw %[temp2], 4(%[pX]) \n\t"
    "lw %[temp3], 4(%[pY]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addu %[temp0], %[temp0], %[temp1] \n\t"
    "addu %[temp2], %[temp2], %[temp3] \n\t"
    "addiu %[pX], %[pX], 8 \n\t"
    "addiu %[pY], %[pY], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp2] \n\t"
    "bne %[pX], %[LoopEnd], 1b \n\t"
    "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  return (double)((int64_t)temp0 << 32 | temp1);
}

#define HUFFMAN_COST_PASS                              \
  __asm__ volatile(                                    \
    "sll %[temp1], %[temp0], 3 \n\t"                   \
    "addiu %[temp3], %[streak], -3 \n\t"               \
    "addu %[temp2], %[pstreaks], %[temp1] \n\t"        \
    "blez %[temp3], 1f \n\t"                           \
    "srl %[temp1], %[temp1], 1 \n\t"                   \
    "addu %[temp3], %[pcnts], %[temp1] \n\t"           \
    "lw %[temp0], 4(%[temp2]) \n\t"                    \
    "lw %[temp1], 0(%[temp3]) \n\t"                    \
    "addu %[temp0], %[temp0], %[streak] \n\t"          \
    "addiu %[temp1], %[temp1], 1 \n\t"                 \
    "sw %[temp0], 4(%[temp2]) \n\t"                    \
    "sw %[temp1], 0(%[temp3]) \n\t"                    \
    "b 2f \n\t"                                        \
    "1: \n\t"                                          \
    "lw %[temp0], 0(%[temp2]) \n\t"                    \
    "addu %[temp0], %[temp0], %[streak] \n\t"          \
    "sw %[temp0], 0(%[temp2]) \n\t"                    \
    "2: \n\t"                                          \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),        \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)          \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),      \
      [streak]"r"(streak)                              \
    : "memory"                                         \
  );

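// Plain C equivalent of HUFFMAN_COST_PASS (a sketch for reference only),
// where t stands for the value left in 'temp0', i.e. whether the current
// count is non-zero:
//   stats.counts[t] += (streak > 3);
//   stats.streaks[t][(streak > 3)] += streak;
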
// Returns the various RLE counts
static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
  int i;
  int streak = 0;
  VP8LStreaks stats;
  int* const pstreaks = &stats.streaks[0][0];
  int* const pcnts = &stats.counts[0];
  int temp0, temp1, temp2, temp3;
  memset(&stats, 0, sizeof(stats));
  for (i = 0; i < length - 1; ++i) {
    ++streak;
    if (population[i] == population[i + 1]) {
      continue;
    }
    temp0 = (population[i] != 0);
    HUFFMAN_COST_PASS
    streak = 0;
  }
  ++streak;
  temp0 = (population[i] != 0);
  HUFFMAN_COST_PASS

  return stats;
}

static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
                                            const uint32_t* Y, int length) {
  int i;
  int streak = 0;
  VP8LStreaks stats;
  int* const pstreaks = &stats.streaks[0][0];
  int* const pcnts = &stats.counts[0];
  int temp0, temp1, temp2, temp3;
  memset(&stats, 0, sizeof(stats));
  for (i = 0; i < length - 1; ++i) {
    const uint32_t xy = X[i] + Y[i];
    const uint32_t xy_next = X[i + 1] + Y[i + 1];
    ++streak;
    if (xy == xy_next) {
      continue;
    }
    temp0 = (xy != 0);
    HUFFMAN_COST_PASS
    streak = 0;
  }
  {
    const uint32_t xy = X[i] + Y[i];
    ++streak;
    temp0 = (xy != 0);
    HUFFMAN_COST_PASS
  }

  return stats;
}

#define ASM_START            \
  __asm__ volatile(          \
    ".set push \n\t"         \
    ".set at \n\t"           \
    ".set macro \n\t"        \
    "1: \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
//     if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)          \
    "ulw %[temp0], "#A"(%["#P0"]) \n\t"                \
    "ulw %[temp1], "#B"(%["#P0"]) \n\t"                \
    "ulw %[temp2], "#C"(%["#P0"]) \n\t"                \
    "ulw %[temp3], "#D"(%["#P0"]) \n\t"                \
    "ulw %[temp4], "#A"(%["#P1"]) \n\t"                \
    "ulw %[temp5], "#B"(%["#P1"]) \n\t"                \
    "ulw %[temp6], "#C"(%["#P1"]) \n\t"                \
    "ulw %[temp7], "#D"(%["#P1"]) \n\t"                \
    "addu %[temp4], %[temp4], %[temp0] \n\t"           \
    "addu %[temp5], %[temp5], %[temp1] \n\t"           \
    "addu %[temp6], %[temp6], %[temp2] \n\t"           \
    "addu %[temp7], %[temp7], %[temp3] \n\t"           \
    "addiu %["#P0"], %["#P0"], 16 \n\t"                \
    ".if "#E" == 1 \n\t"                               \
    "addiu %["#P1"], %["#P1"], 16 \n\t"                \
    ".endif \n\t"                                      \
    "usw %[temp4], "#A"(%["#P2"]) \n\t"                \
    "usw %[temp5], "#B"(%["#P2"]) \n\t"                \
    "usw %[temp6], "#C"(%["#P2"]) \n\t"                \
    "usw %[temp7], "#D"(%["#P2"]) \n\t"                \
    "addiu %["#P2"], %["#P2"], 16 \n\t"                \
    "bne %["#P0"], %[LoopEnd], 1b \n\t"                \
    ".set pop \n\t"                                    \

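// Plain C outline of one ADD_TO_OUT step with offsets 0/4/8/12 (a sketch for
// reference only): four words are loaded from P0 and P1, summed pairwise and
// stored to P2, roughly
//   p2[0..3] = p0[0..3] + p1[0..3]; p0 += 4; p2 += 4; (p1 += 4 when E == 1)
// with ulw/usw used because the histogram arrays may not be word-aligned.
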
#define ASM_END_COMMON_0                          \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),   \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),   \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),   \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),   \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                          \
    : [LoopEnd]"r"(LoopEnd)                       \
    : "memory", "at"                              \
  );

#define ASM_END_0                                 \
    ASM_END_COMMON_0                              \
    , [pb]"+r"(pb)                                \
    ASM_END_COMMON_1

#define ASM_END_1                                 \
    ASM_END_COMMON_0                              \
    ASM_END_COMMON_1

#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do {        \
    const uint32_t* pa = (const uint32_t*)(A);              \
    const uint32_t* pb = (const uint32_t*)(B);              \
    uint32_t* pout = (uint32_t*)(OUT);                      \
    const uint32_t* const LoopEnd = pa + (SIZE);            \
    assert((SIZE) % 4 == 0);                                \
    ASM_START                                               \
    ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)                \
    ASM_END_0                                               \
    if ((EXTRA_SIZE) > 0) {                                 \
      const int last = (EXTRA_SIZE);                        \
      int i;                                                \
      for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i];   \
    }                                                       \
  } while (0)

#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do {        \
    const uint32_t* pa = (const uint32_t*)(A);              \
    uint32_t* pout = (uint32_t*)(OUT);                      \
    const uint32_t* const LoopEnd = pa + (SIZE);            \
    assert((SIZE) % 4 == 0);                                \
    ASM_START                                               \
    ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)              \
    ASM_END_1                                               \
    if ((EXTRA_SIZE) > 0) {                                 \
      const int last = (EXTRA_SIZE);                        \
      int i;                                                \
      for (i = 0; i < last; ++i) pout[i] += pa[i];          \
    }                                                       \
  } while (0)

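// In plain C, the two macros above amount to (a sketch for reference only):
//   ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE):
//     for (i = 0; i < (SIZE) + (EXTRA_SIZE); ++i) OUT[i] = A[i] + B[i];
//   ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE):
//     for (i = 0; i < (SIZE) + (EXTRA_SIZE); ++i) OUT[i] += A[i];
// with the first SIZE elements handled four at a time in assembly and the
// EXTRA_SIZE tail handled by the trailing scalar loop.
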
static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}

#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

#endif  // WEBP_USE_MIPS32

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitMIPS32(void);

void VP8LDspInitMIPS32(void) {
#if defined(WEBP_USE_MIPS32)
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LHuffmanCostCount = HuffmanCostCount;
  VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
  VP8LHistogramAdd = HistogramAdd;
#endif  // WEBP_USE_MIPS32
}