OLD | NEW |
1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // This code is licensed under the same terms as WebM: | 3 // This code is licensed under the same terms as WebM: |
4 // Software License Agreement: http://www.webmproject.org/license/software/ | 4 // Software License Agreement: http://www.webmproject.org/license/software/ |
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ | 5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ |
6 // ----------------------------------------------------------------------------- | 6 // ----------------------------------------------------------------------------- |
7 // | 7 // |
8 // Speed-critical encoding functions. | 8 // Speed-critical encoding functions. |
9 // | 9 // |
10 // Author: Skal (pascal.massimino@gmail.com) | 10 // Author: Skal (pascal.massimino@gmail.com) |
11 | 11 |
12 #include <stdlib.h> // for abs() | 12 #include <stdlib.h> // for abs() |
13 #include "./dsp.h" | 13 #include "./dsp.h" |
14 #include "../enc/vp8enci.h" | 14 #include "../enc/vp8enci.h" |
15 | 15 |
16 #if defined(__cplusplus) || defined(c_plusplus) | 16 #if defined(__cplusplus) || defined(c_plusplus) |
17 extern "C" { | 17 extern "C" { |
18 #endif | 18 #endif |
19 | 19 |
| 20 static WEBP_INLINE uint8_t clip_8b(int v) { |
| 21 return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; |
| 22 } |
| 23 |
| 24 static WEBP_INLINE int clip_max(int v, int max) { |
| 25 return (v > max) ? max : v; |
| 26 } |
| 27 |
20 //------------------------------------------------------------------------------ | 28 //------------------------------------------------------------------------------ |
21 // Compute susceptibility based on DCT-coeff histograms: | 29 // Compute susceptibility based on DCT-coeff histograms: |
22 // the higher, the "easier" the macroblock is to compress. | 30 // the higher, the "easier" the macroblock is to compress. |
23 | 31 |
24 static int ClipAlpha(int alpha) { | |
25 return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha; | |
26 } | |
27 | |
28 int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) { | |
29 int num = 0, den = 0, val = 0; | |
30 int k; | |
31 int alpha; | |
32 // note: changing this loop to avoid the numerous "k + 1" slows things down. | |
33 for (k = 0; k < MAX_COEFF_THRESH; ++k) { | |
34 if (histo[k + 1]) { | |
35 val += histo[k + 1]; | |
36 num += val * (k + 1); | |
37 den += (k + 1) * (k + 1); | |
38 } | |
39 } | |
40 // we scale the value to a usable [0..255] range | |
41 alpha = den ? 10 * num / den - 5 : 0; | |
42 return ClipAlpha(alpha); | |
43 } | |
44 | |
45 const int VP8DspScan[16 + 4 + 4] = { | 32 const int VP8DspScan[16 + 4 + 4] = { |
46 // Luma | 33 // Luma |
47 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, | 34 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, |
48 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, | 35 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, |
49 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, | 36 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, |
50 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, | 37 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, |
51 | 38 |
52 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U | 39 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U |
53 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V | 40 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V |
54 }; | 41 }; |
55 | 42 |
56 static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, | 43 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, |
57 int start_block, int end_block) { | 44 int start_block, int end_block, |
58 int histo[MAX_COEFF_THRESH + 1] = { 0 }; | 45 VP8Histogram* const histo) { |
59 int16_t out[16]; | 46 int j; |
60 int j, k; | |
61 for (j = start_block; j < end_block; ++j) { | 47 for (j = start_block; j < end_block; ++j) { |
| 48 int k; |
| 49 int16_t out[16]; |
| 50 |
62 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); | 51 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); |
63 | 52 |
64 // Convert coefficients to bin (within out[]). | 53 // Convert coefficients to bin. |
65 for (k = 0; k < 16; ++k) { | 54 for (k = 0; k < 16; ++k) { |
66 const int v = abs(out[k]) >> 2; | 55 const int v = abs(out[k]) >> 3; // TODO(skal): add rounding? |
67 out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v; | 56 const int clipped_value = clip_max(v, MAX_COEFF_THRESH); |
68 } | 57 histo->distribution[clipped_value]++; |
69 | |
70 // Use bin to update histogram. | |
71 for (k = 0; k < 16; ++k) { | |
72 histo[out[k]]++; | |
73 } | 58 } |
74 } | 59 } |
75 | |
76 return VP8GetAlpha(histo); | |
77 } | 60 } |
78 | 61 |
79 //------------------------------------------------------------------------------ | 62 //------------------------------------------------------------------------------ |
80 // run-time tables (~4k) | 63 // run-time tables (~4k) |
81 | 64 |
82 static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] | 65 static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] |
83 | 66 |
84 // We declare this variable 'volatile' to prevent instruction reordering | 67 // We declare this variable 'volatile' to prevent instruction reordering |
85 // and make sure it's set to true _last_ (so as to be thread-safe) | 68 // and make sure it's set to true _last_ (so as to be thread-safe) |
86 static volatile int tables_ok = 0; | 69 static volatile int tables_ok = 0; |
87 | 70 |
88 static void InitTables(void) { | 71 static void InitTables(void) { |
89 if (!tables_ok) { | 72 if (!tables_ok) { |
90 int i; | 73 int i; |
91 for (i = -255; i <= 255 + 255; ++i) { | 74 for (i = -255; i <= 255 + 255; ++i) { |
92 clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i; | 75 clip1[255 + i] = clip_8b(i); |
93 } | 76 } |
94 tables_ok = 1; | 77 tables_ok = 1; |
95 } | 78 } |
96 } | 79 } |
97 | 80 |
98 static WEBP_INLINE uint8_t clip_8b(int v) { | |
99 return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255; | |
100 } | |
101 | 81 |
102 //------------------------------------------------------------------------------ | 82 //------------------------------------------------------------------------------ |
103 // Transforms (Paragraph 14.4) | 83 // Transforms (Paragraph 14.4) |
104 | 84 |
105 #define STORE(x, y, v) \ | 85 #define STORE(x, y, v) \ |
106 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) | 86 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) |
107 | 87 |
108 static const int kC1 = 20091 + (1 << 16); | 88 static const int kC1 = 20091 + (1 << 16); |
109 static const int kC2 = 35468; | 89 static const int kC2 = 35468; |
110 #define MUL(a, b) (((a) * (b)) >> 16) | 90 #define MUL(a, b) (((a) * (b)) >> 16) |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
147 ITransformOne(ref, in, dst); | 127 ITransformOne(ref, in, dst); |
148 if (do_two) { | 128 if (do_two) { |
149 ITransformOne(ref + 4, in + 16, dst + 4); | 129 ITransformOne(ref + 4, in + 16, dst + 4); |
150 } | 130 } |
151 } | 131 } |
152 | 132 |
153 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { | 133 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { |
154 int i; | 134 int i; |
155 int tmp[16]; | 135 int tmp[16]; |
156 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { | 136 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { |
157 const int d0 = src[0] - ref[0]; | 137 const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255]) |
158 const int d1 = src[1] - ref[1]; | 138 const int d1 = src[1] - ref[1]; |
159 const int d2 = src[2] - ref[2]; | 139 const int d2 = src[2] - ref[2]; |
160 const int d3 = src[3] - ref[3]; | 140 const int d3 = src[3] - ref[3]; |
161 const int a0 = (d0 + d3) << 3; | 141 const int a0 = (d0 + d3); // 10b [-510,510] |
162 const int a1 = (d1 + d2) << 3; | 142 const int a1 = (d1 + d2); |
163 const int a2 = (d1 - d2) << 3; | 143 const int a2 = (d1 - d2); |
164 const int a3 = (d0 - d3) << 3; | 144 const int a3 = (d0 - d3); |
165 tmp[0 + i * 4] = (a0 + a1); | 145 tmp[0 + i * 4] = (a0 + a1) << 3; // 14b [-8160,8160] |
166 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12; | 146 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542] |
167 tmp[2 + i * 4] = (a0 - a1); | 147 tmp[2 + i * 4] = (a0 - a1) << 3; |
168 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12; | 148 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9; |
169 } | 149 } |
170 for (i = 0; i < 4; ++i) { | 150 for (i = 0; i < 4; ++i) { |
171 const int a0 = (tmp[0 + i] + tmp[12 + i]); | 151 const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b |
172 const int a1 = (tmp[4 + i] + tmp[ 8 + i]); | 152 const int a1 = (tmp[4 + i] + tmp[ 8 + i]); |
173 const int a2 = (tmp[4 + i] - tmp[ 8 + i]); | 153 const int a2 = (tmp[4 + i] - tmp[ 8 + i]); |
174 const int a3 = (tmp[0 + i] - tmp[12 + i]); | 154 const int a3 = (tmp[0 + i] - tmp[12 + i]); |
175 out[0 + i] = (a0 + a1 + 7) >> 4; | 155 out[0 + i] = (a0 + a1 + 7) >> 4; // 12b |
176 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); | 156 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); |
177 out[8 + i] = (a0 - a1 + 7) >> 4; | 157 out[8 + i] = (a0 - a1 + 7) >> 4; |
178 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); | 158 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); |
179 } | 159 } |
180 } | 160 } |
181 | 161 |
182 static void ITransformWHT(const int16_t* in, int16_t* out) { | 162 static void ITransformWHT(const int16_t* in, int16_t* out) { |
183 int tmp[16]; | 163 int tmp[16]; |
184 int i; | 164 int i; |
185 for (i = 0; i < 4; ++i) { | 165 for (i = 0; i < 4; ++i) { |
(...skipping 396 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
582 // reconstructed samples. | 562 // reconstructed samples. |
583 | 563 |
584 // Hadamard transform | 564 // Hadamard transform |
585 // Returns the weighted sum of the absolute value of transformed coefficients. | 565 // Returns the weighted sum of the absolute value of transformed coefficients. |
586 static int TTransform(const uint8_t* in, const uint16_t* w) { | 566 static int TTransform(const uint8_t* in, const uint16_t* w) { |
587 int sum = 0; | 567 int sum = 0; |
588 int tmp[16]; | 568 int tmp[16]; |
589 int i; | 569 int i; |
590 // horizontal pass | 570 // horizontal pass |
591 for (i = 0; i < 4; ++i, in += BPS) { | 571 for (i = 0; i < 4; ++i, in += BPS) { |
592 const int a0 = (in[0] + in[2]) << 2; | 572 const int a0 = in[0] + in[2]; |
593 const int a1 = (in[1] + in[3]) << 2; | 573 const int a1 = in[1] + in[3]; |
594 const int a2 = (in[1] - in[3]) << 2; | 574 const int a2 = in[1] - in[3]; |
595 const int a3 = (in[0] - in[2]) << 2; | 575 const int a3 = in[0] - in[2]; |
596 tmp[0 + i * 4] = a0 + a1 + (a0 != 0); | 576 tmp[0 + i * 4] = a0 + a1; |
597 tmp[1 + i * 4] = a3 + a2; | 577 tmp[1 + i * 4] = a3 + a2; |
598 tmp[2 + i * 4] = a3 - a2; | 578 tmp[2 + i * 4] = a3 - a2; |
599 tmp[3 + i * 4] = a0 - a1; | 579 tmp[3 + i * 4] = a0 - a1; |
600 } | 580 } |
601 // vertical pass | 581 // vertical pass |
602 for (i = 0; i < 4; ++i, ++w) { | 582 for (i = 0; i < 4; ++i, ++w) { |
603 const int a0 = (tmp[0 + i] + tmp[8 + i]); | 583 const int a0 = tmp[0 + i] + tmp[8 + i]; |
604 const int a1 = (tmp[4 + i] + tmp[12+ i]); | 584 const int a1 = tmp[4 + i] + tmp[12+ i]; |
605 const int a2 = (tmp[4 + i] - tmp[12+ i]); | 585 const int a2 = tmp[4 + i] - tmp[12+ i]; |
606 const int a3 = (tmp[0 + i] - tmp[8 + i]); | 586 const int a3 = tmp[0 + i] - tmp[8 + i]; |
607 const int b0 = a0 + a1; | 587 const int b0 = a0 + a1; |
608 const int b1 = a3 + a2; | 588 const int b1 = a3 + a2; |
609 const int b2 = a3 - a2; | 589 const int b2 = a3 - a2; |
610 const int b3 = a0 - a1; | 590 const int b3 = a0 - a1; |
611 // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 | 591 |
612 sum += w[ 0] * ((abs(b0) + 3) >> 3); | 592 sum += w[ 0] * abs(b0); |
613 sum += w[ 4] * ((abs(b1) + 3) >> 3); | 593 sum += w[ 4] * abs(b1); |
614 sum += w[ 8] * ((abs(b2) + 3) >> 3); | 594 sum += w[ 8] * abs(b2); |
615 sum += w[12] * ((abs(b3) + 3) >> 3); | 595 sum += w[12] * abs(b3); |
616 } | 596 } |
617 return sum; | 597 return sum; |
618 } | 598 } |
619 | 599 |
620 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, | 600 static int Disto4x4(const uint8_t* const a, const uint8_t* const b, |
621 const uint16_t* const w) { | 601 const uint16_t* const w) { |
622 const int sum1 = TTransform(a, w); | 602 const int sum1 = TTransform(a, w); |
623 const int sum2 = TTransform(b, w); | 603 const int sum2 = TTransform(b, w); |
624 return (abs(sum2 - sum1) + 8) >> 4; | 604 return abs(sum2 - sum1) >> 5; |
625 } | 605 } |
626 | 606 |
627 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, | 607 static int Disto16x16(const uint8_t* const a, const uint8_t* const b, |
628 const uint16_t* const w) { | 608 const uint16_t* const w) { |
629 int D = 0; | 609 int D = 0; |
630 int x, y; | 610 int x, y; |
631 for (y = 0; y < 16 * BPS; y += 4 * BPS) { | 611 for (y = 0; y < 16 * BPS; y += 4 * BPS) { |
632 for (x = 0; x < 16; x += 4) { | 612 for (x = 0; x < 16; x += 4) { |
633 D += Disto4x4(a + x + y, b + x + y, w); | 613 D += Disto4x4(a + x + y, b + x + y, w); |
634 } | 614 } |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
699 VP8Metric VP8SSE16x16; | 679 VP8Metric VP8SSE16x16; |
700 VP8Metric VP8SSE8x8; | 680 VP8Metric VP8SSE8x8; |
701 VP8Metric VP8SSE16x8; | 681 VP8Metric VP8SSE16x8; |
702 VP8Metric VP8SSE4x4; | 682 VP8Metric VP8SSE4x4; |
703 VP8WMetric VP8TDisto4x4; | 683 VP8WMetric VP8TDisto4x4; |
704 VP8WMetric VP8TDisto16x16; | 684 VP8WMetric VP8TDisto16x16; |
705 VP8QuantizeBlock VP8EncQuantizeBlock; | 685 VP8QuantizeBlock VP8EncQuantizeBlock; |
706 VP8BlockCopy VP8Copy4x4; | 686 VP8BlockCopy VP8Copy4x4; |
707 | 687 |
708 extern void VP8EncDspInitSSE2(void); | 688 extern void VP8EncDspInitSSE2(void); |
| 689 extern void VP8EncDspInitNEON(void); |
709 | 690 |
710 void VP8EncDspInit(void) { | 691 void VP8EncDspInit(void) { |
711 InitTables(); | 692 InitTables(); |
712 | 693 |
713 // default C implementations | 694 // default C implementations |
714 VP8CollectHistogram = CollectHistogram; | 695 VP8CollectHistogram = CollectHistogram; |
715 VP8ITransform = ITransform; | 696 VP8ITransform = ITransform; |
716 VP8FTransform = FTransform; | 697 VP8FTransform = FTransform; |
717 VP8ITransformWHT = ITransformWHT; | 698 VP8ITransformWHT = ITransformWHT; |
718 VP8FTransformWHT = FTransformWHT; | 699 VP8FTransformWHT = FTransformWHT; |
719 VP8EncPredLuma4 = Intra4Preds; | 700 VP8EncPredLuma4 = Intra4Preds; |
720 VP8EncPredLuma16 = Intra16Preds; | 701 VP8EncPredLuma16 = Intra16Preds; |
721 VP8EncPredChroma8 = IntraChromaPreds; | 702 VP8EncPredChroma8 = IntraChromaPreds; |
722 VP8SSE16x16 = SSE16x16; | 703 VP8SSE16x16 = SSE16x16; |
723 VP8SSE8x8 = SSE8x8; | 704 VP8SSE8x8 = SSE8x8; |
724 VP8SSE16x8 = SSE16x8; | 705 VP8SSE16x8 = SSE16x8; |
725 VP8SSE4x4 = SSE4x4; | 706 VP8SSE4x4 = SSE4x4; |
726 VP8TDisto4x4 = Disto4x4; | 707 VP8TDisto4x4 = Disto4x4; |
727 VP8TDisto16x16 = Disto16x16; | 708 VP8TDisto16x16 = Disto16x16; |
728 VP8EncQuantizeBlock = QuantizeBlock; | 709 VP8EncQuantizeBlock = QuantizeBlock; |
729 VP8Copy4x4 = Copy4x4; | 710 VP8Copy4x4 = Copy4x4; |
730 | 711 |
731 // If defined, use CPUInfo() to overwrite some pointers with faster versions. | 712 // If defined, use CPUInfo() to overwrite some pointers with faster versions. |
732 if (VP8GetCPUInfo) { | 713 if (VP8GetCPUInfo) { |
733 #if defined(WEBP_USE_SSE2) | 714 #if defined(WEBP_USE_SSE2) |
734 if (VP8GetCPUInfo(kSSE2)) { | 715 if (VP8GetCPUInfo(kSSE2)) { |
735 VP8EncDspInitSSE2(); | 716 VP8EncDspInitSSE2(); |
736 } | 717 } |
| 718 #elif defined(WEBP_USE_NEON) |
| 719 if (VP8GetCPUInfo(kNEON)) { |
| 720 VP8EncDspInitNEON(); |
| 721 } |
737 #endif | 722 #endif |
738 } | 723 } |
739 } | 724 } |
740 | 725 |
741 #if defined(__cplusplus) || defined(c_plusplus) | 726 #if defined(__cplusplus) || defined(c_plusplus) |
742 } // extern "C" | 727 } // extern "C" |
743 #endif | 728 #endif |
OLD | NEW |