source/libvpx/vp9/encoder/vp9_dct.c - Issue 11555023: libvpx: Add VP9 decoder.

Side by Side Diff: source/libvpx/vp9/encoder/vp9_dct.c

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11

	12 #include <assert.h>

	13 #include <math.h>

	14 #include "vpx_ports/config.h"

	15 #include "vp9/common/vp9_systemdependent.h"

	16

	17 #include "vp9/common/vp9_blockd.h"

	18

	19 // TODO: these transforms can be converted into integer forms to reduce

	20 // the complexity

	21 static const float dct_4[16] = {

	22 0.500000000000000, 0.500000000000000, 0.500000000000000, 0.500000000000000,

	23 0.653281482438188, 0.270598050073099, -0.270598050073099, -0.653281482438188,

	24 0.500000000000000, -0.500000000000000, -0.500000000000000, 0.500000000000000,

	25 0.270598050073099, -0.653281482438188, 0.653281482438188, -0.270598050073099

	26 };

	27

	28 static const float adst_4[16] = {

	29 0.228013428883779, 0.428525073124360, 0.577350269189626, 0.656538502008139,

	30 0.577350269189626, 0.577350269189626, 0.000000000000000, -0.577350269189626,

	31 0.656538502008139, -0.228013428883779, -0.577350269189626, 0.428525073124359,

	32 0.428525073124360, -0.656538502008139, 0.577350269189626, -0.228013428883779

	33 };

	34

	/* 8x8 float DCT basis, stored row-major (64 entries).
	 * Selected by vp9_fht_float_c() as the DCT table when tx_dim == 8.
	 * The literals are written out in full; they must not contain embedded
	 * whitespace or the initializer no longer parses. */
	static const float dct_8[64] = {
	  0.353553390593274,  0.353553390593274,  0.353553390593274,  0.353553390593274,
	  0.353553390593274,  0.353553390593274,  0.353553390593274,  0.353553390593274,
	  0.490392640201615,  0.415734806151273,  0.277785116509801,  0.097545161008064,
	 -0.097545161008064, -0.277785116509801, -0.415734806151273, -0.490392640201615,
	  0.461939766255643,  0.191341716182545, -0.191341716182545, -0.461939766255643,
	 -0.461939766255643, -0.191341716182545,  0.191341716182545,  0.461939766255643,
	  0.415734806151273, -0.097545161008064, -0.490392640201615, -0.277785116509801,
	  0.277785116509801,  0.490392640201615,  0.097545161008064, -0.415734806151273,
	  0.353553390593274, -0.353553390593274, -0.353553390593274,  0.353553390593274,
	  0.353553390593274, -0.353553390593274, -0.353553390593274,  0.353553390593274,
	  0.277785116509801, -0.490392640201615,  0.097545161008064,  0.415734806151273,
	 -0.415734806151273, -0.097545161008064,  0.490392640201615, -0.277785116509801,
	  0.191341716182545, -0.461939766255643,  0.461939766255643, -0.191341716182545,
	 -0.191341716182545,  0.461939766255643, -0.461939766255643,  0.191341716182545,
	  0.097545161008064, -0.277785116509801,  0.415734806151273, -0.490392640201615,
	  0.490392640201615, -0.415734806151273,  0.277785116509801, -0.097545161008064
	};

	53

	/* 8x8 float ADST basis, stored row-major (64 entries).
	 * Selected by vp9_fht_float_c() as the ADST table when tx_dim == 8.
	 * The literals are written out in full; they must not contain embedded
	 * whitespace or the initializer no longer parses. */
	static const float adst_8[64] = {
	  0.089131608307533,  0.175227946595735,  0.255357107325376,  0.326790388032145,
	  0.387095214016349,  0.434217976756762,  0.466553967085785,  0.483002021635509,
	  0.255357107325376,  0.434217976756762,  0.483002021635509,  0.387095214016349,
	  0.175227946595735, -0.089131608307533, -0.326790388032145, -0.466553967085785,
	  0.387095214016349,  0.466553967085785,  0.175227946595735, -0.255357107325376,
	 -0.483002021635509, -0.326790388032145,  0.089131608307533,  0.434217976756762,
	  0.466553967085785,  0.255357107325376, -0.326790388032145, -0.434217976756762,
	  0.089131608307533,  0.483002021635509,  0.175227946595735, -0.387095214016348,
	  0.483002021635509, -0.089131608307533, -0.466553967085785,  0.175227946595735,
	  0.434217976756762, -0.255357107325376, -0.387095214016348,  0.326790388032145,
	  0.434217976756762, -0.387095214016348, -0.089131608307533,  0.466553967085786,
	 -0.326790388032145, -0.175227946595735,  0.483002021635509, -0.255357107325375,
	  0.326790388032145, -0.483002021635509,  0.387095214016349, -0.089131608307534,
	 -0.255357107325377,  0.466553967085785, -0.434217976756762,  0.175227946595736,
	  0.175227946595735, -0.326790388032145,  0.434217976756762, -0.483002021635509,
	  0.466553967085785, -0.387095214016348,  0.255357107325376, -0.089131608307532
	};

	72

	73 /* Converted the transforms to integers. */

	74 static const int16_t dct_i4[16] = {

	75 16384, 16384, 16384, 16384,

	76 21407, 8867, -8867, -21407,

	77 16384, -16384, -16384, 16384,

	78 8867, -21407, 21407, -8867

	79 };

	80

	81 static const int16_t adst_i4[16] = {

	82 7472, 14042, 18919, 21513,

	83 18919, 18919, 0, -18919,

	84 21513, -7472, -18919, 14042,

	85 14042, -21513, 18919, -7472

	86 };

	87

	88 static const int16_t dct_i8[64] = {

	89 11585, 11585, 11585, 11585,

	90 11585, 11585, 11585, 11585,

	91 16069, 13623, 9102, 3196,

	92 -3196, -9102, -13623, -16069,

	93 15137, 6270, -6270, -15137,

	94 -15137, -6270, 6270, 15137,

	95 13623, -3196, -16069, -9102,

	96 9102, 16069, 3196, -13623,

	97 11585, -11585, -11585, 11585,

	98 11585, -11585, -11585, 11585,

	99 9102, -16069, 3196, 13623,

	100 -13623, -3196, 16069, -9102,

	101 6270, -15137, 15137, -6270,

	102 -6270, 15137, -15137, 6270,

	103 3196, -9102, 13623, -16069,

	104 16069, -13623, 9102, -3196

	105 };

	106

	107 static const int16_t adst_i8[64] = {

	108 2921, 5742, 8368, 10708,

	109 12684, 14228, 15288, 15827,

	110 8368, 14228, 15827, 12684,

	111 5742, -2921, -10708, -15288,

	112 12684, 15288, 5742, -8368,

	113 -15827, -10708, 2921, 14228,

	114 15288, 8368, -10708, -14228,

	115 2921, 15827, 5742, -12684,

	116 15827, -2921, -15288, 5742,

	117 14228, -8368, -12684, 10708,

	118 14228, -12684, -2921, 15288,

	119 -10708, -5742, 15827, -8368,

	120 10708, -15827, 12684, -2921,

	121 -8368, 15288, -14228, 5742,

	122 5742, -10708, 14228, -15827,

	123 15288, -12684, 8368, -2921

	124 };

	125

	/* 16x16 float DCT basis, stored row-major (256 entries, two source lines
	 * per matrix row).  Selected by vp9_fht_float_c() as the DCT table when
	 * tx_dim is neither 4 nor 8. */
	static const float dct_16[256] = {
	  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
	  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,
	  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,
	 -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,
	  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,
	 -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,
	  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,
	  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,
	  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
	  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,
	  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,
	 -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,
	  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,
	 -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,
	  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,
	  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,
	  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
	  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,
	  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,
	 -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,
	  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,
	 -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,
	  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,
	  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,
	  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
	  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,
	  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,
	 -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,
	  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,
	 -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,
	  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,
	  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654
	};

	160

	/* 16x16 float ADST basis, stored row-major (256 entries, two source lines
	 * per matrix row).  Selected by vp9_fht_float_c() as the ADST table when
	 * tx_dim is neither 4 nor 8. */
	static const float adst_16[256] = {
	  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,
	  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,
	  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,
	  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,
	  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,
	 -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,
	  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,
	 -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,
	  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,
	  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,
	  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,
	  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,
	  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,
	 -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,
	  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,
	  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,
	  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,
	  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,
	  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,
	 -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,
	  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,
	 -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,
	  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,
	  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,
	  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,
	  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,
	  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,
	 -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,
	  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,
	 -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,
	  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,
	  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094
	};

	195

	196 /* Converted the transforms to integers. */

	197 static const int16_t dct_i16[256] = {

	198 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,

	199 8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192,

	200 11529, 11086, 10217, 8955, 7350, 5461, 3363, 1136,

	201 -1136, -3363, -5461, -7350, -8955, -10217, -11086, -11529,

	202 11363, 9633, 6436, 2260, -2260, -6436, -9633, -11363,

	203 -11363, -9633, -6436, -2260, 2260, 6436, 9633, 11363,

	204 11086, 7350, 1136, -5461, -10217, -11529, -8955, -3363,

	205 3363, 8955, 11529, 10217, 5461, -1136, -7350, -11086,

	206 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,

	207 10703, 4433, -4433, -10703, -10703, -4433, 4433, 10703,

	208 10217, 1136, -8955, -11086, -3363, 7350, 11529, 5461,

	209 -5461, -11529, -7350, 3363, 11086, 8955, -1136, -10217,

	210 9633, -2260, -11363, -6436, 6436, 11363, 2260, -9633,

	211 -9633, 2260, 11363, 6436, -6436, -11363, -2260, 9633,

	212 8955, -5461, -11086, 1136, 11529, 3363, -10217, -7350,

	213 7350, 10217, -3363, -11529, -1136, 11086, 5461, -8955,

	214 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,

	215 8192, -8192, -8192, 8192, 8192, -8192, -8192, 8192,

	216 7350, -10217, -3363, 11529, -1136, -11086, 5461, 8955,

	217 -8955, -5461, 11086, 1136, -11529, 3363, 10217, -7350,

	218 6436, -11363, 2260, 9633, -9633, -2260, 11363, -6436,

	219 -6436, 11363, -2260, -9633, 9633, 2260, -11363, 6436,

	220 5461, -11529, 7350, 3363, -11086, 8955, 1136, -10217,

	221 10217, -1136, -8955, 11086, -3363, -7350, 11529, -5461,

	222 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,

	223 4433, -10703, 10703, -4433, -4433, 10703, -10703, 4433,

	224 3363, -8955, 11529, -10217, 5461, 1136, -7350, 11086,

	225 -11086, 7350, -1136, -5461, 10217, -11529, 8955, -3363,

	226 2260, -6436, 9633, -11363, 11363, -9633, 6436, -2260,

	227 -2260, 6436, -9633, 11363, -11363, 9633, -6436, 2260,

	228 1136, -3363, 5461, -7350, 8955, -10217, 11086, -11529,

	229 11529, -11086, 10217, -8955, 7350, -5461, 3363, -1136

	230 };

	231

	232 static const int16_t adst_i16[256] = {

	233 1084, 2159, 3214, 4240, 5228, 6168, 7052, 7873,

	234 8622, 9293, 9880, 10377, 10781, 11087, 11292, 11395,

	235 3214, 6168, 8622, 10377, 11292, 11292, 10377, 8622,

	236 6168, 3214, 0, -3214, -6168, -8622, -10377, -11292,

	237 5228, 9293, 11292, 10781, 7873, 3214, -2159, -7052,

	238 -10377, -11395, -9880, -6168, -1084, 4240, 8622, 11087,

	239 7052, 11087, 10377, 5228, -2159, -8622, -11395, -9293,

	240 -3214, 4240, 9880, 11292, 7873, 1084, -6168, -10781,

	241 8622, 11292, 6168, -3214, -10377, -10377, -3214, 6168,

	242 11292, 8622, 0, -8622, -11292, -6168, 3214, 10377,

	243 9880, 9880, 0, -9880, -9880, 0, 9880, 9880,

	244 0, -9880, -9880, 0, 9880, 9880, 0, -9880,

	245 10781, 7052, -6168, -11087, -1084, 10377, 7873, -5228,

	246 -11292, -2159, 9880, 8622, -4240, -11395, -3214, 9293,

	247 11292, 3214, -10377, -6168, 8622, 8622, -6168, -10377,

	248 3214, 11292, 0, -11292, -3214, 10377, 6168, -8622,

	249 11395, -1084, -11292, 2159, 11087, -3214, -10781, 4240,

	250 10377, -5228, -9880, 6168, 9293, -7052, -8622, 7873,

	251 11087, -5228, -8622, 9293, 4240, -11292, 1084, 10781,

	252 -6168, -7873, 9880, 3214, -11395, 2159, 10377, -7052,

	253 10377, -8622, -3214, 11292, -6168, -6168, 11292, -3214,

	254 -8622, 10377, 0, -10377, 8622, 3214, -11292, 6168,

	255 9293, -10781, 3214, 7052, -11395, 6168, 4240, -11087,

	256 8622, 1084, -9880, 10377, -2159, -7873, 11292, -5228,

	257 7873, -11395, 8622, -1084, -7052, 11292, -9293, 2159,

	258 6168, -11087, 9880, -3214, -5228, 10781, -10377, 4240,

	259 6168, -10377, 11292, -8622, 3214, 3214, -8622, 11292,

	260 -10377, 6168, 0, -6168, 10377, -11292, 8622, -3214,

	261 4240, -7873, 10377, -11395, 10781, -8622, 5228, -1084,

	262 -3214, 7052, -9880, 11292, -11087, 9293, -6168, 2159,

	263 2159, -4240, 6168, -7873, 9293, -10377, 11087, -11395,

	264 11292, -10781, 9880, -8622, 7052, -5228, 3214, -1084

	265 };

	266

	/* Fixed-point multipliers for the 8-point forward DCT below.  They are the
	 * same integers that appear in the second and third rows of dct_i8 (and
	 * 11585 from its first row). */
	static const int xC1S7 = 16069;
	static const int xC2S6 = 15137;
	static const int xC3S5 = 13623;
	static const int xC4S4 = 11585;
	static const int xC5S3 = 9102;
	static const int xC6S2 = 6270;
	static const int xC7S1 = 3196;

	/* Products with the multipliers above are rounded and shifted right by
	 * SHIFT_BITS.  DOROUND is left defined exactly as before in case other code
	 * in this file relies on it; the 8x8 transform below no longer uses it. */
	#define SHIFT_BITS 14
	#define DOROUND(X) X += (1<<(SHIFT_BITS-1));

	/* Inputs are scaled up by 2^IN_SHIFT before the row pass and the column
	 * pass results are rounded and scaled down by 2^FINAL_SHIFT. */
	#define FINAL_SHIFT 3
	#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))
	#define IN_SHIFT (FINAL_SHIFT+1)

	/* Returns (coeff * value), rounded by adding half of 2^SHIFT_BITS and then
	 * shifted right by SHIFT_BITS. */
	static int fdct8_mul_round(int coeff, int value) {
	  return (coeff * value + (1 << (SHIFT_BITS - 1))) >> SHIFT_BITS;
	}

	/* One 8-point forward DCT butterfly: reads in[0..7], writes out[0..7].
	 * Both the row pass and the column pass of vp9_short_fdct8x8_c() run this
	 * exact sequence of sums, rounded products and rotations. */
	static void fdct8_1d(const int in[8], int out[8]) {
	  /* Common sums and differences of mirrored input pairs. */
	  const int s07 = in[0] + in[7];
	  const int s12 = in[1] + in[2];
	  const int s34 = in[3] + in[4];
	  const int s56 = in[5] + in[6];
	  const int d07 = in[0] - in[7];
	  const int d12 = in[1] - in[2];
	  const int d34 = in[3] - in[4];
	  const int d56 = in[5] - in[6];
	  const int s0734 = s07 + s34;
	  const int s1256 = s12 + s56;
	  /* Re-used products: c4s4 * (s12 - s56) and c4s4 * (d12 + d56). */
	  const int common1 = fdct8_mul_round(xC4S4, s12 - s56);
	  const int common2 = fdct8_mul_round(xC4S4, d12 + d56);
	  int rot_x, rot_y;

	  out[0] = fdct8_mul_round(xC4S4, s0734 + s1256);
	  out[4] = fdct8_mul_round(xC4S4, s0734 - s1256);

	  /* Rotation producing outputs 2 and 6. */
	  rot_x = d12 - d56;
	  rot_y = s07 - s34;
	  out[2] = fdct8_mul_round(xC6S2, rot_x) + fdct8_mul_round(xC2S6, rot_y);
	  out[6] = fdct8_mul_round(xC6S2, rot_y) - fdct8_mul_round(xC2S6, rot_x);

	  /* Rotation producing outputs 1 and 7. */
	  rot_x = common1 + d07;
	  rot_y = -(d34 + common2);
	  out[1] = fdct8_mul_round(xC1S7, rot_x) - fdct8_mul_round(xC7S1, rot_y);
	  out[7] = fdct8_mul_round(xC7S1, rot_x) + fdct8_mul_round(xC1S7, rot_y);

	  /* Rotation producing outputs 3 and 5. */
	  rot_x = d07 - common1;
	  rot_y = d34 - common2;
	  out[3] = fdct8_mul_round(xC3S5, rot_x) - fdct8_mul_round(xC5S3, rot_y);
	  out[5] = fdct8_mul_round(xC5S3, rot_x) + fdct8_mul_round(xC3S5, rot_y);
	}

	/* Forward 8x8 DCT in fixed point.
	 *
	 * InputData:  8 rows of 8 shorts; consecutive rows are (pitch >> 1) shorts
	 *             apart.
	 * OutputData: receives 64 coefficients, stored with a row stride of 8.
	 * pitch:      input row stride; it is halved to obtain the stride in shorts.
	 *
	 * The rows are transformed first into an int scratch buffer, then the
	 * columns; all input is read before any output is written. */
	void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {
	  const int short_pitch = pitch >> 1;
	  int InterData[64];
	  int r, c, k;

	  /* Row pass. */
	  for (r = 0; r < 8; r++) {
	    int row[8];
	    for (k = 0; k < 8; k++) {
	      /* Scale by multiplying rather than shifting: left-shifting a negative
	       * value is undefined behaviour in C. */
	      row[k] = InputData[r * short_pitch + k] * (1 << IN_SHIFT);
	    }
	    fdct8_1d(row, &InterData[r * 8]);
	  }

	  /* Column pass, with the final rounding and down-scaling. */
	  for (c = 0; c < 8; c++) {
	    int col[8];
	    int res[8];
	    for (k = 0; k < 8; k++) {
	      col[k] = InterData[k * 8 + c];
	    }
	    fdct8_1d(col, res);
	    for (k = 0; k < 8; k++) {
	      OutputData[k * 8 + c] = (short)((res[k] + FINAL_ROUNDING) >> FINAL_SHIFT);
	    }
	  }
	}

	504

	/* 2x2 Haar-style transform ([1 1; 1 -1] applied in both directions) over
	 * the four values stored at positions 0, 1, 4 and 8 of input.
	 *
	 * input:  source values; only indices 0, 1, 4 and 8 are read.
	 * output: 16 entries are written; everything except indices 0, 1, 4 and 8
	 *         is set to zero.
	 * pitch:  not used by this function.
	 *
	 * The output is cleared before the input is read, exactly as before, so
	 * the call order matters if the two buffers alias. */
	void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {
	  int i;

	  (void)pitch;

	  for (i = 0; i < 16; i++) {
	    output[i] = 0;
	  }

	  /* Only the first (DC) term carries a +1 rounding offset. */
	  output[0] = (input[0] + input[1] + input[4] + input[8] + 1) >> 1;
	  output[1] = (input[0] - input[1] + input[4] - input[8]) >> 1;
	  output[4] = (input[0] + input[1] - input[4] - input[8]) >> 1;
	  output[8] = (input[0] - input[1] - input[4] + input[8]) >> 1;
	}

	520

	521 /* For test */

	522 #define TEST_INT 1

	523 #if TEST_INT

	524 #define vp9_fht_int_c vp9_fht_c

	525 #else

	526 #define vp9_fht_float_c vp9_fht_c

	527 #endif

	528

	529 void vp9_fht_float_c(const int16_t input, int pitch, int16_t output,

	530 TX_TYPE tx_type, int tx_dim) {

	531 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	532 {

	533 int i, j, k;

	534 float bufa[256], bufb[256]; // buffers are for floating-point test purpose

	535 // the implementation could be simplified in

	536 // conjunction with integer transform

	537 const int16_t *ip = input;

	538 int16_t *op = output;

	539

	540 float *pfa = &bufa[0];

	541 float *pfb = &bufb[0];

	542

	543 // pointers to vertical and horizontal transforms

	544 const float ptv, pth;

	545

	546 assert(tx_type != DCT_DCT);

	547 // load and convert residual array into floating-point

	548 for (j = 0; j < tx_dim; j++) {

	549 for (i = 0; i < tx_dim; i++) {

	550 pfa[i] = (float)ip[i];

	551 }

	552 pfa += tx_dim;

	553 ip += pitch / 2;

	554 }

	555

	556 // vertical transformation

	557 pfa = &bufa[0];

	558 pfb = &bufb[0];

	559

	560 switch (tx_type) {

	561 case ADST_ADST :

	562 case ADST_DCT :

	563 ptv = (tx_dim == 4) ? &adst_4[0] :

	564 ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);

	565 break;

	566

	567 default :

	568 ptv = (tx_dim == 4) ? &dct_4[0] :

	569 ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);

	570 break;

	571 }

	572

	573 for (j = 0; j < tx_dim; j++) {

	574 for (i = 0; i < tx_dim; i++) {

	575 pfb[i] = 0;

	576 for (k = 0; k < tx_dim; k++) {

	577 pfb[i] += ptv[k] * pfa[(k * tx_dim)];

	578 }

	579 pfa += 1;

	580 }

	581 pfb += tx_dim;

	582 ptv += tx_dim;

	583 pfa = &bufa[0];

	584 }

	585

	586 // horizontal transformation

	587 pfa = &bufa[0];

	588 pfb = &bufb[0];

	589

	590 switch (tx_type) {

	591 case ADST_ADST :

	592 case DCT_ADST :

	593 pth = (tx_dim == 4) ? &adst_4[0] :

	594 ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);

	595 break;

	596

	597 default :

	598 pth = (tx_dim == 4) ? &dct_4[0] :

	599 ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);

	600 break;

	601 }

	602

	603 for (j = 0; j < tx_dim; j++) {

	604 for (i = 0; i < tx_dim; i++) {

	605 pfa[i] = 0;

	606 for (k = 0; k < tx_dim; k++) {

	607 pfa[i] += pfb[k] * pth[k];

	608 }

	609 pth += tx_dim;

	610 }

	611

	612 pfa += tx_dim;

	613 pfb += tx_dim;

	614 // pth -= tx_dim * tx_dim;

	615

	616 switch (tx_type) {

	617 case ADST_ADST :

	618 case DCT_ADST :

	619 pth = (tx_dim == 4) ? &adst_4[0] :

	620 ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);

	621 break;

	622

	623 default :

	624 pth = (tx_dim == 4) ? &dct_4[0] :

	625 ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);

	626 break;

	627 }

	628 }

	629

	630 // convert to short integer format and load BLOCKD buffer

	631 op = output;

	632 pfa = &bufa[0];

	633

	634 for (j = 0; j < tx_dim; j++) {

	635 for (i = 0; i < tx_dim; i++) {

	636 op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :

	637 -(int16_t)(- 8 * pfa[i] + 0.49);

	638 }

	639 op += tx_dim;

	640 pfa += tx_dim;

	641 }

	642 }

	643 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	644 }

	645

	646 /* Converted the transforms to integer form. */

	647 #define VERTICAL_SHIFT 11

	648 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)

	649 #define HORIZONTAL_SHIFT 16

	650 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)

	651 void vp9_fht_int_c(const int16_t input, int pitch, int16_t output,

	652 TX_TYPE tx_type, int tx_dim) {

	653 int i, j, k;

	654 int16_t imbuf[256];

	655

	656 const int16_t *ip = input;

	657 int16_t *op = output;

	658 int16_t *im = &imbuf[0];

	659

	660 /* pointers to vertical and horizontal transforms. */

	661 const int16_t ptv = NULL, pth = NULL;

	662

	663 switch (tx_type) {

	664 case ADST_ADST :

	665 ptv = pth = (tx_dim == 4) ? &adst_i4[0]

	666 : ((tx_dim == 8) ? &adst_i8[0]

	667 : &adst_i16[0]);

	668 break;

	669 case ADST_DCT :

	670 ptv = (tx_dim == 4) ? &adst_i4[0]

	671 : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);

	672 pth = (tx_dim == 4) ? &dct_i4[0]

	673 : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);

	674 break;

	675 case DCT_ADST :

	676 ptv = (tx_dim == 4) ? &dct_i4[0]

	677 : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);

	678 pth = (tx_dim == 4) ? &adst_i4[0]

	679 : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);

	680 break;

	681 case DCT_DCT :

	682 ptv = pth = (tx_dim == 4) ? &dct_i4[0]

	683 : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);

	684 break;

	685 default:

	686 assert(0);

	687 break;

	688 }

	689

	690 /* vertical transformation */

	691 for (j = 0; j < tx_dim; j++) {

	692 for (i = 0; i < tx_dim; i++) {

	693 int temp = 0;

	694

	695 for (k = 0; k < tx_dim; k++) {

	696 temp += ptv[k] * ip[(k * (pitch >> 1))];

	697 }

	698

	699 im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);

	700 ip++;

	701 }

	702 im += tx_dim; // 16

	703 ptv += tx_dim;

	704 ip = input;

	705 }

	706

	707 /* horizontal transformation */

	708 im = &imbuf[0];

	709

	710 for (j = 0; j < tx_dim; j++) {

	711 const int16_t *pthc = pth;

	712

	713 for (i = 0; i < tx_dim; i++) {

	714 int temp = 0;

	715

	716 for (k = 0; k < tx_dim; k++) {

	717 temp += im[k] * pthc[k];

	718 }

	719

	720 op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);

	721 pthc += tx_dim;

	722 }

	723

	724 im += tx_dim; // 16

	725 op += tx_dim;

	726 }

	727 }

	728

	/* Forward 4x4 DCT in fixed point.
	 *
	 * input:  4 rows of 4 shorts; consecutive rows are (pitch / 2) shorts apart.
	 * output: receives 16 coefficients with a row stride of 4.  It also holds
	 *         the intermediate row-pass results, which the column pass then
	 *         transforms in place.
	 * pitch:  input row stride, halved to get the stride in shorts. */
	void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {
	  const int stride = pitch / 2;
	  int i;

	  /* Row pass: scale the sums/differences by 32 and write to output. */
	  for (i = 0; i < 4; i++) {
	    const short *ip = input + i * stride;
	    short *op = output + i * 4;
	    /* Multiply instead of "<< 5": left-shifting a negative value is
	     * undefined behaviour in C. */
	    const int a1 = (ip[0] + ip[3]) * 32;
	    const int b1 = (ip[1] + ip[2]) * 32;
	    const int c1 = (ip[1] - ip[2]) * 32;
	    const int d1 = (ip[0] - ip[3]) * 32;

	    op[0] = (short)(a1 + b1);
	    op[2] = (short)(a1 - b1);

	    op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
	    op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
	  }

	  /* Column pass, in place on output.  All four values of a column are read
	   * before any of them is overwritten. */
	  for (i = 0; i < 4; i++) {
	    short *col = output + i;
	    const int a1 = col[0] + col[12];
	    const int b1 = col[4] + col[8];
	    const int c1 = col[4] - col[8];
	    const int d1 = col[0] - col[12];

	    col[0] = (short)((a1 + b1 + 7) >> 4);
	    col[8] = (short)((a1 - b1 + 7) >> 4);

	    /* The second coefficient gets an extra +1 whenever d1 is non-zero. */
	    col[4] = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
	    col[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
	  }
	}

	769

	/* Forward DCT of two horizontally adjacent 4x4 blocks.
	 *
	 * The left block starts at input and its 16 coefficients go to output;
	 * the right block starts 4 shorts further along each row and its
	 * coefficients go to output + 16.  pitch is passed through unchanged to
	 * vp9_short_fdct4x4_c(). */
	void vp9_short_fdct8x4_c(short *input, short *output, int pitch) {
	  vp9_short_fdct4x4_c(input, output, pitch);
	  vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
	}

	775

	/* Forward 4x4 Walsh-Hadamard style transform.
	 *
	 * input:  4 rows of 4 shorts; consecutive rows are (pitch >> 1) shorts apart.
	 * output: receives 16 coefficients with a row stride of 4.  It also holds
	 *         the result of the first pass, which the second pass transforms
	 *         in place.
	 * pitch:  input row stride, halved to get the stride in shorts.
	 *
	 * Every butterfly output is halved; only the first output of each
	 * butterfly carries a +1 rounding offset. */
	void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
	  const int pitch_short = pitch >> 1;
	  int i;

	  /* First pass: one butterfly down each input column. */
	  for (i = 0; i < 4; i++) {
	    const short *ip = input + i;
	    short *op = output + i;
	    const int a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
	    const int b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
	    const int c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
	    const int d1 = ip[0 * pitch_short] - ip[3 * pitch_short];

	    op[0] = (short)((a1 + b1 + 1) >> 1);
	    op[4] = (short)((c1 + d1) >> 1);
	    op[8] = (short)((a1 - b1) >> 1);
	    op[12] = (short)((d1 - c1) >> 1);
	  }

	  /* Second pass: one butterfly along each row of output, in place.  The
	   * four values of a row are read before any of them is overwritten. */
	  for (i = 0; i < 4; i++) {
	    short *row = output + i * 4;
	    const int a1 = row[0] + row[3];
	    const int b1 = row[1] + row[2];
	    const int c1 = row[1] - row[2];
	    const int d1 = row[0] - row[3];

	    row[0] = (short)((a1 + b1 + 1) >> 1);
	    row[1] = (short)((c1 + d1) >> 1);
	    row[2] = (short)((a1 - b1) >> 1);
	    row[3] = (short)((d1 - c1) >> 1);
	  }
	}

	815

	816 #if CONFIG_LOSSLESS

	817 void vp9_short_walsh4x4_lossless_c(short input, short output, int pitch) {

	818 int i;

	819 int a1, b1, c1, d1;

	820 short *ip = input;

	821 short *op = output;

	822 int pitch_short = pitch >> 1;

	823

	824 for (i = 0; i < 4; i++) {

	825 a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

	826 b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

	827 c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

	828 d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

	829

	830 op[0] = (a1 + b1 + 1) >> 1;

	831 op[4] = (c1 + d1) >> 1;

	832 op[8] = (a1 - b1) >> 1;

	833 op[12] = (d1 - c1) >> 1;

	834

	835 ip++;

	836 op++;

	837 }

	838 ip = output;

	839 op = output;

	840

	841 for (i = 0; i < 4; i++) {

	842 a1 = ip[0] + ip[3];

	843 b1 = ip[1] + ip[2];

	844 c1 = ip[1] - ip[2];

	845 d1 = ip[0] - ip[3];

	846

	847 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	848 op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	849 op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	850 op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	851

	852 ip += 4;

	853 op += 4;

	854 }

	855 }

	856

	857 void vp9_short_walsh4x4_x8_c(short input, short output, int pitch) {

	858 int i;

	859 int a1, b1, c1, d1;

	860 short *ip = input;

	861 short *op = output;

	862 int pitch_short = pitch >> 1;

	863

	864 for (i = 0; i < 4; i++) {

	865 a1 = ip[0 * pitch_short] + ip[3 * pitch_short];

	866 b1 = ip[1 * pitch_short] + ip[2 * pitch_short];

	867 c1 = ip[1 * pitch_short] - ip[2 * pitch_short];

	868 d1 = ip[0 * pitch_short] - ip[3 * pitch_short];

	869

	870 op[0] = (a1 + b1 + 1) >> 1;

	871 op[4] = (c1 + d1) >> 1;

	872 op[8] = (a1 - b1) >> 1;

	873 op[12] = (d1 - c1) >> 1;

	874

	875 ip++;

	876 op++;

	877 }

	878 ip = output;

	879 op = output;

	880

	881 for (i = 0; i < 4; i++) {

	882 a1 = ip[0] + ip[3];

	883 b1 = ip[1] + ip[2];

	884 c1 = ip[1] - ip[2];

	885 d1 = ip[0] - ip[3];

	886

	887 op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;

	888 op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;

	889 op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;

	890 op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;

	891

	892 ip += 4;

	893 op += 4;

	894 }

	895 }

	896

	/*
	 * Runs vp9_short_walsh4x4_x8_c on two 4x4 blocks in one call.
	 *
	 * The first block is read from `input` and written to `output[0..15]`; the
	 * second is read starting 4 shorts past `input` and written to
	 * `output[16..31]`. `pitch` is forwarded unchanged.
	 *
	 * Both buffers are pointers: the pointer declarators had been lost from the
	 * parameter list, although the body does pointer arithmetic on them.
	 */
	void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
	  vp9_short_walsh4x4_x8_c(input, output, pitch);
	  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
	}

	901 #endif

	902

	903 #define TEST_INT_16x16_DCT 1

	904 #if !TEST_INT_16x16_DCT

	/*
	 * Cosine table for the floating-point 16-point DCT: Ck = cos(k * pi / 32).
	 */
	static const double C1 = 0.995184726672197;
	static const double C2 = 0.98078528040323;
	static const double C3 = 0.956940335732209;
	static const double C4 = 0.923879532511287;
	static const double C5 = 0.881921264348355;
	static const double C6 = 0.831469612302545;
	static const double C7 = 0.773010453362737;
	static const double C8 = 0.707106781186548;
	static const double C9 = 0.634393284163646;
	static const double C10 = 0.555570233019602;
	static const double C11 = 0.471396736825998;
	static const double C12 = 0.38268343236509;
	static const double C13 = 0.290284677254462;
	static const double C14 = 0.195090322016128;
	static const double C15 = 0.098017140329561;

	/*
	 * Floating-point 1-D 16-point forward DCT, computed as four butterfly
	 * stages.
	 *
	 * input  - 16 samples (not modified).
	 * output - receives the 16 coefficients; it is also used as scratch space
	 *          between stages, so it must not alias `input`.
	 *
	 * vp9_clear_system_state() is called on entry and exit, as the inline
	 * comments put it, to make the floating-point code SIMD safe.
	 *
	 * The stage-4 products are spelled out as `2 * (x * C8)`; the multiplication
	 * operators had been lost from these lines, leaving text that is not valid
	 * C.
	 */
	static void dct16x16_1d(double input[16], double output[16]) {
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	  {
	    double step[16];
	    double intermediate[16];
	    double temp1, temp2;

	    // step 1
	    step[ 0] = input[0] + input[15];
	    step[ 1] = input[1] + input[14];
	    step[ 2] = input[2] + input[13];
	    step[ 3] = input[3] + input[12];
	    step[ 4] = input[4] + input[11];
	    step[ 5] = input[5] + input[10];
	    step[ 6] = input[6] + input[ 9];
	    step[ 7] = input[7] + input[ 8];
	    step[ 8] = input[7] - input[ 8];
	    step[ 9] = input[6] - input[ 9];
	    step[10] = input[5] - input[10];
	    step[11] = input[4] - input[11];
	    step[12] = input[3] - input[12];
	    step[13] = input[2] - input[13];
	    step[14] = input[1] - input[14];
	    step[15] = input[0] - input[15];

	    // step 2
	    output[0] = step[0] + step[7];
	    output[1] = step[1] + step[6];
	    output[2] = step[2] + step[5];
	    output[3] = step[3] + step[4];
	    output[4] = step[3] - step[4];
	    output[5] = step[2] - step[5];
	    output[6] = step[1] - step[6];
	    output[7] = step[0] - step[7];

	    temp1 = step[ 8] * C7;
	    temp2 = step[15] * C9;
	    output[ 8] = temp1 + temp2;

	    temp1 = step[ 9] * C11;
	    temp2 = step[14] * C5;
	    output[ 9] = temp1 - temp2;

	    temp1 = step[10] * C3;
	    temp2 = step[13] * C13;
	    output[10] = temp1 + temp2;

	    temp1 = step[11] * C15;
	    temp2 = step[12] * C1;
	    output[11] = temp1 - temp2;

	    temp1 = step[11] * C1;
	    temp2 = step[12] * C15;
	    output[12] = temp2 + temp1;

	    temp1 = step[10] * C13;
	    temp2 = step[13] * C3;
	    output[13] = temp2 - temp1;

	    temp1 = step[ 9] * C5;
	    temp2 = step[14] * C11;
	    output[14] = temp2 + temp1;

	    temp1 = step[ 8] * C9;
	    temp2 = step[15] * C7;
	    output[15] = temp2 - temp1;

	    // step 3
	    step[ 0] = output[0] + output[3];
	    step[ 1] = output[1] + output[2];
	    step[ 2] = output[1] - output[2];
	    step[ 3] = output[0] - output[3];

	    temp1 = output[4] * C14;
	    temp2 = output[7] * C2;
	    step[ 4] = temp1 + temp2;

	    temp1 = output[5] * C10;
	    temp2 = output[6] * C6;
	    step[ 5] = temp1 + temp2;

	    temp1 = output[5] * C6;
	    temp2 = output[6] * C10;
	    step[ 6] = temp2 - temp1;

	    temp1 = output[4] * C2;
	    temp2 = output[7] * C14;
	    step[ 7] = temp2 - temp1;

	    step[ 8] = output[ 8] + output[11];
	    step[ 9] = output[ 9] + output[10];
	    step[10] = output[ 9] - output[10];
	    step[11] = output[ 8] - output[11];

	    step[12] = output[12] + output[15];
	    step[13] = output[13] + output[14];
	    step[14] = output[13] - output[14];
	    step[15] = output[12] - output[15];

	    // step 4
	    output[ 0] = (step[ 0] + step[ 1]);
	    output[ 8] = (step[ 0] - step[ 1]);

	    temp1 = step[2] * C12;
	    temp2 = step[3] * C4;
	    temp1 = temp1 + temp2;
	    output[ 4] = 2 * (temp1 * C8);

	    temp1 = step[2] * C4;
	    temp2 = step[3] * C12;
	    temp1 = temp2 - temp1;
	    output[12] = 2 * (temp1 * C8);

	    output[ 2] = 2 * ((step[4] + step[ 5]) * C8);
	    output[14] = 2 * ((step[7] - step[ 6]) * C8);

	    temp1 = step[4] - step[5];
	    temp2 = step[6] + step[7];
	    output[ 6] = (temp1 + temp2);
	    output[10] = (temp1 - temp2);

	    intermediate[8] = step[8] + step[14];
	    intermediate[9] = step[9] + step[15];

	    temp1 = intermediate[8] * C12;
	    temp2 = intermediate[9] * C4;
	    temp1 = temp1 - temp2;
	    output[3] = 2 * (temp1 * C8);

	    temp1 = intermediate[8] * C4;
	    temp2 = intermediate[9] * C12;
	    temp1 = temp2 + temp1;
	    output[13] = 2 * (temp1 * C8);

	    output[ 9] = 2 * ((step[10] + step[11]) * C8);

	    intermediate[11] = step[10] - step[11];
	    intermediate[12] = step[12] + step[13];
	    intermediate[13] = step[12] - step[13];
	    intermediate[14] = step[ 8] - step[14];
	    intermediate[15] = step[ 9] - step[15];

	    output[15] = (intermediate[11] + intermediate[12]);
	    output[ 1] = -(intermediate[11] - intermediate[12]);

	    output[ 7] = 2 * (intermediate[13] * C8);

	    temp1 = intermediate[14] * C12;
	    temp2 = intermediate[15] * C4;
	    temp1 = temp1 - temp2;
	    output[11] = -2 * (temp1 * C8);

	    temp1 = intermediate[14] * C4;
	    temp2 = intermediate[15] * C12;
	    temp1 = temp2 + temp1;
	    output[ 5] = 2 * (temp1 * C8);
	  }
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	}

	1080

	/*
	 * Floating-point 16x16 forward DCT (compiled only when TEST_INT_16x16_DCT
	 * is 0).
	 *
	 * input - 16x16 block of shorts; consecutive rows are (pitch >> 1) shorts
	 *         apart.
	 * out   - receives 256 coefficients with a row stride of 16.
	 * pitch - input row stride; it is halved to obtain the stride in shorts
	 *         (presumably a byte count -- confirm against callers).
	 *
	 * The 1-D transform is applied to every column, then to every row of the
	 * column result; each value is finally halved and rounded to a short.
	 *
	 * Both buffers are pointers: the pointer declarators had been lost from the
	 * parameter list, although the body indexes them as arrays.
	 */
	void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	  {
	    int shortpitch = pitch >> 1;
	    int i, j;
	    double output[256];
	    // First transform columns
	    for (i = 0; i < 16; i++) {
	      double temp_in[16], temp_out[16];
	      for (j = 0; j < 16; j++)
	        temp_in[j] = input[j * shortpitch + i];
	      dct16x16_1d(temp_in, temp_out);
	      for (j = 0; j < 16; j++)
	        output[j * 16 + i] = temp_out[j];
	    }
	    // Then transform rows
	    for (i = 0; i < 16; ++i) {
	      double temp_in[16], temp_out[16];
	      for (j = 0; j < 16; ++j)
	        temp_in[j] = output[j + i * 16];
	      dct16x16_1d(temp_in, temp_out);
	      for (j = 0; j < 16; ++j)
	        output[j + i * 16] = temp_out[j];
	    }
	    // Scale by some magic number
	    for (i = 0; i < 256; i++)
	      out[i] = (short)round(output[i] / 2);
	  }
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	}

	1111

	1112 #else

	/*
	 * Cosine table for the integer 16-point DCT: Ck = cos(k * pi / 32) scaled
	 * by 2^14 (see RIGHT_SHIFT) and rounded to the nearest integer.
	 */
	static const int16_t C1 = 16305;
	static const int16_t C2 = 16069;
	static const int16_t C3 = 15679;
	static const int16_t C4 = 15137;
	static const int16_t C5 = 14449;
	static const int16_t C6 = 13623;
	static const int16_t C7 = 12665;
	static const int16_t C8 = 11585;
	static const int16_t C9 = 10394;
	static const int16_t C10 = 9102;
	static const int16_t C11 = 7723;
	static const int16_t C12 = 6270;
	static const int16_t C13 = 4756;
	static const int16_t C14 = 3196;
	static const int16_t C15 = 1606;

	/* Fixed-point precision of the table above, and the matching round-to-nearest
	 * offset added before shifting a product back down. */
	#define RIGHT_SHIFT 14
	#define ROUNDING (1 << (RIGHT_SHIFT - 1))

	/*
	 * Integer 1-D 16-point forward DCT, computed as four butterfly stages.
	 *
	 * input           - 16 samples (not modified).
	 * output          - receives the 16 coefficients; it is also used as scratch
	 *                   space between stages, so it must not alias `input`.
	 * last_shift_bits - extra right shift folded into the final stage, so that
	 *                   every output is additionally divided by
	 *                   2^last_shift_bits with rounding.
	 *
	 * Products with the cosine table are brought back to sample scale with
	 * `(x + ROUNDING) >> RIGHT_SHIFT`. In stage 4, terms that still need a table
	 * multiply use final_shift / final_rounding (RIGHT_SHIFT plus the extra
	 * bits), while plain sums use output_shift / output_rounding (the extra bits
	 * only).
	 */
	static void dct16x16_1d(int16_t input[16], int16_t output[16],
	                        int last_shift_bits) {
	  int16_t step[16];
	  int intermediate[16];
	  int temp1, temp2;
	  int final_shift = RIGHT_SHIFT;
	  int final_rounding = ROUNDING;
	  int output_shift = 0;
	  int output_rounding = 0;

	  final_shift += last_shift_bits;
	  if (final_shift > 0)
	    final_rounding = 1 << (final_shift - 1);

	  output_shift += last_shift_bits;
	  if (output_shift > 0)
	    output_rounding = 1 << (output_shift - 1);

	  // step 1
	  step[ 0] = input[0] + input[15];
	  step[ 1] = input[1] + input[14];
	  step[ 2] = input[2] + input[13];
	  step[ 3] = input[3] + input[12];
	  step[ 4] = input[4] + input[11];
	  step[ 5] = input[5] + input[10];
	  step[ 6] = input[6] + input[ 9];
	  step[ 7] = input[7] + input[ 8];
	  step[ 8] = input[7] - input[ 8];
	  step[ 9] = input[6] - input[ 9];
	  step[10] = input[5] - input[10];
	  step[11] = input[4] - input[11];
	  step[12] = input[3] - input[12];
	  step[13] = input[2] - input[13];
	  step[14] = input[1] - input[14];
	  step[15] = input[0] - input[15];

	  // step 2
	  output[0] = step[0] + step[7];
	  output[1] = step[1] + step[6];
	  output[2] = step[2] + step[5];
	  output[3] = step[3] + step[4];
	  output[4] = step[3] - step[4];
	  output[5] = step[2] - step[5];
	  output[6] = step[1] - step[6];
	  output[7] = step[0] - step[7];

	  temp1 = step[ 8] * C7;
	  temp2 = step[15] * C9;
	  output[ 8] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[ 9] * C11;
	  temp2 = step[14] * C5;
	  output[ 9] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[10] * C3;
	  temp2 = step[13] * C13;
	  output[10] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[11] * C15;
	  temp2 = step[12] * C1;
	  output[11] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[11] * C1;
	  temp2 = step[12] * C15;
	  output[12] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[10] * C13;
	  temp2 = step[13] * C3;
	  output[13] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[ 9] * C5;
	  temp2 = step[14] * C11;
	  output[14] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[ 8] * C9;
	  temp2 = step[15] * C7;
	  output[15] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

	  // step 3
	  step[ 0] = output[0] + output[3];
	  step[ 1] = output[1] + output[2];
	  step[ 2] = output[1] - output[2];
	  step[ 3] = output[0] - output[3];

	  temp1 = output[4] * C14;
	  temp2 = output[7] * C2;
	  step[ 4] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = output[5] * C10;
	  temp2 = output[6] * C6;
	  step[ 5] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = output[5] * C6;
	  temp2 = output[6] * C10;
	  step[ 6] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = output[4] * C2;
	  temp2 = output[7] * C14;
	  step[ 7] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

	  step[ 8] = output[ 8] + output[11];
	  step[ 9] = output[ 9] + output[10];
	  step[10] = output[ 9] - output[10];
	  step[11] = output[ 8] - output[11];

	  step[12] = output[12] + output[15];
	  step[13] = output[13] + output[14];
	  step[14] = output[13] - output[14];
	  step[15] = output[12] - output[15];

	  // step 4
	  output[ 0] = (step[ 0] + step[ 1] + output_rounding) >> output_shift;
	  output[ 8] = (step[ 0] - step[ 1] + output_rounding) >> output_shift;

	  temp1 = step[2] * C12;
	  temp2 = step[3] * C4;
	  temp1 = (temp1 + temp2 + final_rounding) >> final_shift;
	  output[ 4] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = step[2] * C4;
	  temp2 = step[3] * C12;
	  temp1 = (temp2 - temp1 + final_rounding) >> final_shift;
	  output[12] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

	  output[ 2] = (2 * ((step[4] + step[ 5]) * C8) + final_rounding)
	      >> final_shift;
	  output[14] = (2 * ((step[7] - step[ 6]) * C8) + final_rounding)
	      >> final_shift;

	  temp1 = step[4] - step[5];
	  temp2 = step[6] + step[7];
	  output[ 6] = (temp1 + temp2 + output_rounding) >> output_shift;
	  output[10] = (temp1 - temp2 + output_rounding) >> output_shift;

	  intermediate[8] = step[8] + step[14];
	  intermediate[9] = step[9] + step[15];

	  temp1 = intermediate[8] * C12;
	  temp2 = intermediate[9] * C4;
	  temp1 = (temp1 - temp2 + final_rounding) >> final_shift;
	  output[3] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = intermediate[8] * C4;
	  temp2 = intermediate[9] * C12;
	  temp1 = (temp2 + temp1 + final_rounding) >> final_shift;
	  output[13] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

	  output[ 9] = (2 * ((step[10] + step[11]) * C8) + final_rounding)
	      >> final_shift;

	  intermediate[11] = step[10] - step[11];
	  intermediate[12] = step[12] + step[13];
	  intermediate[13] = step[12] - step[13];
	  intermediate[14] = step[ 8] - step[14];
	  intermediate[15] = step[ 9] - step[15];

	  output[15] = (intermediate[11] + intermediate[12] + output_rounding)
	      >> output_shift;
	  // Negate first, then add the rounding offset. The previous form,
	  // -(a - b + rounding) >> shift, negated the rounding term as well, which
	  // biased the result downwards (an all-zero input gave -1 when
	  // last_shift_bits was 1).
	  output[ 1] = (intermediate[12] - intermediate[11] + output_rounding)
	      >> output_shift;

	  output[ 7] = (2 * (intermediate[13] * C8) + final_rounding) >> final_shift;

	  temp1 = intermediate[14] * C12;
	  temp2 = intermediate[15] * C4;
	  temp1 = (temp1 - temp2 + final_rounding) >> final_shift;
	  output[11] = (-2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

	  temp1 = intermediate[14] * C4;
	  temp2 = intermediate[15] * C12;
	  temp1 = (temp2 + temp1 + final_rounding) >> final_shift;
	  output[ 5] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;
	}

	1305

	/*
	 * Integer 16x16 forward DCT (compiled when TEST_INT_16x16_DCT is non-zero).
	 *
	 * input - 16x16 block of int16_t; consecutive rows are (pitch >> 1) elements
	 *         apart.
	 * out   - receives 256 coefficients with a row stride of 16.
	 * pitch - input row stride; it is halved to obtain the stride in elements
	 *         (presumably a byte count -- confirm against callers).
	 *
	 * The column pass gathers each column into a temporary, transforms it with
	 * no extra shift, and scatters the result into a local 16x16 buffer. The
	 * row pass transforms each row of that buffer straight into `out`, with one
	 * extra bit of right shift.
	 *
	 * Both buffers are pointers: the pointer declarators had been lost from the
	 * parameter list, although the body indexes and advances them.
	 */
	void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {
	  int shortpitch = pitch >> 1;
	  int i, j;
	  int16_t output[256];
	  int16_t *outptr = &output[0];

	  // First transform columns
	  for (i = 0; i < 16; i++) {
	    int16_t temp_in[16];
	    int16_t temp_out[16];
	    for (j = 0; j < 16; j++)
	      temp_in[j] = input[j * shortpitch + i];
	    dct16x16_1d(temp_in, temp_out, 0);
	    for (j = 0; j < 16; j++)
	      output[j * 16 + i] = temp_out[j];
	  }

	  // Then transform rows
	  for (i = 0; i < 16; ++i) {
	    dct16x16_1d(outptr, out, 1);
	    outptr += 16;
	    out += 16;
	  }
	}

	1330 #undef RIGHT_SHIFT

	1331 #undef ROUNDING

	1332 #endif

OLD	NEW

« libvpx.gyp ('K') | « source/libvpx/vp9/encoder/vp9_boolhuff.c ('k') | source/libvpx/vp9/encoder/vp9_encodeframe.h » ('j') | no next file with comments »