source/libvpx/vp9/common/vp9_idctllm.c - Issue 11555023: libvpx: Add VP9 decoder.

Side by Side Diff: source/libvpx/vp9/common/vp9_idctllm.c

Issue 11555023: libvpx: Add VP9 decoder. (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/libvpx/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11

	12 /****************************************************************************

	13 * Notes:

	14 *

	15 * This implementation makes use of 16 bit fixed point version of two multiply

	16 * constants:

	17 * 1. sqrt(2) * cos (pi/8)

	18 * 2. sqrt(2) * sin (pi/8)

	19 * Because the first constant is bigger than 1, to maintain the same 16 bit

	20 * fixed point precision as the second one, we use a trick of

	21 * x * a = x + x*(a-1)

	22 * so

	23 * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).

	24 **************************************************************************/

	25 #include <assert.h>

	26 #include <math.h>

	27 #include "vpx_ports/config.h"

	28 #include "vp9/common/vp9_systemdependent.h"

	29

	30 #include "vp9/common/vp9_blockd.h"

	31

	/* Q16 fixed-point multipliers used by the 4x4 inverse DCT below:
	 *   cospi8sqrt2minus1 = round(65536 * (sqrt(2) * cos(pi / 8) - 1))
	 *   sinpi8sqrt2       = round(65536 * sqrt(2) * sin(pi / 8))
	 * rounding is the bias added before each ">> 16"; it is currently zero. */
	static const int cospi8sqrt2minus1 = 20091;
	static const int sinpi8sqrt2 = 35468;
	static const int rounding = 0;

	35

	// TODO: these transforms can be further converted into integer forms
	// for complexity optimization

	/* 4-point inverse DCT matrix, row-major 4x4. Each row holds the weights
	 * applied to the four input coefficients to produce one output sample
	 * (see the matrix products in vp9_ihtllm_float_c). */
	static const float idct_4[16] = {
	  0.500000000000000,  0.653281482438188,  0.500000000000000,  0.270598050073099,
	  0.500000000000000,  0.270598050073099, -0.500000000000000, -0.653281482438188,
	  0.500000000000000, -0.270598050073099, -0.500000000000000,  0.653281482438188,
	  0.500000000000000, -0.653281482438188,  0.500000000000000, -0.270598050073099
	};

	44

	/* 4-point inverse ADST matrix, row-major 4x4, same layout as idct_4. */
	static const float iadst_4[16] = {
	  0.228013428883779,  0.577350269189626,  0.656538502008139,  0.428525073124360,
	  0.428525073124360,  0.577350269189626, -0.228013428883779, -0.656538502008139,
	  0.577350269189626,  0,                 -0.577350269189626,  0.577350269189626,
	  0.656538502008139, -0.577350269189626,  0.428525073124359, -0.228013428883779
	};

	51

	/* 8-point inverse DCT matrix, row-major 8x8 (each matrix row spans two
	 * source lines). */
	static const float idct_8[64] = {
	  0.353553390593274,  0.490392640201615,  0.461939766255643,  0.415734806151273,
	  0.353553390593274,  0.277785116509801,  0.191341716182545,  0.097545161008064,
	  0.353553390593274,  0.415734806151273,  0.191341716182545, -0.097545161008064,
	 -0.353553390593274, -0.490392640201615, -0.461939766255643, -0.277785116509801,
	  0.353553390593274,  0.277785116509801, -0.191341716182545, -0.490392640201615,
	 -0.353553390593274,  0.097545161008064,  0.461939766255643,  0.415734806151273,
	  0.353553390593274,  0.097545161008064, -0.461939766255643, -0.277785116509801,
	  0.353553390593274,  0.415734806151273, -0.191341716182545, -0.490392640201615,
	  0.353553390593274, -0.097545161008064, -0.461939766255643,  0.277785116509801,
	  0.353553390593274, -0.415734806151273, -0.191341716182545,  0.490392640201615,
	  0.353553390593274, -0.277785116509801, -0.191341716182545,  0.490392640201615,
	 -0.353553390593274, -0.097545161008064,  0.461939766255643, -0.415734806151273,
	  0.353553390593274, -0.415734806151273,  0.191341716182545,  0.097545161008064,
	 -0.353553390593274,  0.490392640201615, -0.461939766255643,  0.277785116509801,
	  0.353553390593274, -0.490392640201615,  0.461939766255643, -0.415734806151273,
	  0.353553390593274, -0.277785116509801,  0.191341716182545, -0.097545161008064
	};

	70

	/* 8-point inverse ADST matrix, row-major 8x8 (each matrix row spans two
	 * source lines). */
	static const float iadst_8[64] = {
	  0.089131608307533,  0.255357107325376,  0.387095214016349,  0.466553967085785,
	  0.483002021635509,  0.434217976756762,  0.326790388032145,  0.175227946595735,
	  0.175227946595735,  0.434217976756762,  0.466553967085785,  0.255357107325376,
	 -0.089131608307533, -0.387095214016348, -0.483002021635509, -0.326790388032145,
	  0.255357107325376,  0.483002021635509,  0.175227946595735, -0.326790388032145,
	 -0.466553967085785, -0.089131608307533,  0.387095214016349,  0.434217976756762,
	  0.326790388032145,  0.387095214016349, -0.255357107325376, -0.434217976756762,
	  0.175227946595735,  0.466553967085786, -0.089131608307534, -0.483002021635509,
	  0.387095214016349,  0.175227946595735, -0.483002021635509,  0.089131608307533,
	  0.434217976756762, -0.326790388032145, -0.255357107325377,  0.466553967085785,
	  0.434217976756762, -0.089131608307533, -0.326790388032145,  0.483002021635509,
	 -0.255357107325376, -0.175227946595735,  0.466553967085785, -0.387095214016348,
	  0.466553967085785, -0.326790388032145,  0.089131608307533,  0.175227946595735,
	 -0.387095214016348,  0.483002021635509, -0.434217976756762,  0.255357107325376,
	  0.483002021635509, -0.466553967085785,  0.434217976756762, -0.387095214016348,
	  0.326790388032145, -0.255357107325375,  0.175227946595736, -0.089131608307532
	};

	89

	/* Integer form of idct_4: entries scaled by 2^14 and rounded
	 * (0.5 -> 8192). Row-major 4x4. */
	static const int16_t idct_i4[16] = {
	  8192,  10703,  8192,   4433,
	  8192,   4433, -8192, -10703,
	  8192,  -4433, -8192,  10703,
	  8192, -10703,  8192,  -4433
	};

	96

	/* Integer form of iadst_4: entries scaled by 2^14 and rounded.
	 * Row-major 4x4. */
	static const int16_t iadst_i4[16] = {
	   3736,  9459, 10757,   7021,
	   7021,  9459, -3736, -10757,
	   9459,     0, -9459,   9459,
	  10757, -9459,  7021,  -3736
	};

	103

	/* Integer form of idct_8: entries scaled by 2^14 and rounded.
	 * Row-major 8x8; each matrix row spans two source lines. */
	static const int16_t idct_i8[64] = {
	   5793,  8035,  7568,  6811,
	   5793,  4551,  3135,  1598,
	   5793,  6811,  3135, -1598,
	  -5793, -8035, -7568, -4551,
	   5793,  4551, -3135, -8035,
	  -5793,  1598,  7568,  6811,
	   5793,  1598, -7568, -4551,
	   5793,  6811, -3135, -8035,
	   5793, -1598, -7568,  4551,
	   5793, -6811, -3135,  8035,
	   5793, -4551, -3135,  8035,
	  -5793, -1598,  7568, -6811,
	   5793, -6811,  3135,  1598,
	  -5793,  8035, -7568,  4551,
	   5793, -8035,  7568, -6811,
	   5793, -4551,  3135, -1598
	};

	122

	/* Integer form of iadst_8: entries scaled by 2^14 and rounded.
	 * Row-major 8x8; each matrix row spans two source lines. */
	static const int16_t iadst_i8[64] = {
	   1460,  4184,  6342,  7644,
	   7914,  7114,  5354,  2871,
	   2871,  7114,  7644,  4184,
	  -1460, -6342, -7914, -5354,
	   4184,  7914,  2871, -5354,
	  -7644, -1460,  6342,  7114,
	   5354,  6342, -4184, -7114,
	   2871,  7644, -1460, -7914,
	   6342,  2871, -7914,  1460,
	   7114, -5354, -4184,  7644,
	   7114, -1460, -5354,  7914,
	  -4184, -2871,  7644, -6342,
	   7644, -5354,  1460,  2871,
	  -6342,  7914, -7114,  4184,
	   7914, -7644,  7114, -6342,
	   5354, -4184,  2871, -1460
	};

	141

	/* 16-point inverse DCT matrix, row-major 16x16 (each matrix row spans two
	 * source lines). Declared const like the other lookup tables; it is only
	 * ever read. */
	static const float idct_16[256] = {
	  0.250000,  0.351851,  0.346760,  0.338330,  0.326641,  0.311806,  0.293969,  0.273300,
	  0.250000,  0.224292,  0.196424,  0.166664,  0.135299,  0.102631,  0.068975,  0.034654,
	  0.250000,  0.338330,  0.293969,  0.224292,  0.135299,  0.034654, -0.068975, -0.166664,
	 -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631,
	  0.250000,  0.311806,  0.196424,  0.034654, -0.135299, -0.273300, -0.346760, -0.338330,
	 -0.250000, -0.102631,  0.068975,  0.224292,  0.326641,  0.351851,  0.293969,  0.166664,
	  0.250000,  0.273300,  0.068975, -0.166664, -0.326641, -0.338330, -0.196424,  0.034654,
	  0.250000,  0.351851,  0.293969,  0.102631, -0.135299, -0.311806, -0.346760, -0.224292,
	  0.250000,  0.224292, -0.068975, -0.311806, -0.326641, -0.102631,  0.196424,  0.351851,
	  0.250000, -0.034654, -0.293969, -0.338330, -0.135299,  0.166664,  0.346760,  0.273300,
	  0.250000,  0.166664, -0.196424, -0.351851, -0.135299,  0.224292,  0.346760,  0.102631,
	 -0.250000, -0.338330, -0.068975,  0.273300,  0.326641,  0.034654, -0.293969, -0.311806,
	  0.250000,  0.102631, -0.293969, -0.273300,  0.135299,  0.351851,  0.068975, -0.311806,
	 -0.250000,  0.166664,  0.346760,  0.034654, -0.326641, -0.224292,  0.196424,  0.338330,
	  0.250000,  0.034654, -0.346760, -0.102631,  0.326641,  0.166664, -0.293969, -0.224292,
	  0.250000,  0.273300, -0.196424, -0.311806,  0.135299,  0.338330, -0.068975, -0.351851,
	  0.250000, -0.034654, -0.346760,  0.102631,  0.326641, -0.166664, -0.293969,  0.224292,
	  0.250000, -0.273300, -0.196424,  0.311806,  0.135299, -0.338330, -0.068975,  0.351851,
	  0.250000, -0.102631, -0.293969,  0.273300,  0.135299, -0.351851,  0.068975,  0.311806,
	 -0.250000, -0.166664,  0.346760, -0.034654, -0.326641,  0.224292,  0.196424, -0.338330,
	  0.250000, -0.166664, -0.196424,  0.351851, -0.135299, -0.224292,  0.346760, -0.102631,
	 -0.250000,  0.338330, -0.068975, -0.273300,  0.326641, -0.034654, -0.293969,  0.311806,
	  0.250000, -0.224292, -0.068975,  0.311806, -0.326641,  0.102631,  0.196424, -0.351851,
	  0.250000,  0.034654, -0.293969,  0.338330, -0.135299, -0.166664,  0.346760, -0.273300,
	  0.250000, -0.273300,  0.068975,  0.166664, -0.326641,  0.338330, -0.196424, -0.034654,
	  0.250000, -0.351851,  0.293969, -0.102631, -0.135299,  0.311806, -0.346760,  0.224292,
	  0.250000, -0.311806,  0.196424, -0.034654, -0.135299,  0.273300, -0.346760,  0.338330,
	 -0.250000,  0.102631,  0.068975, -0.224292,  0.326641, -0.351851,  0.293969, -0.166664,
	  0.250000, -0.338330,  0.293969, -0.224292,  0.135299, -0.034654, -0.068975,  0.166664,
	 -0.250000,  0.311806, -0.346760,  0.351851, -0.326641,  0.273300, -0.196424,  0.102631,
	  0.250000, -0.351851,  0.346760, -0.338330,  0.326641, -0.311806,  0.293969, -0.273300,
	  0.250000, -0.224292,  0.196424, -0.166664,  0.135299, -0.102631,  0.068975, -0.034654
	};

	176

	/* 16-point inverse ADST matrix, row-major 16x16 (each matrix row spans two
	 * source lines). Declared const like the other lookup tables; it is only
	 * ever read. */
	static const float iadst_16[256] = {
	  0.033094,  0.098087,  0.159534,  0.215215,  0.263118,  0.301511,  0.329007,  0.344612,
	  0.347761,  0.338341,  0.316693,  0.283599,  0.240255,  0.188227,  0.129396,  0.065889,
	  0.065889,  0.188227,  0.283599,  0.338341,  0.344612,  0.301511,  0.215215,  0.098087,
	 -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396,
	  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,  0.000000, -0.188227, -0.316693,
	 -0.344612, -0.263118, -0.098087,  0.098087,  0.263118,  0.344612,  0.316693,  0.188227,
	  0.129396,  0.316693,  0.329007,  0.159534, -0.098087, -0.301511, -0.338341, -0.188227,
	  0.065889,  0.283599,  0.344612,  0.215215, -0.033094, -0.263118, -0.347761, -0.240255,
	  0.159534,  0.344612,  0.240255, -0.065889, -0.316693, -0.301511, -0.033094,  0.263118,
	  0.338341,  0.129396, -0.188227, -0.347761, -0.215215,  0.098087,  0.329007,  0.283599,
	  0.188227,  0.344612,  0.098087, -0.263118, -0.316693, -0.000000,  0.316693,  0.263118,
	 -0.098087, -0.344612, -0.188227,  0.188227,  0.344612,  0.098087, -0.263118, -0.316693,
	  0.215215,  0.316693, -0.065889, -0.347761, -0.098087,  0.301511,  0.240255, -0.188227,
	 -0.329007,  0.033094,  0.344612,  0.129396, -0.283599, -0.263118,  0.159534,  0.338341,
	  0.240255,  0.263118, -0.215215, -0.283599,  0.188227,  0.301511, -0.159534, -0.316693,
	  0.129396,  0.329007, -0.098087, -0.338341,  0.065889,  0.344612, -0.033094, -0.347761,
	  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,  0.000000, -0.344612,  0.098087,
	  0.316693, -0.188227, -0.263118,  0.263118,  0.188227, -0.316693, -0.098087,  0.344612,
	  0.283599,  0.098087, -0.347761,  0.129396,  0.263118, -0.301511, -0.065889,  0.344612,
	 -0.159534, -0.240255,  0.316693,  0.033094, -0.338341,  0.188227,  0.215215, -0.329007,
	  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000,
	 -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,  0.000000, -0.301511,  0.301511,
	  0.316693, -0.098087, -0.188227,  0.344612, -0.263118, -0.000000,  0.263118, -0.344612,
	  0.188227,  0.098087, -0.316693,  0.316693, -0.098087, -0.188227,  0.344612, -0.263118,
	  0.329007, -0.188227, -0.033094,  0.240255, -0.344612,  0.301511, -0.129396, -0.098087,
	  0.283599, -0.347761,  0.263118, -0.065889, -0.159534,  0.316693, -0.338341,  0.215215,
	  0.338341, -0.263118,  0.129396,  0.033094, -0.188227,  0.301511, -0.347761,  0.316693,
	 -0.215215,  0.065889,  0.098087, -0.240255,  0.329007, -0.344612,  0.283599, -0.159534,
	  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,  0.000000, -0.098087,  0.188227,
	 -0.263118,  0.316693, -0.344612,  0.344612, -0.316693,  0.263118, -0.188227,  0.098087,
	  0.347761, -0.344612,  0.338341, -0.329007,  0.316693, -0.301511,  0.283599, -0.263118,
	  0.240255, -0.215215,  0.188227, -0.159534,  0.129396, -0.098087,  0.065889, -0.033094
	};

	211

	/* Integer form of idct_16: entries scaled by 2^14 and rounded
	 * (0.25 -> 4096). Row-major 16x16; each matrix row spans two source
	 * lines. */
	static const int16_t idct_i16[256] = {
	   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,
	   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,
	   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,
	  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,
	   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,
	  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,
	   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,
	   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,
	   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,
	   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,
	   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,
	  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,
	   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,
	  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,
	   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,
	   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,
	   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,
	   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,
	   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,
	  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,
	   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,
	  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,
	   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,
	   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,
	   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,
	   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,
	   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,
	  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,
	   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,
	  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,
	   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,
	   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568
	};

	246

	247 static const int16_t iadst_i16[256] = {

	248 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646,

	249 5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080,

	250 1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607,

	251 -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,

	252 1607, 4311, 5646, 5189, 3084, 0, -3084, -5189,

	253 -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084,

	254 2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084,

	255 1080, 4646, 5646, 3526, -542, -4311, -5698, -3936,

	256 2614, 5646, 3936, -1080, -5189, -4940, -542, 4311,

	257 5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646,

	258 3084, 5646, 1607, -4311, -5189, 0, 5189, 4311,

	259 -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189,

	260 3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084,

	261 -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543,

	262 3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189,

	263 2120, 5390, -1607, -5543, 1080, 5646, -542, -5698,

	264 4311, 3084, -5189, -1607, 5646, 0, -5646, 1607,

	265 5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646,

	266 4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646,

	267 -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390,

	268 4940, 0, -4940, 4940, 0, -4940, 4940, 0,

	269 -4940, 4940, 0, -4940, 4940, 0, -4940, 4940,

	270 5189, -1607, -3084, 5646, -4311, 0, 4311, -5646,

	271 3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311,

	272 5390, -3084, -542, 3936, -5646, 4940, -2120, -1607,

	273 4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526,

	274 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189,

	275 -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614,

	276 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084,

	277 -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607,

	278 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311,

	279 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542

	280 };

	281

	282 void vp9_ihtllm_float_c(const int16_t input, int16_t output, int pitch,

	283 TX_TYPE tx_type, int tx_dim) {

	284 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	285 {

	286 int i, j, k;

	287 float bufa[256], bufb[256]; // buffers are for floating-point test purpose

	288 // the implementation could be simplified in

	289 // conjunction with integer transform

	290 const int16_t *ip = input;

	291 int16_t *op = output;

	292 int shortpitch = pitch >> 1;

	293

	294 float *pfa = &bufa[0];

	295 float *pfb = &bufb[0];

	296

	297 // pointers to vertical and horizontal transforms

	298 const float ptv, pth;

	299

	300 assert(tx_type != DCT_DCT);

	301 // load and convert residual array into floating-point

	302 for(j = 0; j < tx_dim; j++) {

	303 for(i = 0; i < tx_dim; i++) {

	304 pfa[i] = (float)ip[i];

	305 }

	306 pfa += tx_dim;

	307 ip += tx_dim;

	308 }

	309

	310 // vertical transformation

	311 pfa = &bufa[0];

	312 pfb = &bufb[0];

	313

	314 switch(tx_type) {

	315 case ADST_ADST :

	316 case ADST_DCT :

	317 ptv = (tx_dim == 4) ? &iadst_4[0] :

	318 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);

	319 break;

	320

	321 default :

	322 ptv = (tx_dim == 4) ? &idct_4[0] :

	323 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);

	324 break;

	325 }

	326

	327 for(j = 0; j < tx_dim; j++) {

	328 for(i = 0; i < tx_dim; i++) {

	329 pfb[i] = 0 ;

	330 for(k = 0; k < tx_dim; k++) {

	331 pfb[i] += ptv[k] * pfa[(k * tx_dim)];

	332 }

	333 pfa += 1;

	334 }

	335

	336 pfb += tx_dim;

	337 ptv += tx_dim;

	338 pfa = &bufa[0];

	339 }

	340

	341 // horizontal transformation

	342 pfa = &bufa[0];

	343 pfb = &bufb[0];

	344

	345 switch(tx_type) {

	346 case ADST_ADST :

	347 case DCT_ADST :

	348 pth = (tx_dim == 4) ? &iadst_4[0] :

	349 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);

	350 break;

	351

	352 default :

	353 pth = (tx_dim == 4) ? &idct_4[0] :

	354 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);

	355 break;

	356 }

	357

	358 for(j = 0; j < tx_dim; j++) {

	359 for(i = 0; i < tx_dim; i++) {

	360 pfa[i] = 0;

	361 for(k = 0; k < tx_dim; k++) {

	362 pfa[i] += pfb[k] * pth[k];

	363 }

	364 pth += tx_dim;

	365 }

	366

	367 pfa += tx_dim;

	368 pfb += tx_dim;

	369

	370 switch(tx_type) {

	371 case ADST_ADST :

	372 case DCT_ADST :

	373 pth = (tx_dim == 4) ? &iadst_4[0] :

	374 ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);

	375 break;

	376

	377 default :

	378 pth = (tx_dim == 4) ? &idct_4[0] :

	379 ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);

	380 break;

	381 }

	382 }

	383

	384 // convert to short integer format and load BLOCKD buffer

	385 op = output;

	386 pfa = &bufa[0];

	387

	388 for(j = 0; j < tx_dim; j++) {

	389 for(i = 0; i < tx_dim; i++) {

	390 op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) :

	391 -(int16_t)( - pfa[i] / 8 + 0.49);

	392 }

	393

	394 op += shortpitch;

	395 pfa += tx_dim;

	396 }

	397 }

	398 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	399 }

	400

	401 /* Converted the transforms to integer form. */

	402 #define VERTICAL_SHIFT 14 // 16

	403 #define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)

	404 #define HORIZONTAL_SHIFT 17 // 15

	405 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)

	406 void vp9_ihtllm_c(const int16_t input, int16_t output, int pitch,

	407 TX_TYPE tx_type, int tx_dim) {

	408 int i, j, k;

	409 int16_t imbuf[256];

	410

	411 const int16_t *ip = input;

	412 int16_t *op = output;

	413 int16_t *im = &imbuf[0];

	414

	415 /* pointers to vertical and horizontal transforms. */

	416 const int16_t ptv = NULL, pth = NULL;

	417 int shortpitch = pitch >> 1;

	418

	419 switch (tx_type) {

	420 case ADST_ADST :

	421 ptv = pth = (tx_dim == 4) ? &iadst_i4[0]

	422 : ((tx_dim == 8) ? &iadst_i8[0]

	423 : &iadst_i16[0]);

	424 break;

	425 case ADST_DCT :

	426 ptv = (tx_dim == 4) ? &iadst_i4[0]

	427 : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);

	428 pth = (tx_dim == 4) ? &idct_i4[0]

	429 : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);

	430 break;

	431 case DCT_ADST :

	432 ptv = (tx_dim == 4) ? &idct_i4[0]

	433 : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);

	434 pth = (tx_dim == 4) ? &iadst_i4[0]

	435 : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);

	436 break;

	437 case DCT_DCT :

	438 ptv = pth = (tx_dim == 4) ? &idct_i4[0]

	439 : ((tx_dim == 8) ? &idct_i8[0]

	440 : &idct_i16[0]);

	441 break;

	442 default:

	443 assert(0);

	444 break;

	445 }

	446

	447 /* vertical transformation */

	448 for (j = 0; j < tx_dim; j++) {

	449 for (i = 0; i < tx_dim; i++) {

	450 int temp = 0;

	451

	452 for (k = 0; k < tx_dim; k++) {

	453 temp += ptv[k] * ip[(k * tx_dim)];

	454 }

	455

	456 im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);

	457 ip++;

	458 }

	459 im += tx_dim; // 16

	460 ptv += tx_dim;

	461 ip = input;

	462 }

	463

	464 /* horizontal transformation */

	465 im = &imbuf[0];

	466

	467 for (j = 0; j < tx_dim; j++) {

	468 const int16_t *pthc = pth;

	469

	470 for (i = 0; i < tx_dim; i++) {

	471 int temp = 0;

	472

	473 for (k = 0; k < tx_dim; k++) {

	474 temp += im[k] * pthc[k];

	475 }

	476

	477 op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);

	478 pthc += tx_dim;

	479 }

	480

	481 im += tx_dim; // 16

	482 op += shortpitch;

	483 }

	484 }

	485

	486 void vp9_short_idct4x4llm_c(short input, short output, int pitch) {

	487 int i;

	488 int a1, b1, c1, d1;

	489

	490 short *ip = input;

	491 short *op = output;

	492 int temp1, temp2;

	493 int shortpitch = pitch >> 1;

	494

	495 for (i = 0; i < 4; i++) {

	496 a1 = ip[0] + ip[8];

	497 b1 = ip[0] - ip[8];

	498

	499 temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;

	500 temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);

	501 c1 = temp1 - temp2;

	502

	503 temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);

	504 temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;

	505 d1 = temp1 + temp2;

	506

	507 op[shortpitch * 0] = a1 + d1;

	508 op[shortpitch * 3] = a1 - d1;

	509

	510 op[shortpitch * 1] = b1 + c1;

	511 op[shortpitch * 2] = b1 - c1;

	512

	513 ip++;

	514 op++;

	515 }

	516

	517 ip = output;

	518 op = output;

	519

	520 for (i = 0; i < 4; i++) {

	521 a1 = ip[0] + ip[2];

	522 b1 = ip[0] - ip[2];

	523

	524 temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;

	525 temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);

	526 c1 = temp1 - temp2;

	527

	528 temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);

	529 temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;

	530 d1 = temp1 + temp2;

	531

	532 op[0] = (a1 + d1 + 16) >> 5;

	533 op[3] = (a1 - d1 + 16) >> 5;

	534

	535 op[1] = (b1 + c1 + 16) >> 5;

	536 op[2] = (b1 - c1 + 16) >> 5;

	537

	538 ip += shortpitch;

	539 op += shortpitch;

	540 }

	541 }

	542

	/* DC-only 4x4 inverse DCT: fills the 4x4 output block with
	 * (input[0] + 16) >> 5. Only input[0] is read. Consecutive output rows
	 * are (pitch >> 1) shorts apart. */
	void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
	  int r, c;
	  short *op = output;
	  const int shortpitch = pitch >> 1;
	  const int a1 = (input[0] + 16) >> 5;

	  for (r = 0; r < 4; r++) {
	    for (c = 0; c < 4; c++) {
	      op[c] = a1;
	    }
	    op += shortpitch;
	  }
	}

	557

	/* DC-only 4x4 reconstruction: adds (input_dc + 16) >> 5 to each of the
	 * 4x4 prediction pixels, clamps to [0, 255] and stores into dst_ptr.
	 * pitch is the row stride of pred_ptr, stride the row stride of dst_ptr. */
	void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
	                            unsigned char *dst_ptr, int pitch, int stride) {
	  const int a1 = (input_dc + 16) >> 5;
	  int r, c;

	  for (r = 0; r < 4; r++) {
	    for (c = 0; c < 4; c++) {
	      int a = a1 + pred_ptr[c];

	      if (a < 0) {
	        a = 0;
	      } else if (a > 255) {
	        a = 255;
	      }

	      dst_ptr[c] = (unsigned char)a;
	    }

	    dst_ptr += stride;
	    pred_ptr += pitch;
	  }
	}

	580

	/* 4x4 inverse Walsh-Hadamard transform.
	 * input and output are dense 4x4 blocks (row stride 4). The first pass
	 * transforms each row of input into output, the second transforms each
	 * column of output in place. Every stage halves its result; only the
	 * first output of each stage gets a +1 rounding bias. */
	void vp9_short_inv_walsh4x4_c(short *input, short *output) {
	  int i;
	  int a1, b1, c1, d1;
	  short *ip = input;
	  short *op = output;

	  /* rows */
	  for (i = 0; i < 4; i++) {
	    a1 = ip[0] + ip[3];
	    b1 = ip[1] + ip[2];
	    c1 = ip[1] - ip[2];
	    d1 = ip[0] - ip[3];

	    op[0] = (a1 + b1 + 1) >> 1;
	    op[1] = (c1 + d1) >> 1;
	    op[2] = (a1 - b1) >> 1;
	    op[3] = (d1 - c1) >> 1;

	    ip += 4;
	    op += 4;
	  }

	  /* columns, in place */
	  ip = output;
	  op = output;
	  for (i = 0; i < 4; i++) {
	    a1 = ip[0] + ip[12];
	    b1 = ip[4] + ip[8];
	    c1 = ip[4] - ip[8];
	    d1 = ip[0] - ip[12];

	    op[0] = (a1 + b1 + 1) >> 1;
	    op[4] = (c1 + d1) >> 1;
	    op[8] = (a1 - b1) >> 1;
	    op[12] = (d1 - c1) >> 1;

	    ip++;
	    op++;
	  }
	}

	617

	/* DC-only 4x4 inverse Walsh-Hadamard transform: produces the same values
	 * the two-pass transform yields when only in[0] is non-zero. Only in[0]
	 * is read; out is a dense 4x4 block (row stride 4). */
	void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
	  int i;
	  short tmp[4];
	  short *ip = in;
	  short *op = tmp;

	  /* row stage: first output is rounded, the others truncated */
	  op[0] = (ip[0] + 1) >> 1;
	  op[1] = op[2] = op[3] = (ip[0] >> 1);

	  /* column stage applied to each of the four row-stage values */
	  ip = tmp;
	  op = out;
	  for (i = 0; i < 4; i++) {
	    op[0] = (ip[0] + 1) >> 1;
	    op[4] = op[8] = op[12] = (ip[0] >> 1);
	    ip++;
	    op++;
	  }
	}

	636

	637 #if CONFIG_LOSSLESS

	638 void vp9_short_inv_walsh4x4_lossless_c(short input, short output) {

	639 int i;

	640 int a1, b1, c1, d1;

	641 short *ip = input;

	642 short *op = output;

	643

	644 for (i = 0; i < 4; i++) {

	645 a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;

	646 b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;

	647 c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;

	648 d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;

	649

	650 op[0] = (a1 + b1 + 1) >> 1;

	651 op[1] = (c1 + d1) >> 1;

	652 op[2] = (a1 - b1) >> 1;

	653 op[3] = (d1 - c1) >> 1;

	654

	655 ip += 4;

	656 op += 4;

	657 }

	658

	659 ip = output;

	660 op = output;

	661 for (i = 0; i < 4; i++) {

	662 a1 = ip[0] + ip[12];

	663 b1 = ip[4] + ip[8];

	664 c1 = ip[4] - ip[8];

	665 d1 = ip[0] - ip[12];

	666

	667

	668 op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	669 op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	670 op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	671 op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	672

	673 ip++;

	674 op++;

	675 }

	676 }

	677

	678 void vp9_short_inv_walsh4x4_1_lossless_c(short in, short out) {

	679 int i;

	680 short tmp[4];

	681 short *ip = in;

	682 short *op = tmp;

	683

	684 op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;

	685 op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);

	686

	687 ip = tmp;

	688 op = out;

	689 for (i = 0; i < 4; i++) {

	690 op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

	691 op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;

	692 ip++;

	693 op++;

	694 }

	695 }

	696

	697 void vp9_short_inv_walsh4x4_x8_c(short input, short output, int pitch) {

	698 int i;

	699 int a1, b1, c1, d1;

	700 short *ip = input;

	701 short *op = output;

	702 int shortpitch = pitch >> 1;

	703

	704 for (i = 0; i < 4; i++) {

	705 a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;

	706 b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;

	707 c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;

	708 d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;

	709

	710 op[0] = (a1 + b1 + 1) >> 1;

	711 op[1] = (c1 + d1) >> 1;

	712 op[2] = (a1 - b1) >> 1;

	713 op[3] = (d1 - c1) >> 1;

	714

	715 ip += 4;

	716 op += shortpitch;

	717 }

	718

	719 ip = output;

	720 op = output;

	721 for (i = 0; i < 4; i++) {

	722 a1 = ip[shortpitch * 0] + ip[shortpitch * 3];

	723 b1 = ip[shortpitch * 1] + ip[shortpitch * 2];

	724 c1 = ip[shortpitch * 1] - ip[shortpitch * 2];

	725 d1 = ip[shortpitch * 0] - ip[shortpitch * 3];

	726

	727

	728 op[shortpitch * 0] = (a1 + b1 + 1) >> 1;

	729 op[shortpitch * 1] = (c1 + d1) >> 1;

	730 op[shortpitch * 2] = (a1 - b1) >> 1;

	731 op[shortpitch * 3] = (d1 - c1) >> 1;

	732

	733 ip++;

	734 op++;

	735 }

	736 }

	737

	738 void vp9_short_inv_walsh4x4_1_x8_c(short in, short out, int pitch) {

	739 int i;

	740 short tmp[4];

	741 short *ip = in;

	742 short *op = tmp;

	743 int shortpitch = pitch >> 1;

	744

	745 op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;

	746 op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);

	747

	748

	749 ip = tmp;

	750 op = out;

	751 for (i = 0; i < 4; i++) {

	752 op[shortpitch * 0] = (ip[0] + 1) >> 1;

	753 op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;

	754 ip++;

	755 op++;

	756 }

	757 }

	758

	/* DC-only inverse Walsh reconstruction for one 4x4 block: expands input_dc
	 * with vp9_short_inv_walsh4x4_1_x8_c into a dense 4x4 residual, adds it to
	 * the prediction, clamps to [0, 255] and stores into dst_ptr.
	 * pitch is the row stride of pred_ptr, stride the row stride of dst_ptr. */
	void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
	                                 unsigned char *dst_ptr,
	                                 int pitch, int stride) {
	  int r, c;
	  short tmp[16];
	  /* 4 << 1 is the residual pitch the callee halves, i.e. a row stride of
	   * 4 shorts, matching the tmp[r * 4 + c] indexing below. */
	  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);

	  for (r = 0; r < 4; r++) {
	    for (c = 0; c < 4; c++) {
	      int a = tmp[r * 4 + c] + pred_ptr[c];

	      if (a < 0) {
	        a = 0;
	      } else if (a > 255) {
	        a = 255;
	      }

	      dst_ptr[c] = (unsigned char)a;
	    }

	    dst_ptr += stride;
	    pred_ptr += pitch;
	  }
	}

	782 #endif

	783

	/* DC-only 8x8 reconstruction: adds (input_dc + 16) >> 5 to each of the
	 * 8x8 prediction pixels, clamps to [0, 255] and stores into dst_ptr.
	 * pitch is the row stride of pred_ptr, stride the row stride of dst_ptr.
	 *
	 * The area is processed as four 4x4 sub-blocks in the order top-left,
	 * top-right, bottom-left, bottom-right. Sub-block origins are computed at
	 * the top of each iteration so no pointer beyond the 8x8 area is formed. */
	void vp9_dc_only_idct_add_8x8_c(short input_dc,
	                                unsigned char *pred_ptr,
	                                unsigned char *dst_ptr,
	                                int pitch, int stride) {
	  const int a1 = (input_dc + 16) >> 5;
	  int r, c, b;

	  for (b = 0; b < 4; b++) {
	    const unsigned char *pred = pred_ptr + (b % 2) * 4 + (b / 2) * 4 * pitch;
	    unsigned char *dst = dst_ptr + (b % 2) * 4 + (b / 2) * 4 * stride;

	    for (r = 0; r < 4; r++) {
	      for (c = 0; c < 4; c++) {
	        int a = a1 + pred[c];

	        if (a < 0) {
	          a = 0;
	        } else if (a > 255) {
	          a = 255;
	        }

	        dst[c] = (unsigned char)a;
	      }

	      dst += stride;
	      pred += pitch;
	    }
	  }
	}

	813

	#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
	#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
	#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
	#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
	#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
	#define W7 565  /* 2048*sqrt(2)*cos(7*pi/16) */

	/* row (horizontal) IDCT
	 *
	 *           7                       pi         1
	 * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
	 *          l=0                      8          2
	 *
	 * where: c[0]    = 128
	 *        c[1..7] = 128*sqrt(2)
	 *
	 * Transforms the eight ints at blk[0..7] in place. Scaling by powers of
	 * two is written as multiplication because the operands may be negative
	 * and left-shifting a negative int is undefined behavior in C.
	 */
	static void idctrow(int *blk) {
	  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

	  x1 = blk[4] * 2048;
	  x2 = blk[6];
	  x3 = blk[2];
	  x4 = blk[1];
	  x5 = blk[7];
	  x6 = blk[5];
	  x7 = blk[3];

	  /* shortcut: every AC term is zero, so all outputs equal 8 * DC */
	  if (!(x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
	    const int dc = blk[0] * 8;
	    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
	           = blk[5] = blk[6] = blk[7] = dc;
	    return;
	  }

	  x0 = (blk[0] * 2048) + 128; /* for proper rounding in the fourth stage */

	  /* first stage */
	  x8 = W7 * (x4 + x5);
	  x4 = x8 + (W1 - W7) * x4;
	  x5 = x8 - (W1 + W7) * x5;
	  x8 = W3 * (x6 + x7);
	  x6 = x8 - (W3 - W5) * x6;
	  x7 = x8 - (W3 + W5) * x7;

	  /* second stage */
	  x8 = x0 + x1;
	  x0 -= x1;
	  x1 = W6 * (x3 + x2);
	  x2 = x1 - (W2 + W6) * x2;
	  x3 = x1 + (W2 - W6) * x3;
	  x1 = x4 + x6;
	  x4 -= x6;
	  x6 = x5 + x7;
	  x5 -= x7;

	  /* third stage */
	  x7 = x8 + x3;
	  x8 -= x3;
	  x3 = x0 + x2;
	  x0 -= x2;
	  x2 = (181 * (x4 + x5) + 128) >> 8;
	  x4 = (181 * (x4 - x5) + 128) >> 8;

	  /* fourth stage */
	  blk[0] = (x7 + x1) >> 8;
	  blk[1] = (x3 + x2) >> 8;
	  blk[2] = (x0 + x4) >> 8;
	  blk[3] = (x8 + x6) >> 8;
	  blk[4] = (x8 - x6) >> 8;
	  blk[5] = (x0 - x4) >> 8;
	  blk[6] = (x3 - x2) >> 8;
	  blk[7] = (x7 - x1) >> 8;
	}

	/* column (vertical) IDCT
	 *
	 *             7                         pi         1
	 * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
	 *            l=0                        8          2
	 *
	 * where: c[0]    = 1/1024
	 *        c[1..7] = (1/1024)*sqrt(2)
	 *
	 * Transforms, in place, the eight ints blk[0], blk[8], ..., blk[56]
	 * (one column of a row-major 8x8 block).
	 */
	static void idctcol(int *blk) {
	  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

	  x1 = blk[8 * 4] * 256;
	  x2 = blk[8 * 6];
	  x3 = blk[8 * 2];
	  x4 = blk[8 * 1];
	  x5 = blk[8 * 7];
	  x6 = blk[8 * 5];
	  x7 = blk[8 * 3];

	  /* shortcut: every AC term is zero, so all outputs equal the rounded DC */
	  if (!(x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
	    const int dc = (blk[8 * 0] + 32) >> 6;
	    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
	               = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
	               = blk[8 * 7] = dc;
	    return;
	  }

	  x0 = (blk[8 * 0] * 256) + 16384;

	  /* first stage */
	  x8 = W7 * (x4 + x5) + 4;
	  x4 = (x8 + (W1 - W7) * x4) >> 3;
	  x5 = (x8 - (W1 + W7) * x5) >> 3;
	  x8 = W3 * (x6 + x7) + 4;
	  x6 = (x8 - (W3 - W5) * x6) >> 3;
	  x7 = (x8 - (W3 + W5) * x7) >> 3;

	  /* second stage */
	  x8 = x0 + x1;
	  x0 -= x1;
	  x1 = W6 * (x3 + x2) + 4;
	  x2 = (x1 - (W2 + W6) * x2) >> 3;
	  x3 = (x1 + (W2 - W6) * x3) >> 3;
	  x1 = x4 + x6;
	  x4 -= x6;
	  x6 = x5 + x7;
	  x5 -= x7;

	  /* third stage */
	  x7 = x8 + x3;
	  x8 -= x3;
	  x3 = x0 + x2;
	  x0 -= x2;
	  x2 = (181 * (x4 + x5) + 128) >> 8;
	  x4 = (181 * (x4 - x5) + 128) >> 8;

	  /* fourth stage */
	  blk[8 * 0] = (x7 + x1) >> 14;
	  blk[8 * 1] = (x3 + x2) >> 14;
	  blk[8 * 2] = (x0 + x4) >> 14;
	  blk[8 * 3] = (x8 + x6) >> 14;
	  blk[8 * 4] = (x8 - x6) >> 14;
	  blk[8 * 5] = (x0 - x4) >> 14;
	  blk[8 * 6] = (x3 - x2) >> 14;
	  blk[8 * 7] = (x7 - x1) >> 14;
	}

	935

	#define TX_DIM 8

	/* Full 8x8 inverse DCT.
	 *
	 * coefs: 64 input coefficients, read as a dense 8x8 array (stride TX_DIM).
	 * block: output samples, written with a row stride of pitch / 2 shorts.
	 * pitch: output row stride; halved before being used as a short index.
	 */
	void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
	  int X[TX_DIM * TX_DIM];
	  int i, j;
	  int shortpitch = pitch >> 1;

	  /* Pre-scale: (c + 1 + (c < 0)) >> 2 for every coefficient. */
	  for (i = 0; i < TX_DIM; i++) {
	    for (j = 0; j < TX_DIM; j++) {
	      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
	                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
	    }
	  }

	  /* Row pass, then column pass, both in place on X. */
	  for (i = 0; i < 8; i++)
	    idctrow(X + 8 * i);

	  for (i = 0; i < 8; i++)
	    idctcol(X + i);

	  /* Final halving while storing into the strided output. */
	  for (i = 0; i < TX_DIM; i++) {
	    for (j = 0; j < TX_DIM; j++) {
	      block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;
	    }
	  }
	}

	960

	961 /* Row IDCT when only first 4 coefficients are non-zero. */

	962 static void idctrow10(int *blk) {

	963 int x0, x1, x2, x3, x4, x5, x6, x7, x8;

	964

	965 /* shortcut */

	966 if (!((x1 = blk[4] << 11) \| (x2 = blk[6]) \| (x3 = blk[2]) \|

	967 (x4 = blk[1]) \| (x5 = blk[7]) \| (x6 = blk[5]) \| (x7 = blk[3]))) {

	968 blk[0] = blk[1] = blk[2] = blk[3] = blk[4]

	969 = blk[5] = blk[6] = blk[7] = blk[0] << 3;

	970 return;

	971 }

	972

	973 x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */

	974 /* first stage */

	975 x5 = W7 * x4;

	976 x4 = W1 * x4;

	977 x6 = W3 * x7;

	978 x7 = -W5 * x7;

	979

	980 /* second stage */

	981 x2 = W6 * x3;

	982 x3 = W2 * x3;

	983 x1 = x4 + x6;

	984 x4 -= x6;

	985 x6 = x5 + x7;

	986 x5 -= x7;

	987

	988 /* third stage */

	989 x7 = x0 + x3;

	990 x8 = x0 - x3;

	991 x3 = x0 + x2;

	992 x0 -= x2;

	993 x2 = (181 * (x4 + x5) + 128) >> 8;

	994 x4 = (181 * (x4 - x5) + 128) >> 8;

	995

	996 /* fourth stage */

	997 blk[0] = (x7 + x1) >> 8;

	998 blk[1] = (x3 + x2) >> 8;

	999 blk[2] = (x0 + x4) >> 8;

	1000 blk[3] = (x8 + x6) >> 8;

	1001 blk[4] = (x8 - x6) >> 8;

	1002 blk[5] = (x0 - x4) >> 8;

	1003 blk[6] = (x3 - x2) >> 8;

	1004 blk[7] = (x7 - x1) >> 8;

	1005 }

	1006

	1007 /* Column (vertical) IDCT when only first 4 coefficients are non-zero. */

	1008 static void idctcol10(int *blk) {

	1009 int x0, x1, x2, x3, x4, x5, x6, x7, x8;

	1010

	1011 /* shortcut */

	1012 if (!((x1 = (blk[8 * 4] << 8)) \| (x2 = blk[8 * 6]) \| (x3 = blk[8 * 2]) \|

	1013 (x4 = blk[8 * 1]) \| (x5 = blk[8 * 7]) \| (x6 = blk[8 * 5]) \|

	1014 (x7 = blk[8 * 3]))) {

	1015 blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]

	1016 = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]

	1017 = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);

	1018 return;

	1019 }

	1020

	1021 x0 = (blk[8 * 0] << 8) + 16384;

	1022

	1023 /* first stage */

	1024 x5 = (W7 * x4 + 4) >> 3;

	1025 x4 = (W1 * x4 + 4) >> 3;

	1026 x6 = (W3 * x7 + 4) >> 3;

	1027 x7 = (-W5 * x7 + 4) >> 3;

	1028

	1029 /* second stage */

	1030 x2 = (W6 * x3 + 4) >> 3;

	1031 x3 = (W2 * x3 + 4) >> 3;

	1032 x1 = x4 + x6;

	1033 x4 -= x6;

	1034 x6 = x5 + x7;

	1035 x5 -= x7;

	1036

	1037 /* third stage */

	1038 x7 = x0 + x3;

	1039 x8 = x0 - x3;

	1040 x3 = x0 + x2;

	1041 x0 -= x2;

	1042 x2 = (181 * (x4 + x5) + 128) >> 8;

	1043 x4 = (181 * (x4 - x5) + 128) >> 8;

	1044

	1045 /* fourth stage */

	1046 blk[8 * 0] = (x7 + x1) >> 14;

	1047 blk[8 * 1] = (x3 + x2) >> 14;

	1048 blk[8 * 2] = (x0 + x4) >> 14;

	1049 blk[8 * 3] = (x8 + x6) >> 14;

	1050 blk[8 * 4] = (x8 - x6) >> 14;

	1051 blk[8 * 5] = (x0 - x4) >> 14;

	1052 blk[8 * 6] = (x3 - x2) >> 14;

	1053 blk[8 * 7] = (x7 - x1) >> 14;

	1054 }

	1055

	1056 void vp9_short_idct10_8x8_c(short coefs, short block, int pitch) {

	1057 int X[TX_DIM * TX_DIM];

	1058 int i, j;

	1059 int shortpitch = pitch >> 1;

	1060

	1061 for (i = 0; i < TX_DIM; i++) {

	1062 for (j = 0; j < TX_DIM; j++) {

	1063 X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1

	1064 + (coefs[i * TX_DIM + j] < 0)) >> 2;

	1065 }

	1066 }

	1067

	1068 /* Do first 4 row idct only since non-zero dct coefficients are all in

	1069 * upper-left 4x4 area. */

	1070 for (i = 0; i < 4; i++)

	1071 idctrow10(X + 8 * i);

	1072

	1073 for (i = 0; i < 8; i++)

	1074 idctcol10(X + i);

	1075

	1076 for (i = 0; i < TX_DIM; i++) {

	1077 for (j = 0; j < TX_DIM; j++) {

	1078 block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1;

	1079 }

	1080 }

	1081 }

	1082

	/* Inverse 2x2 Haar transform on the four values stored at indices
	 * 0, 1, 4 and 8 of a 16-entry buffer.
	 *
	 * input:  source buffer; only input[0], input[1], input[4], input[8] are read.
	 * output: 16-entry destination; cleared to zero, then indices 0, 1, 4, 8
	 *         receive the transformed values.
	 * pitch:  unused.
	 */
	void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
	  int i;
	  short *ip = input;  // 0,1, 4, 8
	  short *op = output;

	  (void)pitch;

	  for (i = 0; i < 16; i++) {
	    op[i] = 0;
	  }

	  /* Only the first output carries a +1 rounding term before the halving. */
	  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;
	  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;
	  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;
	  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;
	}

	1096

	1097

	#if 0
	// Keep a really bad float version as reference for now.
	// Direct (non-separable) evaluation of the 16x16 inverse DCT: every output
	// sample sums all 256 cosine-weighted inputs. Compiled out.
	void vp9_short_idct16x16_c(short *input, short *output, int pitch) {

	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	  {
	    double x;
	    const int short_pitch = pitch >> 1;
	    int i, j, k, l;
	    for (l = 0; l < 16; ++l) {
	      for (k = 0; k < 16; ++k) {
	        double s = 0;
	        for (i = 0; i < 16; ++i) {
	          for (j = 0; j < 16; ++j) {
	            x = cos(PI * j * (l + 0.5) / 16.0) *
	                cos(PI * i * (k + 0.5) / 16.0) * input[i * 16 + j] / 32;
	            // every non-DC index contributes an extra sqrt(2) factor
	            if (i != 0)
	              x *= sqrt(2.0);
	            if (j != 0)
	              x *= sqrt(2.0);
	            s += x;
	          }
	        }
	        output[k * short_pitch + l] = (short)round(s);
	      }
	    }
	  }
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	}
	#endif

	1127

	1128 #define TEST_INT_16x16_IDCT 1

	1129 #if !TEST_INT_16x16_IDCT

	/* Floating-point cosine table for the 16-point transform:
	 * Ck = cos(k * pi / 32), k = 1..15. */
	static const double C1 = 0.995184726672197;
	static const double C2 = 0.98078528040323;
	static const double C3 = 0.956940335732209;
	static const double C4 = 0.923879532511287;
	static const double C5 = 0.881921264348355;
	static const double C6 = 0.831469612302545;
	static const double C7 = 0.773010453362737;
	static const double C8 = 0.707106781186548;
	static const double C9 = 0.634393284163646;
	static const double C10 = 0.555570233019602;
	static const double C11 = 0.471396736825998;
	static const double C12 = 0.38268343236509;
	static const double C13 = 0.290284677254462;
	static const double C14 = 0.195090322016128;
	static const double C15 = 0.098017140329561;

	1145

	1146

	1147 static void butterfly_16x16_idct_1d(double input[16], double output[16]) {

	1148

	1149 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	1150 {

	1151 double step[16];

	1152 double intermediate[16];

	1153 double temp1, temp2;

	1154

	1155

	1156 // step 1 and 2

	1157 step[ 0] = input[0] + input[8];

	1158 step[ 1] = input[0] - input[8];

	1159

	1160 temp1 = input[4]*C12;

	1161 temp2 = input[12]*C4;

	1162

	1163 temp1 -= temp2;

	1164 temp1 *= C8;

	1165

	1166 step[ 2] = 2*(temp1);

	1167

	1168 temp1 = input[4]*C4;

	1169 temp2 = input[12]*C12;

	1170 temp1 += temp2;

	1171 temp1 = (temp1);

	1172 temp1 *= C8;

	1173 step[ 3] = 2*(temp1);

	1174

	1175 temp1 = input[2]*C8;

	1176 temp1 = 2*(temp1);

	1177 temp2 = input[6] + input[10];

	1178

	1179 step[ 4] = temp1 + temp2;

	1180 step[ 5] = temp1 - temp2;

	1181

	1182 temp1 = input[14]*C8;

	1183 temp1 = 2*(temp1);

	1184 temp2 = input[6] - input[10];

	1185

	1186 step[ 6] = temp2 - temp1;

	1187 step[ 7] = temp2 + temp1;

	1188

	1189 // for odd input

	1190 temp1 = input[3]*C12;

	1191 temp2 = input[13]*C4;

	1192 temp1 += temp2;

	1193 temp1 = (temp1);

	1194 temp1 *= C8;

	1195 intermediate[ 8] = 2*(temp1);

	1196

	1197 temp1 = input[3]*C4;

	1198 temp2 = input[13]*C12;

	1199 temp2 -= temp1;

	1200 temp2 = (temp2);

	1201 temp2 *= C8;

	1202 intermediate[ 9] = 2*(temp2);

	1203

	1204 intermediate[10] = 2(input[9]C8);

	1205 intermediate[11] = input[15] - input[1];

	1206 intermediate[12] = input[15] + input[1];

	1207 intermediate[13] = 2((input[7]C8));

	1208

	1209 temp1 = input[11]*C12;

	1210 temp2 = input[5]*C4;

	1211 temp2 -= temp1;

	1212 temp2 = (temp2);

	1213 temp2 *= C8;

	1214 intermediate[14] = 2*(temp2);

	1215

	1216 temp1 = input[11]*C4;

	1217 temp2 = input[5]*C12;

	1218 temp1 += temp2;

	1219 temp1 = (temp1);

	1220 temp1 *= C8;

	1221 intermediate[15] = 2*(temp1);

	1222

	1223 step[ 8] = intermediate[ 8] + intermediate[14];

	1224 step[ 9] = intermediate[ 9] + intermediate[15];

	1225 step[10] = intermediate[10] + intermediate[11];

	1226 step[11] = intermediate[10] - intermediate[11];

	1227 step[12] = intermediate[12] + intermediate[13];

	1228 step[13] = intermediate[12] - intermediate[13];

	1229 step[14] = intermediate[ 8] - intermediate[14];

	1230 step[15] = intermediate[ 9] - intermediate[15];

	1231

	1232 // step 3

	1233 output[0] = step[ 0] + step[ 3];

	1234 output[1] = step[ 1] + step[ 2];

	1235 output[2] = step[ 1] - step[ 2];

	1236 output[3] = step[ 0] - step[ 3];

	1237

	1238 temp1 = step[ 4]*C14;

	1239 temp2 = step[ 7]*C2;

	1240 temp1 -= temp2;

	1241 output[4] = (temp1);

	1242

	1243 temp1 = step[ 4]*C2;

	1244 temp2 = step[ 7]*C14;

	1245 temp1 += temp2;

	1246 output[7] = (temp1);

	1247

	1248 temp1 = step[ 5]*C10;

	1249 temp2 = step[ 6]*C6;

	1250 temp1 -= temp2;

	1251 output[5] = (temp1);

	1252

	1253 temp1 = step[ 5]*C6;

	1254 temp2 = step[ 6]*C10;

	1255 temp1 += temp2;

	1256 output[6] = (temp1);

	1257

	1258 output[8] = step[ 8] + step[11];

	1259 output[9] = step[ 9] + step[10];

	1260 output[10] = step[ 9] - step[10];

	1261 output[11] = step[ 8] - step[11];

	1262 output[12] = step[12] + step[15];

	1263 output[13] = step[13] + step[14];

	1264 output[14] = step[13] - step[14];

	1265 output[15] = step[12] - step[15];

	1266

	1267 // output 4

	1268 step[ 0] = output[0] + output[7];

	1269 step[ 1] = output[1] + output[6];

	1270 step[ 2] = output[2] + output[5];

	1271 step[ 3] = output[3] + output[4];

	1272 step[ 4] = output[3] - output[4];

	1273 step[ 5] = output[2] - output[5];

	1274 step[ 6] = output[1] - output[6];

	1275 step[ 7] = output[0] - output[7];

	1276

	1277 temp1 = output[8]*C7;

	1278 temp2 = output[15]*C9;

	1279 temp1 -= temp2;

	1280 step[ 8] = (temp1);

	1281

	1282 temp1 = output[9]*C11;

	1283 temp2 = output[14]*C5;

	1284 temp1 += temp2;

	1285 step[ 9] = (temp1);

	1286

	1287 temp1 = output[10]*C3;

	1288 temp2 = output[13]*C13;

	1289 temp1 -= temp2;

	1290 step[10] = (temp1);

	1291

	1292 temp1 = output[11]*C15;

	1293 temp2 = output[12]*C1;

	1294 temp1 += temp2;

	1295 step[11] = (temp1);

	1296

	1297 temp1 = output[11]*C1;

	1298 temp2 = output[12]*C15;

	1299 temp2 -= temp1;

	1300 step[12] = (temp2);

	1301

	1302 temp1 = output[10]*C13;

	1303 temp2 = output[13]*C3;

	1304 temp1 += temp2;

	1305 step[13] = (temp1);

	1306

	1307 temp1 = output[9]*C5;

	1308 temp2 = output[14]*C11;

	1309 temp2 -= temp1;

	1310 step[14] = (temp2);

	1311

	1312 temp1 = output[8]*C9;

	1313 temp2 = output[15]*C7;

	1314 temp1 += temp2;

	1315 step[15] = (temp1);

	1316

	1317 // step 5

	1318 output[0] = (step[0] + step[15]);

	1319 output[1] = (step[1] + step[14]);

	1320 output[2] = (step[2] + step[13]);

	1321 output[3] = (step[3] + step[12]);

	1322 output[4] = (step[4] + step[11]);

	1323 output[5] = (step[5] + step[10]);

	1324 output[6] = (step[6] + step[ 9]);

	1325 output[7] = (step[7] + step[ 8]);

	1326

	1327 output[15] = (step[0] - step[15]);

	1328 output[14] = (step[1] - step[14]);

	1329 output[13] = (step[2] - step[13]);

	1330 output[12] = (step[3] - step[12]);

	1331 output[11] = (step[4] - step[11]);

	1332 output[10] = (step[5] - step[10]);

	1333 output[9] = (step[6] - step[ 9]);

	1334 output[8] = (step[7] - step[ 8]);

	1335 }

	1336 vp9_clear_system_state(); // Make it simd safe : __asm emms;

	1337 }

	1338

	// Remove once an int version of iDCT is written
	#if 0
	// Straightforward O(n^2) 16-point inverse DCT kept as a reference; the n == 0
	// term is scaled by 1/sqrt(2). Compiled out.
	void reference_16x16_idct_1d(double input[16], double output[16]) {

	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	  {
	    const double kPi = 3.141592653589793238462643383279502884;
	    const double kSqrt2 = 1.414213562373095048801688724209698;
	    for (int k = 0; k < 16; k++) {
	      output[k] = 0.0;
	      for (int n = 0; n < 16; n++) {
	        output[k] += input[n] * cos(kPi * (2 * k + 1) * n / 32.0);
	        if (n == 0)
	          output[k] = output[k] / kSqrt2;
	      }
	    }
	  }
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	}
	#endif

	1359

	/* 16x16 inverse DCT, double-precision variant (built only when
	 * TEST_INT_16x16_IDCT is 0).
	 *
	 * input:  coefficients, rows read with a stride of pitch / 2 shorts.
	 * output: 256 results written densely (stride 16), each rounded from
	 *         the two-pass result divided by 128.
	 * pitch:  input row stride; halved before being used as a short index.
	 */
	void vp9_short_idct16x16_c(short *input, short *output, int pitch) {

	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	  {
	    double out[16 * 16], out2[16 * 16];
	    const int short_pitch = pitch >> 1;
	    int i, j;
	    // First transform rows
	    for (i = 0; i < 16; ++i) {
	      double temp_in[16], temp_out[16];
	      for (j = 0; j < 16; ++j)
	        temp_in[j] = input[j + i * short_pitch];
	      butterfly_16x16_idct_1d(temp_in, temp_out);
	      for (j = 0; j < 16; ++j)
	        out[j + i * 16] = temp_out[j];
	    }
	    // Then transform columns
	    for (i = 0; i < 16; ++i) {
	      double temp_in[16], temp_out[16];
	      for (j = 0; j < 16; ++j)
	        temp_in[j] = out[j * 16 + i];
	      butterfly_16x16_idct_1d(temp_in, temp_out);
	      for (j = 0; j < 16; ++j)
	        out2[j * 16 + i] = temp_out[j];
	    }
	    for (i = 0; i < 16 * 16; ++i)
	      output[i] = round(out2[i] / 128);
	  }
	  vp9_clear_system_state(); // Make it simd safe : __asm emms;
	}

	1390

	1391 #else

	/* Fixed-point cosine table for the 16-point transform:
	 * Ck = round(cos(k * pi / 32) * 2^14), matching RIGHT_SHIFT below. */
	static const int16_t C1 = 16305;
	static const int16_t C2 = 16069;
	static const int16_t C3 = 15679;
	static const int16_t C4 = 15137;
	static const int16_t C5 = 14449;
	static const int16_t C6 = 13623;
	static const int16_t C7 = 12665;
	static const int16_t C8 = 11585;
	static const int16_t C9 = 10394;
	static const int16_t C10 = 9102;
	static const int16_t C11 = 7723;
	static const int16_t C12 = 6270;
	static const int16_t C13 = 4756;
	static const int16_t C14 = 3196;
	static const int16_t C15 = 1606;

	/* INITIAL_SHIFT: down-scale applied to the first-stage sums.
	 * RIGHT_SHIFT:   removes the 2^14 scale after a multiply by Ck.
	 * The *_ROUNDING values are half of the matching divisor. */
	#define INITIAL_SHIFT 2
	#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
	#define RIGHT_SHIFT 14
	#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))

	1412

	1413 static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],

	1414 int last_shift_bits) {

	1415 int16_t step[16];

	1416 int intermediate[16];

	1417 int temp1, temp2;

	1418

	1419 int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;

	1420 int step1_rounding = 1 << (step1_shift - 1);

	1421 int last_rounding = 0;

	1422

	1423 if (last_shift_bits > 0)

	1424 last_rounding = 1 << (last_shift_bits - 1);

	1425

	1426 // step 1 and 2

	1427 step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1428 step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1429

	1430 temp1 = input[4] * C12;

	1431 temp2 = input[12] * C4;

	1432 temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1433 temp1 *= C8;

	1434 step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;

	1435

	1436 temp1 = input[4] * C4;

	1437 temp2 = input[12] * C12;

	1438 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1439 temp1 *= C8;

	1440 step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;

	1441

	1442 temp1 = input[2] * C8;

	1443 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1444 temp2 = input[6] + input[10];

	1445 step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1446 step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1447

	1448 temp1 = input[14] * C8;

	1449 temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1450 temp2 = input[6] - input[10];

	1451 step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1452 step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1453

	1454 // for odd input

	1455 temp1 = input[3] * C12;

	1456 temp2 = input[13] * C4;

	1457 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1458 temp1 *= C8;

	1459 intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1460

	1461 temp1 = input[3] * C4;

	1462 temp2 = input[13] * C12;

	1463 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1464 temp2 *= C8;

	1465 intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1466

	1467 intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1468 intermediate[11] = input[15] - input[1];

	1469 intermediate[12] = input[15] + input[1];

	1470 intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1471

	1472 temp1 = input[11] * C12;

	1473 temp2 = input[5] * C4;

	1474 temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1475 temp2 *= C8;

	1476 intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1477

	1478 temp1 = input[11] * C4;

	1479 temp2 = input[5] * C12;

	1480 temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1481 temp1 *= C8;

	1482 intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1483

	1484 step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)

	1485 >> INITIAL_SHIFT;

	1486 step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)

	1487 >> INITIAL_SHIFT;

	1488 step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)

	1489 >> INITIAL_SHIFT;

	1490 step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)

	1491 >> INITIAL_SHIFT;

	1492 step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)

	1493 >> INITIAL_SHIFT;

	1494 step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)

	1495 >> INITIAL_SHIFT;

	1496 step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)

	1497 >> INITIAL_SHIFT;

	1498 step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)

	1499 >> INITIAL_SHIFT;

	1500

	1501 // step 3

	1502 output[0] = step[ 0] + step[ 3];

	1503 output[1] = step[ 1] + step[ 2];

	1504 output[2] = step[ 1] - step[ 2];

	1505 output[3] = step[ 0] - step[ 3];

	1506

	1507 temp1 = step[ 4] * C14;

	1508 temp2 = step[ 7] * C2;

	1509 output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1510

	1511 temp1 = step[ 4] * C2;

	1512 temp2 = step[ 7] * C14;

	1513 output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1514

	1515 temp1 = step[ 5] * C10;

	1516 temp2 = step[ 6] * C6;

	1517 output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1518

	1519 temp1 = step[ 5] * C6;

	1520 temp2 = step[ 6] * C10;

	1521 output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1522

	1523 output[8] = step[ 8] + step[11];

	1524 output[9] = step[ 9] + step[10];

	1525 output[10] = step[ 9] - step[10];

	1526 output[11] = step[ 8] - step[11];

	1527 output[12] = step[12] + step[15];

	1528 output[13] = step[13] + step[14];

	1529 output[14] = step[13] - step[14];

	1530 output[15] = step[12] - step[15];

	1531

	1532 // output 4

	1533 step[ 0] = output[0] + output[7];

	1534 step[ 1] = output[1] + output[6];

	1535 step[ 2] = output[2] + output[5];

	1536 step[ 3] = output[3] + output[4];

	1537 step[ 4] = output[3] - output[4];

	1538 step[ 5] = output[2] - output[5];

	1539 step[ 6] = output[1] - output[6];

	1540 step[ 7] = output[0] - output[7];

	1541

	1542 temp1 = output[8] * C7;

	1543 temp2 = output[15] * C9;

	1544 step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1545

	1546 temp1 = output[9] * C11;

	1547 temp2 = output[14] * C5;

	1548 step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1549

	1550 temp1 = output[10] * C3;

	1551 temp2 = output[13] * C13;

	1552 step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1553

	1554 temp1 = output[11] * C15;

	1555 temp2 = output[12] * C1;

	1556 step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1557

	1558 temp1 = output[11] * C1;

	1559 temp2 = output[12] * C15;

	1560 step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1561

	1562 temp1 = output[10] * C13;

	1563 temp2 = output[13] * C3;

	1564 step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1565

	1566 temp1 = output[9] * C5;

	1567 temp2 = output[14] * C11;

	1568 step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1569

	1570 temp1 = output[8] * C9;

	1571 temp2 = output[15] * C7;

	1572 step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1573

	1574 // step 5

	1575 output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;

	1576 output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;

	1577 output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;

	1578 output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;

	1579 output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;

	1580 output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;

	1581 output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;

	1582 output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;

	1583

	1584 output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;

	1585 output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;

	1586 output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;

	1587 output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;

	1588 output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;

	1589 output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;

	1590 output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;

	1591 output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;

	1592 }

	1593

	/* 16x16 inverse DCT in fixed point: a row pass followed by a column pass.
	 *
	 * input:  coefficients, rows read with a stride of pitch / 2 elements.
	 * output: 256 results written densely (stride 16).
	 * pitch:  input row stride; halved before being used as an element index.
	 */
	void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
	  int16_t out[16 * 16];
	  int16_t *outptr = &out[0];
	  const int short_pitch = pitch >> 1;
	  int i, j;
	  int16_t temp_in[16], temp_out[16];

	  // First transform rows (no final shift)
	  for (i = 0; i < 16; ++i) {
	    butterfly_16x16_idct_1d(input, outptr, 0);
	    input += short_pitch;
	    outptr += 16;
	  }

	  // Then transform columns (final rounded shift by 3 bits)
	  for (i = 0; i < 16; ++i) {
	    for (j = 0; j < 16; ++j)
	      temp_in[j] = out[j * 16 + i];
	    butterfly_16x16_idct_1d(temp_in, temp_out, 3);
	    for (j = 0; j < 16; ++j)
	      output[j * 16 + i] = temp_out[j];
	  }
	}

	1617

	1618 /* The following function is called when we know the maximum number of non-zero

	1619 * dct coefficients is less or equal 10.

	1620 */

	1621 static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],

	1622 int last_shift_bits) {

	1623 int16_t step[16] = {0};

	1624 int intermediate[16] = {0};

	1625 int temp1, temp2;

	1626 int last_rounding = 0;

	1627

	1628 if (last_shift_bits > 0)

	1629 last_rounding = 1 << (last_shift_bits - 1);

	1630

	1631 // step 1 and 2

	1632 step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1633 step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1634

	1635 temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1636 step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1637 step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1638

	1639 // for odd input

	1640 temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1641 temp1 *= C8;

	1642 intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1643

	1644 temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1645 temp1 *= C8;

	1646 intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1647

	1648 step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1649 step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1650 step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1651 step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1652 step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1653 step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1654 step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1655 step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

	1656

	1657 // step 3

	1658 output[0] = step[ 0];

	1659 output[1] = step[ 1];

	1660 output[2] = step[ 1];

	1661 output[3] = step[ 0];

	1662

	1663 temp1 = step[ 4] * C14;

	1664 output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1665

	1666 temp1 = step[ 4] * C2;

	1667 output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1668

	1669 temp1 = step[ 5] * C10;

	1670 output[5] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1671

	1672 temp1 = step[ 5] * C6;

	1673 output[6] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1674

	1675 output[8] = step[ 8] + step[11];

	1676 output[9] = step[ 9] + step[10];

	1677 output[10] = step[ 9] - step[10];

	1678 output[11] = step[ 8] - step[11];

	1679 output[12] = step[12] + step[15];

	1680 output[13] = step[13] + step[14];

	1681 output[14] = step[13] - step[14];

	1682 output[15] = step[12] - step[15];

	1683

	1684 // output 4

	1685 step[ 0] = output[0] + output[7];

	1686 step[ 1] = output[1] + output[6];

	1687 step[ 2] = output[2] + output[5];

	1688 step[ 3] = output[3] + output[4];

	1689 step[ 4] = output[3] - output[4];

	1690 step[ 5] = output[2] - output[5];

	1691 step[ 6] = output[1] - output[6];

	1692 step[ 7] = output[0] - output[7];

	1693

	1694 temp1 = output[8] * C7;

	1695 temp2 = output[15] * C9;

	1696 step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1697

	1698 temp1 = output[9] * C11;

	1699 temp2 = output[14] * C5;

	1700 step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1701

	1702 temp1 = output[10] * C3;

	1703 temp2 = output[13] * C13;

	1704 step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1705

	1706 temp1 = output[11] * C15;

	1707 temp2 = output[12] * C1;

	1708 step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1709

	1710 temp1 = output[11] * C1;

	1711 temp2 = output[12] * C15;

	1712 step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1713

	1714 temp1 = output[10] * C13;

	1715 temp2 = output[13] * C3;

	1716 step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1717

	1718 temp1 = output[9] * C5;

	1719 temp2 = output[14] * C11;

	1720 step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1721

	1722 temp1 = output[8] * C9;

	1723 temp2 = output[15] * C7;

	1724 step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

	1725

	1726 // step 5

	1727 output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;

	1728 output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;

	1729 output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;

	1730 output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;

	1731 output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;

	1732 output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;

	1733 output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;

	1734 output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;

	1735

	1736 output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;

	1737 output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;

	1738 output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;

	1739 output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;

	1740 output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;

	1741 output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;

	1742 output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;

	1743 output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;

	1744 }

	1745

	/* 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the
	 * upper-left 4x4 area.
	 *
	 * input:  coefficients, rows read with a stride of pitch / 2 elements.
	 * output: 256 results written densely (stride 16).
	 * pitch:  input row stride; halved before being used as an element index.
	 */
	void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
	  int16_t out[16 * 16];
	  int16_t *outptr = &out[0];
	  const int short_pitch = pitch >> 1;
	  int i, j;
	  int16_t temp_in[16], temp_out[16];

	  /* First transform rows. Since all non-zero dct coefficients are in
	   * upper-left 4x4 area, we only need to calculate first 4 rows here.
	   * The remaining 12 rows of the intermediate buffer stay zero.
	   */
	  vpx_memset(out, 0, sizeof(out));
	  for (i = 0; i < 4; ++i) {
	    butterfly_16x16_idct10_1d(input, outptr, 0);
	    input += short_pitch;
	    outptr += 16;
	  }

	  // Then transform columns (final rounded shift by 3 bits)
	  for (i = 0; i < 16; ++i) {
	    for (j = 0; j < 16; ++j)
	      temp_in[j] = out[j * 16 + i];
	    butterfly_16x16_idct10_1d(temp_in, temp_out, 3);
	    for (j = 0; j < 16; ++j)
	      output[j * 16 + i] = temp_out[j];
	  }
	}

	1772 #undef INITIAL_SHIFT

	1773 #undef INITIAL_ROUNDING

	1774 #undef RIGHT_SHIFT

	1775 #undef RIGHT_ROUNDING

	1776 #endif

OLD	NEW

« libvpx.gyp ('K') | « source/libvpx/vp9/common/vp9_header.h ('k') | source/libvpx/vp9/common/vp9_implicit_segmentation.c » ('j') | no next file with comments »