Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(559)

Side by Side Diff: source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c

Issue 1169543007: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 3187 matching lines...) Expand 10 before | Expand all | Expand 10 after
3198 3198
3199 // Only upper-left 8x8 has non-zero coeff 3199 // Only upper-left 8x8 has non-zero coeff
3200 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 3200 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
3201 int stride) { 3201 int stride) {
3202 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 3202 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3203 const __m128i final_rounding = _mm_set1_epi16(1<<5); 3203 const __m128i final_rounding = _mm_set1_epi16(1<<5);
3204 3204
3205 // idct constants for each stage 3205 // idct constants for each stage
3206 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 3206 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3207 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 3207 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3208 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3209 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3210 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3211 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3212 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 3208 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3213 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 3209 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3214 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 3210 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3215 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 3211 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3216 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3217 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3218 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3219 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3220 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 3212 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3221 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 3213 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3222 3214
3223 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 3215 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3224 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 3216 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3225 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3226 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3227 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3228 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3229 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 3217 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3230 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 3218 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3231 3219
3232 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 3220 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3233 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 3221 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3234 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3235 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3236 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 3222 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3237 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 3223 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3238 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 3224 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3239 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 3225 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3240 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 3226 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3241 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 3227 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3242 3228
3243 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 3229 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3244 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 3230 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3245 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3246 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3247 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 3231 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3248 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 3232 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3249 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 3233 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3250 3234
3251 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 3235 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3252 3236
3253 __m128i in[32], col[32]; 3237 __m128i in[32], col[32];
3254 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 3238 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3255 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 3239 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3256 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 3240 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
3257 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 3241 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
3258 stp1_30, stp1_31; 3242 stp1_30, stp1_31;
3259 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 3243 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3260 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 3244 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3261 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 3245 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
3262 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 3246 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
3263 stp2_30, stp2_31; 3247 stp2_30, stp2_31;
3264 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 3248 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3265 int i; 3249 int i;
3266 // Load input data.
3267 LOAD_DQCOEFF(in[0], input);
3268 LOAD_DQCOEFF(in[8], input);
3269 LOAD_DQCOEFF(in[16], input);
3270 LOAD_DQCOEFF(in[24], input);
3271 LOAD_DQCOEFF(in[1], input);
3272 LOAD_DQCOEFF(in[9], input);
3273 LOAD_DQCOEFF(in[17], input);
3274 LOAD_DQCOEFF(in[25], input);
3275 LOAD_DQCOEFF(in[2], input);
3276 LOAD_DQCOEFF(in[10], input);
3277 LOAD_DQCOEFF(in[18], input);
3278 LOAD_DQCOEFF(in[26], input);
3279 LOAD_DQCOEFF(in[3], input);
3280 LOAD_DQCOEFF(in[11], input);
3281 LOAD_DQCOEFF(in[19], input);
3282 LOAD_DQCOEFF(in[27], input);
3283 3250
3284 LOAD_DQCOEFF(in[4], input); 3251 // Load input data. Only need to load the top left 8x8 block.
3285 LOAD_DQCOEFF(in[12], input); 3252 in[0] = _mm_load_si128((const __m128i *)input);
3286 LOAD_DQCOEFF(in[20], input); 3253 in[1] = _mm_load_si128((const __m128i *)(input + 32));
3287 LOAD_DQCOEFF(in[28], input); 3254 in[2] = _mm_load_si128((const __m128i *)(input + 64));
3288 LOAD_DQCOEFF(in[5], input); 3255 in[3] = _mm_load_si128((const __m128i *)(input + 96));
3289 LOAD_DQCOEFF(in[13], input); 3256 in[4] = _mm_load_si128((const __m128i *)(input + 128));
3290 LOAD_DQCOEFF(in[21], input); 3257 in[5] = _mm_load_si128((const __m128i *)(input + 160));
3291 LOAD_DQCOEFF(in[29], input); 3258 in[6] = _mm_load_si128((const __m128i *)(input + 192));
3292 LOAD_DQCOEFF(in[6], input); 3259 in[7] = _mm_load_si128((const __m128i *)(input + 224));
3293 LOAD_DQCOEFF(in[14], input); 3260
3294 LOAD_DQCOEFF(in[22], input); 3261 for (i = 8; i < 32; ++i) {
3295 LOAD_DQCOEFF(in[30], input); 3262 in[i] = _mm_setzero_si128();
3296 LOAD_DQCOEFF(in[7], input); 3263 }
3297 LOAD_DQCOEFF(in[15], input);
3298 LOAD_DQCOEFF(in[23], input);
3299 LOAD_DQCOEFF(in[31], input);
3300 3264
3301 array_transpose_8x8(in, in); 3265 array_transpose_8x8(in, in);
3266 // TODO(hkuang): The following transposes are unnecessary, but removing them
3267 // leads to a performance drop on some devices.
3302 array_transpose_8x8(in + 8, in + 8); 3268 array_transpose_8x8(in + 8, in + 8);
3303 array_transpose_8x8(in + 16, in + 16); 3269 array_transpose_8x8(in + 16, in + 16);
3304 array_transpose_8x8(in + 24, in + 24); 3270 array_transpose_8x8(in + 24, in + 24);
3305 3271
3306 IDCT32 3272 IDCT32_34
3307 3273
3308 // 1_D: Store 32 intermediate results for each 8x32 block. 3274 // 1_D: Store 32 intermediate results for each 8x32 block.
3309 col[0] = _mm_add_epi16(stp1_0, stp1_31); 3275 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3310 col[1] = _mm_add_epi16(stp1_1, stp1_30); 3276 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3311 col[2] = _mm_add_epi16(stp1_2, stp1_29); 3277 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3312 col[3] = _mm_add_epi16(stp1_3, stp1_28); 3278 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3313 col[4] = _mm_add_epi16(stp1_4, stp1_27); 3279 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3314 col[5] = _mm_add_epi16(stp1_5, stp1_26); 3280 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3315 col[6] = _mm_add_epi16(stp1_6, stp1_25); 3281 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3316 col[7] = _mm_add_epi16(stp1_7, stp1_24); 3282 col[7] = _mm_add_epi16(stp1_7, stp1_24);
(...skipping 931 matching lines...) Expand 10 before | Expand all | Expand 10 after
4248 vp9_highbd_idct16(temp_in, temp_out, bd); 4214 vp9_highbd_idct16(temp_in, temp_out, bd);
4249 for (j = 0; j < 16; ++j) { 4215 for (j = 0; j < 16; ++j) {
4250 dest[j * stride + i] = highbd_clip_pixel_add( 4216 dest[j * stride + i] = highbd_clip_pixel_add(
4251 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 4217 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4252 } 4218 }
4253 } 4219 }
4254 } 4220 }
4255 } 4221 }
4256 4222
4257 #endif // CONFIG_VP9_HIGHBITDEPTH 4223 #endif // CONFIG_VP9_HIGHBITDEPTH
OLDNEW
« no previous file with comments | « source/libvpx/vp9/common/vp9_rtcd_defs.pl ('k') | source/libvpx/vp9/decoder/vp9_decodeframe.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698