| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 3187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3198 | 3198 |
| 3199 // Only upper-left 8x8 has non-zero coeff | 3199 // Only upper-left 8x8 has non-zero coeff |
| 3200 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, | 3200 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
| 3201 int stride) { | 3201 int stride) { |
| 3202 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); | 3202 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
| 3203 const __m128i final_rounding = _mm_set1_epi16(1<<5); | 3203 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
| 3204 | 3204 |
| 3205 // idct constants for each stage | 3205 // idct constants for each stage |
| 3206 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); | 3206 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
| 3207 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); | 3207 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
| 3208 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); | |
| 3209 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); | |
| 3210 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); | |
| 3211 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); | |
| 3212 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); | 3208 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
| 3213 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); | 3209 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
| 3214 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); | 3210 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
| 3215 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); | 3211 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); |
| 3216 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); | |
| 3217 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); | |
| 3218 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); | |
| 3219 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); | |
| 3220 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); | 3212 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
| 3221 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); | 3213 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); |
| 3222 | 3214 |
| 3223 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); | 3215 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
| 3224 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); | 3216 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
| 3225 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); | |
| 3226 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); | |
| 3227 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); | |
| 3228 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); | |
| 3229 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); | 3217 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
| 3230 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); | 3218 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
| 3231 | 3219 |
| 3232 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); | 3220 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
| 3233 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); | 3221 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
| 3234 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); | |
| 3235 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); | |
| 3236 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); | 3222 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
| 3237 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); | 3223 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); |
| 3238 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); | 3224 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
| 3239 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); | 3225 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
| 3240 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); | 3226 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); |
| 3241 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); | 3227 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
| 3242 | 3228 |
| 3243 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); | 3229 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
| 3244 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); | 3230 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
| 3245 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); | |
| 3246 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); | |
| 3247 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); | 3231 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
| 3248 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); | 3232 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
| 3249 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); | 3233 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
| 3250 | 3234 |
| 3251 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); | 3235 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
| 3252 | 3236 |
| 3253 __m128i in[32], col[32]; | 3237 __m128i in[32], col[32]; |
| 3254 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, | 3238 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
| 3255 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, | 3239 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
| 3256 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, | 3240 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
| 3257 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, | 3241 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
| 3258 stp1_30, stp1_31; | 3242 stp1_30, stp1_31; |
| 3259 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, | 3243 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
| 3260 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, | 3244 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
| 3261 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, | 3245 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
| 3262 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, | 3246 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
| 3263 stp2_30, stp2_31; | 3247 stp2_30, stp2_31; |
| 3264 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | 3248 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
| 3265 int i; | 3249 int i; |
| 3266 // Load input data. | |
| 3267 LOAD_DQCOEFF(in[0], input); | |
| 3268 LOAD_DQCOEFF(in[8], input); | |
| 3269 LOAD_DQCOEFF(in[16], input); | |
| 3270 LOAD_DQCOEFF(in[24], input); | |
| 3271 LOAD_DQCOEFF(in[1], input); | |
| 3272 LOAD_DQCOEFF(in[9], input); | |
| 3273 LOAD_DQCOEFF(in[17], input); | |
| 3274 LOAD_DQCOEFF(in[25], input); | |
| 3275 LOAD_DQCOEFF(in[2], input); | |
| 3276 LOAD_DQCOEFF(in[10], input); | |
| 3277 LOAD_DQCOEFF(in[18], input); | |
| 3278 LOAD_DQCOEFF(in[26], input); | |
| 3279 LOAD_DQCOEFF(in[3], input); | |
| 3280 LOAD_DQCOEFF(in[11], input); | |
| 3281 LOAD_DQCOEFF(in[19], input); | |
| 3282 LOAD_DQCOEFF(in[27], input); | |
| 3283 | 3250 |
| 3284 LOAD_DQCOEFF(in[4], input); | 3251 // Load input data. Only need to load the top left 8x8 block. |
| 3285 LOAD_DQCOEFF(in[12], input); | 3252 in[0] = _mm_load_si128((const __m128i *)input); |
| 3286 LOAD_DQCOEFF(in[20], input); | 3253 in[1] = _mm_load_si128((const __m128i *)(input + 32)); |
| 3287 LOAD_DQCOEFF(in[28], input); | 3254 in[2] = _mm_load_si128((const __m128i *)(input + 64)); |
| 3288 LOAD_DQCOEFF(in[5], input); | 3255 in[3] = _mm_load_si128((const __m128i *)(input + 96)); |
| 3289 LOAD_DQCOEFF(in[13], input); | 3256 in[4] = _mm_load_si128((const __m128i *)(input + 128)); |
| 3290 LOAD_DQCOEFF(in[21], input); | 3257 in[5] = _mm_load_si128((const __m128i *)(input + 160)); |
| 3291 LOAD_DQCOEFF(in[29], input); | 3258 in[6] = _mm_load_si128((const __m128i *)(input + 192)); |
| 3292 LOAD_DQCOEFF(in[6], input); | 3259 in[7] = _mm_load_si128((const __m128i *)(input + 224)); |
| 3293 LOAD_DQCOEFF(in[14], input); | 3260 |
| 3294 LOAD_DQCOEFF(in[22], input); | 3261 for (i = 8; i < 32; ++i) { |
| 3295 LOAD_DQCOEFF(in[30], input); | 3262 in[i] = _mm_setzero_si128(); |
| 3296 LOAD_DQCOEFF(in[7], input); | 3263 } |
| 3297 LOAD_DQCOEFF(in[15], input); | |
| 3298 LOAD_DQCOEFF(in[23], input); | |
| 3299 LOAD_DQCOEFF(in[31], input); | |
| 3300 | 3264 |
| 3301 array_transpose_8x8(in, in); | 3265 array_transpose_8x8(in, in); |
| 3266 // TODO(hkuang): The following transposes are unnecessary, but removing them |
| 3267 // leads to a performance drop on some devices. |
| 3302 array_transpose_8x8(in + 8, in + 8); | 3268 array_transpose_8x8(in + 8, in + 8); |
| 3303 array_transpose_8x8(in + 16, in + 16); | 3269 array_transpose_8x8(in + 16, in + 16); |
| 3304 array_transpose_8x8(in + 24, in + 24); | 3270 array_transpose_8x8(in + 24, in + 24); |
| 3305 | 3271 |
| 3306 IDCT32 | 3272 IDCT32_34 |
| 3307 | 3273 |
| 3308 // 1_D: Store 32 intermediate results for each 8x32 block. | 3274 // 1_D: Store 32 intermediate results for each 8x32 block. |
| 3309 col[0] = _mm_add_epi16(stp1_0, stp1_31); | 3275 col[0] = _mm_add_epi16(stp1_0, stp1_31); |
| 3310 col[1] = _mm_add_epi16(stp1_1, stp1_30); | 3276 col[1] = _mm_add_epi16(stp1_1, stp1_30); |
| 3311 col[2] = _mm_add_epi16(stp1_2, stp1_29); | 3277 col[2] = _mm_add_epi16(stp1_2, stp1_29); |
| 3312 col[3] = _mm_add_epi16(stp1_3, stp1_28); | 3278 col[3] = _mm_add_epi16(stp1_3, stp1_28); |
| 3313 col[4] = _mm_add_epi16(stp1_4, stp1_27); | 3279 col[4] = _mm_add_epi16(stp1_4, stp1_27); |
| 3314 col[5] = _mm_add_epi16(stp1_5, stp1_26); | 3280 col[5] = _mm_add_epi16(stp1_5, stp1_26); |
| 3315 col[6] = _mm_add_epi16(stp1_6, stp1_25); | 3281 col[6] = _mm_add_epi16(stp1_6, stp1_25); |
| 3316 col[7] = _mm_add_epi16(stp1_7, stp1_24); | 3282 col[7] = _mm_add_epi16(stp1_7, stp1_24); |
| (...skipping 931 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4248 vp9_highbd_idct16(temp_in, temp_out, bd); | 4214 vp9_highbd_idct16(temp_in, temp_out, bd); |
| 4249 for (j = 0; j < 16; ++j) { | 4215 for (j = 0; j < 16; ++j) { |
| 4250 dest[j * stride + i] = highbd_clip_pixel_add( | 4216 dest[j * stride + i] = highbd_clip_pixel_add( |
| 4251 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); | 4217 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); |
| 4252 } | 4218 } |
| 4253 } | 4219 } |
| 4254 } | 4220 } |
| 4255 } | 4221 } |
| 4256 | 4222 |
| 4257 #endif // CONFIG_VP9_HIGHBITDEPTH | 4223 #endif // CONFIG_VP9_HIGHBITDEPTH |
| OLD | NEW |