| OLD | NEW | 
|    1 /* |    1 /* | 
|    2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. |    2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 
|    3  * |    3  * | 
|    4  *  Use of this source code is governed by a BSD-style license |    4  *  Use of this source code is governed by a BSD-style license | 
|    5  *  that can be found in the LICENSE file in the root of the source |    5  *  that can be found in the LICENSE file in the root of the source | 
|    6  *  tree. An additional intellectual property rights grant can be found |    6  *  tree. An additional intellectual property rights grant can be found | 
|    7  *  in the file PATENTS.  All contributing project authors may |    7  *  in the file PATENTS.  All contributing project authors may | 
|    8  *  be found in the AUTHORS file in the root of the source tree. |    8  *  be found in the AUTHORS file in the root of the source tree. | 
|    9  */ |    9  */ | 
|   10  |   10  | 
| (...skipping 72 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
|   83     s1 = _mm_adds_epu16(s1, t1); |   83     s1 = _mm_adds_epu16(s1, t1); | 
|   84     ref += ref_stride; |   84     ref += ref_stride; | 
|   85   } |   85   } | 
|   86  |   86  | 
|   87   src_line = _mm_load_si128((const __m128i *)ref); |   87   src_line = _mm_load_si128((const __m128i *)ref); | 
|   88   t0 = _mm_unpacklo_epi8(src_line, zero); |   88   t0 = _mm_unpacklo_epi8(src_line, zero); | 
|   89   t1 = _mm_unpackhi_epi8(src_line, zero); |   89   t1 = _mm_unpackhi_epi8(src_line, zero); | 
|   90   s0 = _mm_adds_epu16(s0, t0); |   90   s0 = _mm_adds_epu16(s0, t0); | 
|   91   s1 = _mm_adds_epu16(s1, t1); |   91   s1 = _mm_adds_epu16(s1, t1); | 
|   92  |   92  | 
 |   93   if (height == 64) { | 
 |   94     s0 = _mm_srai_epi16(s0, 5); | 
 |   95     s1 = _mm_srai_epi16(s1, 5); | 
 |   96   } else if (height == 32) { | 
 |   97     s0 = _mm_srai_epi16(s0, 4); | 
 |   98     s1 = _mm_srai_epi16(s1, 4); | 
 |   99   } else { | 
 |  100     s0 = _mm_srai_epi16(s0, 3); | 
 |  101     s1 = _mm_srai_epi16(s1, 3); | 
 |  102   } | 
 |  103  | 
|   93   _mm_store_si128((__m128i *)hbuf, s0); |  104   _mm_store_si128((__m128i *)hbuf, s0); | 
|   94   hbuf += 8; |  105   hbuf += 8; | 
|   95   _mm_store_si128((__m128i *)hbuf, s1); |  106   _mm_store_si128((__m128i *)hbuf, s1); | 
|   96 } |  107 } | 
|   97  |  108  | 
|   98 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { |  109 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { | 
|   99   __m128i zero = _mm_setzero_si128(); |  110   __m128i zero = _mm_setzero_si128(); | 
|  100   __m128i src_line = _mm_load_si128((const __m128i *)ref); |  111   __m128i src_line = _mm_load_si128((const __m128i *)ref); | 
|  101   __m128i s0 = _mm_sad_epu8(src_line, zero); |  112   __m128i s0 = _mm_sad_epu8(src_line, zero); | 
|  102   __m128i s1; |  113   __m128i s1; | 
|  103   (void) width;  // width = 64 |  114   int i; | 
 |  115   const int norm_factor = 3 + (width >> 5); | 
|  104  |  116  | 
|  105   ref += 16; |  117   for (i = 16; i < width; i += 16) { | 
|  106   src_line = _mm_load_si128((const __m128i *)ref); |  118     ref += 16; | 
|  107   s1 = _mm_sad_epu8(src_line, zero); |  119     src_line = _mm_load_si128((const __m128i *)ref); | 
|  108   s0 = _mm_adds_epu16(s0, s1); |  120     s1 = _mm_sad_epu8(src_line, zero); | 
|  109  |  121     s0 = _mm_adds_epu16(s0, s1); | 
|  110   ref += 16; |  122   } | 
|  111   src_line = _mm_load_si128((const __m128i *)ref); |  | 
|  112   s1 = _mm_sad_epu8(src_line, zero); |  | 
|  113   s0 = _mm_adds_epu16(s0, s1); |  | 
|  114  |  | 
|  115   ref += 16; |  | 
|  116   src_line = _mm_load_si128((const __m128i *)ref); |  | 
|  117   s1 = _mm_sad_epu8(src_line, zero); |  | 
|  118   s0 = _mm_adds_epu16(s0, s1); |  | 
|  119  |  123  | 
|  120   s1 = _mm_srli_si128(s0, 8); |  124   s1 = _mm_srli_si128(s0, 8); | 
|  121   s0 = _mm_adds_epu16(s0, s1); |  125   s0 = _mm_adds_epu16(s0, s1); | 
|  122  |  126  | 
|  123   return _mm_extract_epi16(s0, 0); |  127   return _mm_extract_epi16(s0, 0) >> norm_factor; | 
|  124 } |  128 } | 
|  125  |  129  | 
|  126 int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, |  130 int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, | 
|  127                         const int width) { |  131                         const int bwl) { | 
|  128   int idx; |  132   int idx; | 
|  129   __m128i zero = _mm_setzero_si128(); |  133   int width = 4 << bwl; | 
|  130   __m128i sum; |  134   int16_t mean; | 
|  131   __m128i v0 = _mm_loadu_si128((const __m128i *)ref); |  135   __m128i v0 = _mm_loadu_si128((const __m128i *)ref); | 
|  132   __m128i v1 = _mm_load_si128((const __m128i *)src); |  136   __m128i v1 = _mm_load_si128((const __m128i *)src); | 
|  133   __m128i diff = _mm_subs_epi16(v0, v1); |  137   __m128i diff = _mm_subs_epi16(v0, v1); | 
|  134   __m128i sign = _mm_srai_epi16(diff, 15); |  138   __m128i sum = diff; | 
|  135  |  139   __m128i sse = _mm_madd_epi16(diff, diff); | 
|  136   diff = _mm_xor_si128(diff, sign); |  | 
|  137   sum = _mm_sub_epi16(diff, sign); |  | 
|  138  |  | 
|  139   (void) width;  // width = 64; |  | 
|  140  |  140  | 
|  141   ref += 8; |  141   ref += 8; | 
|  142   src += 8; |  142   src += 8; | 
|  143  |  143  | 
|  144   v0 = _mm_unpacklo_epi16(sum, zero); |  144   for (idx = 8; idx < width; idx += 8) { | 
|  145   v1 = _mm_unpackhi_epi16(sum, zero); |  | 
|  146   sum = _mm_add_epi32(v0, v1); |  | 
|  147  |  | 
|  148   for (idx = 1; idx < 8; ++idx) { |  | 
|  149     v0 = _mm_loadu_si128((const __m128i *)ref); |  145     v0 = _mm_loadu_si128((const __m128i *)ref); | 
|  150     v1 = _mm_load_si128((const __m128i *)src); |  146     v1 = _mm_load_si128((const __m128i *)src); | 
|  151     diff = _mm_subs_epi16(v0, v1); |  147     diff = _mm_subs_epi16(v0, v1); | 
|  152     sign = _mm_srai_epi16(diff, 15); |  | 
|  153     diff = _mm_xor_si128(diff, sign); |  | 
|  154     diff = _mm_sub_epi16(diff, sign); |  | 
|  155  |  148  | 
|  156     v0 = _mm_unpacklo_epi16(diff, zero); |  149     sum = _mm_add_epi16(sum, diff); | 
|  157     v1 = _mm_unpackhi_epi16(diff, zero); |  150     v0  = _mm_madd_epi16(diff, diff); | 
|  158  |  151     sse = _mm_add_epi32(sse, v0); | 
|  159     sum = _mm_add_epi32(sum, v0); |  | 
|  160     sum = _mm_add_epi32(sum, v1); |  | 
|  161  |  152  | 
|  162     ref += 8; |  153     ref += 8; | 
|  163     src += 8; |  154     src += 8; | 
|  164   } |  155   } | 
|  165  |  156  | 
|  166   v0 = _mm_srli_si128(sum, 8); |  157   v0  = _mm_srli_si128(sum, 8); | 
|  167   sum = _mm_add_epi32(sum, v0); |  158   sum = _mm_add_epi16(sum, v0); | 
|  168   v0 = _mm_srli_epi64(sum, 32); |  159   v0  = _mm_srli_epi64(sum, 32); | 
|  169   sum = _mm_add_epi32(sum, v0); |  160   sum = _mm_add_epi16(sum, v0); | 
 |  161   v0  = _mm_srli_epi32(sum, 16); | 
 |  162   sum = _mm_add_epi16(sum, v0); | 
|  170  |  163  | 
|  171   return _mm_cvtsi128_si32(sum); |  164   v1  = _mm_srli_si128(sse, 8); | 
 |  165   sse = _mm_add_epi32(sse, v1); | 
 |  166   v1  = _mm_srli_epi64(sse, 32); | 
 |  167   sse = _mm_add_epi32(sse, v1); | 
 |  168  | 
 |  169   mean = _mm_extract_epi16(sum, 0); | 
 |  170  | 
 |  171   return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); | 
|  172 } |  172 } | 
| OLD | NEW |