OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
54 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); | 54 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); |
55 s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); | 55 s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); |
56 avg = _mm_extract_epi16(s0, 0); | 56 avg = _mm_extract_epi16(s0, 0); |
57 return (avg + 8) >> 4; | 57 return (avg + 8) >> 4; |
58 } | 58 } |
59 | 59 |
// Vertically project a 16-pixel-wide block: for each of the 16 columns,
// sum `height` rows of 8-bit pixels (with unsigned saturation) and write the
// normalized per-column sum into hbuf[0..15].
//
// hbuf       - output, 16 int16_t column sums (no alignment required).
// ref        - input pixels, 16 bytes per row (no alignment required).
// ref_stride - byte stride between consecutive rows of ref.
// height     - number of rows to accumulate; the normalization shift assumes
//              height is 16, 32 or 64 (>>3, >>4, >>5 respectively).
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  const __m128i zero = _mm_setzero_si128();
  int i;
  // Seed the two 8-lane u16 accumulators with row 0 (low/high byte halves).
  __m128i row = _mm_loadu_si128((const __m128i *)ref);
  __m128i sum_lo = _mm_unpacklo_epi8(row, zero);
  __m128i sum_hi = _mm_unpackhi_epi8(row, zero);

  ref += ref_stride;

  // Fold two rows per iteration; saturating adds keep lanes from wrapping.
  for (i = 1; i < height - 1; i += 2) {
    row = _mm_loadu_si128((const __m128i *)ref);
    sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(row, zero));
    sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(row, zero));
    ref += ref_stride;

    row = _mm_loadu_si128((const __m128i *)ref);
    sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(row, zero));
    sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(row, zero));
    ref += ref_stride;
  }

  // Fold the final row (the loop stops one row short).
  row = _mm_loadu_si128((const __m128i *)ref);
  sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(row, zero));
  sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(row, zero));

  // Normalize: divide the column sums by height/8 (block-size dependent).
  if (height == 64) {
    sum_lo = _mm_srai_epi16(sum_lo, 5);
    sum_hi = _mm_srai_epi16(sum_hi, 5);
  } else if (height == 32) {
    sum_lo = _mm_srai_epi16(sum_lo, 4);
    sum_hi = _mm_srai_epi16(sum_hi, 4);
  } else {
    sum_lo = _mm_srai_epi16(sum_lo, 3);
    sum_hi = _mm_srai_epi16(sum_hi, 3);
  }

  // Unaligned stores: callers do not guarantee 16-byte alignment of hbuf.
  _mm_storeu_si128((__m128i *)hbuf, sum_lo);
  _mm_storeu_si128((__m128i *)(hbuf + 8), sum_hi);
}
108 | 108 |
109 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { | 109 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { |
110 __m128i zero = _mm_setzero_si128(); | 110 __m128i zero = _mm_setzero_si128(); |
111 __m128i src_line = _mm_load_si128((const __m128i *)ref); | 111 __m128i src_line = _mm_load_si128((const __m128i *)ref); |
112 __m128i s0 = _mm_sad_epu8(src_line, zero); | 112 __m128i s0 = _mm_sad_epu8(src_line, zero); |
113 __m128i s1; | 113 __m128i s1; |
114 int i; | 114 int i; |
115 const int norm_factor = 3 + (width >> 5); | 115 const int norm_factor = 3 + (width >> 5); |
116 | 116 |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
163 | 163 |
164 v1 = _mm_srli_si128(sse, 8); | 164 v1 = _mm_srli_si128(sse, 8); |
165 sse = _mm_add_epi32(sse, v1); | 165 sse = _mm_add_epi32(sse, v1); |
166 v1 = _mm_srli_epi64(sse, 32); | 166 v1 = _mm_srli_epi64(sse, 32); |
167 sse = _mm_add_epi32(sse, v1); | 167 sse = _mm_add_epi32(sse, v1); |
168 | 168 |
169 mean = _mm_extract_epi16(sum, 0); | 169 mean = _mm_extract_epi16(sum, 0); |
170 | 170 |
171 return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); | 171 return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); |
172 } | 172 } |
OLD | NEW |