OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 38 matching lines...)
49 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); | 49 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); |
50 s0 = _mm_adds_epu16(s0, s1); | 50 s0 = _mm_adds_epu16(s0, s1); |
51 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); | 51 s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); |
52 s0 = _mm_adds_epu16(s0, s1); | 52 s0 = _mm_adds_epu16(s0, s1); |
53 | 53 |
54 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); | 54 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); |
55 s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); | 55 s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); |
56 avg = _mm_extract_epi16(s0, 0); | 56 avg = _mm_extract_epi16(s0, 0); |
57 return (avg + 8) >> 4; | 57 return (avg + 8) >> 4; |
58 } | 58 } |
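
Note: only the reduction tail of this function is visible above the fold. The two shift-and-add steps collapse the four 16-bit column sums into lane 0, and (avg + 8) >> 4 is a round-to-nearest divide by 16. A minimal scalar cross-check, assuming this is the 4x4 block average with a (const uint8_t *s, int p) signature implied by the visible strided loads:

#include <stdint.h>

static unsigned int avg_4x4_ref(const uint8_t *s, int p) {
  int r, c, sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) sum += s[r * p + c];  /* 16 pixels total */
  return (sum + 8) >> 4;  /* round to nearest */
}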
| 59 |
 | 60 void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, |
| 61 const int ref_stride, const int height) { |
| 62 int idx; |
| 63 __m128i zero = _mm_setzero_si128(); |
| 64 __m128i src_line = _mm_load_si128((const __m128i *)ref); |
 | 65 __m128i s0 = _mm_unpacklo_epi8(src_line, zero);  // zero-extend low 8 pixels |
 | 66 __m128i s1 = _mm_unpackhi_epi8(src_line, zero);  // zero-extend high 8 pixels |
| 67 __m128i t0, t1; |
| 68 int height_1 = height - 1; |
| 69 ref += ref_stride; |
| 70 |
| 71 for (idx = 1; idx < height_1; idx += 2) { |
| 72 src_line = _mm_load_si128((const __m128i *)ref); |
| 73 t0 = _mm_unpacklo_epi8(src_line, zero); |
| 74 t1 = _mm_unpackhi_epi8(src_line, zero); |
| 75 s0 = _mm_adds_epu16(s0, t0); |
| 76 s1 = _mm_adds_epu16(s1, t1); |
| 77 ref += ref_stride; |
| 78 |
| 79 src_line = _mm_load_si128((const __m128i *)ref); |
| 80 t0 = _mm_unpacklo_epi8(src_line, zero); |
| 81 t1 = _mm_unpackhi_epi8(src_line, zero); |
| 82 s0 = _mm_adds_epu16(s0, t0); |
| 83 s1 = _mm_adds_epu16(s1, t1); |
| 84 ref += ref_stride; |
| 85 } |
| 86 |
| 87 src_line = _mm_load_si128((const __m128i *)ref); |
| 88 t0 = _mm_unpacklo_epi8(src_line, zero); |
| 89 t1 = _mm_unpackhi_epi8(src_line, zero); |
| 90 s0 = _mm_adds_epu16(s0, t0); |
| 91 s1 = _mm_adds_epu16(s1, t1); |
| 92 |
| 93 _mm_store_si128((__m128i *)hbuf, s0); |
| 94 hbuf += 8; |
| 95 _mm_store_si128((__m128i *)hbuf, s1); |
| 96 } |
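
Note: vp9_int_pro_row_sse2 builds the vertical projection of a 16-pixel-wide block: each column is summed over `height` rows into hbuf[0..15] with saturating 16-bit adds. The first row is consumed before the loop and the last one after it, so the two-rows-per-iteration loop assumes an even height. A scalar sketch of the same projection (ignoring the saturation the SSE2 adds provide):

#include <stdint.h>

static void int_pro_row_ref(int16_t *hbuf, const uint8_t *ref,
                            int ref_stride, int height) {
  int r, c;
  for (c = 0; c < 16; ++c) hbuf[c] = 0;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < 16; ++c) hbuf[c] += ref[c];  /* per-column running sum */
    ref += ref_stride;
  }
}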
| 97 |
| 98 int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { |
| 99 __m128i zero = _mm_setzero_si128(); |
| 100 __m128i src_line = _mm_load_si128((const __m128i *)ref); |
 | 101 __m128i s0 = _mm_sad_epu8(src_line, zero);  // SAD vs. zero: sum of 16 bytes |
| 102 __m128i s1; |
| 103 (void) width; // width = 64 |
| 104 |
| 105 ref += 16; |
| 106 src_line = _mm_load_si128((const __m128i *)ref); |
| 107 s1 = _mm_sad_epu8(src_line, zero); |
| 108 s0 = _mm_adds_epu16(s0, s1); |
| 109 |
| 110 ref += 16; |
| 111 src_line = _mm_load_si128((const __m128i *)ref); |
| 112 s1 = _mm_sad_epu8(src_line, zero); |
| 113 s0 = _mm_adds_epu16(s0, s1); |
| 114 |
| 115 ref += 16; |
| 116 src_line = _mm_load_si128((const __m128i *)ref); |
| 117 s1 = _mm_sad_epu8(src_line, zero); |
| 118 s0 = _mm_adds_epu16(s0, s1); |
| 119 |
| 120 s1 = _mm_srli_si128(s0, 8); |
 | 121 s0 = _mm_adds_epu16(s0, s1);  // fold the two 64-bit SAD partials together |
| 122 |
| 123 return _mm_extract_epi16(s0, 0); |
| 124 } |
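
Note: vp9_int_pro_col_sse2 is the matching horizontal projection for a single 64-pixel row: _mm_sad_epu8 against zero sums 16 bytes into a 64-bit lane pair, the four loads cover the hard-coded width of 64, and the final shift-and-add folds the two lanes together. Scalar equivalent:

#include <stdint.h>

static int16_t int_pro_col_ref(const uint8_t *ref, int width) {
  int i, sum = 0;
  for (i = 0; i < width; ++i) sum += ref[i];  /* width is fixed at 64 by the caller */
  return (int16_t)sum;
}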
| 125 |
| 126 int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, |
| 127 const int width) { |
| 128 int idx; |
| 129 __m128i zero = _mm_setzero_si128(); |
| 130 __m128i sum; |
| 131 __m128i v0 = _mm_loadu_si128((const __m128i *)ref); |
| 132 __m128i v1 = _mm_load_si128((const __m128i *)src); |
| 133 __m128i diff = _mm_subs_epi16(v0, v1); |
 | 134 __m128i sign = _mm_srai_epi16(diff, 15);  // per-lane 0 or -1 |
 | 135 |
 | 136 diff = _mm_xor_si128(diff, sign); |
 | 137 sum = _mm_sub_epi16(diff, sign);  // abs(diff) = (diff ^ sign) - sign |
| 138 |
 | 139 (void) width; // width = 64 |
| 140 |
| 141 ref += 8; |
| 142 src += 8; |
| 143 |
 | 144 v0 = _mm_unpacklo_epi16(sum, zero);  // widen abs diffs to 32 bits |
| 145 v1 = _mm_unpackhi_epi16(sum, zero); |
| 146 sum = _mm_add_epi32(v0, v1); |
| 147 |
| 148 for (idx = 1; idx < 8; ++idx) { |
| 149 v0 = _mm_loadu_si128((const __m128i *)ref); |
| 150 v1 = _mm_load_si128((const __m128i *)src); |
| 151 diff = _mm_subs_epi16(v0, v1); |
| 152 sign = _mm_srai_epi16(diff, 15); |
| 153 diff = _mm_xor_si128(diff, sign); |
| 154 diff = _mm_sub_epi16(diff, sign); |
| 155 |
| 156 v0 = _mm_unpacklo_epi16(diff, zero); |
| 157 v1 = _mm_unpackhi_epi16(diff, zero); |
| 158 |
| 159 sum = _mm_add_epi32(sum, v0); |
| 160 sum = _mm_add_epi32(sum, v1); |
| 161 |
| 162 ref += 8; |
| 163 src += 8; |
| 164 } |
| 165 |
| 166 v0 = _mm_srli_si128(sum, 8); |
| 167 sum = _mm_add_epi32(sum, v0); |
| 168 v0 = _mm_srli_epi64(sum, 32); |
 | 169 sum = _mm_add_epi32(sum, v0);  // horizontal sum of the four 32-bit lanes |
| 170 |
| 171 return _mm_cvtsi128_si32(sum); |
| 172 } |
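
Note: vp9_vector_sad_sse2 computes the sum of absolute differences between two 64-element int16 vectors (e.g. the projections built above). Absolute values come from the branchless identity abs(x) = (x ^ s) - s with s = x >> 15, and each batch of eight 16-bit results is widened to 32 bits before accumulation so the running sum cannot overflow. Scalar equivalent (plain subtraction where the SSE2 path saturates):

#include <stdint.h>
#include <stdlib.h>

static int vector_sad_ref(const int16_t *ref, const int16_t *src, int width) {
  int i, sad = 0;
  for (i = 0; i < width; ++i) sad += abs(ref[i] - src[i]);  /* width fixed at 64 */
  return sad;
}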