Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname in Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Unified Diff: source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c

Issue 996503002: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_svc_layercontext.c ('k') | source/libvpx/vp9/vp9_cx_iface.c » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
diff --git a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 0a105629f067478a5754101879b38364e07af345..f49949940395e9ef29ec8c76470e4122c843879f 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -90,6 +90,17 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
s0 = _mm_adds_epu16(s0, t0);
s1 = _mm_adds_epu16(s1, t1);
+ if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
_mm_store_si128((__m128i *)hbuf, s0);
hbuf += 8;
_mm_store_si128((__m128i *)hbuf, s1);
@@ -100,73 +111,62 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
__m128i src_line = _mm_load_si128((const __m128i *)ref);
__m128i s0 = _mm_sad_epu8(src_line, zero);
__m128i s1;
- (void) width; // width = 64
-
- ref += 16;
- src_line = _mm_load_si128((const __m128i *)ref);
- s1 = _mm_sad_epu8(src_line, zero);
- s0 = _mm_adds_epu16(s0, s1);
+ int i;
+ const int norm_factor = 3 + (width >> 5);
- ref += 16;
- src_line = _mm_load_si128((const __m128i *)ref);
- s1 = _mm_sad_epu8(src_line, zero);
- s0 = _mm_adds_epu16(s0, s1);
-
- ref += 16;
- src_line = _mm_load_si128((const __m128i *)ref);
- s1 = _mm_sad_epu8(src_line, zero);
- s0 = _mm_adds_epu16(s0, s1);
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_load_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
s1 = _mm_srli_si128(s0, 8);
s0 = _mm_adds_epu16(s0, s1);
- return _mm_extract_epi16(s0, 0);
+ return _mm_extract_epi16(s0, 0) >> norm_factor;
}
-int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
- const int width) {
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+ const int bwl) {
int idx;
- __m128i zero = _mm_setzero_si128();
- __m128i sum;
+ int width = 4 << bwl;
+ int16_t mean;
__m128i v0 = _mm_loadu_si128((const __m128i *)ref);
__m128i v1 = _mm_load_si128((const __m128i *)src);
__m128i diff = _mm_subs_epi16(v0, v1);
- __m128i sign = _mm_srai_epi16(diff, 15);
-
- diff = _mm_xor_si128(diff, sign);
- sum = _mm_sub_epi16(diff, sign);
-
- (void) width; // width = 64;
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
ref += 8;
src += 8;
- v0 = _mm_unpacklo_epi16(sum, zero);
- v1 = _mm_unpackhi_epi16(sum, zero);
- sum = _mm_add_epi32(v0, v1);
-
- for (idx = 1; idx < 8; ++idx) {
+ for (idx = 8; idx < width; idx += 8) {
v0 = _mm_loadu_si128((const __m128i *)ref);
v1 = _mm_load_si128((const __m128i *)src);
diff = _mm_subs_epi16(v0, v1);
- sign = _mm_srai_epi16(diff, 15);
- diff = _mm_xor_si128(diff, sign);
- diff = _mm_sub_epi16(diff, sign);
-
- v0 = _mm_unpacklo_epi16(diff, zero);
- v1 = _mm_unpackhi_epi16(diff, zero);
- sum = _mm_add_epi32(sum, v0);
- sum = _mm_add_epi32(sum, v1);
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
ref += 8;
src += 8;
}
- v0 = _mm_srli_si128(sum, 8);
- sum = _mm_add_epi32(sum, v0);
- v0 = _mm_srli_epi64(sum, 32);
- sum = _mm_add_epi32(sum, v0);
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = _mm_extract_epi16(sum, 0);
- return _mm_cvtsi128_si32(sum);
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}
« no previous file with comments | « source/libvpx/vp9/encoder/vp9_svc_layercontext.c ('k') | source/libvpx/vp9/vp9_cx_iface.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698