// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "remoting/capturer/differ_block_sse2.h"

#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif

#include "remoting/capturer/differ_block.h"

namespace remoting {

18 extern int BlockDifference_SSE2_W16(const uint8* image1, const uint8* image2, | |
19 int stride) { | |
20 __m128i acc = _mm_setzero_si128(); | |
21 __m128i v0; | |
22 __m128i v1; | |
23 __m128i sad; | |
24 for (int y = 0; y < kBlockSize; ++y) { | |
25 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); | |
26 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); | |
27 v0 = _mm_loadu_si128(i1); | |
28 v1 = _mm_loadu_si128(i2); | |
29 sad = _mm_sad_epu8(v0, v1); | |
30 acc = _mm_adds_epu16(acc, sad); | |
31 v0 = _mm_loadu_si128(i1 + 1); | |
32 v1 = _mm_loadu_si128(i2 + 1); | |
33 sad = _mm_sad_epu8(v0, v1); | |
34 acc = _mm_adds_epu16(acc, sad); | |
35 v0 = _mm_loadu_si128(i1 + 2); | |
36 v1 = _mm_loadu_si128(i2 + 2); | |
37 sad = _mm_sad_epu8(v0, v1); | |
38 acc = _mm_adds_epu16(acc, sad); | |
39 v0 = _mm_loadu_si128(i1 + 3); | |
40 v1 = _mm_loadu_si128(i2 + 3); | |
41 sad = _mm_sad_epu8(v0, v1); | |
42 acc = _mm_adds_epu16(acc, sad); | |
43 | |
44 // This essential means sad = acc >> 64. We only care about the lower 16 | |
45 // bits. | |
46 sad = _mm_shuffle_epi32(acc, 0xEE); | |
47 sad = _mm_adds_epu16(sad, acc); | |
48 int diff = _mm_cvtsi128_si32(sad); | |
49 if (diff) | |
50 return 1; | |
51 image1 += stride; | |
52 image2 += stride; | |
53 } | |
54 return 0; | |
55 } | |
56 | |
57 extern int BlockDifference_SSE2_W32(const uint8* image1, const uint8* image2, | |
58 int stride) { | |
59 __m128i acc = _mm_setzero_si128(); | |
60 __m128i v0; | |
61 __m128i v1; | |
62 __m128i sad; | |
63 for (int y = 0; y < kBlockSize; ++y) { | |
64 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1); | |
65 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2); | |
66 v0 = _mm_loadu_si128(i1); | |
67 v1 = _mm_loadu_si128(i2); | |
68 sad = _mm_sad_epu8(v0, v1); | |
69 acc = _mm_adds_epu16(acc, sad); | |
70 v0 = _mm_loadu_si128(i1 + 1); | |
71 v1 = _mm_loadu_si128(i2 + 1); | |
72 sad = _mm_sad_epu8(v0, v1); | |
73 acc = _mm_adds_epu16(acc, sad); | |
74 v0 = _mm_loadu_si128(i1 + 2); | |
75 v1 = _mm_loadu_si128(i2 + 2); | |
76 sad = _mm_sad_epu8(v0, v1); | |
77 acc = _mm_adds_epu16(acc, sad); | |
78 v0 = _mm_loadu_si128(i1 + 3); | |
79 v1 = _mm_loadu_si128(i2 + 3); | |
80 sad = _mm_sad_epu8(v0, v1); | |
81 acc = _mm_adds_epu16(acc, sad); | |
82 v0 = _mm_loadu_si128(i1 + 4); | |
83 v1 = _mm_loadu_si128(i2 + 4); | |
84 sad = _mm_sad_epu8(v0, v1); | |
85 acc = _mm_adds_epu16(acc, sad); | |
86 v0 = _mm_loadu_si128(i1 + 5); | |
87 v1 = _mm_loadu_si128(i2 + 5); | |
88 sad = _mm_sad_epu8(v0, v1); | |
89 acc = _mm_adds_epu16(acc, sad); | |
90 v0 = _mm_loadu_si128(i1 + 6); | |
91 v1 = _mm_loadu_si128(i2 + 6); | |
92 sad = _mm_sad_epu8(v0, v1); | |
93 acc = _mm_adds_epu16(acc, sad); | |
94 v0 = _mm_loadu_si128(i1 + 7); | |
95 v1 = _mm_loadu_si128(i2 + 7); | |
96 sad = _mm_sad_epu8(v0, v1); | |
97 acc = _mm_adds_epu16(acc, sad); | |
98 | |
99 // This essential means sad = acc >> 64. We only care about the lower 16 | |
100 // bits. | |
101 sad = _mm_shuffle_epi32(acc, 0xEE); | |
102 sad = _mm_adds_epu16(sad, acc); | |
103 int diff = _mm_cvtsi128_si32(sad); | |
104 if (diff) | |
105 return 1; | |
106 image1 += stride; | |
107 image2 += stride; | |
108 } | |
109 return 0; | |
110 } | |
}  // namespace remoting