OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ |
| 10 |
| 11 #include "libyuv/row.h" |
| 12 #include "libyuv/rotate_row.h" |
| 13 |
| 14 #ifdef __cplusplus |
| 15 namespace libyuv { |
| 16 extern "C" { |
| 17 #endif |
| 18 |
| 19 // This module is for 32-bit x86 builds with Visual C++ (MSVC inline assembly, not clang). |
| 20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ |
| 21 defined(_MSC_VER) && !defined(__clang__) |
| 22 |
// Transposes one 8-row strip of bytes. Each pass of the loop loads 8 bytes
// from each of 8 consecutive source rows (src_stride apart) and stores 8
// destination rows of 8 bytes (dst_stride apart), so that source column j
// becomes destination row j. src then moves 8 bytes right and dst moves 8
// rows down.
//   src / src_stride: first source row and byte distance between rows.
//   dst / dst_stride: first destination row and byte distance between rows.
//   width: columns to process, consumed 8 per pass. The count is tested only
//     after a pass, so the loop always runs at least once and a width that
//     is not a multiple of 8 is rounded up to the next multiple.
// Naked function: arguments are read straight from the stack, edi/esi/ebp
// are saved and restored by hand, and the plain `ret` pops no arguments.
// All eight xmm registers are overwritten. palignr is an SSSE3 instruction.
| 23 __declspec(naked) |
| 24 void TransposeWx8_SSSE3(const uint8* src, int src_stride, |
| 25 uint8* dst, int dst_stride, int width) { |
| 26 __asm { |
| 27 push edi |
| 28 push esi |
| 29 push ebp |
// After the three pushes, [esp + 12] is the return address and the
// arguments start at [esp + 12 + 4].
| 30 mov eax, [esp + 12 + 4] // eax = src |
| 31 mov edi, [esp + 12 + 8] // edi = src_stride |
| 32 mov edx, [esp + 12 + 12] // edx = dst |
| 33 mov esi, [esp + 12 + 16] // esi = dst_stride |
| 34 mov ecx, [esp + 12 + 20] // ecx = width (columns remaining) |
| 35 |
| 36 // Read in the data from the source pointer, 8 bytes per row. |
| 37 // First round: interleave the bytes of each pair of adjacent rows. |
| 38 align 4 |
| 39 convertloop: |
| 40 movq xmm0, qword ptr [eax] |
// ebp remembers where the next pass starts (8 bytes right of this one),
// because eax is about to walk down the 8 rows.
| 41 lea ebp, [eax + 8] |
| 42 movq xmm1, qword ptr [eax + edi] |
| 43 lea eax, [eax + 2 * edi] |
| 44 punpcklbw xmm0, xmm1 |
| 45 movq xmm2, qword ptr [eax] |
// Copy, then rotate the copy by 8 bytes so the upper qword of the
// interleaved pair sits in the low qword of its own register. The same
// movdqa + palignr idiom is used for every "upper half" below.
| 46 movdqa xmm1, xmm0 |
| 47 palignr xmm1, xmm1, 8 |
| 48 movq xmm3, qword ptr [eax + edi] |
| 49 lea eax, [eax + 2 * edi] |
| 50 punpcklbw xmm2, xmm3 |
| 51 movdqa xmm3, xmm2 |
| 52 movq xmm4, qword ptr [eax] |
| 53 palignr xmm3, xmm3, 8 |
| 54 movq xmm5, qword ptr [eax + edi] |
| 55 punpcklbw xmm4, xmm5 |
| 56 lea eax, [eax + 2 * edi] |
| 57 movdqa xmm5, xmm4 |
| 58 movq xmm6, qword ptr [eax] |
| 59 palignr xmm5, xmm5, 8 |
| 60 movq xmm7, qword ptr [eax + edi] |
| 61 punpcklbw xmm6, xmm7 |
// All 8 rows are loaded; step src to the next 8 columns of row 0.
| 62 mov eax, ebp |
| 63 movdqa xmm7, xmm6 |
| 64 palignr xmm7, xmm7, 8 |
| 65 // Second round: interleave 16-bit pairs, giving 4 rows per column. |
| 66 punpcklwd xmm0, xmm2 |
| 67 punpcklwd xmm1, xmm3 |
| 68 movdqa xmm2, xmm0 |
| 69 movdqa xmm3, xmm1 |
| 70 palignr xmm2, xmm2, 8 |
| 71 palignr xmm3, xmm3, 8 |
| 72 punpcklwd xmm4, xmm6 |
| 73 punpcklwd xmm5, xmm7 |
| 74 movdqa xmm6, xmm4 |
| 75 movdqa xmm7, xmm5 |
| 76 palignr xmm6, xmm6, 8 |
| 77 palignr xmm7, xmm7, 8 |
| 78 // Third round: interleave 32-bit groups, giving two full columns per register. |
| 79 // Write to the destination pointer, two output rows at a time. |
| 80 punpckldq xmm0, xmm4 |
| 81 movq qword ptr [edx], xmm0 |
| 82 movdqa xmm4, xmm0 |
| 83 palignr xmm4, xmm4, 8 |
| 84 movq qword ptr [edx + esi], xmm4 |
| 85 lea edx, [edx + 2 * esi] |
| 86 punpckldq xmm2, xmm6 |
| 87 movdqa xmm6, xmm2 |
| 88 palignr xmm6, xmm6, 8 |
| 89 movq qword ptr [edx], xmm2 |
| 90 punpckldq xmm1, xmm5 |
| 91 movq qword ptr [edx + esi], xmm6 |
| 92 lea edx, [edx + 2 * esi] |
| 93 movdqa xmm5, xmm1 |
| 94 movq qword ptr [edx], xmm1 |
| 95 palignr xmm5, xmm5, 8 |
| 96 punpckldq xmm3, xmm7 |
| 97 movq qword ptr [edx + esi], xmm5 |
| 98 lea edx, [edx + 2 * esi] |
| 99 movq qword ptr [edx], xmm3 |
| 100 movdqa xmm7, xmm3 |
| 101 palignr xmm7, xmm7, 8 |
// The flags set by this sub are the ones jg tests below; only movq and
// lea sit in between, and neither writes flags.
| 102 sub ecx, 8 |
| 103 movq qword ptr [edx + esi], xmm7 |
| 104 lea edx, [edx + 2 * esi] |
| 105 jg convertloop |
| 106 |
// Restore the saved registers in reverse push order.
| 107 pop ebp |
| 108 pop esi |
| 109 pop edi |
| 110 ret |
| 111 } |
| 112 } |
| 113 |
// Transposes one 8-row strip of byte pairs and de-interleaves it into two
// destinations. Each pass of the loop loads 16 bytes from each of 8
// consecutive source rows (src_stride apart). Even-numbered source byte
// columns become rows of dst_a and odd-numbered ones become rows of dst_b,
// 8 rows of 8 bytes each per pass. src then moves 16 bytes right and both
// destinations move 8 rows down.
// (Presumably the pairs are interleaved U/V samples, per the function name.)
//   src / src_stride: first source row and byte distance between rows.
//   dst_a / dst_stride_a: output for even byte columns and its row distance.
//   dst_b / dst_stride_b: output for odd byte columns and its row distance.
//   w: byte pairs to process, consumed 8 per pass. The count is tested only
//     after a pass, so the loop always runs at least once and a w that is
//     not a multiple of 8 is rounded up to the next multiple.
// Naked function: arguments are read straight from the stack,
// ebx/esi/edi/ebp are saved and restored by hand, and the plain `ret` pops
// no arguments. All eight xmm registers are overwritten; a 16-byte stack
// slot is used as a ninth because the shuffle needs one more temporary.
| 114 __declspec(naked) |
| 115 void TransposeUVWx8_SSE2(const uint8* src, int src_stride, |
| 116 uint8* dst_a, int dst_stride_a, |
| 117 uint8* dst_b, int dst_stride_b, |
| 118 int w) { |
| 119 __asm { |
| 120 push ebx |
| 121 push esi |
| 122 push edi |
| 123 push ebp |
// After the four pushes, [esp + 16] is the return address and the
// arguments start at [esp + 16 + 4].
| 124 mov eax, [esp + 16 + 4] // eax = src |
| 125 mov edi, [esp + 16 + 8] // edi = src_stride |
| 126 mov edx, [esp + 16 + 12] // edx = dst_a |
| 127 mov esi, [esp + 16 + 16] // esi = dst_stride_a |
| 128 mov ebx, [esp + 16 + 20] // ebx = dst_b |
| 129 mov ebp, [esp + 16 + 24] // ebp = dst_stride_b |
// Build a scratch frame: [esp] is a 16-byte slot, 16-byte aligned by the
// `and`, and [esp + 16] holds the pre-adjustment esp so the epilogue can
// undo the alignment. ecx carries the old esp until it has been stored and
// used to fetch w.
| 130 mov ecx, esp |
| 131 sub esp, 4 + 16 |
| 132 and esp, ~15 |
| 133 mov [esp + 16], ecx |
| 134 mov ecx, [ecx + 16 + 28] // ecx = w (byte pairs remaining) |
| 135 |
| 136 align 4 |
| 137 convertloop: |
| 138 // Read in the data from the source pointer, 16 bytes per row. |
| 139 // First round: interleave the bytes of each pair of adjacent rows. |
| 140 movdqu xmm0, [eax] |
| 141 movdqu xmm1, [eax + edi] |
| 142 lea eax, [eax + 2 * edi] |
| 143 movdqa xmm7, xmm0 // use xmm7 as temp register. |
| 144 punpcklbw xmm0, xmm1 |
| 145 punpckhbw xmm7, xmm1 |
| 146 movdqa xmm1, xmm7 |
| 147 movdqu xmm2, [eax] |
| 148 movdqu xmm3, [eax + edi] |
| 149 lea eax, [eax + 2 * edi] |
| 150 movdqa xmm7, xmm2 |
| 151 punpcklbw xmm2, xmm3 |
| 152 punpckhbw xmm7, xmm3 |
| 153 movdqa xmm3, xmm7 |
| 154 movdqu xmm4, [eax] |
| 155 movdqu xmm5, [eax + edi] |
| 156 lea eax, [eax + 2 * edi] |
| 157 movdqa xmm7, xmm4 |
| 158 punpcklbw xmm4, xmm5 |
| 159 punpckhbw xmm7, xmm5 |
| 160 movdqa xmm5, xmm7 |
| 161 movdqu xmm6, [eax] |
| 162 movdqu xmm7, [eax + edi] |
| 163 lea eax, [eax + 2 * edi] |
| 164 movdqu [esp], xmm5 // backup xmm5 to the stack slot; all 8 xmm registers are live |
// eax is now 8 rows below where this pass started. Negating the stride lets
// a single lea step back up those 8 rows and 16 bytes to the right; the
// second neg restores the stride.
| 165 neg edi |
| 166 movdqa xmm5, xmm6 // use xmm5 as temp register. |
| 167 punpcklbw xmm6, xmm7 |
| 168 punpckhbw xmm5, xmm7 |
| 169 movdqa xmm7, xmm5 |
| 170 lea eax, [eax + 8 * edi + 16] |
| 171 neg edi |
| 172 // Second round: interleave 16-bit pairs, giving 4 rows per column. |
| 173 movdqa xmm5, xmm0 |
| 174 punpcklwd xmm0, xmm2 |
| 175 punpckhwd xmm5, xmm2 |
| 176 movdqa xmm2, xmm5 |
| 177 movdqa xmm5, xmm1 |
| 178 punpcklwd xmm1, xmm3 |
| 179 punpckhwd xmm5, xmm3 |
| 180 movdqa xmm3, xmm5 |
| 181 movdqa xmm5, xmm4 |
| 182 punpcklwd xmm4, xmm6 |
| 183 punpckhwd xmm5, xmm6 |
| 184 movdqa xmm6, xmm5 |
| 185 movdqu xmm5, [esp] // restore xmm5 |
| 186 movdqu [esp], xmm6 // backup xmm6 |
| 187 movdqa xmm6, xmm5 // use xmm6 as temp register. |
| 188 punpcklwd xmm5, xmm7 |
| 189 punpckhwd xmm6, xmm7 |
| 190 movdqa xmm7, xmm6 |
| 191 // Third round: interleave 32-bit groups, giving two full columns per register. |
| 192 // Write to the destination pointers: low qword to dst_a, high qword to dst_b. |
| 193 movdqa xmm6, xmm0 |
| 194 punpckldq xmm0, xmm4 |
| 195 punpckhdq xmm6, xmm4 |
| 196 movdqa xmm4, xmm6 |
| 197 movdqu xmm6, [esp] // restore xmm6 |
| 198 movlpd qword ptr [edx], xmm0 |
| 199 movhpd qword ptr [ebx], xmm0 |
| 200 movlpd qword ptr [edx + esi], xmm4 |
| 201 lea edx, [edx + 2 * esi] |
| 202 movhpd qword ptr [ebx + ebp], xmm4 |
| 203 lea ebx, [ebx + 2 * ebp] |
| 204 movdqa xmm0, xmm2 // use xmm0 as the temp register. |
| 205 punpckldq xmm2, xmm6 |
| 206 movlpd qword ptr [edx], xmm2 |
| 207 movhpd qword ptr [ebx], xmm2 |
| 208 punpckhdq xmm0, xmm6 |
| 209 movlpd qword ptr [edx + esi], xmm0 |
| 210 lea edx, [edx + 2 * esi] |
| 211 movhpd qword ptr [ebx + ebp], xmm0 |
| 212 lea ebx, [ebx + 2 * ebp] |
| 213 movdqa xmm0, xmm1 // use xmm0 as the temp register. |
| 214 punpckldq xmm1, xmm5 |
| 215 movlpd qword ptr [edx], xmm1 |
| 216 movhpd qword ptr [ebx], xmm1 |
| 217 punpckhdq xmm0, xmm5 |
| 218 movlpd qword ptr [edx + esi], xmm0 |
| 219 lea edx, [edx + 2 * esi] |
| 220 movhpd qword ptr [ebx + ebp], xmm0 |
| 221 lea ebx, [ebx + 2 * ebp] |
| 222 movdqa xmm0, xmm3 // use xmm0 as the temp register. |
| 223 punpckldq xmm3, xmm7 |
| 224 movlpd qword ptr [edx], xmm3 |
| 225 movhpd qword ptr [ebx], xmm3 |
| 226 punpckhdq xmm0, xmm7 |
// The flags set by this sub are the ones jg tests below; only stores and
// lea sit in between, and neither writes flags.
| 227 sub ecx, 8 |
| 228 movlpd qword ptr [edx + esi], xmm0 |
| 229 lea edx, [edx + 2 * esi] |
| 230 movhpd qword ptr [ebx + ebp], xmm0 |
| 231 lea ebx, [ebx + 2 * ebp] |
| 232 jg convertloop |
| 233 |
// Drop the scratch frame by reloading the esp saved in the prologue, then
// restore the saved registers in reverse push order.
| 234 mov esp, [esp + 16] |
| 235 pop ebp |
| 236 pop edi |
| 237 pop esi |
| 238 pop ebx |
| 239 ret |
| 240 } |
| 241 } |
| 242 |
| 243 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) |
| 244 |
| 245 #ifdef __cplusplus |
| 246 } // extern "C" |
| 247 } // namespace libyuv |
| 248 #endif |
OLD | NEW |