| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 77 matching lines...) | |
| 88 // Arrange third value for pixels 0,1,2,3,4,5 | 88 // Arrange third value for pixels 0,1,2,3,4,5 |
| 89 static uvec8 kShufAb2 = | 89 static uvec8 kShufAb2 = |
| 90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; | 90 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
| 91 | 91 |
| 92 // Scaling values for boxes of 3x2 and 2x2 | 92 // Scaling values for boxes of 3x2 and 2x2 |
| 93 static uvec16 kScaleAb2 = | 93 static uvec16 kScaleAb2 = |
| 94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; | 94 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
| 95 | 95 |
| 96 // Reads 32 pixels, throws half away and writes 16 pixels. | 96 // Reads 32 pixels, throws half away and writes 16 pixels. |
| 97 __declspec(naked) | 97 __declspec(naked) |
| 98 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 98 void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 99 uint8* dst_ptr, int dst_width) { | 99 uint8* dst_ptr, int dst_width) { |
| 100 __asm { | 100 __asm { |
| 101 mov eax, [esp + 4] // src_ptr | 101 mov eax, [esp + 4] // src_ptr |
| 102 // src_stride ignored | 102 // src_stride ignored |
| 103 mov edx, [esp + 12] // dst_ptr | 103 mov edx, [esp + 12] // dst_ptr |
| 104 mov ecx, [esp + 16] // dst_width | 104 mov ecx, [esp + 16] // dst_width |
| 105 | 105 |
| 106 wloop: | 106 wloop: |
| 107 movdqu xmm0, [eax] | 107 movdqu xmm0, [eax] |
| 108 movdqu xmm1, [eax + 16] | 108 movdqu xmm1, [eax + 16] |
| 109 lea eax, [eax + 32] | 109 lea eax, [eax + 32] |
| 110 psrlw xmm0, 8 // isolate odd pixels. | 110 psrlw xmm0, 8 // isolate odd pixels. |
| 111 psrlw xmm1, 8 | 111 psrlw xmm1, 8 |
| 112 packuswb xmm0, xmm1 | 112 packuswb xmm0, xmm1 |
| 113 movdqu [edx], xmm0 | 113 movdqu [edx], xmm0 |
| 114 lea edx, [edx + 16] | 114 lea edx, [edx + 16] |
| 115 sub ecx, 16 | 115 sub ecx, 16 |
| 116 jg wloop | 116 jg wloop |
| 117 | 117 |
| 118 ret | 118 ret |
| 119 } | 119 } |
| 120 } | 120 } |
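
For reference, a scalar sketch of what this kernel computes: the psrlw 8 / packuswb pair keeps the odd-indexed byte of every 16-bit pair, i.e. it point-samples every second pixel. The C form and name below are illustrative only, not part of this patch:

```c
#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the row-down2 subsample above: keep the odd pixel of
// each 2x1 pair, mirroring psrlw 8 (high byte of each word) + packuswb.
static void ScaleRowDown2_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;  // ignored, as in the SIMD version
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}
```
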
| 121 | 121 |
| 122 // Blends 32x1 rectangle to 16x1. | 122 // Blends 32x1 rectangle to 16x1. |
| 123 __declspec(naked) | 123 __declspec(naked) |
| 124 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 124 void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 125 uint8* dst_ptr, int dst_width) { | 125 uint8* dst_ptr, int dst_width) { |
| 126 __asm { | 126 __asm { |
| 127 mov eax, [esp + 4] // src_ptr | 127 mov eax, [esp + 4] // src_ptr |
| 128 // src_stride | 128 // src_stride |
| 129 mov edx, [esp + 12] // dst_ptr | 129 mov edx, [esp + 12] // dst_ptr |
| 130 mov ecx, [esp + 16] // dst_width | 130 mov ecx, [esp + 16] // dst_width |
| 131 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 131 |
| 132 psrlw xmm5, 8 | 132 pcmpeqb xmm4, xmm4 // constant 0x0101 |
| | 133 psrlw xmm4, 15 |
| | 134 packuswb xmm4, xmm4 |
| | 135 pxor xmm5, xmm5 // constant 0 |
| 133 | 136 |
| 134 wloop: | 137 wloop: |
| 135 movdqu xmm0, [eax] | 138 movdqu xmm0, [eax] |
| 136 movdqu xmm1, [eax + 16] | 139 movdqu xmm1, [eax + 16] |
| 137 lea eax, [eax + 32] | 140 lea eax, [eax + 32] |
| 138 | 141 pmaddubsw xmm0, xmm4 // horizontal add |
| 139 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) | 142 pmaddubsw xmm1, xmm4 |
| 140 psrlw xmm0, 8 | 143 pavgw xmm0, xmm5 // (x + 1) / 2 |
| 141 movdqa xmm3, xmm1 | 144 pavgw xmm1, xmm5 |
| 142 psrlw xmm1, 8 | |
| 143 pand xmm2, xmm5 | |
| 144 pand xmm3, xmm5 | |
| 145 pavgw xmm0, xmm2 | |
| 146 pavgw xmm1, xmm3 | |
| 147 packuswb xmm0, xmm1 | 145 packuswb xmm0, xmm1 |
| 148 | |
| 149 movdqu [edx], xmm0 | 146 movdqu [edx], xmm0 |
| 150 lea edx, [edx + 16] | 147 lea edx, [edx + 16] |
| 151 sub ecx, 16 | 148 sub ecx, 16 |
| 152 jg wloop | 149 jg wloop |
| 153 | 150 |
| 154 ret | 151 ret |
| 155 } | 152 } |
| 156 } | 153 } |
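
The SSSE3 rewrite here replaces the old mask/shift/pavgw sequence with pmaddubsw against the 0x0101 constant (adds each horizontal byte pair into a 16-bit word) followed by pavgw against zero (rounds: (sum + 1) >> 1). A scalar sketch of the same computation, names illustrative only:

```c
#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the pmaddubsw(0x0101) + pavgw(0) idiom used above:
// horizontal add of each pixel pair, then round-to-nearest halving.
static void ScaleRowDown2Linear_C_sketch(const uint8_t* src_ptr,
                                         ptrdiff_t src_stride,
                                         uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;  // single-row kernel
  for (int x = 0; x < dst_width; ++x) {
    uint16_t sum = src_ptr[2 * x] + src_ptr[2 * x + 1];  // pmaddubsw vs 1,1
    dst_ptr[x] = (uint8_t)((sum + 1) >> 1);              // pavgw vs 0
  }
}
```
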
| 157 | 154 |
| 158 // Blends 32x2 rectangle to 16x1. | 155 // Blends 32x2 rectangle to 16x1. |
| 159 __declspec(naked) | 156 __declspec(naked) |
| 160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 157 void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 161 uint8* dst_ptr, int dst_width) { | 158 uint8* dst_ptr, int dst_width) { |
| 162 __asm { | 159 __asm { |
| 163 push esi | 160 push esi |
| 164 mov eax, [esp + 4 + 4] // src_ptr | 161 mov eax, [esp + 4 + 4] // src_ptr |
| 165 mov esi, [esp + 4 + 8] // src_stride | 162 mov esi, [esp + 4 + 8] // src_stride |
| 166 mov edx, [esp + 4 + 12] // dst_ptr | 163 mov edx, [esp + 4 + 12] // dst_ptr |
| 167 mov ecx, [esp + 4 + 16] // dst_width | 164 mov ecx, [esp + 4 + 16] // dst_width |
| 168 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff | 165 |
| 169 psrlw xmm5, 8 | 166 pcmpeqb xmm4, xmm4 // constant 0x0101 |
| | 167 psrlw xmm4, 15 |
| | 168 packuswb xmm4, xmm4 |
| | 169 pxor xmm5, xmm5 // constant 0 |
| 170 | 170 |
| 171 wloop: | 171 wloop: |
| 172 movdqu xmm0, [eax] | 172 movdqu xmm0, [eax] |
| 173 movdqu xmm1, [eax + 16] | 173 movdqu xmm1, [eax + 16] |
| 174 movdqu xmm2, [eax + esi] | 174 movdqu xmm2, [eax + esi] |
| 175 movdqu xmm3, [eax + esi + 16] | 175 movdqu xmm3, [eax + esi + 16] |
| 176 lea eax, [eax + 32] | 176 lea eax, [eax + 32] |
| 177 pavgb xmm0, xmm2 // average rows | 177 pmaddubsw xmm0, xmm4 // horizontal add |
| 178 pavgb xmm1, xmm3 | 178 pmaddubsw xmm1, xmm4 |
| 179 | 179 pmaddubsw xmm2, xmm4 |
| 180 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) | 180 pmaddubsw xmm3, xmm4 |
| 181 psrlw xmm0, 8 | 181 paddw xmm0, xmm2 // vertical add |
| 182 movdqa xmm3, xmm1 | 182 paddw xmm1, xmm3 |
| 183 psrlw xmm1, 8 | 183 psrlw xmm0, 1 |
| 184 pand xmm2, xmm5 | 184 psrlw xmm1, 1 |
| 185 pand xmm3, xmm5 | 185 pavgw xmm0, xmm5 // (x + 1) / 2 |
| 186 pavgw xmm0, xmm2 | 186 pavgw xmm1, xmm5 |
| 187 pavgw xmm1, xmm3 | |
| 188 packuswb xmm0, xmm1 | 187 packuswb xmm0, xmm1 |
| 189 | |
| 190 movdqu [edx], xmm0 | 188 movdqu [edx], xmm0 |
| 191 lea edx, [edx + 16] | 189 lea edx, [edx + 16] |
| 192 sub ecx, 16 | 190 sub ecx, 16 |
| 193 jg wloop | 191 jg wloop |
| 194 | 192 |
| 195 pop esi | 193 pop esi |
| 196 ret | 194 ret |
| 197 } | 195 } |
| 198 } | 196 } |
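
The box path sums a 2x2 block per output pixel (pmaddubsw per row, paddw across rows, max sum 1020, which fits in a word), then psrlw 1 + pavgw-with-zero yields (sum + 2) / 4 with rounding. A scalar sketch, function name hypothetical:

```c
#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the 2x2 box filter above: sum four pixels, then
// (sum >> 1) followed by pavgw-with-zero gives (sum + 2) / 4.
static void ScaleRowDown2Box_C_sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s1 = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    uint16_t sum = src_ptr[2 * x] + src_ptr[2 * x + 1] +  // pmaddubsw
                   s1[2 * x] + s1[2 * x + 1];             // + paddw
    uint16_t half = sum >> 1;                 // psrlw 1
    dst_ptr[x] = (uint8_t)((half + 1) >> 1);  // pavgw vs 0 == (sum + 2) / 4
  }
}
```
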
| 199 | 197 |
| (...skipping 38 matching lines...) | |
| 238 | 236 |
| 239 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b | 237 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b |
| 240 vpsrlw ymm4, ymm4, 15 | 238 vpsrlw ymm4, ymm4, 15 |
| 241 vpackuswb ymm4, ymm4, ymm4 | 239 vpackuswb ymm4, ymm4, ymm4 |
| 242 vpxor ymm5, ymm5, ymm5 // constant 0 | 240 vpxor ymm5, ymm5, ymm5 // constant 0 |
| 243 | 241 |
| 244 wloop: | 242 wloop: |
| 245 vmovdqu ymm0, [eax] | 243 vmovdqu ymm0, [eax] |
| 246 vmovdqu ymm1, [eax + 32] | 244 vmovdqu ymm1, [eax + 32] |
| 247 lea eax, [eax + 64] | 245 lea eax, [eax + 64] |
| 248 | 246 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add |
| 249 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally | |
| 250 vpmaddubsw ymm1, ymm1, ymm4 | 247 vpmaddubsw ymm1, ymm1, ymm4 |
| 251 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 | 248 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 |
| 252 vpavgw ymm1, ymm1, ymm5 | 249 vpavgw ymm1, ymm1, ymm5 |
| 253 vpackuswb ymm0, ymm0, ymm1 | 250 vpackuswb ymm0, ymm0, ymm1 |
| 254 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb | 251 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 255 | |
| 256 vmovdqu [edx], ymm0 | 252 vmovdqu [edx], ymm0 |
| 257 lea edx, [edx + 32] | 253 lea edx, [edx + 32] |
| 258 sub ecx, 32 | 254 sub ecx, 32 |
| 259 jg wloop | 255 jg wloop |
| 260 | 256 |
| 261 vzeroupper | 257 vzeroupper |
| 262 ret | 258 ret |
| 263 } | 259 } |
| 264 } | 260 } |
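
A note on the `vpermq ymm0, ymm0, 0xd8` step: the 256-bit vpackuswb packs within each 128-bit lane, so the packed qwords come out interleaved as [a.lo, b.lo, a.hi, b.hi]; the 0xd8 immediate selects qword order 0,2,1,3, restoring linear pixel order. An intrinsics illustration of the same pairing (helper name made up for this sketch):

```c
#include <immintrin.h>

// Illustration of the vpackuswb + vpermq pairing above, in intrinsics form.
// _mm256_packus_epi16 packs within each 128-bit lane, leaving the output
// qwords as [a.lo, b.lo, a.hi, b.hi]; permuting with 0xd8 (qwords 0,2,1,3)
// "unmutates" the result to [a.lo, a.hi, b.lo, b.hi].
static __m256i PackUnmutate(__m256i a, __m256i b) {
  __m256i packed = _mm256_packus_epi16(a, b);     // vpackuswb
  return _mm256_permute4x64_epi64(packed, 0xd8);  // vpermq, unmutate
}
```
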
| 265 | 261 |
| | 262 // For rounding, average = (sum + 2) / 4 |
| | 263 // becomes average((sum >> 1), 0) |
| 266 // Blends 64x2 rectangle to 32x1. | 264 // Blends 64x2 rectangle to 32x1. |
| 267 __declspec(naked) | 265 __declspec(naked) |
| 268 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 266 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 269 uint8* dst_ptr, int dst_width) { | 267 uint8* dst_ptr, int dst_width) { |
| 270 __asm { | 268 __asm { |
| 271 push esi | 269 push esi |
| 272 mov eax, [esp + 4 + 4] // src_ptr | 270 mov eax, [esp + 4 + 4] // src_ptr |
| 273 mov esi, [esp + 4 + 8] // src_stride | 271 mov esi, [esp + 4 + 8] // src_stride |
| 274 mov edx, [esp + 4 + 12] // dst_ptr | 272 mov edx, [esp + 4 + 12] // dst_ptr |
| 275 mov ecx, [esp + 4 + 16] // dst_width | 273 mov ecx, [esp + 4 + 16] // dst_width |
| 276 | 274 |
| 277 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b | 275 vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b |
| 278 vpsrlw ymm4, ymm4, 15 | 276 vpsrlw ymm4, ymm4, 15 |
| 279 vpackuswb ymm4, ymm4, ymm4 | 277 vpackuswb ymm4, ymm4, ymm4 |
| 280 vpxor ymm5, ymm5, ymm5 // constant 0 | 278 vpxor ymm5, ymm5, ymm5 // constant 0 |
| 281 | 279 |
| 282 wloop: | 280 wloop: |
| 283 vmovdqu ymm0, [eax] // average rows | 281 vmovdqu ymm0, [eax] |
| 284 vmovdqu ymm1, [eax + 32] | 282 vmovdqu ymm1, [eax + 32] |
| 285 vpavgb ymm0, ymm0, [eax + esi] | 283 vmovdqu ymm2, [eax + esi] |
| 286 vpavgb ymm1, ymm1, [eax + esi + 32] | 284 vmovdqu ymm3, [eax + esi + 32] |
| 287 lea eax, [eax + 64] | 285 lea eax, [eax + 64] |
| 288 | 286 vpmaddubsw ymm0, ymm0, ymm4 // horizontal add |
| 289 vpmaddubsw ymm0, ymm0, ymm4 // average horizontally | |
| 290 vpmaddubsw ymm1, ymm1, ymm4 | 287 vpmaddubsw ymm1, ymm1, ymm4 |
| | 288 vpmaddubsw ymm2, ymm2, ymm4 |
| | 289 vpmaddubsw ymm3, ymm3, ymm4 |
| | 290 vpaddw ymm0, ymm0, ymm2 // vertical add |
| | 291 vpaddw ymm1, ymm1, ymm3 |
| | 292 vpsrlw ymm0, ymm0, 1 |
| | 293 vpsrlw ymm1, ymm1, 1 |
| 291 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 | 294 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 |
| 292 vpavgw ymm1, ymm1, ymm5 | 295 vpavgw ymm1, ymm1, ymm5 |
| 293 vpackuswb ymm0, ymm0, ymm1 | 296 vpackuswb ymm0, ymm0, ymm1 |
| 294 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb | 297 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb |
| 295 | |
| 296 vmovdqu [edx], ymm0 | 298 vmovdqu [edx], ymm0 |
| 297 lea edx, [edx + 32] | 299 lea edx, [edx + 32] |
| 298 sub ecx, 32 | 300 sub ecx, 32 |
| 299 jg wloop | 301 jg wloop |
| 300 | 302 |
| 301 pop esi | 303 pop esi |
| 302 vzeroupper | 304 vzeroupper |
| 303 ret | 305 ret |
| 304 } | 306 } |
| 305 } | 307 } |
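
The rounding identity stated in the new comment (lines 262-263) can be checked exhaustively over every possible 2x2 byte sum (0..1020). A throwaway verification, assuming a hosted C environment:

```c
#include <assert.h>
#include <stdint.h>

// Exhaustive check of the comment above: for any 2x2 byte sum,
// (sum + 2) / 4 equals pavgw(sum >> 1, 0), i.e. ((sum >> 1) + 1) >> 1.
int main(void) {
  for (uint32_t sum = 0; sum <= 1020; ++sum) {
    assert((sum + 2) / 4 == (((sum >> 1) + 1) >> 1));
  }
  return 0;
}
```
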
| (...skipping 1038 matching lines...) | |
| 1344 idiv ecx | 1346 idiv ecx |
| 1345 ret | 1347 ret |
| 1346 } | 1348 } |
| 1347 } | 1349 } |
| 1348 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) | 1350 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) |
| 1349 | 1351 |
| 1350 #ifdef __cplusplus | 1352 #ifdef __cplusplus |
| 1351 } // extern "C" | 1353 } // extern "C" |
| 1352 } // namespace libyuv | 1354 } // namespace libyuv |
| 1353 #endif | 1355 #endif |