| OLD | NEW |
| 1 // VERSION 2 | 1 // VERSION 2 |
| 2 /* | 2 /* |
| 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 4 * | 4 * |
| 5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
| 6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
| 7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
| 8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
| 9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
| 10 */ | 10 */ |
| 11 | 11 |
| 12 #include "libyuv/row.h" | 12 #include "libyuv/row.h" |
| 13 | 13 |
| 14 #ifdef __cplusplus | 14 #ifdef __cplusplus |
| 15 namespace libyuv { | 15 namespace libyuv { |
| 16 extern "C" { | 16 extern "C" { |
| 17 #endif | 17 #endif |
| 18 | 18 |
| 19 // clang-format off | |
| 20 | |
| 21 // This module is for GCC x86 and x64. | 19 // This module is for GCC x86 and x64. |
| 22 #if !defined(LIBYUV_DISABLE_X86) && \ | 20 #if !defined(LIBYUV_DISABLE_X86) && \ |
| 23 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) | 21 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) |
| 24 | 22 |
| 25 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) | 23 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) |
| 26 | 24 |
| 27 // Constants for ARGB | 25 // Constants for ARGB |
| 28 static vec8 kARGBToY = { | 26 static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, |
| 29 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 | 27 13, 65, 33, 0, 13, 65, 33, 0}; |
| 30 }; | |
| 31 | 28 |
| 32 // JPeg full range. | 29 // JPeg full range. |
| 33 static vec8 kARGBToYJ = { | 30 static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, |
| 34 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 | 31 15, 75, 38, 0, 15, 75, 38, 0}; |
| 35 }; | |
| 36 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) | 32 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) |
| 37 | 33 |
| 38 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) | 34 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) |
| 39 | 35 |
| 40 static vec8 kARGBToU = { | 36 static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, |
| 41 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 | 37 112, -74, -38, 0, 112, -74, -38, 0}; |
| 38 |
| 39 static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, |
| 40 127, -84, -43, 0, 127, -84, -43, 0}; |
| 41 |
| 42 static vec8 kARGBToV = { |
| 43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, |
| 42 }; | 44 }; |
| 43 | 45 |
| 44 static vec8 kARGBToUJ = { | 46 static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, |
| 45 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 | 47 -20, -107, 127, 0, -20, -107, 127, 0}; |
| 46 }; | |
| 47 | |
| 48 static vec8 kARGBToV = { | |
| 49 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, | |
| 50 }; | |
| 51 | |
| 52 static vec8 kARGBToVJ = { | |
| 53 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 | |
| 54 }; | |
| 55 | 48 |
| 56 // Constants for BGRA | 49 // Constants for BGRA |
| 57 static vec8 kBGRAToY = { | 50 static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, |
| 58 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 | 51 0, 33, 65, 13, 0, 33, 65, 13}; |
| 59 }; | |
| 60 | 52 |
| 61 static vec8 kBGRAToU = { | 53 static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, |
| 62 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 | 54 0, -38, -74, 112, 0, -38, -74, 112}; |
| 63 }; | |
| 64 | 55 |
| 65 static vec8 kBGRAToV = { | 56 static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, |
| 66 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 | 57 0, 112, -94, -18, 0, 112, -94, -18}; |
| 67 }; | |
| 68 | 58 |
| 69 // Constants for ABGR | 59 // Constants for ABGR |
| 70 static vec8 kABGRToY = { | 60 static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, |
| 71 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 | 61 33, 65, 13, 0, 33, 65, 13, 0}; |
| 72 }; | |
| 73 | 62 |
| 74 static vec8 kABGRToU = { | 63 static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, |
| 75 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 | 64 -38, -74, 112, 0, -38, -74, 112, 0}; |
| 76 }; | |
| 77 | 65 |
| 78 static vec8 kABGRToV = { | 66 static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, |
| 79 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 | 67 112, -94, -18, 0, 112, -94, -18, 0}; |
| 80 }; | |
| 81 | 68 |
| 82 // Constants for RGBA. | 69 // Constants for RGBA. |
| 83 static vec8 kRGBAToY = { | 70 static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, |
| 84 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 | 71 0, 13, 65, 33, 0, 13, 65, 33}; |
| 85 }; | |
| 86 | 72 |
| 87 static vec8 kRGBAToU = { | 73 static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, |
| 88 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 | 74 0, 112, -74, -38, 0, 112, -74, -38}; |
| 89 }; | |
| 90 | 75 |
| 91 static vec8 kRGBAToV = { | 76 static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, |
| 92 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 | 77 0, -18, -94, 112, 0, -18, -94, 112}; |
| 93 }; | |
| 94 | 78 |
| 95 static uvec8 kAddY16 = { | 79 static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, |
| 96 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u | 80 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; |
| 97 }; | |
| 98 | 81 |
| 99 // 7 bit fixed point 0.5. | 82 // 7 bit fixed point 0.5. |
| 100 static vec16 kAddYJ64 = { | 83 static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; |
| 101 64, 64, 64, 64, 64, 64, 64, 64 | |
| 102 }; | |
| 103 | 84 |
| 104 static uvec8 kAddUV128 = { | 85 static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
| 105 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, | 86 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| 106 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | |
| 107 }; | |
| 108 | 87 |
| 109 static uvec16 kAddUVJ128 = { | 88 static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, |
| 110 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u | 89 0x8080u, 0x8080u, 0x8080u, 0x8080u}; |
| 111 }; | |
| 112 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) | 90 #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) |
| 113 | 91 |
| 114 #ifdef HAS_RGB24TOARGBROW_SSSE3 | 92 #ifdef HAS_RGB24TOARGBROW_SSSE3 |
| 115 | 93 |
| 116 // Shuffle table for converting RGB24 to ARGB. | 94 // Shuffle table for converting RGB24 to ARGB. |
| 117 static uvec8 kShuffleMaskRGB24ToARGB = { | 95 static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, |
| 118 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u | 96 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; |
| 119 }; | |
| 120 | 97 |
| 121 // Shuffle table for converting RAW to ARGB. | 98 // Shuffle table for converting RAW to ARGB. |
| 122 static uvec8 kShuffleMaskRAWToARGB = { | 99 static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, |
| 123 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u | 100 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; |
| 124 }; | |
| 125 | 101 |
| 126 // Shuffle table for converting RAW to RGB24. First 8. | 102 // Shuffle table for converting RAW to RGB24. First 8. |
| 127 static const uvec8 kShuffleMaskRAWToRGB24_0 = { | 103 static const uvec8 kShuffleMaskRAWToRGB24_0 = { |
| 128 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, | 104 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, |
| 129 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 105 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| 130 }; | |
| 131 | 106 |
| 132 // Shuffle table for converting RAW to RGB24. Middle 8. | 107 // Shuffle table for converting RAW to RGB24. Middle 8. |
| 133 static const uvec8 kShuffleMaskRAWToRGB24_1 = { | 108 static const uvec8 kShuffleMaskRAWToRGB24_1 = { |
| 134 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, | 109 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, |
| 135 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 110 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| 136 }; | |
| 137 | 111 |
| 138 // Shuffle table for converting RAW to RGB24. Last 8. | 112 // Shuffle table for converting RAW to RGB24. Last 8. |
| 139 static const uvec8 kShuffleMaskRAWToRGB24_2 = { | 113 static const uvec8 kShuffleMaskRAWToRGB24_2 = { |
| 140 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, | 114 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, |
| 141 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u | 115 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; |
| 142 }; | |
| 143 | 116 |
| 144 // Shuffle table for converting ARGB to RGB24. | 117 // Shuffle table for converting ARGB to RGB24. |
| 145 static uvec8 kShuffleMaskARGBToRGB24 = { | 118 static uvec8 kShuffleMaskARGBToRGB24 = { |
| 146 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u | 119 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; |
| 147 }; | |
| 148 | 120 |
| 149 // Shuffle table for converting ARGB to RAW. | 121 // Shuffle table for converting ARGB to RAW. |
| 150 static uvec8 kShuffleMaskARGBToRAW = { | 122 static uvec8 kShuffleMaskARGBToRAW = { |
| 151 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u | 123 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; |
| 152 }; | |
| 153 | 124 |
| 154 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 | 125 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 |
| 155 static uvec8 kShuffleMaskARGBToRGB24_0 = { | 126 static uvec8 kShuffleMaskARGBToRGB24_0 = { |
| 156 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u | 127 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; |
| 157 }; | |
| 158 | 128 |
| 159 // YUY2 shuf 16 Y to 32 Y. | 129 // YUY2 shuf 16 Y to 32 Y. |
| 160 static const lvec8 kShuffleYUY2Y = { | 130 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, |
| 161 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, | 131 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, |
| 162 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 | 132 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; |
| 163 }; | |
| 164 | 133 |
| 165 // YUY2 shuf 8 UV to 16 UV. | 134 // YUY2 shuf 8 UV to 16 UV. |
| 166 static const lvec8 kShuffleYUY2UV = { | 135 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, |
| 167 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, | 136 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, |
| 168 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 | 137 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; |
| 169 }; | |
| 170 | 138 |
| 171 // UYVY shuf 16 Y to 32 Y. | 139 // UYVY shuf 16 Y to 32 Y. |
| 172 static const lvec8 kShuffleUYVYY = { | 140 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, |
| 173 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, | 141 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, |
| 174 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 | 142 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; |
| 175 }; | |
| 176 | 143 |
| 177 // UYVY shuf 8 UV to 16 UV. | 144 // UYVY shuf 8 UV to 16 UV. |
| 178 static const lvec8 kShuffleUYVYUV = { | 145 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, |
| 179 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, | 146 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, |
| 180 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 | 147 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; |
| 181 }; | |
| 182 | 148 |
| 183 // NV21 shuf 8 VU to 16 UV. | 149 // NV21 shuf 8 VU to 16 UV. |
| 184 static const lvec8 kShuffleNV21 = { | 150 static const lvec8 kShuffleNV21 = { |
| 185 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 151 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 186 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, | 152 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, |
| 187 }; | 153 }; |
| 188 #endif // HAS_RGB24TOARGBROW_SSSE3 | 154 #endif // HAS_RGB24TOARGBROW_SSSE3 |
| 189 | 155 |
| 190 #ifdef HAS_J400TOARGBROW_SSE2 | 156 #ifdef HAS_J400TOARGBROW_SSE2 |
| 191 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { | 157 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { |
| 192 asm volatile ( | 158 asm volatile ( |
| 193 "pcmpeqb %%xmm5,%%xmm5 \n" | 159 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 194 "pslld $0x18,%%xmm5 \n" | 160 "pslld $0x18,%%xmm5 \n" |
| 195 LABELALIGN | 161 LABELALIGN |
| 196 "1: \n" | 162 "1: \n" |
| (...skipping 367 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 564 "lea " MEMLEA(0x8,1) ",%1 \n" | 530 "lea " MEMLEA(0x8,1) ",%1 \n" |
| 565 "sub $0x4,%2 \n" | 531 "sub $0x4,%2 \n" |
| 566 "jg 1b \n" | 532 "jg 1b \n" |
| 567 : "+r"(src), // %0 | 533 : "+r"(src), // %0 |
| 568 "+r"(dst), // %1 | 534 "+r"(dst), // %1 |
| 569 "+r"(width) // %2 | 535 "+r"(width) // %2 |
| 570 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 536 :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 571 ); | 537 ); |
| 572 } | 538 } |
| 573 | 539 |
| 574 void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, | 540 void ARGBToRGB565DitherRow_SSE2(const uint8* src, |
| 575 const uint32 dither4, int width) { | 541 uint8* dst, |
| 576 asm volatile ( | 542 const uint32 dither4, |
| 577 "movd %3,%%xmm6 \n" | 543 int width) { |
| 578 "punpcklbw %%xmm6,%%xmm6 \n" | 544 asm volatile( |
| 579 "movdqa %%xmm6,%%xmm7 \n" | 545 "movd %3,%%xmm6 \n" |
| 580 "punpcklwd %%xmm6,%%xmm6 \n" | 546 "punpcklbw %%xmm6,%%xmm6 \n" |
| 581 "punpckhwd %%xmm7,%%xmm7 \n" | 547 "movdqa %%xmm6,%%xmm7 \n" |
| 582 "pcmpeqb %%xmm3,%%xmm3 \n" | 548 "punpcklwd %%xmm6,%%xmm6 \n" |
| 583 "psrld $0x1b,%%xmm3 \n" | 549 "punpckhwd %%xmm7,%%xmm7 \n" |
| 584 "pcmpeqb %%xmm4,%%xmm4 \n" | 550 "pcmpeqb %%xmm3,%%xmm3 \n" |
| 585 "psrld $0x1a,%%xmm4 \n" | 551 "psrld $0x1b,%%xmm3 \n" |
| 586 "pslld $0x5,%%xmm4 \n" | 552 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 587 "pcmpeqb %%xmm5,%%xmm5 \n" | 553 "psrld $0x1a,%%xmm4 \n" |
| 588 "pslld $0xb,%%xmm5 \n" | 554 "pslld $0x5,%%xmm4 \n" |
| 555 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 556 "pslld $0xb,%%xmm5 \n" |
| 589 | 557 |
| 590 LABELALIGN | 558 LABELALIGN |
| 591 "1: \n" | 559 "1: \n" |
| 592 "movdqu (%0),%%xmm0 \n" | 560 "movdqu (%0),%%xmm0 \n" |
| 593 "paddusb %%xmm6,%%xmm0 \n" | 561 "paddusb %%xmm6,%%xmm0 \n" |
| 594 "movdqa %%xmm0,%%xmm1 \n" | 562 "movdqa %%xmm0,%%xmm1 \n" |
| 595 "movdqa %%xmm0,%%xmm2 \n" | 563 "movdqa %%xmm0,%%xmm2 \n" |
| 596 "pslld $0x8,%%xmm0 \n" | 564 "pslld $0x8,%%xmm0 \n" |
| 597 "psrld $0x3,%%xmm1 \n" | 565 "psrld $0x3,%%xmm1 \n" |
| 598 "psrld $0x5,%%xmm2 \n" | 566 "psrld $0x5,%%xmm2 \n" |
| 599 "psrad $0x10,%%xmm0 \n" | 567 "psrad $0x10,%%xmm0 \n" |
| 600 "pand %%xmm3,%%xmm1 \n" | 568 "pand %%xmm3,%%xmm1 \n" |
| 601 "pand %%xmm4,%%xmm2 \n" | 569 "pand %%xmm4,%%xmm2 \n" |
| 602 "pand %%xmm5,%%xmm0 \n" | 570 "pand %%xmm5,%%xmm0 \n" |
| 603 "por %%xmm2,%%xmm1 \n" | 571 "por %%xmm2,%%xmm1 \n" |
| 604 "por %%xmm1,%%xmm0 \n" | 572 "por %%xmm1,%%xmm0 \n" |
| 605 "packssdw %%xmm0,%%xmm0 \n" | 573 "packssdw %%xmm0,%%xmm0 \n" |
| 606 "lea 0x10(%0),%0 \n" | 574 "lea 0x10(%0),%0 \n" |
| 607 "movq %%xmm0,(%1) \n" | 575 "movq %%xmm0,(%1) \n" |
| 608 "lea 0x8(%1),%1 \n" | 576 "lea 0x8(%1),%1 \n" |
| 609 "sub $0x4,%2 \n" | 577 "sub $0x4,%2 \n" |
| 610 "jg 1b \n" | 578 "jg 1b \n" |
| 611 : "+r"(src), // %0 | 579 : "+r"(src), // %0 |
| 612 "+r"(dst), // %1 | 580 "+r"(dst), // %1 |
| 613 "+r"(width) // %2 | 581 "+r"(width) // %2 |
| 614 : "m"(dither4) // %3 | 582 : "m"(dither4) // %3 |
| 615 : "memory", "cc", | 583 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
| 616 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 584 "xmm7"); |
| 617 ); | |
| 618 } | 585 } |
| 619 | 586 |
| 620 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 | 587 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 |
| 621 void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, | 588 void ARGBToRGB565DitherRow_AVX2(const uint8* src, |
| 622 const uint32 dither4, int width) { | 589 uint8* dst, |
| 623 asm volatile ( | 590 const uint32 dither4, |
| 624 "vbroadcastss %3,%%xmm6 \n" | 591 int width) { |
| 625 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" | 592 asm volatile( |
| 626 "vpermq $0xd8,%%ymm6,%%ymm6 \n" | 593 "vbroadcastss %3,%%xmm6 \n" |
| 627 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" | 594 "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" |
| 628 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" | 595 "vpermq $0xd8,%%ymm6,%%ymm6 \n" |
| 629 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" | 596 "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" |
| 630 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" | 597 "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" |
| 631 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" | 598 "vpsrld $0x1b,%%ymm3,%%ymm3 \n" |
| 632 "vpslld $0x5,%%ymm4,%%ymm4 \n" | 599 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" |
| 633 "vpslld $0xb,%%ymm3,%%ymm5 \n" | 600 "vpsrld $0x1a,%%ymm4,%%ymm4 \n" |
| 601 "vpslld $0x5,%%ymm4,%%ymm4 \n" |
| 602 "vpslld $0xb,%%ymm3,%%ymm5 \n" |
| 634 | 603 |
| 635 LABELALIGN | 604 LABELALIGN |
| 636 "1: \n" | 605 "1: \n" |
| 637 "vmovdqu (%0),%%ymm0 \n" | 606 "vmovdqu (%0),%%ymm0 \n" |
| 638 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" | 607 "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" |
| 639 "vpsrld $0x5,%%ymm0,%%ymm2 \n" | 608 "vpsrld $0x5,%%ymm0,%%ymm2 \n" |
| 640 "vpsrld $0x3,%%ymm0,%%ymm1 \n" | 609 "vpsrld $0x3,%%ymm0,%%ymm1 \n" |
| 641 "vpsrld $0x8,%%ymm0,%%ymm0 \n" | 610 "vpsrld $0x8,%%ymm0,%%ymm0 \n" |
| 642 "vpand %%ymm4,%%ymm2,%%ymm2 \n" | 611 "vpand %%ymm4,%%ymm2,%%ymm2 \n" |
| 643 "vpand %%ymm3,%%ymm1,%%ymm1 \n" | 612 "vpand %%ymm3,%%ymm1,%%ymm1 \n" |
| 644 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 613 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
| 645 "vpor %%ymm2,%%ymm1,%%ymm1 \n" | 614 "vpor %%ymm2,%%ymm1,%%ymm1 \n" |
| 646 "vpor %%ymm1,%%ymm0,%%ymm0 \n" | 615 "vpor %%ymm1,%%ymm0,%%ymm0 \n" |
| 647 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" | 616 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" |
| 648 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 617 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 649 "lea 0x20(%0),%0 \n" | 618 "lea 0x20(%0),%0 \n" |
| 650 "vmovdqu %%xmm0,(%1) \n" | 619 "vmovdqu %%xmm0,(%1) \n" |
| 651 "lea 0x10(%1),%1 \n" | 620 "lea 0x10(%1),%1 \n" |
| 652 "sub $0x8,%2 \n" | 621 "sub $0x8,%2 \n" |
| 653 "jg 1b \n" | 622 "jg 1b \n" |
| 654 "vzeroupper \n" | 623 "vzeroupper \n" |
| 655 : "+r"(src), // %0 | 624 : "+r"(src), // %0 |
| 656 "+r"(dst), // %1 | 625 "+r"(dst), // %1 |
| 657 "+r"(width) // %2 | 626 "+r"(width) // %2 |
| 658 : "m"(dither4) // %3 | 627 : "m"(dither4) // %3 |
| 659 : "memory", "cc", | 628 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
| 660 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 629 "xmm7"); |
| 661 ); | |
| 662 } | 630 } |
| 663 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 | 631 #endif // HAS_ARGBTORGB565DITHERROW_AVX2 |
| 664 | 632 |
| 665 | |
| 666 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { | 633 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { |
| 667 asm volatile ( | 634 asm volatile ( |
| 668 "pcmpeqb %%xmm4,%%xmm4 \n" | 635 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 669 "psrld $0x1b,%%xmm4 \n" | 636 "psrld $0x1b,%%xmm4 \n" |
| 670 "movdqa %%xmm4,%%xmm5 \n" | 637 "movdqa %%xmm4,%%xmm5 \n" |
| 671 "pslld $0x5,%%xmm5 \n" | 638 "pslld $0x5,%%xmm5 \n" |
| 672 "movdqa %%xmm4,%%xmm6 \n" | 639 "movdqa %%xmm4,%%xmm6 \n" |
| 673 "pslld $0xa,%%xmm6 \n" | 640 "pslld $0xa,%%xmm6 \n" |
| 674 "pcmpeqb %%xmm7,%%xmm7 \n" | 641 "pcmpeqb %%xmm7,%%xmm7 \n" |
| 675 "pslld $0xf,%%xmm7 \n" | 642 "pslld $0xf,%%xmm7 \n" |
| (...skipping 128 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 804 "+r"(width) // %2 | 771 "+r"(width) // %2 |
| 805 : "m"(kARGBToYJ), // %3 | 772 : "m"(kARGBToYJ), // %3 |
| 806 "m"(kAddYJ64) // %4 | 773 "m"(kAddYJ64) // %4 |
| 807 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 774 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 808 ); | 775 ); |
| 809 } | 776 } |
| 810 #endif // HAS_ARGBTOYJROW_SSSE3 | 777 #endif // HAS_ARGBTOYJROW_SSSE3 |
| 811 | 778 |
| 812 #ifdef HAS_ARGBTOYROW_AVX2 | 779 #ifdef HAS_ARGBTOYROW_AVX2 |
| 813 // vpermd for vphaddw + vpackuswb vpermd. | 780 // vpermd for vphaddw + vpackuswb vpermd. |
| 814 static const lvec32 kPermdARGBToY_AVX = { | 781 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; |
| 815 0, 4, 1, 5, 2, 6, 3, 7 | |
| 816 }; | |
| 817 | 782 |
| 818 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. | 783 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. |
| 819 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { | 784 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { |
| 820 asm volatile ( | 785 asm volatile ( |
| 821 "vbroadcastf128 %3,%%ymm4 \n" | 786 "vbroadcastf128 %3,%%ymm4 \n" |
| 822 "vbroadcastf128 %4,%%ymm5 \n" | 787 "vbroadcastf128 %4,%%ymm5 \n" |
| 823 "vmovdqu %5,%%ymm6 \n" | 788 "vmovdqu %5,%%ymm6 \n" |
| 824 LABELALIGN | 789 LABELALIGN |
| 825 "1: \n" | 790 "1: \n" |
| 826 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 791 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 891 "+r"(width) // %2 | 856 "+r"(width) // %2 |
| 892 : "m"(kARGBToYJ), // %3 | 857 : "m"(kARGBToYJ), // %3 |
| 893 "m"(kAddYJ64), // %4 | 858 "m"(kAddYJ64), // %4 |
| 894 "m"(kPermdARGBToY_AVX) // %5 | 859 "m"(kPermdARGBToY_AVX) // %5 |
| 895 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 860 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 896 ); | 861 ); |
| 897 } | 862 } |
| 898 #endif // HAS_ARGBTOYJROW_AVX2 | 863 #endif // HAS_ARGBTOYJROW_AVX2 |
| 899 | 864 |
| 900 #ifdef HAS_ARGBTOUVROW_SSSE3 | 865 #ifdef HAS_ARGBTOUVROW_SSSE3 |
| 901 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 866 void ARGBToUVRow_SSSE3(const uint8* src_argb0, |
| 902 uint8* dst_u, uint8* dst_v, int width) { | 867 int src_stride_argb, |
| 868 uint8* dst_u, |
| 869 uint8* dst_v, |
| 870 int width) { |
| 903 asm volatile ( | 871 asm volatile ( |
| 904 "movdqa %5,%%xmm3 \n" | 872 "movdqa %5,%%xmm3 \n" |
| 905 "movdqa %6,%%xmm4 \n" | 873 "movdqa %6,%%xmm4 \n" |
| 906 "movdqa %7,%%xmm5 \n" | 874 "movdqa %7,%%xmm5 \n" |
| 907 "sub %1,%2 \n" | 875 "sub %1,%2 \n" |
| 908 LABELALIGN | 876 LABELALIGN |
| 909 "1: \n" | 877 "1: \n" |
| 910 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 878 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 911 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 879 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
| 912 "pavgb %%xmm7,%%xmm0 \n" | 880 "pavgb %%xmm7,%%xmm0 \n" |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 956 "m"(kAddUV128) // %7 | 924 "m"(kAddUV128) // %7 |
| 957 : "memory", "cc", NACL_R14 | 925 : "memory", "cc", NACL_R14 |
| 958 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 926 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
| 959 ); | 927 ); |
| 960 } | 928 } |
| 961 #endif // HAS_ARGBTOUVROW_SSSE3 | 929 #endif // HAS_ARGBTOUVROW_SSSE3 |
| 962 | 930 |
| 963 #ifdef HAS_ARGBTOUVROW_AVX2 | 931 #ifdef HAS_ARGBTOUVROW_AVX2 |
| 964 // vpshufb for vphaddw + vpackuswb packed to shorts. | 932 // vpshufb for vphaddw + vpackuswb packed to shorts. |
| 965 static const lvec8 kShufARGBToUV_AVX = { | 933 static const lvec8 kShufARGBToUV_AVX = { |
| 966 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, | 934 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, |
| 967 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 | 935 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; |
| 968 }; | 936 void ARGBToUVRow_AVX2(const uint8* src_argb0, |
| 969 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 937 int src_stride_argb, |
| 970 uint8* dst_u, uint8* dst_v, int width) { | 938 uint8* dst_u, |
| 939 uint8* dst_v, |
| 940 int width) { |
| 971 asm volatile ( | 941 asm volatile ( |
| 972 "vbroadcastf128 %5,%%ymm5 \n" | 942 "vbroadcastf128 %5,%%ymm5 \n" |
| 973 "vbroadcastf128 %6,%%ymm6 \n" | 943 "vbroadcastf128 %6,%%ymm6 \n" |
| 974 "vbroadcastf128 %7,%%ymm7 \n" | 944 "vbroadcastf128 %7,%%ymm7 \n" |
| 975 "sub %1,%2 \n" | 945 "sub %1,%2 \n" |
| 976 LABELALIGN | 946 LABELALIGN |
| 977 "1: \n" | 947 "1: \n" |
| 978 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 948 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 979 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 949 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 980 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | 950 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1019 "m"(kARGBToV), // %6 | 989 "m"(kARGBToV), // %6 |
| 1020 "m"(kARGBToU), // %7 | 990 "m"(kARGBToU), // %7 |
| 1021 "m"(kShufARGBToUV_AVX) // %8 | 991 "m"(kShufARGBToUV_AVX) // %8 |
| 1022 : "memory", "cc", NACL_R14 | 992 : "memory", "cc", NACL_R14 |
| 1023 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 993 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 1024 ); | 994 ); |
| 1025 } | 995 } |
| 1026 #endif // HAS_ARGBTOUVROW_AVX2 | 996 #endif // HAS_ARGBTOUVROW_AVX2 |
| 1027 | 997 |
| 1028 #ifdef HAS_ARGBTOUVJROW_AVX2 | 998 #ifdef HAS_ARGBTOUVJROW_AVX2 |
| 1029 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, | 999 void ARGBToUVJRow_AVX2(const uint8* src_argb0, |
| 1030 uint8* dst_u, uint8* dst_v, int width) { | 1000 int src_stride_argb, |
| 1001 uint8* dst_u, |
| 1002 uint8* dst_v, |
| 1003 int width) { |
| 1031 asm volatile ( | 1004 asm volatile ( |
| 1032 "vbroadcastf128 %5,%%ymm5 \n" | 1005 "vbroadcastf128 %5,%%ymm5 \n" |
| 1033 "vbroadcastf128 %6,%%ymm6 \n" | 1006 "vbroadcastf128 %6,%%ymm6 \n" |
| 1034 "vbroadcastf128 %7,%%ymm7 \n" | 1007 "vbroadcastf128 %7,%%ymm7 \n" |
| 1035 "sub %1,%2 \n" | 1008 "sub %1,%2 \n" |
| 1036 LABELALIGN | 1009 LABELALIGN |
| 1037 "1: \n" | 1010 "1: \n" |
| 1038 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 1011 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 1039 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 1012 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 1040 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" | 1013 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1080 "m"(kARGBToVJ), // %6 | 1053 "m"(kARGBToVJ), // %6 |
| 1081 "m"(kARGBToUJ), // %7 | 1054 "m"(kARGBToUJ), // %7 |
| 1082 "m"(kShufARGBToUV_AVX) // %8 | 1055 "m"(kShufARGBToUV_AVX) // %8 |
| 1083 : "memory", "cc", NACL_R14 | 1056 : "memory", "cc", NACL_R14 |
| 1084 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 1057 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 1085 ); | 1058 ); |
| 1086 } | 1059 } |
| 1087 #endif // HAS_ARGBTOUVJROW_AVX2 | 1060 #endif // HAS_ARGBTOUVJROW_AVX2 |
| 1088 | 1061 |
| 1089 #ifdef HAS_ARGBTOUVJROW_SSSE3 | 1062 #ifdef HAS_ARGBTOUVJROW_SSSE3 |
| 1090 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1063 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, |
| 1091 uint8* dst_u, uint8* dst_v, int width) { | 1064 int src_stride_argb, |
| 1065 uint8* dst_u, |
| 1066 uint8* dst_v, |
| 1067 int width) { |
| 1092 asm volatile ( | 1068 asm volatile ( |
| 1093 "movdqa %5,%%xmm3 \n" | 1069 "movdqa %5,%%xmm3 \n" |
| 1094 "movdqa %6,%%xmm4 \n" | 1070 "movdqa %6,%%xmm4 \n" |
| 1095 "movdqa %7,%%xmm5 \n" | 1071 "movdqa %7,%%xmm5 \n" |
| 1096 "sub %1,%2 \n" | 1072 "sub %1,%2 \n" |
| 1097 LABELALIGN | 1073 LABELALIGN |
| 1098 "1: \n" | 1074 "1: \n" |
| 1099 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1075 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 1100 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1076 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
| 1101 "pavgb %%xmm7,%%xmm0 \n" | 1077 "pavgb %%xmm7,%%xmm0 \n" |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1144 "m"(kARGBToVJ), // %5 | 1120 "m"(kARGBToVJ), // %5 |
| 1145 "m"(kARGBToUJ), // %6 | 1121 "m"(kARGBToUJ), // %6 |
| 1146 "m"(kAddUVJ128) // %7 | 1122 "m"(kAddUVJ128) // %7 |
| 1147 : "memory", "cc", NACL_R14 | 1123 : "memory", "cc", NACL_R14 |
| 1148 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1124 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
| 1149 ); | 1125 ); |
| 1150 } | 1126 } |
| 1151 #endif // HAS_ARGBTOUVJROW_SSSE3 | 1127 #endif // HAS_ARGBTOUVJROW_SSSE3 |
| 1152 | 1128 |
| 1153 #ifdef HAS_ARGBTOUV444ROW_SSSE3 | 1129 #ifdef HAS_ARGBTOUV444ROW_SSSE3 |
| 1154 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1130 void ARGBToUV444Row_SSSE3(const uint8* src_argb, |
| 1131 uint8* dst_u, |
| 1132 uint8* dst_v, |
| 1155 int width) { | 1133 int width) { |
| 1156 asm volatile ( | 1134 asm volatile ( |
| 1157 "movdqa %4,%%xmm3 \n" | 1135 "movdqa %4,%%xmm3 \n" |
| 1158 "movdqa %5,%%xmm4 \n" | 1136 "movdqa %5,%%xmm4 \n" |
| 1159 "movdqa %6,%%xmm5 \n" | 1137 "movdqa %6,%%xmm5 \n" |
| 1160 "sub %1,%2 \n" | 1138 "sub %1,%2 \n" |
| 1161 LABELALIGN | 1139 LABELALIGN |
| 1162 "1: \n" | 1140 "1: \n" |
| 1163 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1141 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 1164 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 1142 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| (...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1234 "jg 1b \n" | 1212 "jg 1b \n" |
| 1235 : "+r"(src_bgra), // %0 | 1213 : "+r"(src_bgra), // %0 |
| 1236 "+r"(dst_y), // %1 | 1214 "+r"(dst_y), // %1 |
| 1237 "+r"(width) // %2 | 1215 "+r"(width) // %2 |
| 1238 : "m"(kBGRAToY), // %3 | 1216 : "m"(kBGRAToY), // %3 |
| 1239 "m"(kAddY16) // %4 | 1217 "m"(kAddY16) // %4 |
| 1240 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1218 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1241 ); | 1219 ); |
| 1242 } | 1220 } |
| 1243 | 1221 |
| 1244 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, | 1222 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, |
| 1245 uint8* dst_u, uint8* dst_v, int width) { | 1223 int src_stride_bgra, |
| 1224 uint8* dst_u, |
| 1225 uint8* dst_v, |
| 1226 int width) { |
| 1246 asm volatile ( | 1227 asm volatile ( |
| 1247 "movdqa %5,%%xmm3 \n" | 1228 "movdqa %5,%%xmm3 \n" |
| 1248 "movdqa %6,%%xmm4 \n" | 1229 "movdqa %6,%%xmm4 \n" |
| 1249 "movdqa %7,%%xmm5 \n" | 1230 "movdqa %7,%%xmm5 \n" |
| 1250 "sub %1,%2 \n" | 1231 "sub %1,%2 \n" |
| 1251 LABELALIGN | 1232 LABELALIGN |
| 1252 "1: \n" | 1233 "1: \n" |
| 1253 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1234 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 1254 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1235 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
| 1255 "pavgb %%xmm7,%%xmm0 \n" | 1236 "pavgb %%xmm7,%%xmm0 \n" |
| (...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1363 "jg 1b \n" | 1344 "jg 1b \n" |
| 1364 : "+r"(src_rgba), // %0 | 1345 : "+r"(src_rgba), // %0 |
| 1365 "+r"(dst_y), // %1 | 1346 "+r"(dst_y), // %1 |
| 1366 "+r"(width) // %2 | 1347 "+r"(width) // %2 |
| 1367 : "m"(kRGBAToY), // %3 | 1348 : "m"(kRGBAToY), // %3 |
| 1368 "m"(kAddY16) // %4 | 1349 "m"(kAddY16) // %4 |
| 1369 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1350 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1370 ); | 1351 ); |
| 1371 } | 1352 } |
| 1372 | 1353 |
| 1373 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, | 1354 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, |
| 1374 uint8* dst_u, uint8* dst_v, int width) { | 1355 int src_stride_abgr, |
| 1356 uint8* dst_u, |
| 1357 uint8* dst_v, |
| 1358 int width) { |
| 1375 asm volatile ( | 1359 asm volatile ( |
| 1376 "movdqa %5,%%xmm3 \n" | 1360 "movdqa %5,%%xmm3 \n" |
| 1377 "movdqa %6,%%xmm4 \n" | 1361 "movdqa %6,%%xmm4 \n" |
| 1378 "movdqa %7,%%xmm5 \n" | 1362 "movdqa %7,%%xmm5 \n" |
| 1379 "sub %1,%2 \n" | 1363 "sub %1,%2 \n" |
| 1380 LABELALIGN | 1364 LABELALIGN |
| 1381 "1: \n" | 1365 "1: \n" |
| 1382 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1366 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 1383 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1367 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
| 1384 "pavgb %%xmm7,%%xmm0 \n" | 1368 "pavgb %%xmm7,%%xmm0 \n" |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1424 "+rm"(width) // %3 | 1408 "+rm"(width) // %3 |
| 1425 : "r"((intptr_t)(src_stride_abgr)), // %4 | 1409 : "r"((intptr_t)(src_stride_abgr)), // %4 |
| 1426 "m"(kABGRToV), // %5 | 1410 "m"(kABGRToV), // %5 |
| 1427 "m"(kABGRToU), // %6 | 1411 "m"(kABGRToU), // %6 |
| 1428 "m"(kAddUV128) // %7 | 1412 "m"(kAddUV128) // %7 |
| 1429 : "memory", "cc", NACL_R14 | 1413 : "memory", "cc", NACL_R14 |
| 1430 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1414 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
| 1431 ); | 1415 ); |
| 1432 } | 1416 } |
| 1433 | 1417 |
| 1434 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, | 1418 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, |
| 1435 uint8* dst_u, uint8* dst_v, int width) { | 1419 int src_stride_rgba, |
| 1420 uint8* dst_u, |
| 1421 uint8* dst_v, |
| 1422 int width) { |
| 1436 asm volatile ( | 1423 asm volatile ( |
| 1437 "movdqa %5,%%xmm3 \n" | 1424 "movdqa %5,%%xmm3 \n" |
| 1438 "movdqa %6,%%xmm4 \n" | 1425 "movdqa %6,%%xmm4 \n" |
| 1439 "movdqa %7,%%xmm5 \n" | 1426 "movdqa %7,%%xmm5 \n" |
| 1440 "sub %1,%2 \n" | 1427 "sub %1,%2 \n" |
| 1441 LABELALIGN | 1428 LABELALIGN |
| 1442 "1: \n" | 1429 "1: \n" |
| 1443 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 1430 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 1444 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 | 1431 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
| 1445 "pavgb %%xmm7,%%xmm0 \n" | 1432 "pavgb %%xmm7,%%xmm0 \n" |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1488 "m"(kRGBAToU), // %6 | 1475 "m"(kRGBAToU), // %6 |
| 1489 "m"(kAddUV128) // %7 | 1476 "m"(kAddUV128) // %7 |
| 1490 : "memory", "cc", NACL_R14 | 1477 : "memory", "cc", NACL_R14 |
| 1491 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1478 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
| 1492 ); | 1479 ); |
| 1493 } | 1480 } |
| 1494 | 1481 |
| 1495 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) | 1482 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
| 1496 | 1483 |
| 1497 // Read 8 UV from 444 | 1484 // Read 8 UV from 444 |
| 1498 #define READYUV444 \ | 1485 #define READYUV444 \ |
| 1499 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1486 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1500 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1487 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1501 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1488 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
| 1502 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1489 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1503 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1490 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1504 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1491 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1505 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1492 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1506 | 1493 |
| 1507 // Read 4 UV from 422, upsample to 8 UV | 1494 // Read 4 UV from 422, upsample to 8 UV |
| 1508 #define READYUV422 \ | 1495 #define READYUV422 \ |
| 1509 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1496 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1510 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1497 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1511 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1498 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
| 1512 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1499 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1513 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1500 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1514 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1501 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1515 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1502 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1516 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1503 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1517 | 1504 |
| 1518 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. | 1505 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. |
| 1519 #define READYUVA422 \ | 1506 #define READYUVA422 \ |
| 1520 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1507 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1521 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1508 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1522 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ | 1509 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
| 1523 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1510 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1524 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1511 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1525 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1512 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1526 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1513 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1527 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ | 1514 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
| 1528 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1515 "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
| 1529 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" | 1516 "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" |
| 1530 | 1517 |
| 1531 // Read 4 UV from NV12, upsample to 8 UV | 1518 // Read 4 UV from NV12, upsample to 8 UV |
| 1532 #define READNV12 \ | 1519 #define READNV12 \ |
| 1533 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1520 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 1534 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ | 1521 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
| 1535 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1522 "punpcklwd %%xmm0,%%xmm0 \n" \ |
| 1536 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1523 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1537 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1524 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1538 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1525 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1539 | 1526 |
| 1540 // Read 4 VU from NV21, upsample to 8 UV | 1527 // Read 4 VU from NV21, upsample to 8 UV |
| 1541 #define READNV21 \ | 1528 #define READNV21 \ |
| 1542 "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ | 1529 "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ |
| 1543 "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ | 1530 "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ |
| 1544 "pshufb %[kShuffleNV21], %%xmm0 \n" \ | 1531 "pshufb %[kShuffleNV21], %%xmm0 \n" \ |
| 1545 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1532 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1546 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1533 "punpcklbw %%xmm4,%%xmm4 \n" \ |
| 1547 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1534 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
| 1548 | 1535 |
| 1549 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. | 1536 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. |
| 1550 #define READYUY2 \ | 1537 #define READYUY2 \ |
| 1551 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ | 1538 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ |
| 1552 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ | 1539 "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ |
| 1553 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ | 1540 "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ |
| 1554 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ | 1541 "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ |
| 1555 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" | 1542 "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" |
| 1556 | 1543 |
| 1557 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. | 1544 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. |
| 1558 #define READUYVY \ | 1545 #define READUYVY \ |
| 1559 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ | 1546 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ |
| 1560 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ | 1547 "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ |
| 1561 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ | 1548 "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ |
| 1562 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ | 1549 "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ |
| 1563 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" | 1550 "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" |
| 1564 | 1551 |
| 1565 #if defined(__x86_64__) | 1552 #if defined(__x86_64__) |
| 1566 #define YUVTORGB_SETUP(yuvconstants) \ | 1553 #define YUVTORGB_SETUP(yuvconstants) \ |
| 1567 "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ | 1554 "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ |
| 1568 "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ | 1555 "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ |
| 1569 "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ | 1556 "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ |
| 1570 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ | 1557 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ |
| 1571 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ | 1558 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ |
| 1572 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ | 1559 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ |
| 1573 "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" | 1560 "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" |
| 1574 // Convert 8 pixels: 8 UV and 8 Y | 1561 // Convert 8 pixels: 8 UV and 8 Y |
| 1575 #define YUVTORGB(yuvconstants) \ | 1562 #define YUVTORGB(yuvconstants) \ |
| 1576 "movdqa %%xmm0,%%xmm1 \n" \ | 1563 "movdqa %%xmm0,%%xmm1 \n" \ |
| 1577 "movdqa %%xmm0,%%xmm2 \n" \ | 1564 "movdqa %%xmm0,%%xmm2 \n" \ |
| 1578 "movdqa %%xmm0,%%xmm3 \n" \ | 1565 "movdqa %%xmm0,%%xmm3 \n" \ |
| 1579 "movdqa %%xmm11,%%xmm0 \n" \ | 1566 "movdqa %%xmm11,%%xmm0 \n" \ |
| 1580 "pmaddubsw %%xmm8,%%xmm1 \n" \ | 1567 "pmaddubsw %%xmm8,%%xmm1 \n" \ |
| 1581 "psubw %%xmm1,%%xmm0 \n" \ | 1568 "psubw %%xmm1,%%xmm0 \n" \ |
| 1582 "movdqa %%xmm12,%%xmm1 \n" \ | 1569 "movdqa %%xmm12,%%xmm1 \n" \ |
| 1583 "pmaddubsw %%xmm9,%%xmm2 \n" \ | 1570 "pmaddubsw %%xmm9,%%xmm2 \n" \ |
| 1584 "psubw %%xmm2,%%xmm1 \n" \ | 1571 "psubw %%xmm2,%%xmm1 \n" \ |
| 1585 "movdqa %%xmm13,%%xmm2 \n" \ | 1572 "movdqa %%xmm13,%%xmm2 \n" \ |
| 1586 "pmaddubsw %%xmm10,%%xmm3 \n" \ | 1573 "pmaddubsw %%xmm10,%%xmm3 \n" \ |
| 1587 "psubw %%xmm3,%%xmm2 \n" \ | 1574 "psubw %%xmm3,%%xmm2 \n" \ |
| 1588 "pmulhuw %%xmm14,%%xmm4 \n" \ | 1575 "pmulhuw %%xmm14,%%xmm4 \n" \ |
| 1589 "paddsw %%xmm4,%%xmm0 \n" \ | 1576 "paddsw %%xmm4,%%xmm0 \n" \ |
| 1590 "paddsw %%xmm4,%%xmm1 \n" \ | 1577 "paddsw %%xmm4,%%xmm1 \n" \ |
| 1591 "paddsw %%xmm4,%%xmm2 \n" \ | 1578 "paddsw %%xmm4,%%xmm2 \n" \ |
| 1592 "psraw $0x6,%%xmm0 \n" \ | 1579 "psraw $0x6,%%xmm0 \n" \ |
| 1593 "psraw $0x6,%%xmm1 \n" \ | 1580 "psraw $0x6,%%xmm1 \n" \ |
| 1594 "psraw $0x6,%%xmm2 \n" \ | 1581 "psraw $0x6,%%xmm2 \n" \ |
| 1595 "packuswb %%xmm0,%%xmm0 \n" \ | 1582 "packuswb %%xmm0,%%xmm0 \n" \ |
| 1596 "packuswb %%xmm1,%%xmm1 \n" \ | 1583 "packuswb %%xmm1,%%xmm1 \n" \ |
| 1597 "packuswb %%xmm2,%%xmm2 \n" | 1584 "packuswb %%xmm2,%%xmm2 \n" |
| 1598 #define YUVTORGB_REGS \ | 1585 #define YUVTORGB_REGS \ |
| 1599 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", | 1586 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", |
| 1600 | 1587 |
| 1601 #else | 1588 #else |
| 1602 #define YUVTORGB_SETUP(yuvconstants) | 1589 #define YUVTORGB_SETUP(yuvconstants) |
| 1603 // Convert 8 pixels: 8 UV and 8 Y | 1590 // Convert 8 pixels: 8 UV and 8 Y |
| 1604 #define YUVTORGB(yuvconstants) \ | 1591 #define YUVTORGB(yuvconstants) \ |
| 1605 "movdqa %%xmm0,%%xmm1 \n" \ | 1592 "movdqa %%xmm0,%%xmm1 \n" \ |
| 1606 "movdqa %%xmm0,%%xmm2 \n" \ | 1593 "movdqa %%xmm0,%%xmm2 \n" \ |
| 1607 "movdqa %%xmm0,%%xmm3 \n" \ | 1594 "movdqa %%xmm0,%%xmm3 \n" \ |
| 1608 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ | 1595 "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ |
| 1609 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ | 1596 "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ |
| 1610 "psubw %%xmm1,%%xmm0 \n" \ | 1597 "psubw %%xmm1,%%xmm0 \n" \ |
| 1611 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ | 1598 "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ |
| 1612 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ | 1599 "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ |
| 1613 "psubw %%xmm2,%%xmm1 \n" \ | 1600 "psubw %%xmm2,%%xmm1 \n" \ |
| 1614 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ | 1601 "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ |
| 1615 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ | 1602 "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ |
| 1616 "psubw %%xmm3,%%xmm2 \n" \ | 1603 "psubw %%xmm3,%%xmm2 \n" \ |
| 1617 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ | 1604 "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ |
| 1618 "paddsw %%xmm4,%%xmm0 \n" \ | 1605 "paddsw %%xmm4,%%xmm0 \n" \ |
| 1619 "paddsw %%xmm4,%%xmm1 \n" \ | 1606 "paddsw %%xmm4,%%xmm1 \n" \ |
| 1620 "paddsw %%xmm4,%%xmm2 \n" \ | 1607 "paddsw %%xmm4,%%xmm2 \n" \ |
| 1621 "psraw $0x6,%%xmm0 \n" \ | 1608 "psraw $0x6,%%xmm0 \n" \ |
| 1622 "psraw $0x6,%%xmm1 \n" \ | 1609 "psraw $0x6,%%xmm1 \n" \ |
| 1623 "psraw $0x6,%%xmm2 \n" \ | 1610 "psraw $0x6,%%xmm2 \n" \ |
| 1624 "packuswb %%xmm0,%%xmm0 \n" \ | 1611 "packuswb %%xmm0,%%xmm0 \n" \ |
| 1625 "packuswb %%xmm1,%%xmm1 \n" \ | 1612 "packuswb %%xmm1,%%xmm1 \n" \ |
| 1626 "packuswb %%xmm2,%%xmm2 \n" | 1613 "packuswb %%xmm2,%%xmm2 \n" |
| 1627 #define YUVTORGB_REGS | 1614 #define YUVTORGB_REGS |
| 1628 #endif | 1615 #endif |
| 1629 | 1616 |
| 1630 // Store 8 ARGB values. | 1617 // Store 8 ARGB values. |
| 1631 #define STOREARGB \ | 1618 #define STOREARGB \ |
| 1632 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1619 "punpcklbw %%xmm1,%%xmm0 \n" \ |
| 1633 "punpcklbw %%xmm5,%%xmm2 \n" \ | 1620 "punpcklbw %%xmm5,%%xmm2 \n" \ |
| 1634 "movdqa %%xmm0,%%xmm1 \n" \ | 1621 "movdqa %%xmm0,%%xmm1 \n" \ |
| 1635 "punpcklwd %%xmm2,%%xmm0 \n" \ | 1622 "punpcklwd %%xmm2,%%xmm0 \n" \ |
| 1636 "punpckhwd %%xmm2,%%xmm1 \n" \ | 1623 "punpckhwd %%xmm2,%%xmm1 \n" \ |
| 1637 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ | 1624 "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ |
| 1638 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ | 1625 "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ |
| 1639 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" | 1626 "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" |
| 1640 | 1627 |
| 1641 // Store 8 RGBA values. | 1628 // Store 8 RGBA values. |
| 1642 #define STORERGBA \ | 1629 #define STORERGBA \ |
| 1643 "pcmpeqb %%xmm5,%%xmm5 \n" \ | 1630 "pcmpeqb %%xmm5,%%xmm5 \n" \ |
| 1644 "punpcklbw %%xmm2,%%xmm1 \n" \ | 1631 "punpcklbw %%xmm2,%%xmm1 \n" \ |
| 1645 "punpcklbw %%xmm0,%%xmm5 \n" \ | 1632 "punpcklbw %%xmm0,%%xmm5 \n" \ |
| 1646 "movdqa %%xmm5,%%xmm0 \n" \ | 1633 "movdqa %%xmm5,%%xmm0 \n" \ |
| 1647 "punpcklwd %%xmm1,%%xmm5 \n" \ | 1634 "punpcklwd %%xmm1,%%xmm5 \n" \ |
| 1648 "punpckhwd %%xmm1,%%xmm0 \n" \ | 1635 "punpckhwd %%xmm1,%%xmm0 \n" \ |
| 1649 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ | 1636 "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ |
| 1650 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ | 1637 "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ |
| 1651 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" | 1638 "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" |
| 1652 | 1639 |
| 1653 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, | 1640 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
| (...skipping 98 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1752 } | 1739 } |
| 1753 | 1740 |
| 1754 #ifdef HAS_I422ALPHATOARGBROW_SSSE3 | 1741 #ifdef HAS_I422ALPHATOARGBROW_SSSE3 |
| 1755 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, | 1742 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, |
| 1756 const uint8* u_buf, | 1743 const uint8* u_buf, |
| 1757 const uint8* v_buf, | 1744 const uint8* v_buf, |
| 1758 const uint8* a_buf, | 1745 const uint8* a_buf, |
| 1759 uint8* dst_argb, | 1746 uint8* dst_argb, |
| 1760 const struct YuvConstants* yuvconstants, | 1747 const struct YuvConstants* yuvconstants, |
| 1761 int width) { | 1748 int width) { |
| 1749 // clang-format off |
| 1762 asm volatile ( | 1750 asm volatile ( |
| 1763 YUVTORGB_SETUP(yuvconstants) | 1751 YUVTORGB_SETUP(yuvconstants) |
| 1764 "sub %[u_buf],%[v_buf] \n" | 1752 "sub %[u_buf],%[v_buf] \n" |
| 1765 LABELALIGN | 1753 LABELALIGN |
| 1766 "1: \n" | 1754 "1: \n" |
| 1767 READYUVA422 | 1755 READYUVA422 |
| 1768 YUVTORGB(yuvconstants) | 1756 YUVTORGB(yuvconstants) |
| 1769 STOREARGB | 1757 STOREARGB |
| 1770 "subl $0x8,%[width] \n" | 1758 "subl $0x8,%[width] \n" |
| 1771 "jg 1b \n" | 1759 "jg 1b \n" |
| 1772 : [y_buf]"+r"(y_buf), // %[y_buf] | 1760 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1773 [u_buf]"+r"(u_buf), // %[u_buf] | 1761 [u_buf]"+r"(u_buf), // %[u_buf] |
| 1774 [v_buf]"+r"(v_buf), // %[v_buf] | 1762 [v_buf]"+r"(v_buf), // %[v_buf] |
| 1775 [a_buf]"+r"(a_buf), // %[a_buf] | 1763 [a_buf]"+r"(a_buf), // %[a_buf] |
| 1776 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1764 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1777 #if defined(__i386__) | 1765 #if defined(__i386__) |
| 1778 [width]"+m"(width) // %[width] | 1766 [width]"+m"(width) // %[width] |
| 1779 #else | 1767 #else |
| 1780 [width]"+rm"(width) // %[width] | 1768 [width]"+rm"(width) // %[width] |
| 1781 #endif | 1769 #endif |
| 1782 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1770 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1783 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1771 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
| 1784 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1772 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1785 ); | 1773 ); |
| 1774 // clang-format on |
| 1786 } | 1775 } |
| 1787 #endif // HAS_I422ALPHATOARGBROW_SSSE3 | 1776 #endif // HAS_I422ALPHATOARGBROW_SSSE3 |
| 1788 | 1777 |
| 1789 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, | 1778 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| 1790 const uint8* uv_buf, | 1779 const uint8* uv_buf, |
| 1791 uint8* dst_argb, | 1780 uint8* dst_argb, |
| 1792 const struct YuvConstants* yuvconstants, | 1781 const struct YuvConstants* yuvconstants, |
| 1793 int width) { | 1782 int width) { |
| 1783 // clang-format off |
| 1794 asm volatile ( | 1784 asm volatile ( |
| 1795 YUVTORGB_SETUP(yuvconstants) | 1785 YUVTORGB_SETUP(yuvconstants) |
| 1796 "pcmpeqb %%xmm5,%%xmm5 \n" | 1786 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1797 LABELALIGN | 1787 LABELALIGN |
| 1798 "1: \n" | 1788 "1: \n" |
| 1799 READNV12 | 1789 READNV12 |
| 1800 YUVTORGB(yuvconstants) | 1790 YUVTORGB(yuvconstants) |
| 1801 STOREARGB | 1791 STOREARGB |
| 1802 "sub $0x8,%[width] \n" | 1792 "sub $0x8,%[width] \n" |
| 1803 "jg 1b \n" | 1793 "jg 1b \n" |
| 1804 : [y_buf]"+r"(y_buf), // %[y_buf] | 1794 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1805 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 1795 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
| 1806 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1796 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1807 [width]"+rm"(width) // %[width] | 1797 [width]"+rm"(width) // %[width] |
| 1808 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1798 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1809 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1799 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
| 1810 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1800 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1811 ); | 1801 ); |
| 1802 // clang-format on |
| 1812 } | 1803 } |
| 1813 | 1804 |
| 1814 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, | 1805 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, |
| 1815 const uint8* vu_buf, | 1806 const uint8* vu_buf, |
| 1816 uint8* dst_argb, | 1807 uint8* dst_argb, |
| 1817 const struct YuvConstants* yuvconstants, | 1808 const struct YuvConstants* yuvconstants, |
| 1818 int width) { | 1809 int width) { |
| 1810 // clang-format off |
| 1819 asm volatile ( | 1811 asm volatile ( |
| 1820 YUVTORGB_SETUP(yuvconstants) | 1812 YUVTORGB_SETUP(yuvconstants) |
| 1821 "pcmpeqb %%xmm5,%%xmm5 \n" | 1813 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1822 LABELALIGN | 1814 LABELALIGN |
| 1823 "1: \n" | 1815 "1: \n" |
| 1824 READNV21 | 1816 READNV21 |
| 1825 YUVTORGB(yuvconstants) | 1817 YUVTORGB(yuvconstants) |
| 1826 STOREARGB | 1818 STOREARGB |
| 1827 "sub $0x8,%[width] \n" | 1819 "sub $0x8,%[width] \n" |
| 1828 "jg 1b \n" | 1820 "jg 1b \n" |
| 1829 : [y_buf]"+r"(y_buf), // %[y_buf] | 1821 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 1830 [vu_buf]"+r"(vu_buf), // %[vu_buf] | 1822 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
| 1831 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1823 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1832 [width]"+rm"(width) // %[width] | 1824 [width]"+rm"(width) // %[width] |
| 1833 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1825 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 1834 [kShuffleNV21]"m"(kShuffleNV21) | 1826 [kShuffleNV21]"m"(kShuffleNV21) |
| 1835 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1827 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
| 1836 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1828 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1837 ); | 1829 ); |
| 1830 // clang-format on |
| 1838 } | 1831 } |
| 1839 | 1832 |
| 1840 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, | 1833 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, |
| 1841 uint8* dst_argb, | 1834 uint8* dst_argb, |
| 1842 const struct YuvConstants* yuvconstants, | 1835 const struct YuvConstants* yuvconstants, |
| 1843 int width) { | 1836 int width) { |
| 1837 // clang-format off |
| 1844 asm volatile ( | 1838 asm volatile ( |
| 1845 YUVTORGB_SETUP(yuvconstants) | 1839 YUVTORGB_SETUP(yuvconstants) |
| 1846 "pcmpeqb %%xmm5,%%xmm5 \n" | 1840 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1847 LABELALIGN | 1841 LABELALIGN |
| 1848 "1: \n" | 1842 "1: \n" |
| 1849 READYUY2 | 1843 READYUY2 |
| 1850 YUVTORGB(yuvconstants) | 1844 YUVTORGB(yuvconstants) |
| 1851 STOREARGB | 1845 STOREARGB |
| 1852 "sub $0x8,%[width] \n" | 1846 "sub $0x8,%[width] \n" |
| 1853 "jg 1b \n" | 1847 "jg 1b \n" |
| 1854 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 1848 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
| 1855 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1849 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1856 [width]"+rm"(width) // %[width] | 1850 [width]"+rm"(width) // %[width] |
| 1857 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1851 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 1858 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 1852 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
| 1859 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 1853 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
| 1860 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1854 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
| 1861 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1855 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1862 ); | 1856 ); |
| 1857 // clang-format on |
| 1863 } | 1858 } |
| 1864 | 1859 |
| 1865 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, | 1860 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, |
| 1866 uint8* dst_argb, | 1861 uint8* dst_argb, |
| 1867 const struct YuvConstants* yuvconstants, | 1862 const struct YuvConstants* yuvconstants, |
| 1868 int width) { | 1863 int width) { |
| 1864 // clang-format off |
| 1869 asm volatile ( | 1865 asm volatile ( |
| 1870 YUVTORGB_SETUP(yuvconstants) | 1866 YUVTORGB_SETUP(yuvconstants) |
| 1871 "pcmpeqb %%xmm5,%%xmm5 \n" | 1867 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 1872 LABELALIGN | 1868 LABELALIGN |
| 1873 "1: \n" | 1869 "1: \n" |
| 1874 READUYVY | 1870 READUYVY |
| 1875 YUVTORGB(yuvconstants) | 1871 YUVTORGB(yuvconstants) |
| 1876 STOREARGB | 1872 STOREARGB |
| 1877 "sub $0x8,%[width] \n" | 1873 "sub $0x8,%[width] \n" |
| 1878 "jg 1b \n" | 1874 "jg 1b \n" |
| 1879 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] | 1875 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
| 1880 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 1876 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 1881 [width]"+rm"(width) // %[width] | 1877 [width]"+rm"(width) // %[width] |
| 1882 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 1878 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 1883 [kShuffleUYVYY]"m"(kShuffleUYVYY), | 1879 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
| 1884 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) | 1880 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
| 1885 : "memory", "cc", YUVTORGB_REGS // Does not use r14. | 1881 : "memory", "cc", YUVTORGB_REGS // Does not use r14. |
| 1886 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1882 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1887 ); | 1883 ); |
| 1884 // clang-format on |
| 1888 } | 1885 } |
| 1889 | 1886 |
| 1890 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, | 1887 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
| 1891 const uint8* u_buf, | 1888 const uint8* u_buf, |
| 1892 const uint8* v_buf, | 1889 const uint8* v_buf, |
| 1893 uint8* dst_rgba, | 1890 uint8* dst_rgba, |
| 1894 const struct YuvConstants* yuvconstants, | 1891 const struct YuvConstants* yuvconstants, |
| 1895 int width) { | 1892 int width) { |
| 1896 asm volatile ( | 1893 asm volatile ( |
| 1897 YUVTORGB_SETUP(yuvconstants) | 1894 YUVTORGB_SETUP(yuvconstants) |
| (...skipping 13 matching lines...) Expand all Loading... |
| 1911 [width]"+rm"(width) // %[width] | 1908 [width]"+rm"(width) // %[width] |
| 1912 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 1909 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 1913 : "memory", "cc", NACL_R14 YUVTORGB_REGS | 1910 : "memory", "cc", NACL_R14 YUVTORGB_REGS |
| 1914 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 1911 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 1915 ); | 1912 ); |
| 1916 } | 1913 } |
| 1917 | 1914 |
| 1918 #endif // HAS_I422TOARGBROW_SSSE3 | 1915 #endif // HAS_I422TOARGBROW_SSSE3 |
| 1919 | 1916 |
| 1920 // Read 16 UV from 444 | 1917 // Read 16 UV from 444 |
| 1921 #define READYUV444_AVX2 \ | 1918 #define READYUV444_AVX2 \ |
| 1922 "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1919 "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1923 MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1920 MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1924 "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ | 1921 "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ |
| 1925 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1922 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1926 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ | 1923 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ |
| 1927 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1924 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1928 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1925 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1929 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1926 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1930 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1927 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1931 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1928 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1932 | 1929 |
| 1933 // Read 8 UV from 422, upsample to 16 UV. | 1930 // Read 8 UV from 422, upsample to 16 UV. |
| 1934 #define READYUV422_AVX2 \ | 1931 #define READYUV422_AVX2 \ |
| 1935 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1932 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1936 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1933 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1937 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1934 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
| 1938 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1935 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1939 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1936 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1940 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1937 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1941 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1938 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1942 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1939 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1943 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1940 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1944 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1941 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1945 | 1942 |
| 1946 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. | 1943 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. |
| 1947 #define READYUVA422_AVX2 \ | 1944 #define READYUVA422_AVX2 \ |
| 1948 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1945 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1949 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1946 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1950 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1947 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
| 1951 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 1948 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1952 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1949 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1953 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1950 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1954 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1951 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1955 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1952 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1956 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1953 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1957 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1954 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
| 1958 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1955 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
| 1959 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ | 1956 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
| 1960 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" | 1957 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
| 1961 | 1958 |
| 1962 // Read 8 UV from NV12, upsample to 16 UV. | 1959 // Read 8 UV from NV12, upsample to 16 UV. |
| 1963 #define READNV12_AVX2 \ | 1960 #define READNV12_AVX2 \ |
| 1964 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1961 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
| 1965 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1962 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
| 1966 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1963 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1967 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1964 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1968 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1965 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1969 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1966 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1970 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1967 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1971 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1968 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1972 | 1969 |
| 1973 // Read 8 VU from NV21, upsample to 16 UV. | 1970 // Read 8 VU from NV21, upsample to 16 UV. |
// Asm fragment used inside the row loops. It loads 16 bytes from [vu_buf]
// into xmm0 and 16 bytes from [y_buf] into xmm4, and advances both pointers
// by 16.
// vpermq $0xd8 moves the second 8 loaded bytes into the upper 128-bit lane
// ahead of the in-lane byte operations. The chroma bytes are then rearranged
// by vpshufb with the kShuffleNV21 table supplied as an asm operand; the
// table is defined elsewhere in the file - presumably it swaps each VU pair
// to UV and repeats it (TODO confirm against the table).
// vpunpcklbw of ymm4 with itself repeats each Y byte.
// Result: chroma in ymm0 and luma in ymm4, the registers YUVTORGB_AVX2 reads.
| 1974 #define READNV21_AVX2 \ | 1971 #define READNV21_AVX2 \ |
| 1975 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ | 1972 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ |
| 1976 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ | 1973 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ |
| 1977 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1974 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1978 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ | 1975 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ |
| 1979 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1976 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1980 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1977 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1981 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1978 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1982 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1979 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1983 | 1980 |
| 1984 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. | 1981 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. |
// Asm fragment used inside the row loops. The same 32 bytes at [yuy2_buf]
// are loaded twice: once into ymm4 and shuffled with kShuffleYUY2Y, once
// into ymm0 and shuffled with kShuffleYUY2UV. The pointer then advances by
// 32. Both shuffle tables are asm operands defined elsewhere in the file;
// presumably the first gathers the Y bytes and the second gathers and
// repeats the UV bytes (TODO confirm against the tables).
// Result: chroma in ymm0 and luma in ymm4, the registers YUVTORGB_AVX2 reads.
| 1985 #define READYUY2_AVX2 \ | 1982 #define READYUY2_AVX2 \ |
| 1986 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ | 1983 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ |
| 1987 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ | 1984 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ |
| 1988 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ | 1985 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ |
| 1989 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ | 1986 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ |
| 1990 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" | 1987 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" |
| 1991 | 1988 |
| 1992 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. | 1989 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. |
// Asm fragment used inside the row loops. The same 32 bytes at [uyvy_buf]
// are loaded twice: once into ymm4 and shuffled with kShuffleUYVYY, once
// into ymm0 and shuffled with kShuffleUYVYUV. The pointer then advances by
// 32. Both shuffle tables are asm operands defined elsewhere in the file;
// presumably the first gathers the Y bytes and the second gathers and
// repeats the UV bytes (TODO confirm against the tables).
// Result: chroma in ymm0 and luma in ymm4, the registers YUVTORGB_AVX2 reads.
| 1993 #define READUYVY_AVX2 \ | 1990 #define READUYVY_AVX2 \ |
| 1994 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ | 1991 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ |
| 1995 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ | 1992 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ |
| 1996 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ | 1993 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ |
| 1997 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ | 1994 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ |
| 1998 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" | 1995 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" |
| 1999 | 1996 |
// YUV to RGB arithmetic shared by the AVX2 row functions.
// Input: chroma bytes in ymm0 and widened luma in ymm4.
// Output: three packed-byte colour channels in ymm0, ymm1 and ymm2.
// The yuvconstants block is read as seven 32-byte vectors at byte offsets
// 0, 32, 64, 96, 128, 160 and 192:
//   0 / 32 / 64     multiplied against the chroma bytes (vpmaddubsw)
//   96 / 128 / 160  the matching products are subtracted from these (vpsubw)
//   192             multiplied against luma, keeping the high 16 bits
//                   of each product (vpmulhuw)
// The luma term is added to each channel with signed saturation, the sums
// are shifted right arithmetically by 6 and then packed to bytes with
// unsigned saturation.
// x86_64 branch: YUVTORGB_SETUP_AVX2 loads the seven vectors once into
// ymm8-ymm14 with vmovdqa (an aligned load, so the block has to be 32-byte
// aligned), and YUVTORGB_REGS_AVX2 names those registers for the clobber
// list; its trailing comma lets it sit directly in front of further entries.
// #else branch: SETUP and REGS expand to nothing and each constant is
// fetched from memory inside the loop, with ymm3 as the scratch register.
| 2000 #if defined(__x86_64__) | 1997 #if defined(__x86_64__) |
| 2001 #define YUVTORGB_SETUP_AVX2(yuvconstants) \ | 1998 #define YUVTORGB_SETUP_AVX2(yuvconstants) \ |
| 2002 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ | 1999 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ |
| 2003 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ | 2000 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ |
| 2004 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ | 2001 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ |
| 2005 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ | 2002 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ |
| 2006 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ | 2003 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ |
| 2007 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ | 2004 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ |
| 2008 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" | 2005 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" |
| 2009 #define YUVTORGB_AVX2(yuvconstants) \ | 2006 #define YUVTORGB_AVX2(yuvconstants) \ |
| 2010 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ | 2007 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ |
| 2011 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ | 2008 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ |
| 2012 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ | 2009 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ |
| 2013 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ | 2010 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ |
| 2014 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ | 2011 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ |
| 2015 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ | 2012 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ |
| 2016 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ | 2013 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ |
| 2017 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ | 2014 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
| 2018 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 2015 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
| 2019 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 2016 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
| 2020 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 2017 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
| 2021 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 2018 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
| 2022 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 2019 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
| 2023 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 2020 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 2024 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 2021 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
| 2025 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 2022 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
| 2026 #define YUVTORGB_REGS_AVX2 \ | 2023 #define YUVTORGB_REGS_AVX2 \ |
| 2027 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", | 2024 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", |
| 2028 #else // Convert 16 pixels: 16 UV and 16 Y. | 2025 #else // Convert 16 pixels: 16 UV and 16 Y. |
| 2029 #define YUVTORGB_SETUP_AVX2(yuvconstants) | 2026 #define YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2030 #define YUVTORGB_AVX2(yuvconstants) \ | 2027 #define YUVTORGB_AVX2(yuvconstants) \ |
| 2031 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ | 2028 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ |
| 2032 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ | 2029 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ |
| 2033 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ | 2030 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ |
| 2034 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ | 2031 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ |
| 2035 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 2032 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
| 2036 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ | 2033 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ |
| 2037 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | 2034 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
| 2038 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ | 2035 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ |
| 2039 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ | 2036 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ |
| 2040 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ | 2037 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ |
| 2041 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ | 2038 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ |
| 2042 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 2039 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
| 2043 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 2040 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
| 2044 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 2041 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
| 2045 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 2042 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
| 2046 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 2043 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
| 2047 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 2044 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 2048 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 2045 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
| 2049 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 2046 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
| 2050 #define YUVTORGB_REGS_AVX2 | 2047 #define YUVTORGB_REGS_AVX2 |
| 2051 #endif | 2048 #endif |
| 2052 | 2049 |
| 2053 // Store 16 ARGB values. | 2050 // Store 16 ARGB values. |
// Asm fragment used inside the row loops. It interleaves the byte channels
// held in ymm0, ymm1, ymm2 and ymm5 into 4-byte pixels whose bytes appear in
// memory in that register order (presumably B, G, R, A - confirm against the
// ARGB layout the callers expect). It then writes 64 bytes to [dst_argb] and
// advances the pointer by 64.
// The two byte unpacks pair ymm0 with ymm1 and ymm2 with ymm5; vpermq $0xd8
// regroups the quadwords after each one so that the word unpacks
// (low half to ymm1, high half to ymm0) emit the pixels in order.
// ymm5 is only read here, so the caller has to set it beforehand.
// ymm0, ymm1 and ymm2 are overwritten.
| 2054 #define STOREARGB_AVX2 \ | 2051 #define STOREARGB_AVX2 \ |
| 2055 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ | 2052 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 2056 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 2053 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 2057 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ | 2054 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ |
| 2058 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ | 2055 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ |
| 2059 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ | 2056 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ |
| 2060 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ | 2057 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ |
| 2061 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ | 2058 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ |
| 2062 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ | 2059 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ |
| 2063 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" | 2060 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" |
| 2064 | 2061 |
| 2065 #ifdef HAS_I444TOARGBROW_AVX2 | 2062 #ifdef HAS_I444TOARGBROW_AVX2 |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2125 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2122 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
| 2126 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2123 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2127 ); | 2124 ); |
| 2128 } | 2125 } |
| 2129 #endif // HAS_I422TOARGBROW_AVX2 | 2126 #endif // HAS_I422TOARGBROW_AVX2 |
| 2130 | 2127 |
| 2131 #if defined(HAS_I422ALPHATOARGBROW_AVX2) | 2128 #if defined(HAS_I422ALPHATOARGBROW_AVX2) |
| 2132 // 16 pixels | 2129 // 16 pixels |
| 2133 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. | 2130 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. |
// Converts one row. Each pass of the loop reads through READYUVA422_AVX2,
// converts with YUVTORGB_AVX2 and stores 64 bytes with STOREARGB_AVX2.
//   y_buf, u_buf, v_buf, a_buf  source plane pointers for the row
//   dst_argb                    destination pointer, advanced 64 bytes per pass
//   yuvconstants                conversion constants read by the YUVTORGB macros
//   width                       pixel count, lowered by 16 per pass
// Before the loop v_buf is replaced by its distance from u_buf (the sub
// below). Presumably READYUVA422_AVX2 then addresses V relative to u_buf so
// that only u_buf needs advancing - its first lines are outside this view,
// TODO confirm.
// The count is decremented and tested after the body, so the body runs at
// least once and pixels are always handled in groups of 16.
// On i386 width is a memory-only operand (+m) decremented with subl;
// presumably this keeps a general register free for the six pointer
// operands - confirm.
| 2134 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, | 2131 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, |
| 2135 const uint8* u_buf, | 2132 const uint8* u_buf, |
| 2136 const uint8* v_buf, | 2133 const uint8* v_buf, |
| 2137 const uint8* a_buf, | 2134 const uint8* a_buf, |
| 2138 uint8* dst_argb, | 2135 uint8* dst_argb, |
| 2139 const struct YuvConstants* yuvconstants, | 2136 const struct YuvConstants* yuvconstants, |
| 2140 int width) { | 2137 int width) { |
| | 2138 // clang-format off |
| 2141 asm volatile ( | 2139 asm volatile ( |
| 2142 YUVTORGB_SETUP_AVX2(yuvconstants) | 2140 YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2143 "sub %[u_buf],%[v_buf] \n" | 2141 "sub %[u_buf],%[v_buf] \n" |
| 2144 LABELALIGN | 2142 LABELALIGN |
| 2145 "1: \n" | 2143 "1: \n" |
| 2146 READYUVA422_AVX2 | 2144 READYUVA422_AVX2 |
| 2147 YUVTORGB_AVX2(yuvconstants) | 2145 YUVTORGB_AVX2(yuvconstants) |
| 2148 STOREARGB_AVX2 | 2146 STOREARGB_AVX2 |
| 2149 "subl $0x10,%[width] \n" | 2147 "subl $0x10,%[width] \n" |
| 2150 "jg 1b \n" | 2148 "jg 1b \n" |
| 2151 "vzeroupper \n" | 2149 "vzeroupper \n" |
| 2152 : [y_buf]"+r"(y_buf), // %[y_buf] | 2150 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2153 [u_buf]"+r"(u_buf), // %[u_buf] | 2151 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2154 [v_buf]"+r"(v_buf), // %[v_buf] | 2152 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2155 [a_buf]"+r"(a_buf), // %[a_buf] | 2153 [a_buf]"+r"(a_buf), // %[a_buf] |
| 2156 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2154 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2157 #if defined(__i386__) | 2155 #if defined(__i386__) |
| 2158 [width]"+m"(width) // %[width] | 2156 [width]"+m"(width) // %[width] |
| 2159 #else | 2157 #else |
| 2160 [width]"+rm"(width) // %[width] | 2158 [width]"+rm"(width) // %[width] |
| 2161 #endif | 2159 #endif |
| 2162 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2160 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2163 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2161 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
| 2164 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2162 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2165 ); | 2163 ); |
| | 2164 // clang-format on |
| 2166 } | 2165 } |
| 2167 #endif // HAS_I422ALPHATOARGBROW_AVX2 | 2166 #endif // HAS_I422ALPHATOARGBROW_AVX2 |
| 2168 | 2167 |
| 2169 #if defined(HAS_I422TORGBAROW_AVX2) | 2168 #if defined(HAS_I422TORGBAROW_AVX2) |
| 2170 // 16 pixels | 2169 // 16 pixels |
| 2171 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2170 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| 2172 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, | 2171 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, |
| 2173 const uint8* u_buf, | 2172 const uint8* u_buf, |
| 2174 const uint8* v_buf, | 2173 const uint8* v_buf, |
| 2175 uint8* dst_argb, | 2174 uint8* dst_argb, |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2210 #endif // HAS_I422TORGBAROW_AVX2 | 2209 #endif // HAS_I422TORGBAROW_AVX2 |
| 2211 | 2210 |
| 2212 #if defined(HAS_NV12TOARGBROW_AVX2) | 2211 #if defined(HAS_NV12TOARGBROW_AVX2) |
| 2213 // 16 pixels. | 2212 // 16 pixels. |
| 2214 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2213 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Converts one row. Each pass of the loop reads with READNV12_AVX2, converts
// with YUVTORGB_AVX2 and stores 64 bytes with STOREARGB_AVX2.
//   y_buf, uv_buf  source pointers, each advanced 16 bytes per pass
//   dst_argb       destination pointer, advanced 64 bytes per pass
//   yuvconstants   conversion constants read by the YUVTORGB macros
//   width          pixel count, lowered by 16 per pass
// ymm5 is set to all ones once before the loop (vpcmpeqb of the register with
// itself); STOREARGB_AVX2 emits it as the fourth byte of every pixel, so that
// byte is always 0xff.
// The count is decremented and tested after the body, so the body runs at
// least once and pixels are always handled in groups of 16.
// NOTE(review): xmm0 is listed twice in the clobbers; the repeat looks
// redundant (UYVYToARGBRow_AVX2 lists it once) - confirm and drop.
// NOTE(review): width is a +rm operand but the decrement is an unsuffixed
// sub; if the memory alternative were chosen the operand size would not be
// spelled out. I422AlphaToARGBRow_AVX2 writes subl - confirm whether this
// should too.
| 2215 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, | 2214 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, |
| 2216 const uint8* uv_buf, | 2215 const uint8* uv_buf, |
| 2217 uint8* dst_argb, | 2216 uint8* dst_argb, |
| 2218 const struct YuvConstants* yuvconstants, | 2217 const struct YuvConstants* yuvconstants, |
| 2219 int width) { | 2218 int width) { |
| | 2219 // clang-format off |
| 2220 asm volatile ( | 2220 asm volatile ( |
| 2221 YUVTORGB_SETUP_AVX2(yuvconstants) | 2221 YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2222 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2222 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2223 LABELALIGN | 2223 LABELALIGN |
| 2224 "1: \n" | 2224 "1: \n" |
| 2225 READNV12_AVX2 | 2225 READNV12_AVX2 |
| 2226 YUVTORGB_AVX2(yuvconstants) | 2226 YUVTORGB_AVX2(yuvconstants) |
| 2227 STOREARGB_AVX2 | 2227 STOREARGB_AVX2 |
| 2228 "sub $0x10,%[width] \n" | 2228 "sub $0x10,%[width] \n" |
| 2229 "jg 1b \n" | 2229 "jg 1b \n" |
| 2230 "vzeroupper \n" | 2230 "vzeroupper \n" |
| 2231 : [y_buf]"+r"(y_buf), // %[y_buf] | 2231 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2232 [uv_buf]"+r"(uv_buf), // %[uv_buf] | 2232 [uv_buf]"+r"(uv_buf), // %[uv_buf] |
| 2233 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2233 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2234 [width]"+rm"(width) // %[width] | 2234 [width]"+rm"(width) // %[width] |
| 2235 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2235 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2236 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2236 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
| 2237 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2237 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2238 ); | 2238 ); |
| | 2239 // clang-format on |
| 2239 } | 2240 } |
| 2240 #endif // HAS_NV12TOARGBROW_AVX2 | 2241 #endif // HAS_NV12TOARGBROW_AVX2 |
| 2241 | 2242 |
| 2242 #if defined(HAS_NV21TOARGBROW_AVX2) | 2243 #if defined(HAS_NV21TOARGBROW_AVX2) |
| 2243 // 16 pixels. | 2244 // 16 pixels. |
| 2244 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2245 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// Converts one row. Each pass of the loop reads with READNV21_AVX2, converts
// with YUVTORGB_AVX2 and stores 64 bytes with STOREARGB_AVX2.
//   y_buf, vu_buf  source pointers, each advanced 16 bytes per pass
//   dst_argb       destination pointer, advanced 64 bytes per pass
//   yuvconstants   conversion constants read by the YUVTORGB macros
//   width          pixel count, lowered by 16 per pass
// kShuffleNV21 is passed as a memory operand for the vpshufb inside
// READNV21_AVX2.
// ymm5 is set to all ones once before the loop (vpcmpeqb of the register with
// itself); STOREARGB_AVX2 emits it as the fourth byte of every pixel, so that
// byte is always 0xff.
// The count is decremented and tested after the body, so the body runs at
// least once and pixels are always handled in groups of 16.
// NOTE(review): xmm0 is listed twice in the clobbers; the repeat looks
// redundant (UYVYToARGBRow_AVX2 lists it once) - confirm and drop.
// NOTE(review): width is a +rm operand but the decrement is an unsuffixed
// sub; if the memory alternative were chosen the operand size would not be
// spelled out. I422AlphaToARGBRow_AVX2 writes subl - confirm whether this
// should too.
| 2245 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, | 2246 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, |
| 2246 const uint8* vu_buf, | 2247 const uint8* vu_buf, |
| 2247 uint8* dst_argb, | 2248 uint8* dst_argb, |
| 2248 const struct YuvConstants* yuvconstants, | 2249 const struct YuvConstants* yuvconstants, |
| 2249 int width) { | 2250 int width) { |
| | 2251 // clang-format off |
| 2250 asm volatile ( | 2252 asm volatile ( |
| 2251 YUVTORGB_SETUP_AVX2(yuvconstants) | 2253 YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2252 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2254 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2253 LABELALIGN | 2255 LABELALIGN |
| 2254 "1: \n" | 2256 "1: \n" |
| 2255 READNV21_AVX2 | 2257 READNV21_AVX2 |
| 2256 YUVTORGB_AVX2(yuvconstants) | 2258 YUVTORGB_AVX2(yuvconstants) |
| 2257 STOREARGB_AVX2 | 2259 STOREARGB_AVX2 |
| 2258 "sub $0x10,%[width] \n" | 2260 "sub $0x10,%[width] \n" |
| 2259 "jg 1b \n" | 2261 "jg 1b \n" |
| 2260 "vzeroupper \n" | 2262 "vzeroupper \n" |
| 2261 : [y_buf]"+r"(y_buf), // %[y_buf] | 2263 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2262 [vu_buf]"+r"(vu_buf), // %[vu_buf] | 2264 [vu_buf]"+r"(vu_buf), // %[vu_buf] |
| 2263 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2265 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2264 [width]"+rm"(width) // %[width] | 2266 [width]"+rm"(width) // %[width] |
| 2265 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2267 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2266 [kShuffleNV21]"m"(kShuffleNV21) | 2268 [kShuffleNV21]"m"(kShuffleNV21) |
| 2267 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2269 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
| 2268 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2270 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2269 ); | 2271 ); |
| | 2272 // clang-format on |
| 2270 } | 2273 } |
| 2271 #endif // HAS_NV21TOARGBROW_AVX2 | 2274 #endif // HAS_NV21TOARGBROW_AVX2 |
| 2272 | 2275 |
| 2273 #if defined(HAS_YUY2TOARGBROW_AVX2) | 2276 #if defined(HAS_YUY2TOARGBROW_AVX2) |
| 2274 // 16 pixels. | 2277 // 16 pixels. |
| 2275 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2278 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
// Converts one row. Each pass of the loop reads with READYUY2_AVX2, converts
// with YUVTORGB_AVX2 and stores 64 bytes with STOREARGB_AVX2.
//   yuy2_buf      source pointer, advanced 32 bytes per pass
//   dst_argb      destination pointer, advanced 64 bytes per pass
//   yuvconstants  conversion constants read by the YUVTORGB macros
//   width         pixel count, lowered by 16 per pass
// kShuffleYUY2Y and kShuffleYUY2UV are passed as memory operands for the two
// vpshufb instructions inside READYUY2_AVX2.
// ymm5 is set to all ones once before the loop (vpcmpeqb of the register with
// itself); STOREARGB_AVX2 emits it as the fourth byte of every pixel, so that
// byte is always 0xff.
// The count is decremented and tested after the body, so the body runs at
// least once and pixels are always handled in groups of 16.
// NOTE(review): xmm0 is listed twice in the clobbers; the repeat looks
// redundant (UYVYToARGBRow_AVX2 lists it once) - confirm and drop.
// NOTE(review): width is a +rm operand but the decrement is an unsuffixed
// sub; if the memory alternative were chosen the operand size would not be
// spelled out. I422AlphaToARGBRow_AVX2 writes subl - confirm whether this
// should too.
| 2276 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, | 2279 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, |
| 2277 uint8* dst_argb, | 2280 uint8* dst_argb, |
| 2278 const struct YuvConstants* yuvconstants, | 2281 const struct YuvConstants* yuvconstants, |
| 2279 int width) { | 2282 int width) { |
| | 2283 // clang-format off |
| 2280 asm volatile ( | 2284 asm volatile ( |
| 2281 YUVTORGB_SETUP_AVX2(yuvconstants) | 2285 YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2282 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2286 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2283 LABELALIGN | 2287 LABELALIGN |
| 2284 "1: \n" | 2288 "1: \n" |
| 2285 READYUY2_AVX2 | 2289 READYUY2_AVX2 |
| 2286 YUVTORGB_AVX2(yuvconstants) | 2290 YUVTORGB_AVX2(yuvconstants) |
| 2287 STOREARGB_AVX2 | 2291 STOREARGB_AVX2 |
| 2288 "sub $0x10,%[width] \n" | 2292 "sub $0x10,%[width] \n" |
| 2289 "jg 1b \n" | 2293 "jg 1b \n" |
| 2290 "vzeroupper \n" | 2294 "vzeroupper \n" |
| 2291 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] | 2295 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] |
| 2292 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2296 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2293 [width]"+rm"(width) // %[width] | 2297 [width]"+rm"(width) // %[width] |
| 2294 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2298 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2295 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), | 2299 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), |
| 2296 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) | 2300 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) |
| 2297 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2301 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
| 2298 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2302 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2299 ); | 2303 ); |
| | 2304 // clang-format on |
| 2300 } | 2305 } |
| 2301 #endif // HAS_YUY2TOARGBROW_AVX2 | 2306 #endif // HAS_YUY2TOARGBROW_AVX2 |
| 2302 | 2307 |
| 2303 #if defined(HAS_UYVYTOARGBROW_AVX2) | 2308 #if defined(HAS_UYVYTOARGBROW_AVX2) |
| 2304 // 16 pixels. | 2309 // 16 pixels. |
| 2305 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2310 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
// Converts one row. Each pass of the loop reads with READUYVY_AVX2, converts
// with YUVTORGB_AVX2 and stores 64 bytes with STOREARGB_AVX2.
//   uyvy_buf      source pointer, advanced 32 bytes per pass
//   dst_argb      destination pointer, advanced 64 bytes per pass
//   yuvconstants  conversion constants read by the YUVTORGB macros
//   width         pixel count, lowered by 16 per pass
// kShuffleUYVYY and kShuffleUYVYUV are passed as memory operands for the two
// vpshufb instructions inside READUYVY_AVX2.
// ymm5 is set to all ones once before the loop (vpcmpeqb of the register with
// itself); STOREARGB_AVX2 emits it as the fourth byte of every pixel, so that
// byte is always 0xff.
// The count is decremented and tested after the body, so the body runs at
// least once and pixels are always handled in groups of 16.
// NOTE(review): width is a +rm operand but the decrement is an unsuffixed
// sub; if the memory alternative were chosen the operand size would not be
// spelled out. I422AlphaToARGBRow_AVX2 writes subl - confirm whether this
// should too.
| 2306 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, | 2311 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, |
| 2307 uint8* dst_argb, | 2312 uint8* dst_argb, |
| 2308 const struct YuvConstants* yuvconstants, | 2313 const struct YuvConstants* yuvconstants, |
| 2309 int width) { | 2314 int width) { |
| | 2315 // clang-format off |
| 2310 asm volatile ( | 2316 asm volatile ( |
| 2311 YUVTORGB_SETUP_AVX2(yuvconstants) | 2317 YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2312 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2318 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2313 LABELALIGN | 2319 LABELALIGN |
| 2314 "1: \n" | 2320 "1: \n" |
| 2315 READUYVY_AVX2 | 2321 READUYVY_AVX2 |
| 2316 YUVTORGB_AVX2(yuvconstants) | 2322 YUVTORGB_AVX2(yuvconstants) |
| 2317 STOREARGB_AVX2 | 2323 STOREARGB_AVX2 |
| 2318 "sub $0x10,%[width] \n" | 2324 "sub $0x10,%[width] \n" |
| 2319 "jg 1b \n" | 2325 "jg 1b \n" |
| 2320 "vzeroupper \n" | 2326 "vzeroupper \n" |
| 2321 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] | 2327 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] |
| 2322 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2328 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2323 [width]"+rm"(width) // %[width] | 2329 [width]"+rm"(width) // %[width] |
| 2324 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] | 2330 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] |
| 2325 [kShuffleUYVYY]"m"(kShuffleUYVYY), | 2331 [kShuffleUYVYY]"m"(kShuffleUYVYY), |
| 2326 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) | 2332 [kShuffleUYVYUV]"m"(kShuffleUYVYUV) |
| 2327 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. | 2333 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. |
| 2328 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2334 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2329 ); | 2335 ); |
| | 2336 // clang-format on |
| 2330 } | 2337 } |
| 2331 #endif // HAS_UYVYTOARGBROW_AVX2 | 2338 #endif // HAS_UYVYTOARGBROW_AVX2 |
| 2332 | 2339 |
| 2333 #ifdef HAS_I400TOARGBROW_SSE2 | 2340 #ifdef HAS_I400TOARGBROW_SSE2 |
| 2334 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { | 2341 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { |
| 2335 asm volatile ( | 2342 asm volatile ( |
| 2336 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 | 2343 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 |
| 2337 "movd %%eax,%%xmm2 \n" | 2344 "movd %%eax,%%xmm2 \n" |
| 2338 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 2345 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
| 2339 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 | 2346 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2417 "+rm"(width) // %2 | 2424 "+rm"(width) // %2 |
| 2418 : | 2425 : |
| 2419 : "memory", "cc", "eax" | 2426 : "memory", "cc", "eax" |
| 2420 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 2427 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
| 2421 ); | 2428 ); |
| 2422 } | 2429 } |
| 2423 #endif // HAS_I400TOARGBROW_AVX2 | 2430 #endif // HAS_I400TOARGBROW_AVX2 |
| 2424 | 2431 |
| 2425 #ifdef HAS_MIRRORROW_SSSE3 | 2432 #ifdef HAS_MIRRORROW_SSSE3 |
| 2426 // Shuffle table for reversing the bytes. | 2433 // Shuffle table for reversing the bytes. |
| 2427 static uvec8 kShuffleMirror = { | 2434 static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, |
| 2428 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | 2435 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; |
| 2429 }; | |
| 2430 | 2436 |
| 2431 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { | 2437 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
| 2432 intptr_t temp_width = (intptr_t)(width); | 2438 intptr_t temp_width = (intptr_t)(width); |
| 2433 asm volatile ( | 2439 asm volatile ( |
| 2434 "movdqa %3,%%xmm5 \n" | 2440 "movdqa %3,%%xmm5 \n" |
| 2435 LABELALIGN | 2441 LABELALIGN |
| 2436 "1: \n" | 2442 "1: \n" |
| 2437 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 | 2443 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 |
| 2438 "pshufb %%xmm5,%%xmm0 \n" | 2444 "pshufb %%xmm5,%%xmm0 \n" |
| 2439 "movdqu %%xmm0," MEMACCESS(1) " \n" | 2445 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| (...skipping 30 matching lines...) Expand all Loading... |
| 2470 "+r"(temp_width) // %2 | 2476 "+r"(temp_width) // %2 |
| 2471 : "m"(kShuffleMirror) // %3 | 2477 : "m"(kShuffleMirror) // %3 |
| 2472 : "memory", "cc", NACL_R14 | 2478 : "memory", "cc", NACL_R14 |
| 2473 "xmm0", "xmm5" | 2479 "xmm0", "xmm5" |
| 2474 ); | 2480 ); |
| 2475 } | 2481 } |
| 2476 #endif // HAS_MIRRORROW_AVX2 | 2482 #endif // HAS_MIRRORROW_AVX2 |
| 2477 | 2483 |
| 2478 #ifdef HAS_MIRRORUVROW_SSSE3 | 2484 #ifdef HAS_MIRRORUVROW_SSSE3 |
| 2479 // Shuffle table for reversing the bytes of UV channels. | 2485 // Shuffle table for reversing the bytes of UV channels. |
| 2480 static uvec8 kShuffleMirrorUV = { | 2486 static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, |
| 2481 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 2487 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; |
| 2482 }; | 2488 void MirrorUVRow_SSSE3(const uint8* src, |
| 2483 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 2489 uint8* dst_u, |
| 2490 uint8* dst_v, |
| 2484 int width) { | 2491 int width) { |
| 2485 intptr_t temp_width = (intptr_t)(width); | 2492 intptr_t temp_width = (intptr_t)(width); |
| 2486 asm volatile ( | 2493 asm volatile ( |
| 2487 "movdqa %4,%%xmm1 \n" | 2494 "movdqa %4,%%xmm1 \n" |
| 2488 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" | 2495 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" |
| 2489 "sub %1,%2 \n" | 2496 "sub %1,%2 \n" |
| 2490 LABELALIGN | 2497 LABELALIGN |
| 2491 "1: \n" | 2498 "1: \n" |
| 2492 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2499 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 2493 "lea " MEMLEA(-0x10,0) ",%0 \n" | 2500 "lea " MEMLEA(-0x10,0) ",%0 \n" |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2528 "+r"(temp_width) // %2 | 2535 "+r"(temp_width) // %2 |
| 2529 : | 2536 : |
| 2530 : "memory", "cc" | 2537 : "memory", "cc" |
| 2531 , "xmm0" | 2538 , "xmm0" |
| 2532 ); | 2539 ); |
| 2533 } | 2540 } |
| 2534 #endif // HAS_ARGBMIRRORROW_SSE2 | 2541 #endif // HAS_ARGBMIRRORROW_SSE2 |
| 2535 | 2542 |
| 2536 #ifdef HAS_ARGBMIRRORROW_AVX2 | 2543 #ifdef HAS_ARGBMIRRORROW_AVX2 |
| 2537 // Shuffle table for reversing the bytes. | 2544 // Shuffle table for reversing the bytes. |
// The table holds the eight dword indices 7..0. vpermd uses them to reverse
// the order of eight 4-byte pixels; the bytes inside each pixel keep their
// order.
| 2538 static const ulvec32 kARGBShuffleMirror_AVX2 = { | 2545 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; |
| 2539 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u | |
| 2540 }; | |
// Writes the pixels of src to dst in reverse order, 8 pixels (32 bytes) per
// pass; width is the pixel count.
// Each pass reads the 32 bytes that end at src + temp_width * 4 (see the
// inline comment on the vpermd line), stores them reversed at dst, advances
// dst by 32 and lowers temp_width by 8. The source is therefore walked from
// its end towards its start while dst is filled front to back.
// width is widened to intptr_t so it can serve as the index register of the
// address.
// The count is tested after the body, so the body runs at least once.
| 2541 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { | 2546 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
| 2542 intptr_t temp_width = (intptr_t)(width); | 2547 intptr_t temp_width = (intptr_t)(width); |
| 2543 asm volatile ( | 2548 asm volatile ( |
| 2544 "vmovdqu %3,%%ymm5 \n" | 2549 "vmovdqu %3,%%ymm5 \n" |
| 2545 LABELALIGN | 2550 LABELALIGN |
| 2546 "1: \n" | 2551 "1: \n" |
| 2547 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 | 2552 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 |
| 2548 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 2553 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
| 2549 "lea " MEMLEA(0x20,1) ",%1 \n" | 2554 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 2550 "sub $0x8,%2 \n" | 2555 "sub $0x8,%2 \n" |
| 2551 "jg 1b \n" | 2556 "jg 1b \n" |
| 2552 "vzeroupper \n" | 2557 "vzeroupper \n" |
| 2553 : "+r"(src), // %0 | 2558 : "+r"(src), // %0 |
| 2554 "+r"(dst), // %1 | 2559 "+r"(dst), // %1 |
| 2555 "+r"(temp_width) // %2 | 2560 "+r"(temp_width) // %2 |
| 2556 : "m"(kARGBShuffleMirror_AVX2) // %3 | 2561 : "m"(kARGBShuffleMirror_AVX2) // %3 |
| 2557 : "memory", "cc", NACL_R14 | 2562 : "memory", "cc", NACL_R14 |
| 2558 "xmm0", "xmm5" | 2563 "xmm0", "xmm5" |
| 2559 ); | 2564 ); |
| 2560 } | 2565 } |
| 2561 #endif // HAS_ARGBMIRRORROW_AVX2 | 2566 #endif // HAS_ARGBMIRRORROW_AVX2 |
| 2562 | 2567 |
| 2563 #ifdef HAS_SPLITUVROW_AVX2 | 2568 #ifdef HAS_SPLITUVROW_AVX2 |
| 2564 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 2569 void SplitUVRow_AVX2(const uint8* src_uv, |
| 2570 uint8* dst_u, |
| 2571 uint8* dst_v, |
| 2565 int width) { | 2572 int width) { |
| 2566 asm volatile ( | 2573 asm volatile ( |
| 2567 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2574 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2568 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 2575 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
| 2569 "sub %1,%2 \n" | 2576 "sub %1,%2 \n" |
| 2570 LABELALIGN | 2577 LABELALIGN |
| 2571 "1: \n" | 2578 "1: \n" |
| 2572 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 2579 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 2573 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 2580 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 2574 "lea " MEMLEA(0x40,0) ",%0 \n" | 2581 "lea " MEMLEA(0x40,0) ",%0 \n" |
| (...skipping 16 matching lines...) Expand all Loading... |
| 2591 "+r"(dst_v), // %2 | 2598 "+r"(dst_v), // %2 |
| 2592 "+r"(width) // %3 | 2599 "+r"(width) // %3 |
| 2593 : | 2600 : |
| 2594 : "memory", "cc", NACL_R14 | 2601 : "memory", "cc", NACL_R14 |
| 2595 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2602 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 2596 ); | 2603 ); |
| 2597 } | 2604 } |
| 2598 #endif // HAS_SPLITUVROW_AVX2 | 2605 #endif // HAS_SPLITUVROW_AVX2 |
| 2599 | 2606 |
| 2600 #ifdef HAS_SPLITUVROW_SSE2 | 2607 #ifdef HAS_SPLITUVROW_SSE2 |
| 2601 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 2608 void SplitUVRow_SSE2(const uint8* src_uv, |
| 2609 uint8* dst_u, |
| 2610 uint8* dst_v, |
| 2602 int width) { | 2611 int width) { |
| 2603 asm volatile ( | 2612 asm volatile ( |
| 2604 "pcmpeqb %%xmm5,%%xmm5 \n" | 2613 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 2605 "psrlw $0x8,%%xmm5 \n" | 2614 "psrlw $0x8,%%xmm5 \n" |
| 2606 "sub %1,%2 \n" | 2615 "sub %1,%2 \n" |
| 2607 LABELALIGN | 2616 LABELALIGN |
| 2608 "1: \n" | 2617 "1: \n" |
| 2609 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2618 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 2610 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 2619 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 2611 "lea " MEMLEA(0x20,0) ",%0 \n" | 2620 "lea " MEMLEA(0x20,0) ",%0 \n" |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2627 "+r"(dst_v), // %2 | 2636 "+r"(dst_v), // %2 |
| 2628 "+r"(width) // %3 | 2637 "+r"(width) // %3 |
| 2629 : | 2638 : |
| 2630 : "memory", "cc", NACL_R14 | 2639 : "memory", "cc", NACL_R14 |
| 2631 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 2640 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 2632 ); | 2641 ); |
| 2633 } | 2642 } |
| 2634 #endif // HAS_SPLITUVROW_SSE2 | 2643 #endif // HAS_SPLITUVROW_SSE2 |
| 2635 | 2644 |
| 2636 #ifdef HAS_MERGEUVROW_AVX2 | 2645 #ifdef HAS_MERGEUVROW_AVX2 |
| 2637 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 2646 void MergeUVRow_AVX2(const uint8* src_u, |
| 2647 const uint8* src_v, |
| 2648 uint8* dst_uv, |
| 2638 int width) { | 2649 int width) { |
| 2639 asm volatile ( | 2650 asm volatile ( |
| 2640 "sub %0,%1 \n" | 2651 "sub %0,%1 \n" |
| 2641 LABELALIGN | 2652 LABELALIGN |
| 2642 "1: \n" | 2653 "1: \n" |
| 2643 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 2654 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 2644 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 | 2655 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 |
| 2645 "lea " MEMLEA(0x20,0) ",%0 \n" | 2656 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 2646 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" | 2657 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" |
| 2647 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" | 2658 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2658 "+r"(dst_uv), // %2 | 2669 "+r"(dst_uv), // %2 |
| 2659 "+r"(width) // %3 | 2670 "+r"(width) // %3 |
| 2660 : | 2671 : |
| 2661 : "memory", "cc", NACL_R14 | 2672 : "memory", "cc", NACL_R14 |
| 2662 "xmm0", "xmm1", "xmm2" | 2673 "xmm0", "xmm1", "xmm2" |
| 2663 ); | 2674 ); |
| 2664 } | 2675 } |
| 2665 #endif // HAS_MERGEUVROW_AVX2 | 2676 #endif // HAS_MERGEUVROW_AVX2 |
| 2666 | 2677 |
| 2667 #ifdef HAS_MERGEUVROW_SSE2 | 2678 #ifdef HAS_MERGEUVROW_SSE2 |
| 2668 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 2679 void MergeUVRow_SSE2(const uint8* src_u, |
| 2680 const uint8* src_v, |
| 2681 uint8* dst_uv, |
| 2669 int width) { | 2682 int width) { |
| 2670 asm volatile ( | 2683 asm volatile ( |
| 2671 "sub %0,%1 \n" | 2684 "sub %0,%1 \n" |
| 2672 LABELALIGN | 2685 LABELALIGN |
| 2673 "1: \n" | 2686 "1: \n" |
| 2674 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 2687 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 2675 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 2688 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
| 2676 "lea " MEMLEA(0x10,0) ",%0 \n" | 2689 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 2677 "movdqa %%xmm0,%%xmm2 \n" | 2690 "movdqa %%xmm0,%%xmm2 \n" |
| 2678 "punpcklbw %%xmm1,%%xmm0 \n" | 2691 "punpcklbw %%xmm1,%%xmm0 \n" |
| (...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2752 : "memory", "cc" | 2765 : "memory", "cc" |
| 2753 , "xmm0", "xmm1" | 2766 , "xmm0", "xmm1" |
| 2754 ); | 2767 ); |
| 2755 } | 2768 } |
| 2756 #endif // HAS_COPYROW_AVX | 2769 #endif // HAS_COPYROW_AVX |
| 2757 | 2770 |
| 2758 #ifdef HAS_COPYROW_ERMS | 2771 #ifdef HAS_COPYROW_ERMS |
| 2759 // Multiple of 1. | 2772 // Multiple of 1. |
| 2760 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { | 2773 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { |
| 2761 size_t width_tmp = (size_t)(width); | 2774 size_t width_tmp = (size_t)(width); |
| 2762 asm volatile ( | 2775 asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n" |
| 2763 "rep movsb " MEMMOVESTRING(0,1) " \n" | 2776 : "+S"(src), // %0 |
| 2764 : "+S"(src), // %0 | 2777 "+D"(dst), // %1 |
| 2765 "+D"(dst), // %1 | 2778 "+c"(width_tmp) // %2 |
| 2766 "+c"(width_tmp) // %2 | 2779 : |
| 2767 : | 2780 : "memory", "cc"); |
| 2768 : "memory", "cc" | |
| 2769 ); | |
| 2770 } | 2781 } |
| 2771 #endif // HAS_COPYROW_ERMS | 2782 #endif // HAS_COPYROW_ERMS |
| 2772 | 2783 |
| 2773 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 | 2784 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
| 2774 // width in pixels | 2785 // width in pixels |
| 2775 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { | 2786 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
| 2776 asm volatile ( | 2787 asm volatile ( |
| 2777 "pcmpeqb %%xmm0,%%xmm0 \n" | 2788 "pcmpeqb %%xmm0,%%xmm0 \n" |
| 2778 "pslld $0x18,%%xmm0 \n" | 2789 "pslld $0x18,%%xmm0 \n" |
| 2779 "pcmpeqb %%xmm1,%%xmm1 \n" | 2790 "pcmpeqb %%xmm1,%%xmm1 \n" |
| (...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2831 : | 2842 : |
| 2832 : "memory", "cc" | 2843 : "memory", "cc" |
| 2833 , "xmm0", "xmm1", "xmm2" | 2844 , "xmm0", "xmm1", "xmm2" |
| 2834 ); | 2845 ); |
| 2835 } | 2846 } |
| 2836 #endif // HAS_ARGBCOPYALPHAROW_AVX2 | 2847 #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
| 2837 | 2848 |
| 2838 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 | 2849 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 |
| 2839 // width in pixels | 2850 // width in pixels |
| 2840 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { | 2851 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { |
| 2841 asm volatile ( | 2852 asm volatile ( |
| 2842 LABELALIGN | 2853 LABELALIGN |
| 2843 "1: \n" | 2854 "1: \n" |
| 2844 "movdqu " MEMACCESS(0) ", %%xmm0 \n" | 2855 "movdqu " MEMACCESS(0) ", %%xmm0 \n" |
| 2845 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" | 2856 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" |
| 2846 "lea " MEMLEA(0x20, 0) ", %0 \n" | 2857 "lea " MEMLEA(0x20, 0) ", %0 \n" |
| 2847 "psrld $0x18, %%xmm0 \n" | 2858 "psrld $0x18, %%xmm0 \n" |
| 2848 "psrld $0x18, %%xmm1 \n" | 2859 "psrld $0x18, %%xmm1 \n" |
| 2849 "packssdw %%xmm1, %%xmm0 \n" | 2860 "packssdw %%xmm1, %%xmm0 \n" |
| 2850 "packuswb %%xmm0, %%xmm0 \n" | 2861 "packuswb %%xmm0, %%xmm0 \n" |
| 2851 "movq %%xmm0," MEMACCESS(1) " \n" | 2862 "movq %%xmm0," MEMACCESS(1) " \n" |
| 2852 "lea " MEMLEA(0x8, 1) ", %1 \n" | 2863 "lea " MEMLEA(0x8, 1) ", %1 \n" |
| 2853 "sub $0x8, %2 \n" | 2864 "sub $0x8, %2 \n" |
| 2854 "jg 1b \n" | 2865 "jg 1b \n" |
| 2855 : "+r"(src_argb), // %0 | 2866 : "+r"(src_argb), // %0 |
| 2856 "+r"(dst_a), // %1 | 2867 "+r"(dst_a), // %1 |
| 2857 "+rm"(width) // %2 | 2868 "+rm"(width) // %2 |
| 2858 : | 2869 : |
| 2859 : "memory", "cc" | 2870 : "memory", "cc" |
| 2860 , "xmm0", "xmm1" | 2871 , "xmm0", "xmm1" |
| 2861 ); | 2872 ); |
| 2862 } | 2873 } |
| 2863 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 | 2874 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 |
| 2864 | 2875 |
| 2865 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 | 2876 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 |
| 2866 static const uvec8 kShuffleAlphaShort_AVX2 = { | 2877 static const uvec8 kShuffleAlphaShort_AVX2 = { |
| 2867 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, | 2878 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, |
| 2868 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u | 2879 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; |
| 2869 }; | |
| 2870 | 2880 |
| 2871 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { | 2881 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { |
| 2872 asm volatile ( | 2882 asm volatile ( |
| 2873 "vmovdqa %3,%%ymm4 \n" | 2883 "vmovdqa %3,%%ymm4 \n" |
| 2874 "vbroadcastf128 %4,%%ymm5 \n" | 2884 "vbroadcastf128 %4,%%ymm5 \n" |
| 2875 LABELALIGN | 2885 LABELALIGN |
| 2876 "1: \n" | 2886 "1: \n" |
| 2877 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" | 2887 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" |
| 2878 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" | 2888 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" |
| 2879 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 | 2889 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 |
| 2880 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" | 2890 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" |
| 2881 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" | 2891 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" |
| 2882 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" | 2892 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" |
| (...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2969 : "memory", "cc" | 2979 : "memory", "cc" |
| 2970 , "xmm0", "xmm1", "xmm2" | 2980 , "xmm0", "xmm1", "xmm2" |
| 2971 ); | 2981 ); |
| 2972 } | 2982 } |
| 2973 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 | 2983 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
| 2974 | 2984 |
| 2975 #ifdef HAS_SETROW_X86 | 2985 #ifdef HAS_SETROW_X86 |
| 2976 void SetRow_X86(uint8* dst, uint8 v8, int width) { | 2986 void SetRow_X86(uint8* dst, uint8 v8, int width) { |
| 2977 size_t width_tmp = (size_t)(width >> 2); | 2987 size_t width_tmp = (size_t)(width >> 2); |
| 2978 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. | 2988 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. |
| 2979 asm volatile ( | 2989 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" |
| 2980 "rep stosl " MEMSTORESTRING(eax,0) " \n" | 2990 : "+D"(dst), // %0 |
| 2981 : "+D"(dst), // %0 | 2991 "+c"(width_tmp) // %1 |
| 2982 "+c"(width_tmp) // %1 | 2992 : "a"(v32) // %2 |
| 2983 : "a"(v32) // %2 | 2993 : "memory", "cc"); |
| 2984 : "memory", "cc"); | |
| 2985 } | 2994 } |
| 2986 | 2995 |
| 2987 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { | 2996 void SetRow_ERMS(uint8* dst, uint8 v8, int width) { |
| 2988 size_t width_tmp = (size_t)(width); | 2997 size_t width_tmp = (size_t)(width); |
| 2989 asm volatile ( | 2998 asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n" |
| 2990 "rep stosb " MEMSTORESTRING(al,0) " \n" | 2999 : "+D"(dst), // %0 |
| 2991 : "+D"(dst), // %0 | 3000 "+c"(width_tmp) // %1 |
| 2992 "+c"(width_tmp) // %1 | 3001 : "a"(v8) // %2 |
| 2993 : "a"(v8) // %2 | 3002 : "memory", "cc"); |
| 2994 : "memory", "cc"); | |
| 2995 } | 3003 } |
| 2996 | 3004 |
| 2997 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { | 3005 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { |
| 2998 size_t width_tmp = (size_t)(width); | 3006 size_t width_tmp = (size_t)(width); |
| 2999 asm volatile ( | 3007 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" |
| 3000 "rep stosl " MEMSTORESTRING(eax,0) " \n" | 3008 : "+D"(dst_argb), // %0 |
| 3001 : "+D"(dst_argb), // %0 | 3009 "+c"(width_tmp) // %1 |
| 3002 "+c"(width_tmp) // %1 | 3010 : "a"(v32) // %2 |
| 3003 : "a"(v32) // %2 | 3011 : "memory", "cc"); |
| 3004 : "memory", "cc"); | |
| 3005 } | 3012 } |
| 3006 #endif // HAS_SETROW_X86 | 3013 #endif // HAS_SETROW_X86 |
| 3007 | 3014 |
| 3008 #ifdef HAS_YUY2TOYROW_SSE2 | 3015 #ifdef HAS_YUY2TOYROW_SSE2 |
| 3009 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { | 3016 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { |
| 3010 asm volatile ( | 3017 asm volatile ( |
| 3011 "pcmpeqb %%xmm5,%%xmm5 \n" | 3018 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3012 "psrlw $0x8,%%xmm5 \n" | 3019 "psrlw $0x8,%%xmm5 \n" |
| 3013 LABELALIGN | 3020 LABELALIGN |
| 3014 "1: \n" | 3021 "1: \n" |
| 3015 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3022 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 3016 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3023 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 3017 "lea " MEMLEA(0x20,0) ",%0 \n" | 3024 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 3018 "pand %%xmm5,%%xmm0 \n" | 3025 "pand %%xmm5,%%xmm0 \n" |
| 3019 "pand %%xmm5,%%xmm1 \n" | 3026 "pand %%xmm5,%%xmm1 \n" |
| 3020 "packuswb %%xmm1,%%xmm0 \n" | 3027 "packuswb %%xmm1,%%xmm0 \n" |
| 3021 "movdqu %%xmm0," MEMACCESS(1) " \n" | 3028 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| 3022 "lea " MEMLEA(0x10,1) ",%1 \n" | 3029 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 3023 "sub $0x10,%2 \n" | 3030 "sub $0x10,%2 \n" |
| 3024 "jg 1b \n" | 3031 "jg 1b \n" |
| 3025 : "+r"(src_yuy2), // %0 | 3032 : "+r"(src_yuy2), // %0 |
| 3026 "+r"(dst_y), // %1 | 3033 "+r"(dst_y), // %1 |
| 3027 "+r"(width) // %2 | 3034 "+r"(width) // %2 |
| 3028 : | 3035 : |
| 3029 : "memory", "cc" | 3036 : "memory", "cc" |
| 3030 , "xmm0", "xmm1", "xmm5" | 3037 , "xmm0", "xmm1", "xmm5" |
| 3031 ); | 3038 ); |
| 3032 } | 3039 } |
| 3033 | 3040 |
| 3034 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, | 3041 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, |
| 3035 uint8* dst_u, uint8* dst_v, int width) { | 3042 int stride_yuy2, |
| 3043 uint8* dst_u, |
| 3044 uint8* dst_v, |
| 3045 int width) { |
| 3036 asm volatile ( | 3046 asm volatile ( |
| 3037 "pcmpeqb %%xmm5,%%xmm5 \n" | 3047 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3038 "psrlw $0x8,%%xmm5 \n" | 3048 "psrlw $0x8,%%xmm5 \n" |
| 3039 "sub %1,%2 \n" | 3049 "sub %1,%2 \n" |
| 3040 LABELALIGN | 3050 LABELALIGN |
| 3041 "1: \n" | 3051 "1: \n" |
| 3042 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3052 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 3043 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3053 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 3044 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 3054 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
| 3045 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 3055 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 3063 "+r"(dst_u), // %1 | 3073 "+r"(dst_u), // %1 |
| 3064 "+r"(dst_v), // %2 | 3074 "+r"(dst_v), // %2 |
| 3065 "+r"(width) // %3 | 3075 "+r"(width) // %3 |
| 3066 : "r"((intptr_t)(stride_yuy2)) // %4 | 3076 : "r"((intptr_t)(stride_yuy2)) // %4 |
| 3067 : "memory", "cc", NACL_R14 | 3077 : "memory", "cc", NACL_R14 |
| 3068 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 3078 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 3069 ); | 3079 ); |
| 3070 } | 3080 } |
| 3071 | 3081 |
| 3072 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, | 3082 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
| 3073 uint8* dst_u, uint8* dst_v, int width) { | 3083 uint8* dst_u, |
| 3084 uint8* dst_v, |
| 3085 int width) { |
| 3074 asm volatile ( | 3086 asm volatile ( |
| 3075 "pcmpeqb %%xmm5,%%xmm5 \n" | 3087 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3076 "psrlw $0x8,%%xmm5 \n" | 3088 "psrlw $0x8,%%xmm5 \n" |
| 3077 "sub %1,%2 \n" | 3089 "sub %1,%2 \n" |
| 3078 LABELALIGN | 3090 LABELALIGN |
| 3079 "1: \n" | 3091 "1: \n" |
| 3080 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3092 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 3081 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3093 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 3082 "lea " MEMLEA(0x20,0) ",%0 \n" | 3094 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 3083 "psrlw $0x8,%%xmm0 \n" | 3095 "psrlw $0x8,%%xmm0 \n" |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3119 "jg 1b \n" | 3131 "jg 1b \n" |
| 3120 : "+r"(src_uyvy), // %0 | 3132 : "+r"(src_uyvy), // %0 |
| 3121 "+r"(dst_y), // %1 | 3133 "+r"(dst_y), // %1 |
| 3122 "+r"(width) // %2 | 3134 "+r"(width) // %2 |
| 3123 : | 3135 : |
| 3124 : "memory", "cc" | 3136 : "memory", "cc" |
| 3125 , "xmm0", "xmm1" | 3137 , "xmm0", "xmm1" |
| 3126 ); | 3138 ); |
| 3127 } | 3139 } |
| 3128 | 3140 |
| 3129 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, | 3141 void UYVYToUVRow_SSE2(const uint8* src_uyvy, |
| 3130 uint8* dst_u, uint8* dst_v, int width) { | 3142 int stride_uyvy, |
| 3143 uint8* dst_u, |
| 3144 uint8* dst_v, |
| 3145 int width) { |
| 3131 asm volatile ( | 3146 asm volatile ( |
| 3132 "pcmpeqb %%xmm5,%%xmm5 \n" | 3147 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3133 "psrlw $0x8,%%xmm5 \n" | 3148 "psrlw $0x8,%%xmm5 \n" |
| 3134 "sub %1,%2 \n" | 3149 "sub %1,%2 \n" |
| 3135 LABELALIGN | 3150 LABELALIGN |
| 3136 "1: \n" | 3151 "1: \n" |
| 3137 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3152 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 3138 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3153 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 3139 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 | 3154 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
| 3140 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 | 3155 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 3158 "+r"(dst_u), // %1 | 3173 "+r"(dst_u), // %1 |
| 3159 "+r"(dst_v), // %2 | 3174 "+r"(dst_v), // %2 |
| 3160 "+r"(width) // %3 | 3175 "+r"(width) // %3 |
| 3161 : "r"((intptr_t)(stride_uyvy)) // %4 | 3176 : "r"((intptr_t)(stride_uyvy)) // %4 |
| 3162 : "memory", "cc", NACL_R14 | 3177 : "memory", "cc", NACL_R14 |
| 3163 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 3178 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 3164 ); | 3179 ); |
| 3165 } | 3180 } |
| 3166 | 3181 |
| 3167 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, | 3182 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
| 3168 uint8* dst_u, uint8* dst_v, int width) { | 3183 uint8* dst_u, |
| 3184 uint8* dst_v, |
| 3185 int width) { |
| 3169 asm volatile ( | 3186 asm volatile ( |
| 3170 "pcmpeqb %%xmm5,%%xmm5 \n" | 3187 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3171 "psrlw $0x8,%%xmm5 \n" | 3188 "psrlw $0x8,%%xmm5 \n" |
| 3172 "sub %1,%2 \n" | 3189 "sub %1,%2 \n" |
| 3173 LABELALIGN | 3190 LABELALIGN |
| 3174 "1: \n" | 3191 "1: \n" |
| 3175 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3192 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 3176 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 3193 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 3177 "lea " MEMLEA(0x20,0) ",%0 \n" | 3194 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 3178 "pand %%xmm5,%%xmm0 \n" | 3195 "pand %%xmm5,%%xmm0 \n" |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3220 "vzeroupper \n" | 3237 "vzeroupper \n" |
| 3221 : "+r"(src_yuy2), // %0 | 3238 : "+r"(src_yuy2), // %0 |
| 3222 "+r"(dst_y), // %1 | 3239 "+r"(dst_y), // %1 |
| 3223 "+r"(width) // %2 | 3240 "+r"(width) // %2 |
| 3224 : | 3241 : |
| 3225 : "memory", "cc" | 3242 : "memory", "cc" |
| 3226 , "xmm0", "xmm1", "xmm5" | 3243 , "xmm0", "xmm1", "xmm5" |
| 3227 ); | 3244 ); |
| 3228 } | 3245 } |
| 3229 | 3246 |
| 3230 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, | 3247 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, |
| 3231 uint8* dst_u, uint8* dst_v, int width) { | 3248 int stride_yuy2, |
| 3249 uint8* dst_u, |
| 3250 uint8* dst_v, |
| 3251 int width) { |
| 3232 asm volatile ( | 3252 asm volatile ( |
| 3233 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3253 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 3234 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3254 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
| 3235 "sub %1,%2 \n" | 3255 "sub %1,%2 \n" |
| 3236 LABELALIGN | 3256 LABELALIGN |
| 3237 "1: \n" | 3257 "1: \n" |
| 3238 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3258 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 3239 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3259 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 3240 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | 3260 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
| 3241 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) | 3261 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3260 "+r"(dst_u), // %1 | 3280 "+r"(dst_u), // %1 |
| 3261 "+r"(dst_v), // %2 | 3281 "+r"(dst_v), // %2 |
| 3262 "+r"(width) // %3 | 3282 "+r"(width) // %3 |
| 3263 : "r"((intptr_t)(stride_yuy2)) // %4 | 3283 : "r"((intptr_t)(stride_yuy2)) // %4 |
| 3264 : "memory", "cc", NACL_R14 | 3284 : "memory", "cc", NACL_R14 |
| 3265 "xmm0", "xmm1", "xmm5" | 3285 "xmm0", "xmm1", "xmm5" |
| 3266 ); | 3286 ); |
| 3267 } | 3287 } |
| 3268 | 3288 |
| 3269 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, | 3289 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
| 3270 uint8* dst_u, uint8* dst_v, int width) { | 3290 uint8* dst_u, |
| 3291 uint8* dst_v, |
| 3292 int width) { |
| 3271 asm volatile ( | 3293 asm volatile ( |
| 3272 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3294 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 3273 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3295 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
| 3274 "sub %1,%2 \n" | 3296 "sub %1,%2 \n" |
| 3275 LABELALIGN | 3297 LABELALIGN |
| 3276 "1: \n" | 3298 "1: \n" |
| 3277 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3299 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 3278 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3300 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 3279 "lea " MEMLEA(0x40,0) ",%0 \n" | 3301 "lea " MEMLEA(0x40,0) ",%0 \n" |
| 3280 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3302 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3320 "jg 1b \n" | 3342 "jg 1b \n" |
| 3321 "vzeroupper \n" | 3343 "vzeroupper \n" |
| 3322 : "+r"(src_uyvy), // %0 | 3344 : "+r"(src_uyvy), // %0 |
| 3323 "+r"(dst_y), // %1 | 3345 "+r"(dst_y), // %1 |
| 3324 "+r"(width) // %2 | 3346 "+r"(width) // %2 |
| 3325 : | 3347 : |
| 3326 : "memory", "cc" | 3348 : "memory", "cc" |
| 3327 , "xmm0", "xmm1", "xmm5" | 3349 , "xmm0", "xmm1", "xmm5" |
| 3328 ); | 3350 ); |
| 3329 } | 3351 } |
| 3330 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, | 3352 void UYVYToUVRow_AVX2(const uint8* src_uyvy, |
| 3331 uint8* dst_u, uint8* dst_v, int width) { | 3353 int stride_uyvy, |
| 3354 uint8* dst_u, |
| 3355 uint8* dst_v, |
| 3356 int width) { |
| 3332 asm volatile ( | 3357 asm volatile ( |
| 3333 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3358 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 3334 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3359 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
| 3335 "sub %1,%2 \n" | 3360 "sub %1,%2 \n" |
| 3336 | 3361 |
| 3337 LABELALIGN | 3362 LABELALIGN |
| 3338 "1: \n" | 3363 "1: \n" |
| 3339 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3364 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 3340 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3365 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 3341 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 | 3366 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 3361 "+r"(dst_u), // %1 | 3386 "+r"(dst_u), // %1 |
| 3362 "+r"(dst_v), // %2 | 3387 "+r"(dst_v), // %2 |
| 3363 "+r"(width) // %3 | 3388 "+r"(width) // %3 |
| 3364 : "r"((intptr_t)(stride_uyvy)) // %4 | 3389 : "r"((intptr_t)(stride_uyvy)) // %4 |
| 3365 : "memory", "cc", NACL_R14 | 3390 : "memory", "cc", NACL_R14 |
| 3366 "xmm0", "xmm1", "xmm5" | 3391 "xmm0", "xmm1", "xmm5" |
| 3367 ); | 3392 ); |
| 3368 } | 3393 } |
| 3369 | 3394 |
| 3370 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, | 3395 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
| 3371 uint8* dst_u, uint8* dst_v, int width) { | 3396 uint8* dst_u, |
| 3397 uint8* dst_v, |
| 3398 int width) { |
| 3372 asm volatile ( | 3399 asm volatile ( |
| 3373 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3400 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 3374 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" | 3401 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" |
| 3375 "sub %1,%2 \n" | 3402 "sub %1,%2 \n" |
| 3376 LABELALIGN | 3403 LABELALIGN |
| 3377 "1: \n" | 3404 "1: \n" |
| 3378 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 3405 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 3379 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 3406 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 3380 "lea " MEMLEA(0x40,0) ",%0 \n" | 3407 "lea " MEMLEA(0x40,0) ",%0 \n" |
| 3381 "vpand %%ymm5,%%ymm0,%%ymm0 \n" | 3408 "vpand %%ymm5,%%ymm0,%%ymm0 \n" |
| (...skipping 18 matching lines...) Expand all Loading... |
| 3400 "+r"(width) // %3 | 3427 "+r"(width) // %3 |
| 3401 : | 3428 : |
| 3402 : "memory", "cc", NACL_R14 | 3429 : "memory", "cc", NACL_R14 |
| 3403 "xmm0", "xmm1", "xmm5" | 3430 "xmm0", "xmm1", "xmm5" |
| 3404 ); | 3431 ); |
| 3405 } | 3432 } |
| 3406 #endif // HAS_YUY2TOYROW_AVX2 | 3433 #endif // HAS_YUY2TOYROW_AVX2 |
| 3407 | 3434 |
| 3408 #ifdef HAS_ARGBBLENDROW_SSSE3 | 3435 #ifdef HAS_ARGBBLENDROW_SSSE3 |
| 3409 // Shuffle table for isolating alpha. | 3436 // Shuffle table for isolating alpha. |
| 3410 static uvec8 kShuffleAlpha = { | 3437 static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
| 3411 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 3438 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; |
| 3412 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | |
| 3413 }; | |
| 3414 | 3439 |
| 3415 // Blend 8 pixels at a time | 3440 // Blend 8 pixels at a time |
| 3416 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 3441 void ARGBBlendRow_SSSE3(const uint8* src_argb0, |
| 3417 uint8* dst_argb, int width) { | 3442 const uint8* src_argb1, |
| 3443 uint8* dst_argb, |
| 3444 int width) { |
| 3418 asm volatile ( | 3445 asm volatile ( |
| 3419 "pcmpeqb %%xmm7,%%xmm7 \n" | 3446 "pcmpeqb %%xmm7,%%xmm7 \n" |
| 3420 "psrlw $0xf,%%xmm7 \n" | 3447 "psrlw $0xf,%%xmm7 \n" |
| 3421 "pcmpeqb %%xmm6,%%xmm6 \n" | 3448 "pcmpeqb %%xmm6,%%xmm6 \n" |
| 3422 "psrlw $0x8,%%xmm6 \n" | 3449 "psrlw $0x8,%%xmm6 \n" |
| 3423 "pcmpeqb %%xmm5,%%xmm5 \n" | 3450 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3424 "psllw $0x8,%%xmm5 \n" | 3451 "psllw $0x8,%%xmm5 \n" |
| 3425 "pcmpeqb %%xmm4,%%xmm4 \n" | 3452 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 3426 "pslld $0x18,%%xmm4 \n" | 3453 "pslld $0x18,%%xmm4 \n" |
| 3427 "sub $0x4,%3 \n" | 3454 "sub $0x4,%3 \n" |
| (...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3492 ); | 3519 ); |
| 3493 } | 3520 } |
| 3494 #endif // HAS_ARGBBLENDROW_SSSE3 | 3521 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 3495 | 3522 |
| 3496 #ifdef HAS_BLENDPLANEROW_SSSE3 | 3523 #ifdef HAS_BLENDPLANEROW_SSSE3 |
| 3497 // Blend 8 pixels at a time. | 3524 // Blend 8 pixels at a time. |
| 3498 // unsigned version of math | 3525 // unsigned version of math |
| 3499 // =((A2*C2)+(B2*(255-C2))+255)/256 | 3526 // =((A2*C2)+(B2*(255-C2))+255)/256 |
| 3500 // signed version of math | 3527 // signed version of math |
| 3501 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 | 3528 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
| 3502 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, | 3529 void BlendPlaneRow_SSSE3(const uint8* src0, |
| 3503 const uint8* alpha, uint8* dst, int width) { | 3530 const uint8* src1, |
| 3504 asm volatile ( | 3531 const uint8* alpha, |
| 3505 "pcmpeqb %%xmm5,%%xmm5 \n" | 3532 uint8* dst, |
| 3506 "psllw $0x8,%%xmm5 \n" | 3533 int width) { |
| 3507 "mov $0x80808080,%%eax \n" | 3534 asm volatile( |
| 3508 "movd %%eax,%%xmm6 \n" | 3535 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 3509 "pshufd $0x0,%%xmm6,%%xmm6 \n" | 3536 "psllw $0x8,%%xmm5 \n" |
| 3510 "mov $0x807f807f,%%eax \n" | 3537 "mov $0x80808080,%%eax \n" |
| 3511 "movd %%eax,%%xmm7 \n" | 3538 "movd %%eax,%%xmm6 \n" |
| 3512 "pshufd $0x0,%%xmm7,%%xmm7 \n" | 3539 "pshufd $0x0,%%xmm6,%%xmm6 \n" |
| 3513 "sub %2,%0 \n" | 3540 "mov $0x807f807f,%%eax \n" |
| 3514 "sub %2,%1 \n" | 3541 "movd %%eax,%%xmm7 \n" |
| 3515 "sub %2,%3 \n" | 3542 "pshufd $0x0,%%xmm7,%%xmm7 \n" |
| 3543 "sub %2,%0 \n" |
| 3544 "sub %2,%1 \n" |
| 3545 "sub %2,%3 \n" |
| 3516 | 3546 |
| 3517 // 8 pixel loop. | 3547 // 8 pixel loop. |
| 3518 LABELALIGN | 3548 LABELALIGN |
| 3519 "1: \n" | 3549 "1: \n" |
| 3520 "movq (%2),%%xmm0 \n" | 3550 "movq (%2),%%xmm0 \n" |
| 3521 "punpcklbw %%xmm0,%%xmm0 \n" | 3551 "punpcklbw %%xmm0,%%xmm0 \n" |
| 3522 "pxor %%xmm5,%%xmm0 \n" | 3552 "pxor %%xmm5,%%xmm0 \n" |
| 3523 "movq (%0,%2,1),%%xmm1 \n" | 3553 "movq (%0,%2,1),%%xmm1 \n" |
| 3524 "movq (%1,%2,1),%%xmm2 \n" | 3554 "movq (%1,%2,1),%%xmm2 \n" |
| 3525 "punpcklbw %%xmm2,%%xmm1 \n" | 3555 "punpcklbw %%xmm2,%%xmm1 \n" |
| 3526 "psubb %%xmm6,%%xmm1 \n" | 3556 "psubb %%xmm6,%%xmm1 \n" |
| 3527 "pmaddubsw %%xmm1,%%xmm0 \n" | 3557 "pmaddubsw %%xmm1,%%xmm0 \n" |
| 3528 "paddw %%xmm7,%%xmm0 \n" | 3558 "paddw %%xmm7,%%xmm0 \n" |
| 3529 "psrlw $0x8,%%xmm0 \n" | 3559 "psrlw $0x8,%%xmm0 \n" |
| 3530 "packuswb %%xmm0,%%xmm0 \n" | 3560 "packuswb %%xmm0,%%xmm0 \n" |
| 3531 "movq %%xmm0,(%3,%2,1) \n" | 3561 "movq %%xmm0,(%3,%2,1) \n" |
| 3532 "lea 0x8(%2),%2 \n" | 3562 "lea 0x8(%2),%2 \n" |
| 3533 "sub $0x8,%4 \n" | 3563 "sub $0x8,%4 \n" |
| 3534 "jg 1b \n" | 3564 "jg 1b \n" |
| 3535 : "+r"(src0), // %0 | 3565 : "+r"(src0), // %0 |
| 3536 "+r"(src1), // %1 | 3566 "+r"(src1), // %1 |
| 3537 "+r"(alpha), // %2 | 3567 "+r"(alpha), // %2 |
| 3538 "+r"(dst), // %3 | 3568 "+r"(dst), // %3 |
| 3539 "+rm"(width) // %4 | 3569 "+rm"(width) // %4 |
| 3540 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" | 3570 ::"memory", |
| 3541 ); | 3571 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); |
| 3542 } | 3572 } |
| 3543 #endif // HAS_BLENDPLANEROW_SSSE3 | 3573 #endif // HAS_BLENDPLANEROW_SSSE3 |
| 3544 | 3574 |
| 3545 #ifdef HAS_BLENDPLANEROW_AVX2 | 3575 #ifdef HAS_BLENDPLANEROW_AVX2 |
| 3546 // Blend 32 pixels at a time. | 3576 // Blend 32 pixels at a time. |
| 3547 // unsigned version of math | 3577 // unsigned version of math |
| 3548 // =((A2*C2)+(B2*(255-C2))+255)/256 | 3578 // =((A2*C2)+(B2*(255-C2))+255)/256 |
| 3549 // signed version of math | 3579 // signed version of math |
| 3550 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 | 3580 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
| 3551 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, | 3581 void BlendPlaneRow_AVX2(const uint8* src0, |
| 3552 const uint8* alpha, uint8* dst, int width) { | 3582 const uint8* src1, |
| 3553 asm volatile ( | 3583 const uint8* alpha, |
| 3554 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3584 uint8* dst, |
| 3555 "vpsllw $0x8,%%ymm5,%%ymm5 \n" | 3585 int width) { |
| 3556 "mov $0x80808080,%%eax \n" | 3586 asm volatile( |
| 3557 "vmovd %%eax,%%xmm6 \n" | 3587 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 3558 "vbroadcastss %%xmm6,%%ymm6 \n" | 3588 "vpsllw $0x8,%%ymm5,%%ymm5 \n" |
| 3559 "mov $0x807f807f,%%eax \n" | 3589 "mov $0x80808080,%%eax \n" |
| 3560 "vmovd %%eax,%%xmm7 \n" | 3590 "vmovd %%eax,%%xmm6 \n" |
| 3561 "vbroadcastss %%xmm7,%%ymm7 \n" | 3591 "vbroadcastss %%xmm6,%%ymm6 \n" |
| 3562 "sub %2,%0 \n" | 3592 "mov $0x807f807f,%%eax \n" |
| 3563 "sub %2,%1 \n" | 3593 "vmovd %%eax,%%xmm7 \n" |
| 3564 "sub %2,%3 \n" | 3594 "vbroadcastss %%xmm7,%%ymm7 \n" |
| 3595 "sub %2,%0 \n" |
| 3596 "sub %2,%1 \n" |
| 3597 "sub %2,%3 \n" |
| 3565 | 3598 |
| 3566 // 32 pixel loop. | 3599 // 32 pixel loop. |
| 3567 LABELALIGN | 3600 LABELALIGN |
| 3568 "1: \n" | 3601 "1: \n" |
| 3569 "vmovdqu (%2),%%ymm0 \n" | 3602 "vmovdqu (%2),%%ymm0 \n" |
| 3570 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" | 3603 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" |
| 3571 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" | 3604 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" |
| 3572 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" | 3605 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" |
| 3573 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" | 3606 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" |
| 3574 "vmovdqu (%0,%2,1),%%ymm1 \n" | 3607 "vmovdqu (%0,%2,1),%%ymm1 \n" |
| 3575 "vmovdqu (%1,%2,1),%%ymm2 \n" | 3608 "vmovdqu (%1,%2,1),%%ymm2 \n" |
| 3576 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" | 3609 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" |
| 3577 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | 3610 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" |
| 3578 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" | 3611 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" |
| 3579 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" | 3612 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" |
| 3580 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" | 3613 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
| 3581 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" | 3614 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" |
| 3582 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" | 3615 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" |
| 3583 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" | 3616 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" |
| 3584 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" | 3617 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" |
| 3585 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3618 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
| 3586 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" | 3619 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" |
| 3587 "vmovdqu %%ymm0,(%3,%2,1) \n" | 3620 "vmovdqu %%ymm0,(%3,%2,1) \n" |
| 3588 "lea 0x20(%2),%2 \n" | 3621 "lea 0x20(%2),%2 \n" |
| 3589 "sub $0x20,%4 \n" | 3622 "sub $0x20,%4 \n" |
| 3590 "jg 1b \n" | 3623 "jg 1b \n" |
| 3591 "vzeroupper \n" | 3624 "vzeroupper \n" |
| 3592 : "+r"(src0), // %0 | 3625 : "+r"(src0), // %0 |
| 3593 "+r"(src1), // %1 | 3626 "+r"(src1), // %1 |
| 3594 "+r"(alpha), // %2 | 3627 "+r"(alpha), // %2 |
| 3595 "+r"(dst), // %3 | 3628 "+r"(dst), // %3 |
| 3596 "+rm"(width) // %4 | 3629 "+rm"(width) // %4 |
| 3597 :: "memory", "cc", "eax", | 3630 ::"memory", |
| 3598 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3631 "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", |
| 3599 ); | 3632 "xmm7"); |
| 3600 } | 3633 } |
| 3601 #endif // HAS_BLENDPLANEROW_AVX2 | 3634 #endif // HAS_BLENDPLANEROW_AVX2 |
| 3602 | 3635 |
| 3603 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 3636 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
| 3604 // Shuffle table duplicating alpha | 3637 // Shuffle table duplicating alpha |
| 3605 static uvec8 kShuffleAlpha0 = { | 3638 static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, |
| 3606 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | 3639 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; |
| 3607 }; | 3640 static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
| 3608 static uvec8 kShuffleAlpha1 = { | 3641 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; |
| 3609 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | |
| 3610 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u | |
| 3611 }; | |
| 3612 // Attenuate 4 pixels at a time. | 3642 // Attenuate 4 pixels at a time. |
| 3613 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { | 3643 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3614 asm volatile ( | 3644 asm volatile ( |
| 3615 "pcmpeqb %%xmm3,%%xmm3 \n" | 3645 "pcmpeqb %%xmm3,%%xmm3 \n" |
| 3616 "pslld $0x18,%%xmm3 \n" | 3646 "pslld $0x18,%%xmm3 \n" |
| 3617 "movdqa %3,%%xmm4 \n" | 3647 "movdqa %3,%%xmm4 \n" |
| 3618 "movdqa %4,%%xmm5 \n" | 3648 "movdqa %4,%%xmm5 \n" |
| 3619 | 3649 |
| 3620 // 4 pixel loop. | 3650 // 4 pixel loop. |
| 3621 LABELALIGN | 3651 LABELALIGN |
| (...skipping 25 matching lines...) Expand all Loading... |
| 3647 : "m"(kShuffleAlpha0), // %3 | 3677 : "m"(kShuffleAlpha0), // %3 |
| 3648 "m"(kShuffleAlpha1) // %4 | 3678 "m"(kShuffleAlpha1) // %4 |
| 3649 : "memory", "cc" | 3679 : "memory", "cc" |
| 3650 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3680 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 3651 ); | 3681 ); |
| 3652 } | 3682 } |
| 3653 #endif // HAS_ARGBATTENUATEROW_SSSE3 | 3683 #endif // HAS_ARGBATTENUATEROW_SSSE3 |
| 3654 | 3684 |
| 3655 #ifdef HAS_ARGBATTENUATEROW_AVX2 | 3685 #ifdef HAS_ARGBATTENUATEROW_AVX2 |
| 3656 // Shuffle table duplicating alpha. | 3686 // Shuffle table duplicating alpha. |
| 3657 static const uvec8 kShuffleAlpha_AVX2 = { | 3687 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, |
| 3658 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u | 3688 128u, 128u, 14u, 15u, 14u, 15u, |
| 3659 }; | 3689 14u, 15u, 128u, 128u}; |
| 3660 // Attenuate 8 pixels at a time. | 3690 // Attenuate 8 pixels at a time. |
| 3661 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { | 3691 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
| 3662 asm volatile ( | 3692 asm volatile ( |
| 3663 "vbroadcastf128 %3,%%ymm4 \n" | 3693 "vbroadcastf128 %3,%%ymm4 \n" |
| 3664 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3694 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 3665 "vpslld $0x18,%%ymm5,%%ymm5 \n" | 3695 "vpslld $0x18,%%ymm5,%%ymm5 \n" |
| 3666 "sub %0,%1 \n" | 3696 "sub %0,%1 \n" |
| 3667 | 3697 |
| 3668 // 8 pixel loop. | 3698 // 8 pixel loop. |
| 3669 LABELALIGN | 3699 LABELALIGN |
| (...skipping 20 matching lines...) Expand all Loading... |
| 3690 "+r"(width) // %2 | 3720 "+r"(width) // %2 |
| 3691 : "m"(kShuffleAlpha_AVX2) // %3 | 3721 : "m"(kShuffleAlpha_AVX2) // %3 |
| 3692 : "memory", "cc" | 3722 : "memory", "cc" |
| 3693 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 3723 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 3694 ); | 3724 ); |
| 3695 } | 3725 } |
| 3696 #endif // HAS_ARGBATTENUATEROW_AVX2 | 3726 #endif // HAS_ARGBATTENUATEROW_AVX2 |
| 3697 | 3727 |
| 3698 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 | 3728 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
| 3699 // Unattenuate 4 pixels at a time. | 3729 // Unattenuate 4 pixels at a time. |
| 3700 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 3730 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, |
| 3731 uint8* dst_argb, |
| 3701 int width) { | 3732 int width) { |
| 3702 uintptr_t alpha; | 3733 uintptr_t alpha; |
| 3703 asm volatile ( | 3734 asm volatile ( |
| 3704 // 4 pixel loop. | 3735 // 4 pixel loop. |
| 3705 LABELALIGN | 3736 LABELALIGN |
| 3706 "1: \n" | 3737 "1: \n" |
| 3707 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 3738 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 3708 "movzb " MEMACCESS2(0x03,0) ",%3 \n" | 3739 "movzb " MEMACCESS2(0x03,0) ",%3 \n" |
| 3709 "punpcklbw %%xmm0,%%xmm0 \n" | 3740 "punpcklbw %%xmm0,%%xmm0 \n" |
| 3710 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 | 3741 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 |
| (...skipping 26 matching lines...) Expand all Loading... |
| 3737 : "r"(fixed_invtbl8) // %4 | 3768 : "r"(fixed_invtbl8) // %4 |
| 3738 : "memory", "cc", NACL_R14 | 3769 : "memory", "cc", NACL_R14 |
| 3739 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3770 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 3740 ); | 3771 ); |
| 3741 } | 3772 } |
| 3742 #endif // HAS_ARGBUNATTENUATEROW_SSE2 | 3773 #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
| 3743 | 3774 |
| 3744 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 | 3775 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
| 3745 // Shuffle table duplicating alpha. | 3776 // Shuffle table duplicating alpha. |
| 3746 static const uvec8 kUnattenShuffleAlpha_AVX2 = { | 3777 static const uvec8 kUnattenShuffleAlpha_AVX2 = { |
| 3747 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u | 3778 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; |
| 3748 }; | |
| 3749 // Unattenuate 8 pixels at a time. | 3779 // Unattenuate 8 pixels at a time. |
| 3750 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 3780 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, |
| 3781 uint8* dst_argb, |
| 3751 int width) { | 3782 int width) { |
| 3752 uintptr_t alpha; | 3783 uintptr_t alpha; |
| 3753 asm volatile ( | 3784 asm volatile ( |
| 3754 "sub %0,%1 \n" | 3785 "sub %0,%1 \n" |
| 3755 "vbroadcastf128 %5,%%ymm5 \n" | 3786 "vbroadcastf128 %5,%%ymm5 \n" |
| 3756 | 3787 |
| 3757 // 8 pixel loop. | 3788 // 8 pixel loop. |
| 3758 LABELALIGN | 3789 LABELALIGN |
| 3759 "1: \n" | 3790 "1: \n" |
| 3760 // replace VPGATHER | 3791 // replace VPGATHER |
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3855 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 3886 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 3856 ); | 3887 ); |
| 3857 } | 3888 } |
| 3858 #endif // HAS_ARGBGRAYROW_SSSE3 | 3889 #endif // HAS_ARGBGRAYROW_SSSE3 |
| 3859 | 3890 |
| 3860 #ifdef HAS_ARGBSEPIAROW_SSSE3 | 3891 #ifdef HAS_ARGBSEPIAROW_SSSE3 |
| 3861 // b = (r * 35 + g * 68 + b * 17) >> 7 | 3892 // b = (r * 35 + g * 68 + b * 17) >> 7 |
| 3862 // g = (r * 45 + g * 88 + b * 22) >> 7 | 3893 // g = (r * 45 + g * 88 + b * 22) >> 7 |
| 3863 // r = (r * 50 + g * 98 + b * 24) >> 7 | 3894 // r = (r * 50 + g * 98 + b * 24) >> 7 |
| 3864 // Constant for ARGB color to sepia tone | 3895 // Constant for ARGB color to sepia tone |
| 3865 static vec8 kARGBToSepiaB = { | 3896 static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, |
| 3866 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 | 3897 17, 68, 35, 0, 17, 68, 35, 0}; |
| 3867 }; | |
| 3868 | 3898 |
| 3869 static vec8 kARGBToSepiaG = { | 3899 static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, |
| 3870 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 | 3900 22, 88, 45, 0, 22, 88, 45, 0}; |
| 3871 }; | |
| 3872 | 3901 |
| 3873 static vec8 kARGBToSepiaR = { | 3902 static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, |
| 3874 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 | 3903 24, 98, 50, 0, 24, 98, 50, 0}; |
| 3875 }; | |
| 3876 | 3904 |
| 3877 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 3905 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
| 3878 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { | 3906 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
| 3879 asm volatile ( | 3907 asm volatile ( |
| 3880 "movdqa %2,%%xmm2 \n" | 3908 "movdqa %2,%%xmm2 \n" |
| 3881 "movdqa %3,%%xmm3 \n" | 3909 "movdqa %3,%%xmm3 \n" |
| 3882 "movdqa %4,%%xmm4 \n" | 3910 "movdqa %4,%%xmm4 \n" |
| 3883 | 3911 |
| 3884 // 8 pixel loop. | 3912 // 8 pixel loop. |
| 3885 LABELALIGN | 3913 LABELALIGN |
| (...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3928 "m"(kARGBToSepiaR) // %4 | 3956 "m"(kARGBToSepiaR) // %4 |
| 3929 : "memory", "cc" | 3957 : "memory", "cc" |
| 3930 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 3958 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 3931 ); | 3959 ); |
| 3932 } | 3960 } |
| 3933 #endif // HAS_ARGBSEPIAROW_SSSE3 | 3961 #endif // HAS_ARGBSEPIAROW_SSSE3 |
| 3934 | 3962 |
| 3935 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 | 3963 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 3936 // Transform 8 ARGB pixels (32 bytes) with color matrix. | 3964 // Transform 8 ARGB pixels (32 bytes) with color matrix. |
| 3937 // Same as Sepia except matrix is provided. | 3965 // Same as Sepia except matrix is provided. |
| 3938 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 3966 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, |
| 3939 const int8* matrix_argb, int width) { | 3967 uint8* dst_argb, |
| 3968 const int8* matrix_argb, |
| 3969 int width) { |
| 3940 asm volatile ( | 3970 asm volatile ( |
| 3941 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 3971 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
| 3942 "pshufd $0x00,%%xmm5,%%xmm2 \n" | 3972 "pshufd $0x00,%%xmm5,%%xmm2 \n" |
| 3943 "pshufd $0x55,%%xmm5,%%xmm3 \n" | 3973 "pshufd $0x55,%%xmm5,%%xmm3 \n" |
| 3944 "pshufd $0xaa,%%xmm5,%%xmm4 \n" | 3974 "pshufd $0xaa,%%xmm5,%%xmm4 \n" |
| 3945 "pshufd $0xff,%%xmm5,%%xmm5 \n" | 3975 "pshufd $0xff,%%xmm5,%%xmm5 \n" |
| 3946 | 3976 |
| 3947 // 8 pixel loop. | 3977 // 8 pixel loop. |
| 3948 LABELALIGN | 3978 LABELALIGN |
| 3949 "1: \n" | 3979 "1: \n" |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3991 "+r"(width) // %2 | 4021 "+r"(width) // %2 |
| 3992 : "r"(matrix_argb) // %3 | 4022 : "r"(matrix_argb) // %3 |
| 3993 : "memory", "cc" | 4023 : "memory", "cc" |
| 3994 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4024 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 3995 ); | 4025 ); |
| 3996 } | 4026 } |
| 3997 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 | 4027 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
| 3998 | 4028 |
| 3999 #ifdef HAS_ARGBQUANTIZEROW_SSE2 | 4029 #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
| 4000 // Quantize 4 ARGB pixels (16 bytes). | 4030 // Quantize 4 ARGB pixels (16 bytes). |
| 4001 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, | 4031 void ARGBQuantizeRow_SSE2(uint8* dst_argb, |
| 4002 int interval_offset, int width) { | 4032 int scale, |
| 4033 int interval_size, |
| 4034 int interval_offset, |
| 4035 int width) { |
| 4003 asm volatile ( | 4036 asm volatile ( |
| 4004 "movd %2,%%xmm2 \n" | 4037 "movd %2,%%xmm2 \n" |
| 4005 "movd %3,%%xmm3 \n" | 4038 "movd %3,%%xmm3 \n" |
| 4006 "movd %4,%%xmm4 \n" | 4039 "movd %4,%%xmm4 \n" |
| 4007 "pshuflw $0x40,%%xmm2,%%xmm2 \n" | 4040 "pshuflw $0x40,%%xmm2,%%xmm2 \n" |
| 4008 "pshufd $0x44,%%xmm2,%%xmm2 \n" | 4041 "pshufd $0x44,%%xmm2,%%xmm2 \n" |
| 4009 "pshuflw $0x40,%%xmm3,%%xmm3 \n" | 4042 "pshuflw $0x40,%%xmm3,%%xmm3 \n" |
| 4010 "pshufd $0x44,%%xmm3,%%xmm3 \n" | 4043 "pshufd $0x44,%%xmm3,%%xmm3 \n" |
| 4011 "pshuflw $0x40,%%xmm4,%%xmm4 \n" | 4044 "pshuflw $0x40,%%xmm4,%%xmm4 \n" |
| 4012 "pshufd $0x44,%%xmm4,%%xmm4 \n" | 4045 "pshufd $0x44,%%xmm4,%%xmm4 \n" |
| (...skipping 28 matching lines...) Expand all Loading... |
| 4041 "r"(interval_size), // %3 | 4074 "r"(interval_size), // %3 |
| 4042 "r"(interval_offset) // %4 | 4075 "r"(interval_offset) // %4 |
| 4043 : "memory", "cc" | 4076 : "memory", "cc" |
| 4044 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4077 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 4045 ); | 4078 ); |
| 4046 } | 4079 } |
| 4047 #endif // HAS_ARGBQUANTIZEROW_SSE2 | 4080 #endif // HAS_ARGBQUANTIZEROW_SSE2 |
| 4048 | 4081 |
| 4049 #ifdef HAS_ARGBSHADEROW_SSE2 | 4082 #ifdef HAS_ARGBSHADEROW_SSE2 |
| 4050 // Shade 4 pixels at a time by specified value. | 4083 // Shade 4 pixels at a time by specified value. |
| 4051 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, | 4084 void ARGBShadeRow_SSE2(const uint8* src_argb, |
| 4085 uint8* dst_argb, |
| 4086 int width, |
| 4052 uint32 value) { | 4087 uint32 value) { |
| 4053 asm volatile ( | 4088 asm volatile ( |
| 4054 "movd %3,%%xmm2 \n" | 4089 "movd %3,%%xmm2 \n" |
| 4055 "punpcklbw %%xmm2,%%xmm2 \n" | 4090 "punpcklbw %%xmm2,%%xmm2 \n" |
| 4056 "punpcklqdq %%xmm2,%%xmm2 \n" | 4091 "punpcklqdq %%xmm2,%%xmm2 \n" |
| 4057 | 4092 |
| 4058 // 4 pixel loop. | 4093 // 4 pixel loop. |
| 4059 LABELALIGN | 4094 LABELALIGN |
| 4060 "1: \n" | 4095 "1: \n" |
| 4061 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4096 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| (...skipping 15 matching lines...) Expand all Loading... |
| 4077 "+r"(width) // %2 | 4112 "+r"(width) // %2 |
| 4078 : "r"(value) // %3 | 4113 : "r"(value) // %3 |
| 4079 : "memory", "cc" | 4114 : "memory", "cc" |
| 4080 , "xmm0", "xmm1", "xmm2" | 4115 , "xmm0", "xmm1", "xmm2" |
| 4081 ); | 4116 ); |
| 4082 } | 4117 } |
| 4083 #endif // HAS_ARGBSHADEROW_SSE2 | 4118 #endif // HAS_ARGBSHADEROW_SSE2 |
| 4084 | 4119 |
| 4085 #ifdef HAS_ARGBMULTIPLYROW_SSE2 | 4120 #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
| 4086 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. | 4121 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
| 4087 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4122 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, |
| 4088 uint8* dst_argb, int width) { | 4123 const uint8* src_argb1, |
| 4124 uint8* dst_argb, |
| 4125 int width) { |
| 4089 asm volatile ( | 4126 asm volatile ( |
| 4090 "pxor %%xmm5,%%xmm5 \n" | 4127 "pxor %%xmm5,%%xmm5 \n" |
| 4091 | 4128 |
| 4092 // 4 pixel loop. | 4129 // 4 pixel loop. |
| 4093 LABELALIGN | 4130 LABELALIGN |
| 4094 "1: \n" | 4131 "1: \n" |
| 4095 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4132 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4096 "lea " MEMLEA(0x10,0) ",%0 \n" | 4133 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 4097 "movdqu " MEMACCESS(1) ",%%xmm2 \n" | 4134 "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
| 4098 "lea " MEMLEA(0x10,1) ",%1 \n" | 4135 "lea " MEMLEA(0x10,1) ",%1 \n" |
| (...skipping 16 matching lines...) Expand all Loading... |
| 4115 "+r"(width) // %3 | 4152 "+r"(width) // %3 |
| 4116 : | 4153 : |
| 4117 : "memory", "cc" | 4154 : "memory", "cc" |
| 4118 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4155 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 4119 ); | 4156 ); |
| 4120 } | 4157 } |
| 4121 #endif // HAS_ARGBMULTIPLYROW_SSE2 | 4158 #endif // HAS_ARGBMULTIPLYROW_SSE2 |
| 4122 | 4159 |
| 4123 #ifdef HAS_ARGBMULTIPLYROW_AVX2 | 4160 #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
| 4124 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 4161 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
| 4125 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4162 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, |
| 4126 uint8* dst_argb, int width) { | 4163 const uint8* src_argb1, |
| 4164 uint8* dst_argb, |
| 4165 int width) { |
| 4127 asm volatile ( | 4166 asm volatile ( |
| 4128 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" | 4167 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" |
| 4129 | 4168 |
| 4130 // 8 pixel loop. | 4169 // 8 pixel loop. |
| 4131 LABELALIGN | 4170 LABELALIGN |
| 4132 "1: \n" | 4171 "1: \n" |
| 4133 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" | 4172 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" |
| 4134 "lea " MEMLEA(0x20,0) ",%0 \n" | 4173 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 4135 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" | 4174 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" |
| 4136 "lea " MEMLEA(0x20,1) ",%1 \n" | 4175 "lea " MEMLEA(0x20,1) ",%1 \n" |
| (...skipping 17 matching lines...) Expand all Loading... |
| 4154 : "memory", "cc" | 4193 : "memory", "cc" |
| 4155 #if defined(__AVX2__) | 4194 #if defined(__AVX2__) |
| 4156 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4195 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 4157 #endif | 4196 #endif |
| 4158 ); | 4197 ); |
| 4159 } | 4198 } |
| 4160 #endif // HAS_ARGBMULTIPLYROW_AVX2 | 4199 #endif // HAS_ARGBMULTIPLYROW_AVX2 |
| 4161 | 4200 |
| 4162 #ifdef HAS_ARGBADDROW_SSE2 | 4201 #ifdef HAS_ARGBADDROW_SSE2 |
| 4163 // Add 2 rows of ARGB pixels together with unsigned byte saturation (paddusb), 4 pixels (16 bytes) at a time. | 4202 // Add 2 rows of ARGB pixels together with unsigned byte saturation (paddusb), 4 pixels (16 bytes) at a time. |
| 4164 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4203 void ARGBAddRow_SSE2(const uint8* src_argb0, |
| 4165 uint8* dst_argb, int width) { | 4204 const uint8* src_argb1, |
| 4205 uint8* dst_argb, |
| 4206 int width) { |
| 4166 asm volatile ( | 4207 asm volatile ( |
| 4167 // 4 pixel loop: repeats while width, reduced by 4 each pass, is still > 0. | 4208 // 4 pixel loop: repeats while width, reduced by 4 each pass, is still > 0. |
| 4168 LABELALIGN | 4209 LABELALIGN |
| 4169 "1: \n" | 4210 "1: \n" |
| 4170 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4211 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4171 "lea " MEMLEA(0x10,0) ",%0 \n" | 4212 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 4172 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | 4213 "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
| 4173 "lea " MEMLEA(0x10,1) ",%1 \n" | 4214 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 4174 "paddusb %%xmm1,%%xmm0 \n" | 4215 "paddusb %%xmm1,%%xmm0 \n" |
| 4175 "movdqu %%xmm0," MEMACCESS(2) " \n" | 4216 "movdqu %%xmm0," MEMACCESS(2) " \n" |
| 4176 "lea " MEMLEA(0x10,2) ",%2 \n" | 4217 "lea " MEMLEA(0x10,2) ",%2 \n" |
| 4177 "sub $0x4,%3 \n" | 4218 "sub $0x4,%3 \n" |
| 4178 "jg 1b \n" | 4219 "jg 1b \n" |
| 4179 : "+r"(src_argb0), // %0 | 4220 : "+r"(src_argb0), // %0 |
| 4180 "+r"(src_argb1), // %1 | 4221 "+r"(src_argb1), // %1 |
| 4181 "+r"(dst_argb), // %2 | 4222 "+r"(dst_argb), // %2 |
| 4182 "+r"(width) // %3 | 4223 "+r"(width) // %3 |
| 4183 : | 4224 : |
| 4184 : "memory", "cc" | 4225 : "memory", "cc" |
| 4185 , "xmm0", "xmm1" | 4226 , "xmm0", "xmm1" |
| 4186 ); | 4227 ); |
| 4187 } | 4228 } |
| 4188 #endif // HAS_ARGBADDROW_SSE2 | 4229 #endif // HAS_ARGBADDROW_SSE2 |
| 4189 | 4230 |
| 4190 #ifdef HAS_ARGBADDROW_AVX2 | 4231 #ifdef HAS_ARGBADDROW_AVX2 |
| 4191 // Add 2 rows of ARGB pixels together with unsigned byte saturation (vpaddusb), 8 pixels (32 bytes) at a time. | 4232 // Add 2 rows of ARGB pixels together with unsigned byte saturation (vpaddusb), 8 pixels (32 bytes) at a time. |
| 4192 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4233 void ARGBAddRow_AVX2(const uint8* src_argb0, |
| 4193 uint8* dst_argb, int width) { | 4234 const uint8* src_argb1, |
| 4235 uint8* dst_argb, |
| 4236 int width) { |
| 4194 asm volatile ( | 4237 asm volatile ( |
| 4195 // 8 pixel loop: repeats while width, reduced by 8 each pass, is still > 0. | 4238 // 8 pixel loop: repeats while width, reduced by 8 each pass, is still > 0. |
| 4196 LABELALIGN | 4239 LABELALIGN |
| 4197 "1: \n" | 4240 "1: \n" |
| 4198 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 4241 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 4199 "lea " MEMLEA(0x20,0) ",%0 \n" | 4242 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 4200 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | 4243 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" |
| 4201 "lea " MEMLEA(0x20,1) ",%1 \n" | 4244 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 4202 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | 4245 "vmovdqu %%ymm0," MEMACCESS(2) " \n" |
| 4203 "lea " MEMLEA(0x20,2) ",%2 \n" | 4246 "lea " MEMLEA(0x20,2) ",%2 \n" |
| 4204 "sub $0x8,%3 \n" | 4247 "sub $0x8,%3 \n" |
| 4205 "jg 1b \n" | 4248 "jg 1b \n" |
| 4206 "vzeroupper \n" | 4249 "vzeroupper \n" |
| 4207 : "+r"(src_argb0), // %0 | 4250 : "+r"(src_argb0), // %0 |
| 4208 "+r"(src_argb1), // %1 | 4251 "+r"(src_argb1), // %1 |
| 4209 "+r"(dst_argb), // %2 | 4252 "+r"(dst_argb), // %2 |
| 4210 "+r"(width) // %3 | 4253 "+r"(width) // %3 |
| 4211 : | 4254 : |
| 4212 : "memory", "cc" | 4255 : "memory", "cc" |
| 4213 , "xmm0" | 4256 , "xmm0" |
| 4214 ); | 4257 ); |
| 4215 } | 4258 } |
| 4216 #endif // HAS_ARGBADDROW_AVX2 | 4259 #endif // HAS_ARGBADDROW_AVX2 |
| 4217 | 4260 |
| 4218 #ifdef HAS_ARGBSUBTRACTROW_SSE2 | 4261 #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
| 4219 // Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned byte saturation (psubusb), 4 pixels (16 bytes) at a time. | 4262 // Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned byte saturation (psubusb), 4 pixels (16 bytes) at a time. |
| 4220 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | 4263 void ARGBSubtractRow_SSE2(const uint8* src_argb0, |
| 4221 uint8* dst_argb, int width) { | 4264 const uint8* src_argb1, |
| 4265 uint8* dst_argb, |
| 4266 int width) { |
| 4222 asm volatile ( | 4267 asm volatile ( |
| 4223 // 4 pixel loop: repeats while width, reduced by 4 each pass, is still > 0. | 4268 // 4 pixel loop: repeats while width, reduced by 4 each pass, is still > 0. |
| 4224 LABELALIGN | 4269 LABELALIGN |
| 4225 "1: \n" | 4270 "1: \n" |
| 4226 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4271 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4227 "lea " MEMLEA(0x10,0) ",%0 \n" | 4272 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 4228 "movdqu " MEMACCESS(1) ",%%xmm1 \n" | 4273 "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
| 4229 "lea " MEMLEA(0x10,1) ",%1 \n" | 4274 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 4230 "psubusb %%xmm1,%%xmm0 \n" | 4275 "psubusb %%xmm1,%%xmm0 \n" |
| 4231 "movdqu %%xmm0," MEMACCESS(2) " \n" | 4276 "movdqu %%xmm0," MEMACCESS(2) " \n" |
| 4232 "lea " MEMLEA(0x10,2) ",%2 \n" | 4277 "lea " MEMLEA(0x10,2) ",%2 \n" |
| 4233 "sub $0x4,%3 \n" | 4278 "sub $0x4,%3 \n" |
| 4234 "jg 1b \n" | 4279 "jg 1b \n" |
| 4235 : "+r"(src_argb0), // %0 | 4280 : "+r"(src_argb0), // %0 |
| 4236 "+r"(src_argb1), // %1 | 4281 "+r"(src_argb1), // %1 |
| 4237 "+r"(dst_argb), // %2 | 4282 "+r"(dst_argb), // %2 |
| 4238 "+r"(width) // %3 | 4283 "+r"(width) // %3 |
| 4239 : | 4284 : |
| 4240 : "memory", "cc" | 4285 : "memory", "cc" |
| 4241 , "xmm0", "xmm1" | 4286 , "xmm0", "xmm1" |
| 4242 ); | 4287 ); |
| 4243 } | 4288 } |
| 4244 #endif // HAS_ARGBSUBTRACTROW_SSE2 | 4289 #endif // HAS_ARGBSUBTRACTROW_SSE2 |
| 4245 | 4290 |
| 4246 #ifdef HAS_ARGBSUBTRACTROW_AVX2 | 4291 #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
| 4247 // Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned byte saturation (vpsubusb), 8 pixels (32 bytes) at a time. | 4292 // Subtract 2 rows of ARGB pixels (src_argb0 - src_argb1) with unsigned byte saturation (vpsubusb), 8 pixels (32 bytes) at a time. |
| 4248 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, | 4293 void ARGBSubtractRow_AVX2(const uint8* src_argb0, |
| 4249 uint8* dst_argb, int width) { | 4294 const uint8* src_argb1, |
| 4295 uint8* dst_argb, |
| 4296 int width) { |
| 4250 asm volatile ( | 4297 asm volatile ( |
| 4251 // 8 pixel loop: repeats while width, reduced by 8 each pass, is still > 0. | 4298 // 8 pixel loop: repeats while width, reduced by 8 each pass, is still > 0. |
| 4252 LABELALIGN | 4299 LABELALIGN |
| 4253 "1: \n" | 4300 "1: \n" |
| 4254 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 4301 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 4255 "lea " MEMLEA(0x20,0) ",%0 \n" | 4302 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 4256 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" | 4303 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" |
| 4257 "lea " MEMLEA(0x20,1) ",%1 \n" | 4304 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 4258 "vmovdqu %%ymm0," MEMACCESS(2) " \n" | 4305 "vmovdqu %%ymm0," MEMACCESS(2) " \n" |
| 4259 "lea " MEMLEA(0x20,2) ",%2 \n" | 4306 "lea " MEMLEA(0x20,2) ",%2 \n" |
| 4260 "sub $0x8,%3 \n" | 4307 "sub $0x8,%3 \n" |
| 4261 "jg 1b \n" | 4308 "jg 1b \n" |
| 4262 "vzeroupper \n" | 4309 "vzeroupper \n" |
| 4263 : "+r"(src_argb0), // %0 | 4310 : "+r"(src_argb0), // %0 |
| 4264 "+r"(src_argb1), // %1 | 4311 "+r"(src_argb1), // %1 |
| 4265 "+r"(dst_argb), // %2 | 4312 "+r"(dst_argb), // %2 |
| 4266 "+r"(width) // %3 | 4313 "+r"(width) // %3 |
| 4267 : | 4314 : |
| 4268 : "memory", "cc" | 4315 : "memory", "cc" |
| 4269 , "xmm0" | 4316 , "xmm0" |
| 4270 ); | 4317 ); |
| 4271 } | 4318 } |
| 4272 #endif // HAS_ARGBSUBTRACTROW_AVX2 | 4319 #endif // HAS_ARGBSUBTRACTROW_AVX2 |
| 4273 | 4320 |
| 4274 #ifdef HAS_SOBELXROW_SSE2 | 4321 #ifdef HAS_SOBELXROW_SSE2 |
| 4275 // SobelX as a matrix is | 4322 // SobelX as a matrix is |
| 4276 // -1 0 1 | 4323 // -1 0 1 |
| 4277 // -2 0 2 | 4324 // -2 0 2 |
| 4278 // -1 0 1 | 4325 // -1 0 1 |
| 4279 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4326 void SobelXRow_SSE2(const uint8* src_y0, |
| 4280 const uint8* src_y2, uint8* dst_sobelx, int width) { | 4327 const uint8* src_y1, |
| 4328 const uint8* src_y2, |
| 4329 uint8* dst_sobelx, |
| 4330 int width) { |
| 4281 asm volatile ( | 4331 asm volatile ( |
| 4282 "sub %0,%1 \n" | 4332 "sub %0,%1 \n" |
| 4283 "sub %0,%2 \n" | 4333 "sub %0,%2 \n" |
| 4284 "sub %0,%3 \n" | 4334 "sub %0,%3 \n" |
| 4285 "pxor %%xmm5,%%xmm5 \n" | 4335 "pxor %%xmm5,%%xmm5 \n" |
| 4286 | 4336 |
| 4287 // 8 pixel loop. | 4337 // 8 pixel loop. |
| 4288 LABELALIGN | 4338 LABELALIGN |
| 4289 "1: \n" | 4339 "1: \n" |
| 4290 "movq " MEMACCESS(0) ",%%xmm0 \n" | 4340 "movq " MEMACCESS(0) ",%%xmm0 \n" |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4323 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4373 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 4324 ); | 4374 ); |
| 4325 } | 4375 } |
| 4326 #endif // HAS_SOBELXROW_SSE2 | 4376 #endif // HAS_SOBELXROW_SSE2 |
| 4327 | 4377 |
| 4328 #ifdef HAS_SOBELYROW_SSE2 | 4378 #ifdef HAS_SOBELYROW_SSE2 |
| 4329 // SobelY as a matrix is | 4379 // SobelY as a matrix is |
| 4330 // -1 -2 -1 | 4380 // -1 -2 -1 |
| 4331 // 0 0 0 | 4381 // 0 0 0 |
| 4332 // 1 2 1 | 4382 // 1 2 1 |
| 4333 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, | 4383 void SobelYRow_SSE2(const uint8* src_y0, |
| 4334 uint8* dst_sobely, int width) { | 4384 const uint8* src_y1, |
| 4385 uint8* dst_sobely, |
| 4386 int width) { |
| 4335 asm volatile ( | 4387 asm volatile ( |
| 4336 "sub %0,%1 \n" | 4388 "sub %0,%1 \n" |
| 4337 "sub %0,%2 \n" | 4389 "sub %0,%2 \n" |
| 4338 "pxor %%xmm5,%%xmm5 \n" | 4390 "pxor %%xmm5,%%xmm5 \n" |
| 4339 | 4391 |
| 4340 // 8 pixel loop. | 4392 // 8 pixel loop. |
| 4341 LABELALIGN | 4393 LABELALIGN |
| 4342 "1: \n" | 4394 "1: \n" |
| 4343 "movq " MEMACCESS(0) ",%%xmm0 \n" | 4395 "movq " MEMACCESS(0) ",%%xmm0 \n" |
| 4344 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 | 4396 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4376 ); | 4428 ); |
| 4377 } | 4429 } |
| 4378 #endif // HAS_SOBELYROW_SSE2 | 4430 #endif // HAS_SOBELYROW_SSE2 |
| 4379 | 4431 |
| 4380 #ifdef HAS_SOBELROW_SSE2 | 4432 #ifdef HAS_SOBELROW_SSE2 |
| 4381 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 4433 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
| 4382 // A = 255 | 4434 // A = 255 |
| 4383 // R = Sobel | 4435 // R = Sobel |
| 4384 // G = Sobel | 4436 // G = Sobel |
| 4385 // B = Sobel | 4437 // B = Sobel |
| 4386 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 4438 void SobelRow_SSE2(const uint8* src_sobelx, |
| 4387 uint8* dst_argb, int width) { | 4439 const uint8* src_sobely, |
| 4440 uint8* dst_argb, |
| 4441 int width) { |
| 4388 asm volatile ( | 4442 asm volatile ( |
| 4389 "sub %0,%1 \n" | 4443 "sub %0,%1 \n" |
| 4390 "pcmpeqb %%xmm5,%%xmm5 \n" | 4444 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 4391 "pslld $0x18,%%xmm5 \n" | 4445 "pslld $0x18,%%xmm5 \n" |
| 4392 | 4446 |
| 4393 // 8 pixel loop. | 4447 // 8 pixel loop. |
| 4394 LABELALIGN | 4448 LABELALIGN |
| 4395 "1: \n" | 4449 "1: \n" |
| 4396 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4450 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4397 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 4451 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
| (...skipping 25 matching lines...) Expand all Loading... |
| 4423 "+r"(width) // %3 | 4477 "+r"(width) // %3 |
| 4424 : | 4478 : |
| 4425 : "memory", "cc", NACL_R14 | 4479 : "memory", "cc", NACL_R14 |
| 4426 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 4480 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 4427 ); | 4481 ); |
| 4428 } | 4482 } |
| 4429 #endif // HAS_SOBELROW_SSE2 | 4483 #endif // HAS_SOBELROW_SSE2 |
| 4430 | 4484 |
| 4431 #ifdef HAS_SOBELTOPLANEROW_SSE2 | 4485 #ifdef HAS_SOBELTOPLANEROW_SSE2 |
| 4432 // Adds Sobel X and Sobel Y and stores Sobel into a plane. | 4486 // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
| 4433 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 4487 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, |
| 4434 uint8* dst_y, int width) { | 4488 const uint8* src_sobely, |
| 4489 uint8* dst_y, |
| 4490 int width) { |
| 4435 asm volatile ( | 4491 asm volatile ( |
| 4436 "sub %0,%1 \n" | 4492 "sub %0,%1 \n" |
| 4437 "pcmpeqb %%xmm5,%%xmm5 \n" | 4493 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 4438 "pslld $0x18,%%xmm5 \n" | 4494 "pslld $0x18,%%xmm5 \n" |
| 4439 | 4495 |
| 4440 // 8 pixel loop. | 4496 // 8 pixel loop. |
| 4441 LABELALIGN | 4497 LABELALIGN |
| 4442 "1: \n" | 4498 "1: \n" |
| 4443 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4499 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4444 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 4500 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
| (...skipping 13 matching lines...) Expand all Loading... |
| 4458 ); | 4514 ); |
| 4459 } | 4515 } |
| 4460 #endif // HAS_SOBELTOPLANEROW_SSE2 | 4516 #endif // HAS_SOBELTOPLANEROW_SSE2 |
| 4461 | 4517 |
| 4462 #ifdef HAS_SOBELXYROW_SSE2 | 4518 #ifdef HAS_SOBELXYROW_SSE2 |
| 4463 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 4519 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
| 4464 // A = 255 | 4520 // A = 255 |
| 4465 // R = Sobel X | 4521 // R = Sobel X |
| 4466 // G = Sobel | 4522 // G = Sobel |
| 4467 // B = Sobel Y | 4523 // B = Sobel Y |
| 4468 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, | 4524 void SobelXYRow_SSE2(const uint8* src_sobelx, |
| 4469 uint8* dst_argb, int width) { | 4525 const uint8* src_sobely, |
| 4526 uint8* dst_argb, |
| 4527 int width) { |
| 4470 asm volatile ( | 4528 asm volatile ( |
| 4471 "sub %0,%1 \n" | 4529 "sub %0,%1 \n" |
| 4472 "pcmpeqb %%xmm5,%%xmm5 \n" | 4530 "pcmpeqb %%xmm5,%%xmm5 \n" |
| 4473 | 4531 |
| 4474 // 8 pixel loop. | 4532 // 8 pixel loop. |
| 4475 LABELALIGN | 4533 LABELALIGN |
| 4476 "1: \n" | 4534 "1: \n" |
| 4477 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 4535 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4478 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 | 4536 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
| 4479 "lea " MEMLEA(0x10,0) ",%0 \n" | 4537 "lea " MEMLEA(0x10,0) ",%0 \n" |
| (...skipping 25 matching lines...) Expand all Loading... |
| 4505 : | 4563 : |
| 4506 : "memory", "cc", NACL_R14 | 4564 : "memory", "cc", NACL_R14 |
| 4507 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4565 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 4508 ); | 4566 ); |
| 4509 } | 4567 } |
| 4510 #endif // HAS_SOBELXYROW_SSE2 | 4568 #endif // HAS_SOBELXYROW_SSE2 |
| 4511 | 4569 |
| 4512 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 | 4570 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| 4513 // Creates a table of cumulative sums where each value is a sum of all values | 4571 // Creates a table of cumulative sums where each value is a sum of all values |
| 4514 // above and to the left of the value, inclusive of the value. | 4572 // above and to the left of the value, inclusive of the value. |
| 4515 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, | 4573 void ComputeCumulativeSumRow_SSE2(const uint8* row, |
| 4516 const int32* previous_cumsum, int width) { | 4574 int32* cumsum, |
| 4575 const int32* previous_cumsum, |
| 4576 int width) { |
| 4517 asm volatile ( | 4577 asm volatile ( |
| 4518 "pxor %%xmm0,%%xmm0 \n" | 4578 "pxor %%xmm0,%%xmm0 \n" |
| 4519 "pxor %%xmm1,%%xmm1 \n" | 4579 "pxor %%xmm1,%%xmm1 \n" |
| 4520 "sub $0x4,%3 \n" | 4580 "sub $0x4,%3 \n" |
| 4521 "jl 49f \n" | 4581 "jl 49f \n" |
| 4522 "test $0xf,%1 \n" | 4582 "test $0xf,%1 \n" |
| 4523 "jne 49f \n" | 4583 "jne 49f \n" |
| 4524 | 4584 |
| 4525 // 4 pixel loop. | 4585 // 4 pixel loop. |
| 4526 LABELALIGN | 4586 LABELALIGN |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4583 "+r"(previous_cumsum), // %2 | 4643 "+r"(previous_cumsum), // %2 |
| 4584 "+r"(width) // %3 | 4644 "+r"(width) // %3 |
| 4585 : | 4645 : |
| 4586 : "memory", "cc" | 4646 : "memory", "cc" |
| 4587 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 4647 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 4588 ); | 4648 ); |
| 4589 } | 4649 } |
| 4590 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 | 4650 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
| 4591 | 4651 |
| 4592 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 4652 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
| 4593 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, | 4653 void CumulativeSumToAverageRow_SSE2(const int32* topleft, |
| 4594 int width, int area, uint8* dst, | 4654 const int32* botleft, |
| 4655 int width, |
| 4656 int area, |
| 4657 uint8* dst, |
| 4595 int count) { | 4658 int count) { |
| 4596 asm volatile ( | 4659 asm volatile ( |
| 4597 "movd %5,%%xmm5 \n" | 4660 "movd %5,%%xmm5 \n" |
| 4598 "cvtdq2ps %%xmm5,%%xmm5 \n" | 4661 "cvtdq2ps %%xmm5,%%xmm5 \n" |
| 4599 "rcpss %%xmm5,%%xmm4 \n" | 4662 "rcpss %%xmm5,%%xmm4 \n" |
| 4600 "pshufd $0x0,%%xmm4,%%xmm4 \n" | 4663 "pshufd $0x0,%%xmm4,%%xmm4 \n" |
| 4601 "sub $0x4,%3 \n" | 4664 "sub $0x4,%3 \n" |
| 4602 "jl 49f \n" | 4665 "jl 49f \n" |
| 4603 "cmpl $0x80,%5 \n" | 4666 "cmpl $0x80,%5 \n" |
| 4604 "ja 40f \n" | 4667 "ja 40f \n" |
| (...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4716 "rm"(area) // %5 | 4779 "rm"(area) // %5 |
| 4717 : "memory", "cc", NACL_R14 | 4780 : "memory", "cc", NACL_R14 |
| 4718 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 4781 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 4719 ); | 4782 ); |
| 4720 } | 4783 } |
| 4721 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 | 4784 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
| 4722 | 4785 |
| 4723 #ifdef HAS_ARGBAFFINEROW_SSE2 | 4786 #ifdef HAS_ARGBAFFINEROW_SSE2 |
| 4724 // Copy ARGB pixels from source image with slope to a row of destination. | 4787 // Copy ARGB pixels from source image with slope to a row of destination. |
| 4725 LIBYUV_API | 4788 LIBYUV_API |
| 4726 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, | 4789 void ARGBAffineRow_SSE2(const uint8* src_argb, |
| 4727 uint8* dst_argb, const float* src_dudv, int width) { | 4790 int src_argb_stride, |
| 4791 uint8* dst_argb, |
| 4792 const float* src_dudv, |
| 4793 int width) { |
| 4728 intptr_t src_argb_stride_temp = src_argb_stride; | 4794 intptr_t src_argb_stride_temp = src_argb_stride; |
| 4729 intptr_t temp; | 4795 intptr_t temp; |
| 4730 asm volatile ( | 4796 asm volatile ( |
| 4731 "movq " MEMACCESS(3) ",%%xmm2 \n" | 4797 "movq " MEMACCESS(3) ",%%xmm2 \n" |
| 4732 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" | 4798 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" |
| 4733 "shl $0x10,%1 \n" | 4799 "shl $0x10,%1 \n" |
| 4734 "add $0x4,%1 \n" | 4800 "add $0x4,%1 \n" |
| 4735 "movd %1,%%xmm5 \n" | 4801 "movd %1,%%xmm5 \n" |
| 4736 "sub $0x4,%4 \n" | 4802 "sub $0x4,%4 \n" |
| 4737 "jl 49f \n" | 4803 "jl 49f \n" |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4801 "=&r"(temp) // %5 | 4867 "=&r"(temp) // %5 |
| 4802 : | 4868 : |
| 4803 : "memory", "cc", NACL_R14 | 4869 : "memory", "cc", NACL_R14 |
| 4804 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 4870 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 4805 ); | 4871 ); |
| 4806 } | 4872 } |
| 4807 #endif // HAS_ARGBAFFINEROW_SSE2 | 4873 #endif // HAS_ARGBAFFINEROW_SSE2 |
| 4808 | 4874 |
| 4809 #ifdef HAS_INTERPOLATEROW_SSSE3 | 4875 #ifdef HAS_INTERPOLATEROW_SSSE3 |
| 4810 // Bilinear filter 16x2 -> 16x1 | 4876 // Bilinear filter 16x2 -> 16x1 |
| 4811 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 4877 void InterpolateRow_SSSE3(uint8* dst_ptr, |
| 4812 ptrdiff_t src_stride, int dst_width, | 4878 const uint8* src_ptr, |
| 4879 ptrdiff_t src_stride, |
| 4880 int dst_width, |
| 4813 int source_y_fraction) { | 4881 int source_y_fraction) { |
| 4814 asm volatile ( | 4882 asm volatile ( |
| 4815 "sub %1,%0 \n" | 4883 "sub %1,%0 \n" |
| 4816 "cmp $0x0,%3 \n" | 4884 "cmp $0x0,%3 \n" |
| 4817 "je 100f \n" | 4885 "je 100f \n" |
| 4818 "cmp $0x80,%3 \n" | 4886 "cmp $0x80,%3 \n" |
| 4819 "je 50f \n" | 4887 "je 50f \n" |
| 4820 | 4888 |
| 4821 "movd %3,%%xmm0 \n" | 4889 "movd %3,%%xmm0 \n" |
| 4822 "neg %3 \n" | 4890 "neg %3 \n" |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4882 "+r"(source_y_fraction) // %3 | 4950 "+r"(source_y_fraction) // %3 |
| 4883 : "r"((intptr_t)(src_stride)) // %4 | 4951 : "r"((intptr_t)(src_stride)) // %4 |
| 4884 : "memory", "cc", "eax", NACL_R14 | 4952 : "memory", "cc", "eax", NACL_R14 |
| 4885 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 4953 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 4886 ); | 4954 ); |
| 4887 } | 4955 } |
| 4888 #endif // HAS_INTERPOLATEROW_SSSE3 | 4956 #endif // HAS_INTERPOLATEROW_SSSE3 |
| 4889 | 4957 |
| 4890 #ifdef HAS_INTERPOLATEROW_AVX2 | 4958 #ifdef HAS_INTERPOLATEROW_AVX2 |
| 4891 // Bilinear filter 32x2 -> 32x1 | 4959 // Bilinear filter 32x2 -> 32x1 |
| 4892 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, | 4960 void InterpolateRow_AVX2(uint8* dst_ptr, |
| 4893 ptrdiff_t src_stride, int dst_width, | 4961 const uint8* src_ptr, |
| 4962 ptrdiff_t src_stride, |
| 4963 int dst_width, |
| 4894 int source_y_fraction) { | 4964 int source_y_fraction) { |
| 4895 asm volatile ( | 4965 asm volatile ( |
| 4896 "cmp $0x0,%3 \n" | 4966 "cmp $0x0,%3 \n" |
| 4897 "je 100f \n" | 4967 "je 100f \n" |
| 4898 "sub %1,%0 \n" | 4968 "sub %1,%0 \n" |
| 4899 "cmp $0x80,%3 \n" | 4969 "cmp $0x80,%3 \n" |
| 4900 "je 50f \n" | 4970 "je 50f \n" |
| 4901 | 4971 |
| 4902 "vmovd %3,%%xmm0 \n" | 4972 "vmovd %3,%%xmm0 \n" |
| 4903 "neg %3 \n" | 4973 "neg %3 \n" |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4958 "+r"(source_y_fraction) // %3 | 5028 "+r"(source_y_fraction) // %3 |
| 4959 : "r"((intptr_t)(src_stride)) // %4 | 5029 : "r"((intptr_t)(src_stride)) // %4 |
| 4960 : "memory", "cc", "eax", NACL_R14 | 5030 : "memory", "cc", "eax", NACL_R14 |
| 4961 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" | 5031 "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" |
| 4962 ); | 5032 ); |
| 4963 } | 5033 } |
| 4964 #endif // HAS_INTERPOLATEROW_AVX2 | 5034 #endif // HAS_INTERPOLATEROW_AVX2 |
| 4965 | 5035 |
| 4966 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 | 5036 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
| 4967 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5037 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 4968 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5038 void ARGBShuffleRow_SSSE3(const uint8* src_argb, |
| 4969 const uint8* shuffler, int width) { | 5039 uint8* dst_argb, |
| 5040 const uint8* shuffler, |
| 5041 int width) { |
| 4970 asm volatile ( | 5042 asm volatile ( |
| 4971 "movdqu " MEMACCESS(3) ",%%xmm5 \n" | 5043 "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
| 4972 LABELALIGN | 5044 LABELALIGN |
| 4973 "1: \n" | 5045 "1: \n" |
| 4974 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5046 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 4975 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" | 5047 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
| 4976 "lea " MEMLEA(0x20,0) ",%0 \n" | 5048 "lea " MEMLEA(0x20,0) ",%0 \n" |
| 4977 "pshufb %%xmm5,%%xmm0 \n" | 5049 "pshufb %%xmm5,%%xmm0 \n" |
| 4978 "pshufb %%xmm5,%%xmm1 \n" | 5050 "pshufb %%xmm5,%%xmm1 \n" |
| 4979 "movdqu %%xmm0," MEMACCESS(1) " \n" | 5051 "movdqu %%xmm0," MEMACCESS(1) " \n" |
| 4980 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 5052 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
| 4981 "lea " MEMLEA(0x20,1) ",%1 \n" | 5053 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 4982 "sub $0x8,%2 \n" | 5054 "sub $0x8,%2 \n" |
| 4983 "jg 1b \n" | 5055 "jg 1b \n" |
| 4984 : "+r"(src_argb), // %0 | 5056 : "+r"(src_argb), // %0 |
| 4985 "+r"(dst_argb), // %1 | 5057 "+r"(dst_argb), // %1 |
| 4986 "+r"(width) // %2 | 5058 "+r"(width) // %2 |
| 4987 : "r"(shuffler) // %3 | 5059 : "r"(shuffler) // %3 |
| 4988 : "memory", "cc" | 5060 : "memory", "cc" |
| 4989 , "xmm0", "xmm1", "xmm5" | 5061 , "xmm0", "xmm1", "xmm5" |
| 4990 ); | 5062 ); |
| 4991 } | 5063 } |
| 4992 #endif // HAS_ARGBSHUFFLEROW_SSSE3 | 5064 #endif // HAS_ARGBSHUFFLEROW_SSSE3 |
| 4993 | 5065 |
| 4994 #ifdef HAS_ARGBSHUFFLEROW_AVX2 | 5066 #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
| 4995 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5067 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 4996 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, | 5068 void ARGBShuffleRow_AVX2(const uint8* src_argb, |
| 4997 const uint8* shuffler, int width) { | 5069 uint8* dst_argb, |
| 5070 const uint8* shuffler, |
| 5071 int width) { |
| 4998 asm volatile ( | 5072 asm volatile ( |
| 4999 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" | 5073 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" |
| 5000 LABELALIGN | 5074 LABELALIGN |
| 5001 "1: \n" | 5075 "1: \n" |
| 5002 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" | 5076 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 5003 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" | 5077 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 5004 "lea " MEMLEA(0x40,0) ",%0 \n" | 5078 "lea " MEMLEA(0x40,0) ",%0 \n" |
| 5005 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" | 5079 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" |
| 5006 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" | 5080 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" |
| 5007 "vmovdqu %%ymm0," MEMACCESS(1) " \n" | 5081 "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
| 5008 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" | 5082 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" |
| 5009 "lea " MEMLEA(0x40,1) ",%1 \n" | 5083 "lea " MEMLEA(0x40,1) ",%1 \n" |
| 5010 "sub $0x10,%2 \n" | 5084 "sub $0x10,%2 \n" |
| 5011 "jg 1b \n" | 5085 "jg 1b \n" |
| 5012 "vzeroupper \n" | 5086 "vzeroupper \n" |
| 5013 : "+r"(src_argb), // %0 | 5087 : "+r"(src_argb), // %0 |
| 5014 "+r"(dst_argb), // %1 | 5088 "+r"(dst_argb), // %1 |
| 5015 "+r"(width) // %2 | 5089 "+r"(width) // %2 |
| 5016 : "r"(shuffler) // %3 | 5090 : "r"(shuffler) // %3 |
| 5017 : "memory", "cc" | 5091 : "memory", "cc" |
| 5018 , "xmm0", "xmm1", "xmm5" | 5092 , "xmm0", "xmm1", "xmm5" |
| 5019 ); | 5093 ); |
| 5020 } | 5094 } |
| 5021 #endif // HAS_ARGBSHUFFLEROW_AVX2 | 5095 #endif // HAS_ARGBSHUFFLEROW_AVX2 |
| 5022 | 5096 |
| 5023 #ifdef HAS_ARGBSHUFFLEROW_SSE2 | 5097 #ifdef HAS_ARGBSHUFFLEROW_SSE2 |
| 5024 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 5098 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 5025 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, | 5099 void ARGBShuffleRow_SSE2(const uint8* src_argb, |
| 5026 const uint8* shuffler, int width) { | 5100 uint8* dst_argb, |
| 5101 const uint8* shuffler, |
| 5102 int width) { |
| 5027 uintptr_t pixel_temp; | 5103 uintptr_t pixel_temp; |
| 5028 asm volatile ( | 5104 asm volatile ( |
| 5029 "pxor %%xmm5,%%xmm5 \n" | 5105 "pxor %%xmm5,%%xmm5 \n" |
| 5030 "mov " MEMACCESS(4) ",%k2 \n" | 5106 "mov " MEMACCESS(4) ",%k2 \n" |
| 5031 "cmp $0x3000102,%k2 \n" | 5107 "cmp $0x3000102,%k2 \n" |
| 5032 "je 3012f \n" | 5108 "je 3012f \n" |
| 5033 "cmp $0x10203,%k2 \n" | 5109 "cmp $0x10203,%k2 \n" |
| 5034 "je 123f \n" | 5110 "je 123f \n" |
| 5035 "cmp $0x30201,%k2 \n" | 5111 "cmp $0x30201,%k2 \n" |
| 5036 "je 321f \n" | 5112 "je 321f \n" |
| (...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5137 : "memory", "cc", NACL_R14 | 5213 : "memory", "cc", NACL_R14 |
| 5138 "xmm0", "xmm1", "xmm5" | 5214 "xmm0", "xmm1", "xmm5" |
| 5139 ); | 5215 ); |
| 5140 } | 5216 } |
| 5141 #endif // HAS_ARGBSHUFFLEROW_SSE2 | 5217 #endif // HAS_ARGBSHUFFLEROW_SSE2 |
| 5142 | 5218 |
| 5143 #ifdef HAS_I422TOYUY2ROW_SSE2 | 5219 #ifdef HAS_I422TOYUY2ROW_SSE2 |
| 5144 void I422ToYUY2Row_SSE2(const uint8* src_y, | 5220 void I422ToYUY2Row_SSE2(const uint8* src_y, |
| 5145 const uint8* src_u, | 5221 const uint8* src_u, |
| 5146 const uint8* src_v, | 5222 const uint8* src_v, |
| 5147 uint8* dst_frame, int width) { | 5223 uint8* dst_frame, |
| 5148 asm volatile ( | 5224 int width) { |
| 5225 asm volatile ( |
| 5149 "sub %1,%2 \n" | 5226 "sub %1,%2 \n" |
| 5150 LABELALIGN | 5227 LABELALIGN |
| 5151 "1: \n" | 5228 "1: \n" |
| 5152 "movq " MEMACCESS(1) ",%%xmm2 \n" | 5229 "movq " MEMACCESS(1) ",%%xmm2 \n" |
| 5153 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 | 5230 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 |
| 5154 "lea " MEMLEA(0x8,1) ",%1 \n" | 5231 "lea " MEMLEA(0x8,1) ",%1 \n" |
| 5155 "punpcklbw %%xmm3,%%xmm2 \n" | 5232 "punpcklbw %%xmm3,%%xmm2 \n" |
| 5156 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5233 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 5157 "lea " MEMLEA(0x10,0) ",%0 \n" | 5234 "lea " MEMLEA(0x10,0) ",%0 \n" |
| 5158 "movdqa %%xmm0,%%xmm1 \n" | 5235 "movdqa %%xmm0,%%xmm1 \n" |
| (...skipping 13 matching lines...) Expand all Loading... |
| 5172 : "memory", "cc", NACL_R14 | 5249 : "memory", "cc", NACL_R14 |
| 5173 "xmm0", "xmm1", "xmm2", "xmm3" | 5250 "xmm0", "xmm1", "xmm2", "xmm3" |
| 5174 ); | 5251 ); |
| 5175 } | 5252 } |
| 5176 #endif // HAS_I422TOYUY2ROW_SSE2 | 5253 #endif // HAS_I422TOYUY2ROW_SSE2 |
| 5177 | 5254 |
| 5178 #ifdef HAS_I422TOUYVYROW_SSE2 | 5255 #ifdef HAS_I422TOUYVYROW_SSE2 |
| 5179 void I422ToUYVYRow_SSE2(const uint8* src_y, | 5256 void I422ToUYVYRow_SSE2(const uint8* src_y, |
| 5180 const uint8* src_u, | 5257 const uint8* src_u, |
| 5181 const uint8* src_v, | 5258 const uint8* src_v, |
| 5182 uint8* dst_frame, int width) { | 5259 uint8* dst_frame, |
| 5183 asm volatile ( | 5260 int width) { |
| 5261 asm volatile ( |
| 5184 "sub %1,%2 \n" | 5262 "sub %1,%2 \n" |
| 5185 LABELALIGN | 5263 LABELALIGN |
| 5186 "1: \n" | 5264 "1: \n" |
| 5187 "movq " MEMACCESS(1) ",%%xmm2 \n" | 5265 "movq " MEMACCESS(1) ",%%xmm2 \n" |
| 5188 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 | 5266 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 |
| 5189 "lea " MEMLEA(0x8,1) ",%1 \n" | 5267 "lea " MEMLEA(0x8,1) ",%1 \n" |
| 5190 "punpcklbw %%xmm3,%%xmm2 \n" | 5268 "punpcklbw %%xmm3,%%xmm2 \n" |
| 5191 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 5269 "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
| 5192 "movdqa %%xmm2,%%xmm1 \n" | 5270 "movdqa %%xmm2,%%xmm1 \n" |
| 5193 "lea " MEMLEA(0x10,0) ",%0 \n" | 5271 "lea " MEMLEA(0x10,0) ",%0 \n" |
| (...skipping 11 matching lines...) Expand all Loading... |
| 5205 "+rm"(width) // %4 | 5283 "+rm"(width) // %4 |
| 5206 : | 5284 : |
| 5207 : "memory", "cc", NACL_R14 | 5285 : "memory", "cc", NACL_R14 |
| 5208 "xmm0", "xmm1", "xmm2", "xmm3" | 5286 "xmm0", "xmm1", "xmm2", "xmm3" |
| 5209 ); | 5287 ); |
| 5210 } | 5288 } |
| 5211 #endif // HAS_I422TOUYVYROW_SSE2 | 5289 #endif // HAS_I422TOUYVYROW_SSE2 |
| 5212 | 5290 |
| 5213 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 | 5291 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
| 5214 void ARGBPolynomialRow_SSE2(const uint8* src_argb, | 5292 void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
| 5215 uint8* dst_argb, const float* poly, | 5293 uint8* dst_argb, |
| 5294 const float* poly, |
| 5216 int width) { | 5295 int width) { |
| 5217 asm volatile ( | 5296 asm volatile ( |
| 5218 "pxor %%xmm3,%%xmm3 \n" | 5297 "pxor %%xmm3,%%xmm3 \n" |
| 5219 | 5298 |
| 5220 // 2 pixel loop. | 5299 // 2 pixel loop. |
| 5221 LABELALIGN | 5300 LABELALIGN |
| 5222 "1: \n" | 5301 "1: \n" |
| 5223 "movq " MEMACCESS(0) ",%%xmm0 \n" | 5302 "movq " MEMACCESS(0) ",%%xmm0 \n" |
| 5224 "lea " MEMLEA(0x8,0) ",%0 \n" | 5303 "lea " MEMLEA(0x8,0) ",%0 \n" |
| 5225 "punpcklbw %%xmm3,%%xmm0 \n" | 5304 "punpcklbw %%xmm3,%%xmm0 \n" |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5261 "+r"(width) // %2 | 5340 "+r"(width) // %2 |
| 5262 : "r"(poly) // %3 | 5341 : "r"(poly) // %3 |
| 5263 : "memory", "cc" | 5342 : "memory", "cc" |
| 5264 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 5343 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 5265 ); | 5344 ); |
| 5266 } | 5345 } |
| 5267 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 | 5346 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
| 5268 | 5347 |
| 5269 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 | 5348 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
| 5270 void ARGBPolynomialRow_AVX2(const uint8* src_argb, | 5349 void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
| 5271 uint8* dst_argb, const float* poly, | 5350 uint8* dst_argb, |
| 5351 const float* poly, |
| 5272 int width) { | 5352 int width) { |
| 5273 asm volatile ( | 5353 asm volatile ( |
| 5274 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" | 5354 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" |
| 5275 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" | 5355 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" |
| 5276 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" | 5356 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" |
| 5277 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" | 5357 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" |
| 5278 | 5358 |
| 5279 // 2 pixel loop. | 5359 // 2 pixel loop. |
| 5280 LABELALIGN | 5360 LABELALIGN |
| 5281 "1: \n" | 5361 "1: \n" |
| (...skipping 155 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5437 "+r"(width) // %2 | 5517 "+r"(width) // %2 |
| 5438 : | 5518 : |
| 5439 : "memory", "cc", | 5519 : "memory", "cc", |
| 5440 "xmm2", "xmm3" | 5520 "xmm2", "xmm3" |
| 5441 ); | 5521 ); |
| 5442 } | 5522 } |
| 5443 #endif // HAS_HALFFLOATROW_F16C | 5523 #endif // HAS_HALFFLOATROW_F16C |
| 5444 | 5524 |
| 5445 #ifdef HAS_ARGBCOLORTABLEROW_X86 | 5525 #ifdef HAS_ARGBCOLORTABLEROW_X86 |
| 5446 // Transform ARGB pixels with color table. | 5526 // Transform ARGB pixels with color table. |
| 5447 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, | 5527 void ARGBColorTableRow_X86(uint8* dst_argb, |
| 5528 const uint8* table_argb, |
| 5448 int width) { | 5529 int width) { |
| 5449 uintptr_t pixel_temp; | 5530 uintptr_t pixel_temp; |
| 5450 asm volatile ( | 5531 asm volatile ( |
| 5451 // 1 pixel loop. | 5532 // 1 pixel loop. |
| 5452 LABELALIGN | 5533 LABELALIGN |
| 5453 "1: \n" | 5534 "1: \n" |
| 5454 "movzb " MEMACCESS(0) ",%1 \n" | 5535 "movzb " MEMACCESS(0) ",%1 \n" |
| 5455 "lea " MEMLEA(0x4,0) ",%0 \n" | 5536 "lea " MEMLEA(0x4,0) ",%0 \n" |
| 5456 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 | 5537 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 |
| 5457 "mov %b1," MEMACCESS2(-0x4,0) " \n" | 5538 "mov %b1," MEMACCESS2(-0x4,0) " \n" |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5497 : "+r"(dst_argb), // %0 | 5578 : "+r"(dst_argb), // %0 |
| 5498 "=&d"(pixel_temp), // %1 | 5579 "=&d"(pixel_temp), // %1 |
| 5499 "+r"(width) // %2 | 5580 "+r"(width) // %2 |
| 5500 : "r"(table_argb) // %3 | 5581 : "r"(table_argb) // %3 |
| 5501 : "memory", "cc"); | 5582 : "memory", "cc"); |
| 5502 } | 5583 } |
| 5503 #endif // HAS_RGBCOLORTABLEROW_X86 | 5584 #endif // HAS_RGBCOLORTABLEROW_X86 |
| 5504 | 5585 |
| 5505 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5586 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5506 // Transform RGB pixels with luma table. | 5587 // Transform RGB pixels with luma table. |
| 5507 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, | 5588 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, |
| 5589 uint8* dst_argb, |
| 5508 int width, | 5590 int width, |
| 5509 const uint8* luma, uint32 lumacoeff) { | 5591 const uint8* luma, |
| 5592 uint32 lumacoeff) { |
| 5510 uintptr_t pixel_temp; | 5593 uintptr_t pixel_temp; |
| 5511 uintptr_t table_temp; | 5594 uintptr_t table_temp; |
| 5512 asm volatile ( | 5595 asm volatile ( |
| 5513 "movd %6,%%xmm3 \n" | 5596 "movd %6,%%xmm3 \n" |
| 5514 "pshufd $0x0,%%xmm3,%%xmm3 \n" | 5597 "pshufd $0x0,%%xmm3,%%xmm3 \n" |
| 5515 "pcmpeqb %%xmm4,%%xmm4 \n" | 5598 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 5516 "psllw $0x8,%%xmm4 \n" | 5599 "psllw $0x8,%%xmm4 \n" |
| 5517 "pxor %%xmm5,%%xmm5 \n" | 5600 "pxor %%xmm5,%%xmm5 \n" |
| 5518 | 5601 |
| 5519 // 4 pixel loop. | 5602 // 4 pixel loop. |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 5597 "+rm"(width) // %4 | 5680 "+rm"(width) // %4 |
| 5598 : "r"(luma), // %5 | 5681 : "r"(luma), // %5 |
| 5599 "rm"(lumacoeff) // %6 | 5682 "rm"(lumacoeff) // %6 |
| 5600 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" | 5683 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" |
| 5601 ); | 5684 ); |
| 5602 } | 5685 } |
| 5603 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5686 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 5604 | 5687 |
| 5605 #endif // defined(__x86_64__) || defined(__i386__) | 5688 #endif // defined(__x86_64__) || defined(__i386__) |
| 5606 | 5689 |
| 5607 // clang-format on | |
| 5608 | |
| 5609 #ifdef __cplusplus | 5690 #ifdef __cplusplus |
| 5610 } // extern "C" | 5691 } // extern "C" |
| 5611 } // namespace libyuv | 5692 } // namespace libyuv |
| 5612 #endif | 5693 #endif |
| OLD | NEW |