| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 109 matching lines...) | (...skipping 109 matching lines...) |
| 120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ | 120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ |
| 121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ | 121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ |
| 122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ | 122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ |
| 123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ | 123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ |
| 124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ | 124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ |
| 125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ | 125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ |
| 126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ | 126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ |
| 127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ | 127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ |
| 128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ | 128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ |
| 129 | 129 |
| 130 #ifdef HAS_I444TOARGBROW_NEON | |
| 131 void I444ToARGBRow_NEON(const uint8* src_y, | 130 void I444ToARGBRow_NEON(const uint8* src_y, |
| 132 const uint8* src_u, | 131 const uint8* src_u, |
| 133 const uint8* src_v, | 132 const uint8* src_v, |
| 134 uint8* dst_argb, | 133 uint8* dst_argb, |
| 135 const struct YuvConstants* yuvconstants, | 134 const struct YuvConstants* yuvconstants, |
| 136 int width) { | 135 int width) { |
| 137 asm volatile ( | 136 asm volatile ( |
| 138 YUVTORGB_SETUP | 137 YUVTORGB_SETUP |
| 139 "movi v23.8b, #255 \n" /* A */ | 138 "movi v23.8b, #255 \n" /* A */ |
| 140 "1: \n" | 139 "1: \n" |
| 141 READYUV444 | 140 READYUV444 |
| 142 YUVTORGB(v22, v21, v20) | 141 YUVTORGB(v22, v21, v20) |
| 143 "subs %w4, %w4, #8 \n" | 142 "subs %w4, %w4, #8 \n" |
| 144 MEMACCESS(3) | 143 MEMACCESS(3) |
| 145 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 144 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
| 146 "b.gt 1b \n" | 145 "b.gt 1b \n" |
| 147 : "+r"(src_y), // %0 | 146 : "+r"(src_y), // %0 |
| 148 "+r"(src_u), // %1 | 147 "+r"(src_u), // %1 |
| 149 "+r"(src_v), // %2 | 148 "+r"(src_v), // %2 |
| 150 "+r"(dst_argb), // %3 | 149 "+r"(dst_argb), // %3 |
| 151 "+r"(width) // %4 | 150 "+r"(width) // %4 |
| 152 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 151 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 153 [kUVToG]"r"(&yuvconstants->kUVToG), | 152 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 154 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 153 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 155 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 154 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 156 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 155 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 157 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 156 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 158 ); | 157 ); |
| 159 } | 158 } |
| 160 #endif // HAS_I444TOARGBROW_NEON | |
| 161 | 159 |
| 162 #ifdef HAS_I422TOARGBROW_NEON | |
| 163 void I422ToARGBRow_NEON(const uint8* src_y, | 160 void I422ToARGBRow_NEON(const uint8* src_y, |
| 164 const uint8* src_u, | 161 const uint8* src_u, |
| 165 const uint8* src_v, | 162 const uint8* src_v, |
| 166 uint8* dst_argb, | 163 uint8* dst_argb, |
| 167 const struct YuvConstants* yuvconstants, | 164 const struct YuvConstants* yuvconstants, |
| 168 int width) { | 165 int width) { |
| 169 asm volatile ( | 166 asm volatile ( |
| 170 YUVTORGB_SETUP | 167 YUVTORGB_SETUP |
| 171 "movi v23.8b, #255 \n" /* A */ | 168 "movi v23.8b, #255 \n" /* A */ |
| 172 "1: \n" | 169 "1: \n" |
| 173 READYUV422 | 170 READYUV422 |
| 174 YUVTORGB(v22, v21, v20) | 171 YUVTORGB(v22, v21, v20) |
| 175 "subs %w4, %w4, #8 \n" | 172 "subs %w4, %w4, #8 \n" |
| 176 MEMACCESS(3) | 173 MEMACCESS(3) |
| 177 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 174 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
| 178 "b.gt 1b \n" | 175 "b.gt 1b \n" |
| 179 : "+r"(src_y), // %0 | 176 : "+r"(src_y), // %0 |
| 180 "+r"(src_u), // %1 | 177 "+r"(src_u), // %1 |
| 181 "+r"(src_v), // %2 | 178 "+r"(src_v), // %2 |
| 182 "+r"(dst_argb), // %3 | 179 "+r"(dst_argb), // %3 |
| 183 "+r"(width) // %4 | 180 "+r"(width) // %4 |
| 184 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 181 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 185 [kUVToG]"r"(&yuvconstants->kUVToG), | 182 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 186 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 183 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 187 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 184 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 188 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 185 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 189 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 186 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 190 ); | 187 ); |
| 191 } | 188 } |
| 192 #endif // HAS_I422TOARGBROW_NEON | |
| 193 | 189 |
| 194 #ifdef HAS_I422ALPHATOARGBROW_NEON | |
| 195 void I422AlphaToARGBRow_NEON(const uint8* src_y, | 190 void I422AlphaToARGBRow_NEON(const uint8* src_y, |
| 196 const uint8* src_u, | 191 const uint8* src_u, |
| 197 const uint8* src_v, | 192 const uint8* src_v, |
| 198 const uint8* src_a, | 193 const uint8* src_a, |
| 199 uint8* dst_argb, | 194 uint8* dst_argb, |
| 200 const struct YuvConstants* yuvconstants, | 195 const struct YuvConstants* yuvconstants, |
| 201 int width) { | 196 int width) { |
| 202 asm volatile ( | 197 asm volatile ( |
| 203 YUVTORGB_SETUP | 198 YUVTORGB_SETUP |
| 204 "1: \n" | 199 "1: \n" |
| (...skipping 12 matching lines...) | (...skipping 12 matching lines...) |
| 217 "+r"(dst_argb), // %4 | 212 "+r"(dst_argb), // %4 |
| 218 "+r"(width) // %5 | 213 "+r"(width) // %5 |
| 219 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 214 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 220 [kUVToG]"r"(&yuvconstants->kUVToG), | 215 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 221 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 216 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 222 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 217 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 223 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 224 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 219 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 225 ); | 220 ); |
| 226 } | 221 } |
| 227 #endif // HAS_I422ALPHATOARGBROW_NEON | |
| 228 | 222 |
| 229 #ifdef HAS_I411TOARGBROW_NEON | |
| 230 void I411ToARGBRow_NEON(const uint8* src_y, | 223 void I411ToARGBRow_NEON(const uint8* src_y, |
| 231 const uint8* src_u, | 224 const uint8* src_u, |
| 232 const uint8* src_v, | 225 const uint8* src_v, |
| 233 uint8* dst_argb, | 226 uint8* dst_argb, |
| 234 const struct YuvConstants* yuvconstants, | 227 const struct YuvConstants* yuvconstants, |
| 235 int width) { | 228 int width) { |
| 236 asm volatile ( | 229 asm volatile ( |
| 237 YUVTORGB_SETUP | 230 YUVTORGB_SETUP |
| 238 "movi v23.8b, #255 \n" /* A */ | 231 "movi v23.8b, #255 \n" /* A */ |
| 239 "1: \n" | 232 "1: \n" |
| 240 READYUV411 | 233 READYUV411 |
| 241 YUVTORGB(v22, v21, v20) | 234 YUVTORGB(v22, v21, v20) |
| 242 "subs %w4, %w4, #8 \n" | 235 "subs %w4, %w4, #8 \n" |
| 243 MEMACCESS(3) | 236 MEMACCESS(3) |
| 244 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
| 245 "b.gt 1b \n" | 238 "b.gt 1b \n" |
| 246 : "+r"(src_y), // %0 | 239 : "+r"(src_y), // %0 |
| 247 "+r"(src_u), // %1 | 240 "+r"(src_u), // %1 |
| 248 "+r"(src_v), // %2 | 241 "+r"(src_v), // %2 |
| 249 "+r"(dst_argb), // %3 | 242 "+r"(dst_argb), // %3 |
| 250 "+r"(width) // %4 | 243 "+r"(width) // %4 |
| 251 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 244 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 252 [kUVToG]"r"(&yuvconstants->kUVToG), | 245 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 253 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 254 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 247 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 255 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 256 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 257 ); | 250 ); |
| 258 } | 251 } |
| 259 #endif // HAS_I411TOARGBROW_NEON | |
| 260 | 252 |
| 261 #ifdef HAS_I422TORGBAROW_NEON | |
| 262 void I422ToRGBARow_NEON(const uint8* src_y, | 253 void I422ToRGBARow_NEON(const uint8* src_y, |
| 263 const uint8* src_u, | 254 const uint8* src_u, |
| 264 const uint8* src_v, | 255 const uint8* src_v, |
| 265 uint8* dst_rgba, | 256 uint8* dst_rgba, |
| 266 const struct YuvConstants* yuvconstants, | 257 const struct YuvConstants* yuvconstants, |
| 267 int width) { | 258 int width) { |
| 268 asm volatile ( | 259 asm volatile ( |
| 269 YUVTORGB_SETUP | 260 YUVTORGB_SETUP |
| 270 "movi v20.8b, #255 \n" /* A */ | 261 "movi v20.8b, #255 \n" /* A */ |
| 271 "1: \n" | 262 "1: \n" |
| 272 READYUV422 | 263 READYUV422 |
| 273 YUVTORGB(v23, v22, v21) | 264 YUVTORGB(v23, v22, v21) |
| 274 "subs %w4, %w4, #8 \n" | 265 "subs %w4, %w4, #8 \n" |
| 275 MEMACCESS(3) | 266 MEMACCESS(3) |
| 276 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" | 267 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" |
| 277 "b.gt 1b \n" | 268 "b.gt 1b \n" |
| 278 : "+r"(src_y), // %0 | 269 : "+r"(src_y), // %0 |
| 279 "+r"(src_u), // %1 | 270 "+r"(src_u), // %1 |
| 280 "+r"(src_v), // %2 | 271 "+r"(src_v), // %2 |
| 281 "+r"(dst_rgba), // %3 | 272 "+r"(dst_rgba), // %3 |
| 282 "+r"(width) // %4 | 273 "+r"(width) // %4 |
| 283 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 274 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 284 [kUVToG]"r"(&yuvconstants->kUVToG), | 275 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 285 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 276 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 286 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 277 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 287 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 278 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 288 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 279 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 289 ); | 280 ); |
| 290 } | 281 } |
| 291 #endif // HAS_I422TORGBAROW_NEON | |
| 292 | 282 |
| 293 #ifdef HAS_I422TORGB24ROW_NEON | |
| 294 void I422ToRGB24Row_NEON(const uint8* src_y, | 283 void I422ToRGB24Row_NEON(const uint8* src_y, |
| 295 const uint8* src_u, | 284 const uint8* src_u, |
| 296 const uint8* src_v, | 285 const uint8* src_v, |
| 297 uint8* dst_rgb24, | 286 uint8* dst_rgb24, |
| 298 const struct YuvConstants* yuvconstants, | 287 const struct YuvConstants* yuvconstants, |
| 299 int width) { | 288 int width) { |
| 300 asm volatile ( | 289 asm volatile ( |
| 301 YUVTORGB_SETUP | 290 YUVTORGB_SETUP |
| 302 "1: \n" | 291 "1: \n" |
| 303 READYUV422 | 292 READYUV422 |
| 304 YUVTORGB(v22, v21, v20) | 293 YUVTORGB(v22, v21, v20) |
| 305 "subs %w4, %w4, #8 \n" | 294 "subs %w4, %w4, #8 \n" |
| 306 MEMACCESS(3) | 295 MEMACCESS(3) |
| 307 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" | 296 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" |
| 308 "b.gt 1b \n" | 297 "b.gt 1b \n" |
| 309 : "+r"(src_y), // %0 | 298 : "+r"(src_y), // %0 |
| 310 "+r"(src_u), // %1 | 299 "+r"(src_u), // %1 |
| 311 "+r"(src_v), // %2 | 300 "+r"(src_v), // %2 |
| 312 "+r"(dst_rgb24), // %3 | 301 "+r"(dst_rgb24), // %3 |
| 313 "+r"(width) // %4 | 302 "+r"(width) // %4 |
| 314 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 303 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 315 [kUVToG]"r"(&yuvconstants->kUVToG), | 304 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 316 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 305 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 317 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 306 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 318 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 307 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 319 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 308 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 320 ); | 309 ); |
| 321 } | 310 } |
| 322 #endif // HAS_I422TORGB24ROW_NEON | |
| 323 | 311 |
| 324 #define ARGBTORGB565 \ | 312 #define ARGBTORGB565 \ |
| 325 "shll v0.8h, v22.8b, #8 \n" /* R */ \ | 313 "shll v0.8h, v22.8b, #8 \n" /* R */ \ |
| 326 "shll v21.8h, v21.8b, #8 \n" /* G */ \ | 314 "shll v21.8h, v21.8b, #8 \n" /* G */ \ |
| 327 "shll v20.8h, v20.8b, #8 \n" /* B */ \ | 315 "shll v20.8h, v20.8b, #8 \n" /* B */ \ |
| 328 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ | 316 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ |
| 329 "sri v0.8h, v20.8h, #11 \n" /* RGB */ | 317 "sri v0.8h, v20.8h, #11 \n" /* RGB */ |
| 330 | 318 |
| 331 #ifdef HAS_I422TORGB565ROW_NEON | |
| 332 void I422ToRGB565Row_NEON(const uint8* src_y, | 319 void I422ToRGB565Row_NEON(const uint8* src_y, |
| 333 const uint8* src_u, | 320 const uint8* src_u, |
| 334 const uint8* src_v, | 321 const uint8* src_v, |
| 335 uint8* dst_rgb565, | 322 uint8* dst_rgb565, |
| 336 const struct YuvConstants* yuvconstants, | 323 const struct YuvConstants* yuvconstants, |
| 337 int width) { | 324 int width) { |
| 338 asm volatile ( | 325 asm volatile ( |
| 339 YUVTORGB_SETUP | 326 YUVTORGB_SETUP |
| 340 "1: \n" | 327 "1: \n" |
| 341 READYUV422 | 328 READYUV422 |
| 342 YUVTORGB(v22, v21, v20) | 329 YUVTORGB(v22, v21, v20) |
| 343 "subs %w4, %w4, #8 \n" | 330 "subs %w4, %w4, #8 \n" |
| 344 ARGBTORGB565 | 331 ARGBTORGB565 |
| 345 MEMACCESS(3) | 332 MEMACCESS(3) |
| 346 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. | 333 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. |
| 347 "b.gt 1b \n" | 334 "b.gt 1b \n" |
| 348 : "+r"(src_y), // %0 | 335 : "+r"(src_y), // %0 |
| 349 "+r"(src_u), // %1 | 336 "+r"(src_u), // %1 |
| 350 "+r"(src_v), // %2 | 337 "+r"(src_v), // %2 |
| 351 "+r"(dst_rgb565), // %3 | 338 "+r"(dst_rgb565), // %3 |
| 352 "+r"(width) // %4 | 339 "+r"(width) // %4 |
| 353 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 340 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 354 [kUVToG]"r"(&yuvconstants->kUVToG), | 341 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 355 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 342 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 356 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 343 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 357 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 358 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 345 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 359 ); | 346 ); |
| 360 } | 347 } |
| 361 #endif // HAS_I422TORGB565ROW_NEON | |
| 362 | 348 |
| 363 #define ARGBTOARGB1555 \ | 349 #define ARGBTOARGB1555 \ |
| 364 "shll v0.8h, v23.8b, #8 \n" /* A */ \ | 350 "shll v0.8h, v23.8b, #8 \n" /* A */ \ |
| 365 "shll v22.8h, v22.8b, #8 \n" /* R */ \ | 351 "shll v22.8h, v22.8b, #8 \n" /* R */ \ |
| 366 "shll v21.8h, v21.8b, #8 \n" /* G */ \ | 352 "shll v21.8h, v21.8b, #8 \n" /* G */ \ |
| 367 "shll v20.8h, v20.8b, #8 \n" /* B */ \ | 353 "shll v20.8h, v20.8b, #8 \n" /* B */ \ |
| 368 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ | 354 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ |
| 369 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ | 355 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ |
| 370 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ | 356 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ |
| 371 | 357 |
| 372 #ifdef HAS_I422TOARGB1555ROW_NEON | |
| 373 void I422ToARGB1555Row_NEON(const uint8* src_y, | 358 void I422ToARGB1555Row_NEON(const uint8* src_y, |
| 374 const uint8* src_u, | 359 const uint8* src_u, |
| 375 const uint8* src_v, | 360 const uint8* src_v, |
| 376 uint8* dst_argb1555, | 361 uint8* dst_argb1555, |
| 377 const struct YuvConstants* yuvconstants, | 362 const struct YuvConstants* yuvconstants, |
| 378 int width) { | 363 int width) { |
| 379 asm volatile ( | 364 asm volatile ( |
| 380 YUVTORGB_SETUP | 365 YUVTORGB_SETUP |
| 381 "movi v23.8b, #255 \n" | 366 "movi v23.8b, #255 \n" |
| 382 "1: \n" | 367 "1: \n" |
| (...skipping 10 matching lines...) | (...skipping 10 matching lines...) |
| 393 "+r"(dst_argb1555), // %3 | 378 "+r"(dst_argb1555), // %3 |
| 394 "+r"(width) // %4 | 379 "+r"(width) // %4 |
| 395 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 380 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 396 [kUVToG]"r"(&yuvconstants->kUVToG), | 381 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 397 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 382 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 398 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 383 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 399 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 384 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 400 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 385 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 401 ); | 386 ); |
| 402 } | 387 } |
| 403 #endif // HAS_I422TOARGB1555ROW_NEON | |
| 404 | 388 |
| 405 #define ARGBTOARGB4444 \ | 389 #define ARGBTOARGB4444 \ |
| 406 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ | 390 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ |
| 407 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ | 391 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ |
| 408 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ | 392 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ |
| 409 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ | 393 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ |
| 410 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ | 394 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ |
| 411 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ | 395 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ |
| 412 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ | 396 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ |
| 413 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ | 397 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ |
| 414 | 398 |
| 415 #ifdef HAS_I422TOARGB4444ROW_NEON | |
| 416 void I422ToARGB4444Row_NEON(const uint8* src_y, | 399 void I422ToARGB4444Row_NEON(const uint8* src_y, |
| 417 const uint8* src_u, | 400 const uint8* src_u, |
| 418 const uint8* src_v, | 401 const uint8* src_v, |
| 419 uint8* dst_argb4444, | 402 uint8* dst_argb4444, |
| 420 const struct YuvConstants* yuvconstants, | 403 const struct YuvConstants* yuvconstants, |
| 421 int width) { | 404 int width) { |
| 422 asm volatile ( | 405 asm volatile ( |
| 423 YUVTORGB_SETUP | 406 YUVTORGB_SETUP |
| 424 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 407 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
| 425 "1: \n" | 408 "1: \n" |
| (...skipping 11 matching lines...) | (...skipping 11 matching lines...) |
| 437 "+r"(dst_argb4444), // %3 | 420 "+r"(dst_argb4444), // %3 |
| 438 "+r"(width) // %4 | 421 "+r"(width) // %4 |
| 439 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 422 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 440 [kUVToG]"r"(&yuvconstants->kUVToG), | 423 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 441 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 424 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 442 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 425 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 426 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 444 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 427 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 445 ); | 428 ); |
| 446 } | 429 } |
| 447 #endif // HAS_I422TOARGB4444ROW_NEON | |
| 448 | 430 |
| 449 #ifdef HAS_I400TOARGBROW_NEON | |
| 450 void I400ToARGBRow_NEON(const uint8* src_y, | 431 void I400ToARGBRow_NEON(const uint8* src_y, |
| 451 uint8* dst_argb, | 432 uint8* dst_argb, |
| 452 int width) { | 433 int width) { |
| 453 asm volatile ( | 434 asm volatile ( |
| 454 YUVTORGB_SETUP | 435 YUVTORGB_SETUP |
| 455 "movi v23.8b, #255 \n" | 436 "movi v23.8b, #255 \n" |
| 456 "1: \n" | 437 "1: \n" |
| 457 READYUV400 | 438 READYUV400 |
| 458 YUVTORGB(v22, v21, v20) | 439 YUVTORGB(v22, v21, v20) |
| 459 "subs %w2, %w2, #8 \n" | 440 "subs %w2, %w2, #8 \n" |
| 460 MEMACCESS(1) | 441 MEMACCESS(1) |
| 461 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 442 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
| 462 "b.gt 1b \n" | 443 "b.gt 1b \n" |
| 463 : "+r"(src_y), // %0 | 444 : "+r"(src_y), // %0 |
| 464 "+r"(dst_argb), // %1 | 445 "+r"(dst_argb), // %1 |
| 465 "+r"(width) // %2 | 446 "+r"(width) // %2 |
| 466 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), | 447 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), |
| 467 [kUVToG]"r"(&kYuvI601Constants.kUVToG), | 448 [kUVToG]"r"(&kYuvI601Constants.kUVToG), |
| 468 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), | 449 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), |
| 469 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) | 450 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) |
| 470 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 451 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 471 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 452 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 472 ); | 453 ); |
| 473 } | 454 } |
| 474 #endif // HAS_I400TOARGBROW_NEON | |
| 475 | 455 |
| 476 #ifdef HAS_J400TOARGBROW_NEON | |
| 477 void J400ToARGBRow_NEON(const uint8* src_y, | 456 void J400ToARGBRow_NEON(const uint8* src_y, |
| 478 uint8* dst_argb, | 457 uint8* dst_argb, |
| 479 int width) { | 458 int width) { |
| 480 asm volatile ( | 459 asm volatile ( |
| 481 "movi v23.8b, #255 \n" | 460 "movi v23.8b, #255 \n" |
| 482 "1: \n" | 461 "1: \n" |
| 483 MEMACCESS(0) | 462 MEMACCESS(0) |
| 484 "ld1 {v20.8b}, [%0], #8 \n" | 463 "ld1 {v20.8b}, [%0], #8 \n" |
| 485 "orr v21.8b, v20.8b, v20.8b \n" | 464 "orr v21.8b, v20.8b, v20.8b \n" |
| 486 "orr v22.8b, v20.8b, v20.8b \n" | 465 "orr v22.8b, v20.8b, v20.8b \n" |
| 487 "subs %w2, %w2, #8 \n" | 466 "subs %w2, %w2, #8 \n" |
| 488 MEMACCESS(1) | 467 MEMACCESS(1) |
| 489 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 468 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
| 490 "b.gt 1b \n" | 469 "b.gt 1b \n" |
| 491 : "+r"(src_y), // %0 | 470 : "+r"(src_y), // %0 |
| 492 "+r"(dst_argb), // %1 | 471 "+r"(dst_argb), // %1 |
| 493 "+r"(width) // %2 | 472 "+r"(width) // %2 |
| 494 : | 473 : |
| 495 : "cc", "memory", "v20", "v21", "v22", "v23" | 474 : "cc", "memory", "v20", "v21", "v22", "v23" |
| 496 ); | 475 ); |
| 497 } | 476 } |
| 498 #endif // HAS_J400TOARGBROW_NEON | |
| 499 | 477 |
| 500 #ifdef HAS_NV12TOARGBROW_NEON | |
| 501 void NV12ToARGBRow_NEON(const uint8* src_y, | 478 void NV12ToARGBRow_NEON(const uint8* src_y, |
| 502 const uint8* src_uv, | 479 const uint8* src_uv, |
| 503 uint8* dst_argb, | 480 uint8* dst_argb, |
| 504 const struct YuvConstants* yuvconstants, | 481 const struct YuvConstants* yuvconstants, |
| 505 int width) { | 482 int width) { |
| 506 asm volatile ( | 483 asm volatile ( |
| 507 YUVTORGB_SETUP | 484 YUVTORGB_SETUP |
| 508 "movi v23.8b, #255 \n" | 485 "movi v23.8b, #255 \n" |
| 509 "1: \n" | 486 "1: \n" |
| 510 READNV12 | 487 READNV12 |
| 511 YUVTORGB(v22, v21, v20) | 488 YUVTORGB(v22, v21, v20) |
| 512 "subs %w3, %w3, #8 \n" | 489 "subs %w3, %w3, #8 \n" |
| 513 MEMACCESS(2) | 490 MEMACCESS(2) |
| 514 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" | 491 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" |
| 515 "b.gt 1b \n" | 492 "b.gt 1b \n" |
| 516 : "+r"(src_y), // %0 | 493 : "+r"(src_y), // %0 |
| 517 "+r"(src_uv), // %1 | 494 "+r"(src_uv), // %1 |
| 518 "+r"(dst_argb), // %2 | 495 "+r"(dst_argb), // %2 |
| 519 "+r"(width) // %3 | 496 "+r"(width) // %3 |
| 520 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 497 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 521 [kUVToG]"r"(&yuvconstants->kUVToG), | 498 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 522 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 499 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 523 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 500 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 524 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 501 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 525 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 502 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 526 ); | 503 ); |
| 527 } | 504 } |
| 528 #endif // HAS_NV12TOARGBROW_NEON | |
| 529 | 505 |
| 530 #ifdef HAS_NV12TOARGBROW_NEON | |
| 531 void NV21ToARGBRow_NEON(const uint8* src_y, | 506 void NV21ToARGBRow_NEON(const uint8* src_y, |
| 532 const uint8* src_vu, | 507 const uint8* src_vu, |
| 533 uint8* dst_argb, | 508 uint8* dst_argb, |
| 534 const struct YuvConstants* yuvconstants, | 509 const struct YuvConstants* yuvconstants, |
| 535 int width) { | 510 int width) { |
| 536 asm volatile ( | 511 asm volatile ( |
| 537 YUVTORGB_SETUP | 512 YUVTORGB_SETUP |
| 538 "movi v23.8b, #255 \n" | 513 "movi v23.8b, #255 \n" |
| 539 "1: \n" | 514 "1: \n" |
| 540 READNV21 | 515 READNV21 |
| 541 YUVTORGB(v22, v21, v20) | 516 YUVTORGB(v22, v21, v20) |
| 542 "subs %w3, %w3, #8 \n" | 517 "subs %w3, %w3, #8 \n" |
| 543 MEMACCESS(2) | 518 MEMACCESS(2) |
| 544 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" | 519 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" |
| 545 "b.gt 1b \n" | 520 "b.gt 1b \n" |
| 546 : "+r"(src_y), // %0 | 521 : "+r"(src_y), // %0 |
| 547 "+r"(src_vu), // %1 | 522 "+r"(src_vu), // %1 |
| 548 "+r"(dst_argb), // %2 | 523 "+r"(dst_argb), // %2 |
| 549 "+r"(width) // %3 | 524 "+r"(width) // %3 |
| 550 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 525 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 551 [kUVToG]"r"(&yuvconstants->kUVToG), | 526 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 552 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 527 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 553 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 528 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 554 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 529 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 555 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 530 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 556 ); | 531 ); |
| 557 } | 532 } |
| 558 #endif // HAS_NV12TOARGBROW_NEON | |
| 559 | 533 |
| 560 #ifdef HAS_NV12TORGB565ROW_NEON | |
| 561 void NV12ToRGB565Row_NEON(const uint8* src_y, | 534 void NV12ToRGB565Row_NEON(const uint8* src_y, |
| 562 const uint8* src_uv, | 535 const uint8* src_uv, |
| 563 uint8* dst_rgb565, | 536 uint8* dst_rgb565, |
| 564 const struct YuvConstants* yuvconstants, | 537 const struct YuvConstants* yuvconstants, |
| 565 int width) { | 538 int width) { |
| 566 asm volatile ( | 539 asm volatile ( |
| 567 YUVTORGB_SETUP | 540 YUVTORGB_SETUP |
| 568 "1: \n" | 541 "1: \n" |
| 569 READNV12 | 542 READNV12 |
| 570 YUVTORGB(v22, v21, v20) | 543 YUVTORGB(v22, v21, v20) |
| 571 "subs %w3, %w3, #8 \n" | 544 "subs %w3, %w3, #8 \n" |
| 572 ARGBTORGB565 | 545 ARGBTORGB565 |
| 573 MEMACCESS(2) | 546 MEMACCESS(2) |
| 574 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. | 547 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. |
| 575 "b.gt 1b \n" | 548 "b.gt 1b \n" |
| 576 : "+r"(src_y), // %0 | 549 : "+r"(src_y), // %0 |
| 577 "+r"(src_uv), // %1 | 550 "+r"(src_uv), // %1 |
| 578 "+r"(dst_rgb565), // %2 | 551 "+r"(dst_rgb565), // %2 |
| 579 "+r"(width) // %3 | 552 "+r"(width) // %3 |
| 580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 553 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 581 [kUVToG]"r"(&yuvconstants->kUVToG), | 554 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 555 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 583 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 586 ); | 559 ); |
| 587 } | 560 } |
| 588 #endif // HAS_NV12TORGB565ROW_NEON | |
| 589 | 561 |
| 590 #ifdef HAS_YUY2TOARGBROW_NEON | |
| 591 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, | 562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2, |
| 592 uint8* dst_argb, | 563 uint8* dst_argb, |
| 593 const struct YuvConstants* yuvconstants, | 564 const struct YuvConstants* yuvconstants, |
| 594 int width) { | 565 int width) { |
| 595 int64 width64 = (int64)(width); | 566 int64 width64 = (int64)(width); |
| 596 asm volatile ( | 567 asm volatile ( |
| 597 YUVTORGB_SETUP | 568 YUVTORGB_SETUP |
| 598 "movi v23.8b, #255 \n" | 569 "movi v23.8b, #255 \n" |
| 599 "1: \n" | 570 "1: \n" |
| 600 READYUY2 | 571 READYUY2 |
| 601 YUVTORGB(v22, v21, v20) | 572 YUVTORGB(v22, v21, v20) |
| 602 "subs %w2, %w2, #8 \n" | 573 "subs %w2, %w2, #8 \n" |
| 603 MEMACCESS(1) | 574 MEMACCESS(1) |
| 604 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" | 575 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" |
| 605 "b.gt 1b \n" | 576 "b.gt 1b \n" |
| 606 : "+r"(src_yuy2), // %0 | 577 : "+r"(src_yuy2), // %0 |
| 607 "+r"(dst_argb), // %1 | 578 "+r"(dst_argb), // %1 |
| 608 "+r"(width64) // %2 | 579 "+r"(width64) // %2 |
| 609 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 580 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 610 [kUVToG]"r"(&yuvconstants->kUVToG), | 581 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 611 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 582 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 612 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 583 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 613 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 614 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 585 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 615 ); | 586 ); |
| 616 } | 587 } |
| 617 #endif // HAS_YUY2TOARGBROW_NEON | |
| 618 | 588 |
| 619 #ifdef HAS_UYVYTOARGBROW_NEON | |
| 620 void UYVYToARGBRow_NEON(const uint8* src_uyvy, | 589 void UYVYToARGBRow_NEON(const uint8* src_uyvy, |
| 621 uint8* dst_argb, | 590 uint8* dst_argb, |
| 622 const struct YuvConstants* yuvconstants, | 591 const struct YuvConstants* yuvconstants, |
| 623 int width) { | 592 int width) { |
| 624 int64 width64 = (int64)(width); | 593 int64 width64 = (int64)(width); |
| 625 asm volatile ( | 594 asm volatile ( |
| 626 YUVTORGB_SETUP | 595 YUVTORGB_SETUP |
| 627 "movi v23.8b, #255 \n" | 596 "movi v23.8b, #255 \n" |
| 628 "1: \n" | 597 "1: \n" |
| 629 READUYVY | 598 READUYVY |
| 630 YUVTORGB(v22, v21, v20) | 599 YUVTORGB(v22, v21, v20) |
| 631 "subs %w2, %w2, #8 \n" | 600 "subs %w2, %w2, #8 \n" |
| 632 MEMACCESS(1) | 601 MEMACCESS(1) |
| 633 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" | 602 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" |
| 634 "b.gt 1b \n" | 603 "b.gt 1b \n" |
| 635 : "+r"(src_uyvy), // %0 | 604 : "+r"(src_uyvy), // %0 |
| 636 "+r"(dst_argb), // %1 | 605 "+r"(dst_argb), // %1 |
| 637 "+r"(width64) // %2 | 606 "+r"(width64) // %2 |
| 638 : [kUVToRB]"r"(&yuvconstants->kUVToRB), | 607 : [kUVToRB]"r"(&yuvconstants->kUVToRB), |
| 639 [kUVToG]"r"(&yuvconstants->kUVToG), | 608 [kUVToG]"r"(&yuvconstants->kUVToG), |
| 640 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), | 609 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), |
| 641 [kYToRgb]"r"(&yuvconstants->kYToRgb) | 610 [kYToRgb]"r"(&yuvconstants->kYToRgb) |
| 642 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", | 611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", |
| 643 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" | 612 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" |
| 644 ); | 613 ); |
| 645 } | 614 } |
| 646 #endif // HAS_UYVYTOARGBROW_NEON | |
| 647 | 615 |
| 648 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. | 616 // Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v. |
| 649 #ifdef HAS_SPLITUVROW_NEON | |
| 650 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 617 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
| 651 int width) { | 618 int width) { |
| 652 asm volatile ( | 619 asm volatile ( |
| 653 "1: \n" | 620 "1: \n" |
| 654 MEMACCESS(0) | 621 MEMACCESS(0) |
| 655 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV | 622 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV |
| 656 "subs %w3, %w3, #16 \n" // 16 processed per loop | 623 "subs %w3, %w3, #16 \n" // 16 processed per loop |
| 657 MEMACCESS(1) | 624 MEMACCESS(1) |
| 658 "st1 {v0.16b}, [%1], #16 \n" // store U | 625 "st1 {v0.16b}, [%1], #16 \n" // store U |
| 659 MEMACCESS(2) | 626 MEMACCESS(2) |
| 660 "st1 {v1.16b}, [%2], #16 \n" // store V | 627 "st1 {v1.16b}, [%2], #16 \n" // store V |
| 661 "b.gt 1b \n" | 628 "b.gt 1b \n" |
| 662 : "+r"(src_uv), // %0 | 629 : "+r"(src_uv), // %0 |
| 663 "+r"(dst_u), // %1 | 630 "+r"(dst_u), // %1 |
| 664 "+r"(dst_v), // %2 | 631 "+r"(dst_v), // %2 |
| 665 "+r"(width) // %3 // Output registers | 632 "+r"(width) // %3 // Output registers |
| 666 : // Input registers | 633 : // Input registers |
| 667 : "cc", "memory", "v0", "v1" // Clobber List | 634 : "cc", "memory", "v0", "v1" // Clobber List |
| 668 ); | 635 ); |
| 669 } | 636 } |
| 670 #endif // HAS_SPLITUVROW_NEON | |
| 671 | 637 |
| 672 // Reads 16 U's and V's and writes out 16 pairs of UV. | 638 // Reads 16 U's and V's and writes out 16 pairs of UV. |
| 673 #ifdef HAS_MERGEUVROW_NEON | |
| 674 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, | 639 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
| 675 int width) { | 640 int width) { |
| 676 asm volatile ( | 641 asm volatile ( |
| 677 "1: \n" | 642 "1: \n" |
| 678 MEMACCESS(0) | 643 MEMACCESS(0) |
| 679 "ld1 {v0.16b}, [%0], #16 \n" // load U | 644 "ld1 {v0.16b}, [%0], #16 \n" // load U |
| 680 MEMACCESS(1) | 645 MEMACCESS(1) |
| 681 "ld1 {v1.16b}, [%1], #16 \n" // load V | 646 "ld1 {v1.16b}, [%1], #16 \n" // load V |
| 682 "subs %w3, %w3, #16 \n" // 16 processed per loop | 647 "subs %w3, %w3, #16 \n" // 16 processed per loop |
| 683 MEMACCESS(2) | 648 MEMACCESS(2) |
| 684 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV | 649 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV |
| 685 "b.gt 1b \n" | 650 "b.gt 1b \n" |
| 686 : | 651 : |
| 687 "+r"(src_u), // %0 | 652 "+r"(src_u), // %0 |
| 688 "+r"(src_v), // %1 | 653 "+r"(src_v), // %1 |
| 689 "+r"(dst_uv), // %2 | 654 "+r"(dst_uv), // %2 |
| 690 "+r"(width) // %3 // Output registers | 655 "+r"(width) // %3 // Output registers |
| 691 : // Input registers | 656 : // Input registers |
| 692 : "cc", "memory", "v0", "v1" // Clobber List | 657 : "cc", "memory", "v0", "v1" // Clobber List |
| 693 ); | 658 ); |
| 694 } | 659 } |
| 695 #endif // HAS_MERGEUVROW_NEON | |
| 696 | 660 |
| 697 // Copy a multiple of 32 bytes. vld4.8 allows unaligned access and is fastest on a15. | 661 // Copy a multiple of 32 bytes. vld4.8 allows unaligned access and is fastest on a15. |
| 698 #ifdef HAS_COPYROW_NEON | |
| 699 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { | 662 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { |
| 700 asm volatile ( | 663 asm volatile ( |
| 701 "1: \n" | 664 "1: \n" |
| 702 MEMACCESS(0) | 665 MEMACCESS(0) |
| 703 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 | 666 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 |
| 704 "subs %w2, %w2, #32 \n" // 32 processed per loop | 667 "subs %w2, %w2, #32 \n" // 32 processed per loop |
| 705 MEMACCESS(1) | 668 MEMACCESS(1) |
| 706 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 | 669 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 |
| 707 "b.gt 1b \n" | 670 "b.gt 1b \n" |
| 708 : "+r"(src), // %0 | 671 : "+r"(src), // %0 |
| 709 "+r"(dst), // %1 | 672 "+r"(dst), // %1 |
| 710 "+r"(count) // %2 // Output registers | 673 "+r"(count) // %2 // Output registers |
| 711 : // Input registers | 674 : // Input registers |
| 712 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 675 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 713 ); | 676 ); |
| 714 } | 677 } |
| 715 #endif // HAS_COPYROW_NEON | |
| 716 | 678 |
| 717 // SetRow writes 'count' bytes using an 8 bit value repeated. | 679 // SetRow writes 'count' bytes using an 8 bit value repeated. |
| 718 void SetRow_NEON(uint8* dst, uint8 v8, int count) { | 680 void SetRow_NEON(uint8* dst, uint8 v8, int count) { |
| 719 asm volatile ( | 681 asm volatile ( |
| 720 "dup v0.16b, %w2 \n" // duplicate 16 bytes | 682 "dup v0.16b, %w2 \n" // duplicate 16 bytes |
| 721 "1: \n" | 683 "1: \n" |
| 722 "subs %w1, %w1, #16 \n" // 16 bytes per loop | 684 "subs %w1, %w1, #16 \n" // 16 bytes per loop |
| 723 MEMACCESS(0) | 685 MEMACCESS(0) |
| 724 "st1 {v0.16b}, [%0], #16 \n" // store | 686 "st1 {v0.16b}, [%0], #16 \n" // store |
| 725 "b.gt 1b \n" | 687 "b.gt 1b \n" |
| (...skipping 12 matching lines...) Expand all Loading... |
| 738 MEMACCESS(0) | 700 MEMACCESS(0) |
| 739 "st1 {v0.16b}, [%0], #16 \n" // store | 701 "st1 {v0.16b}, [%0], #16 \n" // store |
| 740 "b.gt 1b \n" | 702 "b.gt 1b \n" |
| 741 : "+r"(dst), // %0 | 703 : "+r"(dst), // %0 |
| 742 "+r"(count) // %1 | 704 "+r"(count) // %1 |
| 743 : "r"(v32) // %2 | 705 : "r"(v32) // %2 |
| 744 : "cc", "memory", "v0" | 706 : "cc", "memory", "v0" |
| 745 ); | 707 ); |
| 746 } | 708 } |
| 747 | 709 |
| 748 #ifdef HAS_MIRRORROW_NEON | |
| 749 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 710 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 750 int64 width64 = (int64) width; | 711 int64 width64 = (int64) width; |
| 751 asm volatile ( | 712 asm volatile ( |
| 752 // Start at end of source row. | 713 // Start at end of source row. |
| 753 "add %0, %0, %2 \n" | 714 "add %0, %0, %2 \n" |
| 754 "sub %0, %0, #16 \n" | 715 "sub %0, %0, #16 \n" |
| 755 | 716 |
| 756 "1: \n" | 717 "1: \n" |
| 757 MEMACCESS(0) | 718 MEMACCESS(0) |
| 758 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 719 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
| 759 "subs %2, %2, #16 \n" // 16 pixels per loop. | 720 "subs %2, %2, #16 \n" // 16 pixels per loop. |
| 760 "rev64 v0.16b, v0.16b \n" | 721 "rev64 v0.16b, v0.16b \n" |
| 761 MEMACCESS(1) | 722 MEMACCESS(1) |
| 762 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 723 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
| 763 MEMACCESS(1) | 724 MEMACCESS(1) |
| 764 "st1 {v0.D}[0], [%1], #8 \n" | 725 "st1 {v0.D}[0], [%1], #8 \n" |
| 765 "b.gt 1b \n" | 726 "b.gt 1b \n" |
| 766 : "+r"(src), // %0 | 727 : "+r"(src), // %0 |
| 767 "+r"(dst), // %1 | 728 "+r"(dst), // %1 |
| 768 "+r"(width64) // %2 | 729 "+r"(width64) // %2 |
| 769 : "r"((ptrdiff_t)-16) // %3 | 730 : "r"((ptrdiff_t)-16) // %3 |
| 770 : "cc", "memory", "v0" | 731 : "cc", "memory", "v0" |
| 771 ); | 732 ); |
| 772 } | 733 } |
| 773 #endif // HAS_MIRRORROW_NEON | |
| 774 | 734 |
| 775 #ifdef HAS_MIRRORUVROW_NEON | |
| 776 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, | 735 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
| 777 int width) { | 736 int width) { |
| 778 int64 width64 = (int64) width; | 737 int64 width64 = (int64) width; |
| 779 asm volatile ( | 738 asm volatile ( |
| 780 // Start at end of source row. | 739 // Start at end of source row. |
| 781 "add %0, %0, %3, lsl #1 \n" | 740 "add %0, %0, %3, lsl #1 \n" |
| 782 "sub %0, %0, #16 \n" | 741 "sub %0, %0, #16 \n" |
| 783 | 742 |
| 784 "1: \n" | 743 "1: \n" |
| 785 MEMACCESS(0) | 744 MEMACCESS(0) |
| 786 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 | 745 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 |
| 787 "subs %3, %3, #8 \n" // 8 pixels per loop. | 746 "subs %3, %3, #8 \n" // 8 pixels per loop. |
| 788 "rev64 v0.8b, v0.8b \n" | 747 "rev64 v0.8b, v0.8b \n" |
| 789 "rev64 v1.8b, v1.8b \n" | 748 "rev64 v1.8b, v1.8b \n" |
| 790 MEMACCESS(1) | 749 MEMACCESS(1) |
| 791 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 | 750 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 |
| 792 MEMACCESS(2) | 751 MEMACCESS(2) |
| 793 "st1 {v1.8b}, [%2], #8 \n" | 752 "st1 {v1.8b}, [%2], #8 \n" |
| 794 "b.gt 1b \n" | 753 "b.gt 1b \n" |
| 795 : "+r"(src_uv), // %0 | 754 : "+r"(src_uv), // %0 |
| 796 "+r"(dst_u), // %1 | 755 "+r"(dst_u), // %1 |
| 797 "+r"(dst_v), // %2 | 756 "+r"(dst_v), // %2 |
| 798 "+r"(width64) // %3 | 757 "+r"(width64) // %3 |
| 799 : "r"((ptrdiff_t)-16) // %4 | 758 : "r"((ptrdiff_t)-16) // %4 |
| 800 : "cc", "memory", "v0", "v1" | 759 : "cc", "memory", "v0", "v1" |
| 801 ); | 760 ); |
| 802 } | 761 } |
| 803 #endif // HAS_MIRRORUVROW_NEON | |
| 804 | 762 |
| 805 #ifdef HAS_ARGBMIRRORROW_NEON | |
| 806 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { | 763 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { |
| 807 int64 width64 = (int64) width; | 764 int64 width64 = (int64) width; |
| 808 asm volatile ( | 765 asm volatile ( |
| 809 // Start at end of source row. | 766 // Start at end of source row. |
| 810 "add %0, %0, %2, lsl #2 \n" | 767 "add %0, %0, %2, lsl #2 \n" |
| 811 "sub %0, %0, #16 \n" | 768 "sub %0, %0, #16 \n" |
| 812 | 769 |
| 813 "1: \n" | 770 "1: \n" |
| 814 MEMACCESS(0) | 771 MEMACCESS(0) |
| 815 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 | 772 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 |
| 816 "subs %2, %2, #4 \n" // 4 pixels per loop. | 773 "subs %2, %2, #4 \n" // 4 pixels per loop. |
| 817 "rev64 v0.4s, v0.4s \n" | 774 "rev64 v0.4s, v0.4s \n" |
| 818 MEMACCESS(1) | 775 MEMACCESS(1) |
| 819 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 | 776 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 |
| 820 MEMACCESS(1) | 777 MEMACCESS(1) |
| 821 "st1 {v0.D}[0], [%1], #8 \n" | 778 "st1 {v0.D}[0], [%1], #8 \n" |
| 822 "b.gt 1b \n" | 779 "b.gt 1b \n" |
| 823 : "+r"(src), // %0 | 780 : "+r"(src), // %0 |
| 824 "+r"(dst), // %1 | 781 "+r"(dst), // %1 |
| 825 "+r"(width64) // %2 | 782 "+r"(width64) // %2 |
| 826 : "r"((ptrdiff_t)-16) // %3 | 783 : "r"((ptrdiff_t)-16) // %3 |
| 827 : "cc", "memory", "v0" | 784 : "cc", "memory", "v0" |
| 828 ); | 785 ); |
| 829 } | 786 } |
| 830 #endif // HAS_ARGBMIRRORROW_NEON | |
| 831 | 787 |
| 832 #ifdef HAS_RGB24TOARGBROW_NEON | |
| 833 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { | 788 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { |
| 834 asm volatile ( | 789 asm volatile ( |
| 835 "movi v4.8b, #255 \n" // Alpha | 790 "movi v4.8b, #255 \n" // Alpha |
| 836 "1: \n" | 791 "1: \n" |
| 837 MEMACCESS(0) | 792 MEMACCESS(0) |
| 838 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. | 793 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. |
| 839 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 794 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 840 MEMACCESS(1) | 795 MEMACCESS(1) |
| 841 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels | 796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 842 "b.gt 1b \n" | 797 "b.gt 1b \n" |
| 843 : "+r"(src_rgb24), // %0 | 798 : "+r"(src_rgb24), // %0 |
| 844 "+r"(dst_argb), // %1 | 799 "+r"(dst_argb), // %1 |
| 845 "+r"(width) // %2 | 800 "+r"(width) // %2 |
| 846 : | 801 : |
| 847 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 848 ); | 803 ); |
| 849 } | 804 } |
| 850 #endif // HAS_RGB24TOARGBROW_NEON | |
| 851 | 805 |
| 852 #ifdef HAS_RAWTOARGBROW_NEON | |
| 853 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { | 806 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { |
| 854 asm volatile ( | 807 asm volatile ( |
| 855 "movi v5.8b, #255 \n" // Alpha | 808 "movi v5.8b, #255 \n" // Alpha |
| 856 "1: \n" | 809 "1: \n" |
| 857 MEMACCESS(0) | 810 MEMACCESS(0) |
| 858 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 811 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
| 859 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 812 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 860 "orr v3.8b, v1.8b, v1.8b \n" // move g | 813 "orr v3.8b, v1.8b, v1.8b \n" // move g |
| 861 "orr v4.8b, v0.8b, v0.8b \n" // move r | 814 "orr v4.8b, v0.8b, v0.8b \n" // move r |
| 862 MEMACCESS(1) | 815 MEMACCESS(1) |
| 863 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a | 816 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a |
| 864 "b.gt 1b \n" | 817 "b.gt 1b \n" |
| 865 : "+r"(src_raw), // %0 | 818 : "+r"(src_raw), // %0 |
| 866 "+r"(dst_argb), // %1 | 819 "+r"(dst_argb), // %1 |
| 867 "+r"(width) // %2 | 820 "+r"(width) // %2 |
| 868 : | 821 : |
| 869 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List | 822 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 870 ); | 823 ); |
| 871 } | 824 } |
| 872 #endif // HAS_RAWTOARGBROW_NEON | |
| 873 | 825 |
| 874 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { | 826 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { |
| 875 asm volatile ( | 827 asm volatile ( |
| 876 "1: \n" | 828 "1: \n" |
| 877 MEMACCESS(0) | 829 MEMACCESS(0) |
| 878 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b | 830 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b |
| 879 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 831 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 880 "orr v3.8b, v1.8b, v1.8b \n" // move g | 832 "orr v3.8b, v1.8b, v1.8b \n" // move g |
| 881 "orr v4.8b, v0.8b, v0.8b \n" // move r | 833 "orr v4.8b, v0.8b, v0.8b \n" // move r |
| 882 MEMACCESS(1) | 834 MEMACCESS(1) |
| (...skipping 13 matching lines...) Expand all Loading... |
| 896 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ | 848 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ |
| 897 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ | 849 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ |
| 898 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ | 850 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ |
| 899 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ | 851 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ |
| 900 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ | 852 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ |
| 901 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ | 853 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ |
| 902 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ | 854 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ |
| 903 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ | 855 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ |
| 904 "dup v2.2D, v0.D[1] \n" /* R */ | 856 "dup v2.2D, v0.D[1] \n" /* R */ |
| 905 | 857 |
| 906 #ifdef HAS_RGB565TOARGBROW_NEON | |
| 907 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { | 858 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { |
| 908 asm volatile ( | 859 asm volatile ( |
| 909 "movi v3.8b, #255 \n" // Alpha | 860 "movi v3.8b, #255 \n" // Alpha |
| 910 "1: \n" | 861 "1: \n" |
| 911 MEMACCESS(0) | 862 MEMACCESS(0) |
| 912 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 863 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
| 913 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 864 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 914 RGB565TOARGB | 865 RGB565TOARGB |
| 915 MEMACCESS(1) | 866 MEMACCESS(1) |
| 916 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 867 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 917 "b.gt 1b \n" | 868 "b.gt 1b \n" |
| 918 : "+r"(src_rgb565), // %0 | 869 : "+r"(src_rgb565), // %0 |
| 919 "+r"(dst_argb), // %1 | 870 "+r"(dst_argb), // %1 |
| 920 "+r"(width) // %2 | 871 "+r"(width) // %2 |
| 921 : | 872 : |
| 922 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List | 873 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List |
| 923 ); | 874 ); |
| 924 } | 875 } |
| 925 #endif // HAS_RGB565TOARGBROW_NEON | |
| 926 | 876 |
| 927 #define ARGB1555TOARGB \ | 877 #define ARGB1555TOARGB \ |
| 928 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ | 878 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ |
| 929 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ | 879 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ |
| 930 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ | 880 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ |
| 931 \ | 881 \ |
| 932 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ | 882 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ |
| 933 "xtn2 v3.16b, v2.8h \n" \ | 883 "xtn2 v3.16b, v2.8h \n" \ |
| 934 \ | 884 \ |
| 935 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ | 885 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ |
| (...skipping 18 matching lines...) Expand all Loading... |
| 954 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ | 904 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ |
| 955 \ | 905 \ |
| 956 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ | 906 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ |
| 957 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ | 907 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ |
| 958 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ | 908 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ |
| 959 \ | 909 \ |
| 960 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ | 910 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ |
| 961 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ | 911 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ |
| 962 "dup v1.2D, v0.D[1] \n" /* G */ \ | 912 "dup v1.2D, v0.D[1] \n" /* G */ \ |
| 963 | 913 |
| 964 #ifdef HAS_ARGB1555TOARGBROW_NEON | |
| 965 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, | 914 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, |
| 966 int width) { | 915 int width) { |
| 967 asm volatile ( | 916 asm volatile ( |
| 968 "movi v3.8b, #255 \n" // Alpha | 917 "movi v3.8b, #255 \n" // Alpha |
| 969 "1: \n" | 918 "1: \n" |
| 970 MEMACCESS(0) | 919 MEMACCESS(0) |
| 971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 920 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
| 972 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 921 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 973 ARGB1555TOARGB | 922 ARGB1555TOARGB |
| 974 MEMACCESS(1) | 923 MEMACCESS(1) |
| 975 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 924 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 976 "b.gt 1b \n" | 925 "b.gt 1b \n" |
| 977 : "+r"(src_argb1555), // %0 | 926 : "+r"(src_argb1555), // %0 |
| 978 "+r"(dst_argb), // %1 | 927 "+r"(dst_argb), // %1 |
| 979 "+r"(width) // %2 | 928 "+r"(width) // %2 |
| 980 : | 929 : |
| 981 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 930 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 982 ); | 931 ); |
| 983 } | 932 } |
| 984 #endif // HAS_ARGB1555TOARGBROW_NEON | |
| 985 | 933 |
| 986 #define ARGB4444TOARGB \ | 934 #define ARGB4444TOARGB \ |
| 987 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ | 935 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ |
| 988 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ | 936 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ |
| 989 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ | 937 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ |
| 990 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ | 938 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ |
| 991 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ | 939 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ |
| 992 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ | 940 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ |
| 993 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ | 941 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ |
| 994 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ | 942 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ |
| 995 "dup v0.2D, v2.D[1] \n" \ | 943 "dup v0.2D, v2.D[1] \n" \ |
| 996 "dup v1.2D, v3.D[1] \n" | 944 "dup v1.2D, v3.D[1] \n" |
| 997 | 945 |
| 998 #ifdef HAS_ARGB4444TOARGBROW_NEON | |
| 999 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, | 946 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, |
| 1000 int width) { | 947 int width) { |
| 1001 asm volatile ( | 948 asm volatile ( |
| 1002 "1: \n" | 949 "1: \n" |
| 1003 MEMACCESS(0) | 950 MEMACCESS(0) |
| 1004 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 951 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
| 1005 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 952 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1006 ARGB4444TOARGB | 953 ARGB4444TOARGB |
| 1007 MEMACCESS(1) | 954 MEMACCESS(1) |
| 1008 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 955 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 1009 "b.gt 1b \n" | 956 "b.gt 1b \n" |
| 1010 : "+r"(src_argb4444), // %0 | 957 : "+r"(src_argb4444), // %0 |
| 1011 "+r"(dst_argb), // %1 | 958 "+r"(dst_argb), // %1 |
| 1012 "+r"(width) // %2 | 959 "+r"(width) // %2 |
| 1013 : | 960 : |
| 1014 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List | 961 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List |
| 1015 ); | 962 ); |
| 1016 } | 963 } |
| 1017 #endif // HAS_ARGB4444TOARGBROW_NEON | |
| 1018 | 964 |
| 1019 #ifdef HAS_ARGBTORGB24ROW_NEON | |
| 1020 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { | 965 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { |
| 1021 asm volatile ( | 966 asm volatile ( |
| 1022 "1: \n" | 967 "1: \n" |
| 1023 MEMACCESS(0) | 968 MEMACCESS(0) |
| 1024 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels | 969 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels |
| 1025 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 970 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1026 MEMACCESS(1) | 971 MEMACCESS(1) |
| 1027 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. | 972 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. |
| 1028 "b.gt 1b \n" | 973 "b.gt 1b \n" |
| 1029 : "+r"(src_argb), // %0 | 974 : "+r"(src_argb), // %0 |
| 1030 "+r"(dst_rgb24), // %1 | 975 "+r"(dst_rgb24), // %1 |
| 1031 "+r"(width) // %2 | 976 "+r"(width) // %2 |
| 1032 : | 977 : |
| 1033 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List | 978 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List |
| 1034 ); | 979 ); |
| 1035 } | 980 } |
| 1036 #endif // HAS_ARGBTORGB24ROW_NEON | |
| 1037 | 981 |
| 1038 #ifdef HAS_ARGBTORAWROW_NEON | |
| 1039 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { | 982 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { |
| 1040 asm volatile ( | 983 asm volatile ( |
| 1041 "1: \n" | 984 "1: \n" |
| 1042 MEMACCESS(0) | 985 MEMACCESS(0) |
| 1043 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a | 986 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a |
| 1044 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 987 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1045 "orr v4.8b, v2.8b, v2.8b \n" // mov g | 988 "orr v4.8b, v2.8b, v2.8b \n" // mov g |
| 1046 "orr v5.8b, v1.8b, v1.8b \n" // mov b | 989 "orr v5.8b, v1.8b, v1.8b \n" // mov b |
| 1047 MEMACCESS(1) | 990 MEMACCESS(1) |
| 1048 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b | 991 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b |
| 1049 "b.gt 1b \n" | 992 "b.gt 1b \n" |
| 1050 : "+r"(src_argb), // %0 | 993 : "+r"(src_argb), // %0 |
| 1051 "+r"(dst_raw), // %1 | 994 "+r"(dst_raw), // %1 |
| 1052 "+r"(width) // %2 | 995 "+r"(width) // %2 |
| 1053 : | 996 : |
| 1054 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List | 997 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List |
| 1055 ); | 998 ); |
| 1056 } | 999 } |
| 1057 #endif // HAS_ARGBTORAWROW_NEON | |
| 1058 | 1000 |
| 1059 #ifdef HAS_YUY2TOYROW_NEON | |
| 1060 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { | 1001 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { |
| 1061 asm volatile ( | 1002 asm volatile ( |
| 1062 "1: \n" | 1003 "1: \n" |
| 1063 MEMACCESS(0) | 1004 MEMACCESS(0) |
| 1064 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. | 1005 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. |
| 1065 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1006 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
| 1066 MEMACCESS(1) | 1007 MEMACCESS(1) |
| 1067 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1008 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. |
| 1068 "b.gt 1b \n" | 1009 "b.gt 1b \n" |
| 1069 : "+r"(src_yuy2), // %0 | 1010 : "+r"(src_yuy2), // %0 |
| 1070 "+r"(dst_y), // %1 | 1011 "+r"(dst_y), // %1 |
| 1071 "+r"(width) // %2 | 1012 "+r"(width) // %2 |
| 1072 : | 1013 : |
| 1073 : "cc", "memory", "v0", "v1" // Clobber List | 1014 : "cc", "memory", "v0", "v1" // Clobber List |
| 1074 ); | 1015 ); |
| 1075 } | 1016 } |
| 1076 #endif // HAS_YUY2TOYROW_NEON | |
| 1077 | 1017 |
| 1078 #ifdef HAS_UYVYTOYROW_NEON | |
| 1079 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { | 1018 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { |
| 1080 asm volatile ( | 1019 asm volatile ( |
| 1081 "1: \n" | 1020 "1: \n" |
| 1082 MEMACCESS(0) | 1021 MEMACCESS(0) |
| 1083 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. | 1022 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. |
| 1084 "subs %w2, %w2, #16 \n" // 16 processed per loop. | 1023 "subs %w2, %w2, #16 \n" // 16 processed per loop. |
| 1085 MEMACCESS(1) | 1024 MEMACCESS(1) |
| 1086 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. | 1025 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. |
| 1087 "b.gt 1b \n" | 1026 "b.gt 1b \n" |
| 1088 : "+r"(src_uyvy), // %0 | 1027 : "+r"(src_uyvy), // %0 |
| 1089 "+r"(dst_y), // %1 | 1028 "+r"(dst_y), // %1 |
| 1090 "+r"(width) // %2 | 1029 "+r"(width) // %2 |
| 1091 : | 1030 : |
| 1092 : "cc", "memory", "v0", "v1" // Clobber List | 1031 : "cc", "memory", "v0", "v1" // Clobber List |
| 1093 ); | 1032 ); |
| 1094 } | 1033 } |
| 1095 #endif // HAS_UYVYTOYROW_NEON | |
| 1096 | 1034 |
| 1097 #ifdef HAS_YUY2TOUV422ROW_NEON | |
| 1098 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, | 1035 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, |
| 1099 int width) { | 1036 int width) { |
| 1100 asm volatile ( | 1037 asm volatile ( |
| 1101 "1: \n" | 1038 "1: \n" |
| 1102 MEMACCESS(0) | 1039 MEMACCESS(0) |
| 1103 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels | 1040 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels |
| 1104 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1041 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
| 1105 MEMACCESS(1) | 1042 MEMACCESS(1) |
| 1106 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. | 1043 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. |
| 1107 MEMACCESS(2) | 1044 MEMACCESS(2) |
| 1108 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. | 1045 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. |
| 1109 "b.gt 1b \n" | 1046 "b.gt 1b \n" |
| 1110 : "+r"(src_yuy2), // %0 | 1047 : "+r"(src_yuy2), // %0 |
| 1111 "+r"(dst_u), // %1 | 1048 "+r"(dst_u), // %1 |
| 1112 "+r"(dst_v), // %2 | 1049 "+r"(dst_v), // %2 |
| 1113 "+r"(width) // %3 | 1050 "+r"(width) // %3 |
| 1114 : | 1051 : |
| 1115 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1052 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1116 ); | 1053 ); |
| 1117 } | 1054 } |
| 1118 #endif // HAS_YUY2TOUV422ROW_NEON | |
| 1119 | 1055 |
| 1120 #ifdef HAS_UYVYTOUV422ROW_NEON | |
| 1121 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, | 1056 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, |
| 1122 int width) { | 1057 int width) { |
| 1123 asm volatile ( | 1058 asm volatile ( |
| 1124 "1: \n" | 1059 "1: \n" |
| 1125 MEMACCESS(0) | 1060 MEMACCESS(0) |
| 1126 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels | 1061 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels |
| 1127 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. | 1062 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. |
| 1128 MEMACCESS(1) | 1063 MEMACCESS(1) |
| 1129 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. | 1064 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. |
| 1130 MEMACCESS(2) | 1065 MEMACCESS(2) |
| 1131 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. | 1066 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. |
| 1132 "b.gt 1b \n" | 1067 "b.gt 1b \n" |
| 1133 : "+r"(src_uyvy), // %0 | 1068 : "+r"(src_uyvy), // %0 |
| 1134 "+r"(dst_u), // %1 | 1069 "+r"(dst_u), // %1 |
| 1135 "+r"(dst_v), // %2 | 1070 "+r"(dst_v), // %2 |
| 1136 "+r"(width) // %3 | 1071 "+r"(width) // %3 |
| 1137 : | 1072 : |
| 1138 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1073 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1139 ); | 1074 ); |
| 1140 } | 1075 } |
| 1141 #endif // HAS_UYVYTOUV422ROW_NEON | |
| 1142 | 1076 |
| 1143 #ifdef HAS_YUY2TOUVROW_NEON | |
| 1144 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, | 1077 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, |
| 1145 uint8* dst_u, uint8* dst_v, int width) { | 1078 uint8* dst_u, uint8* dst_v, int width) { |
| 1146 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; | 1079 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; |
| 1147 asm volatile ( | 1080 asm volatile ( |
| 1148 "1: \n" | 1081 "1: \n" |
| 1149 MEMACCESS(0) | 1082 MEMACCESS(0) |
| 1150 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1083 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
| 1151 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1084 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
| 1152 MEMACCESS(1) | 1085 MEMACCESS(1) |
| 1153 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1086 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
| 1154 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U | 1087 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U |
| 1155 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V | 1088 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V |
| 1156 MEMACCESS(2) | 1089 MEMACCESS(2) |
| 1157 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. | 1090 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. |
| 1158 MEMACCESS(3) | 1091 MEMACCESS(3) |
| 1159 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. | 1092 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. |
| 1160 "b.gt 1b \n" | 1093 "b.gt 1b \n" |
| 1161 : "+r"(src_yuy2), // %0 | 1094 : "+r"(src_yuy2), // %0 |
| 1162 "+r"(src_yuy2b), // %1 | 1095 "+r"(src_yuy2b), // %1 |
| 1163 "+r"(dst_u), // %2 | 1096 "+r"(dst_u), // %2 |
| 1164 "+r"(dst_v), // %3 | 1097 "+r"(dst_v), // %3 |
| 1165 "+r"(width) // %4 | 1098 "+r"(width) // %4 |
| 1166 : | 1099 : |
| 1167 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1100 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
| 1168 "v5", "v6", "v7" // Clobber List | 1101 "v5", "v6", "v7" // Clobber List |
| 1169 ); | 1102 ); |
| 1170 } | 1103 } |
| 1171 #endif // HAS_YUY2TOUVROW_NEON | |
| 1172 | 1104 |
| 1173 #ifdef HAS_UYVYTOUVROW_NEON | |
| 1174 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, | 1105 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, |
| 1175 uint8* dst_u, uint8* dst_v, int width) { | 1106 uint8* dst_u, uint8* dst_v, int width) { |
| 1176 const uint8* src_uyvyb = src_uyvy + stride_uyvy; | 1107 const uint8* src_uyvyb = src_uyvy + stride_uyvy; |
| 1177 asm volatile ( | 1108 asm volatile ( |
| 1178 "1: \n" | 1109 "1: \n" |
| 1179 MEMACCESS(0) | 1110 MEMACCESS(0) |
| 1180 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels | 1111 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels |
| 1181 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. | 1112 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. |
| 1182 MEMACCESS(1) | 1113 MEMACCESS(1) |
| 1183 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row | 1114 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row |
| 1184 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U | 1115 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U |
| 1185 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V | 1116 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V |
| 1186 MEMACCESS(2) | 1117 MEMACCESS(2) |
| 1187 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. | 1118 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. |
| 1188 MEMACCESS(3) | 1119 MEMACCESS(3) |
| 1189 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. | 1120 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. |
| 1190 "b.gt 1b \n" | 1121 "b.gt 1b \n" |
| 1191 : "+r"(src_uyvy), // %0 | 1122 : "+r"(src_uyvy), // %0 |
| 1192 "+r"(src_uyvyb), // %1 | 1123 "+r"(src_uyvyb), // %1 |
| 1193 "+r"(dst_u), // %2 | 1124 "+r"(dst_u), // %2 |
| 1194 "+r"(dst_v), // %3 | 1125 "+r"(dst_v), // %3 |
| 1195 "+r"(width) // %4 | 1126 "+r"(width) // %4 |
| 1196 : | 1127 : |
| 1197 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1128 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
| 1198 "v5", "v6", "v7" // Clobber List | 1129 "v5", "v6", "v7" // Clobber List |
| 1199 ); | 1130 ); |
| 1200 } | 1131 } |
| 1201 #endif // HAS_UYVYTOUVROW_NEON | |
| 1202 | 1132 |
| 1203 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. | 1133 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
| 1204 #ifdef HAS_ARGBSHUFFLEROW_NEON | |
| 1205 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, | 1134 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, |
| 1206 const uint8* shuffler, int width) { | 1135 const uint8* shuffler, int width) { |
| 1207 asm volatile ( | 1136 asm volatile ( |
| 1208 MEMACCESS(3) | 1137 MEMACCESS(3) |
| 1209 "ld1 {v2.16b}, [%3] \n" // shuffler | 1138 "ld1 {v2.16b}, [%3] \n" // shuffler |
| 1210 "1: \n" | 1139 "1: \n" |
| 1211 MEMACCESS(0) | 1140 MEMACCESS(0) |
| 1212 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. | 1141 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. |
| 1213 "subs %w2, %w2, #4 \n" // 4 processed per loop | 1142 "subs %w2, %w2, #4 \n" // 4 processed per loop |
| 1214 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels | 1143 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels |
| 1215 MEMACCESS(1) | 1144 MEMACCESS(1) |
| 1216 "st1 {v1.16b}, [%1], #16 \n" // store 4. | 1145 "st1 {v1.16b}, [%1], #16 \n" // store 4. |
| 1217 "b.gt 1b \n" | 1146 "b.gt 1b \n" |
| 1218 : "+r"(src_argb), // %0 | 1147 : "+r"(src_argb), // %0 |
| 1219 "+r"(dst_argb), // %1 | 1148 "+r"(dst_argb), // %1 |
| 1220 "+r"(width) // %2 | 1149 "+r"(width) // %2 |
| 1221 : "r"(shuffler) // %3 | 1150 : "r"(shuffler) // %3 |
| 1222 : "cc", "memory", "v0", "v1", "v2" // Clobber List | 1151 : "cc", "memory", "v0", "v1", "v2" // Clobber List |
| 1223 ); | 1152 ); |
| 1224 } | 1153 } |
| 1225 #endif // HAS_ARGBSHUFFLEROW_NEON | |
| 1226 | 1154 |
| 1227 #ifdef HAS_I422TOYUY2ROW_NEON | |
| 1228 void I422ToYUY2Row_NEON(const uint8* src_y, | 1155 void I422ToYUY2Row_NEON(const uint8* src_y, |
| 1229 const uint8* src_u, | 1156 const uint8* src_u, |
| 1230 const uint8* src_v, | 1157 const uint8* src_v, |
| 1231 uint8* dst_yuy2, int width) { | 1158 uint8* dst_yuy2, int width) { |
| 1232 asm volatile ( | 1159 asm volatile ( |
| 1233 "1: \n" | 1160 "1: \n" |
| 1234 MEMACCESS(0) | 1161 MEMACCESS(0) |
| 1235 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys | 1162 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys |
| 1236 "orr v2.8b, v1.8b, v1.8b \n" | 1163 "orr v2.8b, v1.8b, v1.8b \n" |
| 1237 MEMACCESS(1) | 1164 MEMACCESS(1) |
| 1238 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us | 1165 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us |
| 1239 MEMACCESS(2) | 1166 MEMACCESS(2) |
| 1240 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs | 1167 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs |
| 1241 "subs %w4, %w4, #16 \n" // 16 pixels | 1168 "subs %w4, %w4, #16 \n" // 16 pixels |
| 1242 MEMACCESS(3) | 1169 MEMACCESS(3) |
| 1243 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. | 1170 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. |
| 1244 "b.gt 1b \n" | 1171 "b.gt 1b \n" |
| 1245 : "+r"(src_y), // %0 | 1172 : "+r"(src_y), // %0 |
| 1246 "+r"(src_u), // %1 | 1173 "+r"(src_u), // %1 |
| 1247 "+r"(src_v), // %2 | 1174 "+r"(src_v), // %2 |
| 1248 "+r"(dst_yuy2), // %3 | 1175 "+r"(dst_yuy2), // %3 |
| 1249 "+r"(width) // %4 | 1176 "+r"(width) // %4 |
| 1250 : | 1177 : |
| 1251 : "cc", "memory", "v0", "v1", "v2", "v3" | 1178 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 1252 ); | 1179 ); |
| 1253 } | 1180 } |
| 1254 #endif // HAS_I422TOYUY2ROW_NEON | |
| 1255 | 1181 |
| 1256 #ifdef HAS_I422TOUYVYROW_NEON | |
| 1257 void I422ToUYVYRow_NEON(const uint8* src_y, | 1182 void I422ToUYVYRow_NEON(const uint8* src_y, |
| 1258 const uint8* src_u, | 1183 const uint8* src_u, |
| 1259 const uint8* src_v, | 1184 const uint8* src_v, |
| 1260 uint8* dst_uyvy, int width) { | 1185 uint8* dst_uyvy, int width) { |
| 1261 asm volatile ( | 1186 asm volatile ( |
| 1262 "1: \n" | 1187 "1: \n" |
| 1263 MEMACCESS(0) | 1188 MEMACCESS(0) |
| 1264 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys | 1189 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys |
| 1265 "orr v3.8b, v2.8b, v2.8b \n" | 1190 "orr v3.8b, v2.8b, v2.8b \n" |
| 1266 MEMACCESS(1) | 1191 MEMACCESS(1) |
| 1267 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us | 1192 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us |
| 1268 MEMACCESS(2) | 1193 MEMACCESS(2) |
| 1269 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs | 1194 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs |
| 1270 "subs %w4, %w4, #16 \n" // 16 pixels | 1195 "subs %w4, %w4, #16 \n" // 16 pixels |
| 1271 MEMACCESS(3) | 1196 MEMACCESS(3) |
| 1272 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. | 1197 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. |
| 1273 "b.gt 1b \n" | 1198 "b.gt 1b \n" |
| 1274 : "+r"(src_y), // %0 | 1199 : "+r"(src_y), // %0 |
| 1275 "+r"(src_u), // %1 | 1200 "+r"(src_u), // %1 |
| 1276 "+r"(src_v), // %2 | 1201 "+r"(src_v), // %2 |
| 1277 "+r"(dst_uyvy), // %3 | 1202 "+r"(dst_uyvy), // %3 |
| 1278 "+r"(width) // %4 | 1203 "+r"(width) // %4 |
| 1279 : | 1204 : |
| 1280 : "cc", "memory", "v0", "v1", "v2", "v3" | 1205 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 1281 ); | 1206 ); |
| 1282 } | 1207 } |
| 1283 #endif // HAS_I422TOUYVYROW_NEON | |
| 1284 | 1208 |
| 1285 #ifdef HAS_ARGBTORGB565ROW_NEON | |
| 1286 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { | 1209 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { |
| 1287 asm volatile ( | 1210 asm volatile ( |
| 1288 "1: \n" | 1211 "1: \n" |
| 1289 MEMACCESS(0) | 1212 MEMACCESS(0) |
| 1290 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1213 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
| 1291 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1214 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1292 ARGBTORGB565 | 1215 ARGBTORGB565 |
| 1293 MEMACCESS(1) | 1216 MEMACCESS(1) |
| 1294 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. | 1217 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. |
| 1295 "b.gt 1b \n" | 1218 "b.gt 1b \n" |
| 1296 : "+r"(src_argb), // %0 | 1219 : "+r"(src_argb), // %0 |
| 1297 "+r"(dst_rgb565), // %1 | 1220 "+r"(dst_rgb565), // %1 |
| 1298 "+r"(width) // %2 | 1221 "+r"(width) // %2 |
| 1299 : | 1222 : |
| 1300 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1223 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
| 1301 ); | 1224 ); |
| 1302 } | 1225 } |
| 1303 #endif // HAS_ARGBTORGB565ROW_NEON | |
| 1304 | 1226 |
| 1305 #ifdef HAS_ARGBTORGB565DITHERROW_NEON | |
| 1306 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, | 1227 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, |
| 1307 const uint32 dither4, int width) { | 1228 const uint32 dither4, int width) { |
| 1308 asm volatile ( | 1229 asm volatile ( |
| 1309 "dup v1.4s, %w2 \n" // dither4 | 1230 "dup v1.4s, %w2 \n" // dither4 |
| 1310 "1: \n" | 1231 "1: \n" |
| 1311 MEMACCESS(1) | 1232 MEMACCESS(1) |
| 1312 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels | 1233 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels |
| 1313 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 1234 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
| 1314 "uqadd v20.8b, v20.8b, v1.8b \n" | 1235 "uqadd v20.8b, v20.8b, v1.8b \n" |
| 1315 "uqadd v21.8b, v21.8b, v1.8b \n" | 1236 "uqadd v21.8b, v21.8b, v1.8b \n" |
| 1316 "uqadd v22.8b, v22.8b, v1.8b \n" | 1237 "uqadd v22.8b, v22.8b, v1.8b \n" |
| 1317 ARGBTORGB565 | 1238 ARGBTORGB565 |
| 1318 MEMACCESS(0) | 1239 MEMACCESS(0) |
| 1319 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. | 1240 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. |
| 1320 "b.gt 1b \n" | 1241 "b.gt 1b \n" |
| 1321 : "+r"(dst_rgb) // %0 | 1242 : "+r"(dst_rgb) // %0 |
| 1322 : "r"(src_argb), // %1 | 1243 : "r"(src_argb), // %1 |
| 1323 "r"(dither4), // %2 | 1244 "r"(dither4), // %2 |
| 1324 "r"(width) // %3 | 1245 "r"(width) // %3 |
| 1325 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" | 1246 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" |
| 1326 ); | 1247 ); |
| 1327 } | 1248 } |
| 1328 #endif // HAS_ARGBTORGB565ROW_NEON | |
| 1329 | 1249 |
| 1330 #ifdef HAS_ARGBTOARGB1555ROW_NEON | |
| 1331 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, | 1250 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, |
| 1332 int width) { | 1251 int width) { |
| 1333 asm volatile ( | 1252 asm volatile ( |
| 1334 "1: \n" | 1253 "1: \n" |
| 1335 MEMACCESS(0) | 1254 MEMACCESS(0) |
| 1336 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1255 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
| 1337 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1256 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1338 ARGBTOARGB1555 | 1257 ARGBTOARGB1555 |
| 1339 MEMACCESS(1) | 1258 MEMACCESS(1) |
| 1340 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. | 1259 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. |
| 1341 "b.gt 1b \n" | 1260 "b.gt 1b \n" |
| 1342 : "+r"(src_argb), // %0 | 1261 : "+r"(src_argb), // %0 |
| 1343 "+r"(dst_argb1555), // %1 | 1262 "+r"(dst_argb1555), // %1 |
| 1344 "+r"(width) // %2 | 1263 "+r"(width) // %2 |
| 1345 : | 1264 : |
| 1346 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" | 1265 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" |
| 1347 ); | 1266 ); |
| 1348 } | 1267 } |
| 1349 #endif // HAS_ARGBTOARGB1555ROW_NEON | |
| 1350 | 1268 |
| 1351 #ifdef HAS_ARGBTOARGB4444ROW_NEON | |
| 1352 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, | 1269 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, |
| 1353 int width) { | 1270 int width) { |
| 1354 asm volatile ( | 1271 asm volatile ( |
| 1355 "movi v4.16b, #0x0f \n" // bits to clear with vbic. | 1272 "movi v4.16b, #0x0f \n" // bits to clear with vbic. |
| 1356 "1: \n" | 1273 "1: \n" |
| 1357 MEMACCESS(0) | 1274 MEMACCESS(0) |
| 1358 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels | 1275 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels |
| 1359 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1276 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1360 ARGBTOARGB4444 | 1277 ARGBTOARGB4444 |
| 1361 MEMACCESS(1) | 1278 MEMACCESS(1) |
| 1362 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. | 1279 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. |
| 1363 "b.gt 1b \n" | 1280 "b.gt 1b \n" |
| 1364 : "+r"(src_argb), // %0 | 1281 : "+r"(src_argb), // %0 |
| 1365 "+r"(dst_argb4444), // %1 | 1282 "+r"(dst_argb4444), // %1 |
| 1366 "+r"(width) // %2 | 1283 "+r"(width) // %2 |
| 1367 : | 1284 : |
| 1368 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" | 1285 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" |
| 1369 ); | 1286 ); |
| 1370 } | 1287 } |
| 1371 #endif // HAS_ARGBTOARGB4444ROW_NEON | |
| 1372 | 1288 |
| 1373 #ifdef HAS_ARGBTOYROW_NEON | |
| 1374 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { | 1289 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
| 1375 asm volatile ( | 1290 asm volatile ( |
| 1376 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1291 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 1377 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1292 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 1378 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1293 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 1379 "movi v7.8b, #16 \n" // Add 16 constant | 1294 "movi v7.8b, #16 \n" // Add 16 constant |
| 1380 "1: \n" | 1295 "1: \n" |
| 1381 MEMACCESS(0) | 1296 MEMACCESS(0) |
| 1382 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1297 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 1383 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1298 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1384 "umull v3.8h, v0.8b, v4.8b \n" // B | 1299 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 1385 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1300 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 1386 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1301 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 1387 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1302 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 1388 "uqadd v0.8b, v0.8b, v7.8b \n" | 1303 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 1389 MEMACCESS(1) | 1304 MEMACCESS(1) |
| 1390 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1305 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 1391 "b.gt 1b \n" | 1306 "b.gt 1b \n" |
| 1392 : "+r"(src_argb), // %0 | 1307 : "+r"(src_argb), // %0 |
| 1393 "+r"(dst_y), // %1 | 1308 "+r"(dst_y), // %1 |
| 1394 "+r"(width) // %2 | 1309 "+r"(width) // %2 |
| 1395 : | 1310 : |
| 1396 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 1311 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 1397 ); | 1312 ); |
| 1398 } | 1313 } |
| 1399 #endif // HAS_ARGBTOYROW_NEON | |
| 1400 | 1314 |
| 1401 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON | |
| 1402 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { | 1315 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { |
| 1403 asm volatile ( | 1316 asm volatile ( |
| 1404 "1: \n" | 1317 "1: \n" |
| 1405 MEMACCESS(0) | 1318 MEMACCESS(0) |
| 1406 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels | 1319 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels |
| 1407 "subs %w2, %w2, #16 \n" // 16 processed per loop | 1320 "subs %w2, %w2, #16 \n" // 16 processed per loop |
| 1408 MEMACCESS(1) | 1321 MEMACCESS(1) |
| 1409 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. | 1322 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. |
| 1410 "b.gt 1b \n" | 1323 "b.gt 1b \n" |
| 1411 : "+r"(src_argb), // %0 | 1324 : "+r"(src_argb), // %0 |
| 1412 "+r"(dst_a), // %1 | 1325 "+r"(dst_a), // %1 |
| 1413 "+r"(width) // %2 | 1326 "+r"(width) // %2 |
| 1414 : | 1327 : |
| 1415 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 1328 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 1416 ); | 1329 ); |
| 1417 } | 1330 } |
| 1418 #endif // HAS_ARGBEXTRACTALPHAROW_NEON | |
| 1419 | 1331 |
| 1420 #ifdef HAS_ARGBTOYJROW_NEON | |
| 1421 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { | 1332 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { |
| 1422 asm volatile ( | 1333 asm volatile ( |
| 1423 "movi v4.8b, #15 \n" // B * 0.11400 coefficient | 1334 "movi v4.8b, #15 \n" // B * 0.11400 coefficient |
| 1424 "movi v5.8b, #75 \n" // G * 0.58700 coefficient | 1335 "movi v5.8b, #75 \n" // G * 0.58700 coefficient |
| 1425 "movi v6.8b, #38 \n" // R * 0.29900 coefficient | 1336 "movi v6.8b, #38 \n" // R * 0.29900 coefficient |
| 1426 "1: \n" | 1337 "1: \n" |
| 1427 MEMACCESS(0) | 1338 MEMACCESS(0) |
| 1428 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 1339 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 1429 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1340 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 1430 "umull v3.8h, v0.8b, v4.8b \n" // B | 1341 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 1431 "umlal v3.8h, v1.8b, v5.8b \n" // G | 1342 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 1432 "umlal v3.8h, v2.8b, v6.8b \n" // R | 1343 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 1433 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y | 1344 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y |
| 1434 MEMACCESS(1) | 1345 MEMACCESS(1) |
| 1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1346 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 1436 "b.gt 1b \n" | 1347 "b.gt 1b \n" |
| 1437 : "+r"(src_argb), // %0 | 1348 : "+r"(src_argb), // %0 |
| 1438 "+r"(dst_y), // %1 | 1349 "+r"(dst_y), // %1 |
| 1439 "+r"(width) // %2 | 1350 "+r"(width) // %2 |
| 1440 : | 1351 : |
| 1441 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 1352 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
| 1442 ); | 1353 ); |
| 1443 } | 1354 } |
| 1444 #endif // HAS_ARGBTOYJROW_NEON | |
| 1445 | 1355 |
| 1446 // 8x1 pixels. | 1356 // 8x1 pixels. |
| 1447 #ifdef HAS_ARGBTOUV444ROW_NEON | |
| 1448 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1357 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
| 1449 int width) { | 1358 int width) { |
| 1450 asm volatile ( | 1359 asm volatile ( |
| 1451 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient | 1360 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient |
| 1452 "movi v25.8b, #74 \n" // UG -0.5781 coefficient | 1361 "movi v25.8b, #74 \n" // UG -0.5781 coefficient |
| 1453 "movi v26.8b, #38 \n" // UR -0.2969 coefficient | 1362 "movi v26.8b, #38 \n" // UR -0.2969 coefficient |
| 1454 "movi v27.8b, #18 \n" // VB -0.1406 coefficient | 1363 "movi v27.8b, #18 \n" // VB -0.1406 coefficient |
| 1455 "movi v28.8b, #94 \n" // VG -0.7344 coefficient | 1364 "movi v28.8b, #94 \n" // VG -0.7344 coefficient |
| 1456 "movi v29.16b,#0x80 \n" // 128.5 | 1365 "movi v29.16b,#0x80 \n" // 128.5 |
| 1457 "1: \n" | 1366 "1: \n" |
| (...skipping 20 matching lines...) Expand all Loading... |
| 1478 "b.gt 1b \n" | 1387 "b.gt 1b \n" |
| 1479 : "+r"(src_argb), // %0 | 1388 : "+r"(src_argb), // %0 |
| 1480 "+r"(dst_u), // %1 | 1389 "+r"(dst_u), // %1 |
| 1481 "+r"(dst_v), // %2 | 1390 "+r"(dst_v), // %2 |
| 1482 "+r"(width) // %3 | 1391 "+r"(width) // %3 |
| 1483 : | 1392 : |
| 1484 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", | 1393 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", |
| 1485 "v24", "v25", "v26", "v27", "v28", "v29" | 1394 "v24", "v25", "v26", "v27", "v28", "v29" |
| 1486 ); | 1395 ); |
| 1487 } | 1396 } |
| 1488 #endif // HAS_ARGBTOUV444ROW_NEON | |
| 1489 | 1397 |
| 1490 #define RGBTOUV_SETUP_REG \ | 1398 #define RGBTOUV_SETUP_REG \ |
| 1491 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ | 1399 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ |
| 1492 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ | 1400 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ |
| 1493 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ | 1401 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ |
| 1494 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ | 1402 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ |
| 1495 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ | 1403 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ |
| 1496 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ | 1404 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ |
| 1497 | 1405 |
| 1498 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. | 1406 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. |
| 1499 #ifdef HAS_ARGBTOUV411ROW_NEON | |
| 1500 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, | 1407 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
| 1501 int width) { | 1408 int width) { |
| 1502 asm volatile ( | 1409 asm volatile ( |
| 1503 RGBTOUV_SETUP_REG | 1410 RGBTOUV_SETUP_REG |
| 1504 "1: \n" | 1411 "1: \n" |
| 1505 MEMACCESS(0) | 1412 MEMACCESS(0) |
| 1506 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1413 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1507 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1414 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1508 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1415 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| 1509 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. | 1416 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. |
| (...skipping 29 matching lines...) Expand all Loading... |
| 1539 "b.gt 1b \n" | 1446 "b.gt 1b \n" |
| 1540 : "+r"(src_argb), // %0 | 1447 : "+r"(src_argb), // %0 |
| 1541 "+r"(dst_u), // %1 | 1448 "+r"(dst_u), // %1 |
| 1542 "+r"(dst_v), // %2 | 1449 "+r"(dst_v), // %2 |
| 1543 "+r"(width) // %3 | 1450 "+r"(width) // %3 |
| 1544 : | 1451 : |
| 1545 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1452 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1546 "v20", "v21", "v22", "v23", "v24", "v25" | 1453 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1547 ); | 1454 ); |
| 1548 } | 1455 } |
| 1549 #endif // HAS_ARGBTOUV411ROW_NEON | |
| 1550 | 1456 |
| 1551 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1457 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 1552 #define RGBTOUV(QB, QG, QR) \ | 1458 #define RGBTOUV(QB, QG, QR) \ |
| 1553 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ | 1459 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ |
| 1554 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ | 1460 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ |
| 1555 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ | 1461 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ |
| 1556 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ | 1462 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ |
| 1557 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ | 1463 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ |
| 1558 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ | 1464 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ |
| 1559 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1465 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ |
| 1560 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ | 1466 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ |
| 1561 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ | 1467 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ |
| 1562 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ | 1468 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ |
| 1563 | 1469 |
| 1564 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. | 1470 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. |
| 1565 // TODO(fbarchard): consider ptrdiff_t for all strides. | 1471 // TODO(fbarchard): consider ptrdiff_t for all strides. |
| 1566 | 1472 |
| 1567 #ifdef HAS_ARGBTOUVROW_NEON | |
| 1568 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, | 1473 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, |
| 1569 uint8* dst_u, uint8* dst_v, int width) { | 1474 uint8* dst_u, uint8* dst_v, int width) { |
| 1570 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1475 const uint8* src_argb_1 = src_argb + src_stride_argb; |
| 1571 asm volatile ( | 1476 asm volatile ( |
| 1572 RGBTOUV_SETUP_REG | 1477 RGBTOUV_SETUP_REG |
| 1573 "1: \n" | 1478 "1: \n" |
| 1574 MEMACCESS(0) | 1479 MEMACCESS(0) |
| 1575 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1480 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1576 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1481 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1577 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1482 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| (...skipping 19 matching lines...) Expand all Loading... |
| 1597 : "+r"(src_argb), // %0 | 1502 : "+r"(src_argb), // %0 |
| 1598 "+r"(src_argb_1), // %1 | 1503 "+r"(src_argb_1), // %1 |
| 1599 "+r"(dst_u), // %2 | 1504 "+r"(dst_u), // %2 |
| 1600 "+r"(dst_v), // %3 | 1505 "+r"(dst_v), // %3 |
| 1601 "+r"(width) // %4 | 1506 "+r"(width) // %4 |
| 1602 : | 1507 : |
| 1603 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1508 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1604 "v20", "v21", "v22", "v23", "v24", "v25" | 1509 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1605 ); | 1510 ); |
| 1606 } | 1511 } |
| 1607 #endif // HAS_ARGBTOUVROW_NEON | |
| 1608 | 1512 |
| 1609 // TODO(fbarchard): Subsample match C code. | 1513 // TODO(fbarchard): Subsample match C code. |
| 1610 #ifdef HAS_ARGBTOUVJROW_NEON | |
| 1611 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, | 1514 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, |
| 1612 uint8* dst_u, uint8* dst_v, int width) { | 1515 uint8* dst_u, uint8* dst_v, int width) { |
| 1613 const uint8* src_argb_1 = src_argb + src_stride_argb; | 1516 const uint8* src_argb_1 = src_argb + src_stride_argb; |
| 1614 asm volatile ( | 1517 asm volatile ( |
| 1615 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 | 1518 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 |
| 1616 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 | 1519 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 |
| 1617 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 | 1520 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 |
| 1618 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 | 1521 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 |
| 1619 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 | 1522 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 |
| 1620 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1523 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
| (...skipping 23 matching lines...) Expand all Loading... |
| 1644 : "+r"(src_argb), // %0 | 1547 : "+r"(src_argb), // %0 |
| 1645 "+r"(src_argb_1), // %1 | 1548 "+r"(src_argb_1), // %1 |
| 1646 "+r"(dst_u), // %2 | 1549 "+r"(dst_u), // %2 |
| 1647 "+r"(dst_v), // %3 | 1550 "+r"(dst_v), // %3 |
| 1648 "+r"(width) // %4 | 1551 "+r"(width) // %4 |
| 1649 : | 1552 : |
| 1650 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1553 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1651 "v20", "v21", "v22", "v23", "v24", "v25" | 1554 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1652 ); | 1555 ); |
| 1653 } | 1556 } |
| 1654 #endif // HAS_ARGBTOUVJROW_NEON | |
| 1655 | 1557 |
| 1656 #ifdef HAS_BGRATOUVROW_NEON | |
| 1657 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, | 1558 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, |
| 1658 uint8* dst_u, uint8* dst_v, int width) { | 1559 uint8* dst_u, uint8* dst_v, int width) { |
| 1659 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; | 1560 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; |
| 1660 asm volatile ( | 1561 asm volatile ( |
| 1661 RGBTOUV_SETUP_REG | 1562 RGBTOUV_SETUP_REG |
| 1662 "1: \n" | 1563 "1: \n" |
| 1663 MEMACCESS(0) | 1564 MEMACCESS(0) |
| 1664 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1565 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1665 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. | 1566 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. |
| 1666 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1567 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1685 : "+r"(src_bgra), // %0 | 1586 : "+r"(src_bgra), // %0 |
| 1686 "+r"(src_bgra_1), // %1 | 1587 "+r"(src_bgra_1), // %1 |
| 1687 "+r"(dst_u), // %2 | 1588 "+r"(dst_u), // %2 |
| 1688 "+r"(dst_v), // %3 | 1589 "+r"(dst_v), // %3 |
| 1689 "+r"(width) // %4 | 1590 "+r"(width) // %4 |
| 1690 : | 1591 : |
| 1691 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1592 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1692 "v20", "v21", "v22", "v23", "v24", "v25" | 1593 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1693 ); | 1594 ); |
| 1694 } | 1595 } |
| 1695 #endif // HAS_BGRATOUVROW_NEON | |
| 1696 | 1596 |
| 1697 #ifdef HAS_ABGRTOUVROW_NEON | |
| 1698 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, | 1597 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, |
| 1699 uint8* dst_u, uint8* dst_v, int width) { | 1598 uint8* dst_u, uint8* dst_v, int width) { |
| 1700 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; | 1599 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; |
| 1701 asm volatile ( | 1600 asm volatile ( |
| 1702 RGBTOUV_SETUP_REG | 1601 RGBTOUV_SETUP_REG |
| 1703 "1: \n" | 1602 "1: \n" |
| 1704 MEMACCESS(0) | 1603 MEMACCESS(0) |
| 1705 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1604 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1706 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1605 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
| 1707 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1606 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1726 : "+r"(src_abgr), // %0 | 1625 : "+r"(src_abgr), // %0 |
| 1727 "+r"(src_abgr_1), // %1 | 1626 "+r"(src_abgr_1), // %1 |
| 1728 "+r"(dst_u), // %2 | 1627 "+r"(dst_u), // %2 |
| 1729 "+r"(dst_v), // %3 | 1628 "+r"(dst_v), // %3 |
| 1730 "+r"(width) // %4 | 1629 "+r"(width) // %4 |
| 1731 : | 1630 : |
| 1732 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1631 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1733 "v20", "v21", "v22", "v23", "v24", "v25" | 1632 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1734 ); | 1633 ); |
| 1735 } | 1634 } |
| 1736 #endif // HAS_ABGRTOUVROW_NEON | |
| 1737 | 1635 |
| 1738 #ifdef HAS_RGBATOUVROW_NEON | |
| 1739 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, | 1636 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, |
| 1740 uint8* dst_u, uint8* dst_v, int width) { | 1637 uint8* dst_u, uint8* dst_v, int width) { |
| 1741 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; | 1638 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; |
| 1742 asm volatile ( | 1639 asm volatile ( |
| 1743 RGBTOUV_SETUP_REG | 1640 RGBTOUV_SETUP_REG |
| 1744 "1: \n" | 1641 "1: \n" |
| 1745 MEMACCESS(0) | 1642 MEMACCESS(0) |
| 1746 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. | 1643 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. |
| 1747 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. | 1644 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. |
| 1748 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. | 1645 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1767 : "+r"(src_rgba), // %0 | 1664 : "+r"(src_rgba), // %0 |
| 1768 "+r"(src_rgba_1), // %1 | 1665 "+r"(src_rgba_1), // %1 |
| 1769 "+r"(dst_u), // %2 | 1666 "+r"(dst_u), // %2 |
| 1770 "+r"(dst_v), // %3 | 1667 "+r"(dst_v), // %3 |
| 1771 "+r"(width) // %4 | 1668 "+r"(width) // %4 |
| 1772 : | 1669 : |
| 1773 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1670 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1774 "v20", "v21", "v22", "v23", "v24", "v25" | 1671 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1775 ); | 1672 ); |
| 1776 } | 1673 } |
| 1777 #endif // HAS_RGBATOUVROW_NEON | |
| 1778 | 1674 |
| 1779 #ifdef HAS_RGB24TOUVROW_NEON | |
| 1780 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, | 1675 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, |
| 1781 uint8* dst_u, uint8* dst_v, int width) { | 1676 uint8* dst_u, uint8* dst_v, int width) { |
| 1782 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; | 1677 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; |
| 1783 asm volatile ( | 1678 asm volatile ( |
| 1784 RGBTOUV_SETUP_REG | 1679 RGBTOUV_SETUP_REG |
| 1785 "1: \n" | 1680 "1: \n" |
| 1786 MEMACCESS(0) | 1681 MEMACCESS(0) |
| 1787 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. | 1682 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. |
| 1788 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. | 1683 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. |
| 1789 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1684 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1808 : "+r"(src_rgb24), // %0 | 1703 : "+r"(src_rgb24), // %0 |
| 1809 "+r"(src_rgb24_1), // %1 | 1704 "+r"(src_rgb24_1), // %1 |
| 1810 "+r"(dst_u), // %2 | 1705 "+r"(dst_u), // %2 |
| 1811 "+r"(dst_v), // %3 | 1706 "+r"(dst_v), // %3 |
| 1812 "+r"(width) // %4 | 1707 "+r"(width) // %4 |
| 1813 : | 1708 : |
| 1814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1709 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1815 "v20", "v21", "v22", "v23", "v24", "v25" | 1710 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1816 ); | 1711 ); |
| 1817 } | 1712 } |
| 1818 #endif // HAS_RGB24TOUVROW_NEON | |
| 1819 | 1713 |
| 1820 #ifdef HAS_RAWTOUVROW_NEON | |
| 1821 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, | 1714 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, |
| 1822 uint8* dst_u, uint8* dst_v, int width) { | 1715 uint8* dst_u, uint8* dst_v, int width) { |
| 1823 const uint8* src_raw_1 = src_raw + src_stride_raw; | 1716 const uint8* src_raw_1 = src_raw + src_stride_raw; |
| 1824 asm volatile ( | 1717 asm volatile ( |
| 1825 RGBTOUV_SETUP_REG | 1718 RGBTOUV_SETUP_REG |
| 1826 "1: \n" | 1719 "1: \n" |
| 1827 MEMACCESS(0) | 1720 MEMACCESS(0) |
| 1828 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. | 1721 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. |
| 1829 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. | 1722 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. |
| 1830 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. | 1723 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. |
| (...skipping 18 matching lines...) Expand all Loading... |
| 1849 : "+r"(src_raw), // %0 | 1742 : "+r"(src_raw), // %0 |
| 1850 "+r"(src_raw_1), // %1 | 1743 "+r"(src_raw_1), // %1 |
| 1851 "+r"(dst_u), // %2 | 1744 "+r"(dst_u), // %2 |
| 1852 "+r"(dst_v), // %3 | 1745 "+r"(dst_v), // %3 |
| 1853 "+r"(width) // %4 | 1746 "+r"(width) // %4 |
| 1854 : | 1747 : |
| 1855 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1748 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1856 "v20", "v21", "v22", "v23", "v24", "v25" | 1749 "v20", "v21", "v22", "v23", "v24", "v25" |
| 1857 ); | 1750 ); |
| 1858 } | 1751 } |
| 1859 #endif // HAS_RAWTOUVROW_NEON | |
| 1860 | 1752 |
| 1861 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1753 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 1862 #ifdef HAS_RGB565TOUVROW_NEON | |
| 1863 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, | 1754 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, |
| 1864 uint8* dst_u, uint8* dst_v, int width) { | 1755 uint8* dst_u, uint8* dst_v, int width) { |
| 1865 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; | 1756 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; |
| 1866 asm volatile ( | 1757 asm volatile ( |
| 1867 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 | 1758 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 |
| 1868 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 | 1759 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 |
| 1869 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 | 1760 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 |
| 1870 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 | 1761 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 |
| 1871 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 | 1762 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 |
| 1872 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) | 1763 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1925 "+r"(src_rgb565_1), // %1 | 1816 "+r"(src_rgb565_1), // %1 |
| 1926 "+r"(dst_u), // %2 | 1817 "+r"(dst_u), // %2 |
| 1927 "+r"(dst_v), // %3 | 1818 "+r"(dst_v), // %3 |
| 1928 "+r"(width) // %4 | 1819 "+r"(width) // %4 |
| 1929 : | 1820 : |
| 1930 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 1821 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 1931 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", | 1822 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", |
| 1932 "v25", "v26", "v27" | 1823 "v25", "v26", "v27" |
| 1933 ); | 1824 ); |
| 1934 } | 1825 } |
| 1935 #endif // HAS_RGB565TOUVROW_NEON | |
| 1936 | 1826 |
| 1937 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1827 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 1938 #ifdef HAS_ARGB1555TOUVROW_NEON | |
| 1939 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, | 1828 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, |
| 1940 uint8* dst_u, uint8* dst_v, int width) { | 1829 uint8* dst_u, uint8* dst_v, int width) { |
| 1941 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; | 1830 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; |
| 1942 asm volatile ( | 1831 asm volatile ( |
| 1943 RGBTOUV_SETUP_REG | 1832 RGBTOUV_SETUP_REG |
| 1944 "1: \n" | 1833 "1: \n" |
| 1945 MEMACCESS(0) | 1834 MEMACCESS(0) |
| 1946 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 1835 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
| 1947 RGB555TOARGB | 1836 RGB555TOARGB |
| 1948 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 1837 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1996 "+r"(src_argb1555_1), // %1 | 1885 "+r"(src_argb1555_1), // %1 |
| 1997 "+r"(dst_u), // %2 | 1886 "+r"(dst_u), // %2 |
| 1998 "+r"(dst_v), // %3 | 1887 "+r"(dst_v), // %3 |
| 1999 "+r"(width) // %4 | 1888 "+r"(width) // %4 |
| 2000 : | 1889 : |
| 2001 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 1890 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
| 2002 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 1891 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
| 2003 "v26", "v27", "v28" | 1892 "v26", "v27", "v28" |
| 2004 ); | 1893 ); |
| 2005 } | 1894 } |
| 2006 #endif // HAS_ARGB1555TOUVROW_NEON | |
| 2007 | 1895 |
| 2008 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. | 1896 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. |
| 2009 #ifdef HAS_ARGB4444TOUVROW_NEON | |
| 2010 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, | 1897 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, |
| 2011 uint8* dst_u, uint8* dst_v, int width) { | 1898 uint8* dst_u, uint8* dst_v, int width) { |
| 2012 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; | 1899 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; |
| 2013 asm volatile ( | 1900 asm volatile ( |
| 2014 RGBTOUV_SETUP_REG | 1901 RGBTOUV_SETUP_REG |
| 2015 "1: \n" | 1902 "1: \n" |
| 2016 MEMACCESS(0) | 1903 MEMACCESS(0) |
| 2017 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 1904 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
| 2018 ARGB4444TOARGB | 1905 ARGB4444TOARGB |
| 2019 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. | 1906 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2068 "+r"(dst_u), // %2 | 1955 "+r"(dst_u), // %2 |
| 2069 "+r"(dst_v), // %3 | 1956 "+r"(dst_v), // %3 |
| 2070 "+r"(width) // %4 | 1957 "+r"(width) // %4 |
| 2071 : | 1958 : |
| 2072 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", | 1959 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", |
| 2073 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", | 1960 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", |
| 2074 "v26", "v27", "v28" | 1961 "v26", "v27", "v28" |
| 2075 | 1962 |
| 2076 ); | 1963 ); |
| 2077 } | 1964 } |
| 2078 #endif // HAS_ARGB4444TOUVROW_NEON | |
| 2079 | 1965 |
| 2080 #ifdef HAS_RGB565TOYROW_NEON | |
| 2081 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { | 1966 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { |
| 2082 asm volatile ( | 1967 asm volatile ( |
| 2083 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 1968 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
| 2084 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 1969 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
| 2085 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 1970 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
| 2086 "movi v27.8b, #16 \n" // Add 16 constant | 1971 "movi v27.8b, #16 \n" // Add 16 constant |
| 2087 "1: \n" | 1972 "1: \n" |
| 2088 MEMACCESS(0) | 1973 MEMACCESS(0) |
| 2089 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. | 1974 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. |
| 2090 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 1975 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2091 RGB565TOARGB | 1976 RGB565TOARGB |
| 2092 "umull v3.8h, v0.8b, v24.8b \n" // B | 1977 "umull v3.8h, v0.8b, v24.8b \n" // B |
| 2093 "umlal v3.8h, v1.8b, v25.8b \n" // G | 1978 "umlal v3.8h, v1.8b, v25.8b \n" // G |
| 2094 "umlal v3.8h, v2.8b, v26.8b \n" // R | 1979 "umlal v3.8h, v2.8b, v26.8b \n" // R |
| 2095 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 1980 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 2096 "uqadd v0.8b, v0.8b, v27.8b \n" | 1981 "uqadd v0.8b, v0.8b, v27.8b \n" |
| 2097 MEMACCESS(1) | 1982 MEMACCESS(1) |
| 2098 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 1983 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2099 "b.gt 1b \n" | 1984 "b.gt 1b \n" |
| 2100 : "+r"(src_rgb565), // %0 | 1985 : "+r"(src_rgb565), // %0 |
| 2101 "+r"(dst_y), // %1 | 1986 "+r"(dst_y), // %1 |
| 2102 "+r"(width) // %2 | 1987 "+r"(width) // %2 |
| 2103 : | 1988 : |
| 2104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", | 1989 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", |
| 2105 "v24", "v25", "v26", "v27" | 1990 "v24", "v25", "v26", "v27" |
| 2106 ); | 1991 ); |
| 2107 } | 1992 } |
| 2108 #endif // HAS_RGB565TOYROW_NEON | |
| 2109 | 1993 |
| 2110 #ifdef HAS_ARGB1555TOYROW_NEON | |
| 2111 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { | 1994 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { |
| 2112 asm volatile ( | 1995 asm volatile ( |
| 2113 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 1996 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 2114 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 1997 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2115 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 1998 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 2116 "movi v7.8b, #16 \n" // Add 16 constant | 1999 "movi v7.8b, #16 \n" // Add 16 constant |
| 2117 "1: \n" | 2000 "1: \n" |
| 2118 MEMACCESS(0) | 2001 MEMACCESS(0) |
| 2119 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. | 2002 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. |
| 2120 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2003 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2121 ARGB1555TOARGB | 2004 ARGB1555TOARGB |
| 2122 "umull v3.8h, v0.8b, v4.8b \n" // B | 2005 "umull v3.8h, v0.8b, v4.8b \n" // B |
| 2123 "umlal v3.8h, v1.8b, v5.8b \n" // G | 2006 "umlal v3.8h, v1.8b, v5.8b \n" // G |
| 2124 "umlal v3.8h, v2.8b, v6.8b \n" // R | 2007 "umlal v3.8h, v2.8b, v6.8b \n" // R |
| 2125 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2008 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 2126 "uqadd v0.8b, v0.8b, v7.8b \n" | 2009 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2127 MEMACCESS(1) | 2010 MEMACCESS(1) |
| 2128 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2011 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2129 "b.gt 1b \n" | 2012 "b.gt 1b \n" |
| 2130 : "+r"(src_argb1555), // %0 | 2013 : "+r"(src_argb1555), // %0 |
| 2131 "+r"(dst_y), // %1 | 2014 "+r"(dst_y), // %1 |
| 2132 "+r"(width) // %2 | 2015 "+r"(width) // %2 |
| 2133 : | 2016 : |
| 2134 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2017 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 2135 ); | 2018 ); |
| 2136 } | 2019 } |
| 2137 #endif // HAS_ARGB1555TOYROW_NEON | |
| 2138 | 2020 |
| 2139 #ifdef HAS_ARGB4444TOYROW_NEON | |
| 2140 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { | 2021 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { |
| 2141 asm volatile ( | 2022 asm volatile ( |
| 2142 "movi v24.8b, #13 \n" // B * 0.1016 coefficient | 2023 "movi v24.8b, #13 \n" // B * 0.1016 coefficient |
| 2143 "movi v25.8b, #65 \n" // G * 0.5078 coefficient | 2024 "movi v25.8b, #65 \n" // G * 0.5078 coefficient |
| 2144 "movi v26.8b, #33 \n" // R * 0.2578 coefficient | 2025 "movi v26.8b, #33 \n" // R * 0.2578 coefficient |
| 2145 "movi v27.8b, #16 \n" // Add 16 constant | 2026 "movi v27.8b, #16 \n" // Add 16 constant |
| 2146 "1: \n" | 2027 "1: \n" |
| 2147 MEMACCESS(0) | 2028 MEMACCESS(0) |
| 2148 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. | 2029 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. |
| 2149 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2030 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2150 ARGB4444TOARGB | 2031 ARGB4444TOARGB |
| 2151 "umull v3.8h, v0.8b, v24.8b \n" // B | 2032 "umull v3.8h, v0.8b, v24.8b \n" // B |
| 2152 "umlal v3.8h, v1.8b, v25.8b \n" // G | 2033 "umlal v3.8h, v1.8b, v25.8b \n" // G |
| 2153 "umlal v3.8h, v2.8b, v26.8b \n" // R | 2034 "umlal v3.8h, v2.8b, v26.8b \n" // R |
| 2154 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y | 2035 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y |
| 2155 "uqadd v0.8b, v0.8b, v27.8b \n" | 2036 "uqadd v0.8b, v0.8b, v27.8b \n" |
| 2156 MEMACCESS(1) | 2037 MEMACCESS(1) |
| 2157 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2038 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2158 "b.gt 1b \n" | 2039 "b.gt 1b \n" |
| 2159 : "+r"(src_argb4444), // %0 | 2040 : "+r"(src_argb4444), // %0 |
| 2160 "+r"(dst_y), // %1 | 2041 "+r"(dst_y), // %1 |
| 2161 "+r"(width) // %2 | 2042 "+r"(width) // %2 |
| 2162 : | 2043 : |
| 2163 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" | 2044 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" |
| 2164 ); | 2045 ); |
| 2165 } | 2046 } |
| 2166 #endif // HAS_ARGB4444TOYROW_NEON | |
| 2167 | 2047 |
| 2168 #ifdef HAS_BGRATOYROW_NEON | |
| 2169 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { | 2048 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { |
| 2170 asm volatile ( | 2049 asm volatile ( |
| 2171 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2050 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
| 2172 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2051 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2173 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2052 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
| 2174 "movi v7.8b, #16 \n" // Add 16 constant | 2053 "movi v7.8b, #16 \n" // Add 16 constant |
| 2175 "1: \n" | 2054 "1: \n" |
| 2176 MEMACCESS(0) | 2055 MEMACCESS(0) |
| 2177 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2056 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
| 2178 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2057 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2179 "umull v16.8h, v1.8b, v4.8b \n" // R | 2058 "umull v16.8h, v1.8b, v4.8b \n" // R |
| 2180 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2059 "umlal v16.8h, v2.8b, v5.8b \n" // G |
| 2181 "umlal v16.8h, v3.8b, v6.8b \n" // B | 2060 "umlal v16.8h, v3.8b, v6.8b \n" // B |
| 2182 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2061 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2183 "uqadd v0.8b, v0.8b, v7.8b \n" | 2062 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2184 MEMACCESS(1) | 2063 MEMACCESS(1) |
| 2185 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2064 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2186 "b.gt 1b \n" | 2065 "b.gt 1b \n" |
| 2187 : "+r"(src_bgra), // %0 | 2066 : "+r"(src_bgra), // %0 |
| 2188 "+r"(dst_y), // %1 | 2067 "+r"(dst_y), // %1 |
| 2189 "+r"(width) // %2 | 2068 "+r"(width) // %2 |
| 2190 : | 2069 : |
| 2191 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2070 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2192 ); | 2071 ); |
| 2193 } | 2072 } |
| 2194 #endif // HAS_BGRATOYROW_NEON | |
| 2195 | 2073 |
| 2196 #ifdef HAS_ABGRTOYROW_NEON | |
| 2197 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { | 2074 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { |
| 2198 asm volatile ( | 2075 asm volatile ( |
| 2199 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2076 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
| 2200 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2077 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2201 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2078 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
| 2202 "movi v7.8b, #16 \n" // Add 16 constant | 2079 "movi v7.8b, #16 \n" // Add 16 constant |
| 2203 "1: \n" | 2080 "1: \n" |
| 2204 MEMACCESS(0) | 2081 MEMACCESS(0) |
| 2205 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2082 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
| 2206 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2083 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2207 "umull v16.8h, v0.8b, v4.8b \n" // R | 2084 "umull v16.8h, v0.8b, v4.8b \n" // R |
| 2208 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2085 "umlal v16.8h, v1.8b, v5.8b \n" // G |
| 2209 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2086 "umlal v16.8h, v2.8b, v6.8b \n" // B |
| 2210 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2087 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2211 "uqadd v0.8b, v0.8b, v7.8b \n" | 2088 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2212 MEMACCESS(1) | 2089 MEMACCESS(1) |
| 2213 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2090 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2214 "b.gt 1b \n" | 2091 "b.gt 1b \n" |
| 2215 : "+r"(src_abgr), // %0 | 2092 : "+r"(src_abgr), // %0 |
| 2216 "+r"(dst_y), // %1 | 2093 "+r"(dst_y), // %1 |
| 2217 "+r"(width) // %2 | 2094 "+r"(width) // %2 |
| 2218 : | 2095 : |
| 2219 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2096 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2220 ); | 2097 ); |
| 2221 } | 2098 } |
| 2222 #endif // HAS_ABGRTOYROW_NEON | |
| 2223 | 2099 |
| 2224 #ifdef HAS_RGBATOYROW_NEON | |
| 2225 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { | 2100 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { |
| 2226 asm volatile ( | 2101 asm volatile ( |
| 2227 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2102 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 2228 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2103 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2229 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2104 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 2230 "movi v7.8b, #16 \n" // Add 16 constant | 2105 "movi v7.8b, #16 \n" // Add 16 constant |
| 2231 "1: \n" | 2106 "1: \n" |
| 2232 MEMACCESS(0) | 2107 MEMACCESS(0) |
| 2233 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. | 2108 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. |
| 2234 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2109 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2235 "umull v16.8h, v1.8b, v4.8b \n" // B | 2110 "umull v16.8h, v1.8b, v4.8b \n" // B |
| 2236 "umlal v16.8h, v2.8b, v5.8b \n" // G | 2111 "umlal v16.8h, v2.8b, v5.8b \n" // G |
| 2237 "umlal v16.8h, v3.8b, v6.8b \n" // R | 2112 "umlal v16.8h, v3.8b, v6.8b \n" // R |
| 2238 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2113 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2239 "uqadd v0.8b, v0.8b, v7.8b \n" | 2114 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2240 MEMACCESS(1) | 2115 MEMACCESS(1) |
| 2241 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2116 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2242 "b.gt 1b \n" | 2117 "b.gt 1b \n" |
| 2243 : "+r"(src_rgba), // %0 | 2118 : "+r"(src_rgba), // %0 |
| 2244 "+r"(dst_y), // %1 | 2119 "+r"(dst_y), // %1 |
| 2245 "+r"(width) // %2 | 2120 "+r"(width) // %2 |
| 2246 : | 2121 : |
| 2247 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2122 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2248 ); | 2123 ); |
| 2249 } | 2124 } |
| 2250 #endif // HAS_RGBATOYROW_NEON | |
| 2251 | 2125 |
| 2252 #ifdef HAS_RGB24TOYROW_NEON | |
| 2253 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { | 2126 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { |
| 2254 asm volatile ( | 2127 asm volatile ( |
| 2255 "movi v4.8b, #13 \n" // B * 0.1016 coefficient | 2128 "movi v4.8b, #13 \n" // B * 0.1016 coefficient |
| 2256 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2129 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2257 "movi v6.8b, #33 \n" // R * 0.2578 coefficient | 2130 "movi v6.8b, #33 \n" // R * 0.2578 coefficient |
| 2258 "movi v7.8b, #16 \n" // Add 16 constant | 2131 "movi v7.8b, #16 \n" // Add 16 constant |
| 2259 "1: \n" | 2132 "1: \n" |
| 2260 MEMACCESS(0) | 2133 MEMACCESS(0) |
| 2261 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2134 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
| 2262 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2135 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2263 "umull v16.8h, v0.8b, v4.8b \n" // B | 2136 "umull v16.8h, v0.8b, v4.8b \n" // B |
| 2264 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2137 "umlal v16.8h, v1.8b, v5.8b \n" // G |
| 2265 "umlal v16.8h, v2.8b, v6.8b \n" // R | 2138 "umlal v16.8h, v2.8b, v6.8b \n" // R |
| 2266 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2139 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2267 "uqadd v0.8b, v0.8b, v7.8b \n" | 2140 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2268 MEMACCESS(1) | 2141 MEMACCESS(1) |
| 2269 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2142 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2270 "b.gt 1b \n" | 2143 "b.gt 1b \n" |
| 2271 : "+r"(src_rgb24), // %0 | 2144 : "+r"(src_rgb24), // %0 |
| 2272 "+r"(dst_y), // %1 | 2145 "+r"(dst_y), // %1 |
| 2273 "+r"(width) // %2 | 2146 "+r"(width) // %2 |
| 2274 : | 2147 : |
| 2275 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2148 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2276 ); | 2149 ); |
| 2277 } | 2150 } |
| 2278 #endif // HAS_RGB24TOYROW_NEON | |
| 2279 | 2151 |
| 2280 #ifdef HAS_RAWTOYROW_NEON | |
| 2281 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { | 2152 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { |
| 2282 asm volatile ( | 2153 asm volatile ( |
| 2283 "movi v4.8b, #33 \n" // R * 0.2578 coefficient | 2154 "movi v4.8b, #33 \n" // R * 0.2578 coefficient |
| 2284 "movi v5.8b, #65 \n" // G * 0.5078 coefficient | 2155 "movi v5.8b, #65 \n" // G * 0.5078 coefficient |
| 2285 "movi v6.8b, #13 \n" // B * 0.1016 coefficient | 2156 "movi v6.8b, #13 \n" // B * 0.1016 coefficient |
| 2286 "movi v7.8b, #16 \n" // Add 16 constant | 2157 "movi v7.8b, #16 \n" // Add 16 constant |
| 2287 "1: \n" | 2158 "1: \n" |
| 2288 MEMACCESS(0) | 2159 MEMACCESS(0) |
| 2289 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. | 2160 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. |
| 2290 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2161 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2291 "umull v16.8h, v0.8b, v4.8b \n" // R | 2162 "umull v16.8h, v0.8b, v4.8b \n" // R |
| 2292 "umlal v16.8h, v1.8b, v5.8b \n" // G | 2163 "umlal v16.8h, v1.8b, v5.8b \n" // G |
| 2293 "umlal v16.8h, v2.8b, v6.8b \n" // B | 2164 "umlal v16.8h, v2.8b, v6.8b \n" // B |
| 2294 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y | 2165 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y |
| 2295 "uqadd v0.8b, v0.8b, v7.8b \n" | 2166 "uqadd v0.8b, v0.8b, v7.8b \n" |
| 2296 MEMACCESS(1) | 2167 MEMACCESS(1) |
| 2297 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. | 2168 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. |
| 2298 "b.gt 1b \n" | 2169 "b.gt 1b \n" |
| 2299 : "+r"(src_raw), // %0 | 2170 : "+r"(src_raw), // %0 |
| 2300 "+r"(dst_y), // %1 | 2171 "+r"(dst_y), // %1 |
| 2301 "+r"(width) // %2 | 2172 "+r"(width) // %2 |
| 2302 : | 2173 : |
| 2303 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" | 2174 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" |
| 2304 ); | 2175 ); |
| 2305 } | 2176 } |
| 2306 #endif // HAS_RAWTOYROW_NEON | |
| 2307 | 2177 |
| 2308 // Bilinear filter 16x2 -> 16x1 | 2178 // Bilinear filter 16x2 -> 16x1 |
| 2309 #ifdef HAS_INTERPOLATEROW_NEON | |
| 2310 void InterpolateRow_NEON(uint8* dst_ptr, | 2179 void InterpolateRow_NEON(uint8* dst_ptr, |
| 2311 const uint8* src_ptr, ptrdiff_t src_stride, | 2180 const uint8* src_ptr, ptrdiff_t src_stride, |
| 2312 int dst_width, int source_y_fraction) { | 2181 int dst_width, int source_y_fraction) { |
| 2313 int y1_fraction = source_y_fraction; | 2182 int y1_fraction = source_y_fraction; |
| 2314 int y0_fraction = 256 - y1_fraction; | 2183 int y0_fraction = 256 - y1_fraction; |
| 2315 const uint8* src_ptr1 = src_ptr + src_stride; | 2184 const uint8* src_ptr1 = src_ptr + src_stride; |
| 2316 asm volatile ( | 2185 asm volatile ( |
| 2317 "cmp %w4, #0 \n" | 2186 "cmp %w4, #0 \n" |
| 2318 "b.eq 100f \n" | 2187 "b.eq 100f \n" |
| 2319 "cmp %w4, #128 \n" | 2188 "cmp %w4, #128 \n" |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2365 : "+r"(dst_ptr), // %0 | 2234 : "+r"(dst_ptr), // %0 |
| 2366 "+r"(src_ptr), // %1 | 2235 "+r"(src_ptr), // %1 |
| 2367 "+r"(src_ptr1), // %2 | 2236 "+r"(src_ptr1), // %2 |
| 2368 "+r"(dst_width), // %3 | 2237 "+r"(dst_width), // %3 |
| 2369 "+r"(y1_fraction), // %4 | 2238 "+r"(y1_fraction), // %4 |
| 2370 "+r"(y0_fraction) // %5 | 2239 "+r"(y0_fraction) // %5 |
| 2371 : | 2240 : |
| 2372 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" | 2241 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" |
| 2373 ); | 2242 ); |
| 2374 } | 2243 } |
| 2375 #endif // HAS_INTERPOLATEROW_NEON | |
| 2376 | 2244 |
| 2377 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr | 2245 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr |
| 2378 #ifdef HAS_ARGBBLENDROW_NEON | |
| 2379 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2246 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 2380 uint8* dst_argb, int width) { | 2247 uint8* dst_argb, int width) { |
| 2381 asm volatile ( | 2248 asm volatile ( |
| 2382 "subs %w3, %w3, #8 \n" | 2249 "subs %w3, %w3, #8 \n" |
| 2383 "b.lt 89f \n" | 2250 "b.lt 89f \n" |
| 2384 // Blend 8 pixels. | 2251 // Blend 8 pixels. |
| 2385 "8: \n" | 2252 "8: \n" |
| 2386 MEMACCESS(0) | 2253 MEMACCESS(0) |
| 2387 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels | 2254 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels |
| 2388 MEMACCESS(1) | 2255 MEMACCESS(1) |
| (...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2437 | 2304 |
| 2438 : "+r"(src_argb0), // %0 | 2305 : "+r"(src_argb0), // %0 |
| 2439 "+r"(src_argb1), // %1 | 2306 "+r"(src_argb1), // %1 |
| 2440 "+r"(dst_argb), // %2 | 2307 "+r"(dst_argb), // %2 |
| 2441 "+r"(width) // %3 | 2308 "+r"(width) // %3 |
| 2442 : | 2309 : |
| 2443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 2310 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 2444 "v16", "v17", "v18" | 2311 "v16", "v17", "v18" |
| 2445 ); | 2312 ); |
| 2446 } | 2313 } |
| 2447 #endif // HAS_ARGBBLENDROW_NEON | |
| 2448 | 2314 |
| 2449 // Attenuate 8 pixels at a time. | 2315 // Attenuate 8 pixels at a time. |
| 2450 #ifdef HAS_ARGBATTENUATEROW_NEON | |
| 2451 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 2316 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
| 2452 asm volatile ( | 2317 asm volatile ( |
| 2453 // Attenuate 8 pixels. | 2318 // Attenuate 8 pixels. |
| 2454 "1: \n" | 2319 "1: \n" |
| 2455 MEMACCESS(0) | 2320 MEMACCESS(0) |
| 2456 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels | 2321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels |
| 2457 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2322 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2458 "umull v4.8h, v0.8b, v3.8b \n" // b * a | 2323 "umull v4.8h, v0.8b, v3.8b \n" // b * a |
| 2459 "umull v5.8h, v1.8b, v3.8b \n" // g * a | 2324 "umull v5.8h, v1.8b, v3.8b \n" // g * a |
| 2460 "umull v6.8h, v2.8b, v3.8b \n" // r * a | 2325 "umull v6.8h, v2.8b, v3.8b \n" // r * a |
| 2461 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 | 2326 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 |
| 2462 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 | 2327 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 |
| 2463 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 | 2328 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 |
| 2464 MEMACCESS(1) | 2329 MEMACCESS(1) |
| 2465 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels | 2330 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 2466 "b.gt 1b \n" | 2331 "b.gt 1b \n" |
| 2467 : "+r"(src_argb), // %0 | 2332 : "+r"(src_argb), // %0 |
| 2468 "+r"(dst_argb), // %1 | 2333 "+r"(dst_argb), // %1 |
| 2469 "+r"(width) // %2 | 2334 "+r"(width) // %2 |
| 2470 : | 2335 : |
| 2471 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 2336 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
| 2472 ); | 2337 ); |
| 2473 } | 2338 } |
| 2474 #endif // HAS_ARGBATTENUATEROW_NEON | |
| 2475 | 2339 |
| 2476 // Quantize 8 ARGB pixels (32 bytes). | 2340 // Quantize 8 ARGB pixels (32 bytes). |
| 2477 // dst = (dst * scale >> 16) * interval_size + interval_offset; | 2341 // dst = (dst * scale >> 16) * interval_size + interval_offset; |
| 2478 #ifdef HAS_ARGBQUANTIZEROW_NEON | |
| 2479 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, | 2342 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, |
| 2480 int interval_offset, int width) { | 2343 int interval_offset, int width) { |
| 2481 asm volatile ( | 2344 asm volatile ( |
| 2482 "dup v4.8h, %w2 \n" | 2345 "dup v4.8h, %w2 \n" |
| 2483 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 | 2346 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 |
| 2484 "dup v5.8h, %w3 \n" // interval multiply. | 2347 "dup v5.8h, %w3 \n" // interval multiply. |
| 2485 "dup v6.8h, %w4 \n" // interval add | 2348 "dup v6.8h, %w4 \n" // interval add |
| 2486 | 2349 |
| 2487 // 8 pixel loop. | 2350 // 8 pixel loop. |
| 2488 "1: \n" | 2351 "1: \n" |
| (...skipping 19 matching lines...) Expand all Loading... |
| 2508 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels | 2371 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels |
| 2509 "b.gt 1b \n" | 2372 "b.gt 1b \n" |
| 2510 : "+r"(dst_argb), // %0 | 2373 : "+r"(dst_argb), // %0 |
| 2511 "+r"(width) // %1 | 2374 "+r"(width) // %1 |
| 2512 : "r"(scale), // %2 | 2375 : "r"(scale), // %2 |
| 2513 "r"(interval_size), // %3 | 2376 "r"(interval_size), // %3 |
| 2514 "r"(interval_offset) // %4 | 2377 "r"(interval_offset) // %4 |
| 2515 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" | 2378 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" |
| 2516 ); | 2379 ); |
| 2517 } | 2380 } |
| 2518 #endif // HAS_ARGBQUANTIZEROW_NEON | |
| 2519 | 2381 |
| 2520 // Shade 8 pixels at a time by specified value. | 2382 // Shade 8 pixels at a time by specified value. |
| 2521 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. | 2383 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. |
| 2522 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. | 2384 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. |
| 2523 #ifdef HAS_ARGBSHADEROW_NEON | |
| 2524 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, | 2385 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, |
| 2525 uint32 value) { | 2386 uint32 value) { |
| 2526 asm volatile ( | 2387 asm volatile ( |
| 2527 "dup v0.4s, %w3 \n" // duplicate scale value. | 2388 "dup v0.4s, %w3 \n" // duplicate scale value. |
| 2528 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. | 2389 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. |
| 2529 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. | 2390 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. |
| 2530 | 2391 |
| 2531 // 8 pixel loop. | 2392 // 8 pixel loop. |
| 2532 "1: \n" | 2393 "1: \n" |
| 2533 MEMACCESS(0) | 2394 MEMACCESS(0) |
| (...skipping 14 matching lines...) Expand all Loading... |
| 2548 MEMACCESS(1) | 2409 MEMACCESS(1) |
| 2549 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels | 2410 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels |
| 2550 "b.gt 1b \n" | 2411 "b.gt 1b \n" |
| 2551 : "+r"(src_argb), // %0 | 2412 : "+r"(src_argb), // %0 |
| 2552 "+r"(dst_argb), // %1 | 2413 "+r"(dst_argb), // %1 |
| 2553 "+r"(width) // %2 | 2414 "+r"(width) // %2 |
| 2554 : "r"(value) // %3 | 2415 : "r"(value) // %3 |
| 2555 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" | 2416 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" |
| 2556 ); | 2417 ); |
| 2557 } | 2418 } |
| 2558 #endif // HAS_ARGBSHADEROW_NEON | |
| 2559 | 2419 |
| 2560 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels | 2420 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels |
| 2561 // Similar to ARGBToYJ but stores ARGB. | 2421 // Similar to ARGBToYJ but stores ARGB. |
| 2562 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; | 2422 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; |
| 2563 #ifdef HAS_ARGBGRAYROW_NEON | |
| 2564 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { | 2423 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
| 2565 asm volatile ( | 2424 asm volatile ( |
| 2566 "movi v24.8b, #15 \n" // B * 0.11400 coefficient | 2425 "movi v24.8b, #15 \n" // B * 0.11400 coefficient |
| 2567 "movi v25.8b, #75 \n" // G * 0.58700 coefficient | 2426 "movi v25.8b, #75 \n" // G * 0.58700 coefficient |
| 2568 "movi v26.8b, #38 \n" // R * 0.29900 coefficient | 2427 "movi v26.8b, #38 \n" // R * 0.29900 coefficient |
| 2569 "1: \n" | 2428 "1: \n" |
| 2570 MEMACCESS(0) | 2429 MEMACCESS(0) |
| 2571 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2430 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 2572 "subs %w2, %w2, #8 \n" // 8 processed per loop. | 2431 "subs %w2, %w2, #8 \n" // 8 processed per loop. |
| 2573 "umull v4.8h, v0.8b, v24.8b \n" // B | 2432 "umull v4.8h, v0.8b, v24.8b \n" // B |
| 2574 "umlal v4.8h, v1.8b, v25.8b \n" // G | 2433 "umlal v4.8h, v1.8b, v25.8b \n" // G |
| 2575 "umlal v4.8h, v2.8b, v26.8b \n" // R | 2434 "umlal v4.8h, v2.8b, v26.8b \n" // R |
| 2576 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B | 2435 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B |
| 2577 "orr v1.8b, v0.8b, v0.8b \n" // G | 2436 "orr v1.8b, v0.8b, v0.8b \n" // G |
| 2578 "orr v2.8b, v0.8b, v0.8b \n" // R | 2437 "orr v2.8b, v0.8b, v0.8b \n" // R |
| 2579 MEMACCESS(1) | 2438 MEMACCESS(1) |
| 2580 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. | 2439 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. |
| 2581 "b.gt 1b \n" | 2440 "b.gt 1b \n" |
| 2582 : "+r"(src_argb), // %0 | 2441 : "+r"(src_argb), // %0 |
| 2583 "+r"(dst_argb), // %1 | 2442 "+r"(dst_argb), // %1 |
| 2584 "+r"(width) // %2 | 2443 "+r"(width) // %2 |
| 2585 : | 2444 : |
| 2586 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" | 2445 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" |
| 2587 ); | 2446 ); |
| 2588 } | 2447 } |
| 2589 #endif // HAS_ARGBGRAYROW_NEON | |
| 2590 | 2448 |
| 2591 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. | 2449 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
| 2592 // b = (r * 35 + g * 68 + b * 17) >> 7 | 2450 // b = (r * 35 + g * 68 + b * 17) >> 7 |
| 2593 // g = (r * 45 + g * 88 + b * 22) >> 7 | 2451 // g = (r * 45 + g * 88 + b * 22) >> 7 |
| 2594 // r = (r * 50 + g * 98 + b * 24) >> 7 | 2452 // r = (r * 50 + g * 98 + b * 24) >> 7 |
| 2595 | 2453 |
| 2596 #ifdef HAS_ARGBSEPIAROW_NEON | |
| 2597 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { | 2454 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { |
| 2598 asm volatile ( | 2455 asm volatile ( |
| 2599 "movi v20.8b, #17 \n" // BB coefficient | 2456 "movi v20.8b, #17 \n" // BB coefficient |
| 2600 "movi v21.8b, #68 \n" // BG coefficient | 2457 "movi v21.8b, #68 \n" // BG coefficient |
| 2601 "movi v22.8b, #35 \n" // BR coefficient | 2458 "movi v22.8b, #35 \n" // BR coefficient |
| 2602 "movi v24.8b, #22 \n" // GB coefficient | 2459 "movi v24.8b, #22 \n" // GB coefficient |
| 2603 "movi v25.8b, #88 \n" // GG coefficient | 2460 "movi v25.8b, #88 \n" // GG coefficient |
| 2604 "movi v26.8b, #45 \n" // GR coefficient | 2461 "movi v26.8b, #45 \n" // GR coefficient |
| 2605 "movi v28.8b, #24 \n" // RB coefficient | 2462 "movi v28.8b, #24 \n" // RB coefficient |
| 2606 "movi v29.8b, #98 \n" // RG coefficient | 2463 "movi v29.8b, #98 \n" // RG coefficient |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2624 MEMACCESS(0) | 2481 MEMACCESS(0) |
| 2625 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. | 2482 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. |
| 2626 "b.gt 1b \n" | 2483 "b.gt 1b \n" |
| 2627 : "+r"(dst_argb), // %0 | 2484 : "+r"(dst_argb), // %0 |
| 2628 "+r"(width) // %1 | 2485 "+r"(width) // %1 |
| 2629 : | 2486 : |
| 2630 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", | 2487 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", |
| 2631 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" | 2488 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" |
| 2632 ); | 2489 ); |
| 2633 } | 2490 } |
| 2634 #endif // HAS_ARGBSEPIAROW_NEON | |
| 2635 | 2491 |
| 2636 // Transform 8 ARGB pixels (32 bytes) with color matrix. | 2492 // Transform 8 ARGB pixels (32 bytes) with color matrix. |
| 2637 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function | 2493 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function |
| 2638 // needs to saturate. Consider doing a non-saturating version. | 2494 // needs to saturate. Consider doing a non-saturating version. |
| 2639 #ifdef HAS_ARGBCOLORMATRIXROW_NEON | |
| 2640 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, | 2495 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, |
| 2641 const int8* matrix_argb, int width) { | 2496 const int8* matrix_argb, int width) { |
| 2642 asm volatile ( | 2497 asm volatile ( |
| 2643 MEMACCESS(3) | 2498 MEMACCESS(3) |
| 2644 "ld1 {v2.16b}, [%3] \n" // load 4 ARGB coefficient vectors (16 bytes). | 2499 "ld1 {v2.16b}, [%3] \n" // load 4 ARGB coefficient vectors (16 bytes). |
| 2645 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. | 2500 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. |
| 2646 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. | 2501 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. |
| 2647 | 2502 |
| 2648 "1: \n" | 2503 "1: \n" |
| 2649 MEMACCESS(0) | 2504 MEMACCESS(0) |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2689 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. | 2544 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. |
| 2690 "b.gt 1b \n" | 2545 "b.gt 1b \n" |
| 2691 : "+r"(src_argb), // %0 | 2546 : "+r"(src_argb), // %0 |
| 2692 "+r"(dst_argb), // %1 | 2547 "+r"(dst_argb), // %1 |
| 2693 "+r"(width) // %2 | 2548 "+r"(width) // %2 |
| 2694 : "r"(matrix_argb) // %3 | 2549 : "r"(matrix_argb) // %3 |
| 2695 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", | 2550 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", |
| 2696 "v18", "v19", "v22", "v23", "v24", "v25" | 2551 "v18", "v19", "v22", "v23", "v24", "v25" |
| 2697 ); | 2552 ); |
| 2698 } | 2553 } |
| 2699 #endif // HAS_ARGBCOLORMATRIXROW_NEON | |
| 2700 | 2554 |
| 2701 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. | 2555 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. |
| 2702 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. | 2556 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
| 2703 #ifdef HAS_ARGBMULTIPLYROW_NEON | |
| 2704 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2557 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 2705 uint8* dst_argb, int width) { | 2558 uint8* dst_argb, int width) { |
| 2706 asm volatile ( | 2559 asm volatile ( |
| 2707 // 8 pixel loop. | 2560 // 8 pixel loop. |
| 2708 "1: \n" | 2561 "1: \n" |
| 2709 MEMACCESS(0) | 2562 MEMACCESS(0) |
| 2710 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2563 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 2711 MEMACCESS(1) | 2564 MEMACCESS(1) |
| 2712 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2565 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
| 2713 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2566 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
| (...skipping 10 matching lines...) Expand all Loading... |
| 2724 "b.gt 1b \n" | 2577 "b.gt 1b \n" |
| 2725 | 2578 |
| 2726 : "+r"(src_argb0), // %0 | 2579 : "+r"(src_argb0), // %0 |
| 2727 "+r"(src_argb1), // %1 | 2580 "+r"(src_argb1), // %1 |
| 2728 "+r"(dst_argb), // %2 | 2581 "+r"(dst_argb), // %2 |
| 2729 "+r"(width) // %3 | 2582 "+r"(width) // %3 |
| 2730 : | 2583 : |
| 2731 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 2732 ); | 2585 ); |
| 2733 } | 2586 } |
| 2734 #endif // HAS_ARGBMULTIPLYROW_NEON | |
| 2735 | 2587 |
| 2736 // Add 2 rows of ARGB pixels together, 8 pixels at a time. | 2588 // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
| 2737 #ifdef HAS_ARGBADDROW_NEON | |
| 2738 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2589 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 2739 uint8* dst_argb, int width) { | 2590 uint8* dst_argb, int width) { |
| 2740 asm volatile ( | 2591 asm volatile ( |
| 2741 // 8 pixel loop. | 2592 // 8 pixel loop. |
| 2742 "1: \n" | 2593 "1: \n" |
| 2743 MEMACCESS(0) | 2594 MEMACCESS(0) |
| 2744 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2595 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 2745 MEMACCESS(1) | 2596 MEMACCESS(1) |
| 2746 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2597 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
| 2747 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2598 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
| 2748 "uqadd v0.8b, v0.8b, v4.8b \n" | 2599 "uqadd v0.8b, v0.8b, v4.8b \n" |
| 2749 "uqadd v1.8b, v1.8b, v5.8b \n" | 2600 "uqadd v1.8b, v1.8b, v5.8b \n" |
| 2750 "uqadd v2.8b, v2.8b, v6.8b \n" | 2601 "uqadd v2.8b, v2.8b, v6.8b \n" |
| 2751 "uqadd v3.8b, v3.8b, v7.8b \n" | 2602 "uqadd v3.8b, v3.8b, v7.8b \n" |
| 2752 MEMACCESS(2) | 2603 MEMACCESS(2) |
| 2753 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2604 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
| 2754 "b.gt 1b \n" | 2605 "b.gt 1b \n" |
| 2755 | 2606 |
| 2756 : "+r"(src_argb0), // %0 | 2607 : "+r"(src_argb0), // %0 |
| 2757 "+r"(src_argb1), // %1 | 2608 "+r"(src_argb1), // %1 |
| 2758 "+r"(dst_argb), // %2 | 2609 "+r"(dst_argb), // %2 |
| 2759 "+r"(width) // %3 | 2610 "+r"(width) // %3 |
| 2760 : | 2611 : |
| 2761 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2612 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 2762 ); | 2613 ); |
| 2763 } | 2614 } |
| 2764 #endif // HAS_ARGBADDROW_NEON | |
| 2765 | 2615 |
| 2766 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. | 2616 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. |
| 2767 #ifdef HAS_ARGBSUBTRACTROW_NEON | |
| 2768 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, | 2617 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
| 2769 uint8* dst_argb, int width) { | 2618 uint8* dst_argb, int width) { |
| 2770 asm volatile ( | 2619 asm volatile ( |
| 2771 // 8 pixel loop. | 2620 // 8 pixel loop. |
| 2772 "1: \n" | 2621 "1: \n" |
| 2773 MEMACCESS(0) | 2622 MEMACCESS(0) |
| 2774 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. | 2623 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. |
| 2775 MEMACCESS(1) | 2624 MEMACCESS(1) |
| 2776 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. | 2625 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. |
| 2777 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2626 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
| 2778 "uqsub v0.8b, v0.8b, v4.8b \n" | 2627 "uqsub v0.8b, v0.8b, v4.8b \n" |
| 2779 "uqsub v1.8b, v1.8b, v5.8b \n" | 2628 "uqsub v1.8b, v1.8b, v5.8b \n" |
| 2780 "uqsub v2.8b, v2.8b, v6.8b \n" | 2629 "uqsub v2.8b, v2.8b, v6.8b \n" |
| 2781 "uqsub v3.8b, v3.8b, v7.8b \n" | 2630 "uqsub v3.8b, v3.8b, v7.8b \n" |
| 2782 MEMACCESS(2) | 2631 MEMACCESS(2) |
| 2783 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2632 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
| 2784 "b.gt 1b \n" | 2633 "b.gt 1b \n" |
| 2785 | 2634 |
| 2786 : "+r"(src_argb0), // %0 | 2635 : "+r"(src_argb0), // %0 |
| 2787 "+r"(src_argb1), // %1 | 2636 "+r"(src_argb1), // %1 |
| 2788 "+r"(dst_argb), // %2 | 2637 "+r"(dst_argb), // %2 |
| 2789 "+r"(width) // %3 | 2638 "+r"(width) // %3 |
| 2790 : | 2639 : |
| 2791 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" | 2640 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" |
| 2792 ); | 2641 ); |
| 2793 } | 2642 } |
| 2794 #endif // HAS_ARGBSUBTRACTROW_NEON | |
| 2795 | 2643 |
| 2796 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. | 2644 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
| 2797 // A = 255 | 2645 // A = 255 |
| 2798 // R = Sobel | 2646 // R = Sobel |
| 2799 // G = Sobel | 2647 // G = Sobel |
| 2800 // B = Sobel | 2648 // B = Sobel |
| 2801 #ifdef HAS_SOBELROW_NEON | |
| 2802 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2649 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
| 2803 uint8* dst_argb, int width) { | 2650 uint8* dst_argb, int width) { |
| 2804 asm volatile ( | 2651 asm volatile ( |
| 2805 "movi v3.8b, #255 \n" // alpha | 2652 "movi v3.8b, #255 \n" // alpha |
| 2806 // 8 pixel loop. | 2653 // 8 pixel loop. |
| 2807 "1: \n" | 2654 "1: \n" |
| 2808 MEMACCESS(0) | 2655 MEMACCESS(0) |
| 2809 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. | 2656 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. |
| 2810 MEMACCESS(1) | 2657 MEMACCESS(1) |
| 2811 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. | 2658 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. |
| 2812 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2659 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
| 2813 "uqadd v0.8b, v0.8b, v1.8b \n" // add | 2660 "uqadd v0.8b, v0.8b, v1.8b \n" // add |
| 2814 "orr v1.8b, v0.8b, v0.8b \n" | 2661 "orr v1.8b, v0.8b, v0.8b \n" |
| 2815 "orr v2.8b, v0.8b, v0.8b \n" | 2662 "orr v2.8b, v0.8b, v0.8b \n" |
| 2816 MEMACCESS(2) | 2663 MEMACCESS(2) |
| 2817 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2664 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
| 2818 "b.gt 1b \n" | 2665 "b.gt 1b \n" |
| 2819 : "+r"(src_sobelx), // %0 | 2666 : "+r"(src_sobelx), // %0 |
| 2820 "+r"(src_sobely), // %1 | 2667 "+r"(src_sobely), // %1 |
| 2821 "+r"(dst_argb), // %2 | 2668 "+r"(dst_argb), // %2 |
| 2822 "+r"(width) // %3 | 2669 "+r"(width) // %3 |
| 2823 : | 2670 : |
| 2824 : "cc", "memory", "v0", "v1", "v2", "v3" | 2671 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 2825 ); | 2672 ); |
| 2826 } | 2673 } |
| 2827 #endif // HAS_SOBELROW_NEON | |
| 2828 | 2674 |
| 2829 // Adds Sobel X and Sobel Y and stores Sobel into plane. | 2675 // Adds Sobel X and Sobel Y and stores Sobel into plane. |
| 2830 #ifdef HAS_SOBELTOPLANEROW_NEON | |
| 2831 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2676 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
| 2832 uint8* dst_y, int width) { | 2677 uint8* dst_y, int width) { |
| 2833 asm volatile ( | 2678 asm volatile ( |
| 2834 // 16 pixel loop. | 2679 // 16 pixel loop. |
| 2835 "1: \n" | 2680 "1: \n" |
| 2836 MEMACCESS(0) | 2681 MEMACCESS(0) |
| 2837 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. | 2682 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. |
| 2838 MEMACCESS(1) | 2683 MEMACCESS(1) |
| 2839 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. | 2684 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. |
| 2840 "subs %w3, %w3, #16 \n" // 16 processed per loop. | 2685 "subs %w3, %w3, #16 \n" // 16 processed per loop. |
| 2841 "uqadd v0.16b, v0.16b, v1.16b \n" // add | 2686 "uqadd v0.16b, v0.16b, v1.16b \n" // add |
| 2842 MEMACCESS(2) | 2687 MEMACCESS(2) |
| 2843 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. | 2688 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. |
| 2844 "b.gt 1b \n" | 2689 "b.gt 1b \n" |
| 2845 : "+r"(src_sobelx), // %0 | 2690 : "+r"(src_sobelx), // %0 |
| 2846 "+r"(src_sobely), // %1 | 2691 "+r"(src_sobely), // %1 |
| 2847 "+r"(dst_y), // %2 | 2692 "+r"(dst_y), // %2 |
| 2848 "+r"(width) // %3 | 2693 "+r"(width) // %3 |
| 2849 : | 2694 : |
| 2850 : "cc", "memory", "v0", "v1" | 2695 : "cc", "memory", "v0", "v1" |
| 2851 ); | 2696 ); |
| 2852 } | 2697 } |
| 2853 #endif // HAS_SOBELTOPLANEROW_NEON | |
| 2854 | 2698 |
| 2855 // Mixes Sobel X, Sobel Y and Sobel into ARGB. | 2699 // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
| 2856 // A = 255 | 2700 // A = 255 |
| 2857 // R = Sobel X | 2701 // R = Sobel X |
| 2858 // G = Sobel | 2702 // G = Sobel |
| 2859 // B = Sobel Y | 2703 // B = Sobel Y |
| 2860 #ifdef HAS_SOBELXYROW_NEON | |
| 2861 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, | 2704 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, |
| 2862 uint8* dst_argb, int width) { | 2705 uint8* dst_argb, int width) { |
| 2863 asm volatile ( | 2706 asm volatile ( |
| 2864 "movi v3.8b, #255 \n" // alpha | 2707 "movi v3.8b, #255 \n" // alpha |
| 2865 // 8 pixel loop. | 2708 // 8 pixel loop. |
| 2866 "1: \n" | 2709 "1: \n" |
| 2867 MEMACCESS(0) | 2710 MEMACCESS(0) |
| 2868 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. | 2711 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. |
| 2869 MEMACCESS(1) | 2712 MEMACCESS(1) |
| 2870 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. | 2713 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. |
| 2871 "subs %w3, %w3, #8 \n" // 8 processed per loop. | 2714 "subs %w3, %w3, #8 \n" // 8 processed per loop. |
| 2872 "uqadd v1.8b, v0.8b, v2.8b \n" // add | 2715 "uqadd v1.8b, v0.8b, v2.8b \n" // add |
| 2873 MEMACCESS(2) | 2716 MEMACCESS(2) |
| 2874 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels | 2717 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels |
| 2875 "b.gt 1b \n" | 2718 "b.gt 1b \n" |
| 2876 : "+r"(src_sobelx), // %0 | 2719 : "+r"(src_sobelx), // %0 |
| 2877 "+r"(src_sobely), // %1 | 2720 "+r"(src_sobely), // %1 |
| 2878 "+r"(dst_argb), // %2 | 2721 "+r"(dst_argb), // %2 |
| 2879 "+r"(width) // %3 | 2722 "+r"(width) // %3 |
| 2880 : | 2723 : |
| 2881 : "cc", "memory", "v0", "v1", "v2", "v3" | 2724 : "cc", "memory", "v0", "v1", "v2", "v3" |
| 2882 ); | 2725 ); |
| 2883 } | 2726 } |
| 2884 #endif // HAS_SOBELXYROW_NEON | |
| 2885 | 2727 |
| 2886 // SobelX as a matrix is | 2728 // SobelX as a matrix is |
| 2887 // -1 0 1 | 2729 // -1 0 1 |
| 2888 // -2 0 2 | 2730 // -2 0 2 |
| 2889 // -1 0 1 | 2731 // -1 0 1 |
| 2890 #ifdef HAS_SOBELXROW_NEON | |
| 2891 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, | 2732 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, |
| 2892 const uint8* src_y2, uint8* dst_sobelx, int width) { | 2733 const uint8* src_y2, uint8* dst_sobelx, int width) { |
| 2893 asm volatile ( | 2734 asm volatile ( |
| 2894 "1: \n" | 2735 "1: \n" |
| 2895 MEMACCESS(0) | 2736 MEMACCESS(0) |
| 2896 "ld1 {v0.8b}, [%0],%5 \n" // top | 2737 "ld1 {v0.8b}, [%0],%5 \n" // top |
| 2897 MEMACCESS(0) | 2738 MEMACCESS(0) |
| 2898 "ld1 {v1.8b}, [%0],%6 \n" | 2739 "ld1 {v1.8b}, [%0],%6 \n" |
| 2899 "usubl v0.8h, v0.8b, v1.8b \n" | 2740 "usubl v0.8h, v0.8b, v1.8b \n" |
| 2900 MEMACCESS(1) | 2741 MEMACCESS(1) |
| (...skipping 18 matching lines...) Expand all Loading... |
| 2919 : "+r"(src_y0), // %0 | 2760 : "+r"(src_y0), // %0 |
| 2920 "+r"(src_y1), // %1 | 2761 "+r"(src_y1), // %1 |
| 2921 "+r"(src_y2), // %2 | 2762 "+r"(src_y2), // %2 |
| 2922 "+r"(dst_sobelx), // %3 | 2763 "+r"(dst_sobelx), // %3 |
| 2923 "+r"(width) // %4 | 2764 "+r"(width) // %4 |
| 2924 : "r"(2LL), // %5 | 2765 : "r"(2LL), // %5 |
| 2925 "r"(6LL) // %6 | 2766 "r"(6LL) // %6 |
| 2926 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2767 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 2927 ); | 2768 ); |
| 2928 } | 2769 } |
| 2929 #endif // HAS_SOBELXROW_NEON | |
| 2930 | 2770 |
| 2931 // SobelY as a matrix is | 2771 // SobelY as a matrix is |
| 2932 // -1 -2 -1 | 2772 // -1 -2 -1 |
| 2933 // 0 0 0 | 2773 // 0 0 0 |
| 2934 // 1 2 1 | 2774 // 1 2 1 |
| 2935 #ifdef HAS_SOBELYROW_NEON | |
| 2936 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, | 2775 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, |
| 2937 uint8* dst_sobely, int width) { | 2776 uint8* dst_sobely, int width) { |
| 2938 asm volatile ( | 2777 asm volatile ( |
| 2939 "1: \n" | 2778 "1: \n" |
| 2940 MEMACCESS(0) | 2779 MEMACCESS(0) |
| 2941 "ld1 {v0.8b}, [%0],%4 \n" // left | 2780 "ld1 {v0.8b}, [%0],%4 \n" // left |
| 2942 MEMACCESS(1) | 2781 MEMACCESS(1) |
| 2943 "ld1 {v1.8b}, [%1],%4 \n" | 2782 "ld1 {v1.8b}, [%1],%4 \n" |
| 2944 "usubl v0.8h, v0.8b, v1.8b \n" | 2783 "usubl v0.8h, v0.8b, v1.8b \n" |
| 2945 MEMACCESS(0) | 2784 MEMACCESS(0) |
| (...skipping 17 matching lines...) Expand all Loading... |
| 2963 "b.gt 1b \n" | 2802 "b.gt 1b \n" |
| 2964 : "+r"(src_y0), // %0 | 2803 : "+r"(src_y0), // %0 |
| 2965 "+r"(src_y1), // %1 | 2804 "+r"(src_y1), // %1 |
| 2966 "+r"(dst_sobely), // %2 | 2805 "+r"(dst_sobely), // %2 |
| 2967 "+r"(width) // %3 | 2806 "+r"(width) // %3 |
| 2968 : "r"(1LL), // %4 | 2807 : "r"(1LL), // %4 |
| 2969 "r"(6LL) // %5 | 2808 "r"(6LL) // %5 |
| 2970 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List | 2809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List |
| 2971 ); | 2810 ); |
| 2972 } | 2811 } |
| 2973 #endif // HAS_SOBELYROW_NEON | |
| 2974 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) | 2812 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) |
| 2975 | 2813 |
| 2976 #ifdef __cplusplus | 2814 #ifdef __cplusplus |
| 2977 } // extern "C" | 2815 } // extern "C" |
| 2978 } // namespace libyuv | 2816 } // namespace libyuv |
| 2979 #endif | 2817 #endif |
| OLD | NEW |