Chromium Code Reviews

Diff: src/compiler/arm/code-generator-arm.cc

Issue 2856363003: [ARM] Implement irregular vector shuffles for SIMD. (Closed)
Patch Set: Factor out more common shuffle code. Created 3 years, 7 months ago
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/code-generator.h"

#include "src/arm/macro-assembler-arm.h"
#include "src/assembler-inl.h"
#include "src/compilation-info.h"
#include "src/compiler/code-generator-impl.h"
(...skipping 329 matching lines...)
      return pl;
    case kNegative:
      return mi;
    default:
      break;
  }
  UNREACHABLE();
  return kNoCondition;
}

int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
  // If unary shuffle, table is src0 (2 d-registers).
  if (src0.is(src1)) return 2;
  // Binary shuffle, table is src0, src1. They must be consecutive.
  DCHECK_EQ(src0.code() + 1, src1.code());
  return 4;  // 4 d-registers.
}

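A minimal illustration of the table-size convention above, assuming hypothetical register assignments (on ARM, each q-register aliases a consecutive pair of d-registers):

// Sketch (assumed numbering): q1 aliases {d2, d3}, q2 aliases {d4, d5}.
// Unary shuffle:  src0 == src1 == q1 -> table is {d2, d3}, 2 d-registers.
// Binary shuffle: src0 == q1, src1 == q2 -> table is {d2, d3, d4, d5},
// 4 d-registers, which is why src0 and src1 must be consecutive.
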
}  // namespace

#define ASSEMBLE_CHECKED_LOAD_FP(Type)        \
  do {                                        \
    auto result = i.Output##Type##Register(); \
    auto offset = i.InputRegister(0);         \
    if (instr->InputAt(1)->IsRegister()) {    \
      __ cmp(offset, i.InputRegister(1));     \
    } else {                                  \
      __ cmp(offset, i.InputImmediate(1));    \
(...skipping 1819 matching lines...)
    }
    case kArmS32x4TransposeLeft: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [0, 1, 2, 3], src1 = [4, 5, 6, 7]
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon32, dst, kScratchQuadReg);  // dst = [0, 4, 2, 6]
      break;
    }
    case kArmS32x4Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      // Check for in-place shuffles.
      // If dst == src0 == src1, then the shuffle is unary and we only use
      // src0.
      if (dst.is(src0)) {
        __ vmov(kScratchQuadReg, src0);
        src0 = kScratchQuadReg;
      } else if (dst.is(src1)) {
        __ vmov(kScratchQuadReg, src1);
        src1 = kScratchQuadReg;
      }
      // Perform shuffle as a vmov per lane.
      int dst_code = dst.code() * 4;
      int src0_code = src0.code() * 4;
      int src1_code = src1.code() * 4;
      int32_t shuffle = i.InputInt32(2);
      for (int i = 0; i < 4; i++) {
        int lane = shuffle & 0x7;
        int src_code = src0_code;
        if (lane >= 4) {
          src_code = src1_code;
          lane &= 0x3;
        }
        __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
martyn.capewell 2017/05/05 14:28:49 This will become expensive when each s-register move expands to multiple instructions.
bbudge 2017/05/05 20:36:28 Yes, I think I have a TODO to improve VMovExtended.
        shuffle >>= 8;
      }
      break;
    }
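For reference, a scalar sketch of the per-lane decode above, with a hypothetical shuffle immediate; the names here are illustrative, not part of the CL:

// Each byte of the immediate selects one of the 8 s-register lanes spanned
// by src0:src1; lane values 4-7 select from src1.
#include <cstdint>
void DecodeS32x4Shuffle(int32_t shuffle, int lane_out[4], bool from_src1[4]) {
  for (int n = 0; n < 4; n++) {
    int lane = shuffle & 0x7;
    from_src1[n] = lane >= 4;
    lane_out[n] = lane & 0x3;
    shuffle >>= 8;
  }
}
// E.g. shuffle = 0x07020500 yields [src0[0], src1[1], src0[2], src1[3]].
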
    case kArmS32x4TransposeRight: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [4, 5, 6, 7], src1 = [0, 1, 2, 3] (flipped from TransposeLeft).
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon32, kScratchQuadReg, dst);  // dst = [1, 5, 3, 7]
      break;
    }
    case kArmS16x8ZipLeft: {
(...skipping 43 matching lines...)
    }
    case kArmS16x8TransposeRight: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [8, 9, 10, 11, ... 15], src1 = [0, 1, 2, 3, ... 7] (flipped).
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon16, kScratchQuadReg, dst);  // dst = [1, 9, 3, 11, ... 15]
      break;
    }
    case kArmS16x8Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      DwVfpRegister table_base = src0.low();
      int table_size = GetVtblTableSize(src0, src1);
      // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
      int scratch_s_base = kScratchQuadReg.code() * 4;
      for (int j = 0; j < 2; j++) {
        int32_t four_lanes = i.InputInt32(2 + j);
        for (int k = 0; k < 2; k++) {
          uint8_t w0 = (four_lanes & 0xFF) * kShortSize;
          four_lanes >>= 8;
          uint8_t w1 = (four_lanes & 0xFF) * kShortSize;
          four_lanes >>= 8;
          int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
          __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
                  bit_cast<float>(mask));
martyn.capewell 2017/05/05 14:28:49 This may cause a problem - if your mask looks like a NaN, moving it as a float can alter the bit pattern.
bbudge 2017/05/05 20:36:28 I could also finesse the code so NaNs can't be generated.
martyn.capewell 2017/05/08 13:33:53 That will fix the NaN problem. However, the assembler…
        }
      }
      NeonListOperand table(table_base, table_size);
      __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
      __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
      __ vmov(dst, kScratchQuadReg);
martyn.capewell 2017/05/05 14:28:49 If you know dst doesn't alias src0 or src1, vtbl can write directly to dst and the final vmov can be dropped.
bbudge 2017/05/05 20:36:28 Nice, done.
      break;
    }
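A scalar sketch of the lane-mask-to-byte-mask conversion above (hypothetical helper names, not V8 code); it also spells out the NaN hazard raised in the thread, since the masks are moved as float bit patterns:

#include <cstdint>
// kShortSize == 2, so a 16-bit lane index L expands to byte indices 2L, 2L+1.
int32_t LanePairToByteMask(uint8_t lane0, uint8_t lane1) {
  int32_t w0 = lane0 * 2, w1 = lane1 * 2;
  return w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
}
// E.g. lanes (0, 1) -> 0x03020100, the identity byte mask.
// A 32-bit pattern is a NaN when the exponent bits are all ones and the
// mantissa is non-zero; a float move of such a pattern risks quietening it.
bool IsNaNBitPattern(int32_t bits) {
  return (bits & 0x7F800000) == 0x7F800000 && (bits & 0x007FFFFF) != 0;
}
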
    case kArmS8x16ZipLeft: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [0, 1, 2, 3, ... 15], src1 = [16, 17, 18, 19, ... 31]
      __ vmov(dst.high(), src1.low());
      __ vzip(Neon8, dst.low(), dst.high());  // dst = [0, 16, 1, 17, ... 23]
      break;
    }
    case kArmS8x16ZipRight: {
(...skipping 39 matching lines...)
      // src0 = [16, 17, 18, 19, ... 31], src1 = [0, 1, 2, 3, ... 15] (flipped).
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon8, kScratchQuadReg, dst);  // dst = [1, 17, 3, 19, ... 31]
      break;
    }
    case kArmS8x16Concat: {
      __ vext(i.OutputSimd128Register(), i.InputSimd128Register(0),
              i.InputSimd128Register(1), i.InputInt4(2));
      break;
    }
    case kArmS8x16Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      DwVfpRegister table_base = src0.low();
      int table_size = GetVtblTableSize(src0, src1);
      // The shuffle lane mask is already a byte mask; materialize it in
      // kScratchQuadReg.
      int scratch_s_base = kScratchQuadReg.code() * 4;
      for (int j = 0; j < 4; j++) {
        int32_t four_lanes = i.InputInt32(2 + j);
        __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
                bit_cast<float>(four_lanes));
      }
      NeonListOperand table(table_base, table_size);
      __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
      __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
      __ vmov(dst, kScratchQuadReg);
      break;
    }
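Both shuffle cases rely on NEON vtbl; here is a scalar model of its per-byte semantics (a sketch of the architectural behavior, not V8 code):

#include <cstddef>
#include <cstdint>
// vtbl: each index byte selects a byte from the table (8 bytes per
// d-register in the list); out-of-range indices produce zero.
void VtblModel(uint8_t dst[8], const uint8_t* table, size_t table_bytes,
               const uint8_t idx[8]) {
  for (int b = 0; b < 8; b++) {
    dst[b] = (idx[b] < table_bytes) ? table[idx[b]] : 0;
  }
}
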
    case kArmS32x2Reverse: {
      __ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
    }
    case kArmS16x4Reverse: {
      __ vrev64(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
    }
    case kArmS16x2Reverse: {
      __ vrev32(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0));
(...skipping 826 matching lines...)
      padding_size -= v8::internal::Assembler::kInstrSize;
    }
  }
}

#undef __

}  // namespace compiler
}  // namespace internal
}  // namespace v8