Chromium Code Reviews

Diff: src/compiler/arm/code-generator-arm.cc

Issue 2856363003: [ARM] Implement irregular vector shuffles for SIMD. (Closed)
Patch Set: Factor out more common shuffle code. Created 3 years, 7 months ago
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/code-generator.h"

#include "src/arm/macro-assembler-arm.h"
#include "src/assembler-inl.h"
#include "src/compilation-info.h"
#include "src/compiler/code-generator-impl.h"
(...skipping 329 matching lines...)
      return pl;
    case kNegative:
      return mi;
    default:
      break;
  }
  UNREACHABLE();
  return kNoCondition;
}

int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
  // If unary shuffle, table is src0 (2 d-registers).
  if (src0.is(src1)) return 2;
  // Binary shuffle, table is src0, src1. They must be consecutive.
  DCHECK_EQ(src0.code() + 1, src1.code());
  return 4;  // 4 d-registers.
}

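A minimal illustration of the table-size convention above, assuming hypothetical register assignments (on ARM, each q-register aliases a consecutive pair of d-registers):

// Sketch (assumed numbering): q1 aliases {d2, d3}, q2 aliases {d4, d5}.
// Unary shuffle:  src0 == src1 == q1 -> table is {d2, d3}, 2 d-registers.
// Binary shuffle: src0 == q1, src1 == q2 -> table is {d2, d3, d4, d5},
// 4 d-registers, which is why src0 and src1 must be consecutive.
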
}  // namespace

#define ASSEMBLE_CHECKED_LOAD_FP(Type)        \
  do {                                        \
    auto result = i.Output##Type##Register(); \
    auto offset = i.InputRegister(0);         \
    if (instr->InputAt(1)->IsRegister()) {    \
      __ cmp(offset, i.InputRegister(1));     \
    } else {                                  \
      __ cmp(offset, i.InputImmediate(1));    \
(...skipping 1819 matching lines...)
    }
    case kArmS32x4TransposeLeft: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [0, 1, 2, 3], src1 = [4, 5, 6, 7]
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon32, dst, kScratchQuadReg);  // dst = [0, 4, 2, 6]
      break;
    }
    case kArmS32x4Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      // Check for in-place shuffles.
      // If dst == src0 == src1, then the shuffle is unary and we only use
      // src0.
      if (dst.is(src0)) {
        __ vmov(kScratchQuadReg, src0);
        src0 = kScratchQuadReg;
      } else if (dst.is(src1)) {
        __ vmov(kScratchQuadReg, src1);
        src1 = kScratchQuadReg;
      }
      // Perform shuffle as a vmov per lane.
      int dst_code = dst.code() * 4;
      int src0_code = src0.code() * 4;
      int src1_code = src1.code() * 4;
      int32_t shuffle = i.InputInt32(2);
      for (int i = 0; i < 4; i++) {
        int lane = shuffle & 0x7;
        int src_code = src0_code;
        if (lane >= 4) {
          src_code = src1_code;
          lane &= 0x3;
        }
        __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
martyn.capewell 2017/05/05 14:28:49 This will become expensive when each s-register move expands to multiple instructions.
bbudge 2017/05/05 20:36:28 Yes, I think I have a TODO to improve VMovExtended.
        shuffle >>= 8;
      }
      break;
    }
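For reference, a scalar sketch of the per-lane decode above, with a hypothetical shuffle immediate; the names here are illustrative, not part of the CL:

// Each byte of the immediate selects one of the 8 s-register lanes spanned
// by src0:src1; lane values 4-7 select from src1.
#include <cstdint>
void DecodeS32x4Shuffle(int32_t shuffle, int lane_out[4], bool from_src1[4]) {
  for (int n = 0; n < 4; n++) {
    int lane = shuffle & 0x7;
    from_src1[n] = lane >= 4;
    lane_out[n] = lane & 0x3;
    shuffle >>= 8;
  }
}
// E.g. shuffle = 0x07020500 yields [src0[0], src1[1], src0[2], src1[3]].
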
    case kArmS32x4TransposeRight: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [4, 5, 6, 7], src1 = [0, 1, 2, 3] (flipped from TransposeLeft).
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon32, kScratchQuadReg, dst);  // dst = [1, 5, 3, 7]
      break;
    }
    case kArmS16x8ZipLeft: {
(...skipping 43 matching lines...)
    }
    case kArmS16x8TransposeRight: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [8, 9, 10, 11, ... 15], src1 = [0, 1, 2, 3, ... 7] (flipped).
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon16, kScratchQuadReg, dst);  // dst = [1, 9, 3, 11, ... 15]
      break;
    }
    case kArmS16x8Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      DwVfpRegister table_base = src0.low();
      int table_size = GetVtblTableSize(src0, src1);
      // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
      int scratch_s_base = kScratchQuadReg.code() * 4;
      for (int j = 0; j < 2; j++) {
        int32_t four_lanes = i.InputInt32(2 + j);
        for (int k = 0; k < 2; k++) {
          uint8_t w0 = (four_lanes & 0xFF) * kShortSize;
          four_lanes >>= 8;
          uint8_t w1 = (four_lanes & 0xFF) * kShortSize;
          four_lanes >>= 8;
          int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
          __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
                  bit_cast<float>(mask));
martyn.capewell 2017/05/05 14:28:49 This may cause a problem - if your mask looks like a NaN, moving it as a float can alter the bit pattern.
bbudge 2017/05/05 20:36:28 I could also finesse the code so NaNs can't be generated.
martyn.capewell 2017/05/08 13:33:53 That will fix the NaN problem. However, the assembler…
        }
      }
      NeonListOperand table(table_base, table_size);
      __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
      __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
      __ vmov(dst, kScratchQuadReg);
martyn.capewell 2017/05/05 14:28:49 If you know dst doesn't alias src0 or src1, vtbl can write directly to dst and the final vmov can be dropped.
bbudge 2017/05/05 20:36:28 Nice, done.
      break;
    }
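A scalar sketch of the lane-mask-to-byte-mask conversion above (hypothetical helper names, not V8 code); it also spells out the NaN hazard raised in the thread, since the masks are moved as float bit patterns:

#include <cstdint>
// kShortSize == 2, so a 16-bit lane index L expands to byte indices 2L, 2L+1.
int32_t LanePairToByteMask(uint8_t lane0, uint8_t lane1) {
  int32_t w0 = lane0 * 2, w1 = lane1 * 2;
  return w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
}
// E.g. lanes (0, 1) -> 0x03020100, the identity byte mask.
// A 32-bit pattern is a NaN when the exponent bits are all ones and the
// mantissa is non-zero; a float move of such a pattern risks quietening it.
bool IsNaNBitPattern(int32_t bits) {
  return (bits & 0x7F800000) == 0x7F800000 && (bits & 0x007FFFFF) != 0;
}
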
    case kArmS8x16ZipLeft: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
      DCHECK(dst.is(i.InputSimd128Register(0)));
      // src0 = [0, 1, 2, 3, ... 15], src1 = [16, 17, 18, 19, ... 31]
      __ vmov(dst.high(), src1.low());
      __ vzip(Neon8, dst.low(), dst.high());  // dst = [0, 16, 1, 17, ... 23]
      break;
    }
    case kArmS8x16ZipRight: {
(...skipping 39 matching lines...)
      // src0 = [16, 17, 18, 19, ... 31], src1 = [0, 1, 2, 3, ... 15] (flipped).
      __ vmov(kScratchQuadReg, src1);
      __ vtrn(Neon8, kScratchQuadReg, dst);  // dst = [1, 17, 3, 19, ... 31]
      break;
    }
    case kArmS8x16Concat: {
      __ vext(i.OutputSimd128Register(), i.InputSimd128Register(0),
              i.InputSimd128Register(1), i.InputInt4(2));
      break;
    }
    case kArmS8x16Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      DwVfpRegister table_base = src0.low();
      int table_size = GetVtblTableSize(src0, src1);
      // The shuffle lane mask is already a byte mask; materialize it in
      // kScratchQuadReg.
      int scratch_s_base = kScratchQuadReg.code() * 4;
      for (int j = 0; j < 4; j++) {
        int32_t four_lanes = i.InputInt32(2 + j);
        __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
                bit_cast<float>(four_lanes));
      }
      NeonListOperand table(table_base, table_size);
      __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
      __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
      __ vmov(dst, kScratchQuadReg);
      break;
    }
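Both shuffle cases rely on NEON vtbl; here is a scalar model of its per-byte semantics (a sketch of the architectural behavior, not V8 code):

#include <cstddef>
#include <cstdint>
// vtbl: each index byte selects a byte from the table (8 bytes per
// d-register in the list); out-of-range indices produce zero.
void VtblModel(uint8_t dst[8], const uint8_t* table, size_t table_bytes,
               const uint8_t idx[8]) {
  for (int b = 0; b < 8; b++) {
    dst[b] = (idx[b] < table_bytes) ? table[idx[b]] : 0;
  }
}
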
    case kArmS32x2Reverse: {
      __ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
    }
    case kArmS16x4Reverse: {
      __ vrev64(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
    }
    case kArmS16x2Reverse: {
      __ vrev32(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0));
(...skipping 826 matching lines...)
      padding_size -= v8::internal::Assembler::kInstrSize;
    }
  }
}

#undef __

}  // namespace compiler
}  // namespace internal
}  // namespace v8