src/compiler/arm/code-generator-arm.cc - Issue 2804883008: [WASM SIMD] Implement horizontal add for float and integer types.

Side by Side Diff: src/compiler/arm/code-generator-arm.cc

Issue 2804883008: [WASM SIMD] Implement horizontal add for float and integer types. (Closed)

Patch Set: Fix MIPS. Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/compiler/code-generator.h"	5 #include "src/compiler/code-generator.h"

6	6

7 #include "src/arm/macro-assembler-arm.h"	7 #include "src/arm/macro-assembler-arm.h"

8 #include "src/assembler-inl.h"	8 #include "src/assembler-inl.h"

9 #include "src/compilation-info.h"	9 #include "src/compilation-info.h"

10 #include "src/compiler/code-generator-impl.h"	10 #include "src/compiler/code-generator-impl.h"

(...skipping 1593 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1604 }	1604 }

1605 case kArmF32x4RecipSqrtApprox: {	1605 case kArmF32x4RecipSqrtApprox: {

1606 __ vrsqrte(i.OutputSimd128Register(), i.InputSimd128Register(0));	1606 __ vrsqrte(i.OutputSimd128Register(), i.InputSimd128Register(0));

1607 break;	1607 break;

1608 }	1608 }

1609 case kArmF32x4Add: {	1609 case kArmF32x4Add: {

1610 __ vadd(i.OutputSimd128Register(), i.InputSimd128Register(0),	1610 __ vadd(i.OutputSimd128Register(), i.InputSimd128Register(0),

1611 i.InputSimd128Register(1));	1611 i.InputSimd128Register(1));

1612 break;	1612 break;

1613 }	1613 }

	1614 case kArmF32x4AddHoriz: {

	1615 Simd128Register dst = i.OutputSimd128Register(),

	1616 src0 = i.InputSimd128Register(0),

	1617 src1 = i.InputSimd128Register(1);

	1618 // Make sure we don't overwrite source data before it's used.

	1619 if (dst.is(src0)) {

	1620 __ vpadd(dst.low(), src0.low(), src0.high());
	georgia.kouveli 2017/04/20 14:53:06: The inputs to all the vpadd instructions seem to be in the wrong order, shouldn't it be high first and low second? (VPADD is not commutative). The RunF32x4HorizOpTest and other tests will not catch this, as the two inputs have splatted values, which means the two pairs in dst.high() will have the same result and the two pairs in dst.low() will also have the same result. It would help to update the test to have a different expected value in each pair, this way you would catch any such issues.
	bbudge 2017/04/21 20:18:58: On 2017/04/20 14:53:06, georgia.kouveli wrote:
	> The inputs to all the vpadd instructions seem to be in the wrong order,
	> shouldn't it be high first and low second? (VPADD is not commutative).
	>
	> The RunF32x4HorizOpTest and other tests will not catch this, as the two inputs
	> have splatted values, which means the two pairs in dst.high() will have the same
	> result and the two pairs in dst.low() will also have the same result. It would
	> help to update the test to have a different expected value in each pair, this
	> way you would catch any such issues.
	I changed the tests to use the permutation test machinery, which assigns unique (integral) values to each lane of each vector. This eliminates a lot of code and results in better tests (no more splatted inputs.) I think there's some confusion in the assembler calls - vpadd stores the first source operand as Vn, and the second source operand as Vm, which is flipped from the normal assembler syntax. I'm not sure what to do about that though, as this matches a lot of existing NEON assembler code. Perhaps another CL to straighten out the confusion?
	1621 if (dst.is(src1)) {

	1622 __ vmov(dst.high(), dst.low());

	1623 } else {

	1624 __ vpadd(dst.high(), src1.low(), src1.high());

	1625 }

	1626 } else {

	1627 __ vpadd(dst.high(), src1.low(), src1.high());

	1628 __ vpadd(dst.low(), src0.low(), src0.high());

	1629 }

	1630 break;

	1631 }

1614 case kArmF32x4Sub: {	1632 case kArmF32x4Sub: {

1615 __ vsub(i.OutputSimd128Register(), i.InputSimd128Register(0),	1633 __ vsub(i.OutputSimd128Register(), i.InputSimd128Register(0),

1616 i.InputSimd128Register(1));	1634 i.InputSimd128Register(1));

1617 break;	1635 break;

1618 }	1636 }

1619 case kArmF32x4Mul: {	1637 case kArmF32x4Mul: {

1620 __ vmul(i.OutputSimd128Register(), i.InputSimd128Register(0),	1638 __ vmul(i.OutputSimd128Register(), i.InputSimd128Register(0),

1621 i.InputSimd128Register(1));	1639 i.InputSimd128Register(1));

1622 break;	1640 break;

1623 }	1641 }

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1702 case kArmI32x4ShrS: {	1720 case kArmI32x4ShrS: {

1703 __ vshr(NeonS32, i.OutputSimd128Register(), i.InputSimd128Register(0),	1721 __ vshr(NeonS32, i.OutputSimd128Register(), i.InputSimd128Register(0),

1704 i.InputInt5(1));	1722 i.InputInt5(1));

1705 break;	1723 break;

1706 }	1724 }

1707 case kArmI32x4Add: {	1725 case kArmI32x4Add: {

1708 __ vadd(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),	1726 __ vadd(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),

1709 i.InputSimd128Register(1));	1727 i.InputSimd128Register(1));

1710 break;	1728 break;

1711 }	1729 }

	1730 case kArmI32x4AddHoriz: {

	1731 Simd128Register dst = i.OutputSimd128Register(),

	1732 src0 = i.InputSimd128Register(0),

	1733 src1 = i.InputSimd128Register(1);

	1734 // Make sure we don't overwrite source data before it's used.
	georgia.kouveli 2017/04/20 14:53:06: Might make sense to factor out this code, since it's repeated for all types.
	bbudge 2017/04/21 20:18:58: On 2017/04/20 14:53:06, georgia.kouveli wrote:
	> Might make sense to factor out this code, since it's repeated for all types.
	Done. (And I did the same for the repetitious narrowing integer conversions too.)
	1735 if (dst.is(src0)) {

	1736 __ vpadd(Neon32, dst.low(), src0.low(), src0.high());

	1737 if (dst.is(src1)) {

	1738 __ vmov(dst.high(), dst.low());

	1739 } else {

	1740 __ vpadd(Neon32, dst.high(), src1.low(), src1.high());

	1741 }

	1742 } else {

	1743 __ vpadd(Neon32, dst.high(), src1.low(), src1.high());

	1744 __ vpadd(Neon32, dst.low(), src0.low(), src0.high());

	1745 }

	1746 break;

	1747 }

1712 case kArmI32x4Sub: {	1748 case kArmI32x4Sub: {

1713 __ vsub(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),	1749 __ vsub(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),

1714 i.InputSimd128Register(1));	1750 i.InputSimd128Register(1));

1715 break;	1751 break;

1716 }	1752 }

1717 case kArmI32x4Mul: {	1753 case kArmI32x4Mul: {

1718 __ vmul(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),	1754 __ vmul(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0),

1719 i.InputSimd128Register(1));	1755 i.InputSimd128Register(1));

1720 break;	1756 break;

1721 }	1757 }

(...skipping 128 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1850 case kArmI16x8Add: {	1886 case kArmI16x8Add: {

1851 __ vadd(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),	1887 __ vadd(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),

1852 i.InputSimd128Register(1));	1888 i.InputSimd128Register(1));

1853 break;	1889 break;

1854 }	1890 }

1855 case kArmI16x8AddSaturateS: {	1891 case kArmI16x8AddSaturateS: {

1856 __ vqadd(NeonS16, i.OutputSimd128Register(), i.InputSimd128Register(0),	1892 __ vqadd(NeonS16, i.OutputSimd128Register(), i.InputSimd128Register(0),

1857 i.InputSimd128Register(1));	1893 i.InputSimd128Register(1));

1858 break;	1894 break;

1859 }	1895 }

	1896 case kArmI16x8AddHoriz: {

	1897 Simd128Register dst = i.OutputSimd128Register(),

	1898 src0 = i.InputSimd128Register(0),

	1899 src1 = i.InputSimd128Register(1);

	1900 // Make sure we don't overwrite source data before it's used.

	1901 if (dst.is(src0)) {

	1902 __ vpadd(Neon16, dst.low(), src0.low(), src0.high());

	1903 if (dst.is(src1)) {

	1904 __ vmov(dst.high(), dst.low());

	1905 } else {

	1906 __ vpadd(Neon16, dst.high(), src1.low(), src1.high());

	1907 }

	1908 } else {

	1909 __ vpadd(Neon16, dst.high(), src1.low(), src1.high());

	1910 __ vpadd(Neon16, dst.low(), src0.low(), src0.high());

	1911 }

	1912 break;

	1913 }

1860 case kArmI16x8Sub: {	1914 case kArmI16x8Sub: {

1861 __ vsub(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),	1915 __ vsub(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0),

1862 i.InputSimd128Register(1));	1916 i.InputSimd128Register(1));

1863 break;	1917 break;

1864 }	1918 }

1865 case kArmI16x8SubSaturateS: {	1919 case kArmI16x8SubSaturateS: {

1866 __ vqsub(NeonS16, i.OutputSimd128Register(), i.InputSimd128Register(0),	1920 __ vqsub(NeonS16, i.OutputSimd128Register(), i.InputSimd128Register(0),

1867 i.InputSimd128Register(1));	1921 i.InputSimd128Register(1));

1868 break;	1922 break;

1869 }	1923 }

(...skipping 148 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2018 case kArmI8x16Add: {	2072 case kArmI8x16Add: {

2019 __ vadd(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),	2073 __ vadd(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),

2020 i.InputSimd128Register(1));	2074 i.InputSimd128Register(1));

2021 break;	2075 break;

2022 }	2076 }

2023 case kArmI8x16AddSaturateS: {	2077 case kArmI8x16AddSaturateS: {

2024 __ vqadd(NeonS8, i.OutputSimd128Register(), i.InputSimd128Register(0),	2078 __ vqadd(NeonS8, i.OutputSimd128Register(), i.InputSimd128Register(0),

2025 i.InputSimd128Register(1));	2079 i.InputSimd128Register(1));

2026 break;	2080 break;

2027 }	2081 }

	2082 case kArmI8x16AddHoriz: {

	2083 Simd128Register dst = i.OutputSimd128Register(),

	2084 src0 = i.InputSimd128Register(0),

	2085 src1 = i.InputSimd128Register(1);

	2086 // Make sure we don't overwrite source data before it's used.

	2087 if (dst.is(src0)) {

	2088 __ vpadd(Neon8, dst.low(), src0.low(), src0.high());

	2089 if (dst.is(src1)) {

	2090 __ vmov(dst.high(), dst.low());

	2091 } else {

	2092 __ vpadd(Neon8, dst.high(), src1.low(), src1.high());

	2093 }

	2094 } else {

	2095 __ vpadd(Neon8, dst.high(), src1.low(), src1.high());

	2096 __ vpadd(Neon8, dst.low(), src0.low(), src0.high());

	2097 }

	2098 break;

	2099 }

2028 case kArmI8x16Sub: {	2100 case kArmI8x16Sub: {

2029 __ vsub(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),	2101 __ vsub(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0),

2030 i.InputSimd128Register(1));	2102 i.InputSimd128Register(1));

2031 break;	2103 break;

2032 }	2104 }

2033 case kArmI8x16SubSaturateS: {	2105 case kArmI8x16SubSaturateS: {

2034 __ vqsub(NeonS8, i.OutputSimd128Register(), i.InputSimd128Register(0),	2106 __ vqsub(NeonS8, i.OutputSimd128Register(), i.InputSimd128Register(0),

2035 i.InputSimd128Register(1));	2107 i.InputSimd128Register(1));

2036 break;	2108 break;

2037 }	2109 }

(...skipping 1124 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3162 padding_size -= v8::internal::Assembler::kInstrSize;	3234 padding_size -= v8::internal::Assembler::kInstrSize;

3163 }	3235 }

3164 }	3236 }

3165 }	3237 }

3166	3238

3167 #undef __	3239 #undef __

3168	3240

3169 } // namespace compiler	3241 } // namespace compiler

3170 } // namespace internal	3242 } // namespace internal

3171 } // namespace v8	3243 } // namespace v8

OLD	NEW

« src/arm/assembler-arm.h ('K') | « src/arm/simulator-arm.cc ('k') | src/compiler/arm/instruction-codes-arm.h » ('j') | src/compiler/machine-operator.cc » ('J')