Chromium Code Reviews

Side by Side Diff: src/opts/SkTextureCompression_opts_neon.cpp

Issue 390453002: Add support for NEON intrinsics to speed up texture compression. We can… (Closed)
Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: Created 6 years, 5 months ago
1 /*
2 * Copyright 2014
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkTextureCompression_opts.h"
9
10 #include <arm_neon.h>
11
12 // Converts the index in each byte of the register from
13 // 0, 1, 2, 3, 4, 5, 6, 7
14 // to
15 // 3, 2, 1, 0, 4, 5, 6, 7
16 //
17 // A more detailed explanation can be found in SkTextureCompressor::convert_indices
18 static inline uint8x16_t convert_indices(const uint8x16_t &x) {
19 static const int8x16_t kThree = {
20 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
21 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
22 };
23
24 static const int8x16_t kZero = {
25 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
26 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
27 };
28
29 // Reinterpret the index bytes as signed so they can be negated
30 int8x16_t sx = vreinterpretq_s8_u8(x);
31
32 // Negate ...
33 sx = vnegq_s8(sx);
34
35 // Add three...
36 sx = vaddq_s8(sx, kThree);
37
38 // Generate negatives mask
39 const int8x16_t mask = vreinterpretq_s8_u8(vcltq_s8(sx, kZero));
40
41 // Absolute value
42 sx = vabsq_s8(sx);
43
44 // Add three to the values that were negative...
45 return vreinterpretq_u8_s8(vaddq_s8(sx, vandq_s8(kThree, mask)));
46 }
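
For reference, this is the same remapping applied to one byte at a time in scalar code. This is only an illustrative sketch; the helper name convert_index_scalar is made up and is not part of the patch:

    // Hypothetical scalar equivalent of convert_indices for a single byte.
    static inline uint8_t convert_index_scalar(uint8_t x) {
        int v = 3 - static_cast<int>(x);  // 0..3 -> 3..0, 4..7 -> -1..-4
        if (v < 0) {
            v = -v + 3;                   // -1..-4 -> 4..7
        }
        return static_cast<uint8_t>(v);
    }
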
47
48 template<unsigned shift>
49 static inline uint64x2_t shift_swap(const uint64x2_t &x, const uint64x2_t &mask) {
50 uint64x2_t t = vandq_u64(mask, veorq_u64(x, vshrq_n_u64(x, shift)));
51 return veorq_u64(x, veorq_u64(t, vshlq_n_u64(t, shift)));
52 }
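
shift_swap is the standard delta-swap bit permutation: the bits selected by mask are exchanged with the bits shift positions above them. A scalar sketch of the same idiom on one 64-bit lane (illustrative only, not part of the patch):

    // Hypothetical scalar version of the delta-swap performed by shift_swap.
    template <unsigned shift>
    static inline uint64_t shift_swap_scalar(uint64_t x, uint64_t mask) {
        const uint64_t t = mask & (x ^ (x >> shift));  // masked pairs of differing bits
        return x ^ t ^ (t << shift);                   // flip both members of each pair
    }
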
53
54 static inline uint64x2_t pack_indices(const uint64x2_t &x) {
55 // x: 00 a e 00 b f 00 c g 00 d h 00 i m 00 j n 00 k o 00 l p
56
57 static const uint64x2_t kMask1 = { 0x3FC0003FC00000ULL, 0x3FC0003FC00000ULL };
58 uint64x2_t ret = shift_swap<10>(x, kMask1);
59
60 // x: b f 00 00 00 a e c g i m 00 00 00 d h j n 00 k o 00 l p
61 static const uint64x2_t kMask2 = { (0x3FULL << 52), (0x3FULL << 52) };
62 static const uint64x2_t kMask3 = { (0x3FULL << 28), (0x3FULL << 28) };
63 const uint64x2_t x1 = vandq_u64(vshlq_n_u64(ret, 52), kMask2);
64 const uint64x2_t x2 = vandq_u64(vshlq_n_u64(ret, 20), kMask3);
65 ret = vshrq_n_u64(vorrq_u64(ret, vorrq_u64(x1, x2)), 16);
66
67 // x: 00 00 00 00 00 00 00 00 b f l p a e c g i m k o d h j n
68
69 static const uint64x2_t kMask4 = { 0xFC0000ULL, 0xFC0000ULL };
70 ret = shift_swap<6>(ret, kMask4);
71
72 #if defined (SK_CPU_BENDIAN)
73 // x: 00 00 00 00 00 00 00 00 b f l p a e i m c g k o d h j n
74
75 static const uint64x2_t kMask5 = { 0x3FULL, 0x3FULL };
76 ret = shift_swap<36>(ret, kMask5);
77
78 // x: 00 00 00 00 00 00 00 00 b f j n a e i m c g k o d h l p
79
80 static const uint64x2_t kMask6 = { 0xFFF000000ULL, 0xFFF000000ULL };
81 ret = shift_swap<12>(ret, kMask6);
82 #else
83 // x: 00 00 00 00 00 00 00 00 c g i m d h l p b f j n a e k o
84
85 static const uint64x2_t kMask5 = { 0xFC0ULL, 0xFC0ULL };
86 ret = shift_swap<36>(ret, kMask5);
87
88 // x: 00 00 00 00 00 00 00 00 a e i m d h l p b f j n c g k o
89
90 static const uint64x2_t kMask6 = { (0xFFFULL << 36), (0xFFFULL << 36) };
91 static const uint64x2_t kMask7 = { 0xFFFFFFULL, 0xFFFFFFULL };
92 static const uint64x2_t kMask8 = { 0xFFFULL, 0xFFFULL };
93 const uint64x2_t y1 = vandq_u64(ret, kMask6);
94 const uint64x2_t y2 = vshlq_n_u64(vandq_u64(ret, kMask7), 12);
95 const uint64x2_t y3 = vandq_u64(vshrq_n_u64(ret, 24), kMask8);
96 ret = vorrq_u64(y1, vorrq_u64(y2, y3));
97 #endif
98
99 // x: 00 00 00 00 00 00 00 00 a e i m b f j n c g k o d h l p
100
101 // Set the header
102 static const uint64x2_t kHeader = { 0x8490000000000000ULL, 0x8490000000000000ULL };
103 return vorrq_u64(kHeader, ret);
104 }
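
The letters a..p in the comments stand for the sixteen 3-bit indices of one 4x4 block; after the swaps above they appear to sit in column order in the low 48 bits of each lane, with the top 16 bits left zero for the fixed header. A scalar restatement of the final step (the helper name assemble_r11eac_block is hypothetical, not part of the patch):

    // Hypothetical scalar view of OR-ing the block header onto the packed indices.
    static inline uint64_t assemble_r11eac_block(uint64_t packedIndices) {
        const uint64_t kHeader = 0x8490000000000000ULL;       // same constant as above
        return kHeader | (packedIndices & 0xFFFFFFFFFFFFULL); // indices occupy the low 48 bits
    }
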
105
106 // Takes a row of alpha values and places the most significant three bits of each byte into
107 // the least significant bits of the same byte
108 static inline uint8x16_t make_index_row(const uint8x16_t &x) {
109 static const uint8x16_t kTopThreeMask = {
110 0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0xE0,
111 0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0xE0, 0xE0,
112 };
113 return vshrq_n_u8(vandq_u8(x, kTopThreeMask), 5);
114 }
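
Equivalently, in scalar terms each alpha byte is quantized to a 3-bit index by keeping only its top three bits. A one-byte sketch (make_index_scalar is an illustrative name, not in the patch):

    // Hypothetical scalar counterpart of make_index_row for one byte.
    static inline uint8_t make_index_scalar(uint8_t alpha) {
        return (alpha & 0xE0) >> 5;  // keep bits 7..5, shift them down to bits 2..0
    }
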
115
116 // Returns true if all of the bits in x are 0.
117 static inline bool is_zero(uint8x16_t x) {
118 // First experiments say that this is way slower than just examining the lanes
119 // but it might need a little more investigation.
120 #if 0
121 // This code path tests the system register for overflow. We trigger
122 // overflow by adding x to a register with all of its bits set. The
123 // first instruction sets the bits.
124 int reg;
125 asm ("VTST.8 %%q0, %q1, %q1\n"
126 "VQADD.u8 %q1, %%q0\n"
127 "VMRS %0, FPSCR\n"
128 : "=r"(reg) : "w"(vreinterpretq_f32_u8(x)) : "q0", "q1");
129
130 // Bit 21 corresponds to the overflow flag.
131 return reg & (0x1 << 21);
132 #else
133 const uint64x2_t cvt = vreinterpretq_u64_u8(x);
134 const uint64_t l1 = vgetq_lane_u64(cvt, 0);
135 return (l1 == 0) && (l1 == vgetq_lane_u64(cvt, 1));
136 #endif
137 }
138
139 #if defined (SK_CPU_BENDIAN)
140 static inline uint64x2_t fix_endianness(uint64x2_t x) {
141 return x;
142 }
143 #else
144 static inline uint64x2_t fix_endianness(uint64x2_t x) {
145 return vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(x)));
146 }
147 #endif
148
149 static void compress_r11eac_blocks(uint64_t** dst, const uint8_t* src, int rowBytes) {
mtklein 2014/07/11 15:24:27 Seems like you don't need the double pointer indirection.
krajcevski 2014/07/11 16:11:47 Done.
150
151 // Try to avoid switching between vector and non-vector ops...
152 const uint8_t *const src1 = src;
153 const uint8_t *const src2 = src + rowBytes;
154 const uint8_t *const src3 = src + 2*rowBytes;
155 const uint8_t *const src4 = src + 3*rowBytes;
156 uint64_t *const dst1 = *dst;
157 uint64_t *const dst2 = *dst + 1;
158 uint64_t *const dst3 = *dst + 2;
159 uint64_t *const dst4 = *dst + 3;
160 *dst += 4;
161
162 const uint8x16_t alphaRow1 = vld1q_u8(src1);
mtklein 2014/07/11 15:24:27 Seems like we can't do this without knowing src is aligned.
krajcevski 2014/07/11 16:11:47 We don't need to check. The ARM intrinsics create loads that don't require alignment.
mtklein 2014/07/11 18:21:19 Ah, neat.
163 const uint8x16_t alphaRow2 = vld1q_u8(src2);
164 const uint8x16_t alphaRow3 = vld1q_u8(src3);
165 const uint8x16_t alphaRow4 = vld1q_u8(src4);
166
167 const uint8x16_t cmp12 = vceqq_u8(alphaRow1, alphaRow2);
168 const uint8x16_t cmp34 = vceqq_u8(alphaRow3, alphaRow4);
169 const uint8x16_t cmp13 = vceqq_u8(alphaRow1, alphaRow3);
170
171 const uint8x16_t cmp = vandq_u8(vandq_u8(cmp12, cmp34), cmp13);
172 const uint8x16_t ncmp = vmvnq_u8(cmp);
173 const uint8x16_t nAlphaRow1 = vmvnq_u8(alphaRow1);
174 if (is_zero(ncmp)) {
175 if (is_zero(alphaRow1)) {
176 static const uint64x2_t kTransparent = { 0x0020000000002000ULL,
177 0x0020000000002000ULL };
178 vst1q_u64(dst1, kTransparent);
179 vst1q_u64(dst3, kTransparent);
180 return;
181 } else if (is_zero(nAlphaRow1)) {
182 vst1q_u64(dst1, vreinterpretq_u64_u8(cmp));
183 vst1q_u64(dst3, vreinterpretq_u64_u8(cmp));
184 return;
185 }
186 }
187
188 const uint8x16_t indexRow1 = convert_indices(make_index_row(alphaRow1));
189 const uint8x16_t indexRow2 = convert_indices(make_index_row(alphaRow2));
190 const uint8x16_t indexRow3 = convert_indices(make_index_row(alphaRow3));
191 const uint8x16_t indexRow4 = convert_indices(make_index_row(alphaRow4));
192
193 const uint64x2_t indexRow12 = vreinterpretq_u64_u8(
194 vorrq_u8(vshlq_n_u8(indexRow1, 3), indexRow2));
195 const uint64x2_t indexRow34 = vreinterpretq_u64_u8(
196 vorrq_u8(vshlq_n_u8(indexRow3, 3), indexRow4));
197
198 static const uint64x2_t kMask1 = { 0xFFFFFFFF00000000ULL, 0xFFFFFFFF00000000ULL };
199 static const uint64x2_t kMask2 = { 0x00000000FFFFFFFFULL, 0x00000000FFFFFFFFULL };
200
201 // Shuffle into 64-bit words
202 const uint64x2_t blockIndicesRight =
203 vorrq_u64(
204 vandq_u64(indexRow12, kMask1),
205 vshrq_n_u64(indexRow34, 32));
206
207 const uint64x2_t blockIndicesLeft =
208 vorrq_u64(
209 vandq_u64(indexRow34, kMask2),
210 vshlq_n_u64(indexRow12, 32));
211
212 const uint64x2_t indicesLeft = fix_endianness(pack_indices(blockIndicesLeft));
213 const uint64x2_t indicesRight = fix_endianness(pack_indices(blockIndicesRight));
214
215 // TODO (krajcevski): Investigate whether we can efficiently
216 // exchange lanes and get more efficient writes to memory
217 // by using vst1q_u64
218 vst1q_lane_u64(dst1, indicesLeft, 0);
219 vst1q_lane_u64(dst2, indicesRight, 0);
220 vst1q_lane_u64(dst3, indicesLeft, 1);
221 vst1q_lane_u64(dst4, indicesRight, 1);
222 }
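
For reference, the early-out above fires when all four source rows are byte-for-byte identical; the inner checks then handle the fully transparent (all zero) and fully opaque (all 0xFF) cases. A scalar sketch of the rows-equal condition (illustrative only; rows_all_equal is not part of the patch):

    // Hypothetical scalar restatement of the is_zero(ncmp) check above.
    #include <cstring>
    static inline bool rows_all_equal(const uint8_t* r1, const uint8_t* r2,
                                      const uint8_t* r3, const uint8_t* r4) {
        return 0 == memcmp(r1, r2, 16) &&   // matches cmp12
               0 == memcmp(r3, r4, 16) &&   // matches cmp34
               0 == memcmp(r1, r3, 16);     // matches cmp13
    }
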
223
224 static bool compress_a8_to_r11eac(uint8_t* dst, const uint8_t* src,
225 int width, int height, int rowBytes) {
226
227 // Since we're going to operate on 4 blocks at a time, the src width
mtklein 2014/07/11 15:24:27 Is it worth falling back to non-vectorized code for dimensions that don't meet these requirements?
krajcevski 2014/07/11 16:11:47 Done.
228 // must be a multiple of 16. However, the height only needs to be a
229 // multiple of 4.
230 if (0 == width || 0 == height || (width % 16) != 0 || (height % 4) != 0) {
231 return false;
232 }
233
234 const int blocksX = width >> 2;
235 const int blocksY = height >> 2;
236
237 SkASSERT((blocksX % 4) == 0);
238
239 uint64_t* encPtr = reinterpret_cast<uint64_t*>(dst);
mtklein 2014/07/11 15:24:27 Don't we also need to check that these are aligned?
krajcevski 2014/07/11 16:11:47 See previous comment on alignment.
240 for (int y = 0; y < blocksY; ++y) {
241 for (int x = 0; x < blocksX; x+=4) {
242 // Compress it
243 compress_r11eac_blocks(&encPtr, src + 4*x, rowBytes);
244 }
245 src += 4 * rowBytes;
246 }
247 return true;
248 }
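
Since each 4x4 tile of A8 pixels compresses to one 8-byte R11 EAC block, a caller sizing the destination buffer could use something like the following sketch (r11eac_compressed_size is a hypothetical helper, not part of the patch):

    // Hypothetical helper: bytes needed for the compressed output.
    #include <cstddef>
    static inline size_t r11eac_compressed_size(int width, int height) {
        // One 64-bit block per 4x4 tile; per the checks in compress_a8_to_r11eac,
        // width must be a multiple of 16 and height a multiple of 4.
        return static_cast<size_t>(width / 4) * static_cast<size_t>(height / 4) * 8;
    }
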
249
250 SkTextureCompressor::CompressionProc
251 SkTextureCompressorGetPlatformProc(SkTextureCompressor::Format fmt) {
mtklein 2014/07/11 15:24:27 Seems like we should also pass the source color type.
krajcevski 2014/07/11 16:11:47 Done.
252 switch(fmt) {
253 case SkTextureCompressor::kR11_EAC_Format:
254 return compress_a8_to_r11eac;
255 default:
256 return NULL;
257 }
258 }
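
In this patch set the returned proc has the same signature as compress_a8_to_r11eac above. A minimal sketch of a hypothetical call site (compress_if_supported is not part of the patch):

    // Hypothetical call site, assuming CompressionProc matches compress_a8_to_r11eac.
    void compress_if_supported(uint8_t* dst, const uint8_t* src,
                               int width, int height, int rowBytes) {
        SkTextureCompressor::CompressionProc proc =
            SkTextureCompressorGetPlatformProc(SkTextureCompressor::kR11_EAC_Format);
        if (NULL != proc && proc(dst, src, width, height, rowBytes)) {
            return;  // compressed with the NEON path
        }
        // ... otherwise fall back to the portable implementation ...
    }
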