source/scale_msa.cc - Issue 2527983002: Add MSA optimized ARGB scaling functions

Side by Side Diff: source/scale_msa.cc

Issue 2527983002: Add MSA optimized ARGB scaling functions (Closed)

Patch Set: Corrected patchset files (created 4 years ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2016 The LibYuv Project Authors. All rights reserved.

	3 *

	4 * Use of this source code is governed by a BSD-style license

	5 * that can be found in the LICENSE file in the root of the source

	6 * tree. An additional intellectual property rights grant can be found

	7 * in the file PATENTS. All contributing project authors may

	8 * be found in the AUTHORS file in the root of the source tree.

	9 */

	10

	11 #include "libyuv/scale_row.h"

	12

	13 // This module is for GCC MSA

	14 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

	15 #include "libyuv/macros_msa.h"

	16

	17 #ifdef __cplusplus

	18 namespace libyuv {

	19 extern "C" {

	20 #endif

	21

	// ScaleARGBRowDown2_MSA: 2:1 horizontal point sampling of one ARGB row.
	// Each loop pass loads 64 source bytes (four 16-byte vectors) and stores
	// 32 destination bytes, i.e. 8 output pixels at 4 bytes per pixel.
	//   src_argb   - source row; advanced by 64 bytes per pass.
	//   src_stride - not referenced by this function.
	//   dst_argb   - destination row; advanced by 32 bytes per pass.
	//   dst_width  - number of output pixels. The loop steps by 8 and has no
	//                tail handling, so a partial final group is still written
	//                as a full 32 bytes (presumably a multiple of 8 is
	//                always passed - TODO confirm against the caller).
	22 void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
	fbarchard1 2016/11/30 01:15:48 This function, and Linear, arent really used in pr This function, and Linear, arent really used in production code much, so I would keep this as similar to ScaleARGBRowDown2Linear_MSA as possible, which is 4 at a time. You should expect performance to be the same or slightly better than Linear manojkumar.bhosale 2016/12/01 13:06:06 Done. The performance is slightly degraded (2.6x t Show quoted text On 2016/11/30 01:15:48, fbarchard1 wrote: > This function, and Linear, arent really used in production code much, so I would > keep this as similar to ScaleARGBRowDown2Linear_MSA as possible, which is 4 at a > time. You should expect performance to be the same or slightly better than > Linear Done. The performance is slightly degraded (2.6x to 2.3x) because of reduced instructions to handle load latencies.
	23 ptrdiff_t src_stride,

	24 uint8_t* dst_argb,

	25 int dst_width) {

	26 int x;

	27 v16u8 src0, src1, src2, src3, dst0, dst1;

	28

	29 for (x = 0; x < dst_width; x += 8) {

	// Load 16 consecutive source pixels (4 vectors of 4 pixels each).
	30 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);

	31 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);

	32 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);

	33 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);

	// pckod_w (pack odd words) keeps the odd-indexed 32-bit elements of both
	// operands, so the second pixel of every source pair is the one retained
	// (lane order per the MSA mnemonic; confirm against the ISA manual).
	34 dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);

	35 dst1 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);

	// Store both result vectors; the last argument is the byte distance
	// between the two stores.
	36 ST_UB2(dst0, dst1, dst_argb, 16);

	37 src_argb += 64;

	38 dst_argb += 32;

	39 }

	40 }

	41

	// ScaleARGBRowDown2Linear_MSA: 2:1 horizontal downscale of one ARGB row
	// where each output pixel is the per-byte average of two adjacent source
	// pixels. Each loop pass reads 32 source bytes and stores 16 destination
	// bytes (4 output pixels).
	//   src_argb   - source row; advanced by 32 bytes per pass.
	//   src_stride - not referenced by this function.
	//   dst_argb   - destination row; advanced by 16 bytes per pass.
	//   dst_width  - number of output pixels. The loop steps by 4 and has no
	//                tail handling (presumably a multiple of 4 is always
	//                passed - TODO confirm against the caller).
	42 void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,

	43 ptrdiff_t src_stride,

	44 uint8_t* dst_argb,

	45 int dst_width) {

	46 int x;

	47 v16u8 src0, src1, vec0, vec1, dst0;

	48

	49 for (x = 0; x < dst_width; x += 4) {

	// Load 8 consecutive source pixels.
	50 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);

	51 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);

	// Split into even-indexed pixels (vec0) and odd-indexed pixels (vec1),
	// treating every 32-bit word as one pixel.
	52 vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);

	53 vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);

	// aver_u_b does the add and the rounding in a single unsigned byte
	// operation, so no widening to 16 bits is needed here.
	54 dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);

	55 ST_UB(dst0, dst_argb);

	56 src_argb += 32;

	57 dst_argb += 16;

	58 }

	59 }

	60

	// ScaleARGBRowDown2Box_MSA: 2:1 box filter of ARGB pixels using two rows.
	// Each output byte is the rounded average of four source bytes: the same
	// channel of two horizontally adjacent pixels in the row at src_argb and
	// in the row at src_argb + src_stride.
	//   src_argb   - upper source row; advanced by 32 bytes per pass.
	//   src_stride - byte offset from src_argb to the second row (nxt_argb).
	//   dst_argb   - destination; 16 bytes (4 ARGB pixels) stored per pass.
	//   dst_width  - number of output pixels. The loop steps by 4 and has no
	//                tail handling (presumably a multiple of 4 is always
	//                passed - TODO confirm against the caller).
	// Steps inside the loop (lane details follow the MSA mnemonics; confirm
	// against the MSA instruction set manual):
	//   1. Load 8 pixels from each row (src0/src1 upper, src2/src3 lower).
	//   2. ilvr_b / ilvl_b interleave the bytes of the two rows, so every
	//      16-bit lane holds one channel byte from each row.
	//   3. pckev_d / pckod_d separate the lanes of even source pixels
	//      (reg0, reg1) from those of odd source pixels (reg2, reg3).
	//   4. hadd_u_h adds the two bytes inside every lane, giving the vertical
	//      sum widened to 16 bits.
	//   5. Adding the even and odd registers yields the 2x2 sum per channel.
	//   6. srari_h by 2 is a rounding shift right, matching (sum + 2) >> 2.
	//   7. pckev_b narrows the 16-bit results back to bytes for the store.
	61 void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,

	62 ptrdiff_t src_stride,

	63 uint8_t* dst_argb,

	64 int dst_width) {

	65 int x;

	66 const uint8_t* nxt_argb = src_argb + src_stride;
	fbarchard1 2016/11/30 01:15:48 nit re nxt_argb I havent used this variable name nit re nxt_argb I havent used this variable name before. I see why you did it In the YUV C code I used s and t variables like this: void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; dst += 2; s += 4; t += 4; } if (dst_width & 1) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; } } or sometimes I would number the pointers if there are several rows. On Intel its often better to use stride in the addressing because you can address with base, offset and index and then only increment 1 variable. Is Mips like that? I sometimes go out of my way to use a fixed offset between 2 variables even. The simpliest example of that would be memcpy(s,d,len) instead of while(len--) d++ = s++; you could use: int offset = d-s; while(len--) { d[offset] = s[0]; ++s; } or for C code more readable would be for (int i=0; i < len; ++i) { d[i] = s[i]; } which increments 1 offset instead of 2 pointers. manojkumar.bhosale 2016/12/01 13:06:06 Done. Also, we avoided using stride in addressing Show quoted text On 2016/11/30 01:15:48, fbarchard1 wrote: > nit re nxt_argb > > I havent used this variable name before. 
I see why you did it > In the YUV C code I used s and t variables like this: > > void ScaleRowDown2Box_C(const uint8* src_ptr, > ptrdiff_t src_stride, > uint8* dst, > int dst_width) { > const uint8* s = src_ptr; > const uint8* t = src_ptr + src_stride; > int x; > for (x = 0; x < dst_width - 1; x += 2) { > dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; > dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; > dst += 2; > s += 4; > t += 4; > } > if (dst_width & 1) { > dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; > } > } > > or sometimes I would number the pointers if there are several rows. > > On Intel its often better to use stride in the addressing because you can > address with base, offset and index and then only increment 1 variable. > Is Mips like that? > I sometimes go out of my way to use a fixed offset between 2 variables even. > The simpliest example of that would be memcpy(s,d,len) > instead of > while(len--) > d++ = s++; > you could use: > int offset = d-s; > while(len--) { > d[offset] = s[0]; > ++s; > } > or for C code more readable would be > for (int i=0; i < len; ++i) { > d[i] = s[i]; > } > which increments 1 offset instead of 2 pointers. Done. Also, we avoided using stride in addressing (every time in loop) where compiler may generate multiple pointer manipulation GP instructions impacting core loop's performance.
	67 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;

	68 v8u16 reg0, reg1, reg2, reg3;

	69

	70 for (x = 0; x < dst_width; x += 4) {

	71 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
	fbarchard1 2016/11/30 01:15:48 This is pretty large/slow? This is an important fu This is pretty large/slow? This is an important function, and YUV and some subsampling code will be similar. On intel I took the liberty of doing 2 averages, which has rounding error, but is roughly the same as ScaleARGBRowDown2Linear_MSA done twice. But I think we should move away from that, and do correct rounding, though. On ARM I would do code similar to ScaleARGBRowDown2Linear_MSA but instead of average, do an add that expands to 16 bit values. Do you have something like that? Then the code would be similar to Linear, but done twice, then results adds and shifted/packed down. So it would be 5 instructions per row * 2 rows = 10, add, narrow and store. Roughly 12 instructions. Currently you have 22 instructions. On Intel I might consider using vpmaddubsw because it does a horizontal add (paired) and widens the result to 16 bits. It does a multiply, which could be 1, but could be used to move the result to the upper 8 bits. Its only 0.5 cycles. I dont think MSA (or ARM) has an instruction like that. manojkumar.bhosale 2016/12/01 13:06:06 Modified this function to replace interleave & pac Show quoted text On 2016/11/30 01:15:48, fbarchard1 wrote: > This is pretty large/slow? > This is an important function, and YUV and some subsampling code will be > similar. > > On intel I took the liberty of doing 2 averages, which has rounding error, but > is roughly the same as ScaleARGBRowDown2Linear_MSA done twice. > But I think we should move away from that, and do correct rounding, though. > > On ARM I would do code similar to ScaleARGBRowDown2Linear_MSA but instead of > average, do an add that expands to 16 bit values. Do you have something like > that? > Then the code would be similar to Linear, but done twice, then results adds and > shifted/packed down. > So it would be 5 instructions per row * 2 rows = 10, add, narrow and store. > Roughly 12 instructions. 
Currently you have 22 instructions. > On Intel I might consider using vpmaddubsw because it does a horizontal add > (paired) and widens the result to 16 bits. It does a multiply, which could be > 1, but could be used to move the result to the upper 8 bits. Its only 0.5 > cycles. I dont think MSA (or ARM) has an instruction like that. Modified this function to replace interleave & pack operations with shuffle and fitted in total 18 instructions. Here, strategy of Linear can not be used as is because in Linear, we could do add & rounding with single aver_u_b instruction and within 8 bits only, which is not true here. For MSA, we do not have add with expand.
	72 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);

	73 src2 = (v16u8)__msa_ld_b((v16i8*)nxt_argb, 0);

	74 src3 = (v16u8)__msa_ld_b((v16i8*)nxt_argb, 16);

	75 vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);

	76 vec1 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);

	77 vec2 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);

	78 vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);

	79 reg0 = (v8u16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);

	80 reg1 = (v8u16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);

	81 reg2 = (v8u16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);

	82 reg3 = (v8u16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);

	83 reg0 = (v8u16)__msa_hadd_u_h((v16u8)reg0, (v16u8)reg0);

	84 reg1 = (v8u16)__msa_hadd_u_h((v16u8)reg1, (v16u8)reg1);

	85 reg2 = (v8u16)__msa_hadd_u_h((v16u8)reg2, (v16u8)reg2);

	86 reg3 = (v8u16)__msa_hadd_u_h((v16u8)reg3, (v16u8)reg3);

	87 reg0 += reg2;

	88 reg1 += reg3;

	89 reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);

	90 reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
	fbarchard1 2016/11/30 01:15:48 Does __msa_srari_h() do rounding? Can you refer me Does __msa_srari_h() do rounding? Can you refer me to an instruction guide? The C code does dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; manojkumar.bhosale 2016/12/01 13:06:06 Please refer to the MSA instruction set at, https: Show quoted text On 2016/11/30 01:15:48, fbarchard1 wrote: > Does __msa_srari_h() do rounding? > Can you refer me to an instruction guide? > > The C code does > dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + > src_argb[src_stride + 4] + 2) >> > 2; > Please refer to the MSA instruction set at, https://imgtec.com/documentation/ -> MIPS SIMD -> The MIPS32 SIMD Architecture Module fbarchard1 2016/12/06 00:45:19 Acknowledged. srari is Immediate Shift Right Arith Show quoted text On 2016/12/01 13:06:06, manojkumar.bhosale wrote: > On 2016/11/30 01:15:48, fbarchard1 wrote: > > Does __msa_srari_h() do rounding? > > Can you refer me to an instruction guide? > > > > The C code does > > dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + > > src_argb[src_stride + 4] + 2) >> > > 2; > > > > Please refer to the MSA instruction set at, > https://imgtec.com/documentation/ -> MIPS SIMD -> The MIPS32 SIMD > Architecture Module Acknowledged. srari is Immediate Shift Right Arithmetic Rounded
	91 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);

	92 ST_UB(dst0, dst_argb);

	93 src_argb += 32;

	94 nxt_argb += 32;

	95 dst_argb += 16;

	96 }

	97 }

	98

	// ScaleARGBRowDownEven_MSA: point-samples one ARGB row at a fixed pixel
	// step. Each loop pass copies four 32-bit pixels, taken stepx bytes apart
	// in the source, into four consecutive destination words. No vector
	// instructions are used; the work is done with the LW / SW helpers
	// (presumably 32-bit load / store macros from libyuv/macros_msa.h -
	// TODO confirm).
	//   src_argb   - source row; advanced by 4 * stepx bytes per pass.
	//   src_stride - not referenced by this function.
	//   src_stepx  - distance between sampled pixels, in pixels.
	//   dst_argb   - destination row; advanced by 16 bytes per pass.
	//   dst_width  - number of output pixels. The loop steps by 4 and has no
	//                tail handling (presumably a multiple of 4 is always
	//                passed - TODO confirm against the caller).
	99 void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,

	100 ptrdiff_t src_stride,

	101 int32_t src_stepx,

	102 uint8_t* dst_argb,

	103 int dst_width) {

	104 int x;

	// Convert the pixel step to a byte step (4 bytes per ARGB pixel).
	105 int32_t stepx = src_stepx * 4;

	106 int32_t data0, data1, data2, data3;

	107

	108 for (x = 0; x < dst_width; x += 4) {

	// Gather four pixels spaced stepx bytes apart.
	109 data0 = LW(src_argb);

	110 data1 = LW(src_argb + stepx);

	111 data2 = LW(src_argb + stepx * 2);

	112 data3 = LW(src_argb + stepx * 3);

	// Write them out back to back.
	113 SW(data0, dst_argb);

	114 SW(data1, dst_argb + 4);

	115 SW(data2, dst_argb + 8);

	116 SW(data3, dst_argb + 12);

	117 src_argb += stepx * 4;

	118 dst_argb += 16;

	119 }

	120 }

	121

	// ScaleARGBRowDownEvenBox_MSA: samples ARGB pixels at a fixed pixel step
	// and box-filters a 2x2 neighbourhood at every sample position. For each
	// output pixel it reads two adjacent pixels from the row at src_argb and
	// two from the row at src_argb + src_stride, sums them per channel and
	// applies a rounding shift right by 2.
	//   src_argb   - upper source row; advanced by 4 * stepx bytes per pass.
	//   src_stride - byte offset from src_argb to the second row (nxt_argb).
	//   src_stepx  - distance between sample positions, in pixels.
	//   dst_argb   - destination; 16 bytes (4 ARGB pixels) stored per pass.
	//   dst_width  - number of output pixels. The loop steps by 4 and has no
	//                tail handling (presumably a multiple of 4 is always
	//                passed - TODO confirm against the caller).
	// NOTE(review): this function spells the byte type as uint8 while the
	// other functions in this file use uint8_t - consider unifying.
	122 void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,

	123 ptrdiff_t src_stride,

	124 int src_stepx,

	125 uint8* dst_argb,

	126 int dst_width) {

	127 int x;

	128 const uint8* nxt_argb = src_argb + src_stride;

	// Convert the pixel step to a byte step (4 bytes per ARGB pixel).
	129 int32_t stepx = src_stepx * 4;

	130 int64_t data0, data1, data2, data3;

	// The vectors are zero-initialised because insert_d below takes the
	// current vector value as an input; both 64-bit lanes are overwritten on
	// every pass.
	131 v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};

	132 v16u8 vec0, vec1, vec2, vec3;

	133 v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;

	134 v16u8 dst0;

	135

	136 for (x = 0; x < dst_width; x += 4) {

	// Upper row: LD fetches one 64-bit value per sample position (presumably
	// a 64-bit load macro from libyuv/macros_msa.h - TODO confirm), i.e. two
	// adjacent ARGB pixels.
	137 data0 = LD(src_argb);

	138 data1 = LD(src_argb + stepx);

	139 data2 = LD(src_argb + stepx * 2);

	140 data3 = LD(src_argb + stepx * 3);

	// Pack the four pixel pairs into two vectors (two pairs per vector).
	141 src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);

	142 src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);

	143 src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);

	144 src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);

	// Lower row: same gather at the same horizontal positions.
	145 data0 = LD(nxt_argb);

	146 data1 = LD(nxt_argb + stepx);

	147 data2 = LD(nxt_argb + stepx * 2);

	148 data3 = LD(nxt_argb + stepx * 3);

	149 src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);

	150 src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);

	151 src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);

	152 src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);

	// Interleave upper-row and lower-row bytes so every 16-bit lane holds
	// the same channel from both rows (lane order per the MSA mnemonics;
	// confirm against the ISA manual).
	153 vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);

	154 vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);

	155 vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);

	156 vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);

	// Add the two bytes inside every lane: vertical sums widened to 16 bits.
	157 reg0 = __msa_hadd_u_h(vec0, vec0);

	158 reg1 = __msa_hadd_u_h(vec1, vec1);

	159 reg2 = __msa_hadd_u_h(vec2, vec2);

	160 reg3 = __msa_hadd_u_h(vec3, vec3);

	// Separate the first pixel of each pair (reg4, reg5) from the second
	// pixel of each pair (reg6, reg7).
	161 reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);

	162 reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);

	163 reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);

	164 reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);

	// Horizontal add completes the 2x2 sum for every channel.
	165 reg4 += reg6;

	166 reg5 += reg7;

	// Rounding shift right by 2 turns each sum of four into an average.
	167 reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);

	168 reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);

	// Narrow the 16-bit averages back to bytes and store 4 output pixels.
	169 dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);

	170 ST_UB(dst0, dst_argb);

	171 src_argb += stepx * 4;

	172 nxt_argb += stepx * 4;

	173 dst_argb += 16;

	174 }

	175 }

	176

	177 #ifdef __cplusplus

	178 } // extern "C"

	179 } // namespace libyuv

	180 #endif

	181

	182 #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

OLD	NEW

« no previous file with comments | « source/scale_argb.cc ('k') | no next file » | no next file with comments »