OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 #include <assert.h> | |
12 #include <stdio.h> | |
13 | |
14 #include "./vpx_config.h" | 11 #include "./vpx_config.h" |
15 #include "./vp9_rtcd.h" | 12 #include "./vpx_dsp_rtcd.h" |
16 #include "vp9/common/vp9_common.h" | 13 #include "vpx_dsp/mips/inv_txfm_dspr2.h" |
17 #include "vp9/common/vp9_blockd.h" | |
18 #include "vp9/common/vp9_idct.h" | |
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h" | |
20 #include "vpx_dsp/txfm_common.h" | 14 #include "vpx_dsp/txfm_common.h" |
21 #include "vpx_ports/mem.h" | |
22 | 15 |
23 #if HAVE_DSPR2 | 16 #if HAVE_DSPR2 |
24 static void idct16_rows_dspr2(const int16_t *input, int16_t *output, | 17 void idct16_rows_dspr2(const int16_t *input, int16_t *output, |
25 uint32_t no_rows) { | 18 uint32_t no_rows) { |
26 int i; | 19 int i; |
27 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 20 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
28 int step1_10, step1_11, step1_12, step1_13; | 21 int step1_10, step1_11, step1_12, step1_13; |
29 int step2_0, step2_1, step2_2, step2_3; | 22 int step2_0, step2_1, step2_2, step2_3; |
30 int step2_8, step2_9, step2_10, step2_11; | 23 int step2_8, step2_9, step2_10, step2_11; |
31 int step2_12, step2_13, step2_14, step2_15; | 24 int step2_12, step2_13, step2_14, step2_15; |
32 int load1, load2, load3, load4, load5, load6, load7, load8; | 25 int load1, load2, load3, load4, load5, load6, load7, load8; |
33 int result1, result2, result3, result4; | 26 int result1, result2, result3, result4; |
34 const int const_2_power_13 = 8192; | 27 const int const_2_power_13 = 8192; |
35 | 28 |
(...skipping 363 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
399 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), | 392 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5), |
400 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 393 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
401 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) | 394 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13) |
402 ); | 395 ); |
403 | 396 |
404 input += 16; | 397 input += 16; |
405 output += 1; | 398 output += 1; |
406 } | 399 } |
407 } | 400 } |
408 | 401 |
409 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, | 402 void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, |
410 int dest_stride) { | 403 int dest_stride) { |
411 int i; | 404 int i; |
412 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; | 405 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; |
413 int step1_8, step1_9, step1_10, step1_11; | 406 int step1_8, step1_9, step1_10, step1_11; |
414 int step1_12, step1_13, step1_14, step1_15; | 407 int step1_12, step1_13, step1_14, step1_15; |
415 int step2_0, step2_1, step2_2, step2_3; | 408 int step2_0, step2_1, step2_2, step2_3; |
416 int step2_8, step2_9, step2_10, step2_11; | 409 int step2_8, step2_9, step2_10, step2_11; |
417 int step2_12, step2_13, step2_14, step2_15; | 410 int step2_12, step2_13, step2_14, step2_15; |
418 int load1, load2, load3, load4, load5, load6, load7, load8; | 411 int load1, load2, load3, load4, load5, load6, load7, load8; |
419 int result1, result2, result3, result4; | 412 int result1, result2, result3, result4; |
420 const int const_2_power_13 = 8192; | 413 const int const_2_power_13 = 8192; |
(...skipping 466 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
887 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), | 880 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9), |
888 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), | 881 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11), |
889 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), | 882 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13), |
890 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) | 883 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15) |
891 ); | 884 ); |
892 | 885 |
893 input += 16; | 886 input += 16; |
894 } | 887 } |
895 } | 888 } |
896 | 889 |
897 void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | 890 void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, |
898 int dest_stride) { | 891 int dest_stride) { |
899 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | 892 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); |
900 uint32_t pos = 45; | 893 uint32_t pos = 45; |
901 | 894 |
902 /* bit positon for extract from acc */ | 895 /* bit positon for extract from acc */ |
903 __asm__ __volatile__ ( | 896 __asm__ __volatile__ ( |
904 "wrdsp %[pos], 1 \n\t" | 897 "wrdsp %[pos], 1 \n\t" |
905 : | 898 : |
906 : [pos] "r" (pos) | 899 : [pos] "r" (pos) |
907 ); | 900 ); |
908 | 901 |
909 // First transform rows | 902 // First transform rows |
910 idct16_rows_dspr2(input, out, 16); | 903 idct16_rows_dspr2(input, out, 16); |
911 | 904 |
912 // Then transform columns and add to dest | 905 // Then transform columns and add to dest |
913 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | 906 idct16_cols_add_blk_dspr2(out, dest, dest_stride); |
914 } | 907 } |
915 | 908 |
/* Partial 16x16 inverse DCT for sparse blocks (eob <= 10): every
 * non-zero coefficient lies in the upper-left 4x4 corner, so only the
 * first four rows need a row transform; the rest of the intermediate
 * buffer is zero-filled before the column pass. */
void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
                                int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[16 * 16]);  /* row-pass intermediate */
  int16_t *outptr = out;
  uint32_t i;
  uint32_t pos = 45;

  /* bit position for extract from acc (DSP control register setup) */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  idct16_rows_dspr2(input, outptr, 4);

  // Zero the lanes the 4-row pass did not write.  Each "sw" of $zero
  // clears two adjacent int16 values; successive offsets step by
  // 32 bytes (16 int16, i.e. one full row of the 16x16 buffer), so one
  // asm block clears the same pair of lanes in all 16 rows.  Six
  // iterations with outptr += 2 cover the remaining 12 lanes (4..15).
  outptr += 4;
  for (i = 0; i < 6; ++i) {
    __asm__ __volatile__ (
      "sw     $zero,      0(%[outptr])     \n\t"
      "sw     $zero,     32(%[outptr])     \n\t"
      "sw     $zero,     64(%[outptr])     \n\t"
      "sw     $zero,     96(%[outptr])     \n\t"
      "sw     $zero,    128(%[outptr])     \n\t"
      "sw     $zero,    160(%[outptr])     \n\t"
      "sw     $zero,    192(%[outptr])     \n\t"
      "sw     $zero,    224(%[outptr])     \n\t"
      "sw     $zero,    256(%[outptr])     \n\t"
      "sw     $zero,    288(%[outptr])     \n\t"
      "sw     $zero,    320(%[outptr])     \n\t"
      "sw     $zero,    352(%[outptr])     \n\t"
      "sw     $zero,    384(%[outptr])     \n\t"
      "sw     $zero,    416(%[outptr])     \n\t"
      "sw     $zero,    448(%[outptr])     \n\t"
      "sw     $zero,    480(%[outptr])     \n\t"

      :
      : [outptr] "r" (outptr)
    );

    outptr += 2;
  }

  // Then transform columns
  idct16_cols_add_blk_dspr2(out, dest, dest_stride);
}
| 957 |
/* DC-only 16x16 inverse transform: input[0] is the sole non-zero
 * coefficient.  The rounded DC value is added (with per-byte
 * saturation) to every pixel of the 16x16 destination block. */
void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;  /* |a1| replicated into all four byte lanes */
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc (DSP control register setup) */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  /* Apply the row- and column-pass cospi_16_64 scaling to the DC term. */
  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = ROUND_POWER_OF_TWO(out, 6): add 32 then arithmetic shift by 6. */
  __asm__ __volatile__ (
    "addi     %[out],     %[out],    32      \n\t"
    "sra      %[a1],      %[out],    6       \n\t"

    : [out] "+r" (out), [a1] "=r" (a1)
    :
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    /* Negative DC: replicate |a1| across byte lanes and use saturating
     * per-byte subtraction (subu_s.qb clamps each byte at 0). */
    __asm__ __volatile__ (
      "abs        %[absa1],       %[a1]       \n\t"
      "replv.qb   %[vector_a1],   %[absa1]    \n\t"

      : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, processed as four 32-bit words per row. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    /* Non-negative DC: replicate a1 and use saturating per-byte
     * addition (addu_s.qb clamps each byte at 255). */
    __asm__ __volatile__ (
      "replv.qb   %[vector_a1],   %[a1]   \n\t"

      : [vector_a1] "=r" (vector_a1)
      : [a1] "r" (a1)
    );

    /* 16 rows of 16 pixels, processed as four 32-bit words per row. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
        "lw             %[t1],          0(%[dest])                      \n\t"
        "lw             %[t2],          4(%[dest])                      \n\t"
        "lw             %[t3],          8(%[dest])                      \n\t"
        "lw             %[t4],          12(%[dest])                     \n\t"
        "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
        "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
        "sw             %[vector_1],    0(%[dest])                      \n\t"
        "sw             %[vector_2],    4(%[dest])                      \n\t"
        "sw             %[vector_3],    8(%[dest])                      \n\t"
        "sw             %[vector_4],    12(%[dest])                     \n\t"
        "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

        : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
          [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
          [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
          [dest] "+&r" (dest)
        : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
| 1054 |
| 1055 void iadst16_dspr2(const int16_t *input, int16_t *output) { |
917 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; | 1056 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; |
918 | 1057 |
919 int x0 = input[15]; | 1058 int x0 = input[15]; |
920 int x1 = input[0]; | 1059 int x1 = input[0]; |
921 int x2 = input[13]; | 1060 int x2 = input[13]; |
922 int x3 = input[2]; | 1061 int x3 = input[2]; |
923 int x4 = input[11]; | 1062 int x4 = input[11]; |
924 int x5 = input[4]; | 1063 int x5 = input[4]; |
925 int x6 = input[9]; | 1064 int x6 = input[9]; |
926 int x7 = input[6]; | 1065 int x7 = input[6]; |
(...skipping 150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1077 output[8] = x3; | 1216 output[8] = x3; |
1078 output[9] = x11; | 1217 output[9] = x11; |
1079 output[10] = x15; | 1218 output[10] = x15; |
1080 output[11] = x7; | 1219 output[11] = x7; |
1081 output[12] = x5; | 1220 output[12] = x5; |
1082 output[13] = -x13; | 1221 output[13] = -x13; |
1083 output[14] = x9; | 1222 output[14] = x9; |
1084 output[15] = -x1; | 1223 output[15] = -x1; |
1085 } | 1224 } |
1086 | 1225 |
1087 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, | |
1088 int pitch, int tx_type) { | |
1089 int i, j; | |
1090 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
1091 int16_t *outptr = out; | |
1092 int16_t temp_out[16]; | |
1093 uint32_t pos = 45; | |
1094 | 1226 |
1095 /* bit positon for extract from acc */ | 1227 #endif // HAVE_DSPR2 |
1096 __asm__ __volatile__ ( | |
1097 "wrdsp %[pos], 1 \n\t" | |
1098 : | |
1099 : [pos] "r" (pos) | |
1100 ); | |
1101 | |
1102 switch (tx_type) { | |
1103 case DCT_DCT: // DCT in both horizontal and vertical | |
1104 idct16_rows_dspr2(input, outptr, 16); | |
1105 idct16_cols_add_blk_dspr2(out, dest, pitch); | |
1106 break; | |
1107 case ADST_DCT: // ADST in vertical, DCT in horizontal | |
1108 idct16_rows_dspr2(input, outptr, 16); | |
1109 | |
1110 outptr = out; | |
1111 | |
1112 for (i = 0; i < 16; ++i) { | |
1113 iadst16(outptr, temp_out); | |
1114 | |
1115 for (j = 0; j < 16; ++j) | |
1116 dest[j * pitch + i] = | |
1117 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | |
1118 + dest[j * pitch + i]); | |
1119 outptr += 16; | |
1120 } | |
1121 break; | |
1122 case DCT_ADST: // DCT in vertical, ADST in horizontal | |
1123 { | |
1124 int16_t temp_in[16 * 16]; | |
1125 | |
1126 for (i = 0; i < 16; ++i) { | |
1127 /* prefetch row */ | |
1128 prefetch_load((const uint8_t *)(input + 16)); | |
1129 | |
1130 iadst16(input, outptr); | |
1131 input += 16; | |
1132 outptr += 16; | |
1133 } | |
1134 | |
1135 for (i = 0; i < 16; ++i) | |
1136 for (j = 0; j < 16; ++j) | |
1137 temp_in[j * 16 + i] = out[i * 16 + j]; | |
1138 | |
1139 idct16_cols_add_blk_dspr2(temp_in, dest, pitch); | |
1140 } | |
1141 break; | |
1142 case ADST_ADST: // ADST in both directions | |
1143 { | |
1144 int16_t temp_in[16]; | |
1145 | |
1146 for (i = 0; i < 16; ++i) { | |
1147 /* prefetch row */ | |
1148 prefetch_load((const uint8_t *)(input + 16)); | |
1149 | |
1150 iadst16(input, outptr); | |
1151 input += 16; | |
1152 outptr += 16; | |
1153 } | |
1154 | |
1155 for (i = 0; i < 16; ++i) { | |
1156 for (j = 0; j < 16; ++j) | |
1157 temp_in[j] = out[j * 16 + i]; | |
1158 iadst16(temp_in, temp_out); | |
1159 for (j = 0; j < 16; ++j) | |
1160 dest[j * pitch + i] = | |
1161 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) | |
1162 + dest[j * pitch + i]); | |
1163 } | |
1164 } | |
1165 break; | |
1166 default: | |
1167 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); | |
1168 break; | |
1169 } | |
1170 } | |
1171 | |
1172 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, | |
1173 int dest_stride) { | |
1174 DECLARE_ALIGNED(32, int16_t, out[16 * 16]); | |
1175 int16_t *outptr = out; | |
1176 uint32_t i; | |
1177 uint32_t pos = 45; | |
1178 | |
1179 /* bit positon for extract from acc */ | |
1180 __asm__ __volatile__ ( | |
1181 "wrdsp %[pos], 1 \n\t" | |
1182 : | |
1183 : [pos] "r" (pos) | |
1184 ); | |
1185 | |
1186 // First transform rows. Since all non-zero dct coefficients are in | |
1187 // upper-left 4x4 area, we only need to calculate first 4 rows here. | |
1188 idct16_rows_dspr2(input, outptr, 4); | |
1189 | |
1190 outptr += 4; | |
1191 for (i = 0; i < 6; ++i) { | |
1192 __asm__ __volatile__ ( | |
1193 "sw $zero, 0(%[outptr]) \n\t" | |
1194 "sw $zero, 32(%[outptr]) \n\t" | |
1195 "sw $zero, 64(%[outptr]) \n\t" | |
1196 "sw $zero, 96(%[outptr]) \n\t" | |
1197 "sw $zero, 128(%[outptr]) \n\t" | |
1198 "sw $zero, 160(%[outptr]) \n\t" | |
1199 "sw $zero, 192(%[outptr]) \n\t" | |
1200 "sw $zero, 224(%[outptr]) \n\t" | |
1201 "sw $zero, 256(%[outptr]) \n\t" | |
1202 "sw $zero, 288(%[outptr]) \n\t" | |
1203 "sw $zero, 320(%[outptr]) \n\t" | |
1204 "sw $zero, 352(%[outptr]) \n\t" | |
1205 "sw $zero, 384(%[outptr]) \n\t" | |
1206 "sw $zero, 416(%[outptr]) \n\t" | |
1207 "sw $zero, 448(%[outptr]) \n\t" | |
1208 "sw $zero, 480(%[outptr]) \n\t" | |
1209 | |
1210 : | |
1211 : [outptr] "r" (outptr) | |
1212 ); | |
1213 | |
1214 outptr += 2; | |
1215 } | |
1216 | |
1217 // Then transform columns | |
1218 idct16_cols_add_blk_dspr2(out, dest, dest_stride); | |
1219 } | |
1220 | |
1221 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, | |
1222 int dest_stride) { | |
1223 uint32_t pos = 45; | |
1224 int32_t out; | |
1225 int32_t r; | |
1226 int32_t a1, absa1; | |
1227 int32_t vector_a1; | |
1228 int32_t t1, t2, t3, t4; | |
1229 int32_t vector_1, vector_2, vector_3, vector_4; | |
1230 | |
1231 /* bit positon for extract from acc */ | |
1232 __asm__ __volatile__ ( | |
1233 "wrdsp %[pos], 1 \n\t" | |
1234 | |
1235 : | |
1236 : [pos] "r" (pos) | |
1237 ); | |
1238 | |
1239 out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); | |
1240 __asm__ __volatile__ ( | |
1241 "addi %[out], %[out], 32 \n\t" | |
1242 "sra %[a1], %[out], 6 \n\t" | |
1243 | |
1244 : [out] "+r" (out), [a1] "=r" (a1) | |
1245 : | |
1246 ); | |
1247 | |
1248 if (a1 < 0) { | |
1249 /* use quad-byte | |
1250 * input and output memory are four byte aligned */ | |
1251 __asm__ __volatile__ ( | |
1252 "abs %[absa1], %[a1] \n\t" | |
1253 "replv.qb %[vector_a1], %[absa1] \n\t" | |
1254 | |
1255 : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1) | |
1256 : [a1] "r" (a1) | |
1257 ); | |
1258 | |
1259 for (r = 16; r--;) { | |
1260 __asm__ __volatile__ ( | |
1261 "lw %[t1], 0(%[dest]) \n\t" | |
1262 "lw %[t2], 4(%[dest]) \n\t" | |
1263 "lw %[t3], 8(%[dest]) \n\t" | |
1264 "lw %[t4], 12(%[dest]) \n\t" | |
1265 "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
1266 "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
1267 "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
1268 "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
1269 "sw %[vector_1], 0(%[dest]) \n\t" | |
1270 "sw %[vector_2], 4(%[dest]) \n\t" | |
1271 "sw %[vector_3], 8(%[dest]) \n\t" | |
1272 "sw %[vector_4], 12(%[dest]) \n\t" | |
1273 "add %[dest], %[dest], %[dest_stride] \n\t" | |
1274 | |
1275 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
1276 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
1277 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
1278 [dest] "+&r" (dest) | |
1279 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
1280 ); | |
1281 } | |
1282 } else { | |
1283 /* use quad-byte | |
1284 * input and output memory are four byte aligned */ | |
1285 __asm__ __volatile__ ( | |
1286 "replv.qb %[vector_a1], %[a1] \n\t" | |
1287 | |
1288 : [vector_a1] "=r" (vector_a1) | |
1289 : [a1] "r" (a1) | |
1290 ); | |
1291 | |
1292 for (r = 16; r--;) { | |
1293 __asm__ __volatile__ ( | |
1294 "lw %[t1], 0(%[dest]) \n\t" | |
1295 "lw %[t2], 4(%[dest]) \n\t" | |
1296 "lw %[t3], 8(%[dest]) \n\t" | |
1297 "lw %[t4], 12(%[dest]) \n\t" | |
1298 "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" | |
1299 "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" | |
1300 "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" | |
1301 "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" | |
1302 "sw %[vector_1], 0(%[dest]) \n\t" | |
1303 "sw %[vector_2], 4(%[dest]) \n\t" | |
1304 "sw %[vector_3], 8(%[dest]) \n\t" | |
1305 "sw %[vector_4], 12(%[dest]) \n\t" | |
1306 "add %[dest], %[dest], %[dest_stride] \n\t" | |
1307 | |
1308 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4), | |
1309 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2), | |
1310 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4), | |
1311 [dest] "+&r" (dest) | |
1312 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1) | |
1313 ); | |
1314 } | |
1315 } | |
1316 } | |
1317 #endif // #if HAVE_DSPR2 | |
OLD | NEW |