source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c - Issue 168343002: libvpx: Pull from upstream

Side by Side Diff: source/libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c

Issue 168343002: libvpx: Pull from upstream (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/libvpx/

Patch Set: libvpx: Pull from upstream Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.	2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license	4 * Use of this source code is governed by a BSD-style license

5 * that can be found in the LICENSE file in the root of the source	5 * that can be found in the LICENSE file in the root of the source

6 * tree. An additional intellectual property rights grant can be found	6 * tree. An additional intellectual property rights grant can be found

7 * in the file PATENTS. All contributing project authors may	7 * in the file PATENTS. All contributing project authors may

8 * be found in the AUTHORS file in the root of the source tree.	8 * be found in the AUTHORS file in the root of the source tree.

9 */	9 */

10	10

11 #include <assert.h>	11 #include <assert.h>

12 #include <stdio.h>	12 #include <stdio.h>

13	13

14 #include "./vpx_config.h"	14 #include "./vpx_config.h"

15 #include "./vp9_rtcd.h"	15 #include "./vp9_rtcd.h"

16 #include "vp9/common/vp9_common.h"	16 #include "vp9/common/vp9_common.h"

17 #include "vp9/common/vp9_blockd.h"	17 #include "vp9/common/vp9_blockd.h"

18 #include "vp9/common/vp9_idct.h"	18 #include "vp9/common/vp9_idct.h"

19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"	19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

20	20

21 #if HAVE_DSPR2	21 #if HAVE_DSPR2

22 static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,	22 static void idct16_rows_dspr2(const int16_t *input, int16_t *output,

23 uint32_t no_rows) {	23 uint32_t no_rows) {

24 int i;	24 int i;

25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;	25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

26 int step1_10, step1_11, step1_12, step1_13;	26 int step1_10, step1_11, step1_12, step1_13;

27 int step2_0, step2_1, step2_2, step2_3;	27 int step2_0, step2_1, step2_2, step2_3;

28 int step2_8, step2_9, step2_10, step2_11;	28 int step2_8, step2_9, step2_10, step2_11;

29 int step2_12, step2_13, step2_14, step2_15;	29 int step2_12, step2_13, step2_14, step2_15;

30 int load1, load2, load3, load4, load5, load6, load7, load8;	30 int load1, load2, load3, load4, load5, load6, load7, load8;

31 int result1, result2, result3, result4;	31 int result1, result2, result3, result4;

32 const int const_2_power_13 = 8192;	32 const int const_2_power_13 = 8192;

33	33

(...skipping 363 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
397 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),	397 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),

398 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),	398 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),

399 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)	399 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)

400 );	400 );

401	401

402 input += 16;	402 input += 16;

403 output += 1;	403 output += 1;

404 }	404 }

405 }	405 }

406	406

407 static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,	407 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,

408 int dest_stride) {	408 int dest_stride) {

409 int i;	409 int i;

410 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;	410 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;

411 int step1_8, step1_9, step1_10, step1_11;	411 int step1_8, step1_9, step1_10, step1_11;

412 int step1_12, step1_13, step1_14, step1_15;	412 int step1_12, step1_13, step1_14, step1_15;

413 int step2_0, step2_1, step2_2, step2_3;	413 int step2_0, step2_1, step2_2, step2_3;

414 int step2_8, step2_9, step2_10, step2_11;	414 int step2_8, step2_9, step2_10, step2_11;

415 int step2_12, step2_13, step2_14, step2_15;	415 int step2_12, step2_13, step2_14, step2_15;

416 int load1, load2, load3, load4, load5, load6, load7, load8;	416 int load1, load2, load3, load4, load5, load6, load7, load8;

417 int result1, result2, result3, result4;	417 int result1, result2, result3, result4;

418 const int const_2_power_13 = 8192;	418 const int const_2_power_13 = 8192;

(...skipping 479 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
898 uint32_t pos = 45;	898 uint32_t pos = 45;

899	899

900 /* bit position for extract from acc */	900 /* bit position for extract from acc */

901 __asm__ __volatile__ (	901 __asm__ __volatile__ (

902 "wrdsp %[pos], 1 \n\t"	902 "wrdsp %[pos], 1 \n\t"

903 :	903 :

904 : [pos] "r" (pos)	904 : [pos] "r" (pos)

905 );	905 );

906	906

907 // First transform rows	907 // First transform rows

908 idct16_1d_rows_dspr2(input, out, 16);	908 idct16_rows_dspr2(input, out, 16);

909	909

910 // Then transform columns and add to dest	910 // Then transform columns and add to dest

911 idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);	911 idct16_cols_add_blk_dspr2(out, dest, dest_stride);

912 }	912 }

913	913

914 static void iadst16_1d(const int16_t *input, int16_t *output) {	914 static void iadst16(const int16_t *input, int16_t *output) {

915 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;	915 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

916	916

917 int x0 = input[15];	917 int x0 = input[15];

918 int x1 = input[0];	918 int x1 = input[0];

919 int x2 = input[13];	919 int x2 = input[13];

920 int x3 = input[2];	920 int x3 = input[2];

921 int x4 = input[11];	921 int x4 = input[11];

922 int x5 = input[4];	922 int x5 = input[4];

923 int x6 = input[9];	923 int x6 = input[9];

924 int x7 = input[6];	924 int x7 = input[6];

(...skipping 167 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1092	1092

1093 /* bit position for extract from acc */	1093 /* bit position for extract from acc */

1094 __asm__ __volatile__ (	1094 __asm__ __volatile__ (

1095 "wrdsp %[pos], 1 \n\t"	1095 "wrdsp %[pos], 1 \n\t"

1096 :	1096 :

1097 : [pos] "r" (pos)	1097 : [pos] "r" (pos)

1098 );	1098 );

1099	1099

1100 switch (tx_type) {	1100 switch (tx_type) {

1101 case DCT_DCT: // DCT in both horizontal and vertical	1101 case DCT_DCT: // DCT in both horizontal and vertical

1102 idct16_1d_rows_dspr2(input, outptr, 16);	1102 idct16_rows_dspr2(input, outptr, 16);

1103 idct16_1d_cols_add_blk_dspr2(out, dest, pitch);	1103 idct16_cols_add_blk_dspr2(out, dest, pitch);

1104 break;	1104 break;

1105 case ADST_DCT: // ADST in vertical, DCT in horizontal	1105 case ADST_DCT: // ADST in vertical, DCT in horizontal

1106 idct16_1d_rows_dspr2(input, outptr, 16);	1106 idct16_rows_dspr2(input, outptr, 16);

1107	1107

1108 outptr = out;	1108 outptr = out;

1109	1109

1110 for (i = 0; i < 16; ++i) {	1110 for (i = 0; i < 16; ++i) {

1111 iadst16_1d(outptr, temp_out);	1111 iadst16(outptr, temp_out);

1112	1112

1113 for (j = 0; j < 16; ++j)	1113 for (j = 0; j < 16; ++j)

1114 dest[j * pitch + i] =	1114 dest[j * pitch + i] =

1115 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)	1115 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

1116 + dest[j * pitch + i]);	1116 + dest[j * pitch + i]);

1117 outptr += 16;	1117 outptr += 16;

1118 }	1118 }

1119 break;	1119 break;

1120 case DCT_ADST: // DCT in vertical, ADST in horizontal	1120 case DCT_ADST: // DCT in vertical, ADST in horizontal

1121 {	1121 {

1122 int16_t temp_in[16 * 16];	1122 int16_t temp_in[16 * 16];

1123	1123

1124 for (i = 0; i < 16; ++i) {	1124 for (i = 0; i < 16; ++i) {

1125 /* prefetch row */	1125 /* prefetch row */

1126 vp9_prefetch_load((const uint8_t *)(input + 16));	1126 vp9_prefetch_load((const uint8_t *)(input + 16));

1127	1127

1128 iadst16_1d(input, outptr);	1128 iadst16(input, outptr);

1129 input += 16;	1129 input += 16;

1130 outptr += 16;	1130 outptr += 16;

1131 }	1131 }

1132	1132

1133 for (i = 0; i < 16; ++i)	1133 for (i = 0; i < 16; ++i)

1134 for (j = 0; j < 16; ++j)	1134 for (j = 0; j < 16; ++j)

1135 temp_in[j * 16 + i] = out[i * 16 + j];	1135 temp_in[j * 16 + i] = out[i * 16 + j];

1136	1136

1137 idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);	1137 idct16_cols_add_blk_dspr2(temp_in, dest, pitch);

1138 }	1138 }

1139 break;	1139 break;

1140 case ADST_ADST: // ADST in both directions	1140 case ADST_ADST: // ADST in both directions

1141 {	1141 {

1142 int16_t temp_in[16];	1142 int16_t temp_in[16];

1143	1143

1144 for (i = 0; i < 16; ++i) {	1144 for (i = 0; i < 16; ++i) {

1145 /* prefetch row */	1145 /* prefetch row */

1146 vp9_prefetch_load((const uint8_t *)(input + 16));	1146 vp9_prefetch_load((const uint8_t *)(input + 16));

1147	1147

1148 iadst16_1d(input, outptr);	1148 iadst16(input, outptr);

1149 input += 16;	1149 input += 16;

1150 outptr += 16;	1150 outptr += 16;

1151 }	1151 }

1152	1152

1153 for (i = 0; i < 16; ++i) {	1153 for (i = 0; i < 16; ++i) {

1154 for (j = 0; j < 16; ++j)	1154 for (j = 0; j < 16; ++j)

1155 temp_in[j] = out[j * 16 + i];	1155 temp_in[j] = out[j * 16 + i];

1156 iadst16_1d(temp_in, temp_out);	1156 iadst16(temp_in, temp_out);

1157 for (j = 0; j < 16; ++j)	1157 for (j = 0; j < 16; ++j)

1158 dest[j * pitch + i] =	1158 dest[j * pitch + i] =

1159 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)	1159 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

1160 + dest[j * pitch + i]);	1160 + dest[j * pitch + i]);

1161 }	1161 }

1162 }	1162 }

1163 break;	1163 break;

1164 default:	1164 default:

1165 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");	1165 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");

1166 break;	1166 break;

1167 }	1167 }

1168 }	1168 }

1169	1169

1170 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,	1170 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,

1171 int dest_stride) {	1171 int dest_stride) {

1172 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);	1172 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);

1173 int16_t *outptr = out;	1173 int16_t *outptr = out;

1174 uint32_t i;	1174 uint32_t i;

1175 uint32_t pos = 45;	1175 uint32_t pos = 45;

1176	1176

1177 /* bit position for extract from acc */	1177 /* bit position for extract from acc */

1178 __asm__ __volatile__ (	1178 __asm__ __volatile__ (

1179 "wrdsp %[pos], 1 \n\t"	1179 "wrdsp %[pos], 1 \n\t"

1180 :	1180 :

1181 : [pos] "r" (pos)	1181 : [pos] "r" (pos)

1182 );	1182 );

1183	1183

1184 // First transform rows. Since all non-zero dct coefficients are in	1184 // First transform rows. Since all non-zero dct coefficients are in

1185 // upper-left 4x4 area, we only need to calculate first 4 rows here.	1185 // upper-left 4x4 area, we only need to calculate first 4 rows here.

1186 idct16_1d_rows_dspr2(input, outptr, 4);	1186 idct16_rows_dspr2(input, outptr, 4);

1187	1187

1188 outptr += 4;	1188 outptr += 4;

1189 for (i = 0; i < 6; ++i) {	1189 for (i = 0; i < 6; ++i) {

1190 __asm__ __volatile__ (	1190 __asm__ __volatile__ (

1191 "sw $zero, 0(%[outptr]) \n\t"	1191 "sw $zero, 0(%[outptr]) \n\t"

1192 "sw $zero, 32(%[outptr]) \n\t"	1192 "sw $zero, 32(%[outptr]) \n\t"

1193 "sw $zero, 64(%[outptr]) \n\t"	1193 "sw $zero, 64(%[outptr]) \n\t"

1194 "sw $zero, 96(%[outptr]) \n\t"	1194 "sw $zero, 96(%[outptr]) \n\t"

1195 "sw $zero, 128(%[outptr]) \n\t"	1195 "sw $zero, 128(%[outptr]) \n\t"

1196 "sw $zero, 160(%[outptr]) \n\t"	1196 "sw $zero, 160(%[outptr]) \n\t"

1197 "sw $zero, 192(%[outptr]) \n\t"	1197 "sw $zero, 192(%[outptr]) \n\t"

1198 "sw $zero, 224(%[outptr]) \n\t"	1198 "sw $zero, 224(%[outptr]) \n\t"

1199 "sw $zero, 256(%[outptr]) \n\t"	1199 "sw $zero, 256(%[outptr]) \n\t"

1200 "sw $zero, 288(%[outptr]) \n\t"	1200 "sw $zero, 288(%[outptr]) \n\t"

1201 "sw $zero, 320(%[outptr]) \n\t"	1201 "sw $zero, 320(%[outptr]) \n\t"

1202 "sw $zero, 352(%[outptr]) \n\t"	1202 "sw $zero, 352(%[outptr]) \n\t"

1203 "sw $zero, 384(%[outptr]) \n\t"	1203 "sw $zero, 384(%[outptr]) \n\t"

1204 "sw $zero, 416(%[outptr]) \n\t"	1204 "sw $zero, 416(%[outptr]) \n\t"

1205 "sw $zero, 448(%[outptr]) \n\t"	1205 "sw $zero, 448(%[outptr]) \n\t"

1206 "sw $zero, 480(%[outptr]) \n\t"	1206 "sw $zero, 480(%[outptr]) \n\t"

1207	1207

1208 :	1208 :

1209 : [outptr] "r" (outptr)	1209 : [outptr] "r" (outptr)

1210 );	1210 );

1211	1211

1212 outptr += 2;	1212 outptr += 2;

1213 }	1213 }

1214	1214

1215 // Then transform columns	1215 // Then transform columns

1216 idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);	1216 idct16_cols_add_blk_dspr2(out, dest, dest_stride);

1217 }	1217 }

1218	1218

1219 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,	1219 void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,

1220 int dest_stride) {	1220 int dest_stride) {

1221 uint32_t pos = 45;	1221 uint32_t pos = 45;

1222 int32_t out;	1222 int32_t out;

1223 int32_t r;	1223 int32_t r;

1224 int32_t a1, absa1;	1224 int32_t a1, absa1;

1225 int32_t vector_a1;	1225 int32_t vector_a1;

1226 int32_t t1, t2, t3, t4;	1226 int32_t t1, t2, t3, t4;

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1306 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),	1306 : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),

1307 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),	1307 [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),

1308 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),	1308 [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),

1309 [dest] "+&r" (dest)	1309 [dest] "+&r" (dest)

1310 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)	1310 : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)

1311 );	1311 );

1312 }	1312 }

1313 }	1313 }

1314 }	1314 }

1315 #endif // #if HAVE_DSPR2	1315 #endif // #if HAVE_DSPR2

OLD	NEW