Index: source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c |
=================================================================== |
--- source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c (revision 232232) |
+++ source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c (working copy) |
@@ -11,45 +11,47 @@ |
#include "./vp9_rtcd.h" |
#include "vp9/common/vp9_common.h" |
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, |
- int16_t *output, |
- int output_stride); |
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, |
- int16_t *output, |
- int16_t *pass1Output, |
- int16_t skip_adding, |
- uint8_t *dest, |
- int dest_stride); |
-extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, |
- int16_t *output, |
- int output_stride); |
-extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, |
- int16_t *output, |
- int16_t *pass1Output, |
- int16_t skip_adding, |
- uint8_t *dest, |
- int dest_stride); |
-extern void save_neon_registers(); |
-extern void restore_neon_registers(); |
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, |
+ int16_t *output, |
+ int output_stride); |
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, |
+ int16_t *output, |
+ int16_t *pass1Output, |
+ int16_t skip_adding, |
+ uint8_t *dest, |
+ int dest_stride); |
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, |
+ int16_t *output, |
+ int output_stride); |
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, |
+ int16_t *output, |
+ int16_t *pass1Output, |
+ int16_t skip_adding, |
+ uint8_t *dest, |
+ int dest_stride); |
+/* On ARM NEON, d8-d15 are callee-saved registers and must be preserved. */ |
+extern void vp9_push_neon(int64_t *store); |
+extern void vp9_pop_neon(int64_t *store); |
-void vp9_short_idct16x16_add_neon(int16_t *input, |
- uint8_t *dest, int dest_stride) { |
+void vp9_idct16x16_256_add_neon(const int16_t *input, |
+ uint8_t *dest, int dest_stride) { |
+ int64_t store_reg[8]; |
int16_t pass1_output[16*16] = {0}; |
int16_t row_idct_output[16*16] = {0}; |
// save d8-d15 register values. |
- save_neon_registers(); |
+ vp9_push_neon(store_reg); |
/* Parallel idct on the upper 8 rows */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); |
+ vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7 |
// which will be saved into row_idct_output. |
- vp9_short_idct16x16_add_neon_pass2(input+1, |
+ vp9_idct16x16_256_add_neon_pass2(input+1, |
row_idct_output, |
pass1_output, |
0, |
@@ -59,12 +61,12 @@ |
/* Parallel idct on the lower 8 rows */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); |
+ vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7 |
// which will be saved into row_idct_output. |
- vp9_short_idct16x16_add_neon_pass2(input+8*16+1, |
+ vp9_idct16x16_256_add_neon_pass2(input+8*16+1, |
row_idct_output+8, |
pass1_output, |
0, |
@@ -74,12 +76,12 @@ |
/* Parallel idct on the left 8 columns */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); |
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7. |
// Then add the result to the destination data. |
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, |
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, |
row_idct_output, |
pass1_output, |
1, |
@@ -89,12 +91,12 @@ |
/* Parallel idct on the right 8 columns */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); |
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7. |
// Then add the result to the destination data. |
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, |
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, |
row_idct_output+8, |
pass1_output, |
1, |
@@ -102,28 +104,29 @@ |
dest_stride); |
// restore d8-d15 register values. |
- restore_neon_registers(); |
+ vp9_pop_neon(store_reg); |
return; |
} |
-void vp9_short_idct10_16x16_add_neon(int16_t *input, |
- uint8_t *dest, int dest_stride) { |
+void vp9_idct16x16_10_add_neon(const int16_t *input, |
+ uint8_t *dest, int dest_stride) { |
+ int64_t store_reg[8]; |
int16_t pass1_output[16*16] = {0}; |
int16_t row_idct_output[16*16] = {0}; |
// save d8-d15 register values. |
- save_neon_registers(); |
+ vp9_push_neon(store_reg); |
/* Parallel idct on the upper 8 rows */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); |
+ vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7 |
// which will be saved into row_idct_output. |
- vp9_short_idct10_16x16_add_neon_pass2(input+1, |
+ vp9_idct16x16_10_add_neon_pass2(input+1, |
row_idct_output, |
pass1_output, |
0, |
@@ -135,12 +138,12 @@ |
/* Parallel idct on the left 8 columns */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); |
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7. |
// Then add the result to the destination data. |
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, |
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, |
row_idct_output, |
pass1_output, |
1, |
@@ -150,12 +153,12 @@ |
/* Parallel idct on the right 8 columns */ |
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the |
// stage 6 result in pass1_output. |
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); |
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); |
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines |
// with result in pass1(pass1_output) to calculate final result in stage 7. |
// Then add the result to the destination data. |
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, |
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, |
row_idct_output+8, |
pass1_output, |
1, |
@@ -163,7 +166,7 @@ |
dest_stride); |
// restore d8-d15 register values. |
- restore_neon_registers(); |
+ vp9_pop_neon(store_reg); |
return; |
} |