Index: source/libvpx/vp8/common/arm/neon/loopfilter_neon.c |
=================================================================== |
--- source/libvpx/vp8/common/arm/neon/loopfilter_neon.c (revision 292608) |
+++ source/libvpx/vp8/common/arm/neon/loopfilter_neon.c (working copy) |
@@ -10,6 +10,7 @@ |
#include <arm_neon.h> |
#include "./vpx_config.h" |
+#include "vpx_ports/arm.h" |
static INLINE void vp8_loop_filter_neon( |
uint8x16_t qblimit, // flimit |
@@ -251,38 +252,56 @@ |
return; |
} |
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6)) |
-#warning Using GCC 4.6 is not recommended |
-// Some versions of gcc4.6 do not correctly process vst4_lane_u8. When built |
-// with any gcc4.6, use the C code. |
-extern void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, |
- const unsigned char *blimit, |
- const unsigned char *limit, |
- const unsigned char *thresh, |
- int count); |
- |
-void vp8_loop_filter_vertical_edge_y_neon( |
- unsigned char *src, |
- int pitch, |
- unsigned char blimit, |
- unsigned char limit, |
- unsigned char thresh) { |
- vp8_loop_filter_vertical_edge_c(src, pitch, &blimit, &limit, &thresh, 2); |
-} |
- |
-void vp8_loop_filter_vertical_edge_uv_neon( |
- unsigned char *u, |
- int pitch, |
- unsigned char blimit, |
- unsigned char limit, |
- unsigned char thresh, |
- unsigned char *v) { |
- vp8_loop_filter_vertical_edge_c(u, pitch, &blimit, &limit, &thresh, 1); |
- vp8_loop_filter_vertical_edge_c(v, pitch, &blimit, &limit, &thresh, 1); |
-} |
-#else |
static INLINE void write_4x8(unsigned char *dst, int pitch, |
const uint8x8x4_t result) { |
+#ifdef VPX_INCOMPATIBLE_GCC |
+ /* |
+ * uint8x8x4_t result |
+ 00 01 02 03 | 04 05 06 07 |
+ 10 11 12 13 | 14 15 16 17 |
+ 20 21 22 23 | 24 25 26 27 |
+ 30 31 32 33 | 34 35 36 37 |
+ --- |
+ * after vtrn_u16 |
+ 00 01 20 21 | 04 05 24 25 |
+ 02 03 22 23 | 06 07 26 27 |
+ 10 11 30 31 | 14 15 34 35 |
+ 12 13 32 33 | 16 17 36 37 |
+ --- |
+ * after vtrn_u8 |
+ 00 10 20 30 | 04 14 24 34 |
+ 01 11 21 31 | 05 15 25 35 |
+ 02 12 22 32 | 06 16 26 36 |
+ 03 13 23 33 | 07 17 27 37 |
+ */ |
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), |
+ vreinterpret_u16_u8(result.val[2])); |
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), |
+ vreinterpret_u16_u8(result.val[3])); |
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), |
+ vreinterpret_u8_u16(r13_u16.val[0])); |
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), |
+ vreinterpret_u8_u16(r13_u16.val[1])); |
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); |
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); |
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); |
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); |
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1); |
+ dst += pitch; |
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1); |
+#else |
vst4_lane_u8(dst, result, 0); |
dst += pitch; |
vst4_lane_u8(dst, result, 1); |
@@ -298,6 +317,7 @@ |
vst4_lane_u8(dst, result, 6); |
dst += pitch; |
vst4_lane_u8(dst, result, 7); |
+#endif // VPX_INCOMPATIBLE_GCC |
} |
void vp8_loop_filter_vertical_edge_y_neon( |
@@ -528,4 +548,3 @@ |
vd = v - 2; |
write_4x8(vd, pitch, q4ResultH); |
} |
-#endif // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6)) |