Index: source/libvpx/vp8/common/arm/neon/loopfilter_neon.c |
=================================================================== |
--- source/libvpx/vp8/common/arm/neon/loopfilter_neon.c (revision 292104) |
+++ source/libvpx/vp8/common/arm/neon/loopfilter_neon.c (working copy) |
@@ -251,9 +251,38 @@ |
return; |
} |
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ == 6)) |
+#warning Using GCC 4.6 is not recommended |
+// Some GCC 4.6 releases do not handle vst4_lane_u8 correctly. When building |
+// with any GCC 4.6 release, fall back to the C implementation instead. |
+extern void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, |
+ const unsigned char *blimit, |
+ const unsigned char *limit, |
+ const unsigned char *thresh, |
+ int count); |
+ |
+void vp8_loop_filter_vertical_edge_y_neon( /* GCC 4.6 build only: forwards to the C vertical-edge filter instead of NEON code */ |
+ unsigned char *src, |
+ int pitch, |
+ unsigned char blimit, |
+ unsigned char limit, |
+ unsigned char thresh) { |
+ vp8_loop_filter_vertical_edge_c(src, pitch, &blimit, &limit, &thresh, 2); /* C routine takes the three thresholds by pointer, so pass the addresses of the by-value arguments; count is 2 here (units are defined by the C routine - TODO confirm) */ |
+} |
+ |
+void vp8_loop_filter_vertical_edge_uv_neon( /* GCC 4.6 build only: forwards to the C vertical-edge filter, once for u and once for v */ |
+ unsigned char *u, |
+ int pitch, |
+ unsigned char blimit, |
+ unsigned char limit, |
+ unsigned char thresh, |
+ unsigned char *v) { |
+ vp8_loop_filter_vertical_edge_c(u, pitch, &blimit, &limit, &thresh, 1); /* u buffer: same pitch and thresholds, count 1 */ |
+ vp8_loop_filter_vertical_edge_c(v, pitch, &blimit, &limit, &thresh, 1); /* v buffer: identical call; neither pointer is NULL-checked here, so callers must pass both - NOTE(review): confirm against callers */ |
+} |
+#else |
static INLINE void write_4x8(unsigned char *dst, int pitch, |
const uint8x8x4_t result) { |
-#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) |
vst4_lane_u8(dst, result, 0); |
dst += pitch; |
vst4_lane_u8(dst, result, 1); |
@@ -269,54 +298,6 @@ |
vst4_lane_u8(dst, result, 6); |
dst += pitch; |
vst4_lane_u8(dst, result, 7); |
-#else |
- /* |
- * uint8x8x4_t result |
- 00 01 02 03 | 04 05 06 07 |
- 10 11 12 13 | 14 15 16 17 |
- 20 21 22 23 | 24 25 26 27 |
- 30 31 32 33 | 34 35 36 37 |
- --- |
- * after vtrn_u16 |
- 00 01 20 21 | 04 05 24 25 |
- 02 03 22 23 | 06 07 26 27 |
- 10 11 30 31 | 14 15 34 35 |
- 12 13 32 33 | 16 17 36 37 |
- --- |
- * after vtrn_u8 |
- 00 10 20 30 | 04 14 24 34 |
- 01 11 21 31 | 05 15 25 35 |
- 02 12 22 32 | 06 16 26 36 |
- 03 13 23 33 | 07 17 27 37 |
- */ |
- const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), |
- vreinterpret_u16_u8(result.val[2])); |
- const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), |
- vreinterpret_u16_u8(result.val[3])); |
- const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), |
- vreinterpret_u8_u16(r13_u16.val[0])); |
- const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), |
- vreinterpret_u8_u16(r13_u16.val[1])); |
- const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); |
- const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); |
- const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); |
- const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); |
- vst1_lane_u32((uint32_t *)dst, x_0_4, 0); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_1_5, 0); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_2_6, 0); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_3_7, 0); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_0_4, 1); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_1_5, 1); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_2_6, 1); |
- dst += pitch; |
- vst1_lane_u32((uint32_t *)dst, x_3_7, 1); |
-#endif |
} |
void vp8_loop_filter_vertical_edge_y_neon( |
@@ -547,3 +528,4 @@ |
vd = v - 2; |
write_4x8(vd, pitch, q4ResultH); |
} |
+#endif // (__GNUC__ == 4 && (__GNUC_MINOR__ == 6)) |