| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <arm_neon.h> | 11 #include <arm_neon.h> |
| 12 | 12 |
| 13 static const uint16_t bifilter4_coeff[8][2] = { | 13 static const uint8_t bifilter4_coeff[8][2] = { |
| 14 {128, 0}, | 14 {128, 0}, |
| 15 {112, 16}, | 15 {112, 16}, |
| 16 { 96, 32}, | 16 { 96, 32}, |
| 17 { 80, 48}, | 17 { 80, 48}, |
| 18 { 64, 64}, | 18 { 64, 64}, |
| 19 { 48, 80}, | 19 { 48, 80}, |
| 20 { 32, 96}, | 20 { 32, 96}, |
| 21 { 16, 112} | 21 { 16, 112} |
| 22 }; | 22 }; |
| 23 | 23 |
| (...skipping 33 matching lines...) | (...skipping 33 matching lines...) |
| 57 } else { | 57 } else { |
| 58 d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 58 d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 59 d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 59 d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 60 d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 60 d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 61 d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 61 d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 62 d6u8 = vld1_u8(src_ptr); | 62 d6u8 = vld1_u8(src_ptr); |
| 63 | 63 |
| 64 q1u8 = vcombine_u8(d2u8, d3u8); | 64 q1u8 = vcombine_u8(d2u8, d3u8); |
| 65 q2u8 = vcombine_u8(d4u8, d5u8); | 65 q2u8 = vcombine_u8(d4u8, d5u8); |
| 66 | 66 |
| 67 d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); | 67 d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); |
| 68 d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); | 68 d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); |
| 69 | 69 |
| 70 q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); | 70 q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); |
| 71 q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); | 71 q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); |
| 72 d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); | 72 d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); |
| 73 | 73 |
| 74 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), | 74 d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), |
| 75 vreinterpret_u32_u8(vget_high_u8(q1u8))); | 75 vreinterpret_u32_u8(vget_high_u8(q1u8))); |
| 76 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), | 76 d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), |
| 77 vreinterpret_u32_u8(vget_high_u8(q2u8))); | 77 vreinterpret_u32_u8(vget_high_u8(q2u8))); |
| 78 d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), | 78 d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), |
| (...skipping 69 matching lines...) | (...skipping 69 matching lines...) |
| 148 d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 148 d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 149 d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 149 d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 150 d26u8 = vld1_u8(src_ptr); | 150 d26u8 = vld1_u8(src_ptr); |
| 151 } else { | 151 } else { |
| 152 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 152 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 153 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 153 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 154 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 154 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 155 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 155 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 156 q5u8 = vld1q_u8(src_ptr); | 156 q5u8 = vld1q_u8(src_ptr); |
| 157 | 157 |
| 158 d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); | 158 d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); |
| 159 d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); | 159 d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); |
| 160 | 160 |
| 161 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); | 161 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); |
| 162 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); | 162 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); |
| 163 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); | 163 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); |
| 164 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); | 164 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); |
| 165 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); | 165 q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); |
| 166 | 166 |
| 167 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); | 167 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); |
| 168 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); | 168 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); |
| 169 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); | 169 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); |
| (...skipping 68 matching lines...) | (...skipping 68 matching lines...) |
| 238 d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 238 d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 239 d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 239 d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 240 d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; | 240 d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 241 d30u8 = vld1_u8(src_ptr); | 241 d30u8 = vld1_u8(src_ptr); |
| 242 } else { | 242 } else { |
| 243 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 243 q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 244 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 244 q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 245 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 245 q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 246 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; | 246 q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; |
| 247 | 247 |
| 248 d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); | 248 d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); |
| 249 d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); | 249 d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); |
| 250 | 250 |
| 251 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); | 251 q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); |
| 252 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); | 252 q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); |
| 253 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); | 253 q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); |
| 254 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); | 254 q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); |
| 255 | 255 |
| 256 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); | 256 d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); |
| 257 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); | 257 d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); |
| 258 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); | 258 d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); |
| 259 d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); | 259 d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); |
| (...skipping 430 matching lines...) | (...skipping 430 matching lines...) |
| 690 | 690 |
| 691 q11u8 = q15u8; | 691 q11u8 = q15u8; |
| 692 | 692 |
| 693 vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; | 693 vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; |
| 694 vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; | 694 vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; |
| 695 vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; | 695 vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; |
| 696 vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; | 696 vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; |
| 697 } | 697 } |
| 698 return; | 698 return; |
| 699 } | 699 } |
| OLD | NEW |