| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| 11 #include <assert.h> | 11 #include <assert.h> |
| 12 | 12 |
| 13 #include "./vpx_config.h" | 13 #include "./vpx_config.h" |
| 14 #include "./vp9_rtcd.h" | 14 #include "./vp9_rtcd.h" |
| 15 #include "vpx_ports/mem.h" | 15 #include "vpx_ports/mem.h" |
| 16 | 16 |
| 17 typedef void filter8_1dfunction ( | 17 typedef void filter8_1dfunction ( |
| 18 const unsigned char *src_ptr, | 18 const unsigned char *src_ptr, |
| 19 const unsigned int src_pitch, | 19 const ptrdiff_t src_pitch, |
| 20 unsigned char *output_ptr, | 20 unsigned char *output_ptr, |
| 21 unsigned int out_pitch, | 21 ptrdiff_t out_pitch, |
| 22 unsigned int output_height, | 22 unsigned int output_height, |
| 23 const short *filter | 23 const short *filter |
| 24 ); | 24 ); |
| 25 | 25 |
| 26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ |
| 27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ |
| 28 uint8_t *dst, ptrdiff_t dst_stride, \ |
| 29 const int16_t *filter_x, int x_step_q4, \ |
| 30 const int16_t *filter_y, int y_step_q4, \ |
| 31 int w, int h) { \ |
| 32 if (step_q4 == 16 && filter[3] != 128) { \ |
| 33 if (filter[0] || filter[1] || filter[2]) { \ |
| 34 while (w >= 16) { \ |
| 35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ |
| 36 src_stride, \ |
| 37 dst, \ |
| 38 dst_stride, \ |
| 39 h, \ |
| 40 filter); \ |
| 41 src += 16; \ |
| 42 dst += 16; \ |
| 43 w -= 16; \ |
| 44 } \ |
| 45 while (w >= 8) { \ |
| 46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ |
| 47 src_stride, \ |
| 48 dst, \ |
| 49 dst_stride, \ |
| 50 h, \ |
| 51 filter); \ |
| 52 src += 8; \ |
| 53 dst += 8; \ |
| 54 w -= 8; \ |
| 55 } \ |
| 56 while (w >= 4) { \ |
| 57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ |
| 58 src_stride, \ |
| 59 dst, \ |
| 60 dst_stride, \ |
| 61 h, \ |
| 62 filter); \ |
| 63 src += 4; \ |
| 64 dst += 4; \ |
| 65 w -= 4; \ |
| 66 } \ |
| 67 } else { \ |
| 68 while (w >= 16) { \ |
| 69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \ |
| 70 src_stride, \ |
| 71 dst, \ |
| 72 dst_stride, \ |
| 73 h, \ |
| 74 filter); \ |
| 75 src += 16; \ |
| 76 dst += 16; \ |
| 77 w -= 16; \ |
| 78 } \ |
| 79 while (w >= 8) { \ |
| 80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \ |
| 81 src_stride, \ |
| 82 dst, \ |
| 83 dst_stride, \ |
| 84 h, \ |
| 85 filter); \ |
| 86 src += 8; \ |
| 87 dst += 8; \ |
| 88 w -= 8; \ |
| 89 } \ |
| 90 while (w >= 4) { \ |
| 91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \ |
| 92 src_stride, \ |
| 93 dst, \ |
| 94 dst_stride, \ |
| 95 h, \ |
| 96 filter); \ |
| 97 src += 4; \ |
| 98 dst += 4; \ |
| 99 w -= 4; \ |
| 100 } \ |
| 101 } \ |
| 102 } \ |
| 103 if (w) { \ |
| 104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ |
| 105 filter_x, x_step_q4, filter_y, y_step_q4, \ |
| 106 w, h); \ |
| 107 } \ |
| 108 } |
| 109 |
| 110 #define FUN_CONV_2D(avg, opt) \ |
| 111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ |
| 112 uint8_t *dst, ptrdiff_t dst_stride, \ |
| 113 const int16_t *filter_x, int x_step_q4, \ |
| 114 const int16_t *filter_y, int y_step_q4, \ |
| 115 int w, int h) { \ |
| 116 assert(w <= 64); \ |
| 117 assert(h <= 64); \ |
| 118 if (x_step_q4 == 16 && y_step_q4 == 16) { \ |
| 119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ |
| 120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ |
| 121 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ |
| 122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ |
| 123 filter_x, x_step_q4, filter_y, y_step_q4, \ |
| 124 w, h + 7); \ |
| 125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ |
| 126 filter_x, x_step_q4, filter_y, \ |
| 127 y_step_q4, w, h); \ |
| 128 } else { \ |
| 129 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ |
| 130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ |
| 131 filter_x, x_step_q4, filter_y, y_step_q4, \ |
| 132 w, h + 1); \ |
| 133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ |
| 134 filter_x, x_step_q4, filter_y, \ |
| 135 y_step_q4, w, h); \ |
| 136 } \ |
| 137 } else { \ |
| 138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ |
| 139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ |
| 140 } \ |
| 141 } |
| 142 #if HAVE_AVX2 |
| 143 filter8_1dfunction vp9_filter_block1d16_v8_avx2; |
| 144 filter8_1dfunction vp9_filter_block1d16_h8_avx2; |
| 145 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
| 146 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
| 147 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 148 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
| 149 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; |
| 150 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; |
| 151 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; |
| 152 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; |
| 153 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; |
| 154 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; |
| 155 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 |
| 156 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 |
| 157 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 |
| 158 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 |
| 159 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 |
| 160 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 |
| 161 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 |
| 162 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 |
| 163 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 |
| 164 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 |
| 165 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 166 // uint8_t *dst, ptrdiff_t dst_stride, |
| 167 // const int16_t *filter_x, int x_step_q4, |
| 168 // const int16_t *filter_y, int y_step_q4, |
| 169 // int w, int h); |
| 170 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 171 // uint8_t *dst, ptrdiff_t dst_stride, |
| 172 // const int16_t *filter_x, int x_step_q4, |
| 173 // const int16_t *filter_y, int y_step_q4, |
| 174 // int w, int h); |
| 175 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); |
| 176 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); |
| 177 |
| 178 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, |
| 179 // uint8_t *dst, ptrdiff_t dst_stride, |
| 180 // const int16_t *filter_x, int x_step_q4, |
| 181 // const int16_t *filter_y, int y_step_q4, |
| 182 // int w, int h); |
| 183 FUN_CONV_2D(, avx2); |
| 184 #endif |
| 26 #if HAVE_SSSE3 | 185 #if HAVE_SSSE3 |
| 27 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; | 186 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; |
| 28 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; | 187 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; |
| 29 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; | 188 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
| 30 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; | 189 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
| 31 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; | 190 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
| 32 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; | 191 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
| 33 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; | 192 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; |
| 34 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; | 193 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; |
| 35 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; | 194 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |
| 36 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; | 195 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; |
| 37 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; | 196 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; |
| 38 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; | 197 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; |
| 39 | 198 |
| 40 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 199 filter8_1dfunction vp9_filter_block1d16_v2_ssse3; |
| 41 uint8_t *dst, ptrdiff_t dst_stride, | 200 filter8_1dfunction vp9_filter_block1d16_h2_ssse3; |
| 42 const int16_t *filter_x, int x_step_q4, | 201 filter8_1dfunction vp9_filter_block1d8_v2_ssse3; |
| 43 const int16_t *filter_y, int y_step_q4, | 202 filter8_1dfunction vp9_filter_block1d8_h2_ssse3; |
| 44 int w, int h) { | 203 filter8_1dfunction vp9_filter_block1d4_v2_ssse3; |
| 45 /* Ensure the filter can be compressed to int16_t. */ | 204 filter8_1dfunction vp9_filter_block1d4_h2_ssse3; |
| 46 if (x_step_q4 == 16 && filter_x[3] != 128) { | 205 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; |
| 47 while (w >= 16) { | 206 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; |
| 48 vp9_filter_block1d16_h8_ssse3(src, src_stride, | 207 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; |
| 49 dst, dst_stride, | 208 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; |
| 50 h, filter_x); | 209 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; |
| 51 src += 16; | 210 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; |
| 52 dst += 16; | |
| 53 w -= 16; | |
| 54 } | |
| 55 while (w >= 8) { | |
| 56 vp9_filter_block1d8_h8_ssse3(src, src_stride, | |
| 57 dst, dst_stride, | |
| 58 h, filter_x); | |
| 59 src += 8; | |
| 60 dst += 8; | |
| 61 w -= 8; | |
| 62 } | |
| 63 while (w >= 4) { | |
| 64 vp9_filter_block1d4_h8_ssse3(src, src_stride, | |
| 65 dst, dst_stride, | |
| 66 h, filter_x); | |
| 67 src += 4; | |
| 68 dst += 4; | |
| 69 w -= 4; | |
| 70 } | |
| 71 } | |
| 72 if (w) { | |
| 73 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, | |
| 74 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 75 w, h); | |
| 76 } | |
| 77 } | |
| 78 | 211 |
| 79 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 212 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 80 uint8_t *dst, ptrdiff_t dst_stride, | 213 // uint8_t *dst, ptrdiff_t dst_stride, |
| 81 const int16_t *filter_x, int x_step_q4, | 214 // const int16_t *filter_x, int x_step_q4, |
| 82 const int16_t *filter_y, int y_step_q4, | 215 // const int16_t *filter_y, int y_step_q4, |
| 83 int w, int h) { | 216 // int w, int h); |
| 84 if (y_step_q4 == 16 && filter_y[3] != 128) { | 217 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 85 while (w >= 16) { | 218 // uint8_t *dst, ptrdiff_t dst_stride, |
| 86 vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, | 219 // const int16_t *filter_x, int x_step_q4, |
| 87 dst, dst_stride, | 220 // const int16_t *filter_y, int y_step_q4, |
| 88 h, filter_y); | 221 // int w, int h); |
| 89 src += 16; | 222 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 90 dst += 16; | 223 // uint8_t *dst, ptrdiff_t dst_stride, |
| 91 w -= 16; | 224 // const int16_t *filter_x, int x_step_q4, |
| 92 } | 225 // const int16_t *filter_y, int y_step_q4, |
| 93 while (w >= 8) { | 226 // int w, int h); |
| 94 vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, | 227 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 95 dst, dst_stride, | 228 // uint8_t *dst, ptrdiff_t dst_stride, |
| 96 h, filter_y); | 229 // const int16_t *filter_x, int x_step_q4, |
| 97 src += 8; | 230 // const int16_t *filter_y, int y_step_q4, |
| 98 dst += 8; | 231 // int w, int h); |
| 99 w -= 8; | 232 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); |
| 100 } | 233 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); |
| 101 while (w >= 4) { | 234 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); |
| 102 vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, | 235 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, |
| 103 dst, dst_stride, | 236 ssse3); |
| 104 h, filter_y); | |
| 105 src += 4; | |
| 106 dst += 4; | |
| 107 w -= 4; | |
| 108 } | |
| 109 } | |
| 110 if (w) { | |
| 111 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, | |
| 112 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 113 w, h); | |
| 114 } | |
| 115 } | |
| 116 | 237 |
| 117 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 238 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 118 uint8_t *dst, ptrdiff_t dst_stride, | 239 // uint8_t *dst, ptrdiff_t dst_stride, |
| 119 const int16_t *filter_x, int x_step_q4, | 240 // const int16_t *filter_x, int x_step_q4, |
| 120 const int16_t *filter_y, int y_step_q4, | 241 // const int16_t *filter_y, int y_step_q4, |
| 121 int w, int h) { | 242 // int w, int h); |
| 122 if (x_step_q4 == 16 && filter_x[3] != 128) { | 243 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 123 while (w >= 16) { | 244 // uint8_t *dst, ptrdiff_t dst_stride, |
| 124 vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, | 245 // const int16_t *filter_x, int x_step_q4, |
| 125 dst, dst_stride, | 246 // const int16_t *filter_y, int y_step_q4, |
| 126 h, filter_x); | 247 // int w, int h); |
| 127 src += 16; | 248 FUN_CONV_2D(, ssse3); |
| 128 dst += 16; | 249 FUN_CONV_2D(avg_ , ssse3); |
| 129 w -= 16; | |
| 130 } | |
| 131 while (w >= 8) { | |
| 132 vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, | |
| 133 dst, dst_stride, | |
| 134 h, filter_x); | |
| 135 src += 8; | |
| 136 dst += 8; | |
| 137 w -= 8; | |
| 138 } | |
| 139 while (w >= 4) { | |
| 140 vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, | |
| 141 dst, dst_stride, | |
| 142 h, filter_x); | |
| 143 src += 4; | |
| 144 dst += 4; | |
| 145 w -= 4; | |
| 146 } | |
| 147 } | |
| 148 if (w) { | |
| 149 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | |
| 150 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 151 w, h); | |
| 152 } | |
| 153 } | |
| 154 | |
| 155 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, | |
| 156 uint8_t *dst, ptrdiff_t dst_stride, | |
| 157 const int16_t *filter_x, int x_step_q4, | |
| 158 const int16_t *filter_y, int y_step_q4, | |
| 159 int w, int h) { | |
| 160 if (y_step_q4 == 16 && filter_y[3] != 128) { | |
| 161 while (w >= 16) { | |
| 162 vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, | |
| 163 dst, dst_stride, | |
| 164 h, filter_y); | |
| 165 src += 16; | |
| 166 dst += 16; | |
| 167 w -= 16; | |
| 168 } | |
| 169 while (w >= 8) { | |
| 170 vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, | |
| 171 dst, dst_stride, | |
| 172 h, filter_y); | |
| 173 src += 8; | |
| 174 dst += 8; | |
| 175 w -= 8; | |
| 176 } | |
| 177 while (w >= 4) { | |
| 178 vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, | |
| 179 dst, dst_stride, | |
| 180 h, filter_y); | |
| 181 src += 4; | |
| 182 dst += 4; | |
| 183 w -= 4; | |
| 184 } | |
| 185 } | |
| 186 if (w) { | |
| 187 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | |
| 188 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 189 w, h); | |
| 190 } | |
| 191 } | |
| 192 | |
| 193 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, | |
| 194 uint8_t *dst, ptrdiff_t dst_stride, | |
| 195 const int16_t *filter_x, int x_step_q4, | |
| 196 const int16_t *filter_y, int y_step_q4, | |
| 197 int w, int h) { | |
| 198 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); | |
| 199 | |
| 200 assert(w <= 64); | |
| 201 assert(h <= 64); | |
| 202 if (x_step_q4 == 16 && y_step_q4 == 16) { | |
| 203 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, | |
| 204 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 205 w, h + 7); | |
| 206 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | |
| 207 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | |
| 208 } else { | |
| 209 vp9_convolve8_c(src, src_stride, dst, dst_stride, | |
| 210 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | |
| 211 } | |
| 212 } | |
| 213 | |
| 214 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | |
| 215 uint8_t *dst, ptrdiff_t dst_stride, | |
| 216 const int16_t *filter_x, int x_step_q4, | |
| 217 const int16_t *filter_y, int y_step_q4, | |
| 218 int w, int h) { | |
| 219 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); | |
| 220 | |
| 221 assert(w <= 64); | |
| 222 assert(h <= 64); | |
| 223 if (x_step_q4 == 16 && y_step_q4 == 16) { | |
| 224 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, | |
| 225 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 226 w, h + 7); | |
| 227 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, | |
| 228 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 229 w, h); | |
| 230 } else { | |
| 231 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, | |
| 232 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | |
| 233 } | |
| 234 } | |
| 235 #endif | 250 #endif |
| 236 | 251 |
| 237 #if HAVE_SSE2 | 252 #if HAVE_SSE2 |
| 238 filter8_1dfunction vp9_filter_block1d16_v8_sse2; | 253 filter8_1dfunction vp9_filter_block1d16_v8_sse2; |
| 239 filter8_1dfunction vp9_filter_block1d16_h8_sse2; | 254 filter8_1dfunction vp9_filter_block1d16_h8_sse2; |
| 240 filter8_1dfunction vp9_filter_block1d8_v8_sse2; | 255 filter8_1dfunction vp9_filter_block1d8_v8_sse2; |
| 241 filter8_1dfunction vp9_filter_block1d8_h8_sse2; | 256 filter8_1dfunction vp9_filter_block1d8_h8_sse2; |
| 242 filter8_1dfunction vp9_filter_block1d4_v8_sse2; | 257 filter8_1dfunction vp9_filter_block1d4_v8_sse2; |
| 243 filter8_1dfunction vp9_filter_block1d4_h8_sse2; | 258 filter8_1dfunction vp9_filter_block1d4_h8_sse2; |
| 244 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; | 259 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; |
| 245 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; | 260 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; |
| 246 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; | 261 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; |
| 247 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; | 262 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; |
| 248 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; | 263 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; |
| 249 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; | 264 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; |
| 250 | 265 |
| 251 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, | 266 filter8_1dfunction vp9_filter_block1d16_v2_sse2; |
| 252 uint8_t *dst, ptrdiff_t dst_stride, | 267 filter8_1dfunction vp9_filter_block1d16_h2_sse2; |
| 253 const int16_t *filter_x, int x_step_q4, | 268 filter8_1dfunction vp9_filter_block1d8_v2_sse2; |
| 254 const int16_t *filter_y, int y_step_q4, | 269 filter8_1dfunction vp9_filter_block1d8_h2_sse2; |
| 255 int w, int h) { | 270 filter8_1dfunction vp9_filter_block1d4_v2_sse2; |
| 256 /* Ensure the filter can be compressed to int16_t. */ | 271 filter8_1dfunction vp9_filter_block1d4_h2_sse2; |
| 257 if (x_step_q4 == 16 && filter_x[3] != 128) { | 272 filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; |
| 258 while (w >= 16) { | 273 filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; |
| 259 vp9_filter_block1d16_h8_sse2(src, src_stride, | 274 filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; |
| 260 dst, dst_stride, | 275 filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; |
| 261 h, filter_x); | 276 filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; |
| 262 src += 16; | 277 filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; |
| 263 dst += 16; | |
| 264 w -= 16; | |
| 265 } | |
| 266 while (w >= 8) { | |
| 267 vp9_filter_block1d8_h8_sse2(src, src_stride, | |
| 268 dst, dst_stride, | |
| 269 h, filter_x); | |
| 270 src += 8; | |
| 271 dst += 8; | |
| 272 w -= 8; | |
| 273 } | |
| 274 while (w >= 4) { | |
| 275 vp9_filter_block1d4_h8_sse2(src, src_stride, | |
| 276 dst, dst_stride, | |
| 277 h, filter_x); | |
| 278 src += 4; | |
| 279 dst += 4; | |
| 280 w -= 4; | |
| 281 } | |
| 282 } | |
| 283 if (w) { | |
| 284 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, | |
| 285 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 286 w, h); | |
| 287 } | |
| 288 } | |
| 289 | 278 |
| 290 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, | 279 // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 291 uint8_t *dst, ptrdiff_t dst_stride, | 280 // uint8_t *dst, ptrdiff_t dst_stride, |
| 292 const int16_t *filter_x, int x_step_q4, | 281 // const int16_t *filter_x, int x_step_q4, |
| 293 const int16_t *filter_y, int y_step_q4, | 282 // const int16_t *filter_y, int y_step_q4, |
| 294 int w, int h) { | 283 // int w, int h); |
| 295 if (y_step_q4 == 16 && filter_y[3] != 128) { | 284 // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 296 while (w >= 16) { | 285 // uint8_t *dst, ptrdiff_t dst_stride, |
| 297 vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, | 286 // const int16_t *filter_x, int x_step_q4, |
| 298 dst, dst_stride, | 287 // const int16_t *filter_y, int y_step_q4, |
| 299 h, filter_y); | 288 // int w, int h); |
| 300 src += 16; | 289 // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 301 dst += 16; | 290 // uint8_t *dst, ptrdiff_t dst_stride, |
| 302 w -= 16; | 291 // const int16_t *filter_x, int x_step_q4, |
| 303 } | 292 // const int16_t *filter_y, int y_step_q4, |
| 304 while (w >= 8) { | 293 // int w, int h); |
| 305 vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, | 294 // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 306 dst, dst_stride, | 295 // uint8_t *dst, ptrdiff_t dst_stride, |
| 307 h, filter_y); | 296 // const int16_t *filter_x, int x_step_q4, |
| 308 src += 8; | 297 // const int16_t *filter_y, int y_step_q4, |
| 309 dst += 8; | 298 // int w, int h); |
| 310 w -= 8; | 299 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); |
| 311 } | 300 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); |
| 312 while (w >= 4) { | 301 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); |
| 313 vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, | 302 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); |
| 314 dst, dst_stride, | |
| 315 h, filter_y); | |
| 316 src += 4; | |
| 317 dst += 4; | |
| 318 w -= 4; | |
| 319 } | |
| 320 } | |
| 321 if (w) { | |
| 322 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, | |
| 323 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 324 w, h); | |
| 325 } | |
| 326 } | |
| 327 | 303 |
| 328 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, | 304 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 329 uint8_t *dst, ptrdiff_t dst_stride, | 305 // uint8_t *dst, ptrdiff_t dst_stride, |
| 330 const int16_t *filter_x, int x_step_q4, | 306 // const int16_t *filter_x, int x_step_q4, |
| 331 const int16_t *filter_y, int y_step_q4, | 307 // const int16_t *filter_y, int y_step_q4, |
| 332 int w, int h) { | 308 // int w, int h); |
| 333 if (x_step_q4 == 16 && filter_x[3] != 128) { | 309 // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, |
| 334 while (w >= 16) { | 310 // uint8_t *dst, ptrdiff_t dst_stride, |
| 335 vp9_filter_block1d16_h8_avg_sse2(src, src_stride, | 311 // const int16_t *filter_x, int x_step_q4, |
| 336 dst, dst_stride, | 312 // const int16_t *filter_y, int y_step_q4, |
| 337 h, filter_x); | 313 // int w, int h); |
| 338 src += 16; | 314 FUN_CONV_2D(, sse2); |
| 339 dst += 16; | 315 FUN_CONV_2D(avg_ , sse2); |
| 340 w -= 16; | |
| 341 } | |
| 342 while (w >= 8) { | |
| 343 vp9_filter_block1d8_h8_avg_sse2(src, src_stride, | |
| 344 dst, dst_stride, | |
| 345 h, filter_x); | |
| 346 src += 8; | |
| 347 dst += 8; | |
| 348 w -= 8; | |
| 349 } | |
| 350 while (w >= 4) { | |
| 351 vp9_filter_block1d4_h8_avg_sse2(src, src_stride, | |
| 352 dst, dst_stride, | |
| 353 h, filter_x); | |
| 354 src += 4; | |
| 355 dst += 4; | |
| 356 w -= 4; | |
| 357 } | |
| 358 } | |
| 359 if (w) { | |
| 360 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, | |
| 361 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 362 w, h); | |
| 363 } | |
| 364 } | |
| 365 | |
| 366 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, | |
| 367 uint8_t *dst, ptrdiff_t dst_stride, | |
| 368 const int16_t *filter_x, int x_step_q4, | |
| 369 const int16_t *filter_y, int y_step_q4, | |
| 370 int w, int h) { | |
| 371 if (y_step_q4 == 16 && filter_y[3] != 128) { | |
| 372 while (w >= 16) { | |
| 373 vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, | |
| 374 dst, dst_stride, | |
| 375 h, filter_y); | |
| 376 src += 16; | |
| 377 dst += 16; | |
| 378 w -= 16; | |
| 379 } | |
| 380 while (w >= 8) { | |
| 381 vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, | |
| 382 dst, dst_stride, | |
| 383 h, filter_y); | |
| 384 src += 8; | |
| 385 dst += 8; | |
| 386 w -= 8; | |
| 387 } | |
| 388 while (w >= 4) { | |
| 389 vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, | |
| 390 dst, dst_stride, | |
| 391 h, filter_y); | |
| 392 src += 4; | |
| 393 dst += 4; | |
| 394 w -= 4; | |
| 395 } | |
| 396 } | |
| 397 if (w) { | |
| 398 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, | |
| 399 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 400 w, h); | |
| 401 } | |
| 402 } | |
| 403 | |
| 404 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, | |
| 405 uint8_t *dst, ptrdiff_t dst_stride, | |
| 406 const int16_t *filter_x, int x_step_q4, | |
| 407 const int16_t *filter_y, int y_step_q4, | |
| 408 int w, int h) { | |
| 409 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); | |
| 410 | |
| 411 assert(w <= 64); | |
| 412 assert(h <= 64); | |
| 413 if (x_step_q4 == 16 && y_step_q4 == 16) { | |
| 414 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, | |
| 415 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 416 w, h + 7); | |
| 417 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, | |
| 418 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | |
| 419 } else { | |
| 420 vp9_convolve8_c(src, src_stride, dst, dst_stride, | |
| 421 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | |
| 422 } | |
| 423 } | |
| 424 | |
| 425 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, | |
| 426 uint8_t *dst, ptrdiff_t dst_stride, | |
| 427 const int16_t *filter_x, int x_step_q4, | |
| 428 const int16_t *filter_y, int y_step_q4, | |
| 429 int w, int h) { | |
| 430 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); | |
| 431 | |
| 432 assert(w <= 64); | |
| 433 assert(h <= 64); | |
| 434 if (x_step_q4 == 16 && y_step_q4 == 16) { | |
| 435 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, | |
| 436 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 437 w, h + 7); | |
| 438 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, | |
| 439 filter_x, x_step_q4, filter_y, y_step_q4, | |
| 440 w, h); | |
| 441 } else { | |
| 442 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, | |
| 443 filter_x, x_step_q4, filter_y, y_step_q4, w, h); | |
| 444 } | |
| 445 } | |
| 446 #endif | 316 #endif |
| OLD | NEW |