OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
11 // Due to a header conflict between math.h and intrinsics includes with ceil() | 11 // Due to a header conflict between math.h and intrinsics includes with ceil() |
12 // in certain configurations under vs9, this include needs to precede | 12 // in certain configurations under vs9, this include needs to precede |
13 // tmmintrin.h. | 13 // tmmintrin.h. |
14 | 14 |
15 #include <tmmintrin.h> | 15 #include <tmmintrin.h> |
16 | 16 |
17 #include "./vpx_dsp_rtcd.h" | 17 #include "./vpx_dsp_rtcd.h" |
| 18 #include "vpx_dsp/vpx_filter.h" |
18 #include "vpx_dsp/x86/convolve.h" | 19 #include "vpx_dsp/x86/convolve.h" |
| 20 #include "vpx_mem/vpx_mem.h" |
19 #include "vpx_ports/mem.h" | 21 #include "vpx_ports/mem.h" |
20 #include "vpx_ports/emmintrin_compat.h" | 22 #include "vpx_ports/emmintrin_compat.h" |
21 | 23 |
22 // filters only for the 4_h8 convolution | 24 // filters only for the 4_h8 convolution |
23 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { | 25 DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { |
24 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 | 26 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 |
25 }; | 27 }; |
26 | 28 |
27 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { | 29 DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { |
28 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 | 30 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 |
(...skipping 165 matching lines...)
194 | 196 |
195 src_ptr+=src_pixels_per_line; | 197 src_ptr+=src_pixels_per_line; |
196 | 198 |
197 // save only 8 bytes | 199 // save only 8 bytes |
198 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 200 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
199 | 201 |
200 output_ptr+=output_pitch; | 202 output_ptr+=output_pitch; |
201 } | 203 } |
202 } | 204 } |
203 | 205 |
| 206 #if ARCH_X86_64 |
204 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, | 207 static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, |
205 ptrdiff_t src_pixels_per_line, | 208 ptrdiff_t src_pixels_per_line, |
206 uint8_t *output_ptr, | 209 uint8_t *output_ptr, |
207 ptrdiff_t output_pitch, | 210 ptrdiff_t output_pitch, |
208 uint32_t output_height, | 211 uint32_t output_height, |
209 const int16_t *filter) { | 212 const int16_t *filter) { |
210 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; | 213 __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; |
211 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; | 214 __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; |
212 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 215 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
213 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; | 216 __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; |
(...skipping 96 matching lines...)
310 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); | 313 srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); |
311 | 314 |
312 src_ptr+=src_pixels_per_line; | 315 src_ptr+=src_pixels_per_line; |
313 | 316 |
314 // save 16 bytes | 317 // save 16 bytes |
315 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); | 318 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); |
316 | 319 |
317 output_ptr+=output_pitch; | 320 output_ptr+=output_pitch; |
318 } | 321 } |
319 } | 322 } |
| 323 #endif // ARCH_X86_64 |
320 | 324 |
321 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, | 325 void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, |
322 ptrdiff_t src_pitch, | 326 ptrdiff_t src_pitch, |
323 uint8_t *output_ptr, | 327 uint8_t *output_ptr, |
324 ptrdiff_t out_pitch, | 328 ptrdiff_t out_pitch, |
325 uint32_t output_height, | 329 uint32_t output_height, |
326 const int16_t *filter) { | 330 const int16_t *filter) { |
327 __m128i addFilterReg64, filtersReg, minReg; | 331 __m128i addFilterReg64, filtersReg, minReg; |
328 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 332 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
329 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; | 333 __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; |
(...skipping 69 matching lines...)
399 srcReg6 = srcReg7; | 403 srcReg6 = srcReg7; |
400 srcReg7 = srcReg8; | 404 srcReg7 = srcReg8; |
401 | 405 |
402 // save only 8 bytes convolve result | 406 // save only 8 bytes convolve result |
403 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); | 407 _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); |
404 | 408 |
405 output_ptr+=out_pitch; | 409 output_ptr+=out_pitch; |
406 } | 410 } |
407 } | 411 } |
408 | 412 |
| 413 #if ARCH_X86_64 |
409 static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, | 414 static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, |
410 ptrdiff_t src_pitch, | 415 ptrdiff_t src_pitch, |
411 uint8_t *output_ptr, | 416 uint8_t *output_ptr, |
412 ptrdiff_t out_pitch, | 417 ptrdiff_t out_pitch, |
413 uint32_t output_height, | 418 uint32_t output_height, |
414 const int16_t *filter) { | 419 const int16_t *filter) { |
415 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; | 420 __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; |
416 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; | 421 __m128i firstFilters, secondFilters, thirdFilters, forthFilters; |
417 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; | 422 __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; |
418 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; | 423 __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; |
(...skipping 94 matching lines...)
513 srcReg5 = srcReg6; | 518 srcReg5 = srcReg6; |
514 srcReg6 = srcReg7; | 519 srcReg6 = srcReg7; |
515 srcReg7 = srcReg8; | 520 srcReg7 = srcReg8; |
516 | 521 |
517 // save 16 bytes convolve result | 522 // save 16 bytes convolve result |
518 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); | 523 _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); |
519 | 524 |
520 output_ptr+=out_pitch; | 525 output_ptr+=out_pitch; |
521 } | 526 } |
522 } | 527 } |
| 528 #endif // ARCH_X86_64 |
523 | 529 |
524 #if ARCH_X86_64 | 530 #if ARCH_X86_64 |
525 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; | 531 filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; |
526 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; | 532 filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; |
527 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; | 533 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; |
528 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; | 534 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; |
529 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; | 535 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; |
530 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; | 536 filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; |
531 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 | 537 #define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 |
532 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 | 538 #define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 |
(...skipping 47 matching lines...)
580 // uint8_t *dst, ptrdiff_t dst_stride, | 586 // uint8_t *dst, ptrdiff_t dst_stride, |
581 // const int16_t *filter_x, int x_step_q4, | 587 // const int16_t *filter_x, int x_step_q4, |
582 // const int16_t *filter_y, int y_step_q4, | 588 // const int16_t *filter_y, int y_step_q4, |
583 // int w, int h); | 589 // int w, int h); |
584 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); | 590 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); |
585 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); | 591 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); |
586 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); | 592 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); |
587 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, | 593 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, |
588 ssse3); | 594 ssse3); |
589 | 595 |
590 // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 596 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
| 597 out0, out1, out2, out3, out4, out5, out6, out7) { \ |
| 598 const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ |
| 599 const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ |
| 600 const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ |
| 601 const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ |
| 602 \ |
| 603 const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ |
| 604 const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ |
| 605 const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ |
| 606 const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ |
| 607 \ |
| 608 const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ |
| 609 const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ |
| 610 const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ |
| 611 const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ |
| 612 \ |
| 613 out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ |
| 614 out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ |
| 615 out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ |
| 616 out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ |
| 617 out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ |
| 618 out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ |
| 619 out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ |
| 620 out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ |
| 621 } |
| 622 |
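Note on the macro above: it transposes the low 8x8 bytes of its inputs with three rounds of unpacks (8-bit, then 16-bit, then 32-bit) followed by 64-bit lane selects; each out register ends up with its transposed row duplicated in both halves, of which callers store only the low 8 bytes. A scalar reference of the same result, as a sketch (transpose8x8_c is a hypothetical helper, not part of this change):

/* Scalar reference for TRANSPOSE_8X8: output row r is input column r. */
static void transpose8x8_c(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}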
| 623 static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, |
| 624 uint8_t *dst, const int16_t *x_filter) { |
| 625 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 626 const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); |
| 627 // pack and duplicate the filter values |
| 628 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 629 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 630 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 631 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 632 const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); |
| 633 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); |
| 634 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); |
| 635 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); |
| 636 const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); |
| 637 const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); |
| 638 const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); |
| 639 const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); |
| 640 // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 |
| 641 const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); |
| 642 // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 |
| 643 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); |
| 644 // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 |
| 645 const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); |
| 646 // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 |
| 647 const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); |
| 648 // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 |
| 649 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 650 // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 |
| 651 const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 652 // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 |
| 653 const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
| 654 // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 |
| 655 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
| 656 // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 |
| 657 const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); |
| 658 const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); |
| 659 const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); |
| 660 const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); |
| 661 // multiply 2 adjacent elements with the filter and add the result |
| 662 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 663 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 664 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 665 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 666 // add and saturate the results together |
| 667 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 668 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 669 __m128i temp = _mm_adds_epi16(x0, x3); |
| 670 temp = _mm_adds_epi16(temp, min_x2x1); |
| 671 temp = _mm_adds_epi16(temp, max_x2x1); |
| 672 // round and shift each 16-bit value right by 7 bits |
| 673 temp = _mm_mulhrs_epi16(temp, k_256); |
| 674 // pack each 16-bit value to 8 bits with unsigned saturation |
| 675 temp = _mm_packus_epi16(temp, temp); |
| 676 // save only 8 bytes convolve result |
| 677 _mm_storel_epi64((__m128i*)dst, temp); |
| 678 } |
| 679 |
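Worth spelling out the arithmetic in the kernel above: the filter words are packed to signed bytes and broadcast in pairs (f1f0 carries taps 1 and 0, f3f2 taps 3 and 2, and so on), _mm_maddubs_epi16 multiplies unsigned source bytes by those signed filter bytes and adds adjacent products, and the four partial sums are combined with saturating adds, folding in min(x1, x2) before max(x1, x2) so intermediate 16-bit overflow saturates consistently with the final clamp. The rounding step _mm_mulhrs_epi16(temp, 256) computes (temp * 256 * 2 + (1 << 15)) >> 16, which equals (temp + 64) >> 7. A scalar sketch of one output pixel (convolve8_pixel_c is hypothetical; it assumes the taps fit in int8_t, which holds for the vpx sub-pixel filter banks):

/* One output pixel: 8-tap sum, round by 64, shift right by 7, clamp to
   [0, 255]. Mirrors the SIMD arithmetic in the function above. */
static uint8_t convolve8_pixel_c(const uint8_t *s, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += s[k] * filter[k];
  sum = (sum + 64) >> 7;  /* matches _mm_mulhrs_epi16(temp, 256) */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}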
| 680 static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, |
| 681 uint8_t *dst, ptrdiff_t dst_stride) { |
| 682 __m128i A, B, C, D, E, F, G, H; |
| 683 |
| 684 A = _mm_loadl_epi64((const __m128i *)src); |
| 685 B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); |
| 686 C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); |
| 687 D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); |
| 688 E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); |
| 689 F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); |
| 690 G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); |
| 691 H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); |
| 692 |
| 693 TRANSPOSE_8X8(A, B, C, D, E, F, G, H, |
| 694 A, B, C, D, E, F, G, H); |
| 695 |
| 696 _mm_storel_epi64((__m128i*)dst, A); |
| 697 _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), B); |
| 698 _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), C); |
| 699 _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), D); |
| 700 _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), E); |
| 701 _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), F); |
| 702 _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), G); |
| 703 _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), H); |
| 704 } |
| 705 |
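A double transpose with matching strides should reproduce the original tile, which gives a quick sanity check for the helper above (a hypothetical test sketch, not part of this change):

/* Round-trip check: transpose an 8x8 tile twice and compare. Returns 1 on
   success. Relies only on transpose8x8_to_dst defined above. */
static int transpose8x8_roundtrip(void) {
  uint8_t a[64], b[64], c[64];
  int i;
  for (i = 0; i < 64; ++i) a[i] = (uint8_t)i;
  transpose8x8_to_dst(a, 8, b, 8);
  transpose8x8_to_dst(b, 8, c, 8);
  for (i = 0; i < 64; ++i)
    if (c[i] != a[i]) return 0;
  return 1;
}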
| 706 static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, |
| 707 uint8_t *dst, ptrdiff_t dst_stride, |
| 708 const InterpKernel *x_filters, |
| 709 int x0_q4, int x_step_q4, int w, int h) { |
| 710 DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); |
| 711 int x, y, z; |
| 712 src -= SUBPEL_TAPS / 2 - 1; |
| 713 |
| 714 // This function processes 8x8 areas. The intermediate height is not always |
| 715 // a multiple of 8, so pad it to the next multiple of 8 (adds 1 to 8 rows). |
| 716 y = h + (8 - (h & 0x7)); |
| 717 |
| 718 do { |
| 719 int x_q4 = x0_q4; |
| 720 for (x = 0; x < w; x += 8) { |
| 721 // process 8 src_x steps |
| 722 for (z = 0; z < 8; ++z) { |
| 723 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
| 724 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
| 725 if (x_q4 & SUBPEL_MASK) { |
| 726 filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); |
| 727 } else { |
| 728 int i; |
| 729 for (i = 0; i < 8; ++i) { |
| 730 temp[z * 8 + i] = src_x[i * src_stride + 3]; |
| 731 } |
| 732 } |
| 733 x_q4 += x_step_q4; |
| 734 } |
| 735 |
| 736 // transpose the 8x8 filter values back to dst |
| 737 transpose8x8_to_dst(temp, 8, dst + x, dst_stride); |
| 738 } |
| 739 |
| 740 src += src_stride * 8; |
| 741 dst += dst_stride * 8; |
| 742 } while (y -= 8); |
| 743 } |
| 744 |
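For orientation, the loop above walks the source in 1/16th-pel fixed point: x_q4 >> SUBPEL_BITS picks the source column, x_q4 & SUBPEL_MASK picks one of 16 kernels, and phase 0 needs no filtering, so the copy branch reads the center tap at offset 3 (SUBPEL_TAPS / 2 - 1, matching the earlier src shift). A scalar sketch of the whole pass, modeled on the generic C convolve path (scaled_horiz_c is hypothetical; it assumes clip_pixel() and ROUND_POWER_OF_TWO() from vpx_dsp/vpx_dsp_common.h, which this file does not include):

static void scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *x_filters,
                           int x0_q4, int x_step_q4, int w, int h) {
  int x, y, k;
  src -= SUBPEL_TAPS / 2 - 1;  /* point at the first tap */
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}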
| 745 static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 746 uint8_t *dst, const int16_t *filter) { |
| 747 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 748 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 749 // pack and duplicate the filter values |
| 750 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 751 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 752 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 753 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 754 const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); |
| 755 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
| 756 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
| 757 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
| 758 // TRANSPOSE... |
| 759 // 00 01 02 03 04 05 06 07 |
| 760 // 10 11 12 13 14 15 16 17 |
| 761 // 20 21 22 23 24 25 26 27 |
| 762 // 30 31 32 33 34 35 36 37 |
| 763 // |
| 764 // TO |
| 765 // |
| 766 // 00 10 20 30 |
| 767 // 01 11 21 31 |
| 768 // 02 12 22 32 |
| 769 // 03 13 23 33 |
| 770 // 04 14 24 34 |
| 771 // 05 15 25 35 |
| 772 // 06 16 26 36 |
| 773 // 07 17 27 37 |
| 774 // |
| 775 // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 |
| 776 const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); |
| 777 // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 |
| 778 const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); |
| 779 // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 |
| 780 const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
| 781 // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 |
| 782 const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
| 783 // 02 03 12 13 22 23 32 33 |
| 784 const __m128i s3s2 = _mm_srli_si128(s1s0, 8); |
| 785 // 06 07 16 17 26 27 36 37 |
| 786 const __m128i s7s6 = _mm_srli_si128(s5s4, 8); |
| 787 // multiply 2 adjacent elements with the filter and add the result |
| 788 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 789 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 790 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 791 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 792 // add and saturate the results together |
| 793 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 794 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 795 __m128i temp = _mm_adds_epi16(x0, x3); |
| 796 temp = _mm_adds_epi16(temp, min_x2x1); |
| 797 temp = _mm_adds_epi16(temp, max_x2x1); |
| 798 // round and shift each 16-bit value right by 7 bits |
| 799 temp = _mm_mulhrs_epi16(temp, k_256); |
| 800 // pack each 16-bit value to 8 bits with unsigned saturation |
| 801 temp = _mm_packus_epi16(temp, temp); |
| 802 // save only 4 bytes |
| 803 *(int *)dst = _mm_cvtsi128_si32(temp); |
| 804 } |
| 805 |
| 806 static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, |
| 807 uint8_t *dst, ptrdiff_t dst_stride) { |
| 808 __m128i A = _mm_cvtsi32_si128(*(const int *)src); |
| 809 __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); |
| 810 __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); |
| 811 __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); |
| 812 // 00 10 01 11 02 12 03 13 |
| 813 const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); |
| 814 // 20 30 21 31 22 32 23 33 |
| 815 const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); |
| 816 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
| 817 A = _mm_unpacklo_epi16(tr0_0, tr0_1); |
| 818 B = _mm_srli_si128(A, 4); |
| 819 C = _mm_srli_si128(A, 8); |
| 820 D = _mm_srli_si128(A, 12); |
| 821 |
| 822 *(int *)(dst) = _mm_cvtsi128_si32(A); |
| 823 *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); |
| 824 *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); |
| 825 *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); |
| 826 } |
| 827 |
| 828 static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, |
| 829 uint8_t *dst, ptrdiff_t dst_stride, |
| 830 const InterpKernel *x_filters, |
| 831 int x0_q4, int x_step_q4, int w, int h) { |
| 832 DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); |
| 833 int x, y, z; |
| 834 src -= SUBPEL_TAPS / 2 - 1; |
| 835 |
| 836 for (y = 0; y < h; y += 4) { |
| 837 int x_q4 = x0_q4; |
| 838 for (x = 0; x < w; x += 4) { |
| 839 // process 4 src_x steps |
| 840 for (z = 0; z < 4; ++z) { |
| 841 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; |
| 842 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; |
| 843 if (x_q4 & SUBPEL_MASK) { |
| 844 filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); |
| 845 } else { |
| 846 int i; |
| 847 for (i = 0; i < 4; ++i) { |
| 848 temp[z * 4 + i] = src_x[i * src_stride + 3]; |
| 849 } |
| 850 } |
| 851 x_q4 += x_step_q4; |
| 852 } |
| 853 |
| 854 // transpose the 4x4 filter values back to dst |
| 855 transpose4x4_to_dst(temp, 4, dst + x, dst_stride); |
| 856 } |
| 857 |
| 858 src += src_stride * 4; |
| 859 dst += dst_stride * 4; |
| 860 } |
| 861 } |
| 862 |
| 863 static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 864 uint8_t *dst, const int16_t *filter) { |
| 865 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 866 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 867 // pack and duplicate the filter values |
| 868 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 869 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 870 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 871 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 872 const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); |
| 873 const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); |
| 874 const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); |
| 875 const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); |
| 876 const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); |
| 877 const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); |
| 878 const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); |
| 879 const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); |
| 880 const __m128i s1s0 = _mm_unpacklo_epi8(A, B); |
| 881 const __m128i s3s2 = _mm_unpacklo_epi8(C, D); |
| 882 const __m128i s5s4 = _mm_unpacklo_epi8(E, F); |
| 883 const __m128i s7s6 = _mm_unpacklo_epi8(G, H); |
| 884 // multiply 2 adjacent elements with the filter and add the result |
| 885 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 886 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 887 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 888 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 889 // add and saturate the results together |
| 890 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 891 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 892 __m128i temp = _mm_adds_epi16(x0, x3); |
| 893 temp = _mm_adds_epi16(temp, min_x2x1); |
| 894 temp = _mm_adds_epi16(temp, max_x2x1); |
| 895 // round and shift each 16-bit value right by 7 bits |
| 896 temp = _mm_mulhrs_epi16(temp, k_256); |
| 897 // pack each 16-bit value to 8 bits with unsigned saturation |
| 898 temp = _mm_packus_epi16(temp, temp); |
| 899 // save only 4 bytes |
| 900 *(int *)dst = _mm_cvtsi128_si32(temp); |
| 901 } |
| 902 |
| 903 static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, |
| 904 uint8_t *dst, ptrdiff_t dst_stride, |
| 905 const InterpKernel *y_filters, |
| 906 int y0_q4, int y_step_q4, int w, int h) { |
| 907 int y; |
| 908 int y_q4 = y0_q4; |
| 909 |
| 910 src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
| 911 for (y = 0; y < h; ++y) { |
| 912 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
| 913 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
| 914 |
| 915 if (y_q4 & SUBPEL_MASK) { |
| 916 filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); |
| 917 } else { |
| 918 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); |
| 919 } |
| 920 |
| 921 y_q4 += y_step_q4; |
| 922 } |
| 923 } |
| 924 |
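The memcpy branch above relies on phase-0 kernels being a unit impulse: in the vpx filter banks, kernel 0 is {0, 0, 0, 128, 0, 0, 0, 0} (128 == 1 << FILTER_BITS), so filtering degenerates to copying the row at offset 3, the same SUBPEL_TAPS / 2 - 1 shift applied to src. A sketch of that assumption as a check (is_identity_kernel is hypothetical, not part of this change):

/* Phase-0 kernels are assumed to be a unit tap at index 3; a filter bank
   violating this would make the memcpy fast path above incorrect. */
static int is_identity_kernel(const int16_t *k) {
  return k[0] == 0 && k[1] == 0 && k[2] == 0 && k[3] == (1 << FILTER_BITS) &&
         k[4] == 0 && k[5] == 0 && k[6] == 0 && k[7] == 0;
}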
| 925 static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 926 uint8_t *dst, const int16_t *filter) { |
| 927 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 928 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 929 // pack and duplicate the filter values |
| 930 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 931 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 932 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 933 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 934 const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); |
| 935 const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); |
| 936 const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); |
| 937 const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); |
| 938 const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); |
| 939 const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); |
| 940 const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); |
| 941 const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); |
| 942 const __m128i s1s0 = _mm_unpacklo_epi8(A, B); |
| 943 const __m128i s3s2 = _mm_unpacklo_epi8(C, D); |
| 944 const __m128i s5s4 = _mm_unpacklo_epi8(E, F); |
| 945 const __m128i s7s6 = _mm_unpacklo_epi8(G, H); |
| 946 // multiply 2 adjacent elements with the filter and add the result |
| 947 const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); |
| 948 const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); |
| 949 const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); |
| 950 const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); |
| 951 // add and saturate the results together |
| 952 const __m128i min_x2x1 = _mm_min_epi16(x2, x1); |
| 953 const __m128i max_x2x1 = _mm_max_epi16(x2, x1); |
| 954 __m128i temp = _mm_adds_epi16(x0, x3); |
| 955 temp = _mm_adds_epi16(temp, min_x2x1); |
| 956 temp = _mm_adds_epi16(temp, max_x2x1); |
| 957 // round and shift each 16-bit value right by 7 bits |
| 958 temp = _mm_mulhrs_epi16(temp, k_256); |
| 959 // pack each 16-bit value to 8 bits with unsigned saturation |
| 960 temp = _mm_packus_epi16(temp, temp); |
| 961 // save only 8 bytes convolve result |
| 962 _mm_storel_epi64((__m128i*)dst, temp); |
| 963 } |
| 964 |
| 965 static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, |
| 966 uint8_t *dst, ptrdiff_t dst_stride, |
| 967 const InterpKernel *y_filters, |
| 968 int y0_q4, int y_step_q4, int w, int h) { |
| 969 int y; |
| 970 int y_q4 = y0_q4; |
| 971 |
| 972 src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
| 973 for (y = 0; y < h; ++y) { |
| 974 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
| 975 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
| 976 if (y_q4 & SUBPEL_MASK) { |
| 977 filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); |
| 978 } else { |
| 979 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); |
| 980 } |
| 981 y_q4 += y_step_q4; |
| 982 } |
| 983 } |
| 984 |
| 985 static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
| 986 uint8_t *dst, const int16_t *filter, int w) { |
| 987 const __m128i k_256 = _mm_set1_epi16(1 << 8); |
| 988 const __m128i f_values = _mm_load_si128((const __m128i *)filter); |
| 989 // pack and duplicate the filter values |
| 990 const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); |
| 991 const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); |
| 992 const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); |
| 993 const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); |
| 994 int i; |
| 995 |
| 996 for (i = 0; i < w; i += 16) { |
| 997 const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); |
| 998 const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); |
| 999 const __m128i C = |
| 1000 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); |
| 1001 const __m128i D = |
| 1002 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); |
| 1003 const __m128i E = |
| 1004 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); |
| 1005 const __m128i F = |
| 1006 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); |
| 1007 const __m128i G = |
| 1008 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); |
| 1009 const __m128i H = |
| 1010 _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); |
| 1011 // merge the results together |
| 1012 const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); |
| 1013 const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); |
| 1014 const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); |
| 1015 const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); |
| 1016 // multiply 2 adjacent elements with the filter and add the result |
| 1017 const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); |
| 1018 const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); |
| 1019 const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); |
| 1020 const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); |
| 1021 // add and saturate the results together |
| 1022 const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); |
| 1023 const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); |
| 1024 // merge the results together |
| 1025 const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); |
| 1026 const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); |
| 1027 // multiply 2 adjacent elements with the filter and add the result |
| 1028 const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); |
| 1029 const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); |
| 1030 // merge the results together |
| 1031 const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); |
| 1032 const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); |
| 1033 // multiply 2 adjacent elements with the filter and add the result |
| 1034 const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); |
| 1035 const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); |
| 1036 // add and saturate the results together |
| 1037 __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); |
| 1038 __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); |
| 1039 |
| 1040 // add and saturate the results together |
| 1041 temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); |
| 1042 temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); |
| 1043 // round and shift each 16-bit value right by 7 bits |
| 1044 temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); |
| 1045 temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); |
| 1046 // pack each 16-bit value to 8 bits with unsigned saturation; the low |
| 1047 // half holds the first convolve result and the high half holds the |
| 1048 // second |
| 1049 temp_hi = _mm_packus_epi16(temp_lo, temp_hi); |
| 1050 src_ptr += 16; |
| 1051 // save 16 bytes convolve result |
| 1052 _mm_store_si128((__m128i*)&dst[i], temp_hi); |
| 1053 } |
| 1054 } |
| 1055 |
| 1056 static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, |
| 1057 uint8_t *dst, ptrdiff_t dst_stride, |
| 1058 const InterpKernel *y_filters, |
| 1059 int y0_q4, int y_step_q4, int w, int h) { |
| 1060 int y; |
| 1061 int y_q4 = y0_q4; |
| 1062 |
| 1063 src -= src_stride * (SUBPEL_TAPS / 2 - 1); |
| 1064 for (y = 0; y < h; ++y) { |
| 1065 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; |
| 1066 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; |
| 1067 if (y_q4 & SUBPEL_MASK) { |
| 1068 filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, |
| 1069 w); |
| 1070 } else { |
| 1071 memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); |
| 1072 } |
| 1073 y_q4 += y_step_q4; |
| 1074 } |
| 1075 } |
| 1076 |
| 1077 static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, |
| 1078 uint8_t *dst, ptrdiff_t dst_stride, |
| 1079 const InterpKernel *const x_filters, |
| 1080 int x0_q4, int x_step_q4, |
| 1081 const InterpKernel *const y_filters, |
| 1082 int y0_q4, int y_step_q4, |
| 1083 int w, int h) { |
| 1084 // Note: Fixed size intermediate buffer, temp, places limits on parameters. |
| 1085 // 2d filtering proceeds in 2 steps: |
| 1086 // (1) Interpolate horizontally into an intermediate buffer, temp. |
| 1087 // (2) Interpolate temp vertically to derive the sub-pixel result. |
| 1088 // Deriving the maximum number of rows in the temp buffer (135): |
| 1089 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). |
| 1090 // --Largest block size is 64x64 pixels. |
| 1091 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the |
| 1092 // original frame (in 1/16th pixel units). |
| 1093 // --Must round-up because block may be located at sub-pixel position. |
| 1094 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. |
| 1095 // --((64 - 1) * 32 + 15) >> 4 + 8 = 134, rounded up to 135. |
| 1096 // --Require an additional 8 rows for the horiz_w8 transpose tail. |
| 1097 DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); |
| 1098 const int intermediate_height = |
| 1099 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; |
| 1100 |
| 1101 assert(w <= 64); |
| 1102 assert(h <= 64); |
| 1103 assert(y_step_q4 <= 32); |
| 1104 assert(x_step_q4 <= 32); |
| 1105 |
| 1106 if (w >= 8) { |
| 1107 scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), |
| 1108 src_stride, temp, 64, x_filters, x0_q4, x_step_q4, |
| 1109 w, intermediate_height); |
| 1110 } else { |
| 1111 scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), |
| 1112 src_stride, temp, 64, x_filters, x0_q4, x_step_q4, |
| 1113 w, intermediate_height); |
| 1114 } |
| 1115 |
| 1116 if (w >= 16) { |
| 1117 scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, |
| 1118 dst_stride, y_filters, y0_q4, y_step_q4, w, h); |
| 1119 } else if (w == 8) { |
| 1120 scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, |
| 1121 dst_stride, y_filters, y0_q4, y_step_q4, w, h); |
| 1122 } else { |
| 1123 scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, |
| 1124 dst_stride, y_filters, y0_q4, y_step_q4, w, h); |
| 1125 } |
| 1126 } |
| 1127 |
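As a worked check of the bound derived in the comment above, the worst case (h = 64, y_step_q4 = 32, y0_q4 = 15) evaluates to 134 rows, inside the 135 + 8 rows reserved for temp. A standalone sketch of that arithmetic (check_temp_bound is hypothetical):

#include <assert.h>

/* Worst-case intermediate height: ((64 - 1) * 32 + 15) >> 4 is 126, plus
   SUBPEL_TAPS (8) gives 134, which fits the (135 + 8)-row temp buffer. */
static void check_temp_bound(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15;
  const int rows = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
  assert(rows <= 135 + 8);
}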
| 1128 static const InterpKernel *get_filter_base(const int16_t *filter) { |
| 1129 // NOTE: This assumes that the filter table is 256-byte aligned. |
| 1130 // TODO(agrange) Modify to make independent of table alignment. |
| 1131 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); |
| 1132 } |
| 1133 |
| 1134 static int get_filter_offset(const int16_t *f, const InterpKernel *base) { |
| 1135 return (int)((const InterpKernel *)(intptr_t)f - base); |
| 1136 } |
| 1137 |
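get_filter_base() masks off the low 8 address bits, so it only works if each bank of 16 kernels (16 * 8 * sizeof(int16_t) == 256 bytes) is 256-byte aligned, as the NOTE above says. A sketch of the recovery under that assumption (example_bank and filter_base_demo are hypothetical):

/* A 256-byte-aligned bank of 16 kernels, zero-filled for brevity. */
DECLARE_ALIGNED(256, static const InterpKernel, example_bank[16]) = { { 0 } };

static void filter_base_demo(void) {
  const int16_t *f = example_bank[5];             /* kernel for phase 5 */
  const InterpKernel *base = get_filter_base(f);  /* recovers example_bank */
  const int phase = get_filter_offset(f, base);   /* recovers index 5 */
  (void)phase;
}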
| 1138 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
| 1139 uint8_t *dst, ptrdiff_t dst_stride, |
| 1140 const int16_t *filter_x, int x_step_q4, |
| 1141 const int16_t *filter_y, int y_step_q4, |
| 1142 int w, int h) { |
| 1143 const InterpKernel *const filters_x = get_filter_base(filter_x); |
| 1144 const int x0_q4 = get_filter_offset(filter_x, filters_x); |
| 1145 |
| 1146 const InterpKernel *const filters_y = get_filter_base(filter_y); |
| 1147 const int y0_q4 = get_filter_offset(filter_y, filters_y); |
| 1148 |
| 1149 scaledconvolve2d(src, src_stride, dst, dst_stride, |
| 1150 filters_x, x0_q4, x_step_q4, |
| 1151 filters_y, y0_q4, y_step_q4, w, h); |
| 1152 } |
| 1153 |
| 1154 // void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
591 // uint8_t *dst, ptrdiff_t dst_stride, | 1155 // uint8_t *dst, ptrdiff_t dst_stride, |
592 // const int16_t *filter_x, int x_step_q4, | 1156 // const int16_t *filter_x, int x_step_q4, |
593 // const int16_t *filter_y, int y_step_q4, | 1157 // const int16_t *filter_y, int y_step_q4, |
594 // int w, int h); | 1158 // int w, int h); |
595 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, | 1159 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
596 // uint8_t *dst, ptrdiff_t dst_stride, | 1160 // uint8_t *dst, ptrdiff_t dst_stride, |
597 // const int16_t *filter_x, int x_step_q4, | 1161 // const int16_t *filter_x, int x_step_q4, |
598 // const int16_t *filter_y, int y_step_q4, | 1162 // const int16_t *filter_y, int y_step_q4, |
599 // int w, int h); | 1163 // int w, int h); |
600 FUN_CONV_2D(, ssse3); | 1164 FUN_CONV_2D(, ssse3); |
601 FUN_CONV_2D(avg_ , ssse3); | 1165 FUN_CONV_2D(avg_ , ssse3); |