| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 %include "third_party/x86inc/x86inc.asm" | |
| 12 | |
| 13 SECTION .text | |
| 14 | |
| 15 ; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end | |
; Accumulates the SAD (sum of absolute differences) of a 4-pixel-wide, 2-row
; source block against the four reference blocks at ref1q..ref4q.
;   %1      first: 1 to initialise the accumulators m6/m7 from this row pair,
;           any other value to add this row pair onto them
;   %2, %3  offset of the first row from srcq / from each refNq
;   %4, %5  offset of the second row from srcq / from each refNq
;   %6      advance_at_end (optional, defaults to 0): 1 to step srcq by
;           2*src_strideq and ref1q..ref4q by 2*ref_strideq afterwards
; Result layout, one dword per reference:
;   m6 = { SAD(ref1), SAD(ref2) }, m7 = { SAD(ref3), SAD(ref4) }
; m0 holds the source rows; m4/m5 (first pass) or m1-m4 (later passes) are
; scratch. In this file the macro is only expanded under INIT_MMX.
| 16 %macro PROCESS_4x2x4 5-6 0 | |
| 17 movd m0, [srcq +%2] | |
| 18 %if %1 == 1 | |
; First pass: the SADs are produced directly in the accumulator registers.
| 19 movd m6, [ref1q+%3] | |
| 20 movd m4, [ref2q+%3] | |
| 21 movd m7, [ref3q+%3] | |
| 22 movd m5, [ref4q+%3] | |
; Put the second row's 4 bytes next to the first row's, so that one psadbw
; covers both rows.
| 23 punpckldq m0, [srcq +%4] | |
| 24 punpckldq m6, [ref1q+%5] | |
| 25 punpckldq m4, [ref2q+%5] | |
| 26 punpckldq m7, [ref3q+%5] | |
| 27 punpckldq m5, [ref4q+%5] | |
| 28 psadbw m6, m0 | |
| 29 psadbw m4, m0 | |
| 30 psadbw m7, m0 | |
| 31 psadbw m5, m0 | |
; Pack the four sums pairwise: m6 = {ref1, ref2}, m7 = {ref3, ref4}.
| 32 punpckldq m6, m4 | |
| 33 punpckldq m7, m5 | |
| 34 %else | |
; Later passes: compute the SADs in m1..m4, pack them the same way, then add
; them onto the running totals in m6/m7.
| 35 movd m1, [ref1q+%3] | |
| 36 movd m2, [ref2q+%3] | |
| 37 movd m3, [ref3q+%3] | |
| 38 movd m4, [ref4q+%3] | |
| 39 punpckldq m0, [srcq +%4] | |
| 40 punpckldq m1, [ref1q+%5] | |
| 41 punpckldq m2, [ref2q+%5] | |
| 42 punpckldq m3, [ref3q+%5] | |
| 43 punpckldq m4, [ref4q+%5] | |
| 44 psadbw m1, m0 | |
| 45 psadbw m2, m0 | |
| 46 psadbw m3, m0 | |
| 47 psadbw m4, m0 | |
| 48 punpckldq m1, m2 | |
| 49 punpckldq m3, m4 | |
| 50 paddd m6, m1 | |
| 51 paddd m7, m3 | |
| 52 %endif | |
| 53 %if %6 == 1 | |
; Step every pointer down by the two rows that were just consumed.
| 54 lea srcq, [srcq +src_strideq*2] | |
| 55 lea ref1q, [ref1q+ref_strideq*2] | |
| 56 lea ref2q, [ref2q+ref_strideq*2] | |
| 57 lea ref3q, [ref3q+ref_strideq*2] | |
| 58 lea ref4q, [ref4q+ref_strideq*2] | |
| 59 %endif | |
| 60 %endmacro | |
| 61 | |
| 62 ; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end | |
; Accumulates the SAD of an 8-pixel-wide, 2-row source block against the four
; reference blocks at ref1q..ref4q.
;   %1      first: 1 to initialise the accumulators m4..m7 from this row pair,
;           any other value to add this row pair onto them
;   %2, %3  offset of the first row from srcq / from each refNq
;   %4, %5  offset of the second row from srcq / from each refNq
;   %6      advance_at_end (optional, defaults to 0): 1 to step srcq by
;           2*src_strideq and ref1q..ref4q by 2*ref_strideq afterwards
; Accumulators: m4 = ref1, m5 = ref2, m6 = ref3, m7 = ref4. The first row is
; loaded into the low half of each register (movh) and the second row into the
; high half (movhps), so psadbw yields one partial sum per 64-bit half. The
; two halves are not added together here; that is left to the code that
; expands this macro.
; m0 holds the source rows; m1-m3 are scratch.
| 63 %macro PROCESS_8x2x4 5-6 0 | |
| 64 movh m0, [srcq +%2] | |
| 65 %if %1 == 1 | |
; First pass: the SADs are produced directly in the accumulator registers.
| 66 movh m4, [ref1q+%3] | |
| 67 movh m5, [ref2q+%3] | |
| 68 movh m6, [ref3q+%3] | |
| 69 movh m7, [ref4q+%3] | |
| 70 movhps m0, [srcq +%4] | |
| 71 movhps m4, [ref1q+%5] | |
| 72 movhps m5, [ref2q+%5] | |
| 73 movhps m6, [ref3q+%5] | |
| 74 movhps m7, [ref4q+%5] | |
| 75 psadbw m4, m0 | |
| 76 psadbw m5, m0 | |
| 77 psadbw m6, m0 | |
| 78 psadbw m7, m0 | |
| 79 %else | |
; Later passes: only m1-m3 are free (m0 = source, m4-m7 = accumulators), so
; ref1..ref3 are handled first and m1 is then reused for ref4.
| 80 movh m1, [ref1q+%3] | |
| 81 movh m2, [ref2q+%3] | |
| 82 movh m3, [ref3q+%3] | |
| 83 movhps m0, [srcq +%4] | |
| 84 movhps m1, [ref1q+%5] | |
| 85 movhps m2, [ref2q+%5] | |
| 86 movhps m3, [ref3q+%5] | |
| 87 psadbw m1, m0 | |
| 88 psadbw m2, m0 | |
| 89 psadbw m3, m0 | |
| 90 paddd m4, m1 | |
; m1 has been folded into m4 and can now be reloaded with ref4's two rows.
| 91 movh m1, [ref4q+%3] | |
| 92 movhps m1, [ref4q+%5] | |
| 93 paddd m5, m2 | |
| 94 paddd m6, m3 | |
| 95 psadbw m1, m0 | |
| 96 paddd m7, m1 | |
| 97 %endif | |
| 98 %if %6 == 1 | |
; Step every pointer down by the two rows that were just consumed.
| 99 lea srcq, [srcq +src_strideq*2] | |
| 100 lea ref1q, [ref1q+ref_strideq*2] | |
| 101 lea ref2q, [ref2q+ref_strideq*2] | |
| 102 lea ref3q, [ref3q+ref_strideq*2] | |
| 103 lea ref4q, [ref4q+ref_strideq*2] | |
| 104 %endif | |
| 105 %endmacro | |
| 106 | |
| 107 ; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end | |
; Accumulates the SAD of two 16-byte source chunks against the matching chunks
; of the four reference blocks at ref1q..ref4q.
;   %1      first: 1 to initialise the accumulators m4..m7 from the first
;           chunk, any other value to add the first chunk onto them
;   %2, %3  offset of the first chunk from srcq / from each refNq
;   %4, %5  offset of the second chunk from srcq / from each refNq
;   %6      advance_at_end (optional, defaults to 0): 1 to step srcq by
;           2*src_strideq and ref1q..ref4q by 2*ref_strideq
; Accumulators: m4 = ref1, m5 = ref2, m6 = ref3, m7 = ref4; each keeps one
; partial sum per 64-bit half, as produced by psadbw. m0 holds the source
; chunk; m1-m3 are scratch.
; The source is read with mova and the references with movu.
; NOTE(review): mova is the aligned-load form, so srcq plus each source offset
; is presumably required to be 16-byte aligned - confirm against callers.
| 108 %macro PROCESS_16x2x4 5-6 0 | |
| 109 ; 1st 16 px | |
| 110 mova m0, [srcq +%2] | |
| 111 %if %1 == 1 | |
| 112 movu m4, [ref1q+%3] | |
| 113 movu m5, [ref2q+%3] | |
| 114 movu m6, [ref3q+%3] | |
| 115 movu m7, [ref4q+%3] | |
| 116 psadbw m4, m0 | |
| 117 psadbw m5, m0 | |
| 118 psadbw m6, m0 | |
| 119 psadbw m7, m0 | |
| 120 %else | |
; Only m1-m3 are free here, so ref4 is processed after m1 has been folded
; into m4.
| 121 movu m1, [ref1q+%3] | |
| 122 movu m2, [ref2q+%3] | |
| 123 movu m3, [ref3q+%3] | |
| 124 psadbw m1, m0 | |
| 125 psadbw m2, m0 | |
| 126 psadbw m3, m0 | |
| 127 paddd m4, m1 | |
| 128 movu m1, [ref4q+%3] | |
| 129 paddd m5, m2 | |
| 130 paddd m6, m3 | |
| 131 psadbw m1, m0 | |
| 132 paddd m7, m1 | |
| 133 %endif | |
| 134 | |
; The second chunk is always added to the accumulators, whatever %1 is.
| 135 ; 2nd 16 px | |
| 136 mova m0, [srcq +%4] | |
| 137 movu m1, [ref1q+%5] | |
| 138 movu m2, [ref2q+%5] | |
| 139 movu m3, [ref3q+%5] | |
| 140 psadbw m1, m0 | |
| 141 psadbw m2, m0 | |
| 142 psadbw m3, m0 | |
| 143 paddd m4, m1 | |
| 144 movu m1, [ref4q+%5] | |
| 145 paddd m5, m2 | |
| 146 paddd m6, m3 | |
| 147 %if %6 == 1 | |
; Every load for this invocation has been issued by now, so the pointers can
; be advanced before the last psadbw/paddd, which only touch registers.
| 148 lea srcq, [srcq +src_strideq*2] | |
| 149 lea ref1q, [ref1q+ref_strideq*2] | |
| 150 lea ref2q, [ref2q+ref_strideq*2] | |
| 151 lea ref3q, [ref3q+ref_strideq*2] | |
| 152 lea ref4q, [ref4q+ref_strideq*2] | |
| 153 %endif | |
| 154 psadbw m1, m0 | |
| 155 paddd m7, m1 | |
| 156 %endmacro | |
| 157 | |
| 158 ; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end | |
; Processes a 32-pixel-wide, 2-row block by expanding PROCESS_16x2x4 twice,
; once per row. Within each expansion the two 16-byte chunks are the left
; (offset) and right (offset + 16) halves of the same row.
;   %1      first: forwarded to the first row only; the second row always
;           accumulates (first = 0)
;   %2, %3  offset of the first row from srcq / from each refNq
;   %4, %5  offset of the second row from srcq / from each refNq
;   %6      advance_at_end (optional, defaults to 0): forwarded to the second
;           row only, so the pointers move after both rows have been read
| 159 %macro PROCESS_32x2x4 5-6 0 | |
| 160 PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 | |
| 161 PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 | |
| 162 %endmacro | |
| 163 | |
| 164 ; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end | |
; Processes a 64-pixel-wide, 2-row block by expanding PROCESS_32x2x4 twice,
; once per row. Within each expansion the two 32-byte pieces are the left
; (offset) and right (offset + 32) halves of the same row.
;   %1      first: forwarded to the first row only; the second row always
;           accumulates (first = 0)
;   %2, %3  offset of the first row from srcq / from each refNq
;   %4, %5  offset of the second row from srcq / from each refNq
;   %6      advance_at_end (optional, defaults to 0): forwarded to the second
;           row only, so the pointers move after both rows have been read
| 165 %macro PROCESS_64x2x4 5-6 0 | |
| 166 PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 | |
| 167 PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 | |
| 168 %endmacro | |
| 169 | |
| 170 ; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride, | |
| 171 ; uint8_t *ref[4], int ref_stride, | |
| 172 ; unsigned int res[4]); | |
| 173 ; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16, 8x8 or 8x4 | |
; (4x8 and 4x4 are also generated from this macro, but under INIT_MMX sse
; rather than INIT_XMM sse2.)
;
; SADNXN4D width, height
;   %1  block width; selects the PROCESS_<width>x2x4 macro to expand
;   %2  block height in rows; rows are consumed two at a time
; Computes the SAD of one source block against four reference blocks and
; stores the four totals to res[0..3] in ref[0..3] order.
; In the non-UNIX64 variant only four arguments are declared to cglobal and r4
; carries the name ref2; res is fetched at the very end through r4mp, once
; ref2 is no longer needed.
| 174 %macro SADNXN4D 2 | |
| 175 %if UNIX64 | |
| 176 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ | |
| 177 res, ref2, ref3, ref4 | |
| 178 %else | |
| 179 cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ | |
| 180 ref2, ref3, ref4 | |
| 181 %endif | |
; The strides arrive as C ints. movsxdifnidn is an x86inc helper that
; presumably sign-extends them to pointer width where the two register views
; differ.
| 182 movsxdifnidn src_strideq, src_strided | |
| 183 movsxdifnidn ref_strideq, ref_strided | |
; ref1q initially points at the ref[] array. Fetch ref[1..3] first and
; overwrite ref1q with ref[0] last, because it is the base of those loads.
| 184 mov ref2q, [ref1q+gprsize*1] | |
| 185 mov ref3q, [ref1q+gprsize*2] | |
| 186 mov ref4q, [ref1q+gprsize*3] | |
| 187 mov ref1q, [ref1q+gprsize*0] | |
| 188 | |
; 1 + (%2-4)/2 + 1 expansions of two rows each cover all %2 rows. The first
; expansion initialises the accumulators; the last one skips the pointer
; advance because nothing is read afterwards.
| 189 PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 | |
| 190 %rep (%2-4)/2 | |
| 191 PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 | |
| 192 %endrep | |
| 193 PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 | |
| 194 | |
| 195 %if mmsize == 16 | |
; m4..m7 hold ref1..ref4, each as two partial sums in dwords 0 and 2.
; Shifting m5/m7 up by one dword and OR-ing interleaves the pairs:
;   m4 = { ref1.lo, ref2.lo, ref1.hi, ref2.hi }
;   m6 = { ref3.lo, ref4.lo, ref3.hi, ref4.hi }
| 196 pslldq m5, 4 | |
| 197 pslldq m7, 4 | |
| 198 por m4, m5 | |
| 199 por m6, m7 | |
; Gather all low halves into m4 and all high halves into m5, then add them to
; get the four totals in ref1..ref4 order.
| 200 mova m5, m4 | |
| 201 mova m7, m6 | |
| 202 punpcklqdq m4, m6 | |
| 203 punpckhqdq m5, m7 | |
| 204 movifnidn r4, r4mp | |
| 205 paddd m4, m5 | |
| 206 movu [r4], m4 | |
| 207 RET | |
| 208 %else | |
; 8-byte registers: PROCESS_4x2x4 already leaves m6 = { ref1, ref2 } and
; m7 = { ref3, ref4 } as packed dword totals, so they are stored as they are.
| 209 movifnidn r4, r4mp | |
| 210 movq [r4+0], m6 | |
| 211 movq [r4+8], m7 | |
| 212 RET | |
| 213 %endif | |
| 214 %endmacro | |
| 215 | |
| 216 INIT_XMM sse2 | |
; Block widths 8 to 64 are generated here; each line expands SADNXN4D with
; (width, height) and picks PROCESS_<width>x2x4 accordingly.
| 217 SADNXN4D 64, 64 | |
| 218 SADNXN4D 64, 32 | |
| 219 SADNXN4D 32, 64 | |
| 220 SADNXN4D 32, 32 | |
| 221 SADNXN4D 32, 16 | |
| 222 SADNXN4D 16, 32 | |
| 223 SADNXN4D 16, 16 | |
| 224 SADNXN4D 16, 8 | |
| 225 SADNXN4D 8, 16 | |
| 226 SADNXN4D 8, 8 | |
| 227 SADNXN4D 8, 4 | |
| 228 | |
| 229 INIT_MMX sse | |
; The 4-pixel-wide blocks are generated after switching to INIT_MMX, so
; SADNXN4D takes its mmsize != 16 store path for them.
| 230 SADNXN4D 4, 8 | |
| 231 SADNXN4D 4, 4 | |
| OLD | NEW |