| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; | 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; | 9 ; |
| 10 | 10 |
| 11 %define program_name vpx | |
| 12 | |
| 13 %include "third_party/x86inc/x86inc.asm" | 11 %include "third_party/x86inc/x86inc.asm" |
| 14 | 12 |
| 15 SECTION .text | 13 SECTION .text |
| 16 | 14 |
| 17 %macro convolve_fn 1 | 15 %macro convolve_fn 1 |
| 18 INIT_XMM sse2 | 16 INIT_XMM sse2 |
| 19 cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \ | 17 cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ |
| 20 fx, fxs, fy, fys, w, h | 18 fx, fxs, fy, fys, w, h |
| 21 mov r4d, dword wm | 19 mov r4d, dword wm |
| 22 cmp r4d, 4 | 20 cmp r4d, 4 |
| 23 je .w4 | 21 je .w4 |
| 24 cmp r4d, 8 | 22 cmp r4d, 8 |
| 25 je .w8 | 23 je .w8 |
| 26 cmp r4d, 16 | 24 cmp r4d, 16 |
| 27 je .w16 | 25 je .w16 |
| 28 cmp r4d, 32 | 26 cmp r4d, 32 |
| 29 je .w32 | 27 je .w32 |
| 30 | 28 |
| 31 ; 64xh | 29 mov r4d, dword hm |
| 32 mov r4d, dword hm | |
| 33 shr r4d, 1 ; ASSUMPTION: hm is at least EVEN | |
| 34 sub r4d, 1 | |
| 35 | |
| 36 movu m0, [srcq] | |
| 37 movu m4, [srcq+src_strideq] | |
| 38 movu m1, [srcq+16] | |
| 39 movu m5, [srcq+src_strideq+16] | |
| 40 movu m2, [srcq+32] | |
| 41 movu m6, [srcq+src_strideq+32] | |
| 42 movu m3, [srcq+48] | |
| 43 movu m7, [srcq+src_strideq+48] | |
| 44 | |
| 45 .loop64: | 30 .loop64: |
| 46 prefetcht0 [srcq+64 ] | 31 movu m0, [srcq] |
| 47 prefetcht0 [srcq+src_strideq+64] | 32 movu m1, [srcq+16] |
| 48 | 33 movu m2, [srcq+32] |
| 49 lea srcq, [srcq+src_strideq*2] | 34 movu m3, [srcq+48] |
| 50 | 35 add srcq, src_strideq |
| 51 %ifidn %1, avg | 36 %ifidn %1, avg |
| 52 pavgb m0, [dstq] | 37 pavgb m0, [dstq] |
| 53 pavgb m1, [dstq+16] | 38 pavgb m1, [dstq+16] |
| 54 | 39 pavgb m2, [dstq+32] |
| 55 mova [dstq ], m0 | 40 pavgb m3, [dstq+48] |
| 56 movu m0, [srcq] | |
| 57 | |
| 58 mova [dstq+16], m1 | |
| 59 movu m1, [srcq+16] | |
| 60 | |
| 61 pavgb m2, [dstq+32] | |
| 62 mova [dstq+32], m2 | |
| 63 movu m2, [srcq+32] | |
| 64 pavgb m3, [dstq+48] | |
| 65 mova [dstq+48], m3 | |
| 66 movu m3, [srcq+48] | |
| 67 pavgb m4, [dstq+dst_strideq] | |
| 68 | |
| 69 mova [dstq+dst_strideq], m4 | |
| 70 movu m4, [srcq+src_strideq] | |
| 71 | |
| 72 pavgb m5, [dstq+dst_strideq+16] | |
| 73 mova [dstq+dst_strideq+16], m5 | |
| 74 movu m5, [srcq+src_strideq+16] | |
| 75 pavgb m6, [dstq+dst_strideq+32] | |
| 76 mova [dstq+dst_strideq+32], m6 | |
| 77 movu m6, [srcq+src_strideq+32] | |
| 78 pavgb m7, [dstq+dst_strideq+48] | |
| 79 mova [dstq+dst_strideq+48], m7 | |
| 80 movu m7, [srcq+src_strideq+48] | |
| 81 | |
| 82 lea dstq, [dstq+dst_strideq*2] | |
| 83 %else | |
| 84 mova [dstq ], m0 | |
| 85 movu m0, [srcq] | |
| 86 | |
| 87 mova [dstq+16], m1 | |
| 88 movu m1, [srcq+16] | |
| 89 mova [dstq+32], m2 | |
| 90 movu m2, [srcq+32] | |
| 91 mova [dstq+48], m3 | |
| 92 movu m3, [srcq+48] | |
| 93 | |
| 94 mova [dstq+dst_strideq], m4 | |
| 95 movu m4, [srcq+src_strideq] | |
| 96 | |
| 97 mova [dstq+dst_strideq+16], m5 | |
| 98 movu m5, [srcq+src_strideq+16] | |
| 99 mova [dstq+dst_strideq+32], m6 | |
| 100 movu m6, [srcq+src_strideq+32] | |
| 101 mova [dstq+dst_strideq+48], m7 | |
| 102 movu m7, [srcq+src_strideq+48] | |
| 103 | |
| 104 lea dstq, [dstq+dst_strideq*2] | |
| 105 %endif | 41 %endif |
| 106 dec r4d | 42 mova [dstq ], m0 |
| 43 mova [dstq+16], m1 |
| 44 mova [dstq+32], m2 |
| 45 mova [dstq+48], m3 |
| 46 add dstq, dst_strideq |
| 47 dec r4d |
| 107 jnz .loop64 | 48 jnz .loop64 |
| 108 | |
| 109 %ifidn %1, avg | |
| 110 pavgb m0, [dstq] | |
| 111 pavgb m1, [dstq+16] | |
| 112 pavgb m2, [dstq+32] | |
| 113 pavgb m3, [dstq+48] | |
| 114 pavgb m4, [dstq+dst_strideq] | |
| 115 pavgb m5, [dstq+dst_strideq+16] | |
| 116 pavgb m6, [dstq+dst_strideq+32] | |
| 117 pavgb m7, [dstq+dst_strideq+48] | |
| 118 %endif | |
| 119 mova [dstq ], m0 | |
| 120 mova [dstq+16], m1 | |
| 121 mova [dstq+32], m2 | |
| 122 mova [dstq+48], m3 | |
| 123 | |
| 124 mova [dstq+dst_strideq ], m4 | |
| 125 mova [dstq+dst_strideq+16], m5 | |
| 126 mova [dstq+dst_strideq+32], m6 | |
| 127 mova [dstq+dst_strideq+48], m7 | |
| 128 | |
| 129 RET | 49 RET |
| 130 | 50 |
| 131 .w32: | 51 .w32: |
| 132 mov r4d, dword hm | 52 mov r4d, dword hm |
| 133 sub r4d, 2 | |
| 134 | |
| 135 movu m0, [srcq] | |
| 136 movu m1, [srcq+16] | |
| 137 movu m2, [srcq+src_strideq] | |
| 138 movu m3, [srcq+src_strideq+16] | |
| 139 | |
| 140 .loop32: | 53 .loop32: |
| 141 prefetcht0 [srcq+64] | 54 movu m0, [srcq] |
| 142 prefetcht0 [srcq+src_strideq+64] | 55 movu m1, [srcq+16] |
| 143 | 56 movu m2, [srcq+src_strideq] |
| 144 lea srcq, [srcq+src_strideq*2] | 57 movu m3, [srcq+src_strideq+16] |
| 58 lea srcq, [srcq+src_strideq*2] |
| 145 %ifidn %1, avg | 59 %ifidn %1, avg |
| 146 pavgb m0, [dstq] | 60 pavgb m0, [dstq] |
| 147 pavgb m1, [dstq+16] | 61 pavgb m1, [dstq +16] |
| 148 pavgb m2, [dstq+dst_strideq] | 62 pavgb m2, [dstq+dst_strideq] |
| 149 pavgb m3, [dstq+dst_strideq+16] | 63 pavgb m3, [dstq+dst_strideq+16] |
| 150 %endif | 64 %endif |
| 151 mova [dstq], m0 | 65 mova [dstq ], m0 |
| 152 movu m0, [srcq] | 66 mova [dstq +16], m1 |
| 153 | 67 mova [dstq+dst_strideq ], m2 |
| 154 mova [dstq+16], m1 | 68 mova [dstq+dst_strideq+16], m3 |
| 155 movu m1, [srcq+16] | 69 lea dstq, [dstq+dst_strideq*2] |
| 156 | 70 sub r4d, 2 |
| 157 mova [dstq+dst_strideq], m2 | |
| 158 movu m2, [srcq+src_strideq] | |
| 159 | |
| 160 mova [dstq+dst_strideq+16], m3 | |
| 161 movu m3, [srcq+src_strideq+16] | |
| 162 | |
| 163 lea dstq, [dstq+dst_strideq*2] | |
| 164 | |
| 165 sub r4d, 2 | |
| 166 jnz .loop32 | 71 jnz .loop32 |
| 167 | |
| 168 %ifidn %1, avg | |
| 169 pavgb m0, [dstq] | |
| 170 pavgb m1, [dstq+16] | |
| 171 pavgb m2, [dstq+dst_strideq] | |
| 172 pavgb m3, [dstq+dst_strideq+16] | |
| 173 %endif | |
| 174 mova [dstq ], m0 | |
| 175 mova [dstq+16], m1 | |
| 176 | |
| 177 mova [dstq+dst_strideq ], m2 | |
| 178 mova [dstq+dst_strideq+16], m3 | |
| 179 | |
| 180 RET | 72 RET |
| 181 | 73 |
| 182 .w16: | 74 .w16: |
| 183 mov r4d, dword hm | 75 mov r4d, dword hm |
| 184 sub r4d, 4 | 76 lea r5q, [src_strideq*3] |
| 185 | 77 lea r6q, [dst_strideq*3] |
| 186 movu m0, [srcq] | |
| 187 movu m1, [srcq+src_strideq] | |
| 188 | |
| 189 .loop16: | 78 .loop16: |
| 190 lea srcq, [srcq+src_strideq] | 79 movu m0, [srcq] |
| 191 prefetcht0 [srcq+src_strideq*4] | 80 movu m1, [srcq+src_strideq] |
| 192 lea srcq, [srcq+src_strideq] | 81 movu m2, [srcq+src_strideq*2] |
| 193 prefetcht0 [srcq+src_strideq*2] | 82 movu m3, [srcq+r5q] |
| 83 lea srcq, [srcq+src_strideq*4] |
| 194 %ifidn %1, avg | 84 %ifidn %1, avg |
| 195 pavgb m0, [dstq] | 85 pavgb m0, [dstq] |
| 196 pavgb m1, [dstq+dst_strideq] | 86 pavgb m1, [dstq+dst_strideq] |
| 87 pavgb m2, [dstq+dst_strideq*2] |
| 88 pavgb m3, [dstq+r6q] |
| 197 %endif | 89 %endif |
| 198 mova [dstq ], m0 | 90 mova [dstq ], m0 |
| 199 mova [dstq+dst_strideq], m1 | 91 mova [dstq+dst_strideq ], m1 |
| 200 | 92 mova [dstq+dst_strideq*2], m2 |
| 201 lea dstq, [dstq+dst_strideq*2] | 93 mova [dstq+r6q ], m3 |
| 202 | 94 lea dstq, [dstq+dst_strideq*4] |
| 203 movu m0, [srcq] | 95 sub r4d, 4 |
| 204 movu m1, [srcq+src_strideq] | |
| 205 | |
| 206 sub r4d, 2 | |
| 207 jnz .loop16 | 96 jnz .loop16 |
| 208 | |
| 209 lea srcq, [srcq+src_strideq*2] | |
| 210 %ifidn %1, avg | |
| 211 pavgb m0, [dstq] | |
| 212 pavgb m1, [dstq+dst_strideq] | |
| 213 %endif | |
| 214 mova [dstq ], m0 | |
| 215 mova [dstq+dst_strideq], m1 | |
| 216 | |
| 217 lea dstq, [dstq+dst_strideq*2] | |
| 218 | |
| 219 movu m0, [srcq] | |
| 220 movu m1, [srcq+src_strideq] | |
| 221 | |
| 222 %ifidn %1, avg | |
| 223 pavgb m0, [dstq] | |
| 224 pavgb m1, [dstq+dst_strideq] | |
| 225 %endif | |
| 226 | |
| 227 mova [dstq ], m0 | |
| 228 mova [dstq+dst_strideq], m1 | |
| 229 | |
| 230 RET | 97 RET |
| 231 | 98 |
| 232 INIT_MMX sse | 99 INIT_MMX sse |
| 233 .w8: | 100 .w8: |
| 234 mov r4d, dword hm | 101 mov r4d, dword hm |
| 235 sub r4d, 2 | 102 lea r5q, [src_strideq*3] |
| 236 | 103 lea r6q, [dst_strideq*3] |
| 237 movu m0, [srcq] | |
| 238 movu m1, [srcq+src_strideq] | |
| 239 | |
| 240 .loop8: | 104 .loop8: |
| 241 lea srcq, [srcq+src_strideq] | 105 movu m0, [srcq] |
| 242 prefetcht0 [srcq+src_strideq*4] | 106 movu m1, [srcq+src_strideq] |
| 243 lea srcq, [srcq+src_strideq] | 107 movu m2, [srcq+src_strideq*2] |
| 244 prefetcht0 [srcq+src_strideq*2] | 108 movu m3, [srcq+r5q] |
| 245 | 109 lea srcq, [srcq+src_strideq*4] |
| 246 %ifidn %1, avg | 110 %ifidn %1, avg |
| 247 pavgb m0, [dstq] | 111 pavgb m0, [dstq] |
| 248 pavgb m1, [dstq+dst_strideq] | 112 pavgb m1, [dstq+dst_strideq] |
| 113 pavgb m2, [dstq+dst_strideq*2] |
| 114 pavgb m3, [dstq+r6q] |
| 249 %endif | 115 %endif |
| 250 mova [dstq ], m0 | 116 mova [dstq ], m0 |
| 251 mova [dstq+dst_strideq], m1 | 117 mova [dstq+dst_strideq ], m1 |
| 252 | 118 mova [dstq+dst_strideq*2], m2 |
| 253 movu m0, [srcq] | 119 mova [dstq+r6q ], m3 |
| 254 movu m1, [srcq+src_strideq] | 120 lea dstq, [dstq+dst_strideq*4] |
| 255 | 121 sub r4d, 4 |
| 256 lea dstq, [dstq+dst_strideq*2] | |
| 257 | |
| 258 sub r4d, 2 | |
| 259 jnz .loop8 | 122 jnz .loop8 |
| 260 | |
| 261 %ifidn %1, avg | |
| 262 pavgb m0, [dstq] | |
| 263 pavgb m1, [dstq+dst_strideq] | |
| 264 %endif | |
| 265 mova [dstq ], m0 | |
| 266 mova [dstq+dst_strideq], m1 | |
| 267 | |
| 268 RET | 123 RET |
| 269 | 124 |
| 270 .w4: | 125 .w4: |
| 271 mov r4d, dword hm | 126 mov r4d, dword hm |
| 272 | 127 lea r5q, [src_strideq*3] |
| 273 lea r5q, [src_strideq*3] | 128 lea r6q, [dst_strideq*3] |
| 274 lea r6q, [dst_strideq*3] | |
| 275 | |
| 276 .loop4: | 129 .loop4: |
| 277 movh m0, [srcq] | 130 movh m0, [srcq] |
| 278 movh m1, [srcq+src_strideq] | 131 movh m1, [srcq+src_strideq] |
| 279 movh m2, [srcq+src_strideq*2] | 132 movh m2, [srcq+src_strideq*2] |
| 280 movh m3, [srcq+r5q] | 133 movh m3, [srcq+r5q] |
| 281 | 134 lea srcq, [srcq+src_strideq*4] |
| 282 lea srcq, [srcq+src_strideq*4] | |
| 283 %ifidn %1, avg | 135 %ifidn %1, avg |
| 284 movh m4, [dstq] | 136 movh m4, [dstq] |
| 285 movh m5, [dstq+dst_strideq] | 137 movh m5, [dstq+dst_strideq] |
| 286 movh m6, [dstq+dst_strideq*2] | 138 movh m6, [dstq+dst_strideq*2] |
| 287 movh m7, [dstq+r6q] | 139 movh m7, [dstq+r6q] |
| 288 | 140 pavgb m0, m4 |
| 289 pavgb m0, m4 | 141 pavgb m1, m5 |
| 290 pavgb m1, m5 | 142 pavgb m2, m6 |
| 291 pavgb m2, m6 | 143 pavgb m3, m7 |
| 292 pavgb m3, m7 | |
| 293 %endif | 144 %endif |
| 294 movh [dstq ], m0 | 145 movh [dstq ], m0 |
| 295 movh [dstq+dst_strideq ], m1 | 146 movh [dstq+dst_strideq ], m1 |
| 296 movh [dstq+dst_strideq*2], m2 | 147 movh [dstq+dst_strideq*2], m2 |
| 297 movh [dstq+r6q ], m3 | 148 movh [dstq+r6q ], m3 |
| 298 | 149 lea dstq, [dstq+dst_strideq*4] |
| 299 lea dstq, [dstq+dst_strideq*4] | 150 sub r4d, 4 |
| 300 | |
| 301 sub r4d, 4 | |
| 302 jnz .loop4 | 151 jnz .loop4 |
| 303 RET | 152 RET |
| 304 %endmacro | 153 %endmacro |
| 305 | 154 |
| 306 convolve_fn copy | 155 convolve_fn copy |
| 307 convolve_fn avg | 156 convolve_fn avg |
| OLD | NEW |