OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
;-----------------------------------------------------------------------------
; void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Processes four rows of four samples. For each row: 4 bytes are read from s,
; zero-extended to 16-bit words, added to 4 words read from q with signed
; saturation, packed back to bytes with unsigned saturation, and 4 bytes are
; written to d.
;
;   s       bytes to read; successive rows are fetched 16 bytes apart
;   q       16-bit words to add; successive rows are fetched 32 bytes apart
;   d       output; successive rows are written `stride` bytes apart
;   stride  32-bit int, sign-extended before use
;
; Scratch: rax, rdx, mm0, mm1, flags. rsi/rdi are saved and restored.
; No emms is executed in this routine.
;-----------------------------------------------------------------------------
global sym(vp9_recon_b_mmx)
sym(vp9_recon_b_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; macro from x86_abi_support.asm
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = s
    mov         rdx, arg(1)                 ; rdx = q
    mov         rdi, arg(2)                 ; rdi = d, advanced one row at a time
    movsxd      rax, dword ptr arg(3)       ; rax = stride
    pxor        mm0, mm0                    ; zero source for unpack / pack

%assign RECON_ROW 0
%rep 4
    movd        mm1, [rsi + RECON_ROW*16]   ; 4 bytes of s for this row
    punpcklbw   mm1, mm0                    ; bytes -> words (upper bytes zero)
    paddsw      mm1, [rdx + RECON_ROW*32]   ; add 4 words of q, signed saturate
    packuswb    mm1, mm0                    ; words -> bytes, unsigned saturate
    movd        [rdi], mm1                  ; store the 4 result bytes
  %if RECON_ROW < 3
    add         rdi, rax                    ; step to the next output row
  %endif
  %assign RECON_ROW RECON_ROW+1
%endrep
%undef RECON_ROW

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 60 |
| 61 |
;-----------------------------------------------------------------------------
; void vp9_copy_mem8x8_mmx(
;     unsigned char *src,
;     int            src_stride,
;     unsigned char *dst,
;     int            dst_stride
; )
;
; Copies 8 rows of 8 bytes from src to dst. Source rows are src_stride bytes
; apart, destination rows are dst_stride bytes apart (both sign-extended).
; Rows are moved as two groups of three followed by one group of two; within
; a group all loads are issued before the stores.
;
; Scratch: rax, rcx, mm0-mm2, flags. rsi/rdi are saved and restored.
; No emms is executed in this routine.
;-----------------------------------------------------------------------------
global sym(vp9_copy_mem8x8_mmx)
sym(vp9_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; macro from x86_abi_support.asm
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movsxd      rax, dword ptr arg(1)       ; rax = src_stride
    mov         rdi, arg(2)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

    ; rows 0-2, then rows 3-5
%rep 2
    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    movq        mm2, [rsi + rax*2]
    lea         rsi, [rsi + rax*2]
    add         rsi, rax                    ; rsi += 3 * src_stride

    movq        [rdi], mm0
    movq        [rdi + rcx], mm1
    movq        [rdi + rcx*2], mm2
    lea         rdi, [rdi + rcx*2]
    add         rdi, rcx                    ; rdi += 3 * dst_stride
%endrep

    ; rows 6-7
    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    movq        [rdi], mm0
    movq        [rdi + rcx], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 123 |
| 124 |
;-----------------------------------------------------------------------------
; void vp9_copy_mem8x4_mmx(
;     unsigned char *src,
;     int            src_stride,
;     unsigned char *dst,
;     int            dst_stride
; )
;
; Copies 4 rows of 8 bytes from src to dst. Source rows are src_stride bytes
; apart, destination rows are dst_stride bytes apart (both sign-extended).
; Rows 0-2 are loaded, then stored; row 3 is loaded and stored last.
;
; Scratch: rax, rcx, mm0-mm3. rsi/rdi are saved and restored.
; No emms is executed in this routine.
;-----------------------------------------------------------------------------
global sym(vp9_copy_mem8x4_mmx)
sym(vp9_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; macro from x86_abi_support.asm
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movsxd      rax, dword ptr arg(1)       ; rax = src_stride
    mov         rdi, arg(2)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

    ; load rows 0-2
    movq        mm0, [rsi]
    movq        mm1, [rsi + rax]
    lea         rsi, [rsi + rax*2]          ; rsi -> source row 2
    movq        mm2, [rsi]

    ; store rows 0-2
    movq        [rdi], mm0
    movq        [rdi + rcx], mm1
    lea         rdi, [rdi + rcx*2]          ; rdi -> destination row 2
    movq        [rdi], mm2

    ; row 3
    movq        mm3, [rsi + rax]
    movq        [rdi + rcx], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 167 |
| 168 |
;-----------------------------------------------------------------------------
; void vp9_copy_mem16x16_mmx(
;     unsigned char *src,
;     int            src_stride,
;     unsigned char *dst,
;     int            dst_stride
; )
;
; Copies 16 rows of 16 bytes from src to dst, each row as two 8-byte moves.
; Source rows are src_stride bytes apart, destination rows are dst_stride
; bytes apart (both sign-extended). Five passes of three rows each are
; followed by a single final row; within a pass all six loads are issued
; before the six stores.
;
; Scratch: rax, rcx, mm0-mm5, flags. rsi/rdi are saved and restored.
; No emms is executed in this routine.
;-----------------------------------------------------------------------------
global sym(vp9_copy_mem16x16_mmx)
sym(vp9_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; macro from x86_abi_support.asm
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; rsi = src
    movsxd      rax, dword ptr arg(1)       ; rax = src_stride

    mov         rdi, arg(2)                 ; rdi = dst
    movsxd      rcx, dword ptr arg(3)       ; rcx = dst_stride

    ; rows 0-14: assemble-time unroll of five identical 3-row passes
%rep 5
    movq        mm0, [rsi]                  ; row n,   bytes 0-7
    movq        mm3, [rsi + 8]              ; row n,   bytes 8-15
    movq        mm1, [rsi + rax]            ; row n+1, bytes 0-7
    movq        mm4, [rsi + rax + 8]        ; row n+1, bytes 8-15
    movq        mm2, [rsi + rax*2]          ; row n+2, bytes 0-7
    movq        mm5, [rsi + rax*2 + 8]      ; row n+2, bytes 8-15

    lea         rsi, [rsi + rax*2]
    add         rsi, rax                    ; rsi += 3 * src_stride

    movq        [rdi], mm0
    movq        [rdi + 8], mm3
    movq        [rdi + rcx], mm1
    movq        [rdi + rcx + 8], mm4
    movq        [rdi + rcx*2], mm2
    movq        [rdi + rcx*2 + 8], mm5

    lea         rdi, [rdi + rcx*2]
    add         rdi, rcx                    ; rdi += 3 * dst_stride
%endrep

    ; row 15
    movq        mm0, [rsi]
    movq        mm3, [rsi + 8]
    movq        [rdi], mm0
    movq        [rdi + 8], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |