| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 | |
| 12 %include "vpx_ports/x86_abi_support.asm" | |
| 13 | |
| 14 | |
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
;
; Normal loop filter across a horizontal edge.
;
; src_ptr addresses the first row below the edge (q0). Rows p0..p3 are read
; at -1..-4 pitches and rows q1..q3 at +1..+3 pitches. Every pass of the loop
; processes 8 adjacent columns (one quadword per row) and rewrites rows
; p1, p0, q0 and q1; p3, p2, q2 and q3 are only read.
;
; In:    arg(0) src_ptr
;        arg(1) src_pixel_step  row pitch in bytes (sign-extended to 64 bit)
;        arg(2) blimit, arg(3) limit, arg(4) thresh
;                               8 bytes are loaded from each pointer
;        arg(5) count           number of 8-column groups; the loop is
;                               bottom-tested, so count must be >= 1
; Regs:  rsi = q0 row, rdi = q1 row, rcx = groups left, rdx = scratch pointer
;        rax = pitch; negated while the p rows are addressed and negated back
;              at the bottom of the loop
; Stack: t0 = abs(q1-q0), t1 = abs(p1-p0), kept for the hev test
; Notes: no emms is executed here.
;        sym/arg/GLOBAL and the prolog/epilog macros come from the included
;        ABI support file; t80, tfe, t3, t4 and ones are constants defined
;        outside this routine (values not visible here).
global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step (pitch)

        movsxd      rcx, dword ptr arg(5)       ; count
.next8_h:
        mov         rdx, arg(3)                 ; limit
        movq        mm7, [rdx]
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions: mm1 accumulates every
        ; "difference - limit" that saturates to non-zero
        movq        mm2, [rdi+2*rax]            ; q3
        movq        mm1, [rsi+2*rax]            ; q2
        movq        mm6, mm1                    ; q2
        psubusb     mm1, mm2                    ; q2-=q3
        psubusb     mm2, mm6                    ; q3-=q2
        por         mm1, mm2                    ; abs(q3-q2)
        psubusb     mm1, mm7                    ; non-zero where abs(q3-q2) > limit


        movq        mm4, [rsi+rax]              ; q1
        movq        mm3, mm4                    ; q1
        psubusb     mm4, mm6                    ; q1-=q2
        psubusb     mm6, mm3                    ; q2-=q1
        por         mm4, mm6                    ; abs(q2-q1)

        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm4, [rsi]                  ; q0
        movq        mm0, mm4                    ; q0 (kept in mm0 for the filter)
        psubusb     mm4, mm3                    ; q0-=q1
        psubusb     mm3, mm0                    ; q1-=q0
        por         mm4, mm3                    ; abs(q0-q1)
        movq        t0, mm4                     ; save to t0
        psubusb     mm4, mm7
        por         mm1, mm4


        neg         rax                         ; negate pitch to address the rows above the edge

        movq        mm2, [rsi+4*rax]            ; p3
        movq        mm4, [rdi+4*rax]            ; p2
        movq        mm5, mm4                    ; p2
        psubusb     mm4, mm2                    ; p2-=p3
        psubusb     mm2, mm5                    ; p3-=p2
        por         mm4, mm2                    ; abs(p3 - p2)
        psubusb     mm4, mm7
        por         mm1, mm4


        movq        mm4, [rsi+2*rax]            ; p1
        movq        mm3, mm4                    ; p1
        psubusb     mm4, mm5                    ; p1-=p2
        psubusb     mm5, mm3                    ; p2-=p1
        por         mm4, mm5                    ; abs(p2 - p1)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm2, mm3                    ; p1

        movq        mm4, [rsi+rax]              ; p0
        movq        mm5, mm4                    ; p0
        psubusb     mm4, mm3                    ; p0-=p1
        psubusb     mm3, mm5                    ; p1-=p0
        por         mm4, mm3                    ; abs(p1 - p0)
        movq        t1, mm4                     ; save to t1
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm3, [rdi]                  ; q1
        movq        mm4, mm3                    ; q1
        psubusb     mm3, mm2                    ; q1-=p1
        psubusb     mm2, mm4                    ; p1-=q1
        por         mm2, mm3                    ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]          ; set lsb of each byte to zero (so the word shift stays per-byte)
        psrlw       mm2, 1                      ; abs(p1-q1)/2

        movq        mm6, mm5                    ; p0 (kept in mm6 for the filter)
        movq        mm3, [rsi]                  ; q0
        psubusb     mm5, mm3                    ; p0-=q0
        psubusb     mm3, mm6                    ; q0-=p0
        por         mm5, mm3                    ; abs(p0 - q0)
        paddusb     mm5, mm5                    ; abs(p0-q0)*2
        paddusb     mm5, mm2                    ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm7, [rdx]                  ; blimit

        psubusb     mm5, mm7                    ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1, mm5
        pxor        mm5, mm5
        pcmpeqb     mm1, mm5                    ; mask mm1: 0xff where every test passed

        ; calculate high edge variance
        mov         rdx, arg(4)                 ; thresh
        movq        mm7, [rdx]                  ;
        movq        mm4, t0                     ; get abs (q1 - q0)
        psubusb     mm4, mm7
        movq        mm3, t1                     ; get abs (p1 - p0)
        psubusb     mm3, mm7
        ; OR, not ADD: both operands are saturated differences, and a byte
        ; add can wrap to zero (e.g. 128 + 128), which would read as
        ; "neither exceeds thresh". This matches the vertical-edge routine.
        por         mm4, mm3                    ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     mm4, mm5                    ; 0xff where neither exceeds thresh

        pcmpeqb     mm5, mm5                    ; all ones
        pxor        mm4, mm5                    ; invert -> mm4 = hev mask


        ; start work on filters
        movq        mm2, [rsi+2*rax]            ; p1
        movq        mm7, [rdi]                  ; q1
        pxor        mm2, [GLOBAL(t80)]          ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]          ; q1 offset to convert to signed values
        psubsb      mm2, mm7                    ; p1 - q1
        pand        mm2, mm4                    ; high var mask (hvm)(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]          ; p0 offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]          ; q0 offset to convert to signed values
        movq        mm3, mm0                    ; q0
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                    ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                    ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                    ; mask filter values we don't care about
        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]           ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      mm2, [GLOBAL(t3)]           ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; bytes are unpacked into the high half of each word, so a word
        ; shift by 8+3 is a sign-extending per-byte shift by 3
        pxor        mm0, mm0                    ;
        pxor        mm5, mm5
        punpcklbw   mm0, mm2                    ;
        punpckhbw   mm5, mm2                    ;
        psraw       mm0, 11                     ;
        psraw       mm5, 11
        packsswb    mm0, mm5
        movq        mm2, mm0                    ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0                    ; 0
        movq        mm5, mm1                    ; abcdefgh
        punpcklbw   mm0, mm1                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign extended shift right by 3
        pxor        mm1, mm1                    ; 0
        punpckhbw   mm1, mm5                    ; a0b0c0d0
        psraw       mm1, 11                     ; sign extended shift right by 3
        movq        mm5, mm0                    ; save results (low words)

        packsswb    mm0, mm1                    ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [GLOBAL(ones)]
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                      ; partial shifted one more time for 2nd tap
        psraw       mm1, 1                      ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1                    ; (previous >>3 result + ones) >> 1
        pandn       mm4, mm5                    ; outer-tap adjustment, only where hev is clear

        paddsb      mm6, mm2                    ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+rax], mm6              ; write back

        movq        mm6, [rsi+2*rax]            ; p1
        pxor        mm6, [GLOBAL(t80)]          ; reoffset
        paddsb      mm6, mm4                    ; p1+= p1 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+2*rax], mm6            ; write back

        psubsb      mm3, mm0                    ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        movq        [rsi], mm3                  ; write back

        psubsb      mm7, mm4                    ; q1-= q1 add
        pxor        mm7, [GLOBAL(t80)]          ; unoffset
        movq        [rdi], mm7                  ; write back

        add         rsi,8                       ; next 8 columns
        neg         rax                         ; pitch positive again for the next pass
        dec         rcx
        jnz         .next8_h

    add         rsp, 32
    pop         rsp                             ; pairs with ALIGN_STACK above
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 225 | |
| 226 | |
;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
;
; Normal loop filter across a vertical edge.
;
; Each pass loads an 8x8 byte tile (8 rows, columns -4..+3 relative to
; src_ptr), transposes it so that every MMX register holds one column
; (p3 p2 p1 p0 | q0 q1 q2 q3), runs the same mask / hev / filter arithmetic
; as the horizontal-edge routine, then transposes the four modified columns
; (p1 p0 q0 q1) back and stores them as 4 bytes per row at column -2.
;
; In:    arg(0) src_ptr, arg(1) src_pixel_step (row pitch in bytes),
;        arg(2) blimit, arg(3) limit, arg(4) thresh (8 bytes read from each),
;        arg(5) count = number of 8-row groups; bottom-tested loop, so >= 1
; Regs:  rsi = row 4 of the tile at column -4, rdi = row 5,
;        rax = pitch (negated in the middle of each pass, negated back before
;        the last two row stores), rcx = groups left, rdx = scratch pointer
; Stack: t0 = abs(p1-p0), t1 = abs(q1-q0),
;        srct+0 = p1, srct+8 = p0, srct+16 = q0, srct+24 = q1
; Layout comments use "RC" = row R, column C of the tile, most significant
; byte first.
; Notes: no emms is executed here. Helper macros and the t80/tfe/t3/t4/ones
;        constants are defined outside this routine.
global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 64                         ; reserve 64 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step (pitch)

        lea         rsi, [rsi + rax*4 - 4]      ; row 4, four bytes left of the edge

        movsxd      rcx, dword ptr arg(5)       ; count
.v_edge_next8:
        lea         rdi, [rsi + rax]            ; rdi = row 5 (row +1 for indirect addressing)


        ; ---- transpose rows 4..7 ----
        movq        mm6, [rsi+2*rax]            ; 67 66 65 64 63 62 61 60
        movq        mm7, mm6                    ; 67 66 65 64 63 62 61 60

        punpckhbw   mm7, [rdi+2*rax]            ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6, [rdi+2*rax]            ; 73 63 72 62 71 61 70 60

        movq        mm4, [rsi]                  ; 47 46 45 44 43 42 41 40
        movq        mm5, mm4                    ; 47 46 45 44 43 42 41 40

        punpckhbw   mm5, [rsi+rax]              ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4, [rsi+rax]              ; 53 43 52 42 51 41 50 40

        movq        mm3, mm5                    ; 57 47 56 46 55 45 54 44
        punpckhwd   mm5, mm7                    ; 77 67 57 47 76 66 56 46

        punpcklwd   mm3, mm7                    ; 75 65 55 45 74 64 54 44
        movq        mm2, mm4                    ; 53 43 52 42 51 41 50 40

        punpckhwd   mm4, mm6                    ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2, mm6                    ; 71 61 51 41 70 60 50 40

        ; ---- rows 0..3, high halves first ----
        neg         rax                         ; negative pitch reaches rows 0..3
        movq        mm6, [rsi+rax*2]            ; 27 26 25 24 23 22 21 20

        movq        mm1, mm6                    ; 27 26 25 24 23 22 21 20
        punpckhbw   mm6, [rsi+rax]              ; 37 27 36 26 35 25 34 24

        punpcklbw   mm1, [rsi+rax]              ; 33 23 32 22 31 21 30 20
        movq        mm7, [rsi+rax*4]            ; 07 06 05 04 03 02 01 00

        punpckhbw   mm7, [rdi+rax*4]            ; 17 07 16 06 15 05 14 04
        movq        mm0, mm7                    ; 17 07 16 06 15 05 14 04

        punpckhwd   mm7, mm6                    ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0, mm6                    ; 35 25 15 05 34 24 14 04

        movq        mm6, mm7                    ; 37 27 17 07 36 26 16 06
        punpckhdq   mm7, mm5                    ; 77 67 57 47 37 27 17 07  = q3

        punpckldq   mm6, mm5                    ; 76 66 56 46 36 26 16 06  = q2

        movq        mm5, mm6                    ; q2
        psubusb     mm5, mm7                    ; q2-q3

        psubusb     mm7, mm6                    ; q3-q2
        por         mm7, mm5                    ; mm7=abs (q3-q2)

        movq        mm5, mm0                    ; 35 25 15 05 34 24 14 04
        punpckhdq   mm5, mm3                    ; 75 65 55 45 35 25 15 05  = q1

        punpckldq   mm0, mm3                    ; 74 64 54 44 34 24 14 04  = q0
        movq        mm3, mm5                    ; q1

        psubusb     mm3, mm6                    ; q1-q2
        psubusb     mm6, mm5                    ; q2-q1

        por         mm6, mm3                    ; mm6=abs(q2-q1)
        lea         rdx, srct

        movq        [rdx+24], mm5               ; save q1
        movq        [rdx+16], mm0               ; save q0

        ; ---- rows 0..3, low halves ----
        movq        mm3, [rsi+rax*4]            ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3, [rdi+rax*4]            ; 13 03 12 02 11 01 10 00

        movq        mm0, mm3                    ; 13 03 12 02 11 01 10 00
        punpcklwd   mm0, mm1                    ; 31 21 11 01 30 20 10 00

        punpckhwd   mm3, mm1                    ; 33 23 13 03 32 22 12 02
        movq        mm1, mm0                    ; 31 21 11 01 30 20 10 00

        punpckldq   mm0, mm2                    ; 70 60 50 40 30 20 10 00  = p3
        punpckhdq   mm1, mm2                    ; 71 61 51 41 31 21 11 01  = p2

        movq        mm2, mm1                    ; p2
        psubusb     mm2, mm0                    ; p2-p3

        psubusb     mm0, mm1                    ; p3-p2
        por         mm0, mm2                    ; mm0=abs(p3-p2)

        movq        mm2, mm3                    ; 33 23 13 03 32 22 12 02
        punpckldq   mm2, mm4                    ; 72 62 52 42 32 22 12 02  = p1

        punpckhdq   mm3, mm4                    ; 73 63 53 43 33 23 13 03  = p0
        movq        [rdx+8], mm3                ; save p0

        movq        [rdx], mm2                  ; save p1
        movq        mm5, mm2                    ; mm5 = p1

        psubusb     mm2, mm1                    ; p1-p2
        psubusb     mm1, mm5                    ; p2-p1

        por         mm1, mm2                    ; mm1=abs(p2-p1)

        ; ---- breakout mask ----
        mov         rdx, arg(3)                 ; limit

        movq        mm4, [rdx]                  ; mm4 = limit
        psubusb     mm7, mm4

        psubusb     mm0, mm4
        psubusb     mm1, mm4

        psubusb     mm6, mm4
        por         mm7, mm6

        por         mm0, mm1
        por         mm0, mm7                    ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit

        movq        mm1, mm5                    ; p1

        movq        mm7, mm3                    ; mm3=mm7=p0
        psubusb     mm7, mm5                    ; p0 - p1

        psubusb     mm5, mm3                    ; p1 - p0
        por         mm5, mm7                    ; abs(p1-p0)

        movq        t0, mm5                     ; save abs(p1-p0)
        lea         rdx, srct

        psubusb     mm5, mm4
        por         mm0, mm5                    ; mm0=mask

        movq        mm5, [rdx+16]               ; mm5=q0
        movq        mm7, [rdx+24]               ; mm7=q1

        movq        mm6, mm5                    ; mm6=q0
        movq        mm2, mm7                    ; q1
        psubusb     mm5, mm7                    ; q0-q1

        psubusb     mm7, mm6                    ; q1-q0
        por         mm7, mm5                    ; abs(q1-q0)

        movq        t1, mm7                     ; save abs(q1-q0)
        psubusb     mm7, mm4

        por         mm0, mm7                    ; mask

        movq        mm5, mm2                    ; q1
        psubusb     mm5, mm1                    ; q1-=p1
        psubusb     mm1, mm2                    ; p1-=q1
        por         mm5, mm1                    ; abs(p1-q1)
        pand        mm5, [GLOBAL(tfe)]          ; set lsb of each byte to zero
        psrlw       mm5, 1                      ; abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit

        movq        mm4, [rdx]                  ; blimit
        movq        mm1, mm3                    ; mm1=mm3=p0

        movq        mm7, mm6                    ; mm7=mm6=q0
        psubusb     mm1, mm7                    ; p0-q0

        psubusb     mm7, mm3                    ; q0-p0
        por         mm1, mm7                    ; abs(q0-p0)
        paddusb     mm1, mm1                    ; abs(q0-p0)*2
        paddusb     mm1, mm5                    ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     mm1, mm4                    ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1, mm0                    ; mask

        pxor        mm0, mm0
        pcmpeqb     mm1, mm0                    ; 0xff where every test passed

        ; ---- calculate high edge variance ----
        mov         rdx, arg(4)                 ; thresh
        movq        mm7, [rdx]
        ;
        movq        mm4, t0                     ; get abs (p1 - p0)
        psubusb     mm4, mm7

        movq        mm3, t1                     ; get abs (q1 - q0)
        psubusb     mm3, mm7

        por         mm4, mm3                    ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
        pcmpeqb     mm4, mm0                    ; 0xff where neither exceeds thresh

        pcmpeqb     mm0, mm0                    ; all ones
        pxor        mm4, mm0                    ; invert -> mm4 = hev mask



        ; ---- start work on filters ----
        lea         rdx, srct

        movq        mm2, [rdx]                  ; p1
        movq        mm7, [rdx+24]               ; q1

        movq        mm6, [rdx+8]                ; p0
        movq        mm0, [rdx+16]               ; q0

        pxor        mm2, [GLOBAL(t80)]          ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]          ; q1 offset to convert to signed values

        psubsb      mm2, mm7                    ; p1 - q1
        pand        mm2, mm4                    ; high var mask (hvm)(p1 - q1)

        pxor        mm6, [GLOBAL(t80)]          ; p0 offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]          ; q0 offset to convert to signed values

        movq        mm3, mm0                    ; q0
        psubsb      mm0, mm6                    ; q0 - p0

        paddsb      mm2, mm0                    ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      mm2, mm0                    ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      mm2, mm0                    ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        mm1, mm2                    ; mask filter values we don't care about

        movq        mm2, mm1
        paddsb      mm1, [GLOBAL(t4)]           ; 3* (q0 - p0) + hvm(p1 - q1) + 4

        paddsb      mm2, [GLOBAL(t3)]           ; 3* (q0 - p0) + hvm(p1 - q1) + 3
        pxor        mm0, mm0                    ;

        ; bytes go into the high half of each word, so psraw by 8+3 is a
        ; sign-extending per-byte shift by 3
        pxor        mm5, mm5
        punpcklbw   mm0, mm2                    ;

        punpckhbw   mm5, mm2                    ;
        psraw       mm0, 11                     ;

        psraw       mm5, 11
        packsswb    mm0, mm5

        movq        mm2, mm0                    ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

        pxor        mm0, mm0                    ; 0
        movq        mm5, mm1                    ; abcdefgh

        punpcklbw   mm0, mm1                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign extended shift right by 3

        pxor        mm1, mm1                    ; 0
        punpckhbw   mm1, mm5                    ; a0b0c0d0

        psraw       mm1, 11                     ; sign extended shift right by 3
        movq        mm5, mm0                    ; save results (low words)

        packsswb    mm0, mm1                    ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      mm5, [GLOBAL(ones)]

        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                      ; partial shifted one more time for 2nd tap

        psraw       mm1, 1                      ; partial shifted one more time for 2nd tap
        packsswb    mm5, mm1                    ; (previous >>3 result + ones) >> 1

        pandn       mm4, mm5                    ; outer-tap adjustment, only where hev is clear

        paddsb      mm6, mm2                    ; p0+= p0 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset

        ; mm6=p0
        movq        mm1, [rdx]                  ; p1
        pxor        mm1, [GLOBAL(t80)]          ; reoffset

        paddsb      mm1, mm4                    ; p1+= p1 add
        pxor        mm1, [GLOBAL(t80)]          ; unoffset
        ; mm6 = p0 mm1 = p1

        psubsb      mm3, mm0                    ; q0-= q0 add
        pxor        mm3, [GLOBAL(t80)]          ; unoffset

        ; mm3 = q0
        psubsb      mm7, mm4                    ; q1-= q1 add
        pxor        mm7, [GLOBAL(t80)]          ; unoffset
        ; mm7 = q1

        ; ---- transpose and write back ----
        ; mm1 =    72 62 52 42 32 22 12 02
        ; mm6 =    73 63 53 43 33 23 13 03
        ; mm3 =    74 64 54 44 34 24 14 04
        ; mm7 =    75 65 55 45 35 25 15 05

        movq        mm2, mm1                    ; 72 62 52 42 32 22 12 02
        punpcklbw   mm2, mm6                    ; 33 32 23 22 13 12 03 02

        movq        mm4, mm3                    ; 74 64 54 44 34 24 14 04
        punpckhbw   mm1, mm6                    ; 73 72 63 62 53 52 43 42

        punpcklbw   mm4, mm7                    ; 35 34 25 24 15 14 05 04
        punpckhbw   mm3, mm7                    ; 75 74 65 64 55 54 45 44

        movq        mm6, mm2                    ; 33 32 23 22 13 12 03 02
        punpcklwd   mm2, mm4                    ; 15 14 13 12 05 04 03 02

        punpckhwd   mm6, mm4                    ; 35 34 33 32 25 24 23 22
        movq        mm5, mm1                    ; 73 72 63 62 53 52 43 42

        punpcklwd   mm1, mm3                    ; 55 54 53 52 45 44 43 42
        punpckhwd   mm5, mm3                    ; 75 74 73 72 65 64 63 62


        ; mm2 = 15 14 13 12 05 04 03 02
        ; mm6 = 35 34 33 32 25 24 23 22
        ; mm1 = 55 54 53 52 45 44 43 42
        ; mm5 = 75 74 73 72 65 64 63 62

        ; rax is still negative here; +2 skips the untouched p3/p2 columns

        movd        [rsi+rax*4+2], mm2          ; row 0
        psrlq       mm2, 32

        movd        [rdi+rax*4+2], mm2          ; row 1
        movd        [rsi+rax*2+2], mm6          ; row 2

        psrlq       mm6, 32
        movd        [rsi+rax+2],mm6             ; row 3

        movd        [rsi+2], mm1                ; row 4
        psrlq       mm1, 32

        movd        [rdi+2], mm1                ; row 5
        neg         rax                         ; pitch positive again

        movd        [rdi+rax+2],mm5             ; row 6
        psrlq       mm5, 32

        movd        [rdi+rax*2+2], mm5          ; row 7

        lea         rsi, [rsi+rax*8]            ; advance to the next 8 rows
        dec         rcx
        jnz         .v_edge_next8

    add         rsp, 64
    pop         rsp                             ; pairs with ALIGN_STACK above
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 595 | |
| 596 | |
;void vp8_mbloop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
;
; Macroblock loop filter across a horizontal edge.
;
; src_ptr addresses the first row below the edge (q0). Rows p0..p3 are read
; at -1..-4 pitches and rows q1..q3 at +1..+3 pitches. Every pass of the loop
; processes 8 adjacent columns and rewrites three rows on each side of the
; edge (p2, p1, p0, q0, q1, q2); p3 and q3 are only read.
;
; In:    arg(0) src_ptr
;        arg(1) src_pixel_step  row pitch in bytes (sign-extended to 64 bit)
;        arg(2) blimit, arg(3) limit, arg(4) thresh
;                               8 bytes are loaded from each pointer
;        arg(5) count           number of 8-column groups; the loop is
;                               bottom-tested, so count must be >= 1
; Regs:  rsi = q0 row, rdi = q1 row, rcx = groups left, rdx = scratch pointer
;        rax = pitch; its sign is flipped several times per pass and is
;              positive again when the loop branches back
; Stack: t0 = abs(q1-q0), t1 = abs(p1-p0), kept for the hev test
; Notes: no emms is executed here.
;        Helper macros and the constants t80, tfe, t3, t4, s9, s18, s27 and
;        s63 are defined outside this routine (values not visible here).
global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step (pitch)

        movsxd      rcx, dword ptr arg(5)       ; count
.next8_mbh:
        mov         rdx, arg(3)                 ; limit
        movq        mm7, [rdx]
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing
        add         rdi, rax

        ; calculate breakout conditions
        movq        mm2, [rdi+2*rax]            ; q3

        movq        mm1, [rsi+2*rax]            ; q2
        movq        mm6, mm1                    ; q2
        psubusb     mm1, mm2                    ; q2-=q3
        psubusb     mm2, mm6                    ; q3-=q2
        por         mm1, mm2                    ; abs(q3-q2)
        psubusb     mm1, mm7                    ; non-zero where abs(q3-q2) > limit


        ; mm1 = mask so far, mm6 = q2, mm7 = limit
        movq        mm4, [rsi+rax]              ; q1
        movq        mm3, mm4                    ; q1
        psubusb     mm4, mm6                    ; q1-=q2
        psubusb     mm6, mm3                    ; q2-=q1
        por         mm4, mm6                    ; abs(q2-q1)
        psubusb     mm4, mm7
        por         mm1, mm4


        ; mm1 = mask, mm3=q1, mm7 = limit

        movq        mm4, [rsi]                  ; q0
        movq        mm0, mm4                    ; q0
        psubusb     mm4, mm3                    ; q0-=q1
        psubusb     mm3, mm0                    ; q1-=q0
        por         mm4, mm3                    ; abs(q0-q1)
        movq        t0, mm4                     ; save to t0
        psubusb     mm4, mm7
        por         mm1, mm4


        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)

        neg         rax                         ; negate pitch to address the rows above the edge

        movq        mm2, [rsi+4*rax]            ; p3
        movq        mm4, [rdi+4*rax]            ; p2
        movq        mm5, mm4                    ; p2
        psubusb     mm4, mm2                    ; p2-=p3
        psubusb     mm2, mm5                    ; p3-=p2
        por         mm4, mm2                    ; abs(p3 - p2)
        psubusb     mm4, mm7
        por         mm1, mm4
        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)

        movq        mm4, [rsi+2*rax]            ; p1
        movq        mm3, mm4                    ; p1
        psubusb     mm4, mm5                    ; p1-=p2
        psubusb     mm5, mm3                    ; p2-=p1
        por         mm4, mm5                    ; abs(p2 - p1)
        psubusb     mm4, mm7
        por         mm1, mm4

        movq        mm2, mm3                    ; p1


        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1)

        movq        mm4, [rsi+rax]              ; p0
        movq        mm5, mm4                    ; p0
        psubusb     mm4, mm3                    ; p0-=p1
        psubusb     mm3, mm5                    ; p1-=p0
        por         mm4, mm3                    ; abs(p1 - p0)
        movq        t1, mm4                     ; save to t1
        psubusb     mm4, mm7
        por         mm1, mm4
        ; mm1 = mask, mm0=q0,  mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
        ; mm5 = p0
        movq        mm3, [rdi]                  ; q1
        movq        mm4, mm3                    ; q1
        psubusb     mm3, mm2                    ; q1-=p1
        psubusb     mm2, mm4                    ; p1-=q1
        por         mm2, mm3                    ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]          ; set lsb of each byte to zero (so the word shift stays per-byte)
        psrlw       mm2, 1                      ; abs(p1-q1)/2

        movq        mm6, mm5                    ; p0
        movq        mm3, mm0                    ; q0
        psubusb     mm5, mm3                    ; p0-=q0
        psubusb     mm3, mm6                    ; q0-=p0
        por         mm5, mm3                    ; abs(p0 - q0)
        paddusb     mm5, mm5                    ; abs(p0-q0)*2
        paddusb     mm5, mm2                    ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm7, [rdx]                  ; blimit

        psubusb     mm5, mm7                    ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         mm1, mm5
        pxor        mm5, mm5
        pcmpeqb     mm1, mm5                    ; mask mm1: 0xff where every test passed

        ; mm1 = mask, mm0=q0,  mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
        ; mm6 = p0,

        ; calculate high edge variance
        mov         rdx, arg(4)                 ; thresh
        movq        mm7, [rdx]                  ;
        movq        mm4, t0                     ; get abs (q1 - q0)
        psubusb     mm4, mm7
        movq        mm3, t1                     ; get abs (p1 - p0)
        psubusb     mm3, mm7
        ; OR, not ADD: both operands are saturated differences, and a byte
        ; add can wrap to zero (e.g. 128 + 128), which would read as
        ; "neither exceeds thresh". This matches the vertical-edge routine.
        por         mm4, mm3                    ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     mm4, mm5                    ; 0xff where neither exceeds thresh

        pcmpeqb     mm5, mm5                    ; all ones
        pxor        mm4, mm5                    ; invert -> mm4 = hev mask



        ; mm1 = mask, mm0=q0,  mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
        ; mm6 = p0, mm4=hev
        ; start work on filters
        movq        mm2, [rsi+2*rax]            ; p1
        movq        mm7, [rdi]                  ; q1
        pxor        mm2, [GLOBAL(t80)]          ; p1 offset to convert to signed values
        pxor        mm7, [GLOBAL(t80)]          ; q1 offset to convert to signed values
        psubsb      mm2, mm7                    ; p1 - q1

        pxor        mm6, [GLOBAL(t80)]          ; p0 offset to convert to signed values
        pxor        mm0, [GLOBAL(t80)]          ; q0 offset to convert to signed values
        movq        mm3, mm0                    ; q0
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      mm2, mm0                    ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      mm2, mm0                    ; 3 * (q0 - p0) + (p1 - q1)
        pand        mm1, mm2                    ; mask filter values we don't care about


        ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
        movq        mm2, mm1                    ; vp8_filter
        pand        mm2, mm4                    ; Filter2 = vp8_filter & hev

        movq        mm5, mm2                    ;
        paddsb      mm5, [GLOBAL(t3)]           ; vp8_signed_char_clamp(Filter2 + 3)

        ; bytes go into the high half of each word, so psraw by 8+3 is a
        ; sign-extending per-byte shift by 3
        pxor        mm0, mm0                    ; 0
        pxor        mm7, mm7                    ; 0

        punpcklbw   mm0, mm5                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign extended shift right by 3
        punpckhbw   mm7, mm5                    ; a0b0c0d0
        psraw       mm7, 11                     ; sign extended shift right by 3
        packsswb    mm0, mm7                    ; (Filter2 + 3) >> 3

        movq        mm5, mm0                    ; mm5 = amount added to ps0

        paddsb      mm2, [GLOBAL(t4)]           ; vp8_signed_char_clamp(Filter2 + 4)
        pxor        mm0, mm0                    ; 0
        pxor        mm7, mm7                    ; 0

        punpcklbw   mm0, mm2                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign extended shift right by 3
        punpckhbw   mm7, mm2                    ; a0b0c0d0
        psraw       mm7, 11                     ; sign extended shift right by 3
        packsswb    mm0, mm7                    ; (Filter2 + 4) >> 3 = Filter1

        ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2 (shifted),
        ; mm4 = hev, mm6 = ps0
        psubsb      mm3, mm0                    ; qs0 = qs0 - Filter1
        paddsb      mm6, mm5                    ; ps0 = ps0 + Filter2

        ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
        ; vp8_filter &= ~hev;
        ; Filter2 = vp8_filter;
        pandn       mm4, mm1                    ; vp8_filter&=~hev


        ; mm3=qs0, mm4=filter2, mm6=ps0

        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
        ; s = vp8_signed_char_clamp(qs0 - u);
        ; *oq0 = s^0x80;
        ; s = vp8_signed_char_clamp(ps0 + u);
        ; *op0 = s^0x80;
        ; NOTE(review): the unpack leaves each byte in the high half of a
        ; word, so s27/s18/s9 presumably hold the tap scaled by 256 for
        ; pmulhw to yield Filter2 * tap - confirm against the constant table.
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s27)]
        pmulhw      mm2, [GLOBAL(s27)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2                    ; u

        psubsb      mm3, mm1                    ; qs0 - u
        paddsb      mm6, mm1                    ; ps0 + u

        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+rax], mm6              ; write back p0
        movq        [rsi], mm3                  ; write back q0

        ; roughly 2/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
        ; s = vp8_signed_char_clamp(qs1 - u);
        ; *oq1 = s^0x80;
        ; s = vp8_signed_char_clamp(ps1 + u);
        ; *op1 = s^0x80;
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s18)]
        pmulhw      mm2, [GLOBAL(s18)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2                    ; u

        movq        mm3, [rdi]                  ; q1
        movq        mm6, [rsi+rax*2]            ; p1

        pxor        mm3, [GLOBAL(t80)]          ; offset to signed
        pxor        mm6, [GLOBAL(t80)]          ; offset to signed

        paddsb      mm6, mm1                    ; ps1 + u
        psubsb      mm3, mm1                    ; qs1 - u

        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        movq        [rdi], mm3                  ; write back q1
        movq        [rsi+rax*2], mm6            ; write back p1

        ; roughly 1/7th difference across boundary
        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
        ; s = vp8_signed_char_clamp(qs2 - u);
        ; *oq2 = s^0x80;
        ; s = vp8_signed_char_clamp(ps2 + u);
        ; *op2 = s^0x80;
        pxor        mm1, mm1
        pxor        mm2, mm2
        punpcklbw   mm1, mm4
        punpckhbw   mm2, mm4
        pmulhw      mm1, [GLOBAL(s9)]
        pmulhw      mm2, [GLOBAL(s9)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        psraw       mm1, 7
        psraw       mm2, 7
        packsswb    mm1, mm2                    ; u


        movq        mm6, [rdi+rax*4]            ; p2 (rax negative: q1 row - 4 rows)
        neg         rax                         ; pitch positive
        movq        mm3, [rdi+rax  ]            ; q2 (q1 row + 1 row)

        pxor        mm6, [GLOBAL(t80)]          ; offset to signed
        pxor        mm3, [GLOBAL(t80)]          ; offset to signed

        paddsb      mm6, mm1                    ; ps2 + u
        psubsb      mm3, mm1                    ; qs2 - u

        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        movq        [rdi+rax  ], mm3            ; write back q2
        neg         rax                         ; pitch negative
        movq        [rdi+rax*4], mm6            ; write back p2

        ; loop tail (the original source has a commented-out label
        ; "EARLY_BREAK_OUT:" here)
        neg         rax                         ; pitch positive again for the next pass
        add         rsi,8                       ; next 8 columns
        dec         rcx
        jnz         .next8_mbh

    add         rsp, 32
    pop         rsp                             ; pairs with ALIGN_STACK above
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 912 | |
| 913 | |
| 914 ;void vp8_mbloop_filter_vertical_edge_mmx | |
| 915 ;( | |
| 916 ; unsigned char *src_ptr, | |
| 917 ; int src_pixel_step, | |
| 918 ; const char *blimit, | |
| 919 ; const char *limit, | |
| 920 ; const char *thresh, | |
| 921 ; int count | |
| 922 ;) | |
| 923 global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE | |
| 924 sym(vp8_mbloop_filter_vertical_edge_mmx): | |
| 925 push rbp | |
| 926 mov rbp, rsp | |
| 927 SHADOW_ARGS_TO_STACK 6 | |
| 928 GET_GOT rbx | |
| 929 push rsi | |
| 930 push rdi | |
| 931 ; end prolog | |
| 932 | |
| 933 ALIGN_STACK 16, rax | |
| 934 sub rsp, 96 ; reserve 96 bytes | |
| 935 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; | |
| 936 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; | |
| 937 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; | |
| 938 | |
| 939 mov rsi, arg(0) ;src_ptr | |
| 940 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destinati
on pitch? | |
| 941 | |
| 942 lea rsi, [rsi + rax*4 - 4] | |
| 943 | |
| 944 movsxd rcx, dword ptr arg(5) ;count | |
| 945 .next8_mbv: | |
| 946 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect
addressing | |
| 947 | |
| 948 ;transpose | |
| 949 movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72
71 70 | |
| 950 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62
61 60 | |
| 951 | |
| 952 movq mm7, mm6 ; 77 76 75 74 73 72
71 70 | |
| 953 punpckhbw mm7, mm0 ; 77 67 76 66 75 65
74 64 | |
| 954 | |
| 955 punpcklbw mm6, mm0 ; 73 63 72 62 71 61
70 60 | |
| 956 movq mm0, [rsi+rax] ; 57 56 55 54 53 52
51 50 | |
| 957 | |
| 958 movq mm4, [rsi] ; 47 46 45 44 43 42
41 40 | |
| 959 movq mm5, mm4 ; 47 46 45 44 43 42
41 40 | |
| 960 | |
| 961 punpckhbw mm5, mm0 ; 57 47 56 46 55 45
54 44 | |
| 962 punpcklbw mm4, mm0 ; 53 43 52 42 51 41
50 40 | |
| 963 | |
| 964 movq mm3, mm5 ; 57 47 56 46 55 45
54 44 | |
| 965 punpckhwd mm5, mm7 ; 77 67 57 47 76 66
56 46 | |
| 966 | |
| 967 punpcklwd mm3, mm7 ; 75 65 55 45 74 64
54 44 | |
| 968 movq mm2, mm4 ; 53 43 52 42 51 41
50 40 | |
| 969 | |
| 970 punpckhwd mm4, mm6 ; 73 63 53 43 72 62
52 42 | |
| 971 punpcklwd mm2, mm6 ; 71 61 51 41 70 60
50 40 | |
| 972 | |
| 973 neg rax | |
| 974 | |
| 975 movq mm7, [rsi+rax] ; 37 36 35 34 33 32
31 30 | |
| 976 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22
21 20 | |
| 977 | |
| 978 movq mm1, mm6 ; 27 26 25 24 23 22
21 20 | |
| 979 punpckhbw mm6, mm7 ; 37 27 36 36 35 25
34 24 | |
| 980 | |
| 981 punpcklbw mm1, mm7 ; 33 23 32 22 31 21
30 20 | |
| 982 | |
| 983 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02
01 00 | |
| 984 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05
14 04 | |
| 985 | |
| 986 movq mm0, mm7 ; 17 07 16 06 15 05
14 04 | |
| 987 punpckhwd mm7, mm6 ; 37 27 17 07 36 26
16 06 | |
| 988 | |
| 989 punpcklwd mm0, mm6 ; 35 25 15 05 34 24
14 04 | |
| 990 movq mm6, mm7 ; 37 27 17 07 36 26
16 06 | |
| 991 | |
| 992 punpckhdq mm7, mm5 ; 77 67 57 47 37 27
17 07 = q3 | |
| 993 punpckldq mm6, mm5 ; 76 66 56 46 36 26
16 06 = q2 | |
| 994 | |
| 995 lea rdx, srct | |
| 996 movq mm5, mm6 ; 76 66 56 46 36 26
16 06 | |
| 997 | |
| 998 movq [rdx+56], mm7 | |
| 999 psubusb mm5, mm7 ; q2-q3 | |
| 1000 | |
| 1001 | |
| 1002 movq [rdx+48], mm6 | |
| 1003 psubusb mm7, mm6 ; q3-q2 | |
| 1004 | |
| 1005 por mm7, mm5; ; mm7=abs (q3-q2) | |
| 1006 movq mm5, mm0 ; 35 25 15 05 34 24
14 04 | |
| 1007 | |
| 1008 punpckhdq mm5, mm3 ; 75 65 55 45 35 25
15 05 = q1 | |
| 1009 punpckldq mm0, mm3 ; 74 64 54 44 34 24
15 04 = q0 | |
| 1010 | |
| 1011 movq mm3, mm5 ; 75 65 55 45 35 25
15 05 = q1 | |
| 1012 psubusb mm3, mm6 ; q1-q2 | |
| 1013 | |
| 1014 psubusb mm6, mm5 ; q2-q1 | |
| 1015 por mm6, mm3 ; mm6=abs(q2-q1) | |
| 1016 | |
| 1017 movq [rdx+40], mm5 ; save q1 | |
| 1018 movq [rdx+32], mm0 ; save q0 | |
| 1019 | |
| 1020 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02
01 00 | |
| 1021 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01
10 00 | |
| 1022 | |
| 1023 movq mm0, mm3 ; 13 03 12 02 11 01
10 00 | |
| 1024 punpcklwd mm0, mm1 ; 31 21 11 01 30 20
10 00 | |
| 1025 | |
| 1026 punpckhwd mm3, mm1 ; 33 23 13 03 32 22
12 02 | |
| 1027 movq mm1, mm0 ; 31 21 11 01 30 20
10 00 | |
| 1028 | |
| 1029 punpckldq mm0, mm2 ; 70 60 50 40 30 20
10 00 =p3 | |
| 1030 punpckhdq mm1, mm2 ; 71 61 51 41 31 21
11 01 =p2 | |
| 1031 | |
| 1032 movq [rdx], mm0 ; save p3 | |
| 1033 movq [rdx+8], mm1 ; save p2 | |
| 1034 | |
| 1035 movq mm2, mm1 ; 71 61 51 41 31 21
11 01 =p2 | |
| 1036 psubusb mm2, mm0 ; p2-p3 | |
| 1037 | |
| 1038 psubusb mm0, mm1 ; p3-p2 | |
| 1039 por mm0, mm2 ; mm0=abs(p3-p2) | |
| 1040 | |
| 1041 movq mm2, mm3 ; 33 23 13 03 32 22
12 02 | |
| 1042 punpckldq mm2, mm4 ; 72 62 52 42 32 22
12 02 = p1 | |
| 1043 | |
| 1044 punpckhdq mm3, mm4 ; 73 63 53 43 33 23
13 03 = p0 | |
| 1045 movq [rdx+24], mm3 ; save p0 | |
| 1046 | |
| 1047 movq [rdx+16], mm2 ; save p1 | |
| 1048 movq mm5, mm2 ; mm5 = p1 | |
| 1049 | |
| 1050 psubusb mm2, mm1 ; p1-p2 | |
| 1051 psubusb mm1, mm5 ; p2-p1 | |
| 1052 | |
| 1053 por mm1, mm2 ; mm1=abs(p2-p1) | |
| 1054 mov rdx, arg(3) ;limit | |
| 1055 | |
| 1056 movq mm4, [rdx] ; mm4 = limit | |
| 1057 psubusb mm7, mm4 ; abs(q3-q2) > limit | |
| 1058 | |
| 1059 psubusb mm0, mm4 ; abs(p3-p2) > limit | |
| 1060 psubusb mm1, mm4 ; abs(p2-p1) > limit | |
| 1061 | |
| 1062 psubusb mm6, mm4 ; abs(q2-q1) > limit | |
| 1063 por mm7, mm6 ; or | |
| 1064 | |
| 1065 por mm0, mm1 ; | |
| 1066 por mm0, mm7 ; abs(q3-q2) > limit
|| abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit | |
| 1067 | |
| 1068 movq mm1, mm5 ; p1 | |
| 1069 | |
| 1070 movq mm7, mm3 ; mm3=mm7=p0 | |
| 1071 psubusb mm7, mm5 ; p0 - p1 | |
| 1072 | |
| 1073 psubusb mm5, mm3 ; p1 - p0 | |
| 1074 por mm5, mm7 ; abs(p1-p0) | |
| 1075 | |
| 1076 movq t0, mm5 ; save abs(p1-p0) | |
| 1077 lea rdx, srct | |
| 1078 | |
| 1079 psubusb mm5, mm4 ; mm5 = abs(p1-p0) >
limit | |
| 1080 por mm0, mm5 ; mm0=mask | |
| 1081 | |
| 1082 movq mm5, [rdx+32] ; mm5=q0 | |
| 1083 movq mm7, [rdx+40] ; mm7=q1 | |
| 1084 | |
| 1085 movq mm6, mm5 ; mm6=q0 | |
| 1086 movq mm2, mm7 ; q1 | |
| 1087 psubusb mm5, mm7 ; q0-q1 | |
| 1088 | |
| 1089 psubusb mm7, mm6 ; q1-q0 | |
| 1090 por mm7, mm5 ; abs(q1-q0) | |
| 1091 | |
| 1092 movq t1, mm7 ; save abs(q1-q0) | |
| 1093 psubusb mm7, mm4 ; mm7=abs(q1-q0)> li
mit | |
| 1094 | |
| 1095 por mm0, mm7 ; mask | |
| 1096 | |
| 1097 movq mm5, mm2 ; q1 | |
| 1098 psubusb mm5, mm1 ; q1-=p1 | |
| 1099 psubusb mm1, mm2 ; p1-=q1 | |
| 1100 por mm5, mm1 ; abs(p1-q1) | |
| 1101 pand mm5, [GLOBAL(tfe)] ; set lsb of each by
te to zero | |
| 1102 psrlw mm5, 1 ; abs(p1-q1)/2 | |
| 1103 | |
| 1104 mov rdx, arg(2) ;blimit ; | |
| 1105 | |
| 1106 movq mm4, [rdx] ;blimit | |
| 1107 movq mm1, mm3 ; mm1=mm3=p0 | |
| 1108 | |
| 1109 movq mm7, mm6 ; mm7=mm6=q0 | |
| 1110 psubusb mm1, mm7 ; p0-q0 | |
| 1111 | |
| 1112 psubusb mm7, mm3 ; q0-p0 | |
| 1113 por mm1, mm7 ; abs(q0-p0) | |
| 1114 paddusb mm1, mm1 ; abs(q0-p0)*2 | |
| 1115 paddusb mm1, mm5 ; abs (p0 - q0) *2 +
abs(p1-q1)/2 | |
| 1116 | |
| 1117 psubusb mm1, mm4 ; abs (p0 - q0) *2 +
abs(p1-q1)/2 > blimit | |
| 1118 por mm1, mm0; ; mask | |
| 1119 | |
| 1120 pxor mm0, mm0 | |
| 1121 pcmpeqb mm1, mm0 | |
| 1122 | |
| 1123 ; calculate high edge variance | |
| 1124 mov rdx, arg(4) ;thresh ; get thresh | |
| 1125 movq mm7, [rdx] | |
| 1126 ; | |
| 1127 movq mm4, t0 ; get abs (q1 - q0) | |
| 1128 psubusb mm4, mm7 ; abs(q1 - q0) > thresh | |
| 1129 | |
| 1130 movq mm3, t1 ; get abs (p1 - p0) | |
| 1131 psubusb mm3, mm7 ; abs(p1 - p0)> thresh | |
| 1132 | |
| 1133 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p
1 - p0) > thresh | |
| 1134 pcmpeqb mm4, mm0 | |
| 1135 | |
| 1136 pcmpeqb mm0, mm0 | |
| 1137 pxor mm4, mm0 | |
| 1138 | |
| 1139 | |
| 1140 | |
| 1141 | |
| 1142 ; start work on filters | |
| 1143 lea rdx, srct | |
| 1144 | |
| 1145 ; start work on filters | |
| 1146 movq mm2, [rdx+16] ; p1 | |
| 1147 movq mm7, [rdx+40] ; q1 | |
| 1148 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value
s | |
| 1149 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value
s | |
| 1150 psubsb mm2, mm7 ; p1 - q1 | |
| 1151 | |
| 1152 movq mm6, [rdx+24] ; p0 | |
| 1153 movq mm0, [rdx+32] ; q0 | |
| 1154 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values | |
| 1155 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values | |
| 1156 | |
| 1157 movq mm3, mm0 ; q0 | |
| 1158 psubsb mm0, mm6 ; q0 - p0 | |
| 1159 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) | |
| 1160 paddsb mm2, mm0 ; 2 * (q0 - p0) | |
| 1161 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) | |
| 1162 pand mm1, mm2 ; mask filter values we don't care about | |
| 1163 | |
| 1164 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 | |
| 1165 movq mm2, mm1 ; vp8_filter | |
| 1166 pand mm2, mm4; ; Filter2 = vp8_filter & hev | |
| 1167 | |
| 1168 movq mm5, mm2 ; | |
| 1169 paddsb mm5, [GLOBAL(t3)]; | |
| 1170 | |
| 1171 pxor mm0, mm0 ; 0 | |
| 1172 pxor mm7, mm7 ; 0 | |
| 1173 | |
| 1174 punpcklbw mm0, mm5 ; e0f0g0h0 | |
| 1175 psraw mm0, 11 ; sign extended shift right by 3 | |
| 1176 punpckhbw mm7, mm5 ; a0b0c0d0 | |
| 1177 psraw mm7, 11 ; sign extended shift right by 3 | |
| 1178 packsswb mm0, mm7 ; Filter2 >>=3; | |
| 1179 | |
| 1180 movq mm5, mm0 ; Filter2 | |
| 1181 | |
| 1182 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) | |
| 1183 pxor mm0, mm0 ; 0 | |
| 1184 pxor mm7, mm7 ; 0 | |
| 1185 | |
| 1186 punpcklbw mm0, mm2 ; e0f0g0h0 | |
| 1187 psraw mm0, 11 ; sign extended shift right by 3 | |
| 1188 punpckhbw mm7, mm2 ; a0b0c0d0 | |
| 1189 psraw mm7, 11 ; sign extended shift right by 3 | |
| 1190 packsswb mm0, mm7 ; Filter2 >>=3; | |
| 1191 | |
| 1192 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 | |
| 1193 psubsb mm3, mm0 ; qs0 =qs0 - filter1 | |
| 1194 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 | |
| 1195 | |
| 1196 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 | |
| 1197 ; vp8_filter &= ~hev; | |
| 1198 ; Filter2 = vp8_filter; | |
| 1199 pandn mm4, mm1 ; vp8_filter&=~hev | |
| 1200 | |
| 1201 | |
| 1202 ; mm3=qs0, mm4=filter2, mm6=ps0 | |
| 1203 | |
| 1204 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); | |
| 1205 ; s = vp8_signed_char_clamp(qs0 - u); | |
| 1206 ; *oq0 = s^0x80; | |
| 1207 ; s = vp8_signed_char_clamp(ps0 + u); | |
| 1208 ; *op0 = s^0x80; | |
| 1209 pxor mm0, mm0 | |
| 1210 | |
| 1211 pxor mm1, mm1 | |
| 1212 pxor mm2, mm2 | |
| 1213 punpcklbw mm1, mm4 | |
| 1214 punpckhbw mm2, mm4 | |
| 1215 pmulhw mm1, [GLOBAL(s27)] | |
| 1216 pmulhw mm2, [GLOBAL(s27)] | |
| 1217 paddw mm1, [GLOBAL(s63)] | |
| 1218 paddw mm2, [GLOBAL(s63)] | |
| 1219 psraw mm1, 7 | |
| 1220 psraw mm2, 7 | |
| 1221 packsswb mm1, mm2 | |
| 1222 | |
| 1223 psubsb mm3, mm1 | |
| 1224 paddsb mm6, mm1 | |
| 1225 | |
| 1226 pxor mm3, [GLOBAL(t80)] | |
| 1227 pxor mm6, [GLOBAL(t80)] | |
| 1228 movq [rdx+24], mm6 | |
| 1229 movq [rdx+32], mm3 | |
| 1230 | |
| 1231 ; roughly 2/7th difference across boundary | |
| 1232 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); | |
| 1233 ; s = vp8_signed_char_clamp(qs1 - u); | |
| 1234 ; *oq1 = s^0x80; | |
| 1235 ; s = vp8_signed_char_clamp(ps1 + u); | |
| 1236 ; *op1 = s^0x80; | |
| 1237 pxor mm1, mm1 | |
| 1238 pxor mm2, mm2 | |
| 1239 punpcklbw mm1, mm4 | |
| 1240 punpckhbw mm2, mm4 | |
| 1241 pmulhw mm1, [GLOBAL(s18)] | |
| 1242 pmulhw mm2, [GLOBAL(s18)] | |
| 1243 paddw mm1, [GLOBAL(s63)] | |
| 1244 paddw mm2, [GLOBAL(s63)] | |
| 1245 psraw mm1, 7 | |
| 1246 psraw mm2, 7 | |
| 1247 packsswb mm1, mm2 | |
| 1248 | |
| 1249 movq mm3, [rdx + 40] | |
| 1250 movq mm6, [rdx + 16] ; p1 | |
| 1251 pxor mm3, [GLOBAL(t80)] | |
| 1252 pxor mm6, [GLOBAL(t80)] | |
| 1253 | |
| 1254 paddsb mm6, mm1 | |
| 1255 psubsb mm3, mm1 | |
| 1256 | |
| 1257 pxor mm6, [GLOBAL(t80)] | |
| 1258 pxor mm3, [GLOBAL(t80)] | |
| 1259 movq [rdx + 40], mm3 | |
| 1260 movq [rdx + 16], mm6 | |
| 1261 | |
| 1262 ; roughly 1/7th difference across boundary | |
| 1263 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); | |
| 1264 ; s = vp8_signed_char_clamp(qs2 - u); | |
| 1265 ; *oq2 = s^0x80; | |
| 1266 ; s = vp8_signed_char_clamp(ps2 + u); | |
| 1267 ; *op2 = s^0x80; | |
| 1268 pxor mm1, mm1 | |
| 1269 pxor mm2, mm2 | |
| 1270 punpcklbw mm1, mm4 | |
| 1271 punpckhbw mm2, mm4 | |
| 1272 pmulhw mm1, [GLOBAL(s9)] | |
| 1273 pmulhw mm2, [GLOBAL(s9)] | |
| 1274 paddw mm1, [GLOBAL(s63)] | |
| 1275 paddw mm2, [GLOBAL(s63)] | |
| 1276 psraw mm1, 7 | |
| 1277 psraw mm2, 7 | |
| 1278 packsswb mm1, mm2 | |
| 1279 | |
| 1280 movq mm6, [rdx+ 8] | |
| 1281 movq mm3, [rdx+48] | |
| 1282 | |
| 1283 pxor mm6, [GLOBAL(t80)] | |
| 1284 pxor mm3, [GLOBAL(t80)] | |
| 1285 | |
| 1286 paddsb mm6, mm1 | |
| 1287 psubsb mm3, mm1 | |
| 1288 | |
| 1289 pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 | |
| 1290 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 | |
| 1291 | |
| 1292 ; transpose and write back | |
| 1293 movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 | |
| 1294 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 | |
| 1295 | |
| 1296 punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 | |
| 1297 punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 | |
| 1298 | |
| 1299 movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 | |
| 1300 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 | |
| 1301 | |
| 1302 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 | |
| 1303 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 | |
| 1304 | |
| 1305 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 | |
| 1306 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 | |
| 1307 | |
| 1308 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 | |
| 1309 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 | |
| 1310 | |
| 1311 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 | |
| 1312 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 | |
| 1313 | |
| 1314 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 | |
| 1315 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 | |
| 1316 | |
| 1317 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 | |
| 1318 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 | |
| 1319 | |
| 1320 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 | |
| 1321 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 | |
| 1322 | |
| 1323 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 | |
| 1324 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 | |
| 1325 | |
| 1326 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 | |
| 1327 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 | |
| 1328 | |
| 1329 movq [rsi+rax*4], mm0 ; write out | |
| 1330 movq [rdi+rax*4], mm6 ; write out | |
| 1331 | |
| 1332 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 | |
| 1333 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 | |
| 1334 | |
| 1335 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 | |
| 1336 movq [rsi+rax*2], mm0 ; write out | |
| 1337 | |
| 1338 movq [rdi+rax*2], mm5 ; write out | |
| 1339 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 | |
| 1340 | |
| 1341 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 | |
| 1342 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 | |
| 1343 | |
| 1344 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 | |
| 1345 punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44 | |
| 1346 | |
| 1347 punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64 | |
| 1348 movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40 | |
| 1349 | |
| 1350 movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60 | |
| 1351 punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40 | |
| 1352 | |
| 1353 punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50 | |
| 1354 movq [rsi], mm0 ; write out | |
| 1355 | |
| 1356 movq [rdi], mm1 ; write out | |
| 1357 neg rax | |
| 1358 | |
| 1359 punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60 | |
| 1360 punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60 | |
| 1361 | |
| 1362 movq [rsi+rax*2], mm3 | |
| 1363 movq [rdi+rax*2], mm4 | |
| 1364 | |
| 1365 lea rsi, [rsi+rax*8] | |
| 1366 dec rcx | |
| 1367 | |
| 1368 jnz .next8_mbv | |
| 1369 | |
| 1370 add rsp, 96 | |
| 1371 pop rsp | |
| 1372 ; begin epilog | |
| 1373 pop rdi | |
| 1374 pop rsi | |
| 1375 RESTORE_GOT | |
| 1376 UNSHADOW_ARGS | |
| 1377 pop rbp | |
| 1378 ret | |
| 1379 | |
| 1380 | |
| 1381 ;void vp8_loop_filter_simple_horizontal_edge_mmx | |
| 1382 ;( | |
| 1383 ; unsigned char *src_ptr, -- row read as q0; p0/p1 are the one/two rows before it, q1 the row after | |
| 1384 ; int src_pixel_step, -- byte offset from one row to the next | |
| 1385 ; const char *blimit -- 8 limit bytes, loaded with a single movq | |
| 1386 ;) Adjusts p0 and q0 in place across the edge: two passes of 8 bytes (16 bytes total). | |
| 1387 global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE | |
| 1388 sym(vp8_loop_filter_simple_horizontal_edge_mmx): | |
| 1389 push rbp | |
| 1390 mov rbp, rsp | |
| 1391 SHADOW_ARGS_TO_STACK 3 | |
| 1392 GET_GOT rbx | |
| 1393 push rsi | |
| 1394 push rdi | |
| 1395 ; end prolog | |
| 1396 | |
| 1397 mov rsi, arg(0) ;src_ptr -- the q0 row; the p rows are reached with a negated pitch | |
| 1398 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitc
h? | |
| 1399 | |
| 1400 mov rcx, 2 ; count: two passes, 8 bytes each (rsi advances by 8 per pass) | |
| 1401 .nexts8_h: | |
| 1402 mov rdx, arg(2) ;blimit ; get blimit | |
| 1403 movq mm3, [rdx] ; mm3 = 8 blimit bytes | |
| 1404 | |
| 1405 mov rdi, rsi ; rdi points to row +1 for indirect ad
dressing | |
| 1406 add rdi, rax | |
| 1407 neg rax | |
| 1408 | |
| 1409 ; calculate mask | |
| 1410 movq mm1, [rsi+2*rax] ; p1 (rax was just negated, so this is two rows before rsi) | |
| 1411 movq mm0, [rdi] ; q1 (rdi = rsi + pitch, computed before the negate) | |
| 1412 movq mm2, mm1 | |
| 1413 movq mm7, mm0 | |
| 1414 movq mm4, mm0 | |
| 1415 psubusb mm0, mm1 ; q1-=p1 | |
| 1416 psubusb mm1, mm4 ; p1-=q1 | |
| 1417 por mm1, mm0 ; abs(p1-q1) | |
| 1418 pand mm1, [GLOBAL(tfe)] ; clear lsb of each byte so the word shift below cannot carry a bit into the neighbouring byte | |
| 1419 psrlw mm1, 1 ; abs(p1-q1)/2 | |
| 1420 | |
| 1421 movq mm5, [rsi+rax] ; p0 | |
| 1422 movq mm4, [rsi] ; q0 | |
| 1423 movq mm0, mm4 ; q0 | |
| 1424 movq mm6, mm5 ; p0 | |
| 1425 psubusb mm5, mm4 ; p0-=q0 | |
| 1426 psubusb mm4, mm6 ; q0-=p0 | |
| 1427 por mm5, mm4 ; abs(p0 - q0) | |
| 1428 paddusb mm5, mm5 ; abs(p0-q0)*2 | |
| 1429 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 | |
| 1430 | |
| 1431 psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > bl
imit | |
| 1432 pxor mm3, mm3 | |
| 1433 pcmpeqb mm5, mm3 | |
| 1434 | |
| 1435 ; start work on filters (mm5 is now 0xff in bytes that did not exceed blimit, 0 elsewhere) | |
| 1436 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed value
s | |
| 1437 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed value
s | |
| 1438 psubsb mm2, mm7 ; p1 - q1 | |
| 1439 | |
| 1440 pxor mm6, [GLOBAL(t80)] ; p0: offset to convert to signed values | |
| 1441 pxor mm0, [GLOBAL(t80)] ; q0: offset to convert to signed values | |
| 1442 movq mm3, mm0 ; q0 | |
| 1443 psubsb mm0, mm6 ; q0 - p0 | |
| 1444 paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) | |
| 1445 paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) | |
| 1446 paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) | |
| 1447 pand mm5, mm2 ; mask filter values we don't care abo
ut | |
| 1448 | |
| 1449 ; do + 4 side (result is subtracted from q0) | |
| 1450 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 | |
| 1451 | |
| 1452 movq mm0, mm5 ; get a copy of filters | |
| 1453 psllw mm0, 8 ; shift left 8: low byte of each word moves to the high half | |
| 1454 psraw mm0, 3 ; arithmetic shift right 3 (signed >>3 of what was the low byte) | |
| 1455 psrlw mm0, 8 | |
| 1456 movq mm1, mm5 ; get a copy of filters | |
| 1457 psraw mm1, 11 ; arithmetic shift right 11: high byte >> 3, sign-extended | |
| 1458 psllw mm1, 8 ; shift left 8 to put it back | |
| 1459 | |
| 1460 por mm0, mm1 ; put the two together to get result | |
| 1461 | |
| 1462 psubsb mm3, mm0 ; q0-= q0 add | |
| 1463 pxor mm3, [GLOBAL(t80)] ; unoffset | |
| 1464 movq [rsi], mm3 ; write back q0 | |
| 1465 | |
| 1466 | |
| 1467 ; now do +3 side (result is added to p0) | |
| 1468 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 (subtracts 1 from the +4 value still in mm5) | |
| 1469 | |
| 1470 movq mm0, mm5 ; get a copy of filters | |
| 1471 psllw mm0, 8 ; shift left 8: low byte of each word moves to the high half | |
| 1472 psraw mm0, 3 ; arithmetic shift right 3 (signed >>3 of what was the low byte) | |
| 1473 psrlw mm0, 8 | |
| 1474 psraw mm5, 11 ; arithmetic shift right 11: high byte >> 3, sign-extended | |
| 1475 psllw mm5, 8 ; shift left 8 to put it back | |
| 1476 por mm0, mm5 ; put the two together to get result | |
| 1477 | |
| 1478 | |
| 1479 paddsb mm6, mm0 ; p0+= p0 add | |
| 1480 pxor mm6, [GLOBAL(t80)] ; unoffset | |
| 1481 movq [rsi+rax], mm6 ; write back p0 (rax is still negated: one row before rsi) | |
| 1482 | |
| 1483 add rsi,8 | |
| 1484 neg rax | |
| 1485 dec rcx | |
| 1486 jnz .nexts8_h | |
| 1487 | |
| 1488 ; begin epilog | |
| 1489 pop rdi | |
| 1490 pop rsi | |
| 1491 RESTORE_GOT | |
| 1492 UNSHADOW_ARGS | |
| 1493 pop rbp | |
| 1494 ret | |
| 1495 | |
| 1496 | |
| 1497 ;void vp8_loop_filter_simple_vertical_edge_mmx | |
| 1498 ;( | |
| 1499 ; unsigned char *src_ptr, -- top row, at the q0 column; the routine reads/writes the 4 bytes starting at src_ptr-2 in each row | |
| 1500 ; int src_pixel_step, -- byte offset from one row to the next | |
| 1501 ; const char *blimit -- 8 limit bytes, loaded with a single movq | |
| 1502 ;) Transposes 4 columns x 8 rows into p1/p0/q0/q1, filters p0/q0, transposes back; two passes of 8 rows. | |
| 1503 global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE | |
| 1504 sym(vp8_loop_filter_simple_vertical_edge_mmx): | |
| 1505 push rbp | |
| 1506 mov rbp, rsp | |
| 1507 SHADOW_ARGS_TO_STACK 3 | |
| 1508 GET_GOT rbx | |
| 1509 push rsi | |
| 1510 push rdi | |
| 1511 ; end prolog | |
| 1512 | |
| 1513 ALIGN_STACK 16, rax | |
| 1514 sub rsp, 32 ; reserve 32 bytes: the two 16-byte-spaced slots t0 and t1 below | |
| 1515 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; holds the unfiltered p1 column during filtering | |
| 1516 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; holds the unfiltered q1 column during filtering | |
| 1517 | |
| 1518 mov rsi, arg(0) ;src_ptr | |
| 1519 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitc
h? | |
| 1520 | |
| 1521 lea rsi, [rsi + rax*4- 2]; ; rsi = row 4 of the 8-row group, two bytes left of src_ptr | |
| 1522 mov rcx, 2 ; count: two passes of 8 rows each | |
| 1523 .nexts8_v: | |
| 1524 | |
| 1525 lea rdi, [rsi + rax]; rdi = the row after rsi (row 5) | |
| 1526 movd mm0, [rdi + rax * 2] ; xx xx xx xx 73
72 71 70 | |
| 1527 | |
| 1528 movd mm6, [rsi + rax * 2] ; xx xx xx xx 63
62 61 60 | |
| 1529 punpcklbw mm6, mm0 ; 73 63 72 62 71
61 70 60 | |
| 1530 | |
| 1531 movd mm0, [rsi + rax] ; xx xx xx xx 53
52 51 50 | |
| 1532 movd mm4, [rsi] ; xx xx xx xx 43
42 41 40 | |
| 1533 | |
| 1534 punpcklbw mm4, mm0 ; 53 43 52 42 51
41 50 40 | |
| 1535 movq mm5, mm4 ; 53 43 52 42 51
41 50 40 | |
| 1536 | |
| 1537 punpcklwd mm4, mm6 ; 71 61 51 41 70
60 50 40 | |
| 1538 punpckhwd mm5, mm6 ; 73 63 53 43 72
62 52 42 | |
| 1539 | |
| 1540 neg rax | |
| 1541 | |
| 1542 movd mm7, [rsi + rax] ; xx xx xx xx 33
32 31 30 | |
| 1543 movd mm6, [rsi + rax * 2] ; xx xx xx xx 23
22 21 20 | |
| 1544 | |
| 1545 punpcklbw mm6, mm7 ; 33 23 32 22 31
21 30 20 | |
| 1546 movd mm1, [rdi + rax * 4] ; xx xx xx xx 13
12 11 10 | |
| 1547 | |
| 1548 movd mm0, [rsi + rax * 4] ; xx xx xx xx 03
02 01 00 | |
| 1549 punpcklbw mm0, mm1 ; 13 03 12 02 11
01 10 00 | |
| 1550 | |
| 1551 movq mm2, mm0 ; 13 03 12 02 11
01 10 00 | |
| 1552 punpcklwd mm0, mm6 ; 31 21 11 01 30
20 10 00 | |
| 1553 | |
| 1554 punpckhwd mm2, mm6 ; 33 23 13 03 32
22 12 02 | |
| 1555 movq mm1, mm0 ; copy of mm0 = 31 21 11 01 30 20 10 00, no longer 13 03 12 02 11
01 10 00 | |
| 1556 | |
| 1557 punpckldq mm0, mm4 ; 70 60 50 40 30
20 10 00 = p1 | |
| 1558 movq mm3, mm2 ; 33 23 13 03 32
22 12 02 | |
| 1559 | |
| 1560 punpckhdq mm1, mm4 ; 71 61 51 41 31
21 11 01 = p0 | |
| 1561 punpckldq mm2, mm5 ; 72 62 52 42 32
22 12 02 = q0 | |
| 1562 | |
| 1563 punpckhdq mm3, mm5 ; 73 63 53 43 33
23 13 03 = q1 | |
| 1564 | |
| 1565 | |
| 1566 ; calculate mask (mm0=p1, mm1=p0, mm2=q0, mm3=q1, one byte per row) | |
| 1567 movq mm6, mm0 ; p1 | |
| 1568 movq mm7, mm3 ; q1 | |
| 1569 psubusb mm7, mm6 ; q1-=p1 | |
| 1570 psubusb mm6, mm3 ; p1-=q1 | |
| 1571 por mm6, mm7 ; abs(p1-q1) | |
| 1572 pand mm6, [GLOBAL(tfe)] ; set lsb of eac
h byte to zero | |
| 1573 psrlw mm6, 1 ; abs(p1-q1)/2 | |
| 1574 | |
| 1575 movq mm5, mm1 ; p0 | |
| 1576 movq mm4, mm2 ; q0 | |
| 1577 | |
| 1578 psubusb mm5, mm2 ; p0-=q0 | |
| 1579 psubusb mm4, mm1 ; q0-=p0 | |
| 1580 | |
| 1581 por mm5, mm4 ; abs(p0 - q0) | |
| 1582 paddusb mm5, mm5 ; abs(p0-q0)*2 | |
| 1583 paddusb mm5, mm6 ; abs (p0 - q0)
*2 + abs(p1-q1)/2 | |
| 1584 | |
| 1585 mov rdx, arg(2) ;blimit ; get bl
imit | |
| 1586 movq mm7, [rdx] | |
| 1587 | |
| 1588 psubusb mm5, mm7 ; abs(p0 - q0) *
2 + abs(p1-q1)/2 > blimit | |
| 1589 pxor mm7, mm7 | |
| 1590 pcmpeqb mm5, mm7 ; mm5 = mask (0xff where the sum did not exceed blimit) | |
| 1591 | |
| 1592 ; start work on filters (unfiltered p1/q1 are parked in t0/t1 for the write-back transpose) | |
| 1593 movq t0, mm0 | |
| 1594 movq t1, mm3 | |
| 1595 | |
| 1596 pxor mm0, [GLOBAL(t80)] ; p1 offset to c
onvert to signed values | |
| 1597 pxor mm3, [GLOBAL(t80)] ; q1 offset to c
onvert to signed values | |
| 1598 | |
| 1599 psubsb mm0, mm3 ; p1 - q1 | |
| 1600 movq mm6, mm1 ; p0 | |
| 1601 | |
| 1602 movq mm7, mm2 ; q0 | |
| 1603 pxor mm6, [GLOBAL(t80)] ; offset to conv
ert to signed values | |
| 1604 | |
| 1605 pxor mm7, [GLOBAL(t80)] ; offset to conv
ert to signed values | |
| 1606 movq mm3, mm7 ; offseted ; q0 | |
| 1607 | |
| 1608 psubsb mm7, mm6 ; q0 - p0 | |
| 1609 paddsb mm0, mm7 ; p1 - q1 + 1 *
(q0 - p0) | |
| 1610 | |
| 1611 paddsb mm0, mm7 ; p1 - q1 + 2 *
(q0 - p0) | |
| 1612 paddsb mm0, mm7 ; p1 - q1 + 3 *
(q0 - p0) | |
| 1613 | |
| 1614 pand mm5, mm0 ; mask filter va
lues we don't care about | |
| 1615 | |
| 1616 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0)
+ (p1 - q1) + 4 | |
| 1617 | |
| 1618 movq mm0, mm5 ; get a copy of
filters | |
| 1619 psllw mm0, 8 ; shift left 8: low byte of each word moves to the high half | |
| 1620 psraw mm0, 3 ; low byte >> 3 (signed); the high byte gets the matching shi
ft right 11 | |
| 1621 psrlw mm0, 8 | |
| 1622 | |
| 1623 movq mm7, mm5 ; get a copy of
filters | |
| 1624 psraw mm7, 11 ; arithmetic shi
ft right 11 | |
| 1625 psllw mm7, 8 ; shift left 8 t
o put it back | |
| 1626 | |
| 1627 por mm0, mm7 ; put the two to
gether to get result | |
| 1628 | |
| 1629 psubsb mm3, mm0 ; q0 -= (filter + 4) >> 3, saturating | |
| 1630 pxor mm3, [GLOBAL(t80)] ; unoffset | |
| 1631 | |
| 1632 ; now do +3 side (result is added to p0) | |
| 1633 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of
+4 | |
| 1634 | |
| 1635 movq mm0, mm5 ; get a copy of
filters | |
| 1636 psllw mm0, 8 ; shift left 8: low byte of each word moves to the high half | |
| 1637 psraw mm0, 3 ; low byte >> 3 (signed); the high byte gets the matching shi
ft right 11 | |
| 1638 psrlw mm0, 8 | |
| 1639 | |
| 1640 psraw mm5, 11 ; arithmetic shi
ft right 11 | |
| 1641 psllw mm5, 8 ; shift left 8 t
o put it back | |
| 1642 por mm0, mm5 ; put the two to
gether to get result | |
| 1643 | |
| 1644 paddsb mm6, mm0 ; p0+= p0 add | |
| 1645 pxor mm6, [GLOBAL(t80)] ; unoffset | |
| 1646 | |
| 1647 | |
| 1648 movq mm0, t0 | |
| 1649 movq mm4, t1 | |
| 1650 | |
| 1651 ; mm0 = 70 60 50 40 30 20 10 00 (p1, reloaded unfiltered from t0) | |
| 1652 ; mm6 = 71 61 51 41 31 21 11 01 (p0, filtered) | |
| 1653 ; mm3 = 72 62 52 42 32 22 12 02 (q0, filtered) | |
| 1654 ; mm4 = 73 63 53 43 33 23 13 03 (q1, reloaded unfiltered from t1) | |
| 1655 ; transpose back to write out | |
| 1656 | |
| 1657 movq mm1, mm0 ; | |
| 1658 punpcklbw mm0, mm6 ; 31 30 21 20 11 10
01 00 | |
| 1659 | |
| 1660 punpckhbw mm1, mm6 ; 71 70 61 60 51 50
41 40 | |
| 1661 movq mm2, mm3 ; | |
| 1662 | |
| 1663 punpcklbw mm2, mm4 ; 33 32 23 22 13 12
03 02 | |
| 1664 movq mm5, mm1 ; 71 70 61 60 51 50
41 40 | |
| 1665 | |
| 1666 punpckhbw mm3, mm4 ; 73 72 63 62 53 52
43 42 | |
| 1667 movq mm6, mm0 ; 31 30 21 20 11 10
01 00 | |
| 1668 | |
| 1669 punpcklwd mm0, mm2 ; 13 12 11 10 03 02
01 00 | |
| 1670 punpckhwd mm6, mm2 ; 33 32 31 30 23 22
21 20 | |
| 1671 | |
| 1672 movd [rsi+rax*4], mm0 ; write 03 02 01 00 (rax is negated here: row 0) | |
| 1673 punpcklwd mm1, mm3 ; 53 52 51 50 43 42
41 40 | |
| 1674 | |
| 1675 psrlq mm0, 32 ; xx xx xx xx 13 12
11 10 | |
| 1676 punpckhwd mm5, mm3 ; 73 72 71 70 63 62
61 60 | |
| 1677 | |
| 1678 movd [rdi+rax*4], mm0 ; write 13 12 11 10 | |
| 1679 movd [rsi+rax*2], mm6 ; write 23 22 21 20 | |
| 1680 | |
| 1681 psrlq mm6, 32 ; 33 32 31 30 | |
| 1682 movd [rsi], mm1 ; write 43 42 41 40 | |
| 1683 | |
| 1684 movd [rsi + rax], mm6 ; write 33 32 31 30 | |
| 1685 neg rax | |
| 1686 | |
| 1687 movd [rsi + rax*2], mm5 ; write 63 62 61 60 (rax is positive again) | |
| 1688 psrlq mm1, 32 ; 53 52 51 50 | |
| 1689 | |
| 1690 movd [rdi], mm1 ; write out 53 52 51
50 | |
| 1691 psrlq mm5, 32 ; 73 72 71 70 | |
| 1692 | |
| 1693 movd [rdi + rax*2], mm5 ; write 73 72 71 70 | |
| 1694 | |
| 1695 lea rsi, [rsi+rax*8] ; next 8 rows | |
| 1696 | |
| 1697 dec rcx | |
| 1698 jnz .nexts8_v | |
| 1699 | |
| 1700 add rsp, 32 | |
| 1701 pop rsp | |
| 1702 ; begin epilog (the add rsp, 32 above releases t0/t1; pop rsp presumably restores the value ALIGN_STACK saved -- confirm in x86_abi_support.asm) | |
| 1703 pop rdi | |
| 1704 pop rsi | |
| 1705 RESTORE_GOT | |
| 1706 UNSHADOW_ARGS | |
| 1707 pop rbp | |
| 1708 ret | |
| 1709 | |
| 1710 | |
| 1711 | |
| 1712 ;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, | |
| 1713 ; int y_stride, | |
| 1714 ; loop_filter_info *lfi) | |
| 1715 ;{ | |
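| 1716 ; NOTE: the calls sketched below pass six arguments, but vp8_loop_filter_simple_vertical_edge_mmx is declared above with three (src_ptr, src_pixel_step, blimit). | |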
| 1717 ; | |
| 1718 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->
lim,lfi->thr,2); | |
| 1719 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->
lim,lfi->thr,2); | |
| 1720 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi-
>lim,lfi->thr,2); | |
| 1721 ;} | |
| 1722 | |
| 1723 SECTION_RODATA | | ; read-only constants; each label starts on a 16-byte boundary and holds 8 bytes
| 1724 align 16 | |
| 1725 tfe: | | ; 0xfe in every byte: ANDed in before psrlw 1 so the word shift acts as a per-byte halving
| 1726 times 8 db 0xfe | |
| 1727 align 16 | |
| 1728 t80: | | ; 0x80 in every byte: XORed in to switch pixels between unsigned and signed form
| 1729 times 8 db 0x80 | |
| 1730 align 16 | |
| 1731 t1s: | | ; 0x01 in every byte: subtracted to turn the +4 filter value into the +3 one
| 1732 times 8 db 0x01 | |
| 1733 align 16 | |
| 1734 t3: | | ; 0x03 in every byte: added with paddsb before the >>3 on the p0 side
| 1735 times 8 db 0x03 | |
| 1736 align 16 | |
| 1737 t4: | | ; 0x04 in every byte: added with paddsb before the >>3 on the q0 side
| 1738 times 8 db 0x04 | |
| 1739 align 16 | |
| 1740 ones: | | ; 0x0001 in every word (no reference to it in this part of the file)
| 1741 times 4 dw 0x0001 | |
| 1742 align 16 | |
| 1743 s27: | | ; 27 << 8 per word: pmulhw against a byte placed in the high half of a word yields value * 27
| 1744 times 4 dw 0x1b00 | |
| 1745 align 16 | |
| 1746 s18: | | ; 18 << 8 per word: same pmulhw scheme, yields value * 18
| 1747 times 4 dw 0x1200 | |
| 1748 align 16 | |
| 1749 s9: | | ; 9 << 8 per word: same pmulhw scheme, yields value * 9
| 1750 times 4 dw 0x0900 | |
| 1751 align 16 | |
| 1752 s63: | | ; 63 per word: rounding term added before the psraw 7
| 1753 times 4 dw 0x003f | |
| OLD | NEW |