;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Shared prologue for the transform entry points in this file. It binds
; the names `input`, `output` and `pitch` to the registers holding the
; three C arguments (short *input, short *output, int pitch):
;
;   ABI_IS_32BIT              input=rsi  output=rdi  pitch=rax
;       rbp frame is built, GET_GOT rbx is issued, rsi/rdi are pushed,
;       then the arguments are loaded through arg(n).
;   __OUTPUT_FORMAT__ == x64  input=rcx  output=rdx  pitch=r8
;       SAVE_XMM 7, u is issued first (presumably preserves xmm6/xmm7,
;       which are callee-saved in the Microsoft x64 convention --
;       confirm against x86_abi_support.asm).
;   any other 64-bit format   input=rdi  output=rsi  pitch=rdx
;
; `pitch` is a C int. The 64-bit calling conventions do not promise
; anything about bits 63:32 of a register that carries a 32-bit
; argument, yet `pitch` is used as a full 64-bit register in address
; arithmetic. The 32-bit branch already sign-extends it with movsxd;
; the 64-bit branches now do the same so all three paths agree.
;
; Every use must be closed with STACK_FRAME_DESTROY.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output

    movsxd      rax, dword ptr arg(2)       ; pitch, sign-extended
    ; rcx = input + 2*pitch. NOTE(review): neither function visible in
    ; this file reads rcx on this path; kept so the macro's register
    ; effects stay unchanged for any other user.
    lea         rcx, [rsi + rax*2]
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u
    movsxd      r8, r8d                     ; int pitch -> 64-bit
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      rdx, edx                    ; int pitch -> 64-bit
  %endif
%endif
%endmacro
| 44 |
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Shared epilogue; the counterpart of STACK_FRAME_CREATE. It blanks the
; `input` / `output` / `pitch` aliases, undoes whatever the prologue did
; for the selected ABI, and returns to the caller.
;
;   ABI_IS_32BIT              pops rdi, rsi, issues RESTORE_GOT, pops rbp
;                             (reverse of the prologue's push order)
;   __OUTPUT_FORMAT__ == x64  issues RESTORE_XMM
;   any other 64-bit format   nothing to undo
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  ; The aliases are redefined as empty rather than %undef'd.
  %define       input
  %define       output
  %define       pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%elifidn __OUTPUT_FORMAT__,x64
    RESTORE_XMM
%endif
    ret
%endmacro
| 62 |
;-----------------------------------------------------------------------
; void vp9_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; 4x4 forward transform of 16-bit samples, performed as two 1-D passes
; with a transpose between them.
;
;   input  : four rows of four 16-bit words, read with movq. Rows are
;            `pitch` BYTES apart (pitch is added to the address
;            unscaled).
;   output : 16 words written with two movdqa stores, so the buffer
;            must be 16-byte aligned.
;
; Registers: xmm0-xmm5, plus whatever STACK_FRAME_CREATE sets up.
;
; Lane comments list 16-bit words from word 7 (left) to word 0 (right).
; "rc" is row r, column c of the block. Butterfly terms:
;   a1 = x0 + x3    b1 = x1 + x2    c1 = x1 - x2    d1 = x0 - x3
;-----------------------------------------------------------------------
global sym(vp9_short_fdct4x4_sse2)
sym(vp9_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    ; ---- load the four input rows ------------------------------------
    movq        xmm0, MMWORD PTR[input]             ; .. .. .. .. 03 02 01 00
    movq        xmm2, MMWORD PTR[input + pitch]     ; .. .. .. .. 13 12 11 10
    lea         input, [input + 2*pitch]
    movq        xmm1, MMWORD PTR[input]             ; .. .. .. .. 23 22 21 20
    movq        xmm3, MMWORD PTR[input + pitch]     ; .. .. .. .. 33 32 31 30

    punpcklqdq  xmm0, xmm2                  ; 13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                  ; 33 32 31 30 23 22 21 20

    ; ---- pair every row's (x0,x1) with its (x3,x2) -------------------
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                  ; 23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                  ; 33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                  ; 31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0xb1            ; 22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0xb1            ; 32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                  ; 32 33 22 23 12 13 02 03

    ; ---- first pass: butterflies along each row ----------------------
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                  ; b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                  ; c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                     ; a1, b1 scaled by 8
    psllw       xmm3, 3                     ; c1, d1 scaled by 8

    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]        ; a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]        ; a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]       ; c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ; d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12                    ; (c1*2217 + d1*5352 + 14500) >> 12
    psrad       xmm4, 12                    ; (d1*2217 - c1*5352 +  7500) >> 12

    ; One dword per row in each register; packing yields one word per
    ; row for each output column.
    packssdw    xmm0, xmm1                  ; 32 22 12 02 30 20 10 00
    packssdw    xmm3, xmm4                  ; 33 23 13 03 31 21 11 01

    ; ---- transpose back to row-major order ---------------------------
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                  ; 31 21 11 01 30 20 10 00
    punpckhqdq  xmm2, xmm3                  ; 33 23 13 03 32 22 12 02

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                  ; 32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                  ; 33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                  ; 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                  ; 33 32 31 30 23 22 21 20

    ; ---- second pass: butterflies down each column -------------------
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]       ; rounding term for >> 4
    pshufd      xmm2, xmm2, 0x4e            ; swap halves: row 2 | row 3
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                  ; b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                  ; c1 c1 c1 c1 d1 d1 d1 d1

    ; Interleave so every dword holds one column's pair, the layout
    ; pmaddwd needs. xmm2 keeps an unshuffled copy of d1 (low 4 words)
    ; for the "d1 != 0" correction below.
    pshufd      xmm0, xmm0, 0xd8            ; b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                  ; d1 copy for the compare
    pshufd      xmm3, xmm3, 0xd8            ; c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0xd8            ; b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0xd8            ; c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0xd8            ; b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0xd8            ; c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]        ; a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]        ; a1 - b1

    pxor        xmm4, xmm4                  ; zero, for the compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                  ; all-ones where the word == 0
    psrad       xmm0, 4                     ; (a1 + b1 + 7) >> 4
    psrad       xmm1, 4                     ; (a1 - b1 + 7) >> 4
    ; ~(d1 == 0) & mask: low 4 words become (d1 != 0) ? 1 : 0 and the
    ; high 4 words are cleared.
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)]

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]       ; c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ; d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                  ; op[8..11]  | op[0..3]
    psrad       xmm3, 16                    ; (c1*2217 + d1*5352 + 12000) >> 16
    psrad       xmm4, 16                    ; (d1*2217 - c1*5352 + 51000) >> 16

    packssdw    xmm3, xmm4                  ; op[12..15] | op[4..7]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                  ; op[4..7] += (d1 != 0)
    punpcklqdq  xmm0, xmm3                  ; op[4..7]   | op[0..3]
    punpckhqdq  xmm1, xmm3                  ; op[12..15] | op[8..11]

    ; ---- store the 16 coefficients -----------------------------------
    movdqa      XMMWORD PTR[output], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY
| 167 |
;-----------------------------------------------------------------------
; FDCT8X4_ROTATE round_op1, round_op3, shift
;
; Helper for vp9_short_fdct8x4_sse2: the c1/d1 half of one 1-D pass,
; for eight lanes at once.
;
;   in  : xmm4 = c1 (8 words), xmm5 = d1 (8 words)
;   out : xmm1 = (c1*2217 + d1*5352 + round_op1) >> shift   (8 words)
;         xmm3 = (d1*2217 - c1*5352 + round_op3) >> shift   (8 words)
;   clobbers xmm4, xmm5
;
; round_op1 / round_op3 are labels of 4 x dword constants.
;-----------------------------------------------------------------------
%macro FDCT8X4_ROTATE 3
    ; interleave so that every dword is a (d1 low word, c1 high word) pair
    movdqa      xmm1, xmm5
    punpcklwd   xmm1, xmm4                  ; lanes 0-3
    punpckhwd   xmm5, xmm4                  ; lanes 4-7

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm5

    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_5352_2217)]       ; c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_5352_2217)]       ; c1*2217 + d1*5352

    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ; d1*2217 - c1*5352
    pmaddwd     xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ; d1*2217 - c1*5352

    paddd       xmm1, XMMWORD PTR[GLOBAL(%1)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(%1)]
    paddd       xmm3, XMMWORD PTR[GLOBAL(%2)]
    paddd       xmm5, XMMWORD PTR[GLOBAL(%2)]

    psrad       xmm1, %3
    psrad       xmm4, %3
    psrad       xmm3, %3
    psrad       xmm5, %3

    packssdw    xmm1, xmm4                  ; first result, 8 words
    packssdw    xmm3, xmm5                  ; second result, 8 words
%endmacro

;-----------------------------------------------------------------------
; void vp9_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Transforms two horizontally adjacent 4x4 blocks in one go, using the
; same two-pass scheme as the 4x4 routine but with all eight word lanes
; of each register in use.
;
;   input  : four rows of eight 16-bit words, read with movdqa, so every
;            row address must be 16-byte aligned. Rows are `pitch` BYTES
;            apart.
;   output : 32 words written with four movdqa stores (16-byte aligned):
;            words 0-15 are the left block's coefficients, words 16-31
;            the right block's.
;
; Registers: xmm0-xmm6, plus whatever STACK_FRAME_CREATE sets up.
;
; Lane comments in this function list 16-bit words from word 0 (left)
; to word 7 (right); "rc" is row r, column c. Butterfly terms:
;   a1 = x0 + x3    b1 = x1 + x2    c1 = x1 - x2    d1 = x0 - x3
;-----------------------------------------------------------------------
global sym(vp9_short_fdct8x4_sse2)
sym(vp9_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

    ; ---- read the input data -----------------------------------------
    movdqa      xmm0, [input]               ; row 0
    movdqa      xmm2, [input + pitch]       ; row 1
    lea         input, [input + 2*pitch]
    movdqa      xmm4, [input]               ; row 2
    movdqa      xmm3, [input + pitch]       ; row 3

    ; ---- transpose for the first stage -------------------------------
    movdqa      xmm1, xmm0                  ; 00 01 02 03 04 05 06 07
    movdqa      xmm5, xmm4                  ; 20 21 22 23 24 25 26 27

    punpcklwd   xmm0, xmm2                  ; 00 10 01 11 02 12 03 13
    punpckhwd   xmm1, xmm2                  ; 04 14 05 15 06 16 07 17

    punpcklwd   xmm4, xmm3                  ; 20 30 21 31 22 32 23 33
    punpckhwd   xmm5, xmm3                  ; 24 34 25 35 26 36 27 37

    movdqa      xmm2, xmm0                  ; 00 10 01 11 02 12 03 13
    punpckldq   xmm0, xmm4                  ; 00 10 20 30 01 11 21 31

    punpckhdq   xmm2, xmm4                  ; 02 12 22 32 03 13 23 33

    movdqa      xmm4, xmm1                  ; 04 14 05 15 06 16 07 17
    punpckldq   xmm4, xmm5                  ; 04 14 24 34 05 15 25 35

    punpckhdq   xmm1, xmm5                  ; 06 16 26 36 07 17 27 37
    movdqa      xmm3, xmm2                  ; 02 12 22 32 03 13 23 33

    punpckhqdq  xmm3, xmm1                  ; 03 13 23 33 07 17 27 37
    punpcklqdq  xmm2, xmm1                  ; 02 12 22 32 06 16 26 36

    movdqa      xmm1, xmm0                  ; 00 10 20 30 01 11 21 31
    punpcklqdq  xmm0, xmm4                  ; 00 10 20 30 04 14 24 34

    punpckhqdq  xmm1, xmm4                  ; 01 11 21 31 05 15 25 35

    ; Each register now holds element k of every row, for both blocks:
    ;   xmm0 = x0   xmm1 = x1   xmm2 = x2   xmm3 = x3

    ; ---- first stage -------------------------------------------------
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm1

    paddw       xmm0, xmm3                  ; a1 = x0 + x3
    paddw       xmm1, xmm2                  ; b1 = x1 + x2

    psubw       xmm4, xmm2                  ; c1 = x1 - x2
    psubw       xmm5, xmm3                  ; d1 = x0 - x3

    psllw       xmm5, 3                     ; all four terms scaled by 8
    psllw       xmm4, 3

    psllw       xmm0, 3
    psllw       xmm1, 3

    ; output 0 and 2
    movdqa      xmm2, xmm0                  ; a1

    paddw       xmm0, xmm1                  ; op[0] = a1 + b1
    psubw       xmm2, xmm1                  ; op[2] = a1 - b1

    ; output 1 and 3:
    ;   xmm1 = op[1] = (c1*2217 + d1*5352 + 14500) >> 12
    ;   xmm3 = op[3] = (d1*2217 - c1*5352 +  7500) >> 12
    FDCT8X4_ROTATE _14500, _7500, 12

    ; ---- done with the first stage; transpose for the second ---------
    movdqa      xmm4, xmm0                  ; 00 10 20 30 04 14 24 34
    movdqa      xmm5, xmm2                  ; 02 12 22 32 06 16 26 36

    punpcklwd   xmm0, xmm1                  ; 00 01 10 11 20 21 30 31
    punpckhwd   xmm4, xmm1                  ; 04 05 14 15 24 25 34 35

    punpcklwd   xmm2, xmm3                  ; 02 03 12 13 22 23 32 33
    punpckhwd   xmm5, xmm3                  ; 06 07 16 17 26 27 36 37

    movdqa      xmm1, xmm0                  ; 00 01 10 11 20 21 30 31
    punpckldq   xmm0, xmm2                  ; 00 01 02 03 10 11 12 13

    punpckhdq   xmm1, xmm2                  ; 20 21 22 23 30 31 32 33

    movdqa      xmm2, xmm4                  ; 04 05 14 15 24 25 34 35
    punpckldq   xmm2, xmm5                  ; 04 05 06 07 14 15 16 17

    punpckhdq   xmm4, xmm5                  ; 24 25 26 27 34 35 36 37
    movdqa      xmm3, xmm1                  ; 20 21 22 23 30 31 32 33

    punpckhqdq  xmm3, xmm4                  ; 30 31 32 33 34 35 36 37
    punpcklqdq  xmm1, xmm4                  ; 20 21 22 23 24 25 26 27

    movdqa      xmm4, xmm0                  ; 00 01 02 03 10 11 12 13
    punpcklqdq  xmm0, xmm2                  ; 00 01 02 03 04 05 06 07

    punpckhqdq  xmm4, xmm2                  ; 10 11 12 13 14 15 16 17

    ; Row-major again:
    ;   xmm0 = row 0   xmm4 = row 1   xmm1 = row 2   xmm3 = row 3

    ; ---- second stage ------------------------------------------------
    movdqa      xmm5, xmm0                  ; row 0
    movdqa      xmm2, xmm1                  ; row 2

    paddw       xmm0, xmm3                  ; a1 = row0 + row3
    paddw       xmm1, xmm4                  ; b1 = row2 + row1

    psubw       xmm4, xmm2                  ; c1 = row1 - row2
    psubw       xmm5, xmm3                  ; d1 = row0 - row3

    pxor        xmm6, xmm6                  ; zero, for the compare

    pcmpeqw     xmm6, xmm5                  ; all-ones where d1 == 0

    ; ~(d1 == 0) & 1: every word becomes (d1 != 0) ? 1 : 0
    pandn       xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)]

    ; output 0 and 2
    movdqa      xmm2, xmm0                  ; a1

    paddw       xmm0, xmm1                  ; a1 + b1
    psubw       xmm2, xmm1                  ; a1 - b1

    paddw       xmm0, XMMWORD PTR[GLOBAL(_7w)]
    paddw       xmm2, XMMWORD PTR[GLOBAL(_7w)]

    psraw       xmm0, 4                     ; op[0] = (a1 + b1 + 7) >> 4
    psraw       xmm2, 4                     ; op[8] = (a1 - b1 + 7) >> 4

    ; output 1 and 3:
    ;   xmm1 = op[4]  = (c1*2217 + d1*5352 + 12000) >> 16
    ;   xmm3 = op[12] = (d1*2217 - c1*5352 + 51000) >> 16
    FDCT8X4_ROTATE _12000, _51000, 16

    paddw       xmm1, xmm6                  ; op[4] += (d1 != 0)

    ; ---- split the two blocks apart and store ------------------------
    ; Low qword of each register belongs to the left block, high qword
    ; to the right block.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm2

    punpcklqdq  xmm0, xmm1                  ; left : op[0..3]  op[4..7]
    punpckhqdq  xmm4, xmm1                  ; right: op[0..3]  op[4..7]

    punpcklqdq  xmm2, xmm3                  ; left : op[8..11] op[12..15]
    punpckhqdq  xmm5, xmm3                  ; right: op[8..11] op[12..15]

    movdqa      XMMWORD PTR[output], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm2
    movdqa      XMMWORD PTR[output + 32], xmm4
    movdqa      XMMWORD PTR[output + 48], xmm5

    STACK_FRAME_DESTROY
| 373 |
;-----------------------------------------------------------------------
; Read-only constants for the transforms in this file. Every table is
; 16 bytes long and 16-byte aligned because it is used as an XMMWORD
; memory operand.
;-----------------------------------------------------------------------
SECTION_RODATA

; pmaddwd against interleaved (d1, c1) word pairs: d1*5352 + c1*2217
align 16
_5352_2217:
    times 4 dw 5352, 2217

; pmaddwd against interleaved (d1, c1) word pairs: d1*2217 - c1*5352
align 16
_2217_neg5352:
    times 4 dw 2217, -5352

; pmaddwd against (a1, b1) word pairs: a1 + b1
align 16
_mult_add:
    times 8 dw 1

; 4x4 "d1 != 0" mask: 1 in the low four words, 0 in the high four
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0

; 8x4 "d1 != 0" mask: 1 in every word
align 16
_cmp_mask8x4:
    times 8 dw 1

; pmaddwd against (a1, b1) word pairs: a1 - b1
align 16
_mult_sub:
    times 4 dw 1, -1

; rounding term added before >> 4 (dword lanes, 4x4 second pass)
align 16
_7:
    times 4 dd 7

; rounding term added before >> 4 (word lanes, 8x4 second pass)
align 16
_7w:
    times 8 dw 7

; first-pass rounding terms, added before >> 12
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500

; second-pass rounding terms, added before >> 16
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000
OLD | NEW |