OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 %include "third_party/x86inc/x86inc.asm" |
| 12 |
SECTION_RODATA

; 16-bit fixed-point multipliers for pmulhw (which keeps the high 16 bits of
; each signed 16x16 product).  Each constant is replicated into four words.

align 16
; sqrt(2) * sin(pi/8) scaled by 2^16 = 35468.  As a signed word this is
; negative, so users add the multiplicand back after pmulhw.
x_s1sqr2:       dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

align 16
; (sqrt(2) * cos(pi/8) - 1) scaled by 2^16 = 20091.  Users add the
; multiplicand back after pmulhw to obtain x * sqrt(2) * cos(pi/8).
x_c1sqr2less1:  dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

align 16
; Rounding term added before the final arithmetic shift right by 5.
pw_16:          dw 16, 16, 16, 16
| 20 |
| 21 SECTION .text |
| 22 |
| 23 |
| 24 ; /**************************************************************************** |
| 25 ; * Notes: |
| 26 ; * |
| 27 ; * This implementation makes use of 16 bit fixed point version of two multiply |
| 28 ; * constants: |
| 29 ; * 1. sqrt(2) * cos (pi/8) |
| 30 ; * 2. sqrt(2) * sin (pi/8) |
| 31 ; * Because the first constant is bigger than 1, to maintain the same 16 bit |
| 32 ; * fixed point precision as the second one, we use a trick of |
| 33 ; * x * a = x + x*(a-1) |
| 34 ; * so |
| 35 ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). |
| 36 ; * |
| 37 ; * The 16-bit fixed-point value of the second constant is 35468, which |
| 38 ; * does not fit in a signed 16-bit word, so a signed 16-bit multiply sees |
| 39 ; * it as the negative number 35468 - 65536; adding x back corrects this: |
| 40 ; * (x * (unsigned)35468 >> 16) = (x * (signed)35468 >> 16) + x |
| 41 ; * |
| 42 ; **************************************************************************/ |
| 43 |
| 44 INIT_MMX |
| 45 |
;------------------------------------------------------------------------------
; void short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; 4x4 inverse transform on 16-bit coefficients.  The same 4-point butterfly is
; applied twice with a 4x4 word transpose after each application; the second
; application adds a rounding term of 16 and shifts the results right by 5.
;
; In:    inpq = 16 consecutive 16-bit coefficients (four rows of four words)
;        outq = destination for four rows of four 16-bit results
;        pitq = distance between destination rows, in bytes (it is added to
;               the byte address unscaled)
; Uses:  m0-m7
;
; Notation in the comments below:
;   ip0..ip3 : the four input vectors of the current pass
;   S(x) = x * sqrt(2) * sin(pi/8)   (pmulhw by x_s1sqr2, then + x)
;   C(x) = x * sqrt(2) * cos(pi/8)   (pmulhw by x_c1sqr2less1, then + x)
;   a1 = ip0 + ip2        b1 = ip0 - ip2
;   c1 = S(ip1) - C(ip3)  d1 = C(ip1) + S(ip3)
;   result vectors: a1 + d1, b1 + c1, b1 - c1, a1 - d1
;------------------------------------------------------------------------------
cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
%if ARCH_X86_64
    ; pitch is declared as int; the upper half of a 64-bit argument register
    ; is not guaranteed to hold its sign extension, so widen it before it is
    ; used as part of an address.
    movsxd      pitq, pitd
%endif

    ; ---- load the four coefficient rows -------------------------------------
    mova        m0, [inpq +0]           ; ip0
    mova        m1, [inpq +8]           ; ip1

    mova        m2, [inpq+16]           ; ip2
    mova        m3, [inpq+24]           ; ip3

    ; ---- first pass ---------------------------------------------------------
    psubw       m0, m2                  ; m0 = b1 = ip0 - ip2
    paddw       m2, m2                  ; m2 = 2 * ip2

    mova        m5, m1
    paddw       m2, m0                  ; m2 = a1 = ip0 + ip2

    pmulhw      m5, [x_s1sqr2]
    paddw       m5, m1                  ; m5 = S(ip1)

    mova        m7, m3
    pmulhw      m7, [x_c1sqr2less1]

    paddw       m7, m3                  ; m7 = C(ip3)
    psubw       m7, m5                  ; m7 = C(ip3) - S(ip1) = -c1

    mova        m5, m1
    mova        m4, m3                  ; keep ip3 for the add-back below

    pmulhw      m5, [x_c1sqr2less1]
    paddw       m5, m1                  ; m5 = C(ip1)

    pmulhw      m3, [x_s1sqr2]
    paddw       m3, m4                  ; m3 = S(ip3)

    paddw       m3, m5                  ; m3 = d1
    mova        m6, m2                  ; m6 = a1

    mova        m4, m0                  ; m4 = b1
    paddw       m2, m3                  ; m2 = a1 + d1   (result 0)

    paddw       m4, m7                  ; m4 = b1 - c1   (result 2; m7 is -c1)
    psubw       m0, m7                  ; m0 = b1 + c1   (result 1)

    psubw       m6, m3                  ; m6 = a1 - d1   (result 3)

    ; ---- transpose: results 0..3 are in m2, m0, m4, m6 ----------------------
    ; rN[k] below is word k of result N, words listed low to high.
    mova        m1, m2                  ; m1 = r0
    mova        m3, m4                  ; m3 = r2

    punpcklwd   m1, m0                  ; m1 = r0[0] r1[0] r0[1] r1[1]
    punpckhwd   m2, m0                  ; m2 = r0[2] r1[2] r0[3] r1[3]

    punpcklwd   m3, m6                  ; m3 = r2[0] r3[0] r2[1] r3[1]
    punpckhwd   m4, m6                  ; m4 = r2[2] r3[2] r2[3] r3[3]

    mova        m0, m1
    mova        m5, m2

    punpckldq   m0, m3                  ; m0 = r0[0] r1[0] r2[0] r3[0]
    punpckhdq   m1, m3                  ; m1 = r0[1] r1[1] r2[1] r3[1]

    punpckldq   m2, m4                  ; m2 = r0[2] r1[2] r2[2] r3[2]
    punpckhdq   m5, m4                  ; m5 = r0[3] r1[3] r2[3] r3[3]

    mova        m3, m5                  ; m0..m3 are now ip0..ip3 of pass two

    ; ---- second pass (same butterfly, plus rounding) ------------------------
    psubw       m0, m2                  ; m0 = b1 = ip0 - ip2
    paddw       m2, m2                  ; m2 = 2 * ip2

    mova        m5, m1
    paddw       m2, m0                  ; m2 = a1 = ip0 + ip2

    pmulhw      m5, [x_s1sqr2]
    paddw       m5, m1                  ; m5 = S(ip1)

    mova        m7, m3
    pmulhw      m7, [x_c1sqr2less1]

    paddw       m7, m3                  ; m7 = C(ip3)
    psubw       m7, m5                  ; m7 = C(ip3) - S(ip1) = -c1

    mova        m5, m1
    mova        m4, m3                  ; keep ip3 for the add-back below

    pmulhw      m5, [x_c1sqr2less1]
    paddw       m5, m1                  ; m5 = C(ip1)

    pmulhw      m3, [x_s1sqr2]
    paddw       m3, m4                  ; m3 = S(ip3)

    paddw       m3, m5                  ; m3 = d1

    ; Every result is a1 +/- d1 or b1 +/- c1, so biasing a1 and b1 once
    ; rounds all four results.
    paddw       m0, [pw_16]             ; m0 = b1 + 16

    paddw       m2, [pw_16]             ; m2 = a1 + 16
    mova        m6, m2                  ; m6 = a1 + 16

    mova        m4, m0                  ; m4 = b1 + 16
    paddw       m2, m3                  ; m2 = a1 + d1 + 16   (result 0)

    paddw       m4, m7                  ; m4 = b1 - c1 + 16   (result 2)
    psubw       m0, m7                  ; m0 = b1 + c1 + 16   (result 1)

    psubw       m6, m3                  ; m6 = a1 - d1 + 16   (result 3)
    psraw       m2, 5

    psraw       m0, 5
    psraw       m4, 5

    psraw       m6, 5

    ; ---- transpose back: results 0..3 are in m2, m0, m4, m6 -----------------
    mova        m1, m2                  ; m1 = r0
    mova        m3, m4                  ; m3 = r2

    punpcklwd   m1, m0                  ; m1 = r0[0] r1[0] r0[1] r1[1]
    punpckhwd   m2, m0                  ; m2 = r0[2] r1[2] r0[3] r1[3]

    punpcklwd   m3, m6                  ; m3 = r2[0] r3[0] r2[1] r3[1]
    punpckhwd   m4, m6                  ; m4 = r2[2] r3[2] r2[3] r3[3]

    mova        m0, m1
    mova        m5, m2

    punpckldq   m0, m3                  ; m0 = output row 0
    punpckhdq   m1, m3                  ; m1 = output row 1

    punpckldq   m2, m4                  ; m2 = output row 2
    punpckhdq   m5, m4                  ; m5 = output row 3

    ; ---- store the four rows, pitq bytes apart ------------------------------
    mova        [outq], m0

    mova        [outq+pitq], m1
    mova        [outq+pitq*2], m2

    add         outq, pitq              ; outq + 2*pitq now addresses row 3
    mova        [outq+pitq*2], m5
    RET
| 179 |
;------------------------------------------------------------------------------
; void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; DC-only case: computes (input[0] + 16) >> 5 and writes that value to every
; word of four output rows of four words each.
;
; In:    inpq = coefficient block; only the first word affects the result
;        outq = destination for four rows of four 16-bit results
;        pitq = distance between destination rows, in bytes
; Uses:  m0
;------------------------------------------------------------------------------
cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
%if ARCH_X86_64
    ; pitch is declared as int; widen it before it is used in addressing,
    ; since the upper half of the argument register is not guaranteed to be
    ; its sign extension.
    movsxd      pitq, pitd
%endif
    movh        m0, [inpq]              ; word 0 = DC coefficient
    paddw       m0, [pw_16]             ; rounding term
    psraw       m0, 5
    punpcklwd   m0, m0                  ; dc dc x  x
    punpckldq   m0, m0                  ; dc dc dc dc

    mova        [outq], m0              ; row 0
    mova        [outq+pitq], m0         ; row 1

    mova        [outq+pitq*2], m0       ; row 2
    add         outq, pitq              ; outq + 2*pitq now addresses row 3

    mova        [outq+pitq*2], m0       ; row 3
    RET

| 196 |
| 197 |
;------------------------------------------------------------------------------
; void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr,
;                           unsigned char *dst_ptr, int pitch, int stride)
;
; Adds the rounded DC value (input_dc + 16) >> 5 to four rows of four
; predictor bytes and writes the sums, clamped to 0..255, to the destination.
;
; In:    in_dcq  = DC coefficient (only the low 16 bits are used)
;        predq   = predictor rows, pitq bytes apart
;        dstq    = destination rows, strideq bytes apart
;        stridem = fifth argument, fetched explicitly below because cglobal
;                  is asked to load only the first four
; Uses:  m0-m5
;------------------------------------------------------------------------------
cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
%if ARCH_X86_64
    ; pitch and stride are declared as int; widen both before they are used
    ; in addressing, since the upper half of a 64-bit argument is not
    ; guaranteed to be its sign extension.
    movsxd      pitq, pitd
    movsxd      strideq, dword stridem
%else
    mov         strideq, stridem
%endif
    pxor        m0, m0                  ; zero: high bytes for unpack/pack

    movh        m5, in_dcq              ; word 0 = dc
    paddw       m5, [pw_16]             ; rounding term

    psraw       m5, 5

    punpcklwd   m5, m5                  ; dc dc x  x
    punpckldq   m5, m5                  ; dc dc dc dc

    ; row 0
    movh        m1, [predq]
    punpcklbw   m1, m0                  ; widen four bytes to four words
    paddsw      m1, m5
    packuswb    m1, m0                  ; pack with unsigned saturation
    movh        [dstq], m1

    ; row 1
    movh        m2, [predq+pitq]
    punpcklbw   m2, m0
    paddsw      m2, m5
    packuswb    m2, m0                  ; pack with unsigned saturation
    movh        [dstq+strideq], m2

    ; row 2
    movh        m3, [predq+2*pitq]
    punpcklbw   m3, m0
    paddsw      m3, m5
    packuswb    m3, m0                  ; pack with unsigned saturation
    movh        [dstq+2*strideq], m3

    ; row 3: advance both pointers one row so the *2 addressing reaches it
    add         dstq, strideq
    add         predq, pitq
    movh        m4, [predq+2*pitq]
    punpcklbw   m4, m0
    paddsw      m4, m5
    packuswb    m4, m0                  ; pack with unsigned saturation
    movh        [dstq+2*strideq], m4
    RET

| 241 |
OLD | NEW |