;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int log2_denom, int weightd, int weights,
;                               int offset);
; and
; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
;                             int log2_denom, int weight,
;                             int offset);
;-----------------------------------------------------------------------------

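; WEIGHT_SETUP/WEIGHT_OP implement explicit weighted prediction for a single
; reference. Per pixel the result is
;   dst[x] = clip_uint8((dst[x]*weight + round) >> log2_denom)
; where round = (((offset << 1) + 1) << log2_denom) >> 1, i.e. the offset with
; the rounding bias folded in, which also behaves correctly when
; log2_denom == 0.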
%macro WEIGHT_SETUP 0
    add        r4, r4
    inc        r4
    movd       m3, r3d
    movd       m5, r4d
    movd       m6, r2d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

%macro WEIGHT_OP 2
    movh       m0, [r0+%1]
    movh       m1, [r0+%2]
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    pmullw     m0, m3
    pmullw     m1, m3
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

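; With MMX registers a 16-pixel row is processed as two 8-pixel halves. Only
; the 16x16 body is emitted in full; the 16x8 variant just sets its own row
; count and jumps into the 16x16 loop (the mangle() is needed because the
; .nextrow label belongs to a different cglobal symbol).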
%macro WEIGHT_FUNC_DBL_MM 1
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
    WEIGHT_SETUP
    mov        r2, %1
%if %1 == 16
.nextrow
    WEIGHT_OP 0,  4
    mova     [r0  ], m0
    WEIGHT_OP 8, 12
    mova     [r0+8], m0
    add        r0, r1
    dec        r2
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_DBL_MM 16
WEIGHT_FUNC_DBL_MM  8

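; Generic one-register-per-row functions. Macro arguments: %1 = block width,
; %2 = block height (row count), %3 = XMM register count for cglobal,
; %4 = cpu name suffix. As above, shorter blocks reuse the .nextrow loop of
; the 16-row instantiation.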
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 7, 7, %3
    WEIGHT_SETUP
    mov        r2, %2
%if %2 == 16
.nextrow
    WEIGHT_OP 0, mmsize/2
    mova     [r0], m0
    add        r0, r1
    dec        r2
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_MM  8, 16,  0, mmx2
WEIGHT_FUNC_MM  8,  8,  0, mmx2
WEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
WEIGHT_FUNC_MM 16, 16,  8, sse2
WEIGHT_FUNC_MM 16,  8,  8, sse2

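; Blocks half as wide as one register: each iteration packs two rows into a
; single register, so the loop runs %2/2 times and advances by two strides
; (r3). The low half of the packed result is stored to the first row and the
; high half to the second (movhps with SSE2, psrlq+movh with MMX).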
%macro WEIGHT_FUNC_HALF_MM 5
cglobal h264_weight_%1x%2_%5, 5, 5, %4
    WEIGHT_SETUP
    mov        r2, %2/2
    lea        r3, [r1*2]
%if %2 == mmsize
.nextrow
    WEIGHT_OP 0, r1
    movh     [r0], m0
%if mmsize == 16
    movhps   [r0+r1], m0
%else
    psrlq      m0, 32
    movh     [r0+r1], m0
%endif
    add        r0, r3
    dec        r2
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%endif
%endmacro

INIT_MMX
WEIGHT_FUNC_HALF_MM 4,  8,  8,  0, mmx2
WEIGHT_FUNC_HALF_MM 4,  4,  8,  0, mmx2
WEIGHT_FUNC_HALF_MM 4,  2,  8,  0, mmx2
INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16,  8, sse2
WEIGHT_FUNC_HALF_MM 8,  8, 16,  8, sse2
WEIGHT_FUNC_HALF_MM 8,  4, 16,  8, sse2

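; The biweight (two-reference) variants compute, per pixel,
;   dst[x] = clip_uint8((dst[x]*weightd + src[x]*weights + round)
;                       >> (log2_denom + 1))
; where round = ((offset + 1) | 1) << log2_denom, built in BIWEIGHT_SETUP by
; forcing offset+1 odd before the shift (kept in m5).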
%macro BIWEIGHT_SETUP 0
    add        r6, 1
    or         r6, 1
    add        r3, 1
    movd       m3, r4d
    movd       m4, r5d
    movd       m5, r6d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7
%endmacro

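; BIWEIGHT_STEPA produces dst*weightd + src*weights (as words) for one
; movh-sized group of pixels at byte offset %3; %1/%2 select the destination
; and scratch registers so the two halves of a row can use different
; temporaries. BIWEIGHT_STEPB then adds the rounding constant, shifts by
; log2_denom+1 and packs the two groups in m0/m1 back to bytes.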
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

%macro BIWEIGHT_FUNC_DBL_MM 1
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
    BIWEIGHT_SETUP
    mov        r3, %1
%if %1 == 16
.nextrow
    BIWEIGHT_STEPA 0, 1,  0
    BIWEIGHT_STEPA 1, 2,  4
    BIWEIGHT_STEPB
    mova       [r0], m0
    BIWEIGHT_STEPA 0, 1,  8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova     [r0+8], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_DBL_MM 16
BIWEIGHT_FUNC_DBL_MM  8

%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
    BIWEIGHT_SETUP
    mov        r3, %2
%if %2 == 16
.nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
INIT_XMM
BIWEIGHT_FUNC_MM 16, 16,  8, sse2
BIWEIGHT_FUNC_MM 16,  8,  8, sse2

%macro BIWEIGHT_FUNC_HALF_MM 5
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
    BIWEIGHT_SETUP
    mov        r3, %2/2
    lea        r4, [r2*2]
%if %2 == mmsize
.nextrow
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0
%else
    psrlq      m0, 32
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%endif
%endmacro

INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4,  8,  8,  0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  4,  8,  0, mmx2
BIWEIGHT_FUNC_HALF_MM 4,  2,  8,  0, mmx2
INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16,  8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  8, 16,  8, sse2
BIWEIGHT_FUNC_HALF_MM 8,  4, 16,  8, sse2

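; SSSE3 biweight: the two weights are interleaved bytewise into m4 and each
; dst/src pixel pair is interleaved with punpcklbw, so one pmaddubsw yields
; dst*weightd + src*weights per pixel (pixels are treated as unsigned bytes
; and the weights as signed bytes, so this path is only valid while both
; weights fit in a signed byte).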
%macro BIWEIGHT_SSSE3_SETUP 0
    add        r6, 1
    or         r6, 1
    add        r3, 1
    movd       m4, r4d
    movd       m0, r5d
    movd       m5, r6d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1
    punpcklbw  m4, m0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%endmacro

%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

%macro BIWEIGHT_SSSE3_16 1
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1

%if %1 == 16
.nextrow
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_16 16
BIWEIGHT_SSSE3_16  8

%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
    BIWEIGHT_SSSE3_SETUP
    mov        r3, %1/2
    lea        r4, [r2*2]

%if %1 == 16
.nextrow
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3
    jnz .nextrow
    REP_RET
%else
    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%endif
%endmacro

INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8  8
BIWEIGHT_SSSE3_8  4