OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
;/************************************************************************************
; Notes: the routines in this file apply an 8-tap filter to the input pixels, either
; vertically (vp9_filter_block1d*_v8_ssse3) or horizontally
; (vp9_filter_block1d*_h8_ssse3). The input pixel array has output_height rows.
; An earlier version of this note said the routine assumes output_height is an even
; number; the loops below produce one row per iteration, so that assumption does not
; appear to be required here (confirm against callers). Each routine handles 8 or 16
; pixels in the horizontal direction, calculating ONE row each iteration to take
; advantage of the 128-bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
| 23 |
;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
;
; Vertical 8-tap filter producing 8 output pixels per row.
;
; For every output row the eight source rows src_ptr + t * src_pitch, t = 0..7,
; are read (8 bytes each) and combined as
;     out[x] = clamp_u8((sum_t src[t][x] * filter[t] + 64) >> 7)
; The taps are narrowed to signed bytes with packsswb, so taps outside
; [-128, 127] are saturated. filter is loaded with movdqa and must therefore be
; 16-byte aligned. Nothing is written when output_height is zero.
;
; Register roles inside the loop:
;     rsi = row 0 of the current 8-row window     rax = rsi + src_pitch
;     rdx = src_pitch                             rbx = 6 * src_pitch
;     rdi = output row                            rcx = rows remaining
;     r8  = out_pitch (64-bit ABIs only)
global sym(vp9_filter_block1d8_v8_ssse3)
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00400040             ;rounding term: 64 in each 16-bit word

    movdqa      xmm4, [rdx]                 ;load the eight 16-bit taps
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes k0..k7
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    ; replicate each tap pair across the full register
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    pshufd      xmm5, xmm5, 0               ;64 in all eight words

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd,  xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

    ; dec/jnz below would wrap around on a zero row count, so bail out early
    test        rcx, rcx
    jle         .vp9_filter_block1d8_v8_ssse3_done

.vp9_filter_block1d8_v8_ssse3_loop:
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F
    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; paddsw saturates, so the order of the additions matters: adding the two
    ; heavy centre pairs first can clip an intermediate sum that the outer taps
    ; would have brought back into range. Add the outer pairs first, then the
    ; smaller centre pair, then the larger one, and the rounding term last.
    paddsw      xmm0, xmm6                  ;k0k1 + k6k7
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(k2k3, k4k5)
    pminsw      xmm4, xmm1                  ;min(k2k3, k4k5)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi, rdx                    ;slide the 8-row window down one row
    add         rax, rdx

    movq        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp9_filter_block1d8_v8_ssse3_loop

.vp9_filter_block1d8_v8_ssse3_done:
    add         rsp, 16*5
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 141 |
;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
;
; Vertical 8-tap filter producing 16 output pixels per row, computed as two
; independent 8-pixel halves (columns 0..7, then columns 8..15).
;
; For every output row the eight source rows src_ptr + t * src_pitch, t = 0..7,
; are read and combined as
;     out[x] = clamp_u8((sum_t src[t][x] * filter[t] + 64) >> 7)
; The taps are narrowed to signed bytes with packsswb, so taps outside
; [-128, 127] are saturated. filter is loaded with movdqa and must therefore be
; 16-byte aligned. Nothing is written when output_height is zero.
;
; Register roles inside the loop:
;     rsi = row 0 of the current 8-row window     rax = rsi + src_pitch
;     rdx = src_pitch                             rbx = 6 * src_pitch
;     rdi = output row                            rcx = rows remaining
;     r8  = out_pitch (64-bit ABIs only)
global sym(vp9_filter_block1d16_v8_ssse3)
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00400040             ;rounding term: 64 in each 16-bit word

    movdqa      xmm4, [rdx]                 ;load the eight 16-bit taps
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes k0..k7
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    ; replicate each tap pair across the full register
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    pshufd      xmm5, xmm5, 0               ;64 in all eight words

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd,  xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

    ; dec/jnz below would wrap around on a zero row count, so bail out early
    test        rcx, rcx
    jle         .vp9_filter_block1d16_v8_ssse3_done

.vp9_filter_block1d16_v8_ssse3_loop:
    ; ---- columns 0..7 ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F
    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; paddsw saturates, so the order of the additions matters: add the outer
    ; pairs first, then the smaller centre pair, then the larger one, and the
    ; rounding term last, so that an intermediate sum is not clipped before a
    ; compensating term has been added. Both halves use the same order.
    paddsw      xmm0, xmm6                  ;k0k1 + k6k7
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(k2k3, k4k5)
    pminsw      xmm4, xmm1                  ;min(k2k3, k4k5)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    movq        [rdi], xmm0

    ; ---- columns 8..15 ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F
    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;k0k1 + k6k7
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(k2k3, k4k5)
    pminsw      xmm4, xmm1                  ;min(k2k3, k4k5)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi, rdx                    ;slide the 8-row window down one row
    add         rax, rdx

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp9_filter_block1d16_v8_ssse3_loop

.vp9_filter_block1d16_v8_ssse3_done:
    add         rsp, 16*5
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 291 |
;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter
;)
;
; Horizontal 8-tap filter producing 8 output pixels per row:
;     out[x] = clamp_u8((sum_t src[x - 3 + t] * filter[t] + 64) >> 7),  t = 0..7
; Each row reads the 16 bytes src_ptr[-3 .. 12].
; The taps are narrowed to signed bytes with packsswb, so taps outside
; [-128, 127] are saturated. filter is loaded with movdqa and must therefore be
; 16-byte aligned. Nothing is written when output_height is zero.
;
; Register roles inside the loop:
;     rsi  = source row       rax = src_pixels_per_line
;     rdi  = output row       rdx = output_pitch
;     rcx  = rows remaining   xmm5 = rounding term (64 in every word)
global sym(vp9_filter_block1d8_h8_ssse3)
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00400040             ;rounding term: 64 in each 16-bit word

    movdqa      xmm4, [rdx]                 ;load the eight 16-bit taps
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes k0..k7
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    ; replicate each tap pair across the full register
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    ; The rounding term stays in xmm5 for the whole loop; the krd stack slot is
    ; reserved but never written or read in this routine.
    pshufd      xmm5, xmm5, 0               ;64 in all eight words

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

    ; dec/jnz below would wrap around on a zero row count, so bail out early
    test        rcx, rcx
    jle         .filter_block1d8_h8_done_ssse3

.filter_block1d8_h8_rowloop_ssse3:
    movq        xmm0, [rsi - 3]             ; -3 -2 -1  0  1  2  3  4
    movq        xmm3, [rsi + 5]             ;  5  6  7  8  9 10 11 12
    ;note: if we create a k0_k7 filter, we can save a pshufb
    punpcklqdq  xmm0, xmm3                  ; xmm0 = src[-3 .. 12]

    ; each pshufb gathers, for all 8 output pixels, the two source bytes that
    ; meet one tap pair; pmaddubsw then multiplies and sums each pair
    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0, k0k1

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1, k2k3

    movdqa      xmm4, xmm2
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2, k4k5

    pshufb      xmm4, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4, k6k7

    ; paddsw saturates, so the order of the additions matters: add the outer
    ; pairs first, then the smaller centre pair, then the larger one, and the
    ; rounding term last, so that an intermediate sum is not clipped before a
    ; compensating term has been added. xmm3 is free here and serves as scratch.
    paddsw      xmm0, xmm4                  ;k0k1 + k6k7
    movdqa      xmm3, xmm1
    pmaxsw      xmm1, xmm2                  ;max(k2k3, k4k5)
    pminsw      xmm2, xmm3                  ;min(k2k3, k4k5)
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm5                  ;+ 64
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    lea         rsi, [rsi + rax]
    movq        [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d8_h8_rowloop_ssse3

.filter_block1d8_h8_done_ssse3:
    add         rsp, 16*5
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 398 |
;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter
;)
;
; Horizontal 8-tap filter producing 16 output pixels per row, computed as two
; 8-pixel halves that are merged into one 16-byte store:
;     out[x] = clamp_u8((sum_t src[x - 3 + t] * filter[t] + 64) >> 7),  t = 0..7
; Each row reads the 24 bytes src_ptr[-3 .. 20].
; The taps are narrowed to signed bytes with packsswb, so taps outside
; [-128, 127] are saturated. filter is loaded with movdqa and every output row
; is stored with movdqa, so both filter and each output row address must be
; 16-byte aligned. Nothing is written when output_height is zero.
;
; Register roles inside the loop:
;     rsi = source row        rax = src_pixels_per_line
;     rdi = output row        rdx = output_pitch
;     rcx = rows remaining    xmm6 = scratch for the min/max accumulation
global sym(vp9_filter_block1d16_h8_ssse3)
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00400040             ;rounding term: 64 in each 16-bit word

    movdqa      xmm4, [rdx]                 ;load the eight 16-bit taps
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes k0..k7
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    ; replicate each tap pair across the full register
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    pshufd      xmm5, xmm5, 0               ;64 in all eight words

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd,  xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

    ; dec/jnz below would wrap around on a zero row count, so bail out early
    test        rcx, rcx
    jle         .filter_block1d16_h8_done_ssse3

.filter_block1d16_h8_rowloop_ssse3:
    ; ---- output pixels 0..7 ----
    movq        xmm0, [rsi - 3]             ; -3 -2 -1  0  1  2  3  4
    movq        xmm3, [rsi + 5]             ;  5  6  7  8  9 10 11 12
    ;note: if we create a k0_k7 filter, we can save a pshufb
    punpcklqdq  xmm0, xmm3                  ; xmm0 = src[-3 .. 12]; xmm3 stays live

    ; each pshufb gathers, for all 8 output pixels, the two source bytes that
    ; meet one tap pair; pmaddubsw then multiplies and sums each pair
    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm0, k0k1

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1, k2k3

    movdqa      xmm4, xmm2
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2, k4k5

    pshufb      xmm4, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4, k6k7

    ; paddsw saturates, so the order of the additions matters: add the outer
    ; pairs first, then the smaller centre pair, then the larger one, and the
    ; rounding term last, so that an intermediate sum is not clipped before a
    ; compensating term has been added. Both halves use the same order so that
    ; identical input always rounds identically.
    paddsw      xmm0, xmm4                  ;k0k1 + k6k7
    movdqa      xmm6, xmm1
    pmaxsw      xmm1, xmm2                  ;max(k2k3, k4k5)
    pminsw      xmm2, xmm6                  ;min(k2k3, k4k5)
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm1
    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    ; ---- output pixels 8..15 ----
    ; xmm3 still holds src[5 .. 12] from the load above, so only the upper
    ; eight bytes need to be fetched.
    movq        xmm7, [rsi + 13]            ; 13 14 15 16 17 18 19 20
    ;note: same as above
    punpcklqdq  xmm3, xmm7                  ; xmm3 = src[5 .. 20]

    movdqa      xmm1, xmm3
    pshufb      xmm3, [GLOBAL(shuf_t0t1)]
    pmaddubsw   xmm3, k0k1

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pmaddubsw   xmm1, k2k3

    movdqa      xmm4, xmm2
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pmaddubsw   xmm2, k4k5

    pshufb      xmm4, [GLOBAL(shuf_t6t7)]
    pmaddubsw   xmm4, k6k7

    paddsw      xmm3, xmm4                  ;k0k1 + k6k7
    movdqa      xmm6, xmm1
    pmaxsw      xmm1, xmm2                  ;max(k2k3, k4k5)
    pminsw      xmm2, xmm6                  ;min(k2k3, k4k5)
    paddsw      xmm3, xmm2
    paddsw      xmm3, xmm1
    paddsw      xmm3, krd
    psraw       xmm3, 7
    packuswb    xmm3, xmm3                  ;clamp to 0..255

    punpcklqdq  xmm0, xmm3                  ;pixels 0..7 | pixels 8..15

    lea         rsi, [rsi + rax]
    movdqa      [rdi], xmm0                 ;aligned 16-byte store

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h8_rowloop_ssse3

.filter_block1d16_h8_done_ssse3:
    add         rsp, 16*5
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 536 |
| 537 |
SECTION_RODATA

; pshufb control masks used by the horizontal (h8) filters in this file.
; Entry pair i of each mask selects source bytes (i + o, i + o + 1), where o is
; 0, 2, 4 or 6 for shuf_t0t1, shuf_t2t3, shuf_t4t5 and shuf_t6t7 respectively,
; i.e. the two bytes that output pixel i multiplies with one pair of taps.

align 16
shuf_t0t1:
    db   0,  1,  1,  2,  2,  3,  3,  4
    db   4,  5,  5,  6,  6,  7,  7,  8

align 16
shuf_t2t3:
    db   2,  3,  3,  4,  4,  5,  5,  6
    db   6,  7,  7,  8,  8,  9,  9, 10

align 16
shuf_t4t5:
    db   4,  5,  5,  6,  6,  7,  7,  8
    db   8,  9,  9, 10, 10, 11, 11, 12

align 16
shuf_t6t7:
    db   6,  7,  7,  8,  8,  9,  9, 10
    db  10, 11, 11, 12, 12, 13, 13, 14
OLD | NEW |