| OLD | NEW |
| 1 ; | 1 ; |
| 2 ; jdsample.asm - upsampling (64-bit SSE2) | 2 ; jdsample.asm - upsampling (64-bit SSE2) |
| 3 ; | 3 ; |
| 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
| 5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
| 6 ; | 6 ; |
| 7 ; Based on | 7 ; Based on |
| 8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
| 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
| 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
| 11 ; | 11 ; |
| 12 ; This file should be assembled with NASM (Netwide Assembler), | 12 ; This file should be assembled with NASM (Netwide Assembler), |
| 13 ; can *not* be assembled with Microsoft's MASM or any compatible | 13 ; can *not* be assembled with Microsoft's MASM or any compatible |
| 14 ; assembler (including Borland's Turbo Assembler). | 14 ; assembler (including Borland's Turbo Assembler). |
| 15 ; NASM is available from http://nasm.sourceforge.net/ or | 15 ; NASM is available from http://nasm.sourceforge.net/ or |
| 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
| 17 ; | 17 ; |
| 18 ; [TAB8] | 18 ; [TAB8] |
| 19 | 19 |
| 20 %include "jsimdext.inc" | 20 %include "jsimdext.inc" |
| 21 | 21 |
| 22 ; -------------------------------------------------------------------------- | 22 ; -------------------------------------------------------------------------- |
| 23 SECTION SEG_CONST | 23 SECTION SEG_CONST |
| 24 | 24 |
| 25 alignz 16 | 25 alignz 16 |
| 26 global EXTN(jconst_fancy_upsample_sse2) | 26 global EXTN(jconst_fancy_upsample_sse2) PRIVATE |
| 27 | 27 |
| 28 EXTN(jconst_fancy_upsample_sse2): | 28 EXTN(jconst_fancy_upsample_sse2): |
| 29 | 29 |
| 30 PW_ONE times 8 dw 1 | 30 PW_ONE times 8 dw 1 |
| 31 PW_TWO times 8 dw 2 | 31 PW_TWO times 8 dw 2 |
| 32 PW_THREE times 8 dw 3 | 32 PW_THREE times 8 dw 3 |
| 33 PW_SEVEN times 8 dw 7 | 33 PW_SEVEN times 8 dw 7 |
| 34 PW_EIGHT times 8 dw 8 | 34 PW_EIGHT times 8 dw 8 |
| 35 | 35 |
| 36 alignz 16 | 36 alignz 16 |
| (...skipping 15 matching lines...) Expand all Loading... |
| 52 ; JSAMPARRAY input_data, | 52 ; JSAMPARRAY input_data, |
| 53 ; JSAMPARRAY *output_data_ptr); | 53 ; JSAMPARRAY *output_data_ptr); |
| 54 ; | 54 ; |
| 55 | 55 |
| 56 ; r10 = int max_v_samp_factor | 56 ; r10 = int max_v_samp_factor |
| 57 ; r11 = JDIMENSION downsampled_width | 57 ; r11 = JDIMENSION downsampled_width |
| 58 ; r12 = JSAMPARRAY input_data | 58 ; r12 = JSAMPARRAY input_data |
| 59 ; r13 = JSAMPARRAY *output_data_ptr | 59 ; r13 = JSAMPARRAY *output_data_ptr |
| 60 | 60 |
| 61 align 16 | 61 align 16 |
| 62 global EXTN(jsimd_h2v1_fancy_upsample_sse2) | 62 global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE |
| 63 | 63 |
| 64 EXTN(jsimd_h2v1_fancy_upsample_sse2): | 64 EXTN(jsimd_h2v1_fancy_upsample_sse2): |
| 65 push rbp | 65 push rbp |
| 66 mov rax,rsp | 66 mov rax,rsp |
| 67 mov rbp,rsp | 67 mov rbp,rsp |
| 68 collect_args | 68 collect_args |
| 69 | 69 |
| 70 mov eax, r11d ; colctr | 70 mov eax, r11d ; colctr |
| 71 test rax,rax | 71 test rax,rax |
| 72 jz near .return | 72 jz near .return |
| (...skipping 121 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 194 | 194 |
| 195 ; r10 = int max_v_samp_factor | 195 ; r10 = int max_v_samp_factor |
| 196 ; r11 = JDIMENSION downsampled_width | 196 ; r11 = JDIMENSION downsampled_width |
| 197 ; r12 = JSAMPARRAY input_data | 197 ; r12 = JSAMPARRAY input_data |
| 198 ; r13 = JSAMPARRAY *output_data_ptr | 198 ; r13 = JSAMPARRAY *output_data_ptr |
| 199 | 199 |
| 200 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 200 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
| 201 %define WK_NUM 4 | 201 %define WK_NUM 4 |
| 202 | 202 |
| 203 align 16 | 203 align 16 |
| 204 global EXTN(jsimd_h2v2_fancy_upsample_sse2) | 204 global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE |
| 205 | 205 |
| 206 EXTN(jsimd_h2v2_fancy_upsample_sse2): | 206 EXTN(jsimd_h2v2_fancy_upsample_sse2): |
| 207 push rbp | 207 push rbp |
| 208 mov rax,rsp ; rax = original rbp | 208 mov rax,rsp ; rax = original rbp |
| 209 sub rsp, byte 4 | 209 sub rsp, byte 4 |
| 210 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 210 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
| 211 mov [rsp],rax | 211 mov [rsp],rax |
| 212 mov rbp,rsp ; rbp = aligned rbp | 212 mov rbp,rsp ; rbp = aligned rbp |
| 213 lea rsp, [wk(0)] | 213 lea rsp, [wk(0)] |
| 214 collect_args | 214 collect_args |
| (...skipping 276 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 491 ; JSAMPARRAY input_data, | 491 ; JSAMPARRAY input_data, |
| 492 ; JSAMPARRAY *output_data_ptr); | 492 ; JSAMPARRAY *output_data_ptr); |
| 493 ; | 493 ; |
| 494 | 494 |
| 495 ; r10 = int max_v_samp_factor | 495 ; r10 = int max_v_samp_factor |
| 496 ; r11 = JDIMENSION output_width | 496 ; r11 = JDIMENSION output_width |
| 497 ; r12 = JSAMPARRAY input_data | 497 ; r12 = JSAMPARRAY input_data |
| 498 ; r13 = JSAMPARRAY *output_data_ptr | 498 ; r13 = JSAMPARRAY *output_data_ptr |
| 499 | 499 |
| 500 align 16 | 500 align 16 |
| 501 global EXTN(jsimd_h2v1_upsample_sse2) | 501 global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE |
| 502 | 502 |
| 503 EXTN(jsimd_h2v1_upsample_sse2): | 503 EXTN(jsimd_h2v1_upsample_sse2): |
| 504 push rbp | 504 push rbp |
| 505 mov rax,rsp | 505 mov rax,rsp |
| 506 mov rbp,rsp | 506 mov rbp,rsp |
| 507 collect_args | 507 collect_args |
| 508 | 508 |
| 509 mov edx, r11d | 509 mov edx, r11d |
| 510 add rdx, byte (2*SIZEOF_XMMWORD)-1 | 510 add rdx, byte (2*SIZEOF_XMMWORD)-1 |
| 511 and rdx, byte -(2*SIZEOF_XMMWORD) | 511 and rdx, byte -(2*SIZEOF_XMMWORD) |
| (...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 580 ; JSAMPARRAY input_data, | 580 ; JSAMPARRAY input_data, |
| 581 ; JSAMPARRAY *output_data_ptr); | 581 ; JSAMPARRAY *output_data_ptr); |
| 582 ; | 582 ; |
| 583 | 583 |
| 584 ; r10 = int max_v_samp_factor | 584 ; r10 = int max_v_samp_factor |
| 585 ; r11 = JDIMENSION output_width | 585 ; r11 = JDIMENSION output_width |
| 586 ; r12 = JSAMPARRAY input_data | 586 ; r12 = JSAMPARRAY input_data |
| 587 ; r13 = JSAMPARRAY *output_data_ptr | 587 ; r13 = JSAMPARRAY *output_data_ptr |
| 588 | 588 |
| 589 align 16 | 589 align 16 |
| 590 global EXTN(jsimd_h2v2_upsample_sse2) | 590 global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE |
| 591 | 591 |
| 592 EXTN(jsimd_h2v2_upsample_sse2): | 592 EXTN(jsimd_h2v2_upsample_sse2): |
| 593 push rbp | 593 push rbp |
| 594 mov rax,rsp | 594 mov rax,rsp |
| 595 mov rbp,rsp | 595 mov rbp,rsp |
| 596 collect_args | 596 collect_args |
| 597 push rbx | 597 push rbx |
| 598 | 598 |
| 599 mov edx, r11d | 599 mov edx, r11d |
| 600 add rdx, byte (2*SIZEOF_XMMWORD)-1 | 600 add rdx, byte (2*SIZEOF_XMMWORD)-1 |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 662 | 662 |
| 663 .return: | 663 .return: |
| 664 pop rbx | 664 pop rbx |
| 665 uncollect_args | 665 uncollect_args |
| 666 pop rbp | 666 pop rbp |
| 667 ret | 667 ret |
| 668 | 668 |
| 669 ; For some reason, the OS X linker does not honor the request to align the | 669 ; For some reason, the OS X linker does not honor the request to align the |
| 670 ; segment unless we do this. | 670 ; segment unless we do this. |
| 671 align 16 | 671 align 16 |
| OLD | NEW |