OLD | NEW |
1 ; | 1 ; |
2 ; jdsample.asm - upsampling (64-bit SSE2) | 2 ; jdsample.asm - upsampling (64-bit SSE2) |
3 ; | 3 ; |
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB | 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB |
5 ; Copyright 2009 D. R. Commander | 5 ; Copyright 2009 D. R. Commander |
6 ; | 6 ; |
7 ; Based on | 7 ; Based on |
8 ; x86 SIMD extension for IJG JPEG library | 8 ; x86 SIMD extension for IJG JPEG library |
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. | 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru. |
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc | 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc |
11 ; | 11 ; |
12 ; This file should be assembled with NASM (Netwide Assembler), | 12 ; This file should be assembled with NASM (Netwide Assembler), |
13 ; and can *not* be assembled with Microsoft's MASM or any compatible | 13 ; and can *not* be assembled with Microsoft's MASM or any compatible |
14 ; assembler (including Borland's Turbo Assembler). | 14 ; assembler (including Borland's Turbo Assembler). |
15 ; NASM is available from http://nasm.sourceforge.net/ or | 15 ; NASM is available from http://nasm.sourceforge.net/ or |
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 | 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208 |
17 ; | 17 ; |
18 ; [TAB8] | 18 ; [TAB8] |
19 | 19 |
20 %include "jsimdext.inc" | 20 %include "jsimdext.inc" |
21 | 21 |
22 ; -------------------------------------------------------------------------- | 22 ; -------------------------------------------------------------------------- |
23 SECTION SEG_CONST | 23 SECTION SEG_CONST |
24 | 24 |
25 alignz 16 | 25 alignz 16 |
26 global EXTN(jconst_fancy_upsample_sse2) | 26 global EXTN(jconst_fancy_upsample_sse2) PRIVATE |
27 | 27 |
28 EXTN(jconst_fancy_upsample_sse2): | 28 EXTN(jconst_fancy_upsample_sse2): |
29 | 29 |
30 PW_ONE times 8 dw 1 | 30 PW_ONE times 8 dw 1 |
31 PW_TWO times 8 dw 2 | 31 PW_TWO times 8 dw 2 |
32 PW_THREE times 8 dw 3 | 32 PW_THREE times 8 dw 3 |
33 PW_SEVEN times 8 dw 7 | 33 PW_SEVEN times 8 dw 7 |
34 PW_EIGHT times 8 dw 8 | 34 PW_EIGHT times 8 dw 8 |
35 | 35 |
36 alignz 16 | 36 alignz 16 |
(...skipping 15 matching lines...) Expand all Loading... |
52 ; JSAMPARRAY input_data, | 52 ; JSAMPARRAY input_data, |
53 ; JSAMPARRAY *output_data_ptr); | 53 ; JSAMPARRAY *output_data_ptr); |
54 ; | 54 ; |
55 | 55 |
56 ; r10 = int max_v_samp_factor | 56 ; r10 = int max_v_samp_factor |
57 ; r11 = JDIMENSION downsampled_width | 57 ; r11 = JDIMENSION downsampled_width |
58 ; r12 = JSAMPARRAY input_data | 58 ; r12 = JSAMPARRAY input_data |
59 ; r13 = JSAMPARRAY *output_data_ptr | 59 ; r13 = JSAMPARRAY *output_data_ptr |
60 | 60 |
61 align 16 | 61 align 16 |
62 global EXTN(jsimd_h2v1_fancy_upsample_sse2) | 62 global EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE |
63 | 63 |
64 EXTN(jsimd_h2v1_fancy_upsample_sse2): | 64 EXTN(jsimd_h2v1_fancy_upsample_sse2): |
65 push rbp | 65 push rbp |
66 mov rax,rsp | 66 mov rax,rsp |
67 mov rbp,rsp | 67 mov rbp,rsp |
68 collect_args | 68 collect_args |
69 | 69 |
70 mov eax, r11d ; colctr | 70 mov eax, r11d ; colctr |
71 test rax,rax | 71 test rax,rax |
72 jz near .return | 72 jz near .return |
(...skipping 121 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
194 | 194 |
195 ; r10 = int max_v_samp_factor | 195 ; r10 = int max_v_samp_factor |
196 ; r11 = JDIMENSION downsampled_width | 196 ; r11 = JDIMENSION downsampled_width |
197 ; r12 = JSAMPARRAY input_data | 197 ; r12 = JSAMPARRAY input_data |
198 ; r13 = JSAMPARRAY *output_data_ptr | 198 ; r13 = JSAMPARRAY *output_data_ptr |
199 | 199 |
200 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] | 200 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] |
201 %define WK_NUM 4 | 201 %define WK_NUM 4 |
202 | 202 |
203 align 16 | 203 align 16 |
204 global EXTN(jsimd_h2v2_fancy_upsample_sse2) | 204 global EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE |
205 | 205 |
206 EXTN(jsimd_h2v2_fancy_upsample_sse2): | 206 EXTN(jsimd_h2v2_fancy_upsample_sse2): |
207 push rbp | 207 push rbp |
208 mov rax,rsp ; rax = original (unaligned) frame pointer, i.e. rsp after push rbp | 208 mov rax,rsp ; rax = original (unaligned) frame pointer, i.e. rsp after push rbp |
209 sub rsp, byte 4 | 209 sub rsp, byte 4 |
210 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits | 210 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits |
211 mov [rsp],rax | 211 mov [rsp],rax |
212 mov rbp,rsp ; rbp = aligned rbp | 212 mov rbp,rsp ; rbp = aligned rbp |
213 lea rsp, [wk(0)] | 213 lea rsp, [wk(0)] |
214 collect_args | 214 collect_args |
(...skipping 276 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
491 ; JSAMPARRAY input_data, | 491 ; JSAMPARRAY input_data, |
492 ; JSAMPARRAY *output_data_ptr); | 492 ; JSAMPARRAY *output_data_ptr); |
493 ; | 493 ; |
494 | 494 |
495 ; r10 = int max_v_samp_factor | 495 ; r10 = int max_v_samp_factor |
496 ; r11 = JDIMENSION output_width | 496 ; r11 = JDIMENSION output_width |
497 ; r12 = JSAMPARRAY input_data | 497 ; r12 = JSAMPARRAY input_data |
498 ; r13 = JSAMPARRAY *output_data_ptr | 498 ; r13 = JSAMPARRAY *output_data_ptr |
499 | 499 |
500 align 16 | 500 align 16 |
501 global EXTN(jsimd_h2v1_upsample_sse2) | 501 global EXTN(jsimd_h2v1_upsample_sse2) PRIVATE |
502 | 502 |
503 EXTN(jsimd_h2v1_upsample_sse2): | 503 EXTN(jsimd_h2v1_upsample_sse2): |
504 push rbp | 504 push rbp |
505 mov rax,rsp | 505 mov rax,rsp |
506 mov rbp,rsp | 506 mov rbp,rsp |
507 collect_args | 507 collect_args |
508 | 508 |
509 mov edx, r11d | 509 mov edx, r11d |
510 add rdx, byte (2*SIZEOF_XMMWORD)-1 | 510 add rdx, byte (2*SIZEOF_XMMWORD)-1 |
511 and rdx, byte -(2*SIZEOF_XMMWORD) | 511 and rdx, byte -(2*SIZEOF_XMMWORD) |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
580 ; JSAMPARRAY input_data, | 580 ; JSAMPARRAY input_data, |
581 ; JSAMPARRAY *output_data_ptr); | 581 ; JSAMPARRAY *output_data_ptr); |
582 ; | 582 ; |
583 | 583 |
584 ; r10 = int max_v_samp_factor | 584 ; r10 = int max_v_samp_factor |
585 ; r11 = JDIMENSION output_width | 585 ; r11 = JDIMENSION output_width |
586 ; r12 = JSAMPARRAY input_data | 586 ; r12 = JSAMPARRAY input_data |
587 ; r13 = JSAMPARRAY *output_data_ptr | 587 ; r13 = JSAMPARRAY *output_data_ptr |
588 | 588 |
589 align 16 | 589 align 16 |
590 global EXTN(jsimd_h2v2_upsample_sse2) | 590 global EXTN(jsimd_h2v2_upsample_sse2) PRIVATE |
591 | 591 |
592 EXTN(jsimd_h2v2_upsample_sse2): | 592 EXTN(jsimd_h2v2_upsample_sse2): |
593 push rbp | 593 push rbp |
594 mov rax,rsp | 594 mov rax,rsp |
595 mov rbp,rsp | 595 mov rbp,rsp |
596 collect_args | 596 collect_args |
597 push rbx | 597 push rbx |
598 | 598 |
599 mov edx, r11d | 599 mov edx, r11d |
600 add rdx, byte (2*SIZEOF_XMMWORD)-1 | 600 add rdx, byte (2*SIZEOF_XMMWORD)-1 |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
662 | 662 |
663 .return: | 663 .return: |
664 pop rbx | 664 pop rbx |
665 uncollect_args | 665 uncollect_args |
666 pop rbp | 666 pop rbp |
667 ret | 667 ret |
668 | 668 |
669 ; For some reason, the OS X linker does not honor the request to align the | 669 ; For some reason, the OS X linker does not honor the request to align the |
670 ; segment unless we do this. | 670 ; segment unless we do this. |
671 align 16 | 671 align 16 |
OLD | NEW |