simd/jquanti-sse2-64.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jquanti-sse2-64.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jquanti.asm - sample data conversion and quantization (64-bit SSE2)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ; Copyright 2009 D. R. Commander

	6 ;

	7 ; Based on

	8 ; x86 SIMD extension for IJG JPEG library

	9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	11 ;

	12 ; This file should be assembled with NASM (Netwide Assembler); it

	13 ; cannot be assembled with Microsoft's MASM or any compatible

	14 ; assembler (including Borland's Turbo Assembler).

	15 ; NASM is available from http://nasm.sourceforge.net/ or

	16 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	17 ;

	18 ; [TAB8]

	19

	20 %include "jsimdext.inc"

	21 %include "jdct.inc"

	22

	23 ; --------------------------------------------------------------------------

	24 SECTION SEG_TEXT

	25 BITS 64

	26 ;

	27 ; Load data into workspace, applying unsigned->signed conversion

	28 ;

	29 ; GLOBAL(void)

	30 ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,

	31 ; DCTELEM *workspace);

	32 ;

	33

	34 ; r10 = JSAMPARRAY sample_data

	35 ; r11 = JDIMENSION start_col

	36 ; r12 = DCTELEM *workspace

	37

	38 align 16

	39 global EXTN(jsimd_convsamp_sse2)

	40

	41 EXTN(jsimd_convsamp_sse2):
	; Loads sample rows into the DCT workspace as signed 16-bit words.
	; Each loop iteration reads 8 bytes (starting at column start_col) from
	; 4 consecutive row pointers, zero-extends every byte to a word, adds
	; 0xFF80 (i.e. subtracts 128 in 16-bit arithmetic) and stores the 4
	; resulting XMM words to workspace.  DCTSIZE/4 iterations are executed.
	;
	; Register roles inside the loop:
	;   rsi      = cursor into sample_data (array of row pointers)
	;   rax      = start_col, zero-extended from r11d
	;   rdi      = cursor into workspace (stored with movdqa, so it must be
	;              16-byte aligned)
	;   rcx      = iterations remaining
	;   rbx, rdx = the two row pointers currently being read
	;   xmm6     = all zeros (for byte->word unpacking)
	;   xmm7     = 0xFF80 in every word (the -128 bias)

	42 push rbp

	43 mov rax,rsp ; rax = rsp at this point; presumably consumed by collect_args (jsimdext.inc) -- verify

	44 mov rbp,rsp

	45 collect_args

	46 push rbx ; rbx is used as a scratch row pointer below

	47

	48 pxor xmm6,xmm6 ; xmm6=(all 0's)

	49 pcmpeqw xmm7,xmm7 ; xmm7=(all 1's)

	50 psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

	51

	52 mov rsi, r10 ; rsi = sample_data

	53 mov eax, r11d ; rax = start_col (32-bit write zero-extends into rax)

	54 mov rdi, r12 ; rdi = workspace

	55 mov rcx, DCTSIZE/4 ; 4 rows are converted per iteration

	56 .convloop:

	57 mov rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	58 mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	59

	60 movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567), upper 8 bytes zeroed by movq

	61 movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF), upper 8 bytes zeroed by movq

	62

	63 mov rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	64 mov rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	65

	66 movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN), upper 8 bytes zeroed by movq

	67 movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV), upper 8 bytes zeroed by movq

	68

	69 punpcklbw xmm0,xmm6 ; xmm0=(01234567)

	70 punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF)

	71 paddw xmm0,xmm7 ; sample - 128

	72 paddw xmm1,xmm7

	73 punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN)

	74 punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV)

	75 paddw xmm2,xmm7

	76 paddw xmm3,xmm7

	77

	78 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0

	79 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1

	80 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2

	81 movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

	82

	83 add rsi, byte 4*SIZEOF_JSAMPROW ; advance past the 4 row pointers just used

	84 add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM ; advance past the 4 workspace rows just written

	85 dec rcx

	86 jnz short .convloop

	87

	88 pop rbx

	89 uncollect_args

	90 pop rbp

	91 ret

	92

	93 ; --------------------------------------------------------------------------

	94 ;

	95 ; Quantize/descale the coefficients, and store into coef_block

	96 ;

	97 ; This implementation is based on an algorithm described in

	98 ; "How to optimize for the Pentium family of microprocessors"

	99 ; (http://www.agner.org/assem/).

	100 ;

	101 ; GLOBAL(void)

	102 ; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,

	103 ; DCTELEM *workspace);

	104 ;

	105

	106 %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)

	107 %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)

	108 %define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

	109

	110 ; r10 = JCOEFPTR coef_block

	111 ; r11 = DCTELEM *divisors

	112 ; r12 = DCTELEM *workspace

	113

	114 align 16

	115 global EXTN(jsimd_quantize_sse2)

	116

	117 EXTN(jsimd_quantize_sse2):
	; Quantizes the coefficients in workspace and stores them to coef_block.
	; Per 16-bit element the loop computes:
	;   t = |x| + correction
	;   t = high word of (t * reciprocal)   (unsigned multiply)
	;   t = high word of (t * scale)        (unsigned multiply)
	;   result = t with the sign of x restored
	; Each iteration handles 4 XMM words (rows 0..3 of XMMBLOCK) and then
	; advances all three pointers by 32 elements; DCTSIZE2/32 iterations run.
	;
	; Register roles inside the loop:
	;   rsi = cursor into workspace (input)
	;   rdx = cursor into divisors (RECIPROCAL / CORRECTION / SCALE tables)
	;   rdi = cursor into coef_block (output)
	;   rax = iterations remaining
	;   xmm0-xmm3 = values being processed, xmm4-xmm7 = their sign masks
	; All memory operands are accessed as aligned XMMWORDs (movdqa or
	; SSE memory operands), so the three buffers must be 16-byte aligned.

	118 push rbp

	119 mov rax,rsp ; rax = rsp at this point; presumably consumed by collect_args (jsimdext.inc) -- verify

	120 mov rbp,rsp

	121 collect_args

	122

	123 mov rsi, r12 ; rsi = workspace

	124 mov rdx, r11 ; rdx = divisors

	125 mov rdi, r10 ; rdi = coef_block

	126 mov rax, DCTSIZE2/32 ; 32 elements are quantized per iteration

	127 .quantloop:

	128 movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]

	129 movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]

	130 movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]

	131 movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]

	132 movdqa xmm0,xmm4 ; keep a working copy; xmm4-xmm7 become sign masks

	133 movdqa xmm1,xmm5

	134 movdqa xmm2,xmm6

	135 movdqa xmm3,xmm7

	136 psraw xmm4,(WORD_BIT-1) ; arithmetic shift replicates the sign bit (assuming WORD_BIT is the word width, 16)

	137 psraw xmm5,(WORD_BIT-1)

	138 psraw xmm6,(WORD_BIT-1)

	139 psraw xmm7,(WORD_BIT-1)

	140 pxor xmm0,xmm4 ; (x ^ mask) - mask negates x where mask is all 1's

	141 pxor xmm1,xmm5

	142 pxor xmm2,xmm6

	143 pxor xmm3,xmm7

	144 psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;

	145 psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;

	146 psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;

	147 psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;

	148

	149 paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor

	150 paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]

	151 paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]

	152 paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]

	153 pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal

	154 pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]

	155 pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]

	156 pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]

	157 pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale

	158 pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]

	159 pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]

	160 pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]

	161

	162 pxor xmm0,xmm4 ; reapply the saved sign masks: negate results whose input was negative

	163 pxor xmm1,xmm5

	164 pxor xmm2,xmm6

	165 pxor xmm3,xmm7

	166 psubw xmm0,xmm4

	167 psubw xmm1,xmm5

	168 psubw xmm2,xmm6

	169 psubw xmm3,xmm7

	170 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0

	171 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1

	172 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2

	173 movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

	174

	175 add rsi, byte 32*SIZEOF_DCTELEM ; next 32 workspace elements

	176 add rdx, byte 32*SIZEOF_DCTELEM ; next 32 entries of each divisor table

	177 add rdi, byte 32*SIZEOF_JCOEF ; next 32 output coefficients

	178 dec rax

	179 jnz near .quantloop

	180

	181 uncollect_args

	182 pop rbp

	183 ret

	184

	185 ; For some reason, the OS X linker does not honor the request to align the

	186 ; segment unless we do this.

	187 align 16

OLD	NEW

« no previous file with comments | « simd/jquanti-sse2.asm ('k') | simd/jsimd.h » ('j') | no next file with comments »