simd/jquantf-sse2.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Side by Side Diff: simd/jquantf-sse2.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 ;

	2 ; jquantf.asm - sample data conversion and quantization (SSE & SSE2)

	3 ;

	4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

	5 ;

	6 ; Based on

	7 ; x86 SIMD extension for IJG JPEG library

	8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

	9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

	10 ;

	11 ; This file should be assembled with NASM (Netwide Assembler),

	12 ; can not be assembled with Microsoft's MASM or any compatible

	13 ; assembler (including Borland's Turbo Assembler).

	14 ; NASM is available from http://nasm.sourceforge.net/ or

	15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

	16 ;

	17 ; [TAB8]

	18

	19 %include "jsimdext.inc"

	20 %include "jdct.inc"

	21

	22 ; --------------------------------------------------------------------------

	23 SECTION SEG_TEXT

	24 BITS 32

	25 ;

	26 ; Load data into workspace, applying unsigned->signed conversion

	27 ;

	28 ; GLOBAL(void)

	29 ; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,

	30 ; FAST_FLOAT *workspace);

	31 ;

	32

	33 %define sample_data ebp+8 ; JSAMPARRAY sample_data

	34 %define start_col ebp+12 ; JDIMENSION start_col

	35 %define workspace ebp+16 ; FAST_FLOAT *workspace

	36

	37 align 16

	38 global EXTN(jsimd_convsamp_float_sse2)

	39

	40 EXTN(jsimd_convsamp_float_sse2): ; load 8-sample rows, recentre around zero, store as floats in workspace

	41 push ebp

	42 mov ebp,esp ; stack arguments are addressed relative to ebp (see %defines above)

	43 push ebx

	44 ; push ecx ; need not be preserved

	45 ; push edx ; need not be preserved

	46 push esi

	47 push edi

	48

	49 pcmpeqw xmm7,xmm7 ; xmm7 = all ones (every word 0xFFFF)

	50 psllw xmm7,7 ; every word = 0xFF80 (-128)

	51 packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

	52

	53 mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)

	54 mov eax, JDIMENSION [start_col]

	55 mov edi, POINTER [workspace] ; (FAST_FLOAT *)

	56 mov ecx, DCTSIZE/2 ; loop counter: each iteration converts two sample rows

	57 alignx 16,7

	58 .convloop:

	59 mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	60 mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

	61

	62 movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 bytes of the first row, starting at start_col

	63 movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 bytes of the second row, starting at start_col

	64

	65 psubb xmm0,xmm7 ; xmm0=(01234567)

	66 psubb xmm1,xmm7 ; xmm1=(89ABCDEF)

	67

	68 punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7)

	69 punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F)

	70

	71 punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3)

	72 punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7)

	73 punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B)

	74 punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F)

	75

	76 psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123)

	77 psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567)

	78 cvtdq2ps xmm2,xmm2 ; xmm2=(0123)

	79 cvtdq2ps xmm0,xmm0 ; xmm0=(4567)

	80 psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB)

	81 psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF)

	82 cvtdq2ps xmm3,xmm3 ; xmm3=(89AB)

	83 cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF)

	84

	85 movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2

	86 movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0

	87 movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3

	88 movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

	89

	90 add esi, byte 2*SIZEOF_JSAMPROW

	91 add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT

	92 dec ecx

	93 jnz short .convloop

	94

	95 pop edi

	96 pop esi

	97 ; pop edx ; need not be preserved

	98 ; pop ecx ; need not be preserved

	99 pop ebx

	100 pop ebp

	101 ret

	102

	103

	104 ; --------------------------------------------------------------------------

	105 ;

	106 ; Quantize/descale the coefficients, and store into coef_block

	107 ;

	108 ; GLOBAL(void)

	109 ; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,

	110 ; FAST_FLOAT *workspace);

	111 ;

	112

	113 %define coef_block ebp+8 ; JCOEFPTR coef_block

	114 %define divisors ebp+12 ; FAST_FLOAT *divisors

	115 %define workspace ebp+16 ; FAST_FLOAT *workspace

	116

	117 align 16

	118 global EXTN(jsimd_quantize_float_sse2)

	119

	120 EXTN(jsimd_quantize_float_sse2): ; multiply workspace by divisors, convert to integers, store into coef_block

	121 push ebp

	122 mov ebp,esp ; stack arguments are addressed relative to ebp (see %defines above)

	123 ; push ebx ; unused

	124 ; push ecx ; unused

	125 ; push edx ; need not be preserved

	126 push esi

	127 push edi

	128

	129 mov esi, POINTER [workspace] ; esi = input floats

	130 mov edx, POINTER [divisors] ; edx = per-coefficient multipliers (NOTE(review): presumably reciprocals of the quantization steps; confirm against the caller)

	131 mov edi, JCOEFPTR [coef_block] ; edi = output coefficients

	132 mov eax, DCTSIZE2/16 ; loop counter: 16 coefficients are handled per iteration

	133 alignx 16,7 ; alignx is defined outside this file; presumably pads so the loop label is 16-byte aligned

	134 .quantloop:

	135 movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; movaps faults unless the address is 16-byte aligned

	136 movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]

	137 mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; element-wise workspace*divisors; the memory operand must also be 16-byte aligned

	138 mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]

	139 movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]

	140 movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

	141 mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]

	142 mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

	143

	144 cvtps2dq xmm0,xmm0 ; 4 floats -> 4 signed dwords, rounded per the current MXCSR rounding mode

	145 cvtps2dq xmm1,xmm1

	146 cvtps2dq xmm2,xmm2

	147 cvtps2dq xmm3,xmm3

	148

	149 packssdw xmm0,xmm1 ; 8 dwords -> 8 words with signed saturation (xmm0 in the low half, xmm1 in the high half)

	150 packssdw xmm2,xmm3

	151

	152 movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 ; aligned store of 8 coefficients

	153 movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 ; aligned store of the next 8 coefficients

	154

	155 add esi, byte 16*SIZEOF_FAST_FLOAT ; advance all three pointers by the 16 elements just processed

	156 add edx, byte 16*SIZEOF_FAST_FLOAT

	157 add edi, byte 16*SIZEOF_JCOEF

	158 dec eax

	159 jnz short .quantloop ; repeat until all DCTSIZE2/16 groups are done

	160

	161 pop edi

	162 pop esi

	163 ; pop edx ; need not be preserved

	164 ; pop ecx ; unused

	165 ; pop ebx ; unused

	166 pop ebp

	167 ret

	168

	169 ; For some reason, the OS X linker does not honor the request to align the

	170 ; segment unless we do this.

	171 align 16

OLD	NEW

« no previous file with comments | « simd/jquant-sse.asm ('k') | simd/jquantf-sse2-64.asm » ('j') | no next file with comments »