simd/jcqntsse.asm - Issue 1953443002: Update to libjpeg_turbo 1.4.90

Unified Diff: simd/jcqntsse.asm

Issue 1953443002: Update to libjpeg_turbo 1.4.90 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libjpeg_turbo.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: simd/jcqntsse.asm

diff --git a/simd/jcqntsse.asm b/simd/jcqntsse.asm

deleted file mode 100644

index df7243ee9b8403fcf727a7610fef2f4c4149e8b0..0000000000000000000000000000000000000000

--- a/simd/jcqntsse.asm

+++ /dev/null

@@ -1,211 +0,0 @@

-; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)

-; Based on

-; x86 SIMD extension for IJG JPEG library

-; For conditions of distribution and use, see copyright notice in jsimdext.inc

-; This file should be assembled with NASM (Netwide Assembler),

-; and can *not* be assembled with Microsoft's MASM or any compatible

-; assembler (including Borland's Turbo Assembler).

-; NASM is available from http://nasm.sourceforge.net/ or

-; http://sourceforge.net/project/showfiles.php?group_id=6208

-; [TAB8]

-%include "jsimdext.inc"

-%include "jdct.inc"

-; --------------------------------------------------------------------------

- SECTION SEG_TEXT

- BITS 32

-; Load DCTSIZE rows of 8 samples into workspace as single-precision floats, applying unsigned->signed conversion (0x80 is subtracted from every sample byte)

-; GLOBAL(void)

-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,

-; FAST_FLOAT * workspace);

-%define sample_data ebp+8 ; JSAMPARRAY sample_data (1st stack argument; array of row pointers)

-%define start_col ebp+12 ; JDIMENSION start_col (2nd stack argument; sample offset added to every row pointer)

-%define workspace ebp+16 ; FAST_FLOAT * workspace (3rd stack argument; output buffer, stored to with movaps, which requires 16-byte-aligned addresses)

- align 16

- global EXTN(jsimd_convsamp_float_sse) PRIVATE

-EXTN(jsimd_convsamp_float_sse):

- push ebp

- mov ebp,esp ; ebp = frame pointer; the arguments above are addressed relative to it

- push ebx ; ebx is used as a row pointer below, so it is saved here and restored before ret

-; push ecx ; need not be preserved

-; push edx ; need not be preserved

- push esi

- push edi

- pcmpeqw mm7,mm7 ; every bit of mm7 set (each word = 0xFFFF)

- psllw mm7,7 ; each word = 0xFF80

- packsswb mm7,mm7 ; signed-saturating pack of the 0xFF80 (-128) words gives 0x80 bytes: mm7 = PB_CENTERJSAMPLE (0x808080..)

- mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *)

- mov eax, JDIMENSION [start_col] ; eax = column offset, unchanged for the whole loop

- mov edi, POINTER [workspace] ; (FAST_FLOAT *)

- mov ecx, DCTSIZE/2 ; loop counter: each iteration converts two rows

- alignx 16,7

-.convloop:

- mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)

- mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)

- movq mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; 8 bytes from the first row of the pair

- movq mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE] ; 8 bytes from the second row of the pair

- psubb mm0,mm7 ; mm0=(01234567)

- psubb mm1,mm7 ; mm1=(89ABCDEF)

- punpcklbw mm2,mm0 ; mm2=(*0*1*2*3) -- '*' is a don't-care byte taken from the old destination; each sample ends up in the high byte of its word

- punpckhbw mm0,mm0 ; mm0=(*4*5*6*7)

- punpcklbw mm3,mm1 ; mm3=(*8*9*A*B)

- punpckhbw mm1,mm1 ; mm1=(*C*D*E*F)

- punpcklwd mm4,mm2 ; mm4=(***0***1) -- each sample is now the top byte of a dword

- punpckhwd mm2,mm2 ; mm2=(***2***3)

- punpcklwd mm5,mm0 ; mm5=(***4***5)

- punpckhwd mm0,mm0 ; mm0=(***6***7)

- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(01) -- arithmetic right shift discards the don't-care bytes and sign-extends the sample

- psrad mm2,(DWORD_BIT-BYTE_BIT) ; mm2=(23)

- cvtpi2ps xmm0,mm4 ; xmm0=(01**) -- two dwords converted to floats in the low half; high half not written

- cvtpi2ps xmm1,mm2 ; xmm1=(23**)

- psrad mm5,(DWORD_BIT-BYTE_BIT) ; mm5=(45)

- psrad mm0,(DWORD_BIT-BYTE_BIT) ; mm0=(67)

- cvtpi2ps xmm2,mm5 ; xmm2=(45**)

- cvtpi2ps xmm3,mm0 ; xmm3=(67**)

- punpcklwd mm6,mm3 ; mm6=(***8***9)

- punpckhwd mm3,mm3 ; mm3=(***A***B)

- punpcklwd mm4,mm1 ; mm4=(***C***D) -- mm4 is free again: its earlier contents were already converted into xmm0

- punpckhwd mm1,mm1 ; mm1=(***E***F)

- psrad mm6,(DWORD_BIT-BYTE_BIT) ; mm6=(89)

- psrad mm3,(DWORD_BIT-BYTE_BIT) ; mm3=(AB)

- cvtpi2ps xmm4,mm6 ; xmm4=(89**)

- cvtpi2ps xmm5,mm3 ; xmm5=(AB**)

- psrad mm4,(DWORD_BIT-BYTE_BIT) ; mm4=(CD)

- psrad mm1,(DWORD_BIT-BYTE_BIT) ; mm1=(EF)

- cvtpi2ps xmm6,mm4 ; xmm6=(CD**)

- cvtpi2ps xmm7,mm1 ; xmm7=(EF**)

- movlhps xmm0,xmm1 ; xmm0=(0123) -- low half of xmm1 copied into the high half of xmm0

- movlhps xmm2,xmm3 ; xmm2=(4567)

- movlhps xmm4,xmm5 ; xmm4=(89AB)

- movlhps xmm6,xmm7 ; xmm6=(CDEF)

- movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 ; first row, samples 0-3 (XMMBLOCK is an addressing macro defined outside this file)

- movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2 ; first row, samples 4-7

- movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4 ; second row, samples 0-3

- movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 ; second row, samples 4-7

- add esi, byte 2*SIZEOF_JSAMPROW ; step past the two row pointers just consumed

- add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT ; step the output past two rows of DCTSIZE floats

- dec ecx

- jnz near .convloop ; repeat until all DCTSIZE/2 row pairs are converted

- emms ; empty MMX state

- pop edi

- pop esi

-; pop edx ; need not be preserved

-; pop ecx ; need not be preserved

- pop ebx

- pop ebp

- ret

-; --------------------------------------------------------------------------

-; Quantize/descale the coefficients, and store into coef_block: each workspace float is multiplied by the matching divisors entry, converted to a 32-bit integer, packed to a 16-bit word with signed saturation, and stored

-; GLOBAL(void)

-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,

-; FAST_FLOAT * workspace);

-%define coef_block ebp+8 ; JCOEFPTR coef_block (1st stack argument; output)

-%define divisors ebp+12 ; FAST_FLOAT * divisors (2nd stack argument; used as multipliers below, so presumably holds reciprocal values -- confirm against the caller)

-%define workspace ebp+16 ; FAST_FLOAT * workspace (3rd stack argument; input, loaded with movaps, which requires 16-byte-aligned addresses)

- align 16

- global EXTN(jsimd_quantize_float_sse) PRIVATE

-EXTN(jsimd_quantize_float_sse):

- push ebp

- mov ebp,esp ; ebp = frame pointer; the arguments above are addressed relative to it

-; push ebx ; unused

-; push ecx ; unused

-; push edx ; need not be preserved

- push esi

- push edi

- mov esi, POINTER [workspace] ; esi = input floats

- mov edx, POINTER [divisors] ; edx = per-coefficient multipliers

- mov edi, JCOEFPTR [coef_block] ; edi = output coefficients

- mov eax, DCTSIZE2/16 ; loop counter: 16 coefficients are processed per iteration

- alignx 16,7

-.quantloop:

- movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] ; four floats (XMMBLOCK is an addressing macro defined outside this file)

- movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]

- mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] ; element-wise multiply by the matching divisors entries

- mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]

- movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]

- movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

- mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]

- mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

- movhlps xmm4,xmm0 ; low half of xmm4 = high two floats of xmm0 (cvtps2pi only reads the low half)

- movhlps xmm5,xmm1

- cvtps2pi mm0,xmm0 ; low two floats -> two 32-bit integers (rounding follows the current MXCSR mode)

- cvtps2pi mm1,xmm1

- cvtps2pi mm4,xmm4 ; high two floats of the original xmm0

- cvtps2pi mm5,xmm5

- movhlps xmm6,xmm2

- movhlps xmm7,xmm3

- cvtps2pi mm2,xmm2

- cvtps2pi mm3,xmm3

- cvtps2pi mm6,xmm6

- cvtps2pi mm7,xmm7

- packssdw mm0,mm4 ; four dwords -> four words with signed saturation, in element order 0,1,2,3

- packssdw mm1,mm5

- packssdw mm2,mm6

- packssdw mm3,mm7

- movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0 ; store four packed coefficients (MMBLOCK is an addressing macro defined outside this file)

- movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1

- movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2

- movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

- add esi, byte 16*SIZEOF_FAST_FLOAT ; advance input by the 16 floats just consumed

- add edx, byte 16*SIZEOF_FAST_FLOAT ; advance multipliers in step with the input

- add edi, byte 16*SIZEOF_JCOEF ; advance output by the 16 coefficients just written

- dec eax

- jnz short .quantloop ; repeat until all DCTSIZE2 coefficients are done

- emms ; empty MMX state

- pop edi

- pop esi

-; pop edx ; need not be preserved

-; pop ecx ; unused

-; pop ebx ; unused

- pop ebp

- ret

-; For some reason, the OS X linker does not honor the request to align the

-; segment unless we do this (an explicit alignment directive at the very end of the file).

- align 16 ; pad the end of the code to a 16-byte boundary

« no previous file with comments | « simd/jcqnts2i-64.asm ('k') | simd/jcsammmx.asm » ('j') | no next file with comments »