| Index: jcdctmgr.c
 | 
| diff --git a/jcdctmgr.c b/jcdctmgr.c
 | 
| index 3234a01aa14ca91421f133b2f6ed62881e85aca9..aef8517f9c5c292e713cfdb61e6cae55b2ee7223 100644
 | 
| --- a/jcdctmgr.c
 | 
| +++ b/jcdctmgr.c
 | 
| @@ -6,8 +6,9 @@
 | 
|   * libjpeg-turbo Modifications:
 | 
|   * Copyright (C) 1999-2006, MIYASAKA Masaru.
 | 
|   * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 | 
| - * Copyright (C) 2011 D. R. Commander
 | 
| - * For conditions of distribution and use, see the accompanying README file.
 | 
| + * Copyright (C) 2011, 2014-2015, D. R. Commander.
 | 
| + * For conditions of distribution and use, see the accompanying README.ijg
 | 
| + * file.
 | 
|   *
 | 
|   * This file contains the forward-DCT management logic.
 | 
|   * This code selects a particular DCT implementation to be used,
 | 
| @@ -18,33 +19,32 @@
 | 
|  #define JPEG_INTERNALS
 | 
|  #include "jinclude.h"
 | 
|  #include "jpeglib.h"
 | 
| -#include "jdct.h"		/* Private declarations for DCT subsystem */
 | 
| +#include "jdct.h"               /* Private declarations for DCT subsystem */
 | 
|  #include "jsimddct.h"
 | 
|  
 | 
|  
 | 
|  /* Private subobject for this module */
 | 
|  
 | 
| -typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
 | 
| -typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
 | 
| +typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
 | 
| +typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
 | 
|  
 | 
| -typedef JMETHOD(void, convsamp_method_ptr,
 | 
| -                (JSAMPARRAY sample_data, JDIMENSION start_col,
 | 
| -                 DCTELEM * workspace));
 | 
| -typedef JMETHOD(void, float_convsamp_method_ptr,
 | 
| -                (JSAMPARRAY sample_data, JDIMENSION start_col,
 | 
| -                 FAST_FLOAT *workspace));
 | 
| +typedef void (*convsamp_method_ptr) (JSAMPARRAY sample_data,
 | 
| +                                     JDIMENSION start_col,
 | 
| +                                     DCTELEM *workspace);
 | 
| +typedef void (*float_convsamp_method_ptr) (JSAMPARRAY sample_data,
 | 
| +                                           JDIMENSION start_col,
 | 
| +                                           FAST_FLOAT *workspace);
 | 
|  
 | 
| -typedef JMETHOD(void, quantize_method_ptr,
 | 
| -                (JCOEFPTR coef_block, DCTELEM * divisors,
 | 
| -                 DCTELEM * workspace));
 | 
| -typedef JMETHOD(void, float_quantize_method_ptr,
 | 
| -                (JCOEFPTR coef_block, FAST_FLOAT * divisors,
 | 
| -                 FAST_FLOAT * workspace));
 | 
| +typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
 | 
| +                                     DCTELEM *workspace);
 | 
| +typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
 | 
| +                                           FAST_FLOAT *divisors,
 | 
| +                                           FAST_FLOAT *workspace);
 | 
|  
 | 
|  METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
 | 
|  
 | 
|  typedef struct {
 | 
| -  struct jpeg_forward_dct pub;	/* public fields */
 | 
| +  struct jpeg_forward_dct pub;  /* public fields */
 | 
|  
 | 
|    /* Pointer to the DCT routine actually in use */
 | 
|    forward_DCT_method_ptr dct;
 | 
| @@ -55,27 +55,30 @@ typedef struct {
 | 
|     * entries, because of scaling (especially for an unnormalized DCT).
 | 
|     * Each table is given in normal array order.
 | 
|     */
 | 
| -  DCTELEM * divisors[NUM_QUANT_TBLS];
 | 
| +  DCTELEM *divisors[NUM_QUANT_TBLS];
 | 
|  
 | 
|    /* work area for FDCT subroutine */
 | 
| -  DCTELEM * workspace;
 | 
| +  DCTELEM *workspace;
 | 
|  
 | 
|  #ifdef DCT_FLOAT_SUPPORTED
 | 
|    /* Same as above for the floating-point case. */
 | 
|    float_DCT_method_ptr float_dct;
 | 
|    float_convsamp_method_ptr float_convsamp;
 | 
|    float_quantize_method_ptr float_quantize;
 | 
| -  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
 | 
| -  FAST_FLOAT * float_workspace;
 | 
| +  FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
 | 
| +  FAST_FLOAT *float_workspace;
 | 
|  #endif
 | 
|  } my_fdct_controller;
 | 
|  
 | 
| -typedef my_fdct_controller * my_fdct_ptr;
 | 
| +typedef my_fdct_controller *my_fdct_ptr;
 | 
|  
 | 
|  
 | 
| +#if BITS_IN_JSAMPLE == 8
 | 
| +
 | 
|  /*
 | 
|   * Find the highest bit in an integer through binary search.
 | 
|   */
 | 
| +
 | 
|  LOCAL(int)
 | 
|  flss (UINT16 val)
 | 
|  {
 | 
| @@ -106,6 +109,7 @@ flss (UINT16 val)
 | 
|    return bit;
 | 
|  }
 | 
|  
 | 
| +
 | 
|  /*
 | 
|   * Compute values to do a division using reciprocal.
 | 
|   *
 | 
| @@ -147,7 +151,7 @@ flss (UINT16 val)
 | 
|   *
 | 
|   * In order to allow SIMD implementations we also tweak the values to
 | 
|   * allow the same calculation to be made at all times:
 | 
| - * 
 | 
| + *
 | 
|   *   dctbl[0] = f rounded to nearest integer
 | 
|   *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
 | 
|   *   dctbl[2] = 1 << ((word size) * 2 - r)
 | 
| @@ -164,13 +168,27 @@ flss (UINT16 val)
 | 
|   * of in a consecutive manner, yet again in order to allow SIMD
 | 
|   * routines.
 | 
|   */
 | 
| +
 | 
|  LOCAL(int)
 | 
| -compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
 | 
| +compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
 | 
|  {
 | 
|    UDCTELEM2 fq, fr;
 | 
|    UDCTELEM c;
 | 
|    int b, r;
 | 
|  
 | 
| +  if (divisor == 1) {
 | 
| +    /* divisor == 1 means unquantized, so these reciprocal/correction/shift
 | 
| +     * values will cause the C quantization algorithm to act like the
 | 
| +     * identity function.  Since only the C quantization algorithm is used in
 | 
| +     * these cases, the scale value is irrelevant.
 | 
| +     */
 | 
| +    dtbl[DCTSIZE2 * 0] = (DCTELEM) 1;                       /* reciprocal */
 | 
| +    dtbl[DCTSIZE2 * 1] = (DCTELEM) 0;                       /* correction */
 | 
| +    dtbl[DCTSIZE2 * 2] = (DCTELEM) 1;                       /* scale */
 | 
| +    dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8);  /* shift */
 | 
| +    return 0;
 | 
| +  }
 | 
| +
 | 
|    b = flss(divisor) - 1;
 | 
|    r  = sizeof(DCTELEM) * 8 + b;
 | 
|  
 | 
| @@ -191,13 +209,20 @@ compute_reciprocal (UINT16 divisor, DCTELEM * dtbl)
 | 
|  
 | 
|    dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
 | 
|    dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
 | 
| +#ifdef WITH_SIMD
 | 
|    dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
 | 
| +#else
 | 
| +  dtbl[DCTSIZE2 * 2] = 1;
 | 
| +#endif
 | 
|    dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
 | 
|  
 | 
|    if(r <= 16) return 0;
 | 
|    else return 1;
 | 
|  }
 | 
|  
 | 
| +#endif
 | 
| +
 | 
| +
 | 
|  /*
 | 
|   * Initialize for a processing pass.
 | 
|   * Verify that all referenced Q-tables are present, and set up
 | 
| @@ -213,15 +238,15 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 | 
|    my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
 | 
|    int ci, qtblno, i;
 | 
|    jpeg_component_info *compptr;
 | 
| -  JQUANT_TBL * qtbl;
 | 
| -  DCTELEM * dtbl;
 | 
| +  JQUANT_TBL *qtbl;
 | 
| +  DCTELEM *dtbl;
 | 
|  
 | 
|    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
 | 
|         ci++, compptr++) {
 | 
|      qtblno = compptr->quant_tbl_no;
 | 
|      /* Make sure specified quantization table is present */
 | 
|      if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
 | 
| -	cinfo->quant_tbl_ptrs[qtblno] == NULL)
 | 
| +        cinfo->quant_tbl_ptrs[qtblno] == NULL)
 | 
|        ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
 | 
|      qtbl = cinfo->quant_tbl_ptrs[qtblno];
 | 
|      /* Compute divisors for this quant table */
 | 
| @@ -233,91 +258,102 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 | 
|         * coefficients multiplied by 8 (to counteract scaling).
 | 
|         */
 | 
|        if (fdct->divisors[qtblno] == NULL) {
 | 
| -	fdct->divisors[qtblno] = (DCTELEM *)
 | 
| -	  (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| -				      (DCTSIZE2 * 4) * SIZEOF(DCTELEM));
 | 
| +        fdct->divisors[qtblno] = (DCTELEM *)
 | 
| +          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| +                                      (DCTSIZE2 * 4) * sizeof(DCTELEM));
 | 
|        }
 | 
|        dtbl = fdct->divisors[qtblno];
 | 
|        for (i = 0; i < DCTSIZE2; i++) {
 | 
| -	if(!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i])
 | 
| -	  && fdct->quantize == jsimd_quantize)
 | 
| -	  fdct->quantize = quantize;
 | 
| +#if BITS_IN_JSAMPLE == 8
 | 
| +        if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
 | 
| +            fdct->quantize == jsimd_quantize)
 | 
| +          fdct->quantize = quantize;
 | 
| +#else
 | 
| +        dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
 | 
| +#endif
 | 
|        }
 | 
|        break;
 | 
|  #endif
 | 
|  #ifdef DCT_IFAST_SUPPORTED
 | 
|      case JDCT_IFAST:
 | 
|        {
 | 
| -	/* For AA&N IDCT method, divisors are equal to quantization
 | 
| -	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
 | 
| -	 *   scalefactor[0] = 1
 | 
| -	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
 | 
| -	 * We apply a further scale factor of 8.
 | 
| -	 */
 | 
| +        /* For AA&N IDCT method, divisors are equal to quantization
 | 
| +         * coefficients scaled by scalefactor[row]*scalefactor[col], where
 | 
| +         *   scalefactor[0] = 1
 | 
| +         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
 | 
| +         * We apply a further scale factor of 8.
 | 
| +         */
 | 
|  #define CONST_BITS 14
 | 
| -	static const INT16 aanscales[DCTSIZE2] = {
 | 
| -	  /* precomputed values scaled up by 14 bits */
 | 
| -	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
 | 
| -	  22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
 | 
| -	  21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
 | 
| -	  19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
 | 
| -	  16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
 | 
| -	  12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
 | 
| -	   8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
 | 
| -	   4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
 | 
| -	};
 | 
| -	SHIFT_TEMPS
 | 
| -
 | 
| -	if (fdct->divisors[qtblno] == NULL) {
 | 
| -	  fdct->divisors[qtblno] = (DCTELEM *)
 | 
| -	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| -					(DCTSIZE2 * 4) * SIZEOF(DCTELEM));
 | 
| -	}
 | 
| -	dtbl = fdct->divisors[qtblno];
 | 
| -	for (i = 0; i < DCTSIZE2; i++) {
 | 
| -	  if(!compute_reciprocal(
 | 
| -	    DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
 | 
| -				  (INT32) aanscales[i]),
 | 
| -		    CONST_BITS-3), &dtbl[i])
 | 
| -	    && fdct->quantize == jsimd_quantize)
 | 
| -	    fdct->quantize = quantize;
 | 
| -	}
 | 
| +        static const INT16 aanscales[DCTSIZE2] = {
 | 
| +          /* precomputed values scaled up by 14 bits */
 | 
| +          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
 | 
| +          22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
 | 
| +          21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
 | 
| +          19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
 | 
| +          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
 | 
| +          12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
 | 
| +           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
 | 
| +           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
 | 
| +        };
 | 
| +        SHIFT_TEMPS
 | 
| +
 | 
| +        if (fdct->divisors[qtblno] == NULL) {
 | 
| +          fdct->divisors[qtblno] = (DCTELEM *)
 | 
| +            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| +                                        (DCTSIZE2 * 4) * sizeof(DCTELEM));
 | 
| +        }
 | 
| +        dtbl = fdct->divisors[qtblno];
 | 
| +        for (i = 0; i < DCTSIZE2; i++) {
 | 
| +#if BITS_IN_JSAMPLE == 8
 | 
| +          if (!compute_reciprocal(
 | 
| +                DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
 | 
| +                                      (JLONG) aanscales[i]),
 | 
| +                        CONST_BITS-3), &dtbl[i]) &&
 | 
| +              fdct->quantize == jsimd_quantize)
 | 
| +            fdct->quantize = quantize;
 | 
| +#else
 | 
| +           dtbl[i] = (DCTELEM)
 | 
| +             DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
 | 
| +                                   (JLONG) aanscales[i]),
 | 
| +                     CONST_BITS-3);
 | 
| +#endif
 | 
| +        }
 | 
|        }
 | 
|        break;
 | 
|  #endif
 | 
|  #ifdef DCT_FLOAT_SUPPORTED
 | 
|      case JDCT_FLOAT:
 | 
|        {
 | 
| -	/* For float AA&N IDCT method, divisors are equal to quantization
 | 
| -	 * coefficients scaled by scalefactor[row]*scalefactor[col], where
 | 
| -	 *   scalefactor[0] = 1
 | 
| -	 *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
 | 
| -	 * We apply a further scale factor of 8.
 | 
| -	 * What's actually stored is 1/divisor so that the inner loop can
 | 
| -	 * use a multiplication rather than a division.
 | 
| -	 */
 | 
| -	FAST_FLOAT * fdtbl;
 | 
| -	int row, col;
 | 
| -	static const double aanscalefactor[DCTSIZE] = {
 | 
| -	  1.0, 1.387039845, 1.306562965, 1.175875602,
 | 
| -	  1.0, 0.785694958, 0.541196100, 0.275899379
 | 
| -	};
 | 
| -
 | 
| -	if (fdct->float_divisors[qtblno] == NULL) {
 | 
| -	  fdct->float_divisors[qtblno] = (FAST_FLOAT *)
 | 
| -	    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| -					DCTSIZE2 * SIZEOF(FAST_FLOAT));
 | 
| -	}
 | 
| -	fdtbl = fdct->float_divisors[qtblno];
 | 
| -	i = 0;
 | 
| -	for (row = 0; row < DCTSIZE; row++) {
 | 
| -	  for (col = 0; col < DCTSIZE; col++) {
 | 
| -	    fdtbl[i] = (FAST_FLOAT)
 | 
| -	      (1.0 / (((double) qtbl->quantval[i] *
 | 
| -		       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
 | 
| -	    i++;
 | 
| -	  }
 | 
| -	}
 | 
| +        /* For float AA&N IDCT method, divisors are equal to quantization
 | 
| +         * coefficients scaled by scalefactor[row]*scalefactor[col], where
 | 
| +         *   scalefactor[0] = 1
 | 
| +         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
 | 
| +         * We apply a further scale factor of 8.
 | 
| +         * What's actually stored is 1/divisor so that the inner loop can
 | 
| +         * use a multiplication rather than a division.
 | 
| +         */
 | 
| +        FAST_FLOAT *fdtbl;
 | 
| +        int row, col;
 | 
| +        static const double aanscalefactor[DCTSIZE] = {
 | 
| +          1.0, 1.387039845, 1.306562965, 1.175875602,
 | 
| +          1.0, 0.785694958, 0.541196100, 0.275899379
 | 
| +        };
 | 
| +
 | 
| +        if (fdct->float_divisors[qtblno] == NULL) {
 | 
| +          fdct->float_divisors[qtblno] = (FAST_FLOAT *)
 | 
| +            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| +                                        DCTSIZE2 * sizeof(FAST_FLOAT));
 | 
| +        }
 | 
| +        fdtbl = fdct->float_divisors[qtblno];
 | 
| +        i = 0;
 | 
| +        for (row = 0; row < DCTSIZE; row++) {
 | 
| +          for (col = 0; col < DCTSIZE; col++) {
 | 
| +            fdtbl[i] = (FAST_FLOAT)
 | 
| +              (1.0 / (((double) qtbl->quantval[i] *
 | 
| +                       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
 | 
| +            i++;
 | 
| +          }
 | 
| +        }
 | 
|        }
 | 
|        break;
 | 
|  #endif
 | 
| @@ -334,7 +370,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 | 
|   */
 | 
|  
 | 
|  METHODDEF(void)
 | 
| -convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
 | 
| +convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
 | 
|  {
 | 
|    register DCTELEM *workspaceptr;
 | 
|    register JSAMPROW elemptr;
 | 
| @@ -344,7 +380,7 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
 | 
|    for (elemr = 0; elemr < DCTSIZE; elemr++) {
 | 
|      elemptr = sample_data[elemr] + start_col;
 | 
|  
 | 
| -#if DCTSIZE == 8		/* unroll the inner loop */
 | 
| +#if DCTSIZE == 8                /* unroll the inner loop */
 | 
|      *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
 | 
|      *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
 | 
|      *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
 | 
| @@ -369,14 +405,18 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace)
 | 
|   */
 | 
|  
 | 
|  METHODDEF(void)
 | 
| -quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
 | 
| +quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 | 
|  {
 | 
|    int i;
 | 
|    DCTELEM temp;
 | 
| -  UDCTELEM recip, corr, shift;
 | 
| -  UDCTELEM2 product;
 | 
|    JCOEFPTR output_ptr = coef_block;
 | 
|  
 | 
| +#if BITS_IN_JSAMPLE == 8
 | 
| +
 | 
| +  UDCTELEM recip, corr;
 | 
| +  int shift;
 | 
| +  UDCTELEM2 product;
 | 
| +
 | 
|    for (i = 0; i < DCTSIZE2; i++) {
 | 
|      temp = workspace[i];
 | 
|      recip = divisors[i + DCTSIZE2 * 0];
 | 
| @@ -387,16 +427,54 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
 | 
|        temp = -temp;
 | 
|        product = (UDCTELEM2)(temp + corr) * recip;
 | 
|        product >>= shift + sizeof(DCTELEM)*8;
 | 
| -      temp = product;
 | 
| +      temp = (DCTELEM)product;
 | 
|        temp = -temp;
 | 
|      } else {
 | 
|        product = (UDCTELEM2)(temp + corr) * recip;
 | 
|        product >>= shift + sizeof(DCTELEM)*8;
 | 
| -      temp = product;
 | 
| +      temp = (DCTELEM)product;
 | 
|      }
 | 
| +    output_ptr[i] = (JCOEF) temp;
 | 
| +  }
 | 
| +
 | 
| +#else
 | 
| +
 | 
| +  register DCTELEM qval;
 | 
|  
 | 
| +  for (i = 0; i < DCTSIZE2; i++) {
 | 
| +    qval = divisors[i];
 | 
| +    temp = workspace[i];
 | 
| +    /* Divide the coefficient value by qval, ensuring proper rounding.
 | 
| +     * Since C does not specify the direction of rounding for negative
 | 
| +     * quotients, we have to force the dividend positive for portability.
 | 
| +     *
 | 
| +     * In most files, at least half of the output values will be zero
 | 
| +     * (at default quantization settings, more like three-quarters...)
 | 
| +     * so we should ensure that this case is fast.  On many machines,
 | 
| +     * a comparison is enough cheaper than a divide to make a special test
 | 
| +     * a win.  Since both inputs will be nonnegative, we need only test
 | 
| +     * for a < b to discover whether a/b is 0.
 | 
| +     * If your machine's division is fast enough, define FAST_DIVIDE.
 | 
| +     */
 | 
| +#ifdef FAST_DIVIDE
 | 
| +#define DIVIDE_BY(a,b)  a /= b
 | 
| +#else
 | 
| +#define DIVIDE_BY(a,b)  if (a >= b) a /= b; else a = 0
 | 
| +#endif
 | 
| +    if (temp < 0) {
 | 
| +      temp = -temp;
 | 
| +      temp += qval>>1;  /* for rounding */
 | 
| +      DIVIDE_BY(temp, qval);
 | 
| +      temp = -temp;
 | 
| +    } else {
 | 
| +      temp += qval>>1;  /* for rounding */
 | 
| +      DIVIDE_BY(temp, qval);
 | 
| +    }
 | 
|      output_ptr[i] = (JCOEF) temp;
 | 
|    }
 | 
| +
 | 
| +#endif
 | 
| +
 | 
|  }
 | 
|  
 | 
|  
 | 
| @@ -409,16 +487,16 @@ quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace)
 | 
|   */
 | 
|  
 | 
|  METHODDEF(void)
 | 
| -forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 | 
| -	     JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 | 
| -	     JDIMENSION start_row, JDIMENSION start_col,
 | 
| -	     JDIMENSION num_blocks)
 | 
| +forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
 | 
| +             JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 | 
| +             JDIMENSION start_row, JDIMENSION start_col,
 | 
| +             JDIMENSION num_blocks)
 | 
|  /* This version is used for integer DCT implementations. */
 | 
|  {
 | 
|    /* This routine is heavily used, so it's worth coding it tightly. */
 | 
|    my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
 | 
| -  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
 | 
| -  DCTELEM * workspace;
 | 
| +  DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
 | 
| +  DCTELEM *workspace;
 | 
|    JDIMENSION bi;
 | 
|  
 | 
|    /* Make sure the compiler doesn't look up these every pass */
 | 
| @@ -427,7 +505,7 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 | 
|    quantize_method_ptr do_quantize = fdct->quantize;
 | 
|    workspace = fdct->workspace;
 | 
|  
 | 
| -  sample_data += start_row;	/* fold in the vertical offset once */
 | 
| +  sample_data += start_row;     /* fold in the vertical offset once */
 | 
|  
 | 
|    for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
 | 
|      /* Load data into workspace, applying unsigned->signed conversion */
 | 
| @@ -446,7 +524,7 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 | 
|  
 | 
|  
 | 
|  METHODDEF(void)
 | 
| -convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)
 | 
| +convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
 | 
|  {
 | 
|    register FAST_FLOAT *workspaceptr;
 | 
|    register JSAMPROW elemptr;
 | 
| @@ -455,7 +533,7 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * works
 | 
|    workspaceptr = workspace;
 | 
|    for (elemr = 0; elemr < DCTSIZE; elemr++) {
 | 
|      elemptr = sample_data[elemr] + start_col;
 | 
| -#if DCTSIZE == 8		/* unroll the inner loop */
 | 
| +#if DCTSIZE == 8                /* unroll the inner loop */
 | 
|      *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
 | 
|      *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
 | 
|      *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
 | 
| @@ -477,7 +555,7 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * works
 | 
|  
 | 
|  
 | 
|  METHODDEF(void)
 | 
| -quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)
 | 
| +quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
 | 
|  {
 | 
|    register FAST_FLOAT temp;
 | 
|    register int i;
 | 
| @@ -499,16 +577,16 @@ quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspa
 | 
|  
 | 
|  
 | 
|  METHODDEF(void)
 | 
| -forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 | 
| -		   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 | 
| -		   JDIMENSION start_row, JDIMENSION start_col,
 | 
| -		   JDIMENSION num_blocks)
 | 
| +forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
 | 
| +                   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
 | 
| +                   JDIMENSION start_row, JDIMENSION start_col,
 | 
| +                   JDIMENSION num_blocks)
 | 
|  /* This version is used for floating-point DCT implementations. */
 | 
|  {
 | 
|    /* This routine is heavily used, so it's worth coding it tightly. */
 | 
|    my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
 | 
| -  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
 | 
| -  FAST_FLOAT * workspace;
 | 
| +  FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
 | 
| +  FAST_FLOAT *workspace;
 | 
|    JDIMENSION bi;
 | 
|  
 | 
|  
 | 
| @@ -518,7 +596,7 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 | 
|    float_quantize_method_ptr do_quantize = fdct->float_quantize;
 | 
|    workspace = fdct->float_workspace;
 | 
|  
 | 
| -  sample_data += start_row;	/* fold in the vertical offset once */
 | 
| +  sample_data += start_row;     /* fold in the vertical offset once */
 | 
|  
 | 
|    for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
 | 
|      /* Load data into workspace, applying unsigned->signed conversion */
 | 
| @@ -547,7 +625,7 @@ jinit_forward_dct (j_compress_ptr cinfo)
 | 
|  
 | 
|    fdct = (my_fdct_ptr)
 | 
|      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| -				SIZEOF(my_fdct_controller));
 | 
| +                                sizeof(my_fdct_controller));
 | 
|    cinfo->fdct = (struct jpeg_forward_dct *) fdct;
 | 
|    fdct->pub.start_pass = start_pass_fdctmgr;
 | 
|  
 | 
| @@ -626,12 +704,12 @@ jinit_forward_dct (j_compress_ptr cinfo)
 | 
|    if (cinfo->dct_method == JDCT_FLOAT)
 | 
|      fdct->float_workspace = (FAST_FLOAT *)
 | 
|        (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| -				  SIZEOF(FAST_FLOAT) * DCTSIZE2);
 | 
| +                                  sizeof(FAST_FLOAT) * DCTSIZE2);
 | 
|    else
 | 
|  #endif
 | 
|      fdct->workspace = (DCTELEM *)
 | 
|        (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
 | 
| -				  SIZEOF(DCTELEM) * DCTSIZE2);
 | 
| +                                  sizeof(DCTELEM) * DCTSIZE2);
 | 
|  
 | 
|    /* Mark divisor tables unallocated */
 | 
|    for (i = 0; i < NUM_QUANT_TBLS; i++) {
 | 
| 
 |