OLD | NEW |
---|---|
1 /* | 1 /* |
2 * The copyright in this software is being made available under the 2-clauses | 2 * The copyright in this software is being made available under the 2-clauses |
3 * BSD License, included below. This software may be subject to other third | 3 * BSD License, included below. This software may be subject to other third |
4 * party and contributor rights, including patent rights, and no such rights | 4 * party and contributor rights, including patent rights, and no such rights |
5 * are granted under this license. | 5 * are granted under this license. |
6 * | 6 * |
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium | 7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium |
8 * Copyright (c) 2002-2014, Professor Benoit Macq | 8 * Copyright (c) 2002-2014, Professor Benoit Macq |
9 * Copyright (c) 2001-2003, David Janssens | 9 * Copyright (c) 2001-2003, David Janssens |
10 * Copyright (c) 2002-2003, Yannick Verschueren | 10 * Copyright (c) 2002-2003, Yannick Verschueren |
(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
133 | 133 |
134 /* <summary> */ | 134 /* <summary> */ |
135 /* Inverse 9-7 wavelet transform in 1-D. */ | 135 /* Inverse 9-7 wavelet transform in 1-D. */ |
136 /* </summary> */ | 136 /* </summary> */ |
137 static void opj_v4dwt_decode(opj_v4dwt_t* restrict dwt); | 137 static void opj_v4dwt_decode(opj_v4dwt_t* restrict dwt); |
138 | 138 |
139 static void opj_v4dwt_interleave_h(opj_v4dwt_t* restrict w, OPJ_FLOAT32* restrict a, OPJ_INT32 x, OPJ_INT32 size); | 139 static void opj_v4dwt_interleave_h(opj_v4dwt_t* restrict w, OPJ_FLOAT32* restrict a, OPJ_INT32 x, OPJ_INT32 size); |
140 | 140 |
141 static void opj_v4dwt_interleave_v(opj_v4dwt_t* restrict v , OPJ_FLOAT32* restrict a , OPJ_INT32 x, OPJ_INT32 nb_elts_read); | 141 static void opj_v4dwt_interleave_v(opj_v4dwt_t* restrict v , OPJ_FLOAT32* restrict a , OPJ_INT32 x, OPJ_INT32 nb_elts_read); |
142 | 142 |
143 #ifdef __SSE__ | 143 //#ifdef __SSE__ |
144 #if 0 | |
jabdelmalek
2014/06/06 17:25:31
here and below where you comment this out, please
| |
144 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count, const __m128 c); | 145 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count, const __m128 c); |
145 | 146 |
146 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, __m128 c); | 147 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, __m128 c); |
147 | 148 |
148 #else | 149 #else |
149 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count, const OPJ_FLOAT32 c); | 150 static void opj_v4dwt_decode_step1(opj_v4_t* w, OPJ_INT32 count, const OPJ_FLOAT32 c); |
150 | 151 |
151 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, OPJ_FLOAT32 c); | 152 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, OPJ_INT32 k, OPJ_INT32 m, OPJ_FLOAT32 c); |
152 | 153 |
153 #endif | 154 #endif |
(...skipping 510 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
664 } | 665 } |
665 | 666 |
666 a += v->sn * x; | 667 a += v->sn * x; |
667 bi = v->wavelet + 1 - v->cas; | 668 bi = v->wavelet + 1 - v->cas; |
668 | 669 |
669 for(i = 0; i < v->dn; ++i){ | 670 for(i = 0; i < v->dn; ++i){ |
670 memcpy(&bi[i*2], &a[i*x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32)); | 671 memcpy(&bi[i*2], &a[i*x], (size_t)nb_elts_read * sizeof(OPJ_FLOAT32)); |
671 } | 672 } |
672 } | 673 } |
673 | 674 |
674 #ifdef __SSE__ | 675 //#ifdef __SSE__ |
675 | 676 #if 0 |
676 void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count, const __m128 c){ | 677 void opj_v4dwt_decode_step1_sse(opj_v4_t* w, OPJ_INT32 count, const __m128 c){ |
677 __m128* restrict vw = (__m128*) w; | 678 __m128* restrict vw = (__m128*) w; |
678 OPJ_INT32 i; | 679 OPJ_INT32 i; |
679 /* 4x unrolled loop */ | 680 /* 4x unrolled loop */ |
680 for(i = 0; i < count >> 2; ++i){ | 681 for(i = 0; i < count >> 2; ++i){ |
681 *vw = _mm_mul_ps(*vw, c); | 682 *vw = _mm_mul_ps(*vw, c); |
682 vw += 2; | 683 vw += 2; |
683 *vw = _mm_mul_ps(*vw, c); | 684 *vw = _mm_mul_ps(*vw, c); |
684 vw += 2; | 685 vw += 2; |
685 *vw = _mm_mul_ps(*vw, c); | 686 *vw = _mm_mul_ps(*vw, c); |
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
801 } | 802 } |
802 a = 0; | 803 a = 0; |
803 b = 1; | 804 b = 1; |
804 }else{ | 805 }else{ |
805 if(!((dwt->sn > 0) || (dwt->dn > 1))) { | 806 if(!((dwt->sn > 0) || (dwt->dn > 1))) { |
806 return; | 807 return; |
807 } | 808 } |
808 a = 1; | 809 a = 1; |
809 b = 0; | 810 b = 0; |
810 } | 811 } |
811 #ifdef __SSE__ | 812 #if 0 |
813 //#ifdef __SSE__ | |
812 opj_v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(opj_K)); | 814 opj_v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(opj_K)); |
813 opj_v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(opj_c13318)); | 815 opj_v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(opj_c13318)); |
814 opj_v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(opj_dwt_delta)); | 816 opj_v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(opj_dwt_delta)); |
815 opj_v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(opj_dwt_gamma)); | 817 opj_v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(opj_dwt_gamma)); |
816 opj_v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(opj_dwt_beta)); | 818 opj_v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(opj_dwt_beta)); |
817 opj_v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(opj_dwt_alpha)); | 819 opj_v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, opj_int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(opj_dwt_alpha)); |
818 #else | 820 #else |
819 opj_v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, opj_K); | 821 opj_v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, opj_K); |
820 opj_v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, opj_c13318); | 822 opj_v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, opj_c13318); |
821 opj_v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), opj_dwt_delta); | 823 opj_v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, opj_int_min(dwt->sn, dwt->dn-a), opj_dwt_delta); |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
916 | 918 |
917 for(k = 0; k < rh; ++k){ | 919 for(k = 0; k < rh; ++k){ |
918 memcpy(&aj[k*w], &v.wavelet[k], (size_t)j * sizeof(OPJ_FLOAT32)); | 920 memcpy(&aj[k*w], &v.wavelet[k], (size_t)j * sizeof(OPJ_FLOAT32)); |
919 } | 921 } |
920 } | 922 } |
921 } | 923 } |
922 | 924 |
923 opj_aligned_free(h.wavelet); | 925 opj_aligned_free(h.wavelet); |
924 return OPJ_TRUE; | 926 return OPJ_TRUE; |
925 } | 927 } |
OLD | NEW |