third_party/libwebp/dsp/dec_neon.c - Issue 10832153: libwebp: update snapshot to v0.2.0-rc1

Side by Side Diff: third_party/libwebp/dsp/dec_neon.c

Issue 10832153: libwebp: update snapshot to v0.2.0-rc1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 Google Inc.	1 // Copyright 2012 Google Inc. All Rights Reserved.

2 //	2 //

3 // This code is licensed under the same terms as WebM:	3 // This code is licensed under the same terms as WebM:

4 // Software License Agreement: http://www.webmproject.org/license/software/	4 // Software License Agreement: http://www.webmproject.org/license/software/

5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/	5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/

6 // -----------------------------------------------------------------------------	6 // -----------------------------------------------------------------------------

7 //	7 //

8 // ARM NEON version of dsp functions and loop filtering.	8 // ARM NEON version of dsp functions and loop filtering.

9 //	9 //

10 // Author: somnath@google.com (Somnath Banerjee)	10 // Authors: Somnath Banerjee (somnath@google.com)

	11 // Johann Koenig (johannkoenig@google.com)

11	12

12 #if defined(__GNUC__) && defined(__ARM_NEON__)	13 #include "./dsp.h"

	14

	15 #if defined(WEBP_USE_NEON)

13	16

14 #include "../dec/vp8i.h"	17 #include "../dec/vp8i.h"

15	18

16 #if defined(__cplusplus) || defined(c_plusplus)	19 #if defined(__cplusplus) || defined(c_plusplus)

17 extern "C" {	20 extern "C" {

18 #endif	21 #endif

19	22

20 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \	23 #define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \

21 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"	24 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

22	25

(...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
145 }	148 }

146	149

147 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {	150 static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {

148 int k;	151 int k;

149 for (k = 3; k > 0; --k) {	152 for (k = 3; k > 0; --k) {

150 p += 4;	153 p += 4;

151 SimpleHFilter16NEON(p, stride, thresh);	154 SimpleHFilter16NEON(p, stride, thresh);

152 }	155 }

153 }	156 }

154	157

	158 static void TransformOneNEON(const int16_t* in, uint8_t* dst) {

	159 const int kBPS = BPS;

	160 const int16_t constants[] = {20091, 17734, 0, 0};

	161 /* kC1, kC2. Padded because vld1.16 loads 8 bytes

	162 * Technically these are unsigned but vqdmulh is only available in signed.

	163 * vqdmulh returns high half (effectively >> 16) but also doubles the value,

	164 * changing the >> 16 to >> 15 and requiring an additional >> 1.

	165 * We use this to our advantage with kC2. The canonical value is 35468.

	166 * However, the high bit is set so treating it as signed will give incorrect

	167 * results. We avoid this by down shifting by 1 here to clear the highest bit.

	168 * Combined with the doubling effect of vqdmulh we get >> 16.

	169 * This can not be applied to kC1 because the lowest bit is set. Down shifting

	170 * the constant would reduce precision.

	171 */

	172

	173 /* libwebp uses a trick to avoid some extra addition that libvpx does.

	174 * Instead of:

	175 * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);

	176 * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the

	177 * same issue with kC1 and vqdmulh that we work around by down shifting kC2

	178 */

	179

	180 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */

	181 __asm__ volatile (

	182 "vld1.16 {q1, q2}, [%[in]] \n"

	183 "vld1.16 {d0}, [%[constants]] \n"

	184

	185 /* d2: in[0]

	186 * d3: in[8]

	187 * d4: in[4]

	188 * d5: in[12]

	189 */

	190 "vswp d3, d4 \n"

	191

	192 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16

	193 * q9 = {in[4], in[12]} * kC2 >> 16

	194 */

	195 "vqdmulh.s16 q8, q2, d0[0] \n"

	196 "vqdmulh.s16 q9, q2, d0[1] \n"

	197

	198 /* d22 = a = in[0] + in[8]

	199 * d23 = b = in[0] - in[8]

	200 */

	201 "vqadd.s16 d22, d2, d3 \n"

	202 "vqsub.s16 d23, d2, d3 \n"

	203

	204 /* The multiplication should be x * kC1 >> 16

	205 * However, with vqdmulh we get x * kC1 * 2 >> 16

	206 * (multiply, double, return high half)

	207 * We avoided this in kC2 by pre-shifting the constant.

	208 * q8 = in[4]/[12] * kC1 >> 16

	209 */

	210 "vshr.s16 q8, q8, #1 \n"

	211

	212 /* Add {in[4], in[12]} back after the multiplication. This is handled by

	213 * adding 1 << 16 to kC1 in the libwebp C code.

	214 */

	215 "vqadd.s16 q8, q2, q8 \n"

	216

	217 /* d20 = c = in[4]*kC2 - in[12]*kC1

	218 * d21 = d = in[4]*kC1 + in[12]*kC2

	219 */

	220 "vqsub.s16 d20, d18, d17 \n"

	221 "vqadd.s16 d21, d19, d16 \n"

	222

	223 /* d2 = tmp[0] = a + d

	224 * d3 = tmp[1] = b + c

	225 * d4 = tmp[2] = b - c

	226 * d5 = tmp[3] = a - d

	227 */

	228 "vqadd.s16 d2, d22, d21 \n"

	229 "vqadd.s16 d3, d23, d20 \n"

	230 "vqsub.s16 d4, d23, d20 \n"

	231 "vqsub.s16 d5, d22, d21 \n"

	232

	233 "vzip.16 q1, q2 \n"

	234 "vzip.16 q1, q2 \n"

	235

	236 "vswp d3, d4 \n"

	237

	238 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16

	239 * q9 = {tmp[4], tmp[12]} * kC2 >> 16

	240 */

	241 "vqdmulh.s16 q8, q2, d0[0] \n"

	242 "vqdmulh.s16 q9, q2, d0[1] \n"

	243

	244 /* d22 = a = tmp[0] + tmp[8]

	245 * d23 = b = tmp[0] - tmp[8]

	246 */

	247 "vqadd.s16 d22, d2, d3 \n"

	248 "vqsub.s16 d23, d2, d3 \n"

	249

	250 /* See long winded explanations prior */

	251 "vshr.s16 q8, q8, #1 \n"

	252 "vqadd.s16 q8, q2, q8 \n"

	253

	254 /* d20 = c = in[4]*kC2 - in[12]*kC1

	255 * d21 = d = in[4]*kC1 + in[12]*kC2

	256 */

	257 "vqsub.s16 d20, d18, d17 \n"

	258 "vqadd.s16 d21, d19, d16 \n"

	259

	260 /* d2 = tmp[0] = a + d

	261 * d3 = tmp[1] = b + c

	262 * d4 = tmp[2] = b - c

	263 * d5 = tmp[3] = a - d

	264 */

	265 "vqadd.s16 d2, d22, d21 \n"

	266 "vqadd.s16 d3, d23, d20 \n"

	267 "vqsub.s16 d4, d23, d20 \n"

	268 "vqsub.s16 d5, d22, d21 \n"

	269

	270 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"

	271 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"

	272 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"

	273 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"

	274

	275 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"

	276

	277 /* (val) + 4 >> 3 */

	278 "vrshr.s16 d2, d2, #3 \n"

	279 "vrshr.s16 d3, d3, #3 \n"

	280 "vrshr.s16 d4, d4, #3 \n"

	281 "vrshr.s16 d5, d5, #3 \n"

	282

	283 "vzip.16 q1, q2 \n"

	284 "vzip.16 q1, q2 \n"

	285

	286 /* Must accumulate before saturating */

	287 "vmovl.u8 q8, d6 \n"

	288 "vmovl.u8 q9, d7 \n"

	289

	290 "vqadd.s16 q1, q1, q8 \n"

	291 "vqadd.s16 q2, q2, q9 \n"

	292

	293 "vqmovun.s16 d0, q1 \n"

	294 "vqmovun.s16 d1, q2 \n"

	295

	296 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"

	297 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"

	298 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"

	299 "vst1.32 d1[1], [%[dst]] \n"

	300

	301 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */

	302 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */

	303 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */

	304 );

	305 }

	306

	307 static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {

	308 TransformOneNEON(in, dst);

	309 if (do_two) {

	310 TransformOneNEON(in + 16, dst + 4);

	311 }

	312 }

	313

155 extern void VP8DspInitNEON(void);	314 extern void VP8DspInitNEON(void);

156	315

157 void VP8DspInitNEON(void) {	316 void VP8DspInitNEON(void) {

	317 VP8Transform = TransformTwoNEON;

	318

158 VP8SimpleVFilter16 = SimpleVFilter16NEON;	319 VP8SimpleVFilter16 = SimpleVFilter16NEON;

159 VP8SimpleHFilter16 = SimpleHFilter16NEON;	320 VP8SimpleHFilter16 = SimpleHFilter16NEON;

160 VP8SimpleVFilter16i = SimpleVFilter16iNEON;	321 VP8SimpleVFilter16i = SimpleVFilter16iNEON;

161 VP8SimpleHFilter16i = SimpleHFilter16iNEON;	322 VP8SimpleHFilter16i = SimpleHFilter16iNEON;

162 }	323 }

163	324

164 #if defined(__cplusplus) || defined(c_plusplus)	325 #if defined(__cplusplus) || defined(c_plusplus)

165 } // extern "C"	326 } // extern "C"

166 #endif	327 #endif

167	328

168 #endif // __GNUC__ && __ARM_NEON__	329 #endif // WEBP_USE_NEON

OLD	NEW

« no previous file with comments | « third_party/libwebp/dsp/dec.c ('k') | third_party/libwebp/dsp/dec_sse2.c » ('j') | third_party/libwebp/libwebp.gyp » ('J')