OLD | NEW |
(Empty) | |
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
| 10 |
| 11 #include <arm_neon.h> |
| 12 #include "vp8/encoder/block.h" |
| 13 |
/*
 * Lookup table holding a permutation of 1..16, written here as a 4x4 grid.
 *
 * vp8_fast_quantize_b_neon() masks these values with the lanes whose
 * quantized coefficient is non-zero and keeps the maximum as the block's
 * end-of-block value; a result of 0 therefore means "no non-zero
 * coefficient".
 *
 * NOTE(review): entry i is presumably the 1-based zig-zag scan index of the
 * coefficient stored at position i of the block -- confirm against the
 * scalar quantizer.
 */
static const uint16_t inv_zig_zag[16] = {
    /* entries  0.. 3 */  1,  2,  6,  7,
    /* entries  4.. 7 */  3,  5,  8, 13,
    /* entries  8..11 */  4,  9, 12, 14,
    /* entries 12..15 */ 10, 11, 15, 16
};
| 20 |
| 21 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { |
| 22 const int16x8_t one_q = vdupq_n_s16(-1), |
| 23 z0 = vld1q_s16(b->coeff), |
| 24 z1 = vld1q_s16(b->coeff + 8), |
| 25 round0 = vld1q_s16(b->round), |
| 26 round1 = vld1q_s16(b->round + 8), |
| 27 quant0 = vld1q_s16(b->quant_fast), |
| 28 quant1 = vld1q_s16(b->quant_fast + 8), |
| 29 dequant0 = vld1q_s16(d->dequant), |
| 30 dequant1 = vld1q_s16(d->dequant + 8); |
| 31 const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag), |
| 32 zig_zag1 = vld1q_u16(inv_zig_zag + 8); |
| 33 int16x8_t x0, x1, sz0, sz1, y0, y1; |
| 34 uint16x8_t eob0, eob1; |
| 35 uint16x4_t eob_d16; |
| 36 uint32x2_t eob_d32; |
| 37 uint32x4_t eob_q32; |
| 38 |
| 39 /* sign of z: z >> 15 */ |
| 40 sz0 = vshrq_n_s16(z0, 15); |
| 41 sz1 = vshrq_n_s16(z1, 15); |
| 42 |
| 43 /* x = abs(z) */ |
| 44 x0 = vabsq_s16(z0); |
| 45 x1 = vabsq_s16(z1); |
| 46 |
| 47 /* x += round */ |
| 48 x0 = vaddq_s16(x0, round0); |
| 49 x1 = vaddq_s16(x1, round1); |
| 50 |
| 51 /* y = 2 * (x * quant) >> 16 */ |
| 52 y0 = vqdmulhq_s16(x0, quant0); |
| 53 y1 = vqdmulhq_s16(x1, quant1); |
| 54 |
| 55 /* Compensate for doubling in vqdmulhq */ |
| 56 y0 = vshrq_n_s16(y0, 1); |
| 57 y1 = vshrq_n_s16(y1, 1); |
| 58 |
| 59 /* Restore sign bit */ |
| 60 y0 = veorq_s16(y0, sz0); |
| 61 y1 = veorq_s16(y1, sz1); |
| 62 x0 = vsubq_s16(y0, sz0); |
| 63 x1 = vsubq_s16(y1, sz1); |
| 64 |
| 65 /* find non-zero elements */ |
| 66 eob0 = vtstq_s16(x0, one_q); |
| 67 eob1 = vtstq_s16(x1, one_q); |
| 68 |
| 69 /* mask zig zag */ |
| 70 eob0 = vandq_u16(eob0, zig_zag0); |
| 71 eob1 = vandq_u16(eob1, zig_zag1); |
| 72 |
| 73 /* select the largest value */ |
| 74 eob0 = vmaxq_u16(eob0, eob1); |
| 75 eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); |
| 76 eob_q32 = vmovl_u16(eob_d16); |
| 77 eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32)); |
| 78 eob_d32 = vpmax_u32(eob_d32, eob_d32); |
| 79 |
| 80 /* qcoeff = x */ |
| 81 vst1q_s16(d->qcoeff, x0); |
| 82 vst1q_s16(d->qcoeff + 8, x1); |
| 83 |
| 84 /* dqcoeff = x * dequant */ |
| 85 vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0)); |
| 86 vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1)); |
| 87 |
| 88 vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); |
| 89 } |
OLD | NEW |