| OLD | NEW |
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | 1 // Copyright 2011 Google Inc. All Rights Reserved. |
| 2 // | 2 // |
| 3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
| 4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
| 5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
| 6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
| 7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
| 8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
| 9 // | 9 // |
| 10 // NEON version of YUV to RGB upsampling functions. | 10 // NEON version of YUV to RGB upsampling functions. |
| (... 10 matching lines skipped ...) |
| 21 #include <string.h> | 21 #include <string.h> |
| 22 #include "./neon.h" | 22 #include "./neon.h" |
| 23 #include "./yuv.h" | 23 #include "./yuv.h" |
| 24 | 24 |
| 25 #ifdef FANCY_UPSAMPLING | 25 #ifdef FANCY_UPSAMPLING |
| 26 | 26 |
| 27 //----------------------------------------------------------------------------- | 27 //----------------------------------------------------------------------------- |
| 28 // U/V upsampling | 28 // U/V upsampling |
| 29 | 29 |
| 30 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels per row (written to out and out + 32). | 30 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels per row (written to out and out + 32). |
| 31 #define UPSAMPLE_16PIXELS(r1, r2, out) { \ | 31 #define UPSAMPLE_16PIXELS(r1, r2, out) do { \ |
| 32 uint8x8_t a = vld1_u8(r1); \ | 32 const uint8x8_t a = vld1_u8(r1 + 0); \ |
| 33 uint8x8_t b = vld1_u8(r1 + 1); \ | 33 const uint8x8_t b = vld1_u8(r1 + 1); \ |
| 34 uint8x8_t c = vld1_u8(r2); \ | 34 const uint8x8_t c = vld1_u8(r2 + 0); \ |
| 35 uint8x8_t d = vld1_u8(r2 + 1); \ | 35 const uint8x8_t d = vld1_u8(r2 + 1); \ |
| 36 /* a + b + c + d */ \ |
| 37 const uint16x8_t ad = vaddl_u8(a, d); \ |
| 38 const uint16x8_t bc = vaddl_u8(b, c); \ |
| 39 const uint16x8_t abcd = vaddq_u16(ad, bc); \ |
| 40 /* 3a + b + c + 3d */ \ |
| 41 const uint16x8_t al = vaddq_u16(abcd, vshlq_n_u16(ad, 1)); \ |
| 42 /* a + 3b + 3c + d */ \ |
| 43 const uint16x8_t bl = vaddq_u16(abcd, vshlq_n_u16(bc, 1)); \ |
| 36 \ | 44 \ |
| 37 uint16x8_t al = vshll_n_u8(a, 1); \ | 45 const uint8x8_t diag2 = vshrn_n_u16(al, 3); \ |
| 38 uint16x8_t bl = vshll_n_u8(b, 1); \ | 46 const uint8x8_t diag1 = vshrn_n_u16(bl, 3); \ |
| 39 uint16x8_t cl = vshll_n_u8(c, 1); \ | |
| 40 uint16x8_t dl = vshll_n_u8(d, 1); \ | |
| 41 \ | 47 \ |
| 42 uint8x8_t diag1, diag2; \ | 48 const uint8x8_t A = vrhadd_u8(a, diag1); \ |
| 43 uint16x8_t sl; \ | 49 const uint8x8_t B = vrhadd_u8(b, diag2); \ |
| 50 const uint8x8_t C = vrhadd_u8(c, diag2); \ |
| 51 const uint8x8_t D = vrhadd_u8(d, diag1); \ |
| 44 \ | 52 \ |
| 45 /* a + b + c + d */ \ | 53 uint8x8x2_t A_B, C_D; \ |
| 46 sl = vaddl_u8(a, b); \ | 54 INIT_VECTOR2(A_B, A, B); \ |
| 47 sl = vaddw_u8(sl, c); \ | 55 INIT_VECTOR2(C_D, C, D); \ |
| 48 sl = vaddw_u8(sl, d); \ | 56 vst2_u8(out + 0, A_B); \ |
| 49 \ | 57 vst2_u8(out + 32, C_D); \ |
| 50 al = vaddq_u16(sl, al); /* 3a + b + c + d */ \ | 58 } while (0) |
| 51 bl = vaddq_u16(sl, bl); /* a + 3b + c + d */ \ | |
| 52 \ | |
| 53 al = vaddq_u16(al, dl); /* 3a + b + c + 3d */ \ | |
| 54 bl = vaddq_u16(bl, cl); /* a + 3b + 3c + d */ \ | |
| 55 \ | |
| 56 diag2 = vshrn_n_u16(al, 3); \ | |
| 57 diag1 = vshrn_n_u16(bl, 3); \ | |
| 58 \ | |
| 59 a = vrhadd_u8(a, diag1); \ | |
| 60 b = vrhadd_u8(b, diag2); \ | |
| 61 c = vrhadd_u8(c, diag2); \ | |
| 62 d = vrhadd_u8(d, diag1); \ | |
| 63 \ | |
| 64 { \ | |
| 65 uint8x8x2_t a_b, c_d; \ | |
| 66 INIT_VECTOR2(a_b, a, b); \ | |
| 67 INIT_VECTOR2(c_d, c, d); \ | |
| 68 vst2_u8(out, a_b); \ | |
| 69 vst2_u8(out + 32, c_d); \ | |
| 70 } \ | |
| 71 } | |
| 72 | 59 |
| 73 // Wrap the macro in a function to reduce code size where speed is not critical. | 60 // Wrap the macro in a function to reduce code size where speed is not critical. |
| 74 static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, | 61 static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, |
| 75 uint8_t *out) { | 62 uint8_t *out) { |
| 76 UPSAMPLE_16PIXELS(r1, r2, out); | 63 UPSAMPLE_16PIXELS(r1, r2, out); |
| 77 } | 64 } |
| 78 | 65 |
| 79 #define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ | 66 #define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ |
| 80 uint8_t r1[9], r2[9]; \ | 67 uint8_t r1[9], r2[9]; \ |
| 81 memcpy(r1, (tb), (num_pixels)); \ | 68 memcpy(r1, (tb), (num_pixels)); \ |
| 82 memcpy(r2, (bb), (num_pixels)); \ | 69 memcpy(r2, (bb), (num_pixels)); \ |
| 83 /* replicate last byte */ \ | 70 /* replicate last byte */ \ |
| 84 memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \ | 71 memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \ |
| 85 memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \ | 72 memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \ |
| 86 Upsample16Pixels(r1, r2, out); \ | 73 Upsample16Pixels(r1, r2, out); \ |
| 87 } | 74 } |
| 88 | 75 |
| 89 //----------------------------------------------------------------------------- | 76 //----------------------------------------------------------------------------- |
| 90 // YUV->RGB conversion | 77 // YUV->RGB conversion |
| 91 | 78 |
| 92 // Note: the large constant 33050 is represented as 32768 + 282. | 79 // Note: the large constant 33050 is represented as 32768 + 282. |
| 93 static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 }; | 80 static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 }; |
| 94 | 81 |
| 95 #define v255 vdup_n_u8(255) | 82 #define v255 vdup_n_u8(255) |
| 96 #define v_0x0f vdup_n_u8(15) | |
| 97 | 83 |
| 98 #define STORE_Rgb(out, r, g, b) do { \ | 84 #define STORE_Rgb(out, r, g, b) do { \ |
| 99 uint8x8x3_t r_g_b; \ | 85 uint8x8x3_t r_g_b; \ |
| 100 INIT_VECTOR3(r_g_b, r, g, b); \ | 86 INIT_VECTOR3(r_g_b, r, g, b); \ |
| 101 vst3_u8(out, r_g_b); \ | 87 vst3_u8(out, r_g_b); \ |
| 102 } while (0) | 88 } while (0) |
| 103 | 89 |
| 104 #define STORE_Bgr(out, r, g, b) do { \ | 90 #define STORE_Bgr(out, r, g, b) do { \ |
| 105 uint8x8x3_t b_g_r; \ | 91 uint8x8x3_t b_g_r; \ |
| 106 INIT_VECTOR3(b_g_r, b, g, r); \ | 92 INIT_VECTOR3(b_g_r, b, g, r); \ |
| (... 18 matching lines skipped ...) |
| 125 vst4_u8(out, v255_r_g_b); \ | 111 vst4_u8(out, v255_r_g_b); \ |
| 126 } while (0) | 112 } while (0) |
| 127 | 113 |
| 128 #if !defined(WEBP_SWAP_16BIT_CSP) | 114 #if !defined(WEBP_SWAP_16BIT_CSP) |
| 129 #define ZIP_U8(lo, hi) vzip_u8((lo), (hi)) | 115 #define ZIP_U8(lo, hi) vzip_u8((lo), (hi)) |
| 130 #else | 116 #else |
| 131 #define ZIP_U8(lo, hi) vzip_u8((hi), (lo)) | 117 #define ZIP_U8(lo, hi) vzip_u8((hi), (lo)) |
| 132 #endif | 118 #endif |
| 133 | 119 |
| 134 #define STORE_Rgba4444(out, r, g, b) do { \ | 120 #define STORE_Rgba4444(out, r, g, b) do { \ |
| 135 const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 4), 4); /* 4bits */ \ | 121 const uint8x8_t rg = vsri_n_u8(r, g, 4); /* shift g, insert r */ \ |
| 136 const uint8x8_t g1 = vshr_n_u8(g, 4); \ | 122 const uint8x8_t ba = vsri_n_u8(b, v255, 4); /* shift a, insert b */ \ |
| 137 const uint8x8_t ba = vorr_u8(b, v_0x0f); \ | |
| 138 const uint8x8_t rg = vorr_u8(r1, g1); \ | |
| 139 const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba); \ | 123 const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba); \ |
| 140 vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1])); \ | 124 vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1])); \ |
| 141 } while (0) | 125 } while (0) |
| 142 | 126 |
| 143 #define STORE_Rgb565(out, r, g, b) do { \ | 127 #define STORE_Rgb565(out, r, g, b) do { \ |
| 144 const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 3), 3); /* 5bits */ \ | 128 const uint8x8_t rg = vsri_n_u8(r, g, 5); /* shift g and insert r */ \ |
| 145 const uint8x8_t g1 = vshr_n_u8(g, 5); /* upper 3bits */\ | 129 const uint8x8_t g1 = vshl_n_u8(g, 3); /* pre-shift g: 3bits */ \ |
| 146 const uint8x8_t g2 = vshl_n_u8(vshr_n_u8(g, 2), 5); /* lower 3bits */\ | 130 const uint8x8_t gb = vsri_n_u8(g1, b, 3); /* shift b and insert g */ \ |
| 147 const uint8x8_t b1 = vshr_n_u8(b, 3); /* 5bits */ \ | |
| 148 const uint8x8_t rg = vorr_u8(r1, g1); \ | |
| 149 const uint8x8_t gb = vorr_u8(g2, b1); \ | |
| 150 const uint8x8x2_t rgb565 = ZIP_U8(rg, gb); \ | 131 const uint8x8x2_t rgb565 = ZIP_U8(rg, gb); \ |
| 151 vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1])); \ | 132 vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1])); \ |
| 152 } while (0) | 133 } while (0) |
| 153 | 134 |
| 154 #define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) do { \ | 135 #define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) do { \ |
| 155 int i; \ | 136 int i; \ |
| 156 for (i = 0; i < N; i += 8) { \ | 137 for (i = 0; i < N; i += 8) { \ |
| 157 const int off = ((cur_x) + i) * XSTEP; \ | 138 const int off = ((cur_x) + i) * XSTEP; \ |
| 158 const uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \ | 139 const uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \ |
| 159 const uint8x8_t u = vld1_u8((src_uv) + i + 0); \ | 140 const uint8x8_t u = vld1_u8((src_uv) + i + 0); \ |
| (... 131 matching lines skipped ...) |
| 291 WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; | 272 WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair; |
| 292 } | 273 } |
| 293 | 274 |
| 294 #endif // FANCY_UPSAMPLING | 275 #endif // FANCY_UPSAMPLING |
| 295 | 276 |
| 296 #endif // WEBP_USE_NEON | 277 #endif // WEBP_USE_NEON |
| 297 | 278 |
| 298 #if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_NEON)) | 279 #if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_NEON)) |
| 299 WEBP_DSP_INIT_STUB(WebPInitUpsamplersNEON) | 280 WEBP_DSP_INIT_STUB(WebPInitUpsamplersNEON) |
| 300 #endif | 281 #endif |
| OLD | NEW |