| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright © 2011,2012 Google, Inc. | 2 * Copyright © 2011,2012,2014 Google, Inc. |
| 3 * | 3 * |
| 4 * This is part of HarfBuzz, a text shaping library. | 4 * This is part of HarfBuzz, a text shaping library. |
| 5 * | 5 * |
| 6 * Permission is hereby granted, without written agreement and without | 6 * Permission is hereby granted, without written agreement and without |
| 7 * license or royalty fees, to use, copy, modify, and distribute this | 7 * license or royalty fees, to use, copy, modify, and distribute this |
| 8 * software and its documentation for any purpose, provided that the | 8 * software and its documentation for any purpose, provided that the |
| 9 * above copyright notice and the following two paragraphs appear in | 9 * above copyright notice and the following two paragraphs appear in |
| 10 * all copies of this software. | 10 * all copies of this software. |
| 11 * | 11 * |
| 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR | 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
| 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES | 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
| 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN | 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
| 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH | 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
| 16 * DAMAGE. | 16 * DAMAGE. |
| 17 * | 17 * |
| 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, | 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
| 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS | 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
| 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO | 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
| 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. | 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
| 23 * | 23 * |
| 24 * Google Author(s): Behdad Esfahbod | 24 * Google Author(s): Behdad Esfahbod |
| 25 */ | 25 */ |
| 26 | 26 |
| 27 #ifndef HB_UTF_PRIVATE_HH | 27 #ifndef HB_UTF_PRIVATE_HH |
| 28 #define HB_UTF_PRIVATE_HH | 28 #define HB_UTF_PRIVATE_HH |
| 29 | 29 |
| 30 #include "hb-private.hh" | 30 #include "hb-private.hh" |
| 31 | 31 |
| 32 template <typename T, bool validate=true> struct hb_utf_t; |
| 33 |
| 32 | 34 |
| 33 /* UTF-8 */ | 35 /* UTF-8 */ |
| 34 | 36 |
| 35 #define HB_UTF8_COMPUTE(Char, Mask, Len) \ | 37 template <> |
| 36 if (Char < 128) { Len = 1; Mask = 0x7f; } \ | 38 struct hb_utf_t<uint8_t, true> |
| 37 else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \ | |
| 38 else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \ | |
| 39 else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \ | |
| 40 else Len = 0; | |
| 41 | |
| 42 static inline const uint8_t * | |
| 43 hb_utf_next (const uint8_t *text, | |
| 44 » const uint8_t *end, | |
| 45 » hb_codepoint_t *unicode) | |
| 46 { | 39 { |
| 47 hb_codepoint_t c = *text, mask; | 40 static inline const uint8_t * |
| 48 unsigned int len; | 41 next (const uint8_t *text, |
| 49 | 42 » const uint8_t *end, |
| 50 /* TODO check for overlong sequences? */ | 43 » hb_codepoint_t *unicode, |
| 51 | 44 » hb_codepoint_t replacement) |
| 52 HB_UTF8_COMPUTE (c, mask, len); | 45 { |
| 53 if (unlikely (!len || (unsigned int) (end - text) < len)) { | 46 /* Written to only accept well-formed sequences. |
| 54 *unicode = -1; | 47 * Based on ideas from ICU's U8_NEXT. |
| 55 return text + 1; | 48 * Generates one "replacement" for each ill-formed byte. */ |
| 56 } else { | 49 |
| 57 hb_codepoint_t result; | 50 hb_codepoint_t c = *text++; |
| 58 unsigned int i; | 51 |
| 59 result = c & mask; | 52 if (c > 0x7Fu) |
| 60 for (i = 1; i < len; i++) | 53 { |
| 61 { | 54 if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ |
| 62 » if (unlikely ((text[i] & 0xc0) != 0x80)) | 55 { |
| 63 » { | 56 » unsigned int t1; |
| 64 » *unicode = -1; | 57 » if (likely (text < end && |
| 65 » return text + 1; | 58 » » (t1 = text[0] - 0x80u) <= 0x3Fu)) |
| 66 » } | 59 » { |
| 67 » result <<= 6; | 60 » c = ((c&0x1Fu)<<6) | t1; |
| 68 » result |= (text[i] & 0x3f); | 61 » text++; |
| 69 } | 62 » } |
| 70 *unicode = result; | 63 » else |
| 71 return text + len; | 64 » goto error; |
| 72 } | 65 } |
| 73 } | 66 else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ |
| 74 | 67 { |
| 75 static inline const uint8_t * | 68 » unsigned int t1, t2; |
| 76 hb_utf_prev (const uint8_t *text, | 69 » if (likely (1 < end - text && |
| 77 » const uint8_t *start, | 70 » » (t1 = text[0] - 0x80u) <= 0x3Fu && |
| 78 » hb_codepoint_t *unicode) | 71 » » (t2 = text[1] - 0x80u) <= 0x3Fu)) |
| 72 » { |
| 73 » c = ((c&0xFu)<<12) | (t1<<6) | t2; |
| 74 » if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu))) |
| 75 » goto error; |
| 76 » text += 2; |
| 77 » } |
| 78 » else |
| 79 » goto error; |
| 80 } |
| 81 else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ |
| 82 { |
| 83 » unsigned int t1, t2, t3; |
| 84 » if (likely (2 < end - text && |
| 85 » » (t1 = text[0] - 0x80u) <= 0x3Fu && |
| 86 » » (t2 = text[1] - 0x80u) <= 0x3Fu && |
| 87 » » (t3 = text[2] - 0x80u) <= 0x3Fu)) |
| 88 » { |
| 89 » c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
| 90 » if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu))) |
| 91 » goto error; |
| 92 » text += 3; |
| 93 » } |
| 94 » else |
| 95 » goto error; |
| 96 } |
| 97 else |
| 98 » goto error; |
| 99 } |
| 100 |
| 101 *unicode = c; |
| 102 return text; |
| 103 |
| 104 error: |
| 105 *unicode = replacement; |
| 106 return text; |
| 107 } |
| 108 |
| 109 static inline const uint8_t * |
| 110 prev (const uint8_t *text, |
| 111 » const uint8_t *start, |
| 112 » hb_codepoint_t *unicode, |
| 113 » hb_codepoint_t replacement) |
| 114 { |
| 115 const uint8_t *end = text--; |
| 116 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
| 117 text--; |
| 118 |
| 119 if (likely (next (text, end, unicode, replacement) == end)) |
| 120 return text; |
| 121 |
| 122 *unicode = replacement; |
| 123 return end - 1; |
| 124 } |
| 125 |
| 126 static inline unsigned int |
| 127 strlen (const uint8_t *text) |
| 128 { |
| 129 return ::strlen ((const char *) text); |
| 130 } |
| 131 }; |
| 132 |
| 133 |
| 134 /* UTF-16 */ |
| 135 |
| 136 template <> |
| 137 struct hb_utf_t<uint16_t, true> |
| 79 { | 138 { |
| 80 const uint8_t *end = text--; | 139 static inline const uint16_t * |
| 81 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) | 140 next (const uint16_t *text, |
| 82 text--; | 141 » const uint16_t *end, |
| 83 | 142 » hb_codepoint_t *unicode, |
| 84 hb_codepoint_t c = *text, mask; | 143 » hb_codepoint_t replacement) |
| 85 unsigned int len; | 144 { |
| 86 | 145 hb_codepoint_t c = *text++; |
| 87 /* TODO check for overlong sequences? */ | 146 |
| 88 | 147 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) |
| 89 HB_UTF8_COMPUTE (c, mask, len); | 148 { |
| 90 if (unlikely (!len || (unsigned int) (end - text) != len)) { | 149 *unicode = c; |
| 91 *unicode = -1; | 150 return text; |
| 151 } |
| 152 |
| 153 if (likely (hb_in_range (c, 0xD800u, 0xDBFFu))) |
| 154 { |
| 155 /* High-surrogate in c */ |
| 156 hb_codepoint_t l; |
| 157 if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))) |
| 158 { |
| 159 » /* Low-surrogate in l */ |
| 160 » *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
| 161 » text++; |
| 162 » return text; |
| 163 } |
| 164 } |
| 165 |
| 166 /* Lonely / out-of-order surrogate. */ |
| 167 *unicode = replacement; |
| 168 return text; |
| 169 } |
| 170 |
| 171 static inline const uint16_t * |
| 172 prev (const uint16_t *text, |
| 173 » const uint16_t *start, |
| 174 » hb_codepoint_t *unicode, |
| 175 » hb_codepoint_t replacement) |
| 176 { |
| 177 const uint16_t *end = text--; |
| 178 hb_codepoint_t c = *text; |
| 179 |
| 180 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) |
| 181 { |
| 182 *unicode = c; |
| 183 return text; |
| 184 } |
| 185 |
| 186 if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) |
| 187 text--; |
| 188 |
| 189 if (likely (next (text, end, unicode, replacement) == end)) |
| 190 return text; |
| 191 |
| 192 *unicode = replacement; |
| 92 return end - 1; | 193 return end - 1; |
| 93 } else { | 194 } |
| 94 hb_codepoint_t result; | 195 |
| 95 unsigned int i; | 196 |
| 96 result = c & mask; | 197 static inline unsigned int |
| 97 for (i = 1; i < len; i++) | 198 strlen (const uint16_t *text) |
| 98 { | 199 { |
| 99 » result <<= 6; | 200 unsigned int l = 0; |
| 100 » result |= (text[i] & 0x3f); | 201 while (*text++) l++; |
| 101 } | 202 return l; |
| 102 *unicode = result; | 203 } |
| 103 return text; | 204 }; |
| 104 } | 205 |
| 105 } | 206 |
| 106 | 207 /* UTF-32 */ |
| 107 | 208 |
| 108 static inline unsigned int | 209 template <bool validate> |
| 109 hb_utf_strlen (const uint8_t *text) | 210 struct hb_utf_t<uint32_t, validate> |
| 110 { | 211 { |
| 111 return strlen ((const char *) text); | 212 static inline const uint32_t * |
| 112 } | 213 next (const uint32_t *text, |
| 113 | 214 » const uint32_t *end HB_UNUSED, |
| 114 | 215 » hb_codepoint_t *unicode, |
| 115 /* UTF-16 */ | 216 » hb_codepoint_t replacement) |
| 116 | 217 { |
| 117 static inline const uint16_t * | 218 hb_codepoint_t c = *text++; |
| 118 hb_utf_next (const uint16_t *text, | 219 if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) |
| 119 » const uint16_t *end, | 220 goto error; |
| 120 » hb_codepoint_t *unicode) | |
| 121 { | |
| 122 hb_codepoint_t c = *text++; | |
| 123 | |
| 124 if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xd800, 0xdbff))) | |
| 125 { | |
| 126 /* high surrogate */ | |
| 127 hb_codepoint_t l; | |
| 128 if (text < end && ((l = *text), likely (hb_in_range<hb_codepoint_t> (l, 0xdc00, 0xdfff)))) | |
| 129 { | |
| 130 /* low surrogate */ | |
| 131 *unicode = (c << 10) + l - ((0xd800 << 10) - 0x10000 + 0xdc00); | |
| 132 text++; | |
| 133 } else | |
| 134 *unicode = -1; | |
| 135 } else | |
| 136 *unicode = c; | 221 *unicode = c; |
| 137 | 222 return text; |
| 138 return text; | 223 |
| 139 } | 224 error: |
| 140 | 225 *unicode = replacement; |
| 141 static inline const uint16_t * | 226 return text; |
| 142 hb_utf_prev (const uint16_t *text, | 227 } |
| 143 » const uint16_t *start, | 228 |
| 144 » hb_codepoint_t *unicode) | 229 static inline const uint32_t * |
| 145 { | 230 prev (const uint32_t *text, |
| 146 hb_codepoint_t c = *--text; | 231 » const uint32_t *start HB_UNUSED, |
| 147 | 232 » hb_codepoint_t *unicode, |
| 148 if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xdc00, 0xdfff))) | 233 » hb_codepoint_t replacement) |
| 149 { | 234 { |
| 150 /* low surrogate */ | 235 next (text - 1, text, unicode, replacement); |
| 151 hb_codepoint_t h; | 236 return text - 1; |
| 152 if (start < text && ((h = *(text - 1)), likely (hb_in_range<hb_codepoint_t> (h, 0xd800, 0xdbff)))) | 237 } |
| 153 { | 238 |
| 154 /* high surrogate */ | 239 static inline unsigned int |
| 155 *unicode = (h << 10) + c - ((0xd800 << 10) - 0x10000 + 0xdc00); | 240 strlen (const uint32_t *text) |
| 156 text--; | 241 { |
| 157 } else | 242 unsigned int l = 0; |
| 158 *unicode = -1; | 243 while (*text++) l++; |
| 159 } else | 244 return l; |
| 160 *unicode = c; | 245 } |
| 161 | 246 }; |
| 162 return text; | |
| 163 } | |
| 164 | |
| 165 | |
| 166 static inline unsigned int | |
| 167 hb_utf_strlen (const uint16_t *text) | |
| 168 { | |
| 169 unsigned int l = 0; | |
| 170 while (*text++) l++; | |
| 171 return l; | |
| 172 } | |
| 173 | |
| 174 | |
| 175 /* UTF-32 */ | |
| 176 | |
| 177 static inline const uint32_t * | |
| 178 hb_utf_next (const uint32_t *text, | |
| 179 » const uint32_t *end HB_UNUSED, | |
| 180 » hb_codepoint_t *unicode) | |
| 181 { | |
| 182 *unicode = *text++; | |
| 183 return text; | |
| 184 } | |
| 185 | |
| 186 static inline const uint32_t * | |
| 187 hb_utf_prev (const uint32_t *text, | |
| 188 » const uint32_t *start HB_UNUSED, | |
| 189 » hb_codepoint_t *unicode) | |
| 190 { | |
| 191 *unicode = *--text; | |
| 192 return text; | |
| 193 } | |
| 194 | |
| 195 static inline unsigned int | |
| 196 hb_utf_strlen (const uint32_t *text) | |
| 197 { | |
| 198 unsigned int l = 0; | |
| 199 while (*text++) l++; | |
| 200 return l; | |
| 201 } | |
| 202 | 247 |
| 203 | 248 |
| 204 #endif /* HB_UTF_PRIVATE_HH */ | 249 #endif /* HB_UTF_PRIVATE_HH */ |
| OLD | NEW |