OLD | NEW |
1 /* | 1 /* |
2 * Copyright © 2011,2012 Google, Inc. | 2 * Copyright © 2011,2012,2014 Google, Inc. |
3 * | 3 * |
4 * This is part of HarfBuzz, a text shaping library. | 4 * This is part of HarfBuzz, a text shaping library. |
5 * | 5 * |
6 * Permission is hereby granted, without written agreement and without | 6 * Permission is hereby granted, without written agreement and without |
7 * license or royalty fees, to use, copy, modify, and distribute this | 7 * license or royalty fees, to use, copy, modify, and distribute this |
8 * software and its documentation for any purpose, provided that the | 8 * software and its documentation for any purpose, provided that the |
9 * above copyright notice and the following two paragraphs appear in | 9 * above copyright notice and the following two paragraphs appear in |
10 * all copies of this software. | 10 * all copies of this software. |
11 * | 11 * |
12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR | 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR |
13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES | 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN | 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN |
15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH | 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH |
16 * DAMAGE. | 16 * DAMAGE. |
17 * | 17 * |
18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, | 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, |
19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND | 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND |
20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS | 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS |
21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO | 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO |
22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. | 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. |
23 * | 23 * |
24 * Google Author(s): Behdad Esfahbod | 24 * Google Author(s): Behdad Esfahbod |
25 */ | 25 */ |
26 | 26 |
27 #ifndef HB_UTF_PRIVATE_HH | 27 #ifndef HB_UTF_PRIVATE_HH |
28 #define HB_UTF_PRIVATE_HH | 28 #define HB_UTF_PRIVATE_HH |
29 | 29 |
30 #include "hb-private.hh" | 30 #include "hb-private.hh" |
31 | 31 |
| 32 template <typename T, bool validate=true> struct hb_utf_t; |
| 33 |
32 | 34 |
33 /* UTF-8 */ | 35 /* UTF-8 */ |
34 | 36 |
35 #define HB_UTF8_COMPUTE(Char, Mask, Len) \ | 37 template <> |
36 if (Char < 128) { Len = 1; Mask = 0x7f; } \ | 38 struct hb_utf_t<uint8_t, true> |
37 else if ((Char & 0xe0) == 0xc0) { Len = 2; Mask = 0x1f; } \ | |
38 else if ((Char & 0xf0) == 0xe0) { Len = 3; Mask = 0x0f; } \ | |
39 else if ((Char & 0xf8) == 0xf0) { Len = 4; Mask = 0x07; } \ | |
40 else Len = 0; | |
41 | |
42 static inline const uint8_t * | |
43 hb_utf_next (const uint8_t *text, | |
44 » const uint8_t *end, | |
45 » hb_codepoint_t *unicode) | |
46 { | 39 { |
47 hb_codepoint_t c = *text, mask; | 40 static inline const uint8_t * |
48 unsigned int len; | 41 next (const uint8_t *text, |
49 | 42 » const uint8_t *end, |
50 /* TODO check for overlong sequences? */ | 43 » hb_codepoint_t *unicode, |
51 | 44 » hb_codepoint_t replacement) |
52 HB_UTF8_COMPUTE (c, mask, len); | 45 { |
53 if (unlikely (!len || (unsigned int) (end - text) < len)) { | 46 /* Written to only accept well-formed sequences. |
54 *unicode = -1; | 47 * Based on ideas from ICU's U8_NEXT. |
55 return text + 1; | 48 * Generates one "replacement" for each ill-formed byte. */ |
56 } else { | 49 |
57 hb_codepoint_t result; | 50 hb_codepoint_t c = *text++; |
58 unsigned int i; | 51 |
59 result = c & mask; | 52 if (c > 0x7Fu) |
60 for (i = 1; i < len; i++) | 53 { |
61 { | 54 if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ |
62 » if (unlikely ((text[i] & 0xc0) != 0x80)) | 55 { |
63 » { | 56 » unsigned int t1; |
64 » *unicode = -1; | 57 » if (likely (text < end && |
65 » return text + 1; | 58 » » (t1 = text[0] - 0x80u) <= 0x3Fu)) |
66 » } | 59 » { |
67 » result <<= 6; | 60 » c = ((c&0x1Fu)<<6) | t1; |
68 » result |= (text[i] & 0x3f); | 61 » text++; |
69 } | 62 » } |
70 *unicode = result; | 63 » else |
71 return text + len; | 64 » goto error; |
72 } | 65 } |
73 } | 66 else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ |
74 | 67 { |
75 static inline const uint8_t * | 68 » unsigned int t1, t2; |
76 hb_utf_prev (const uint8_t *text, | 69 » if (likely (1 < end - text && |
77 » const uint8_t *start, | 70 » » (t1 = text[0] - 0x80u) <= 0x3Fu && |
78 » hb_codepoint_t *unicode) | 71 » » (t2 = text[1] - 0x80u) <= 0x3Fu)) |
| 72 » { |
| 73 » c = ((c&0xFu)<<12) | (t1<<6) | t2; |
| 74 » if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu))) |
| 75 » goto error; |
| 76 » text += 2; |
| 77 » } |
| 78 » else |
| 79 » goto error; |
| 80 } |
| 81 else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ |
| 82 { |
| 83 » unsigned int t1, t2, t3; |
| 84 » if (likely (2 < end - text && |
| 85 » » (t1 = text[0] - 0x80u) <= 0x3Fu && |
| 86 » » (t2 = text[1] - 0x80u) <= 0x3Fu && |
| 87 » » (t3 = text[2] - 0x80u) <= 0x3Fu)) |
| 88 » { |
| 89 » c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; |
| 90 » if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu))) |
| 91 » goto error; |
| 92 » text += 3; |
| 93 » } |
| 94 » else |
| 95 » goto error; |
| 96 } |
| 97 else |
| 98 » goto error; |
| 99 } |
| 100 |
| 101 *unicode = c; |
| 102 return text; |
| 103 |
| 104 error: |
| 105 *unicode = replacement; |
| 106 return text; |
| 107 } |
| 108 |
| 109 static inline const uint8_t * |
| 110 prev (const uint8_t *text, |
| 111 » const uint8_t *start, |
| 112 » hb_codepoint_t *unicode, |
| 113 » hb_codepoint_t replacement) |
| 114 { |
| 115 const uint8_t *end = text--; |
| 116 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) |
| 117 text--; |
| 118 |
| 119 if (likely (next (text, end, unicode, replacement) == end)) |
| 120 return text; |
| 121 |
| 122 *unicode = replacement; |
| 123 return end - 1; |
| 124 } |
| 125 |
| 126 static inline unsigned int |
| 127 strlen (const uint8_t *text) |
| 128 { |
| 129 return ::strlen ((const char *) text); |
| 130 } |
| 131 }; |
| 132 |
| 133 |
| 134 /* UTF-16 */ |
| 135 |
| 136 template <> |
| 137 struct hb_utf_t<uint16_t, true> |
79 { | 138 { |
80 const uint8_t *end = text--; | 139 static inline const uint16_t * |
81 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) | 140 next (const uint16_t *text, |
82 text--; | 141 » const uint16_t *end, |
83 | 142 » hb_codepoint_t *unicode, |
84 hb_codepoint_t c = *text, mask; | 143 » hb_codepoint_t replacement) |
85 unsigned int len; | 144 { |
86 | 145 hb_codepoint_t c = *text++; |
87 /* TODO check for overlong sequences? */ | 146 |
88 | 147 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) |
89 HB_UTF8_COMPUTE (c, mask, len); | 148 { |
90 if (unlikely (!len || (unsigned int) (end - text) != len)) { | 149 *unicode = c; |
91 *unicode = -1; | 150 return text; |
| 151 } |
| 152 |
| 153 if (likely (hb_in_range (c, 0xD800u, 0xDBFFu))) |
| 154 { |
| 155 /* High-surrogate in c */ |
| 156 hb_codepoint_t l; |
| 157 if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))) |
| 158 { |
| 159 » /* Low-surrogate in l */ |
| 160 » *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); |
| 161 » text++; |
| 162 » return text; |
| 163 } |
| 164 } |
| 165 |
| 166 /* Lonely / out-of-order surrogate. */ |
| 167 *unicode = replacement; |
| 168 return text; |
| 169 } |
| 170 |
| 171 static inline const uint16_t * |
| 172 prev (const uint16_t *text, |
| 173 » const uint16_t *start, |
| 174 » hb_codepoint_t *unicode, |
| 175 » hb_codepoint_t replacement) |
| 176 { |
| 177 const uint16_t *end = text--; |
| 178 hb_codepoint_t c = *text; |
| 179 |
| 180 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) |
| 181 { |
| 182 *unicode = c; |
| 183 return text; |
| 184 } |
| 185 |
| 186 if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) |
| 187 text--; |
| 188 |
| 189 if (likely (next (text, end, unicode, replacement) == end)) |
| 190 return text; |
| 191 |
| 192 *unicode = replacement; |
92 return end - 1; | 193 return end - 1; |
93 } else { | 194 } |
94 hb_codepoint_t result; | 195 |
95 unsigned int i; | 196 |
96 result = c & mask; | 197 static inline unsigned int |
97 for (i = 1; i < len; i++) | 198 strlen (const uint16_t *text) |
98 { | 199 { |
99 » result <<= 6; | 200 unsigned int l = 0; |
100 » result |= (text[i] & 0x3f); | 201 while (*text++) l++; |
101 } | 202 return l; |
102 *unicode = result; | 203 } |
103 return text; | 204 }; |
104 } | 205 |
105 } | 206 |
106 | 207 /* UTF-32 */ |
107 | 208 |
108 static inline unsigned int | 209 template <bool validate> |
109 hb_utf_strlen (const uint8_t *text) | 210 struct hb_utf_t<uint32_t, validate> |
110 { | 211 { |
111 return strlen ((const char *) text); | 212 static inline const uint32_t * |
112 } | 213 next (const uint32_t *text, |
113 | 214 » const uint32_t *end HB_UNUSED, |
114 | 215 » hb_codepoint_t *unicode, |
115 /* UTF-16 */ | 216 » hb_codepoint_t replacement) |
116 | 217 { |
117 static inline const uint16_t * | 218 hb_codepoint_t c = *text++; |
118 hb_utf_next (const uint16_t *text, | 219 if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) |
119 » const uint16_t *end, | 220 goto error; |
120 » hb_codepoint_t *unicode) | |
121 { | |
122 hb_codepoint_t c = *text++; | |
123 | |
124 if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xd800, 0xdbff))) | |
125 { | |
126 /* high surrogate */ | |
127 hb_codepoint_t l; | |
128 if (text < end && ((l = *text), likely (hb_in_range<hb_codepoint_t> (l, 0xdc00, 0xdfff)))) | |
129 { | |
130 /* low surrogate */ | |
131 *unicode = (c << 10) + l - ((0xd800 << 10) - 0x10000 + 0xdc00); | |
132 text++; | |
133 } else | |
134 *unicode = -1; | |
135 } else | |
136 *unicode = c; | 221 *unicode = c; |
137 | 222 return text; |
138 return text; | 223 |
139 } | 224 error: |
140 | 225 *unicode = replacement; |
141 static inline const uint16_t * | 226 return text; |
142 hb_utf_prev (const uint16_t *text, | 227 } |
143 » const uint16_t *start, | 228 |
144 » hb_codepoint_t *unicode) | 229 static inline const uint32_t * |
145 { | 230 prev (const uint32_t *text, |
146 hb_codepoint_t c = *--text; | 231 » const uint32_t *start HB_UNUSED, |
147 | 232 » hb_codepoint_t *unicode, |
148 if (unlikely (hb_in_range<hb_codepoint_t> (c, 0xdc00, 0xdfff))) | 233 » hb_codepoint_t replacement) |
149 { | 234 { |
150 /* low surrogate */ | 235 next (text - 1, text, unicode, replacement); |
151 hb_codepoint_t h; | 236 return text - 1; |
152 if (start < text && ((h = *(text - 1)), likely (hb_in_range<hb_codepoint_t> (h, 0xd800, 0xdbff)))) | 237 } |
153 { | 238 |
154 /* high surrogate */ | 239 static inline unsigned int |
155 *unicode = (h << 10) + c - ((0xd800 << 10) - 0x10000 + 0xdc00); | 240 strlen (const uint32_t *text) |
156 text--; | 241 { |
157 } else | 242 unsigned int l = 0; |
158 *unicode = -1; | 243 while (*text++) l++; |
159 } else | 244 return l; |
160 *unicode = c; | 245 } |
161 | 246 }; |
162 return text; | |
163 } | |
164 | |
165 | |
166 static inline unsigned int | |
167 hb_utf_strlen (const uint16_t *text) | |
168 { | |
169 unsigned int l = 0; | |
170 while (*text++) l++; | |
171 return l; | |
172 } | |
173 | |
174 | |
175 /* UTF-32 */ | |
176 | |
177 static inline const uint32_t * | |
178 hb_utf_next (const uint32_t *text, | |
179 » const uint32_t *end HB_UNUSED, | |
180 » hb_codepoint_t *unicode) | |
181 { | |
182 *unicode = *text++; | |
183 return text; | |
184 } | |
185 | |
186 static inline const uint32_t * | |
187 hb_utf_prev (const uint32_t *text, | |
188 » const uint32_t *start HB_UNUSED, | |
189 » hb_codepoint_t *unicode) | |
190 { | |
191 *unicode = *--text; | |
192 return text; | |
193 } | |
194 | |
195 static inline unsigned int | |
196 hb_utf_strlen (const uint32_t *text) | |
197 { | |
198 unsigned int l = 0; | |
199 while (*text++) l++; | |
200 return l; | |
201 } | |
202 | 247 |
203 | 248 |
204 #endif /* HB_UTF_PRIVATE_HH */ | 249 #endif /* HB_UTF_PRIVATE_HH */ |
OLD | NEW |