OLD | NEW |
| (Empty) |
1 /* | |
2 ********************************************************************** | |
3 * Copyright (C) 2005-2010, International Business Machines | |
4 * Corporation and others. All Rights Reserved. | |
5 ********************************************************************** | |
6 * file name: ucsdet.h | |
7 * encoding: US-ASCII | |
8 * indentation:4 | |
9 * | |
10 * created on: 2005Aug04 | |
11 * created by: Andy Heninger | |
12 * | |
13 * ICU Character Set Detection, API for C | |
14 * | |
15 * Draft version 18 Oct 2005 | |
16 * | |
17 */ | |
18 | |
19 #ifndef __UCSDET_H | |
20 #define __UCSDET_H | |
21 | |
22 #include "unicode/utypes.h" | |
23 | |
24 #if !UCONFIG_NO_CONVERSION | |
25 | |
26 #include "unicode/localpointer.h" | |
27 #include "unicode/uenum.h" | |
28 | |
29 /** | |
30 * \file | |
31 * \brief C API: Charset Detection API | |
32 * | |
33 * This API provides a facility for detecting the | |
34 * charset or encoding of character data in an unknown text format. | |
35 * The input data can be from an array of bytes. | |
36 * <p> | |
37 * Character set detection is at best an imprecise operation. The detection | |
38 * process will attempt to identify the charset that best matches the characteri
stics | |
39 * of the byte data, but the process is partly statistical in nature, and | |
40 * the results cannot be guaranteed to always be correct. | |
41 * <p> | |
42 * For best accuracy in charset detection, the input data should be primarily | |
43 * in a single language, and a minimum of a few hundred bytes worth of plain tex
t | |
44 * in the language are needed. The detection process will attempt to | |
45 * ignore HTML or XML style markup that could otherwise obscure the content. | |
46 */ | |
47 | |
48 | |
49 struct UCharsetDetector; | |
50 /** | |
51 * Structure representing a charset detector | |
52 * @stable ICU 3.6 | |
53 */ | |
54 typedef struct UCharsetDetector UCharsetDetector; | |
55 | |
56 struct UCharsetMatch; | |
57 /** | |
58 * Opaque structure representing a match that was identified | |
59 * from a charset detection operation. | |
60 * @stable ICU 3.6 | |
61 */ | |
62 typedef struct UCharsetMatch UCharsetMatch; | |
63 | |
64 /** | |
65 * Open a charset detector. | |
66 * | |
67 * @param status Any error conditions occurring during the open | |
68 * operation are reported back in this variable. | |
69 * @return the newly opened charset detector. | |
70 * @stable ICU 3.6 | |
71 */ | |
72 U_STABLE UCharsetDetector * U_EXPORT2 | |
73 ucsdet_open(UErrorCode *status); | |
74 | |
75 /** | |
76 * Close a charset detector. All storage and any other resources | |
77 * owned by this charset detector will be released. Failure to | |
78 * close a charset detector when finished with it can result in | |
79 * memory leaks in the application. | |
80 * | |
81 * @param ucsd The charset detector to be closed. | |
82 * @stable ICU 3.6 | |
83 */ | |
84 U_STABLE void U_EXPORT2 | |
85 ucsdet_close(UCharsetDetector *ucsd); | |
86 | |
87 #if U_SHOW_CPLUSPLUS_API | |
88 | |
89 U_NAMESPACE_BEGIN | |
90 | |
91 /** | |
92 * \class LocalUCharsetDetectorPointer | |
93 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). | |
94 * For most methods see the LocalPointerBase base class. | |
95 * | |
96 * @see LocalPointerBase | |
97 * @see LocalPointer | |
98 * @stable ICU 4.4 | |
99 */ | |
100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsd
et_close); | |
101 | |
102 U_NAMESPACE_END | |
103 | |
104 #endif | |
105 | |
106 /** | |
107 * Set the input byte data whose charset is to be detected. | |
108 * | |
109 * Ownership of the input text byte array remains with the caller. | |
110 * The input string must not be altered or deleted until the charset | |
111 * detector is either closed or reset to refer to different input text. | |
112 * | |
113 * @param ucsd the charset detector to be used. | |
114 * @param textIn the input text of unknown encoding. | |
115 * @param len the length of the input text, or -1 if the text | |
116 * is NUL terminated. | |
117 * @param status any error conditions are reported back in this variable. | |
118 * | |
119 * @stable ICU 3.6 | |
120 */ | |
121 U_STABLE void U_EXPORT2 | |
122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCo
de *status); | |
123 | |
124 | |
125 /** Set the declared encoding for charset detection. | |
126 * The declared encoding of an input text is an encoding obtained | |
127 * by the user from an HTTP header or XML declaration or similar source that | |
128 * can be provided as an additional hint to the charset detector. | |
129 * | |
130 * How and whether the declared encoding will be used during the | |
131 * detection process is TBD. | |
132 * | |
133 * @param ucsd the charset detector to be used. | |
134 * @param encoding an encoding for the current data obtained from | |
135 * a header or declaration or other source outside | |
136 * of the byte data itself. | |
137 * @param length the length of the encoding name, or -1 if the name string | |
138 * is NUL terminated. | |
139 * @param status any error conditions are reported back in this variable. | |
140 * | |
141 * @stable ICU 3.6 | |
142 */ | |
143 U_STABLE void U_EXPORT2 | |
144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t
length, UErrorCode *status); | |
145 | |
146 | |
147 /** | |
148 * Return the charset that best matches the supplied input data. | |
149 * | |
150 * Note though, that because the detection | |
151 * only looks at the start of the input data, | |
152 * there is a possibility that the returned charset will fail to handle | |
153 * the full set of input data. | |
154 * <p> | |
155 * The returned UCharsetMatch object is owned by the UCharsetDetector. | |
156 * It will remain valid until the detector input is reset, or until | |
157 * the detector is closed. | |
158 * <p> | |
159 * The function will fail if | |
160 * <ul> | |
161 * <li>no charset appears to match the data.</li> | |
162 * <li>no input text has been provided</li> | |
163 * </ul> | |
164 * | |
165 * @param ucsd the charset detector to be used. | |
166 * @param status any error conditions are reported back in this variable. | |
167 * @return a UCharsetMatch representing the best matching charset, | |
168 * or NULL if no charset matches the byte data. | |
169 * | |
170 * @stable ICU 3.6 | |
171 */ | |
172 U_STABLE const UCharsetMatch * U_EXPORT2 | |
173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); | |
174 | |
175 | |
176 /** | |
177 * Find all charset matches that appear to be consistent with the input, | |
178 * returning an array of results. The results are ordered with the | |
179 * best quality match first. | |
180 * | |
181 * Because the detection only looks at a limited amount of the | |
182 * input byte data, some of the returned charsets may fail to handle | |
183 * all of the input data. | |
184 * <p> | |
185 * The returned UCharsetMatch objects are owned by the UCharsetDetector. | |
186 * They will remain valid until the detector is closed or modified. | |
187 * | |
188 * <p> | |
189 * Return an error if | |
190 * <ul> | |
191 * <li>no charsets appear to match the input data.</li> | |
192 * <li>no input text has been provided</li> | |
193 * </ul> | |
194 * | |
195 * @param ucsd the charset detector to be used. | |
196 * @param matchesFound pointer to a variable that will be set to the | |
197 * number of charsets identified that are consistent with | |
198 * the input data. Output only. | |
199 * @param status any error conditions are reported back in this variable. | |
200 * @return A pointer to an array of pointers to UCharsetMatch objec
ts. | |
201 * This array, and the UCharsetMatch instances to which it
refers, | |
202 * are owned by the UCharsetDetector, and will remain valid
until | |
203 * the detector is closed or modified. | |
204 * @stable ICU 3.6 | |
205 */ | |
206 U_STABLE const UCharsetMatch ** U_EXPORT2 | |
207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *stat
us); | |
208 | |
209 | |
210 | |
211 /** | |
212 * Get the name of the charset represented by a UCharsetMatch. | |
213 * | |
214 * The storage for the returned name string is owned by the | |
215 * UCharsetMatch, and will remain valid while the UCharsetMatch | |
216 * is valid. | |
217 * | |
218 * The name returned is suitable for use with the ICU conversion APIs. | |
219 * | |
220 * @param ucsm The charset match object. | |
221 * @param status Any error conditions are reported back in this variable. | |
222 * @return The name of the matching charset. | |
223 * | |
224 * @stable ICU 3.6 | |
225 */ | |
226 U_STABLE const char * U_EXPORT2 | |
227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); | |
228 | |
229 /** | |
230 * Get a confidence number for the quality of the match of the byte | |
231 * data with the charset. Confidence numbers range from zero to 100, | |
232 * with 100 representing complete confidence and zero representing | |
233 * no confidence. | |
234 * | |
235 * The confidence values are somewhat arbitrary. They define | |
236 * an ordering within the results for any single detection operation | |
237 * but are not generally comparable between the results for different input. | |
238 * | |
239 * A confidence value of ten does have a general meaning - it is used | |
240 * for charsets that can represent the input data, but for which there | |
241 * is no other indication that suggests that the charset is the correct one. | |
242 * Pure 7 bit ASCII data, for example, is compatible with a | |
243 * great many charsets, most of which will appear as possible matches | |
244 * with a confidence of 10. | |
245 * | |
246 * @param ucsm The charset match object. | |
247 * @param status Any error conditions are reported back in this variable. | |
248 * @return A confidence number for the charset match. | |
249 * | |
250 * @stable ICU 3.6 | |
251 */ | |
252 U_STABLE int32_t U_EXPORT2 | |
253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); | |
254 | |
255 /** | |
256 * Get the RFC 3066 code for the language of the input data. | |
257 * | |
258 * The Charset Detection service is intended primarily for detecting | |
259 * charsets, not language. For some, but not all, charsets, a language is | |
260 * identified as a byproduct of the detection process, and that is what | |
261 * is returned by this function. | |
262 * | |
263 * CAUTION: | |
264 * 1. Language information is not available for input data encoded in | |
265 * all charsets. In particular, no language is identified | |
266 * for UTF-8 input data. | |
267 * | |
268 * 2. Closely related languages may sometimes be confused. | |
269 * | |
270 * If more accurate language detection is required, a linguistic | |
271 * analysis package should be used. | |
272 * | |
273 * The storage for the returned name string is owned by the | |
274 * UCharsetMatch, and will remain valid while the UCharsetMatch | |
275 * is valid. | |
276 * | |
277 * @param ucsm The charset match object. | |
278 * @param status Any error conditions are reported back in this variable. | |
279 * @return The RFC 3066 code for the language of the input data, or | |
280 * an empty string if the language could not be determined. | |
281 * | |
282 * @stable ICU 3.6 | |
283 */ | |
284 U_STABLE const char * U_EXPORT2 | |
285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); | |
286 | |
287 | |
288 /** | |
289 * Get the entire input text as a UChar string, placing it into | |
290 * a caller-supplied buffer. A terminating | |
291 * NUL character will be appended to the buffer if space is available. | |
292 * | |
293 * The number of UChars in the output string, not including the terminating | |
294 * NUL, is returned. | |
295 * | |
296 * If the supplied buffer is smaller than required to hold the output, | |
297 * the contents of the buffer are undefined. The full output string length | |
298 * (in UChars) is returned as always, and can be used to allocate a buffer | |
299 * of the correct size. | |
300 * | |
301 * | |
302 * @param ucsm The charset match object. | |
303 * @param buf A UChar buffer to be filled with the converted text data. | |
304 * @param cap The capacity of the buffer in UChars. | |
305 * @param status Any error conditions are reported back in this variable. | |
306 * @return The number of UChars in the output string. | |
307 * | |
308 * @stable ICU 3.6 | |
309 */ | |
310 U_STABLE int32_t U_EXPORT2 | |
311 ucsdet_getUChars(const UCharsetMatch *ucsm, | |
312 UChar *buf, int32_t cap, UErrorCode *status); | |
313 | |
314 | |
315 | |
316 /** | |
317 * Get an iterator over the set of all detectable charsets - | |
318 * over the charsets that are known to the charset detection | |
319 * service. | |
320 * | |
321 * The returned UEnumeration provides access to the names of | |
322 * the charsets. | |
323 * | |
324 * The state of the Charset detector that is passed in does not | |
325 * affect the result of this function, but requiring a valid, open | |
326 * charset detector as a parameter ensures that the charset detection | |
327 * service has been safely initialized and that the required detection | |
328 * data is available. | |
329 * | |
330 * @param ucsd a Charset detector. | |
331 * @param status Any error conditions are reported back in this variable. | |
332 * @return an iterator providing access to the detectable charset names. | |
333 * @stable ICU 3.6 | |
334 */ | |
335 U_STABLE UEnumeration * U_EXPORT2 | |
336 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *statu
s); | |
337 | |
338 | |
339 /** | |
340 * Test whether input filtering is enabled for this charset detector. | |
341 * Input filtering removes text that appears to be HTML or XML | |
342 * markup from the input before applying the code page detection | |
343 * heuristics. | |
344 * | |
345 * @param ucsd The charset detector to check. | |
346 * @return TRUE if filtering is enabled. | |
347 * @stable ICU 3.6 | |
348 */ | |
349 U_STABLE UBool U_EXPORT2 | |
350 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); | |
351 | |
352 | |
353 /** | |
354 * Enable filtering of input text. If filtering is enabled, | |
355 * text within angle brackets ("<" and ">") will be removed | |
356 * before detection, which will remove most HTML or XML markup. | |
357 * | |
358 * @param ucsd the charset detector to be modified. | |
359 * @param filter <code>TRUE</code> to enable input text filtering. | |
360 * @return The previous setting. | |
361 * | |
362 * @stable ICU 3.6 | |
363 */ | |
364 U_STABLE UBool U_EXPORT2 | |
365 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); | |
366 | |
367 #endif | |
368 #endif /* __UCSDET_H */ | |
369 | |
370 | |
OLD | NEW |