icu46/source/i18n/unicode/ucsdet.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Side by Side Diff: icu46/source/i18n/unicode/ucsdet.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 /*

	2 **********************************************************************

	3 * Copyright (C) 2005-2010, International Business Machines

	4 * Corporation and others. All Rights Reserved.

	5 **********************************************************************

	6 * file name: ucsdet.h

	7 * encoding: US-ASCII

	8 * indentation:4

	9 *

	10 * created on: 2005Aug04

	11 * created by: Andy Heninger

	12 *

	13 * ICU Character Set Detection, API for C

	14 *

	15 * Draft version 18 Oct 2005

	16 *

	17 */

	18

	19 #ifndef __UCSDET_H

	20 #define __UCSDET_H

	21

	22 #include "unicode/utypes.h"

	23

	24 #if !UCONFIG_NO_CONVERSION

	25

	26 #include "unicode/localpointer.h"

	27 #include "unicode/uenum.h"

	28

	29 /**

	30 * \file

	31 * \brief C API: Charset Detection API

	32 *

	33 * This API provides a facility for detecting the

	34 * charset or encoding of character data in an unknown text format.

	35 * The input data can be from an array of bytes.

	36 * <p>

	37 * Character set detection is at best an imprecise operation. The detection

	38 * process will attempt to identify the charset that best matches the characteristics

	39 * of the byte data, but the process is partly statistical in nature, and

	40 * the results can not be guaranteed to always be correct.

	41 * <p>

	42 * For best accuracy in charset detection, the input data should be primarily

	43 * in a single language, and a minimum of a few hundred bytes worth of plain text

	44 * in the language are needed. The detection process will attempt to

	45 * ignore html or xml style markup that could otherwise obscure the content.

	46 */

	47

	48

	49 struct UCharsetDetector;

	50 /**

	51 * Opaque structure representing a charset detector.

	52 * @stable ICU 3.6

	53 */

	54 typedef struct UCharsetDetector UCharsetDetector;

	55

	56 struct UCharsetMatch;

	57 /**

	58 * Opaque structure representing a match that was identified

	59 * from a charset detection operation, such as ucsdet_detect().

	60 * @stable ICU 3.6

	61 */

	62 typedef struct UCharsetMatch UCharsetMatch;

	63

	64 /**

	65 * Open a charset detector.

	66 *

	67 * @param status Any error conditions occurring during the open

	68 * operation are reported back in this variable.

	69 * @return the newly opened charset detector; close it with ucsdet_close().

	70 * @stable ICU 3.6

	71 */

	72 U_STABLE UCharsetDetector * U_EXPORT2

	73 ucsdet_open(UErrorCode *status);

	74

	75 /**

	76 * Close a charset detector. All storage and any other resources

	77 * owned by this charset detector will be released. Failure to

	78 * close a charset detector when finished with it can result in

	79 * memory leaks in the application.

	80 *

	81 * @param ucsd The charset detector to be closed, as returned by ucsdet_open().

	82 * @stable ICU 3.6

	83 */

	84 U_STABLE void U_EXPORT2

	85 ucsdet_close(UCharsetDetector *ucsd);

	86

	87 #if U_SHOW_CPLUSPLUS_API

	88

	89 U_NAMESPACE_BEGIN

	90

	91 /**

	92 * \class LocalUCharsetDetectorPointer

	93 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().

	94 * For most methods see the LocalPointerBase base class.

	95 *

	96 * @see LocalPointerBase

	97 * @see LocalPointer

	98 * @stable ICU 4.4

	99 */

	100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);

	101

	102 U_NAMESPACE_END

	103

	104 #endif

	105

	106 /**

	107 * Set the input byte data whose charset is to be detected.

	108 *

	109 * Ownership of the input text byte array remains with the caller.

	110 * The input string must not be altered or deleted until the charset

	111 * detector is either closed or reset to refer to different input text.

	112 *

	113 * @param ucsd the charset detector to be used.

	114 * @param textIn the input text of unknown encoding.

	115 * @param len the length of the input text, or -1 if the text

	116 * is NUL terminated.

	117 * @param status any error conditions are reported back in this variable.

	118 *

	119 * @stable ICU 3.6

	120 */

	121 U_STABLE void U_EXPORT2

	122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);

	123

	124

	125 /** Set the declared encoding for charset detection.

	126 * The declared encoding of an input text is an encoding obtained

	127 * by the user from an http header or xml declaration or similar source that

	128 * can be provided as an additional hint to the charset detector.

	129 *

	130 * How and whether the declared encoding will be used during the

	131 * detection process is TBD.

	132 *

	133 * @param ucsd the charset detector to be used.

	134 * @param encoding an encoding for the current data obtained from

	135 * a header or declaration or other source outside

	136 * of the byte data itself.

	137 * @param length the length of the encoding name, or -1 if the name string

	138 * is NUL terminated.

	139 * @param status any error conditions are reported back in this variable.

	140 *

	141 * @stable ICU 3.6

	142 */

	143 U_STABLE void U_EXPORT2

	144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);

	145

	146

	147 /**

	148 * Return the charset that best matches the supplied input data.

	149 *

	150 * Note though, that because the detection

	151 * only looks at the start of the input data,

	152 * there is a possibility that the returned charset will fail to handle

	153 * the full set of input data.

	154 * <p>

	155 * The returned UCharsetMatch object is owned by the UCharsetDetector.

	156 * It will remain valid until the detector input is reset, or until

	157 * the detector is closed.

	158 * <p>

	159 * The function will fail if

	160 * <ul>

	161 * <li>no charset appears to match the data.</li>

	162 * <li>no input text has been provided</li>

	163 * </ul>

	164 *

	165 * @param ucsd the charset detector to be used.

	166 * @param status any error conditions are reported back in this variable.

	167 * @return a UCharsetMatch representing the best matching charset,

	168 * or NULL if no charset matches the byte data.

	169 *

	170 * @stable ICU 3.6

	171 */

	172 U_STABLE const UCharsetMatch * U_EXPORT2

	173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);

	174

	175

	176 /**

	177 * Find all charset matches that appear to be consistent with the input,

	178 * returning an array of results. The results are ordered with the

	179 * best quality match first.

	180 *

	181 * Because the detection only looks at a limited amount of the

	182 * input byte data, some of the returned charsets may fail to handle

	183 * all of the input data.

	184 * <p>

	185 * The returned UCharsetMatch objects are owned by the UCharsetDetector.

	186 * They will remain valid until the detector is closed or modified.

	187 *

	188 * <p>

	189 * Return an error if

	190 * <ul>

	191 * <li>no charsets appear to match the input data.</li>

	192 * <li>no input text has been provided</li>

	193 * </ul>

	194 *

	195 * @param ucsd the charset detector to be used.

	196 * @param matchesFound pointer to a variable that will be set to the

	197 * number of charsets identified that are consistent with

	198 * the input data. Output only.

	199 * @param status any error conditions are reported back in this variable.

	200 * @return A pointer to an array of pointers to UCharsetMatch objects.

	201 * This array, and the UCharsetMatch instances to which it refers,

	202 * are owned by the UCharsetDetector, and will remain valid until

	203 * the detector is closed or modified.

	204 * @stable ICU 3.6

	205 */

	206 U_STABLE const UCharsetMatch ** U_EXPORT2

	207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);

	208

	209

	210

	211 /**

	212 * Get the name of the charset represented by a UCharsetMatch.

	213 *

	214 * The storage for the returned name string is owned by the

	215 * UCharsetMatch, and will remain valid while the UCharsetMatch

	216 * is valid.

	217 *

	218 * The name returned is suitable for use with the ICU conversion APIs.

	219 *

	220 * @param ucsm The charset match object.

	221 * @param status Any error conditions are reported back in this variable.

	222 * @return The name of the matching charset.

	223 *

	224 * @stable ICU 3.6

	225 */

	226 U_STABLE const char * U_EXPORT2

	227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);

	228

	229 /**

	230 * Get a confidence number for the quality of the match of the byte

	231 * data with the charset. Confidence numbers range from zero to 100,

	232 * with 100 representing complete confidence and zero representing

	233 * no confidence.

	234 *

	235 * The confidence values are somewhat arbitrary. They define an

	236 * ordering within the results for any single detection operation

	237 * but are not generally comparable between the results for different input.

	238 *

	239 * A confidence value of ten does have a general meaning - it is used

	240 * for charsets that can represent the input data, but for which there

	241 * is no other indication that suggests that the charset is the correct one.

	242 * Pure 7 bit ASCII data, for example, is compatible with a

	243 * great many charsets, most of which will appear as possible matches

	244 * with a confidence of 10.

	245 *

	246 * @param ucsm The charset match object.

	247 * @param status Any error conditions are reported back in this variable.

	248 * @return A confidence number for the charset match.

	249 *

	250 * @stable ICU 3.6

	251 */

	252 U_STABLE int32_t U_EXPORT2

	253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);

	254

	255 /**

	256 * Get the RFC 3066 code for the language of the input data.

	257 *

	258 * The Charset Detection service is intended primarily for detecting

	259 * charsets, not language. For some, but not all, charsets, a language is

	260 * identified as a byproduct of the detection process, and that is what

	261 * is returned by this function.

	262 *

	263 * CAUTION:

	264 * 1. Language information is not available for input data encoded in

	265 * all charsets. In particular, no language is identified

	266 * for UTF-8 input data.

	267 *

	268 * 2. Closely related languages may sometimes be confused.

	269 *

	270 * If more accurate language detection is required, a linguistic

	271 * analysis package should be used.

	272 *

	273 * The storage for the returned name string is owned by the

	274 * UCharsetMatch, and will remain valid while the UCharsetMatch

	275 * is valid.

	276 *

	277 * @param ucsm The charset match object.

	278 * @param status Any error conditions are reported back in this variable.

	279 * @return The RFC 3066 code for the language of the input data, or

	280 * an empty string if the language could not be determined.

	281 *

	282 * @stable ICU 3.6

	283 */

	284 U_STABLE const char * U_EXPORT2

	285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);

	286

	287

	288 /**

	289 * Get the entire input text as a UChar string, placing it into

	290 * a caller-supplied buffer. A terminating

	291 * NUL character will be appended to the buffer if space is available.

	292 *

	293 * The number of UChars in the output string, not including the terminating

	294 * NUL, is returned.

	295 *

	296 * If the supplied buffer is smaller than required to hold the output,

	297 * the contents of the buffer are undefined. The full output string length

	298 * (in UChars) is returned as always, and can be used to allocate a buffer

	299 * of the correct size.

	300 *

	301 *

	302 * @param ucsm The charset match object.

	303 * @param buf A UChar buffer to be filled with the converted text data.

	304 * @param cap The capacity of the buffer in UChars.

	305 * @param status Any error conditions are reported back in this variable.

	306 * @return The number of UChars in the output string.

	307 *

	308 * @stable ICU 3.6

	309 */

	310 U_STABLE int32_t U_EXPORT2

	311 ucsdet_getUChars(const UCharsetMatch *ucsm,

	312 UChar *buf, int32_t cap, UErrorCode *status);

	313

	314

	315

	316 /**

	317 * Get an iterator over the set of all detectable charsets -

	318 * over the charsets that are known to the charset detection

	319 * service.

	320 *

	321 * The returned UEnumeration provides access to the names of

	322 * the charsets.

	323 *

	324 * The state of the Charset detector that is passed in does not

	325 * affect the result of this function, but requiring a valid, open

	326 * charset detector as a parameter ensures that the charset detection

	327 * service has been safely initialized and that the required detection

	328 * data is available.

	329 *

	330 * @param ucsd a Charset detector.

	331 * @param status Any error conditions are reported back in this variable.

	332 * @return an iterator providing access to the detectable charset names.

	333 * @stable ICU 3.6

	334 */

	335 U_STABLE UEnumeration * U_EXPORT2

	336 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);

	337

	338

	339 /**

	340 * Test whether input filtering is enabled for this charset detector.

	341 * Input filtering removes text that appears to be HTML or XML

	342 * markup from the input before applying the code page detection

	343 * heuristics.

	344 *

	345 * @param ucsd The charset detector to check.

	346 * @return TRUE if filtering is enabled.

	347 * @stable ICU 3.6

	348 */

	349 U_STABLE UBool U_EXPORT2

	350 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);

	351

	352

	353 /**

	354 * Enable filtering of input text. If filtering is enabled,

	355 * text within angle brackets ("<" and ">") will be removed

	356 * before detection, which will remove most HTML or XML markup.

	357 *

	358 * @param ucsd the charset detector to be modified.

	359 * @param filter <code>TRUE</code> to enable input text filtering.

	360 * @return The previous filter setting.

	361 *

	362 * @stable ICU 3.6

	363 */

	364 U_STABLE UBool U_EXPORT2

	365 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);

	366

	367 #endif

	368 #endif /* __UCSDET_H */

	369

	370

OLD	NEW

« no previous file with comments | « icu46/source/i18n/unicode/ucoleitr.h ('k') | icu46/source/i18n/unicode/ucurr.h » ('j') | no next file with comments »