icu46/source/i18n/unicode/ucsdet.h - Issue 5516007: Check in the pristine copy of ICU 4.6...

Unified Diff: icu46/source/i18n/unicode/ucsdet.h

Issue 5516007: Check in the pristine copy of ICU 4.6... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/

Patch Set: Created 10 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: icu46/source/i18n/unicode/ucsdet.h

===================================================================

--- icu46/source/i18n/unicode/ucsdet.h (revision 0)

+++ icu46/source/i18n/unicode/ucsdet.h (revision 0)

@@ -0,0 +1,370 @@

+/*

+ **********************************************************************

+ * file name: ucsdet.h

+ * encoding: US-ASCII

+ * indentation:4

+ *

+ * created on: 2005Aug04

+ * created by: Andy Heninger

+ *

+ * ICU Character Set Detection, API for C

+ *

+ * Draft version 18 Oct 2005

+ *

+ */

+#ifndef __UCSDET_H

+#define __UCSDET_H

+#include "unicode/utypes.h"

+#if !UCONFIG_NO_CONVERSION

+#include "unicode/localpointer.h"

+#include "unicode/uenum.h"

+/**

+ * \file

+ * \brief C API: Charset Detection API

+ *

+ * This API provides a facility for detecting the

+ * charset or encoding of character data in an unknown text format.

+ * The input data can be from an array of bytes.

+ *

+ * Character set detection is at best an imprecise operation. The detection

+ * process will attempt to identify the charset that best matches the characteristics

+ * of the byte data, but the process is partly statistical in nature, and

+ * the results can not be guaranteed to always be correct.

+ *

+ * For best accuracy in charset detection, the input data should be primarily

+ * in a single language, and a minimum of a few hundred bytes worth of plain text

+ * in the language are needed. The detection process will attempt to

+ * ignore html or xml style markup that could otherwise obscure the content.

+ */

+struct UCharsetDetector;

+/**

+ * Structure representing a charset detector

+ * @stable ICU 3.6

+ */

+typedef struct UCharsetDetector UCharsetDetector;

+struct UCharsetMatch;

+/**

+ * Opaque structure representing a match that was identified

+ * from a charset detection operation.

+ * @stable ICU 3.6

+ */

+typedef struct UCharsetMatch UCharsetMatch;

+/**

+ * Open a charset detector.

+ *

+ * @param status Any error conditions occurring during the open

+ * operation are reported back in this variable.

+ * @return the newly opened charset detector.

+ * @stable ICU 3.6

+ */

+U_STABLE UCharsetDetector * U_EXPORT2

+ucsdet_open(UErrorCode *status);

+/**

+ * Close a charset detector. All storage and any other resources

+ * owned by this charset detector will be released. Failure to

+ * close a charset detector when finished with it can result in

+ * memory leaks in the application.

+ *

+ * @param ucsd The charset detector to be closed.

+ * @stable ICU 3.6

+ */

+U_STABLE void U_EXPORT2

+ucsdet_close(UCharsetDetector *ucsd);

+#if U_SHOW_CPLUSPLUS_API

+U_NAMESPACE_BEGIN

+/**

+ * \class LocalUCharsetDetectorPointer

+ * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().

+ * For most methods see the LocalPointerBase base class.

+ *

+ * @see LocalPointerBase

+ * @see LocalPointer

+ * @stable ICU 4.4

+ */

+U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);

+U_NAMESPACE_END

+#endif

+/**

+ * Set the input byte data whose charset is to be detected.

+ *

+ * Ownership of the input text byte array remains with the caller.

+ * The input string must not be altered or deleted until the charset

+ * detector is either closed or reset to refer to different input text.

+ *

+ * @param ucsd the charset detector to be used.

+ * @param textIn the input text of unknown encoding.

+ * @param len the length of the input text, or -1 if the text

+ * is NUL terminated.

+ * @param status any error conditions are reported back in this variable.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE void U_EXPORT2

+ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);

+/** Set the declared encoding for charset detection.

+ * The declared encoding of an input text is an encoding obtained

+ * by the user from an http header or xml declaration or similar source that

+ * can be provided as an additional hint to the charset detector.

+ *

+ * How and whether the declared encoding will be used during the

+ * detection process is TBD.

+ *

+ * @param ucsd the charset detector to be used.

+ * @param encoding an encoding for the current data obtained from

+ * a header or declaration or other source outside

+ * of the byte data itself.

+ * @param length the length of the encoding name, or -1 if the name string

+ * is NUL terminated.

+ * @param status any error conditions are reported back in this variable.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE void U_EXPORT2

+ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);

+/**

+ * Return the charset that best matches the supplied input data.

+ *

+ * Note though, that because the detection

+ * only looks at the start of the input data,

+ * there is a possibility that the returned charset will fail to handle

+ * the full set of input data.

+ *

+ * The returned UCharsetMatch object is owned by the UCharsetDetector.

+ * It will remain valid until the detector input is reset, or until

+ * the detector is closed.

+ *

+ * The function will fail if

+ * <ul>

+ * <li>no charset appears to match the data.</li>

+ * <li>no input text has been provided</li>

+ * </ul>

+ *

+ * @param ucsd the charset detector to be used.

+ * @param status any error conditions are reported back in this variable.

+ * @return a UCharsetMatch representing the best matching charset,

+ * or NULL if no charset matches the byte data.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE const UCharsetMatch * U_EXPORT2

+ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);

+/**

+ * Find all charset matches that appear to be consistent with the input,

+ * returning an array of results. The results are ordered with the

+ * best quality match first.

+ *

+ * Because the detection only looks at a limited amount of the

+ * input byte data, some of the returned charsets may fail to handle

+ * all of the input data.

+ *

+ * The returned UCharsetMatch objects are owned by the UCharsetDetector.

+ * They will remain valid until the detector is closed or modified.

+ *

+ *

+ * Return an error if

+ * <ul>

+ * <li>no charsets appear to match the input data.</li>

+ * <li>no input text has been provided</li>

+ * </ul>

+ *

+ * @param ucsd the charset detector to be used.

+ * @param matchesFound pointer to a variable that will be set to the

+ * number of charsets identified that are consistent with

+ * the input data. Output only.

+ * @param status any error conditions are reported back in this variable.

+ * @return A pointer to an array of pointers to UCharsetMatch objects.

+ * This array, and the UCharsetMatch instances to which it refers,

+ * are owned by the UCharsetDetector, and will remain valid until

+ * the detector is closed or modified.

+ * @stable ICU 3.6

+ */

+U_STABLE const UCharsetMatch ** U_EXPORT2

+ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);

+/**

+ * Get the name of the charset represented by a UCharsetMatch.

+ *

+ * The storage for the returned name string is owned by the

+ * UCharsetMatch, and will remain valid while the UCharsetMatch

+ * is valid.

+ *

+ * The name returned is suitable for use with the ICU conversion APIs.

+ *

+ * @param ucsm The charset match object.

+ * @param status Any error conditions are reported back in this variable.

+ * @return The name of the matching charset.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE const char * U_EXPORT2

+ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);

+/**

+ * Get a confidence number for the quality of the match of the byte

+ * data with the charset. Confidence numbers range from zero to 100,

+ * with 100 representing complete confidence and zero representing

+ * no confidence.

+ *

+ * The confidence values are somewhat arbitrary. They define an

+ * ordering within the results for any single detection operation

+ * but are not generally comparable between the results for different input.

+ *

+ * A confidence value of ten does have a general meaning - it is used

+ * for charsets that can represent the input data, but for which there

+ * is no other indication that suggests that the charset is the correct one.

+ * Pure 7 bit ASCII data, for example, is compatible with a

+ * great many charsets, most of which will appear as possible matches

+ * with a confidence of 10.

+ *

+ * @param ucsm The charset match object.

+ * @param status Any error conditions are reported back in this variable.

+ * @return A confidence number for the charset match.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE int32_t U_EXPORT2

+ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);

+/**

+ * Get the RFC 3066 code for the language of the input data.

+ *

+ * The Charset Detection service is intended primarily for detecting

+ * charsets, not language. For some, but not all, charsets, a language is

+ * identified as a byproduct of the detection process, and that is what

+ * is returned by this function.

+ *

+ * CAUTION:

+ * 1. Language information is not available for input data encoded in

+ * all charsets. In particular, no language is identified

+ * for UTF-8 input data.

+ *

+ * 2. Closely related languages may sometimes be confused.

+ *

+ * If more accurate language detection is required, a linguistic

+ * analysis package should be used.

+ *

+ * The storage for the returned name string is owned by the

+ * UCharsetMatch, and will remain valid while the UCharsetMatch

+ * is valid.

+ *

+ * @param ucsm The charset match object.

+ * @param status Any error conditions are reported back in this variable.

+ * @return The RFC 3066 code for the language of the input data, or

+ * an empty string if the language could not be determined.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE const char * U_EXPORT2

+ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);

+/**

+ * Get the entire input text as a UChar string, placing it into

+ * a caller-supplied buffer. A terminating

+ * NUL character will be appended to the buffer if space is available.

+ *

+ * The number of UChars in the output string, not including the terminating

+ * NUL, is returned.

+ *

+ * If the supplied buffer is smaller than required to hold the output,

+ * the contents of the buffer are undefined. The full output string length

+ * (in UChars) is returned as always, and can be used to allocate a buffer

+ * of the correct size.

+ *

+ * @param ucsm The charset match object.

+ * @param buf A UChar buffer to be filled with the converted text data.

+ * @param cap The capacity of the buffer in UChars.

+ * @param status Any error conditions are reported back in this variable.

+ * @return The number of UChars in the output string.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE int32_t U_EXPORT2

+ucsdet_getUChars(const UCharsetMatch *ucsm,

+ UChar *buf, int32_t cap, UErrorCode *status);

+/**

+ * Get an iterator over the set of all detectable charsets -

+ * over the charsets that are known to the charset detection

+ * service.

+ *

+ * The returned UEnumeration provides access to the names of

+ * the charsets.

+ *

+ * The state of the Charset detector that is passed in does not

+ * affect the result of this function, but requiring a valid, open

+ * charset detector as a parameter ensures that the charset detection

+ * service has been safely initialized and that the required detection

+ * data is available.

+ *

+ * @param ucsd a Charset detector.

+ * @param status Any error conditions are reported back in this variable.

+ * @return an iterator providing access to the detectable charset names.

+ * @stable ICU 3.6

+ */

+U_STABLE UEnumeration * U_EXPORT2

+ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);

+/**

+ * Test whether input filtering is enabled for this charset detector.

+ * Input filtering removes text that appears to be HTML or xml

+ * markup from the input before applying the code page detection

+ * heuristics.

+ *

+ * @param ucsd The charset detector to check.

+ * @return TRUE if filtering is enabled.

+ * @stable ICU 3.6

+ */

+U_STABLE UBool U_EXPORT2

+ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);

+/**

+ * Enable filtering of input text. If filtering is enabled,

+ * text within angle brackets ("<" and ">") will be removed

+ * before detection, which will remove most HTML or xml markup.

+ *

+ * @param ucsd the charset detector to be modified.

+ * @param filter <code>true</code> to enable input text filtering.

+ * @return The previous setting.

+ *

+ * @stable ICU 3.6

+ */

+U_STABLE UBool U_EXPORT2

+ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);

+#endif

+#endif /* __UCSDET_H */

Property changes on: icu46/source/i18n/unicode/ucsdet.h

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « icu46/source/i18n/unicode/ucoleitr.h ('k') | icu46/source/i18n/unicode/ucurr.h » ('j') | no next file with comments »