public/i18n/unicode/ucsdet.h - Issue 18836004: Move ICU headers from public/{common,i18n} to source/{common,i18n}

Side by Side Diff: public/i18n/unicode/ucsdet.h

Issue 18836004: Move ICU headers from public/{common,i18n} to source/{common,i18n} (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu46.git@master

Patch Set: same as ps #3. retry uploading Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 **********************************************************************

3 * Copyright (C) 2005-2010, International Business Machines

4 * Corporation and others. All Rights Reserved.

5 **********************************************************************

6 * file name: ucsdet.h

7 * encoding: US-ASCII

8 * indentation:4

9 *

10 * created on: 2005Aug04

11 * created by: Andy Heninger

12 *

13 * ICU Character Set Detection, API for C

14 *

15 * Draft version 18 Oct 2005

16 *

17 */

18

19 #ifndef __UCSDET_H

20 #define __UCSDET_H

21

22 #include "unicode/utypes.h"

23

24 #if !UCONFIG_NO_CONVERSION

25

26 #include "unicode/localpointer.h"

27 #include "unicode/uenum.h"

28

29 /**

30 * \file

31 * \brief C API: Charset Detection API

32 *

33 * This API provides a facility for detecting the

34 * charset or encoding of character data in an unknown text format.

35 * The input data can be from an array of bytes.

36 * <p>

37 * Character set detection is at best an imprecise operation. The detection

38 * process will attempt to identify the charset that best matches the characteristics

39 * of the byte data, but the process is partly statistical in nature, and

40 * the results can not be guaranteed to always be correct.

41 * <p>

42 * For best accuracy in charset detection, the input data should be primarily

43 * in a single language, and a minimum of a few hundred bytes worth of plain text

44 * in the language are needed. The detection process will attempt to

45 * ignore html or xml style markup that could otherwise obscure the content.

46 */

47

48

49 struct UCharsetDetector;

50 /**

51 * Structure representing a charset detector

52 * @stable ICU 3.6

53 */

54 typedef struct UCharsetDetector UCharsetDetector;

55

56 struct UCharsetMatch;

57 /**

58 * Opaque structure representing a match that was identified

59 * from a charset detection operation.

60 * @stable ICU 3.6

61 */

62 typedef struct UCharsetMatch UCharsetMatch;

63

64 /**

65 * Open a charset detector.

66 *

67 * @param status Any error conditions occurring during the open

68 * operation are reported back in this variable.

69 * @return the newly opened charset detector.

70 * @stable ICU 3.6

71 */

72 U_STABLE UCharsetDetector * U_EXPORT2

73 ucsdet_open(UErrorCode *status);

74

75 /**

76 * Close a charset detector. All storage and any other resources

77 * owned by this charset detector will be released. Failure to

78 * close a charset detector when finished with it can result in

79 * memory leaks in the application.

80 *

81 * @param ucsd The charset detector to be closed.

82 * @stable ICU 3.6

83 */

84 U_STABLE void U_EXPORT2

85 ucsdet_close(UCharsetDetector *ucsd);

86

87 #if U_SHOW_CPLUSPLUS_API

88

89 U_NAMESPACE_BEGIN

90

91 /**

92 * \class LocalUCharsetDetectorPointer

93 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().

94 * For most methods see the LocalPointerBase base class.

95 *

96 * @see LocalPointerBase

97 * @see LocalPointer

98 * @stable ICU 4.4

99 */

100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);

101

102 U_NAMESPACE_END

103

104 #endif

105

106 /**

107 * Set the input byte data whose charset is to be detected.

108 *

109 * Ownership of the input text byte array remains with the caller.

110 * The input string must not be altered or deleted until the charset

111 * detector is either closed or reset to refer to different input text.

112 *

113 * @param ucsd the charset detector to be used.

114 * @param textIn the input text of unknown encoding.

115 * @param len the length of the input text, or -1 if the text

116 * is NUL terminated.

117 * @param status any error conditions are reported back in this variable.

118 *

119 * @stable ICU 3.6

120 */

121 U_STABLE void U_EXPORT2

122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);

123

124

125 /** Set the declared encoding for charset detection.

126 * The declared encoding of an input text is an encoding obtained

127 * by the user from an http header or xml declaration or similar source that

128 * can be provided as an additional hint to the charset detector.

129 *

130 * How and whether the declared encoding will be used during the

131 * detection process is TBD.

132 *

133 * @param ucsd the charset detector to be used.

134 * @param encoding an encoding for the current data obtained from

135 * a header or declaration or other source outside

136 * of the byte data itself.

137 * @param length the length of the encoding name, or -1 if the name string

138 * is NUL terminated.

139 * @param status any error conditions are reported back in this variable.

140 *

141 * @stable ICU 3.6

142 */

143 U_STABLE void U_EXPORT2

144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);

145

146

147 /**

148 * Return the charset that best matches the supplied input data.

149 *

150 * Note though, that because the detection

151 * only looks at the start of the input data,

152 * there is a possibility that the returned charset will fail to handle

153 * the full set of input data.

154 * <p>

155 * The returned UCharsetMatch object is owned by the UCharsetDetector.

156 * It will remain valid until the detector input is reset, or until

157 * the detector is closed.

158 * <p>

159 * The function will fail if

160 * <ul>

161 * <li>no charset appears to match the data.</li>

162 * <li>no input text has been provided</li>

163 * </ul>

164 *

165 * @param ucsd the charset detector to be used.

166 * @param status any error conditions are reported back in this variable.

167 * @return a UCharsetMatch representing the best matching charset,

168 * or NULL if no charset matches the byte data.

169 *

170 * @stable ICU 3.6

171 */

172 U_STABLE const UCharsetMatch * U_EXPORT2

173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);

174

175

176 /**

177 * Find all charset matches that appear to be consistent with the input,

178 * returning an array of results. The results are ordered with the

179 * best quality match first.

180 *

181 * Because the detection only looks at a limited amount of the

182 * input byte data, some of the returned charsets may fail to handle

183 * all of the input data.

184 * <p>

185 * The returned UCharsetMatch objects are owned by the UCharsetDetector.

186 * They will remain valid until the detector is closed or modified.

187 *

188 * <p>

189 * Return an error if

190 * <ul>

191 * <li>no charsets appear to match the input data.</li>

192 * <li>no input text has been provided</li>

193 * </ul>

194 *

195 * @param ucsd the charset detector to be used.

196 * @param matchesFound pointer to a variable that will be set to the

197 * number of charsets identified that are consistent with

198 * the input data. Output only.

199 * @param status any error conditions are reported back in this variable.

200 * @return A pointer to an array of pointers to UCharsetMatch objects.

201 * This array, and the UCharsetMatch instances to which it refers,

202 * are owned by the UCharsetDetector, and will remain valid until

203 * the detector is closed or modified.

204 * @stable ICU 3.6

205 */

206 U_STABLE const UCharsetMatch ** U_EXPORT2

207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);

208

209

210

211 /**

212 * Get the name of the charset represented by a UCharsetMatch.

213 *

214 * The storage for the returned name string is owned by the

215 * UCharsetMatch, and will remain valid while the UCharsetMatch

216 * is valid.

217 *

218 * The name returned is suitable for use with the ICU conversion APIs.

219 *

220 * @param ucsm The charset match object.

221 * @param status Any error conditions are reported back in this variable.

222 * @return The name of the matching charset.

223 *

224 * @stable ICU 3.6

225 */

226 U_STABLE const char * U_EXPORT2

227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);

228

229 /**

230 * Get a confidence number for the quality of the match of the byte

231 * data with the charset. Confidence numbers range from zero to 100,

232 * with 100 representing complete confidence and zero representing

233 * no confidence.

234 *

235 * The confidence values are somewhat arbitrary. They define an

236 * ordering within the results for any single detection operation

237 * but are not generally comparable between the results for different input.

238 *

239 * A confidence value of ten does have a general meaning - it is used

240 * for charsets that can represent the input data, but for which there

241 * is no other indication that suggests that the charset is the correct one.

242 * Pure 7 bit ASCII data, for example, is compatible with a

243 * great many charsets, most of which will appear as possible matches

244 * with a confidence of 10.

245 *

246 * @param ucsm The charset match object.

247 * @param status Any error conditions are reported back in this variable.

248 * @return A confidence number for the charset match.

249 *

250 * @stable ICU 3.6

251 */

252 U_STABLE int32_t U_EXPORT2

253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);

254

255 /**

256 * Get the RFC 3066 code for the language of the input data.

257 *

258 * The Charset Detection service is intended primarily for detecting

259 * charsets, not language. For some, but not all, charsets, a language is

260 * identified as a byproduct of the detection process, and that is what

261 * is returned by this function.

262 *

263 * CAUTION:

264 * 1. Language information is not available for input data encoded in

265 * all charsets. In particular, no language is identified

266 * for UTF-8 input data.

267 *

268 * 2. Closely related languages may sometimes be confused.

269 *

270 * If more accurate language detection is required, a linguistic

271 * analysis package should be used.

272 *

273 * The storage for the returned name string is owned by the

274 * UCharsetMatch, and will remain valid while the UCharsetMatch

275 * is valid.

276 *

277 * @param ucsm The charset match object.

278 * @param status Any error conditions are reported back in this variable.

279 * @return The RFC 3066 code for the language of the input data, or

280 * an empty string if the language could not be determined.

281 *

282 * @stable ICU 3.6

283 */

284 U_STABLE const char * U_EXPORT2

285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);

286

287

288 /**

289 * Get the entire input text as a UChar string, placing it into

290 * a caller-supplied buffer. A terminating

291 * NUL character will be appended to the buffer if space is available.

292 *

293 * The number of UChars in the output string, not including the terminating

294 * NUL, is returned.

295 *

296 * If the supplied buffer is smaller than required to hold the output,

297 * the contents of the buffer are undefined. The full output string length

298 * (in UChars) is returned as always, and can be used to allocate a buffer

299 * of the correct size.

300 *

301 *

302 * @param ucsm The charset match object.

303 * @param buf A UChar buffer to be filled with the converted text data.

304 * @param cap The capacity of the buffer in UChars.

305 * @param status Any error conditions are reported back in this variable.

306 * @return The number of UChars in the output string.

307 *

308 * @stable ICU 3.6

309 */

310 U_STABLE int32_t U_EXPORT2

311 ucsdet_getUChars(const UCharsetMatch *ucsm,

312 UChar *buf, int32_t cap, UErrorCode *status);

313

314

315

316 /**

317 * Get an iterator over the set of all detectable charsets -

318 * over the charsets that are known to the charset detection

319 * service.

320 *

321 * The returned UEnumeration provides access to the names of

322 * the charsets.

323 *

324 * The state of the Charset detector that is passed in does not

325 * affect the result of this function, but requiring a valid, open

326 * charset detector as a parameter ensures that the charset detection

327 * service has been safely initialized and that the required detection

328 * data is available.

329 *

330 * @param ucsd a Charset detector.

331 * @param status Any error conditions are reported back in this variable.

332 * @return an iterator providing access to the detectable charset names.

333 * @stable ICU 3.6

334 */

335 U_STABLE UEnumeration * U_EXPORT2

336 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status);

337

338

339 /**

340 * Test whether input filtering is enabled for this charset detector.

341 * Input filtering removes text that appears to be HTML or xml

342 * markup from the input before applying the code page detection

343 * heuristics.

344 *

345 * @param ucsd The charset detector to check.

346 * @return TRUE if filtering is enabled.

347 * @stable ICU 3.6

348 */

349 U_STABLE UBool U_EXPORT2

350 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);

351

352

353 /**

354 * Enable filtering of input text. If filtering is enabled,

355 * text within angle brackets ("<" and ">") will be removed

356 * before detection, which will remove most HTML or xml markup.

357 *

358 * @param ucsd the charset detector to be modified.

359 * @param filter <code>true</code> to enable input text filtering.

360 * @return The previous setting.

361 *

362 * @stable ICU 3.6

363 */

364 U_STABLE UBool U_EXPORT2

365 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);

366

367 #endif

368 #endif /* __UCSDET_H */

369

370

OLD	NEW

« no previous file with comments | « public/i18n/unicode/ucoleitr.h ('k') | public/i18n/unicode/ucurr.h » ('j') | no next file with comments »