Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(216)

Side by Side Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 /* 1 /*
2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved. 2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are 5 * modification, are permitted provided that the following conditions are
6 * met: 6 * met:
7 * 7 *
8 * * Redistributions of source code must retain the above copyright 8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above 10 * * Redistributions in binary form must reproduce the above
(...skipping 15 matching lines...) Expand all
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */ 29 */
30 30
31 #include "platform/text/TextEncodingDetector.h" 31 #include "platform/text/TextEncodingDetector.h"
32 32
33 #include "wtf/text/TextEncoding.h" 33 #include "wtf/text/TextEncoding.h"
34 #include <unicode/ucnv.h> 34 #include <unicode/ucnv.h>
35 #include <unicode/ucsdet.h> 35 #include <unicode/ucsdet.h>
36 #include <unicode/utf8.h>
36 37
37 namespace blink { 38 namespace blink {
38 39
39 bool detectTextEncoding(const char* data, size_t length, 40 bool detectTextEncoding(const char* data, size_t length,
40 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) 41 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
41 { 42 {
42 *detectedEncoding = WTF::TextEncoding(); 43 *detectedEncoding = WTF::TextEncoding();
43 int matchesCount = 0; 44 int matchesCount = 0;
44 UErrorCode status = U_ZERO_ERROR; 45 UErrorCode status = U_ZERO_ERROR;
45 UCharsetDetector* detector = ucsdet_open(&status); 46 UCharsetDetector* detector = ucsdet_open(&status);
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
105 encoding = ucsdet_getName(matches[0], &status); 106 encoding = ucsdet_getName(matches[0], &status);
106 if (U_SUCCESS(status)) { 107 if (U_SUCCESS(status)) {
107 *detectedEncoding = WTF::TextEncoding(encoding); 108 *detectedEncoding = WTF::TextEncoding(encoding);
108 ucsdet_close(detector); 109 ucsdet_close(detector);
109 return true; 110 return true;
110 } 111 }
111 ucsdet_close(detector); 112 ucsdet_close(detector);
112 return false; 113 return false;
113 } 114 }
114 115
116 bool isUTF8Encoded(const char* data, size_t length)
117 {
118 int32_t srcLen = static_cast<int32_t>(length);
119 int32_t charIndex = 0;
120 bool markDetected = false;
121
122 while (charIndex < srcLen) {
123 int32_t codePoint;
124 if ((uint8_t)(data[charIndex]) >= 0x80)
125 markDetected = true;
126 U8_NEXT(data, charIndex, srcLen, codePoint);
127 if (!U_IS_UNICODE_CHAR(codePoint))
aelias_OOO_until_Jul13 2016/02/24 04:37:54 According to http://icu-project.org/apiref/icu4c/u
Jinsuk Kim 2016/02/24 06:54:54 Thanks for looking into the detail. Ran the unitte
128 return false;
129 }
130 return markDetected;
131 }
132
115 } // namespace blink 133 } // namespace blink
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698