| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // File utilities that use the ICU library go in this file. | |
| 6 | |
| 7 #include "base/i18n/file_util_icu.h" | |
| 8 | |
| 9 #include "base/files/file_path.h" | |
| 10 #include "base/i18n/icu_string_conversions.h" | |
| 11 #include "base/i18n/string_compare.h" | |
| 12 #include "base/logging.h" | |
| 13 #include "base/memory/scoped_ptr.h" | |
| 14 #include "base/memory/singleton.h" | |
| 15 #include "base/strings/string_util.h" | |
| 16 #include "base/strings/sys_string_conversions.h" | |
| 17 #include "base/strings/utf_string_conversions.h" | |
| 18 #include "build/build_config.h" | |
| 19 #include "third_party/icu/source/common/unicode/uniset.h" | |
| 20 #include "third_party/icu/source/i18n/unicode/coll.h" | |
| 21 | |
| 22 namespace base { | |
| 23 namespace i18n { | |
| 24 | |
| 25 namespace { | |
| 26 | |
| 27 class IllegalCharacters { | |
| 28 public: | |
| 29 static IllegalCharacters* GetInstance() { | |
| 30 return Singleton<IllegalCharacters>::get(); | |
| 31 } | |
| 32 | |
| 33 bool DisallowedEverywhere(UChar32 ucs4) { | |
| 34 return !!illegal_anywhere_->contains(ucs4); | |
| 35 } | |
| 36 | |
| 37 bool DisallowedLeadingOrTrailing(UChar32 ucs4) { | |
| 38 return !!illegal_at_ends_->contains(ucs4); | |
| 39 } | |
| 40 | |
| 41 bool IsAllowedName(const string16& s) { | |
| 42 return s.empty() || (!!illegal_anywhere_->containsNone( | |
| 43 icu::UnicodeString(s.c_str(), s.size())) && | |
| 44 !illegal_at_ends_->contains(*s.begin()) && | |
| 45 !illegal_at_ends_->contains(*s.rbegin())); | |
| 46 } | |
| 47 | |
| 48 private: | |
| 49 friend class Singleton<IllegalCharacters>; | |
| 50 friend struct DefaultSingletonTraits<IllegalCharacters>; | |
| 51 | |
| 52 IllegalCharacters(); | |
| 53 ~IllegalCharacters() { } | |
| 54 | |
| 55 // set of characters considered invalid anywhere inside a filename. | |
| 56 scoped_ptr<icu::UnicodeSet> illegal_anywhere_; | |
| 57 | |
| 58 // set of characters considered invalid at either end of a filename. | |
| 59 scoped_ptr<icu::UnicodeSet> illegal_at_ends_; | |
| 60 | |
| 61 DISALLOW_COPY_AND_ASSIGN(IllegalCharacters); | |
| 62 }; | |
| 63 | |
| 64 IllegalCharacters::IllegalCharacters() { | |
| 65 UErrorCode everywhere_status = U_ZERO_ERROR; | |
| 66 UErrorCode ends_status = U_ZERO_ERROR; | |
| 67 // Control characters, formatting characters, non-characters, path separators, | |
| 68 // and some printable ASCII characters regarded as dangerous ('"*/:<>?\\'). | |
| 69 // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx | |
| 70 // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx | |
| 71 // Note that code points in the "Other, Format" (Cf) category are ignored on | |
| 72 // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being | |
| 73 // legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is | |
| 74 // also excluded due to the possibility of interacting poorly with short | |
| 75 // filenames on VFAT. (Related to CVE-2014-9390) | |
| 76 illegal_anywhere_.reset(new icu::UnicodeSet( | |
| 77 UNICODE_STRING_SIMPLE("[[\"~*/:<>?\\\\|][:Cc:][:Cf:]]"), | |
| 78 everywhere_status)); | |
| 79 illegal_at_ends_.reset(new icu::UnicodeSet( | |
| 80 UNICODE_STRING_SIMPLE("[[:WSpace:][.]]"), ends_status)); | |
| 81 DCHECK(U_SUCCESS(everywhere_status)); | |
| 82 DCHECK(U_SUCCESS(ends_status)); | |
| 83 | |
| 84 // Add non-characters. If this becomes a performance bottleneck by | |
| 85 // any chance, do not add these to |set| and change IsFilenameLegal() | |
| 86 // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling | |
| 87 // IsAllowedName(). | |
| 88 illegal_anywhere_->add(0xFDD0, 0xFDEF); | |
| 89 for (int i = 0; i <= 0x10; ++i) { | |
| 90 int plane_base = 0x10000 * i; | |
| 91 illegal_anywhere_->add(plane_base + 0xFFFE, plane_base + 0xFFFF); | |
| 92 } | |
| 93 illegal_anywhere_->freeze(); | |
| 94 illegal_at_ends_->freeze(); | |
| 95 } | |
| 96 | |
| 97 } // namespace | |
| 98 | |
| 99 bool IsFilenameLegal(const string16& file_name) { | |
| 100 return IllegalCharacters::GetInstance()->IsAllowedName(file_name); | |
| 101 } | |
| 102 | |
| 103 void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name, | |
| 104 char replace_char) { | |
| 105 IllegalCharacters* illegal = IllegalCharacters::GetInstance(); | |
| 106 | |
| 107 DCHECK(!(illegal->DisallowedEverywhere(replace_char))); | |
| 108 DCHECK(!(illegal->DisallowedLeadingOrTrailing(replace_char))); | |
| 109 | |
| 110 int cursor = 0; // The ICU macros expect an int. | |
| 111 while (cursor < static_cast<int>(file_name->size())) { | |
| 112 int char_begin = cursor; | |
| 113 uint32 code_point; | |
| 114 #if defined(OS_MACOSX) | |
| 115 // Mac uses UTF-8 encoding for filenames. | |
| 116 U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), | |
| 117 code_point); | |
| 118 #elif defined(OS_WIN) | |
| 119 // Windows uses UTF-16 encoding for filenames. | |
| 120 U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), | |
| 121 code_point); | |
| 122 #elif defined(OS_POSIX) | |
| 123 // Linux doesn't actually define an encoding. It basically allows anything | |
| 124 // except for a few special ASCII characters. | |
| 125 unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]); | |
| 126 if (cur_char >= 0x80) | |
| 127 continue; | |
| 128 code_point = cur_char; | |
| 129 #else | |
| 130 NOTREACHED(); | |
| 131 #endif | |
| 132 | |
| 133 if (illegal->DisallowedEverywhere(code_point) || | |
| 134 ((char_begin == 0 || cursor == static_cast<int>(file_name->length())) && | |
| 135 illegal->DisallowedLeadingOrTrailing(code_point))) { | |
| 136 file_name->replace(char_begin, cursor - char_begin, 1, replace_char); | |
| 137 // We just made the potentially multi-byte/word char into one that only | |
| 138 // takes one byte/word, so need to adjust the cursor to point to the next | |
| 139 // character again. | |
| 140 cursor = char_begin + 1; | |
| 141 } | |
| 142 } | |
| 143 } | |
| 144 | |
| 145 bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) { | |
| 146 UErrorCode error_code = U_ZERO_ERROR; | |
| 147 // Use the default collator. The default locale should have been properly | |
| 148 // set by the time this constructor is called. | |
| 149 scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code)); | |
| 150 DCHECK(U_SUCCESS(error_code)); | |
| 151 // Make it case-sensitive. | |
| 152 collator->setStrength(icu::Collator::TERTIARY); | |
| 153 | |
| 154 #if defined(OS_WIN) | |
| 155 return CompareString16WithCollator(*collator, WideToUTF16(a.value()), | |
| 156 WideToUTF16(b.value())) == UCOL_LESS; | |
| 157 | |
| 158 #elif defined(OS_POSIX) | |
| 159 // On linux, the file system encoding is not defined. We assume | |
| 160 // SysNativeMBToWide takes care of it. | |
| 161 return CompareString16WithCollator( | |
| 162 *collator, WideToUTF16(SysNativeMBToWide(a.value().c_str())), | |
| 163 WideToUTF16(SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS; | |
| 164 #else | |
| 165 #error Not implemented on your system | |
| 166 #endif | |
| 167 } | |
| 168 | |
| 169 void NormalizeFileNameEncoding(FilePath* file_name) { | |
| 170 #if defined(OS_CHROMEOS) | |
| 171 std::string normalized_str; | |
| 172 if (ConvertToUtf8AndNormalize(file_name->BaseName().value(), | |
| 173 kCodepageUTF8, | |
| 174 &normalized_str)) { | |
| 175 *file_name = file_name->DirName().Append(FilePath(normalized_str)); | |
| 176 } | |
| 177 #endif | |
| 178 } | |
| 179 | |
| 180 } // namespace i18n | |
| 181 } // namespace base | |
| OLD | NEW |