base/i18n/file_util_icu.cc - Issue 869823003: Update ReplaceIllegalCharactersInPath to handle quirks in HFS+ and VFS

Side by Side Diff: base/i18n/file_util_icu.cc

Issue 869823003: Update ReplaceIllegalCharactersInPath to handle quirks in HFS+ and VFS (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // File utilities that use the ICU library go in this file.	5 // File utilities that use the ICU library go in this file.

6	6

7 #include "base/i18n/file_util_icu.h"	7 #include "base/i18n/file_util_icu.h"

8	8

9 #include "base/files/file_path.h"	9 #include "base/files/file_path.h"

10 #include "base/i18n/icu_string_conversions.h"	10 #include "base/i18n/icu_string_conversions.h"

(...skipping 12 matching lines...) Expand all Loading...
23 namespace i18n {	23 namespace i18n {

24	24

25 namespace {	25 namespace {

26	26

27 class IllegalCharacters {	27 class IllegalCharacters {

28 public:	28 public:

29 static IllegalCharacters* GetInstance() {	29 static IllegalCharacters* GetInstance() {

30 return Singleton<IllegalCharacters>::get();	30 return Singleton<IllegalCharacters>::get();

31 }	31 }

32	32

33 bool contains(UChar32 ucs4) {	33 bool DisallowedEverywhere(UChar32 ucs4) {

34 return !!set->contains(ucs4);	34 return !!illegal_anywhere_->contains(ucs4);

35 }	35 }

36	36

37 bool containsNone(const string16 &s) {	37 bool DisallowedLeadingOrTrailing(UChar32 ucs4) {

38 return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));	38 return !!illegal_at_ends_->contains(ucs4);

	39 }

	40

	41 bool IsAllowedName(const string16& s) {

	42 return s.empty() \|\| (!!illegal_anywhere_->containsNone(

	43 icu::UnicodeString(s.c_str(), s.size())) &&

	44 !illegal_at_ends_->contains(*s.begin()) &&

	45 !illegal_at_ends_->contains(*s.rbegin()));

39 }	46 }

40	47

41 private:	48 private:

42 friend class Singleton<IllegalCharacters>;	49 friend class Singleton<IllegalCharacters>;

43 friend struct DefaultSingletonTraits<IllegalCharacters>;	50 friend struct DefaultSingletonTraits<IllegalCharacters>;

44	51

45 IllegalCharacters();	52 IllegalCharacters();

46 ~IllegalCharacters() { }	53 ~IllegalCharacters() { }

47	54

48 scoped_ptr<icu::UnicodeSet> set;	55 // set of characters considered invalid anywhere inside a filename.

	56 scoped_ptr<icu::UnicodeSet> illegal_anywhere_;

	57

	58 // set of characters considered invalid at either end of a filename.

	59 scoped_ptr<icu::UnicodeSet> illegal_at_ends_;

49	60

50 DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);	61 DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);

51 };	62 };

52	63

53 IllegalCharacters::IllegalCharacters() {	64 IllegalCharacters::IllegalCharacters() {

54 UErrorCode status = U_ZERO_ERROR;	65 UErrorCode everywhere_status = U_ZERO_ERROR;

55 // Control characters, formatting characters, non-characters, and	66 UErrorCode ends_status = U_ZERO_ERROR;

56 // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').	67 // Control characters, formatting characters, non-characters, path separators,

	68 // and some printable ASCII characters regarded as dangerous ('"*/:<>?\\').

57 // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx	69 // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx

58 // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx	70 // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx

59 // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they	71 // Note that code points in the "Other, Format" (Cf) category are ignored on

60 // are legitimate in Arabic and some S/SE Asian scripts. However, when used	72 // HFS+ despite the ZERO_WIDTH_JOINER and ZERO_WIDTH_NON-JOINER being

61 // elsewhere, they can be confusing/problematic.	73 // legitimate in Arabic and some S/SE Asian scripts. In addition tilde (~) is

62 // Also, consider wrapping the set with our Singleton class to create and	74 // also excluded due to the possibility of interacting poorly with short

63 // freeze it only once. Note that there's a trade-off between memory and	75 // filenames on VFAT. (Related to CVE-2014-9390)

64 // speed.

65 #if defined(WCHAR_T_IS_UTF16)	76 #if defined(WCHAR_T_IS_UTF16)

66 set.reset(new icu::UnicodeSet(icu::UnicodeString(	77 illegal_anywhere_.reset(

67 L"[[\"*/:<>?\\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));	78 new icu::UnicodeSet(icu::UnicodeString(L"[[\"~*/:<>?\\\\\|][:Cc:][:Cf:]]"),

	79 everywhere_status));

	80 illegal_at_ends_.reset(

	81 new icu::UnicodeSet(icu::UnicodeString(L"[[:WSpace:][.]]"), ends_status));

68 #else	82 #else

69 set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(	83 illegal_anywhere_.reset(new icu::UnicodeSet(

70 "[[\"*/:<>?\\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),	84 UNICODE_STRING_SIMPLE("[[\"~*/:<>?\\\\\|][:Cc:][:Cf:]]"),

71 status));	85 everywhere_status));

	86 illegal_at_ends_.reset(new icu::UnicodeSet(

	87 UNICODE_STRING_SIMPLE("[[:WSpace:][.]]"), ends_status));
	jungshik at Google 2015/01/30 23:13:25 You don't need \|#if defined\| block any more (actually, even before this CL, it looks like we could have removed the #if block). You can just use 'else' block on all platforms now. asanka 2015/01/30 23:36:48 On 2015/01/30 at 23:13:25, Jungshik at google wrote: > You don't need \|#if defined\| block any more (actually, even before this CL, it looks like we could have removed the #if block). You can just use 'else' block on all platforms now. Done
72 #endif	88 #endif

73 DCHECK(U_SUCCESS(status));	89 DCHECK(U_SUCCESS(everywhere_status));

	90 DCHECK(U_SUCCESS(ends_status));

	91

74 // Add non-characters. If this becomes a performance bottleneck by	92 // Add non-characters. If this becomes a performance bottleneck by

75 // any chance, do not add these to \|set\| and change IsFilenameLegal()	93 // any chance, do not add these to \|set\| and change IsFilenameLegal()

76 // to check \|ucs4 & 0xFFFEu == 0xFFFEu\|, in addition to calling	94 // to check \|ucs4 & 0xFFFEu == 0xFFFEu\|, in addition to calling

77 // containsNone().	95 // IsAllowedName().

78 set->add(0xFDD0, 0xFDEF);	96 illegal_anywhere_->add(0xFDD0, 0xFDEF);

79 for (int i = 0; i <= 0x10; ++i) {	97 for (int i = 0; i <= 0x10; ++i) {

80 int plane_base = 0x10000 * i;	98 int plane_base = 0x10000 * i;

81 set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);	99 illegal_anywhere_->add(plane_base + 0xFFFE, plane_base + 0xFFFF);

82 }	100 }

83 set->freeze();	101 illegal_anywhere_->freeze();

	102 illegal_at_ends_->freeze();

84 }	103 }

85	104

86 } // namespace	105 } // namespace

87	106

88 bool IsFilenameLegal(const string16& file_name) {	107 bool IsFilenameLegal(const string16& file_name) {

89 return IllegalCharacters::GetInstance()->containsNone(file_name);	108 return IllegalCharacters::GetInstance()->IsAllowedName(file_name);

90 }	109 }

91	110

92 void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,	111 void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,

93 char replace_char) {	112 char replace_char) {

94 DCHECK(file_name);	113 IllegalCharacters* illegal = IllegalCharacters::GetInstance();

95	114

96 DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));	115 DCHECK(!(illegal->DisallowedEverywhere(replace_char)));

	116 DCHECK(!(illegal->DisallowedLeadingOrTrailing(replace_char)));

97	117

98 // Remove leading and trailing whitespace.

99 TrimWhitespace(*file_name, TRIM_ALL, file_name);

100

101 IllegalCharacters* illegal = IllegalCharacters::GetInstance();

102 int cursor = 0; // The ICU macros expect an int.	118 int cursor = 0; // The ICU macros expect an int.

103 while (cursor < static_cast<int>(file_name->size())) {	119 while (cursor < static_cast<int>(file_name->size())) {

104 int char_begin = cursor;	120 int char_begin = cursor;

105 uint32 code_point;	121 uint32 code_point;

106 #if defined(OS_MACOSX)	122 #if defined(OS_MACOSX)

107 // Mac uses UTF-8 encoding for filenames.	123 // Mac uses UTF-8 encoding for filenames.

108 U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),	124 U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),

109 code_point);	125 code_point);

110 #elif defined(OS_WIN)	126 #elif defined(OS_WIN)

111 // Windows uses UTF-16 encoding for filenames.	127 // Windows uses UTF-16 encoding for filenames.

112 U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),	128 U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),

113 code_point);	129 code_point);

114 #elif defined(OS_POSIX)	130 #elif defined(OS_POSIX)

115 // Linux doesn't actually define an encoding. It basically allows anything	131 // Linux doesn't actually define an encoding. It basically allows anything

116 // except for a few special ASCII characters.	132 // except for a few special ASCII characters.

117 unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);	133 unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);

118 if (cur_char >= 0x80)	134 if (cur_char >= 0x80)

119 continue;	135 continue;

120 code_point = cur_char;	136 code_point = cur_char;

121 #else	137 #else

122 NOTREACHED();	138 NOTREACHED();

123 #endif	139 #endif

124	140

125 if (illegal->contains(code_point)) {	141 if (illegal->DisallowedEverywhere(code_point) \|\|

	142 ((char_begin == 0 \|\| cursor == static_cast<int>(file_name->length())) &&

	143 illegal->DisallowedLeadingOrTrailing(code_point))) {

126 file_name->replace(char_begin, cursor - char_begin, 1, replace_char);	144 file_name->replace(char_begin, cursor - char_begin, 1, replace_char);

127 // We just made the potentially multi-byte/word char into one that only	145 // We just made the potentially multi-byte/word char into one that only

128 // takes one byte/word, so need to adjust the cursor to point to the next	146 // takes one byte/word, so need to adjust the cursor to point to the next

129 // character again.	147 // character again.

130 cursor = char_begin + 1;	148 cursor = char_begin + 1;

131 }	149 }

132 }	150 }

133 }	151 }

134	152

135 bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {	153 bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {

(...skipping 27 matching lines...) Expand all Loading...
163 if (ConvertToUtf8AndNormalize(file_name->BaseName().value(),	181 if (ConvertToUtf8AndNormalize(file_name->BaseName().value(),

164 kCodepageUTF8,	182 kCodepageUTF8,

165 &normalized_str)) {	183 &normalized_str)) {

166 *file_name = file_name->DirName().Append(FilePath(normalized_str));	184 *file_name = file_name->DirName().Append(FilePath(normalized_str));

167 }	185 }

168 #endif	186 #endif

169 }	187 }

170	188

171 } // namespace i18n	189 } // namespace i18n

172 } // namespace base	190 } // namespace base

OLD	NEW

« no previous file with comments | « base/i18n/file_util_icu.h ('k') | base/i18n/file_util_icu_unittest.cc » ('j') | no next file with comments »