swarm_client/third_party/requests/packages/charade/chardistribution.py - Issue 69143004: Delete swarm_client.

Side by Side Diff: swarm_client/third_party/requests/packages/charade/chardistribution.py

Issue 69143004: Delete swarm_client. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« no previous file with comments | « swarm_client/third_party/requests/packages/charade/big5prober.py ('k') | swarm_client/third_party/requests/packages/charade/charsetgroupprober.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 ######################## BEGIN LICENSE BLOCK ########################

2 # The Original Code is Mozilla Communicator client code.

3 #

4 # The Initial Developer of the Original Code is

5 # Netscape Communications Corporation.

6 # Portions created by the Initial Developer are Copyright (C) 1998

7 # the Initial Developer. All Rights Reserved.

8 #

9 # Contributor(s):

10 # Mark Pilgrim - port to Python

11 #

12 # This library is free software; you can redistribute it and/or

13 # modify it under the terms of the GNU Lesser General Public

14 # License as published by the Free Software Foundation; either

15 # version 2.1 of the License, or (at your option) any later version.

16 #

17 # This library is distributed in the hope that it will be useful,

18 # but WITHOUT ANY WARRANTY; without even the implied warranty of

19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

20 # Lesser General Public License for more details.

21 #

22 # You should have received a copy of the GNU Lesser General Public

23 # License along with this library; if not, write to the Free Software

24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

25 # 02110-1301 USA

26 ######################### END LICENSE BLOCK #########################

27

28 from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,

29 EUCTW_TYPICAL_DISTRIBUTION_RATIO)

30 from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,

31 EUCKR_TYPICAL_DISTRIBUTION_RATIO)

32 from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,

33 GB2312_TYPICAL_DISTRIBUTION_RATIO)

34 from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,

35 BIG5_TYPICAL_DISTRIBUTION_RATIO)

36 from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,

37 JIS_TYPICAL_DISTRIBUTION_RATIO)

38 from .compat import wrap_ord

39

# got_enough_data() reports True once more than this many characters have
# been counted.
ENOUGH_DATA_THRESHOLD = 1024
# Upper bound returned by get_confidence(); never report 100% certainty.
SURE_YES = 0.99
# Confidence returned by get_confidence() when there is too little evidence.
SURE_NO = 0.01
# get_confidence() returns SURE_NO unless the number of frequent characters
# seen is strictly greater than this value.
MINIMUM_DATA_THRESHOLD = 3

44

45

class CharDistributionAnalysis(object):
    """Base class for 2-byte character frequency distribution analysis.

    Subclasses supply a frequency-order table, its size, a typical
    distribution ratio, and a ``get_order`` implementation that maps a
    2-byte character to an index into that table.
    """

    def __init__(self):
        # Mapping table to get frequency order from char order (get from
        # get_order())
        self._mCharToFreqOrder = None
        self._mTableSize = None  # Size of above table
        # This is a constant value which varies from language to language,
        # used in calculating confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self._mTypicalDistributionRatio = None
        self.reset()

    def reset(self):
        """Reset the analyser, clearing all accumulated state."""
        # If this flag is set to True, detection is done and conclusion has
        # been made
        self._mDone = False
        self._mTotalChars = 0  # Total characters encountered
        # The number of characters whose frequency order is less than 512
        self._mFreqChars = 0

    def feed(self, aBuf, aCharLen):
        """Feed one character of known byte length into the analysis.

        ``aBuf`` holds the bytes of the character and ``aCharLen`` is its
        length in bytes.  Only 2-byte characters are counted; anything else,
        and any character for which ``get_order`` returns a negative value,
        is ignored.
        """
        if aCharLen == 2:
            # we only care about 2-byte characters in our distribution
            # analysis
            order = self.get_order(aBuf)
        else:
            order = -1
        if order < 0:
            return
        # order is valid
        self._mTotalChars += 1
        # Characters beyond the table still count towards the total, but can
        # never be "frequent".
        if order < self._mTableSize and self._mCharToFreqOrder[order] < 512:
            self._mFreqChars += 1

    def get_confidence(self):
        """Return a confidence value based on the data seen so far.

        Returns ``SURE_NO`` when no character has been counted or when the
        number of frequent characters does not exceed
        ``MINIMUM_DATA_THRESHOLD``.  Otherwise returns the ratio of frequent
        to non-frequent characters, scaled by the typical distribution
        ratio, capped at ``SURE_YES``.
        """
        # if we didn't receive any character in our consideration range,
        # return negative answer
        if (self._mTotalChars <= 0
                or self._mFreqChars <= MINIMUM_DATA_THRESHOLD):
            return SURE_NO

        # Guard against dividing by zero when every character was frequent.
        if self._mTotalChars != self._mFreqChars:
            r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
                 * self._mTypicalDistributionRatio))
            if r < SURE_YES:
                return r

        # normalize confidence (we don't want to be 100% sure)
        return SURE_YES

    def got_enough_data(self):
        """Return True once more than ENOUGH_DATA_THRESHOLD chars are seen."""
        # It is not necessary to receive all data to draw conclusion.
        # For charset detection, certain amount of data is enough
        return self._mTotalChars > ENOUGH_DATA_THRESHOLD

    def get_order(self, aBuf):
        """Map a 2-byte character to its order; the base class returns -1."""
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.
        return -1

109

110

class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the EUC-TW frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCTWCharToFreqOrder
        self._mTableSize = EUCTW_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the frequency-table index for a 2-byte EUC-TW character.

        Returns -1 when the first byte is below 0xC4.
        """
        # for euc-TW encoding, we are interested
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char >= 0xC4:
            # 94 second-byte values (0xA1..0xFE) per first byte.
            return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1
        else:
            return -1

128

129

class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the EUC-KR frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCKRCharToFreqOrder
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the frequency-table index for a 2-byte EUC-KR character.

        Returns -1 when the first byte is below 0xB0.
        """
        # for euc-KR encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char >= 0xB0:
            # 94 second-byte values (0xA1..0xFE) per first byte.
            return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1
        else:
            return -1

147

148

class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the GB2312 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = GB2312CharToFreqOrder
        self._mTableSize = GB2312_TABLE_SIZE
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the frequency-table index for a 2-byte GB2312 character.

        Returns -1 unless the first byte is at least 0xB0 and the second
        byte is at least 0xA1.
        """
        # for GB2312 encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if (first_char >= 0xB0) and (second_char >= 0xA1):
            # 94 second-byte values (0xA1..0xFE) per first byte.
            return 94 * (first_char - 0xB0) + second_char - 0xA1
        else:
            return -1

166

167

class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis using the Big5 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = Big5CharToFreqOrder
        self._mTableSize = BIG5_TABLE_SIZE
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the frequency-table index for a 2-byte Big5 character.

        Returns -1 when the first byte is below 0xA4.
        """
        # for big5 encoding, we are interested
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if first_char >= 0xA4:
            # 157 second-byte slots per first byte: 63 for 0x40..0x7E
            # followed by 94 for 0xA1..0xFE.
            if second_char >= 0xA1:
                return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
            else:
                return 157 * (first_char - 0xA4) + second_char - 0x40
        else:
            return -1

188

189

class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis for Shift_JIS using the JIS table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the frequency-table index for a 2-byte Shift_JIS character.

        Returns -1 when the first byte is outside 0x81..0x9F and 0xE0..0xEF,
        or when the second byte is above 0x7F.
        """
        # for sjis encoding, we are interested
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
        #   second byte range: 0x40 -- 0x7e,  0x81 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if (first_char >= 0x81) and (first_char <= 0x9F):
            order = 188 * (first_char - 0x81)
        elif (first_char >= 0xE0) and (first_char <= 0xEF):
            # The 31 first bytes 0x81..0x9F come before this range.
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        # NOTE(review): this discards every character whose second byte is
        # above 0x7F.  The stride of 188 slots per first byte suggests the
        # intent may have been ``order -= 1`` (skip the unused 0x7F slot)
        # rather than ``order = -1`` -- confirm against the upstream Mozilla
        # implementation before changing behaviour.
        if second_char > 0x7F:
            order = -1
        return order

213

214

class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Character distribution analysis for EUC-JP using the JIS table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        """Return the frequency-table index for a 2-byte EUC-JP character.

        Returns -1 when the first byte is below 0xA0.
        """
        # for euc-JP encoding, we are interested
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        char = wrap_ord(aBuf[0])
        if char >= 0xA0:
            # The offset is taken from 0xA1, so a first byte of exactly 0xA0
            # gives a negative order for any second byte up to 0xFE.
            return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xA1
        else:
            return -1

OLD	NEW