third_party/requests/packages/charade/chardistribution.py - Issue 24076010: Add 'requests' library to third_party.

Side by Side Diff: third_party/requests/packages/charade/chardistribution.py

Issue 24076010: Add 'requests' library to third_party. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/tools/swarm_client

Patch Set: Created 7 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« third_party/requests/adapters.py ('K') | « third_party/requests/packages/charade/big5prober.py ('k') | third_party/requests/packages/charade/charsetgroupprober.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 ######################## BEGIN LICENSE BLOCK ########################

	2 # The Original Code is Mozilla Communicator client code.

	3 #

	4 # The Initial Developer of the Original Code is

	5 # Netscape Communications Corporation.

	6 # Portions created by the Initial Developer are Copyright (C) 1998

	7 # the Initial Developer. All Rights Reserved.

	8 #

	9 # Contributor(s):

	10 # Mark Pilgrim - port to Python

	11 #

	12 # This library is free software; you can redistribute it and/or

	13 # modify it under the terms of the GNU Lesser General Public

	14 # License as published by the Free Software Foundation; either

	15 # version 2.1 of the License, or (at your option) any later version.

	16 #

	17 # This library is distributed in the hope that it will be useful,

	18 # but WITHOUT ANY WARRANTY; without even the implied warranty of

	19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

	20 # Lesser General Public License for more details.

	21 #

	22 # You should have received a copy of the GNU Lesser General Public

	23 # License along with this library; if not, write to the Free Software

	24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

	25 # 02110-1301 USA

	26 ######################### END LICENSE BLOCK #########################

	27

	28 from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE,

	29 EUCTW_TYPICAL_DISTRIBUTION_RATIO)

	30 from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE,

	31 EUCKR_TYPICAL_DISTRIBUTION_RATIO)

	32 from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE,

	33 GB2312_TYPICAL_DISTRIBUTION_RATIO)

	34 from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE,

	35 BIG5_TYPICAL_DISTRIBUTION_RATIO)

	36 from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE,

	37 JIS_TYPICAL_DISTRIBUTION_RATIO)

	38 from .compat import wrap_ord

	39

	40 ENOUGH_DATA_THRESHOLD = 1024  # got_enough_data() is true once more than this many characters were counted

	41 SURE_YES = 0.99  # upper cap on the value returned by get_confidence()

	42 SURE_NO = 0.01  # returned by get_confidence() when too few characters were counted

	43 MINIMUM_DATA_THRESHOLD = 3  # get_confidence() answers SURE_NO unless the frequent-character count exceeds this

	44

	45

	46 class CharDistributionAnalysis:

	47 def __init__(self):

	48 # Mapping table to get frequency order from char order (get from

	49 # GetOrder())

	50 self._mCharToFreqOrder = None

	51 self._mTableSize = None # Size of above table

	52 # This is a constant value which varies from language to language,

	53 # used in calculating confidence. See

	54 # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html

	55 # for further detail.

	56 self._mTypicalDistributionRatio = None

	57 self.reset()

	58

	59 def reset(self):

	60 """reset analyser, clear any state"""

	61 # If this flag is set to True, detection is done and conclusion has

	62 # been made

	63 self._mDone = False

	64 self._mTotalChars = 0 # Total characters encountered

	65 # The number of characters whose frequency order is less than 512

	66 self._mFreqChars = 0

	67

	68 def feed(self, aBuf, aCharLen):

	69 """feed a character with known length"""

	70 if aCharLen == 2:

	71 # we only care about 2-bytes character in our distribution analysis

	72 order = self.get_order(aBuf)

	73 else:

	74 order = -1

	75 if order >= 0:

	76 self._mTotalChars += 1

	77 # order is valid

	78 if order < self._mTableSize:

	79 if 512 > self._mCharToFreqOrder[order]:

	80 self._mFreqChars += 1

	81

	82 def get_confidence(self):

	83 """return confidence based on existing data"""

	84 # if we didn't receive any character in our consideration range,

	85 # return negative answer

	86 if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:

	87 return SURE_NO

	88

	89 if self._mTotalChars != self._mFreqChars:

	90 r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)

	91 * self._mTypicalDistributionRatio))

	92 if r < SURE_YES:

	93 return r

	94

	95 # normalize confidence (we don't want to be 100% sure)

	96 return SURE_YES

	97

	98 def got_enough_data(self):

	99 # It is not necessary to receive all data to draw conclusion.

	100 # For charset detection, certain amount of data is enough

	101 return self._mTotalChars > ENOUGH_DATA_THRESHOLD

	102

	103 def get_order(self, aBuf):

	104 # We do not handle characters based on the original encoding string,

	105 # but convert this encoding string to a number, here called order.

	106 # This allows multiple encodings of a language to share one frequency

	107 # table.

	108 return -1

	109

	110

	111 class EUCTWDistributionAnalysis(CharDistributionAnalysis):

	112 def __init__(self):

	113 CharDistributionAnalysis.__init__(self)

	114 self._mCharToFreqOrder = EUCTWCharToFreqOrder

	115 self._mTableSize = EUCTW_TABLE_SIZE

	116 self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

	117

	118 def get_order(self, aBuf):

	119 # for euc-TW encoding, we are interested

	120 # first byte range: 0xc4 -- 0xfe

	121 # second byte range: 0xa1 -- 0xfe

	122 # no validation needed here. State machine has done that

	123 first_char = wrap_ord(aBuf[0])

	124 if first_char >= 0xC4:

	125 return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1

	126 else:

	127 return -1

	128

	129

	130 class EUCKRDistributionAnalysis(CharDistributionAnalysis):

	131 def __init__(self):

	132 CharDistributionAnalysis.__init__(self)

	133 self._mCharToFreqOrder = EUCKRCharToFreqOrder

	134 self._mTableSize = EUCKR_TABLE_SIZE

	135 self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

	136

	137 def get_order(self, aBuf):

	138 # for euc-KR encoding, we are interested

	139 # first byte range: 0xb0 -- 0xfe

	140 # second byte range: 0xa1 -- 0xfe

	141 # no validation needed here. State machine has done that

	142 first_char = wrap_ord(aBuf[0])

	143 if first_char >= 0xB0:

	144 return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1

	145 else:

	146 return -1

	147

	148

	149 class GB2312DistributionAnalysis(CharDistributionAnalysis):

	150 def __init__(self):

	151 CharDistributionAnalysis.__init__(self)

	152 self._mCharToFreqOrder = GB2312CharToFreqOrder

	153 self._mTableSize = GB2312_TABLE_SIZE

	154 self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

	155

	156 def get_order(self, aBuf):

	157 # for GB2312 encoding, we are interested

	158 # first byte range: 0xb0 -- 0xfe

	159 # second byte range: 0xa1 -- 0xfe

	160 # no validation needed here. State machine has done that

	161 first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])

	162 if (first_char >= 0xB0) and (second_char >= 0xA1):

	163 return 94 * (first_char - 0xB0) + second_char - 0xA1

	164 else:

	165 return -1

	166

	167

	168 class Big5DistributionAnalysis(CharDistributionAnalysis):

	169 def __init__(self):

	170 CharDistributionAnalysis.__init__(self)

	171 self._mCharToFreqOrder = Big5CharToFreqOrder

	172 self._mTableSize = BIG5_TABLE_SIZE

	173 self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

	174

	175 def get_order(self, aBuf):

	176 # for big5 encoding, we are interested

	177 # first byte range: 0xa4 -- 0xfe

	178 # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe

	179 # no validation needed here. State machine has done that

	180 first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])

	181 if first_char >= 0xA4:

	182 if second_char >= 0xA1:

	183 return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63

	184 else:

	185 return 157 * (first_char - 0xA4) + second_char - 0x40

	186 else:

	187 return -1

	188

	189

	190 class SJISDistributionAnalysis(CharDistributionAnalysis):

	191 def __init__(self):

	192 CharDistributionAnalysis.__init__(self)

	193 self._mCharToFreqOrder = JISCharToFreqOrder

	194 self._mTableSize = JIS_TABLE_SIZE

	195 self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

	196

	197 def get_order(self, aBuf):

	198 # for sjis encoding, we are interested

	199 # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe

	200 # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe

	201 # no validation needed here. State machine has done that

	202 first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])

	203 if (first_char >= 0x81) and (first_char <= 0x9F):

	204 order = 188 * (first_char - 0x81)

	205 elif (first_char >= 0xE0) and (first_char <= 0xEF):

	206 order = 188 * (first_char - 0xE0 + 31)

	207 else:

	208 return -1

	209 order = order + second_char - 0x40

	210 if second_char > 0x7F:

	211 order = -1

	212 return order

	213

	214

	215 class EUCJPDistributionAnalysis(CharDistributionAnalysis):

	216 def __init__(self):

	217 CharDistributionAnalysis.__init__(self)

	218 self._mCharToFreqOrder = JISCharToFreqOrder

	219 self._mTableSize = JIS_TABLE_SIZE

	220 self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

	221

	222 def get_order(self, aBuf):

	223 # for euc-JP encoding, we are interested

	224 # first byte range: 0xa0 -- 0xfe

	225 # second byte range: 0xa1 -- 0xfe

	226 # no validation needed here. State machine has done that

	227 char = wrap_ord(aBuf[0])

	228 if char >= 0xA0:

	229 return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1

	230 else:

	231 return -1

OLD	NEW