| Index: heuristics/distillable/calculate_derived_features.py
|
| diff --git a/heuristics/distillable/calculate_derived_features.py b/heuristics/distillable/calculate_derived_features.py
|
| index 1ae52967464a57a2c5a730d77860f86dcdf3d738..dcff117ccfdb697663290571355651b5bfe92a74 100755
|
| --- a/heuristics/distillable/calculate_derived_features.py
|
| +++ b/heuristics/distillable/calculate_derived_features.py
|
| @@ -23,8 +23,28 @@ def WordCount(s):
|
| def GetLastSegment(path):
|
| return re.search('[^/]*\/?$', path).group(0)
|
|
|
| -def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
|
| - innerText, textContent, innerHTML, numText, numPassword, mozScores):
|
| +def CalcDerivedFeatures(index, raw):
|
| + return _CalcDerivedFeatures(
|
| + index,
|
| + raw,
|
| + raw['opengraph'],
|
| + raw['url'],
|
| + raw['title'],
|
| + raw['numElements'],
|
| + raw['numAnchors'],
|
| + raw['numForms'],
|
| + raw['numPPRE'],
|
| + raw['visibleElements'],
|
| + raw['visiblePPRE'],
|
| + raw['innerText'],
|
| + raw['textContent'],
|
| + raw['innerHTML'],
|
| + raw['numTextInput'],
|
| + raw['numPasswordInput']
|
| + )
|
| +
|
| +def _CalcDerivedFeatures(index, raw, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
|
| + innerText, textContent, innerHTML, numText, numPassword):
|
| path = urlparse.urlparse(url).path
|
|
|
| path = path.encode('utf-8')
|
| @@ -35,7 +55,9 @@ def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, n
|
| innerTextWords = WordCount(innerText)
|
| textContentWords = WordCount(textContent)
|
| innerHTMLWords = WordCount(innerHTML)
|
| - return [
|
| + features = [
|
| + 'id', index,
|
| + 'sin', math.sin(index),
|
| 'openGraph', opengraph,
|
|
|
| 'forum', 'forum' in path,
|
| @@ -53,16 +75,39 @@ def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, n
|
| 'pathNumbers', CountMatches(path, r'\d+'),
|
| 'lastSegmentLength', len(GetLastSegment(path)),
|
|
|
| + 'visibleRatio', float(visibleElements) / max(1, numElements),
|
| + 'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
|
| + 'PPRERatio', float(numPPRE) / max(1, numElements),
|
| + 'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),
|
| +
|
| + 'innerTextLength', len(innerText),
|
| + 'textContentLength', len(textContent),
|
| + 'innerHtmlLength', len(innerHTML),
|
| + 'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
|
| + 'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)),
|
| + 'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textContent)),
|
| +
|
| + 'innerTextWordCount', innerTextWords,
|
| + 'textContentWordCount', textContentWords,
|
| + 'innerhtmlWordCount', innerHTMLWords,
|
| + 'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
|
| + 'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords),
|
| + 'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textContentWords),
|
| +
|
| + 'textCount', numText,
|
| + 'passwordCount', numPassword,
|
| 'formCount', numForms,
|
| 'anchorCount', numAnchors,
|
| 'elementCount', numElements,
|
| 'anchorRatio', float(numAnchors) / max(1, numElements),
|
| -
|
| - 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)),
|
| - 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)),
|
| - 'mozScoreAllLinear', min(mozScores[5], 6000),
|
| ]
|
|
|
| + for k in sorted(raw):
|
| + if 'mozScore' in k or 'num' in k:
|
| + features += [k, raw[k]]
|
| +
|
| + return features
|
| +
|
| def main(argv):
|
| parser = argparse.ArgumentParser()
|
| parser.add_argument('--out', required=True)
|
| @@ -79,25 +124,8 @@ def main(argv):
|
| for entry in core:
|
| features = entry['features']
|
| print 'processing %d' % (entry['index'])
|
| - entry['features'] = CalcDerivedFeatures(
|
| - entry['index'],
|
| - features['opengraph'],
|
| - features['url'],
|
| - features['title'],
|
| - features['numElements'],
|
| - features['numAnchors'],
|
| - features['numForms'],
|
| - features['numPPRE'],
|
| - features['visibleElements'],
|
| - features['visiblePPRE'],
|
| - features['innerText'],
|
| - features['textContent'],
|
| - features['innerHTML'],
|
| - features['numTextInput'],
|
| - features['numPasswordInput'],
|
| - [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'],
|
| - features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']]
|
| - )
|
| +
|
| + entry['features'] = CalcDerivedFeatures(entry['index'], features)
|
|
|
| with open(options.out, 'w') as outfile:
|
| json.dump(core, outfile, indent=1)
|
|
|