OLD | NEW |
1 # Copyright (C) 2011 Google Inc. All rights reserved. | 1 # Copyright (C) 2011 Google Inc. All rights reserved. |
2 # | 2 # |
3 # Redistribution and use in source and binary forms, with or without | 3 # Redistribution and use in source and binary forms, with or without |
4 # modification, are permitted provided that the following conditions are | 4 # modification, are permitted provided that the following conditions are |
5 # met: | 5 # met: |
6 # | 6 # |
7 # * Redistributions of source code must retain the above copyright | 7 # * Redistributions of source code must retain the above copyright |
8 # notice, this list of conditions and the following disclaimer. | 8 # notice, this list of conditions and the following disclaimer. |
9 # * Redistributions in binary form must reproduce the above | 9 # * Redistributions in binary form must reproduce the above |
10 # copyright notice, this list of conditions and the following disclaimer | 10 # copyright notice, this list of conditions and the following disclaimer |
(...skipping 19 matching lines...) Expand all Loading... |
30 | 30 |
31 | 31 |
class ExtractReferenceLinkParser(HTMLParser):
    """HTML parser that collects the reference links declared by a document.

    Every ``<link>`` start tag that carries both a ``rel`` and an ``href``
    value is inspected.  The ``href`` is recorded in ``matches`` when ``rel``
    contains the ``match`` token and in ``mismatches`` when it contains the
    ``mismatch`` token.

    Attributes:
        matches: hrefs of ``<link rel="match">`` tags, in document order.
        mismatches: hrefs of ``<link rel="mismatch">`` tags, in document order.
    """

    def __init__(self):
        # Explicit base-class call (instead of super()) works regardless of
        # whether HTMLParser is an old-style or a new-style class.
        HTMLParser.__init__(self)
        self.matches = []
        self.mismatches = []

    def handle_starttag(self, tag, attrs):
        """Record the href of a ``<link>`` tag whose rel is match/mismatch.

        Args:
            tag: the tag name, as passed in by HTMLParser.
            attrs: a list of (name, value) pairs; value is None for an
                attribute written without a value (e.g. ``<link href>``).
        """
        if tag != 'link':
            return
        attrs = dict(attrs)
        rel = attrs.get('rel')
        href = attrs.get('href')
        # A missing attribute and a valueless attribute both yield None;
        # neither can describe a usable reference link.
        if rel is None or href is None:
            return
        # rel is a space-separated set of case-insensitive tokens, so
        # compare per token rather than against the raw attribute string.
        rel_tokens = rel.lower().split()
        if 'match' in rel_tokens:
            self.matches.append(href)
        if 'mismatch' in rel_tokens:
            self.mismatches.append(href)
51 | 51 |
52 | 52 |
def get_reference_link(html_string):
    """Extract the reference links declared in an HTML document.

    Args:
        html_string: the HTML source to scan.

    Returns:
        a tuple of two URL lists, (matches, mismatches).
    """
    link_parser = ExtractReferenceLinkParser()
    link_parser.feed(html_string)
    # close() makes the parser process any input it is still buffering.
    link_parser.close()
    return (link_parser.matches, link_parser.mismatches)
OLD | NEW |