| OLD | NEW |
| 1 # Copyright (C) 2011 Google Inc. All rights reserved. | 1 # Copyright (C) 2011 Google Inc. All rights reserved. |
| 2 # | 2 # |
| 3 # Redistribution and use in source and binary forms, with or without | 3 # Redistribution and use in source and binary forms, with or without |
| 4 # modification, are permitted provided that the following conditions are | 4 # modification, are permitted provided that the following conditions are |
| 5 # met: | 5 # met: |
| 6 # | 6 # |
| 7 # * Redistributions of source code must retain the above copyright | 7 # * Redistributions of source code must retain the above copyright |
| 8 # notice, this list of conditions and the following disclaimer. | 8 # notice, this list of conditions and the following disclaimer. |
| 9 # * Redistributions in binary form must reproduce the above | 9 # * Redistributions in binary form must reproduce the above |
| 10 # copyright notice, this list of conditions and the following disclaimer | 10 # copyright notice, this list of conditions and the following disclaimer |
| (...skipping 19 matching lines...) Expand all Loading... |
| 30 | 30 |
| 31 | 31 |
class ExtractReferenceLinkParser(HTMLParser):
    """Collects the href values of <link> tags marked rel 'match'/'mismatch'.

    Feed HTML to an instance and then read the two result lists:

    Attributes:
        matches: href values of <link> tags whose rel contains the token
            'match', in the order the tags were parsed.
        mismatches: href values of <link> tags whose rel contains the token
            'mismatch', in the order the tags were parsed.
    """

    def __init__(self):
        # Explicit base-class call kept from the original; the HTMLParser
        # import is outside this view, so super() is not assumed to work.
        HTMLParser.__init__(self)
        self.matches = []
        self.mismatches = []

    def handle_starttag(self, tag, attrs):
        """Records the href of a <link> start tag that names a reference.

        Args:
            tag: the tag name as reported by HTMLParser.
            attrs: a list of (name, value) pairs for the tag's attributes;
                value is None for an attribute written without a value.
        """
        if tag != 'link':
            return
        attrs = dict(attrs)
        rel = attrs.get('rel')
        href = attrs.get('href')
        # A valueless attribute (e.g. <link rel=match href>) is reported as
        # None; there is no URL to record, so do not append None to the lists.
        if rel is None or href is None:
            return
        # rel is a whitespace-separated list of case-insensitive link types,
        # so compare tokens rather than the raw attribute string.
        link_types = rel.lower().split()
        if 'match' in link_types:
            self.matches.append(href)
        if 'mismatch' in link_types:
            self.mismatches.append(href)
| 51 | 51 |
| 52 | 52 |
def get_reference_link(html_string):
    """Extracts the reference URLs declared by <link> tags in html_string.

    Args:
        html_string: the HTML markup to scan.

    Returns:
        a tuple of two URL lists, (matches, mismatches), as collected by
        ExtractReferenceLinkParser.
    """
    link_parser = ExtractReferenceLinkParser()
    link_parser.feed(html_string)
    # close() forces the parser to process any input it is still buffering.
    link_parser.close()
    return (link_parser.matches, link_parser.mismatches)
| OLD | NEW |