OLD | NEW |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2013 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Creates a Python telemetry page_set from the specified webpages CSV. | 6 """Creates a Python telemetry page_set from the specified webpages CSV. |
7 | 7 |
8 This module does the following steps: | 8 This module does the following steps: |
9 * Downloads a ZIP from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip | 9 * Downloads a ZIP from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip |
10 * Unpacks it and reads its contents in memory. | 10 * Unpacks it and reads its contents in memory. |
11 * Writes out multiple Python page sets from the CSV file for the specified | 11 * Writes out multiple Python page sets from the CSV file for the specified |
12 number of webpages. | 12 number of webpages. |
13 | 13 |
14 Sample Usage: | 14 Sample Usage: |
15 python create_page_set.py -s 1 -e 10000 | 15 python create_page_set.py -s 1 -e 10000 |
16 | 16 |
17 Running the above command will create 10000 different page sets. | 17 Running the above command will create 10000 different page sets. |
18 """ | 18 """ |
19 | 19 |
20 __author__ = 'Ravi Mistry' | 20 __author__ = 'Ravi Mistry' |
21 | 21 |
22 import json | |
22 import optparse | 23 import optparse |
23 import os | 24 import os |
24 import urllib | 25 import urllib |
25 import zipfile | 26 import zipfile |
26 | 27 |
27 from StringIO import StringIO | 28 from StringIO import StringIO |
28 | 29 |
29 | 30 |
30 TOP1M_CSV_FILE_NAME = 'top-1m.csv' | 31 TOP1M_CSV_FILE_NAME = 'top-1m.csv' |
31 TOP1M_CSV_ZIP_LOCATION = ( | 32 TOP1M_CSV_ZIP_LOCATION = ( |
(...skipping 66 matching lines...) | |
98 qualified_website = website | 99 qualified_website = website |
99 else: | 100 else: |
100 qualified_website = 'http://www.%s' % website | 101 qualified_website = 'http://www.%s' % website |
101 websites.append(qualified_website) | 102 websites.append(qualified_website) |
102 | 103 |
103 archive_data_file = os.path.join( | 104 archive_data_file = os.path.join( |
104 '/', 'b', 'storage', 'webpage_archives', | 105 '/', 'b', 'storage', 'webpage_archives', |
105 options.pagesets_type, | 106 options.pagesets_type, |
106 'alexa%s-%s.json' % (options.start_number, options.end_number)) | 107 'alexa%s-%s.json' % (options.start_number, options.end_number)) |
107 | 108 |
108 page_set_content = """ | 109 page_set_content = { |
109 # Copyright 2015 The Chromium Authors. All rights reserved. | 110 'user_agent': options.useragent_type, |
110 # Use of this source code is governed by a BSD-style license that can be | 111 'archive_data_file': archive_data_file, |
111 # found in the LICENSE file. | 112 'urls_list': ','.join(websites), |
112 # pylint: disable=W0401,W0614 | |
113 | |
114 from telemetry import story | |
115 from telemetry.page import page as page_module | |
116 from telemetry.page import shared_page_state | |
117 from page_sets import repaint_helpers | |
118 | |
119 | |
120 class TypicalAlexaPage(page_module.Page): | |
121 | |
122 def __init__(self, url, page_set): | |
123 super(TypicalAlexaPage, self).__init__( | |
124 url=url, | |
125 page_set=page_set, | |
126 shared_page_state_class=shared_page_state.Shared%(user_agent)sPageState) | |
127 self.archive_data_file = '%(archive_data_file)s' | |
128 | |
129 def RunNavigateSteps(self, action_runner): | |
130 action_runner.Navigate(self.url) | |
131 action_runner.Wait(2) | |
132 | |
133 def RunPageInteractions(self, action_runner): | |
134 repaint_helpers.Repaint(action_runner) | |
135 | |
136 | |
137 class Alexa%(start)s_%(end)sPageSet(story.StorySet): | |
138 | |
139 def __init__(self): | |
140 super(Alexa%(start)s_%(end)sPageSet, self).__init__( | |
141 archive_data_file='%(archive_data_file)s') | |
142 | |
143 urls_list = %(urls_list)s | |
144 | |
145 for url in urls_list: | |
146 self.AddStory(TypicalAlexaPage(url, self)) | |
147 """ % { | |
148 "user_agent": options.useragent_type.capitalize(), | |
149 "archive_data_file": archive_data_file, | |
150 "start": options.start_number, | |
151 "end": options.end_number, | |
152 "urls_list": str(websites), | |
153 } | 113 } |
154 | 114 |
155 # Output the pageset to a file. | 115 # Output the pageset to a file. |
156 with open(os.path.join(options.pagesets_output_dir, 'alexa%s_%s.py' % ( | 116 with open(os.path.join(options.pagesets_output_dir, 'alexa%s_%s.py' % ( |
157 options.start_number, options.end_number)), | 117 options.start_number, options.end_number)), |
158 'w') as outfile: | 118 'w') as outfile: |
159 outfile.write(page_set_content) | 119 json.dump(page_set_content, outfile) |
dogben
2015/10/14 18:25:50
Any potential issues with changing the format of t
rmistry
2015/10/15 12:23:21
Right, all pagesets will need to be recreated and
| |
160 | |
OLD | NEW |