# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd
"""Tasks and handlers for maintaining the spam classifier model. These
should be run via cron and task queue rather than manually.
"""

import cgi
import csv
import json
import logging

from datetime import date
from datetime import datetime
from datetime import timedelta

import cloudstorage
import webapp2

from google.appengine.api import app_identity
from google.appengine.api import taskqueue

from framework import gcs_helpers
from framework import servlet
from framework import urls

class TrainingDataExport(webapp2.RequestHandler):
  """Trigger a training data export task."""

  def get(self):
    logging.info("Training data export requested.")
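    # This GET handler is meant to be hit by cron; it enqueues a POST to the
    # task handler below on the default task queue (taskqueue.add defaults to
    # the POST method). The cron.yaml/queue.yaml wiring lives outside this file.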
    taskqueue.add(url=urls.SPAM_DATA_EXPORT_TASK + '.do')

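# Maximum number of issues fetched per export run; paging past the first
# batch is still a TODO below.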
BATCH_SIZE = 100

class TrainingDataExportTask(servlet.Servlet):
  """Export any human-labeled ham or spam from the previous day. These
  records will be used by a subsequent task to create an updated model.
  """
  CHECK_SECURITY_TOKEN = False

  def ProcessFormData(self, mr, post_data):
    logging.info("Training data export initiated.")

    bucket_name = app_identity.get_default_gcs_bucket_name()
    date_str = date.today().isoformat()
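    # cloudstorage object paths take the form /<bucket>/<object>, and
    # date.isoformat() yields YYYY-MM-DD, so each day's export lands at
    # /<bucket>/spam_training_data/YYYY-MM-DD.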
    export_target_path = '/' + bucket_name + '/spam_training_data/' + date_str
    total_issues = 0

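    # content_type, options and retry_params are passed as None, which the GCS
    # client library treats the same as leaving them at their defaults.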
    with cloudstorage.open(export_target_path, mode='w',
        content_type=None, options=None, retry_params=None) as gcs_file:

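      # QUOTE_ALL keeps summaries and comments containing commas, quotes, or
      # other CSV metacharacters confined to a single field.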
      csv_writer = csv.writer(gcs_file, delimiter=',', quotechar='"',
          quoting=csv.QUOTE_ALL, lineterminator='\n')

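      # Export window: anything labeled in roughly the preceding 24 hours.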
      since = datetime.now() - timedelta(days=1)

      # TODO: Also export follow-up comments, and paginate past the first
      # batch of BATCH_SIZE issues.
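      # GetTrainingIssues is expected to return (issues, {issue_id: first
      # comment text}, total match count); only the first BATCH_SIZE issues
      # are fetched here.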
      issues, first_comments, _count = (
          self.services.spam.GetTrainingIssues(
              mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE))
      total_issues += len(issues)
      for issue in issues:
        # Cloud Prediction API doesn't allow newlines in the training data,
        # so collapse both CRLF and bare LF line breaks into spaces.
        fixed_summary = issue.summary.replace('\r\n', ' ').replace('\n', ' ')
        fixed_comment = first_comments[issue.issue_id].replace(
            '\r\n', ' ').replace('\n', ' ')

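        # Each row is written as: label ('spam' or 'ham'), summary, first
        # comment, all fully quoted.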
        csv_writer.writerow([
            'spam' if issue.is_spam else 'ham',
            fixed_summary, fixed_comment,
        ])

    self.response.body = json.dumps({
        "exported_issue_count": total_issues,
    })

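# Illustrative sketch only, not part of this change: a follow-up training task
# could read a day's export back out of GCS along these lines. The path format
# matches the writer above; the trainer itself is assumed, not shown here.
#
#   day_path = '/' + bucket_name + '/spam_training_data/' + date_str
#   with cloudstorage.open(day_path) as gcs_file:  # default mode is 'r'
#     for label, summary, first_comment in csv.reader(gcs_file):
#       pass  # e.g. feed (label, summary + ' ' + first_comment) to the trainer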