Index: appengine/monorail/features/spammodel.py |
diff --git a/appengine/monorail/features/spammodel.py b/appengine/monorail/features/spammodel.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..3f292a9231f21f49c3135927a723ad1370ac0cca |
--- /dev/null |
+++ b/appengine/monorail/features/spammodel.py |
@@ -0,0 +1,74 @@ |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at |
+# https://developers.google.com/open-source/licenses/bsd |
+""" Tasks and handlers for maintaining the spam classifier model. These |
+ should be run via cron and task queue rather than manually. |
+""" |
+ |
+import cgi |
+import csv |
+import logging |
+import webapp2 |
+import cloudstorage |
+import json |
+ |
+from datetime import date |
+from datetime import datetime |
+from datetime import timedelta |
+ |
+from framework import servlet |
+from framework import urls |
+from google.appengine.api import taskqueue |
+from google.appengine.api import app_identity |
+from framework import gcs_helpers |
+ |
+class TrainingDataExport(webapp2.RequestHandler): |
+ """Trigger a training data export task""" |
+ def get(self): |
+ logging.info("Training data export requested.") |
+ taskqueue.add(url=urls.SPAM_DATA_EXPORT_TASK + '.do') |
+ |
+BATCH_SIZE = 100 |
+ |
+class TrainingDataExportTask(servlet.Servlet): |
+ """Export any human-labeled ham or spam from the previous day. These |
+ records will be used by a subsequent task to create an updated model. |
+ """ |
+ CHECK_SECURITY_TOKEN = False |
+ |
+ def ProcessFormData(self, mr, post_data): |
+ logging.info("Training data export initiated.") |
+ |
+ bucket_name = app_identity.get_default_gcs_bucket_name() |
+ date_str = date.today().isoformat() |
+ export_target_path = '/' + bucket_name + '/spam_training_data/' + date_str |
+ total_issues = 0 |
+ |
+ with cloudstorage.open(export_target_path, mode='w', |
+ content_type=None, options=None, retry_params=None) as gcs_file: |
+ |
+ csv_writer = csv.writer(gcs_file, delimiter=',', quotechar='"', |
+ quoting=csv.QUOTE_ALL, lineterminator='\n') |
+ |
+ since = datetime.now() - timedelta(days=1) |
+ |
+ # TODO: Comments, and further pagination |
+ issues, first_comments, _count = ( |
+ self.services.spam.GetTrainingIssues( |
+ mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE)) |
+ total_issues += len(issues) |
+ for issue in issues: |
+ # Cloud Prediction API doesn't allow newlines in the training data. |
+ fixed_summary = issue.summary.replace('\r\n', ' ') |
+ fixed_comment = first_comments[issue.issue_id].replace('\r\n', ' ') |
+ |
+ csv_writer.writerow([ |
+ 'spam' if issue.is_spam else 'ham', |
+ fixed_summary, fixed_comment, |
+ ]) |
+ |
+ self.response.body = json.dumps({ |
+ "exported_issue_count": total_issues, |
+ }) |
+ |