Index: appengine/monorail/features/spammodel.py |
diff --git a/appengine/monorail/features/spammodel.py b/appengine/monorail/features/spammodel.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..3f292a9231f21f49c3135927a723ad1370ac0cca |
--- /dev/null |
+++ b/appengine/monorail/features/spammodel.py |
@@ -0,0 +1,74 @@ |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at |
+# https://developers.google.com/open-source/licenses/bsd |
+""" Tasks and handlers for maintaining the spam classifier model. These |
+ should be run via cron and task queue rather than manually. |
+""" |
+ |
+import cgi |
+import csv |
+import logging |
+import webapp2 |
+import cloudstorage |
+import json |
+ |
+from datetime import date |
+from datetime import datetime |
+from datetime import timedelta |
+ |
+from framework import servlet |
+from framework import urls |
+from google.appengine.api import taskqueue |
+from google.appengine.api import app_identity |
+from framework import gcs_helpers |
+ |
+class TrainingDataExport(webapp2.RequestHandler): |
+ """Trigger a training data export task""" |
+ def get(self): |
+ logging.info("Training data export requested.") |
+ taskqueue.add(url=urls.SPAM_DATA_EXPORT_TASK + '.do') |
+ |
+BATCH_SIZE = 100 |
+ |
+class TrainingDataExportTask(servlet.Servlet): |
+ """Export any human-labeled ham or spam from the previous day. These |
+ records will be used by a subsequent task to create an updated model. |
+ """ |
+ CHECK_SECURITY_TOKEN = False |
+ |
+ def ProcessFormData(self, mr, post_data): |
+ logging.info("Training data export initiated.") |
+ |
+ bucket_name = app_identity.get_default_gcs_bucket_name() |
+ date_str = date.today().isoformat() |
+ export_target_path = '/' + bucket_name + '/spam_training_data/' + date_str |
+ total_issues = 0 |
+ |
+ with cloudstorage.open(export_target_path, mode='w', |
+ content_type=None, options=None, retry_params=None) as gcs_file: |
+ |
+ csv_writer = csv.writer(gcs_file, delimiter=',', quotechar='"', |
+ quoting=csv.QUOTE_ALL, lineterminator='\n') |
+ |
+ since = datetime.now() - timedelta(days=1) |
+ |
+ # TODO: Comments, and further pagination |
+ issues, first_comments, _count = ( |
+ self.services.spam.GetTrainingIssues( |
+ mr.cnxn, self.services.issue, since, offset=0, limit=BATCH_SIZE)) |
+ total_issues += len(issues) |
+ for issue in issues: |
+ # Cloud Prediction API doesn't allow newlines in the training data. |
+ fixed_summary = issue.summary.replace('\r\n', ' ') |
+ fixed_comment = first_comments[issue.issue_id].replace('\r\n', ' ') |
+ |
+ csv_writer.writerow([ |
+ 'spam' if issue.is_spam else 'ham', |
+ fixed_summary, fixed_comment, |
+ ]) |
+ |
+ self.response.body = json.dumps({ |
+ "exported_issue_count": total_issues, |
+ }) |
+ |