Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(577)

Side by Side Diff: appengine/monorail/features/spammodel.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style
3 # license that can be found in the LICENSE file or at
4 # https://developers.google.com/open-source/licenses/bsd
5 """ Tasks and handlers for maintaining the spam classifier model. These
6 should be run via cron and task queue rather than manually.
7 """
8
9 import cgi
10 import csv
11 import logging
12 import webapp2
13 import cloudstorage
14 import json
15
16 from datetime import date
17 from datetime import datetime
18 from datetime import timedelta
19
20 from framework import servlet
21 from framework import urls
22 from google.appengine.api import taskqueue
23 from google.appengine.api import app_identity
24 from framework import gcs_helpers
25
class TrainingDataExport(webapp2.RequestHandler):
  """Request handler that enqueues the training data export task."""

  def get(self):
    """Log the request, then add the export task to the task queue."""
    logging.info("Training data export requested.")
    # The task is addressed by the export task URL with a '.do' suffix.
    export_task_url = urls.SPAM_DATA_EXPORT_TASK + '.do'
    taskqueue.add(url=export_task_url)
31
# Maximum number of issues requested per GetTrainingIssues call; it is passed
# as the `limit` argument by TrainingDataExportTask.ProcessFormData.
BATCH_SIZE = 100
33
class TrainingDataExportTask(servlet.Servlet):
  """Export human-labeled ham or spam issues to Cloud Storage as CSV.

  The task asks the spam service for training issues, using a cutoff of one
  day before now, and writes one CSV row per issue. These records are meant
  to be used by a subsequent task to create an updated model.
  """
  # NOTE(review): presumably tells the Servlet base class to skip its security
  # token check for this task endpoint -- confirm against framework.servlet.
  CHECK_SECURITY_TOKEN = False

  @staticmethod
  def _FlattenNewlines(text):
    """Return text with every line break replaced by a single space.

    A CRLF pair becomes one space (same as before); bare LF and bare CR
    characters, which used to pass through untouched, are replaced as well.
    """
    return text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')

  def ProcessFormData(self, mr, post_data):
    """Write one batch of labeled issues to a dated CSV object in GCS.

    The object path is
    '/<default bucket>/spam_training_data/<today in ISO format>'. Each row is
    [label, summary, first comment], where label is 'spam' or 'ham', and all
    fields are quoted.

    Args:
      mr: request object; only mr.cnxn is read here.
      post_data: unused by this handler.

    Side effects:
      Sets self.response.body to a JSON object holding
      "exported_issue_count", the number of issues written.
    """
    logging.info("Training data export initiated.")

    bucket_name = app_identity.get_default_gcs_bucket_name()
    date_str = date.today().isoformat()
    export_target_path = '/' + bucket_name + '/spam_training_data/' + date_str
    total_issues = 0

    with cloudstorage.open(export_target_path, mode='w',
                           content_type=None, options=None,
                           retry_params=None) as gcs_file:

      csv_writer = csv.writer(gcs_file, delimiter=',', quotechar='"',
                              quoting=csv.QUOTE_ALL, lineterminator='\n')

      since = datetime.now() - timedelta(days=1)

      # TODO: Comments, and further pagination
      # Only the first batch (offset=0, at most BATCH_SIZE issues) is fetched.
      issues, first_comments, _count = (
          self.services.spam.GetTrainingIssues(
              mr.cnxn, self.services.issue, since, offset=0,
              limit=BATCH_SIZE))
      total_issues += len(issues)
      for issue in issues:
        # Cloud Prediction API doesn't allow newlines in the training data,
        # so flatten every kind of line break, not just CRLF.
        fixed_summary = self._FlattenNewlines(issue.summary)
        fixed_comment = self._FlattenNewlines(first_comments[issue.issue_id])

        csv_writer.writerow([
            'spam' if issue.is_spam else 'ham',
            fixed_summary, fixed_comment,
        ])

    self.response.body = json.dumps({
        "exported_issue_count": total_issues,
    })
74
OLDNEW
« no previous file with comments | « appengine/monorail/features/savedqueries_helpers.py ('k') | appengine/monorail/features/stars.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698