Index: appengine/monorail/framework/emailfmt.py |
diff --git a/appengine/monorail/framework/emailfmt.py b/appengine/monorail/framework/emailfmt.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..d4aa955e8cdcb518ac4865e4a6e74944da1c52e3 |
--- /dev/null |
+++ b/appengine/monorail/framework/emailfmt.py |
@@ -0,0 +1,359 @@ |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is govered by a BSD-style |
+# license that can be found in the LICENSE file or at |
+# https://developers.google.com/open-source/licenses/bsd |
+ |
+"""Functions that format or parse email messages in Monorail. |
+ |
+Specifically, this module has the logic for generating various email |
+header lines that help match inbound and outbound email to the project |
+and artifact that generated it. |
+""" |
+ |
+import hmac |
+import logging |
+import re |
+import rfc822 |
+ |
+from google.appengine.api import app_identity |
+ |
+import settings |
+from framework import framework_constants |
+from services import client_config_svc |
+from services import secrets_svc |
+ |
+# TODO(jrobbins): Parsing very large messages is slow, and we are not going |
+# to handle attachments at first, so there is no reason to consider large |
+# emails. |
+MAX_BODY_SIZE = 100 * 1024 |
+MAX_HEADER_CHARS_CONSIDERED = 255 |
+ |
+ |
+ |
+def IsBodyTooBigToParse(body): |
+ """Return True if the email message body is too big to process.""" |
+ return len(body) > MAX_BODY_SIZE |
+ |
+ |
+def IsProjectAddressOnToLine(project_addr, to_addrs): |
+ """Return True if an email was explicitly sent directly to us.""" |
+ return project_addr in to_addrs |
+ |
+ |
+def ParseEmailMessage(msg): |
+ """Parse the given MessageRouterMessage and return relevant fields. |
+ |
+ Args: |
+ msg: email.message.Message object for the email message sent to us. |
+ |
+ Returns: |
+ A tuple: from_addr, to_addrs, cc_addrs, references, subject, body. |
+ """ |
+ # Ignore messages that are probably not from humans, see: |
+ # http://google.com/search?q=precedence+bulk+junk |
+ precedence = msg.get('precedence', '') |
+ if precedence.lower() in ['bulk', 'junk']: |
+ logging.info('Precedence: %r indicates an autoresponder', precedence) |
+ return '', [], [], '', '', '' |
+ |
+ from_addrs = _ExtractAddrs(msg.get('from', '')) |
+ if from_addrs: |
+ from_addr = from_addrs[0] |
+ else: |
+ from_addr = '' |
+ |
+ to_addrs = _ExtractAddrs(msg.get('to', '')) |
+ cc_addrs = _ExtractAddrs(msg.get('cc', '')) |
+ |
+ in_reply_to = msg.get('in-reply-to', '') |
+ references = msg.get('references', '').split() |
+ references = list({ref for ref in [in_reply_to] + references if ref}) |
+ subject = _StripSubjectPrefixes(msg.get('subject', '')) |
+ |
+ body = '' |
+ for part in msg.walk(): |
+ # We only process plain text emails. |
+ if part.get_content_type() == 'text/plain': |
+ body = part.get_payload(decode=True) |
+ break # Only consider the first text part. |
+ |
+ return from_addr, to_addrs, cc_addrs, references, subject, body |
+ |
+ |
+def _ExtractAddrs(header_value): |
+ """Given a message header value, return email address found there.""" |
+ friendly_addr_pairs = list(rfc822.AddressList(header_value)) |
+ return [addr for _friendly, addr in friendly_addr_pairs] |
+ |
+ |
+def _StripSubjectPrefixes(subject): |
+ """Strip off any 'Re:', 'Fwd:', etc. subject line prefixes.""" |
+ prefix = _FindSubjectPrefix(subject) |
+ while prefix: |
+ subject = subject[len(prefix):].strip() |
+ prefix = _FindSubjectPrefix(subject) |
+ |
+ return subject |
+ |
+ |
+def _FindSubjectPrefix(subject): |
+ """If the given subject starts with a prefix, return that prefix.""" |
+ for prefix in ['re:', 'aw:', 'fwd:', 'fw:']: |
+ if subject.lower().startswith(prefix): |
+ return prefix |
+ |
+ return None |
+ |
+ |
+def MailDomain(): |
+ """Return the domain name where this app can recieve email.""" |
+ if settings.unit_test_mode: |
+ return 'testbed-test.appspotmail.com' |
+ |
+ # If running on a GAFYD domain, you must define an app alias on the |
+ # Application Settings admin web page. If you cannot reserve the matching |
+ # APP_ID for the alias, then specify it in settings.mail_domain. |
+ if settings.mail_domain: |
+ return settings.mail_domain |
+ |
+ app_id = app_identity.get_application_id() |
+ if ':' in app_id: |
+ app_id = app_id.split(':')[-1] |
+ |
+ return '%s.appspotmail.com' % app_id |
+ |
+ |
+def FormatFriendly(commenter_view, sender, reveal_addr): |
+ """Format the From: line to include the commenter's friendly name if given.""" |
+ if commenter_view: |
+ site_name = settings.site_name |
+ if commenter_view.email in client_config_svc.GetServiceAccountMap(): |
+ friendly = commenter_view.display_name |
+ elif reveal_addr: |
+ friendly = commenter_view.email |
+ else: |
+ friendly = commenter_view.display_name |
+ return '%s via %s <%s>' % (friendly, site_name, sender) |
+ else: |
+ return sender |
+ |
+ |
+def NoReplyAddress(commenter_view=None, reveal_addr=False): |
+ """Return an address that ignores all messages sent to it.""" |
+ # Note: We use "no_reply" with an underscore to avoid potential conflict |
+ # with any project name. Project names cannot have underscores. |
+ sender = 'no_reply@%s' % MailDomain() |
+ return FormatFriendly(commenter_view, sender, reveal_addr) |
+ |
+ |
+def FormatFromAddr(_project, commenter_view=None, reveal_addr=False, |
+ can_reply_to=True): |
+ """Return a string to be used on the email From: line. |
+ |
+ Args: |
+ project: Project PB for the project that the email is sent from. |
+ commenter_view: Optional UserView of the user who made a comment. We use |
+ the user's (potentially obscured) email address as their friendly name. |
+ reveal_addr: Optional bool. If False then the address is obscured. |
+ can_reply_to: Optional bool. If True then settings.send_email_as is used, |
+ otherwise settings.send_noreply_email_as is used. |
+ |
+ Returns: |
+ A string that should be used in the From: line of outbound email |
+ notifications for the given project. |
+ """ |
+ addr = (settings.send_email_as if can_reply_to |
+ else settings.send_noreply_email_as) |
+ return FormatFriendly(commenter_view, addr, reveal_addr) |
+ |
+ |
+def NormalizeHeader(s): |
+ """Make our message-ids robust against mail client spacing and truncation.""" |
+ words = _StripSubjectPrefixes(s).split() # Split on any runs of whitespace. |
+ normalized = ' '.join(words) |
+ truncated = normalized[:MAX_HEADER_CHARS_CONSIDERED] |
+ return truncated |
+ |
+ |
+def MakeMessageID(to_addr, subject, from_addr): |
+ """Make a unique (but deterministic) email Message-Id: value.""" |
+ normalized_subject = NormalizeHeader(subject) |
+ if isinstance(normalized_subject, unicode): |
+ normalized_subject = normalized_subject.encode('utf-8') |
+ mail_hmac_key = secrets_svc.GetEmailKey() |
+ return '<0=%s=%s=%s@%s>' % ( |
+ hmac.new(mail_hmac_key, to_addr).hexdigest(), |
+ hmac.new(mail_hmac_key, normalized_subject).hexdigest(), |
+ from_addr.split('@')[0], |
+ MailDomain()) |
+ |
+ |
+def GetReferences(to_addr, subject, seq_num, project_from_addr): |
+ """Make a References: header to make this message thread properly. |
+ |
+ Args: |
+ to_addr: address that email message will be sent to. |
+ subject: subject line of email message. |
+ seq_num: sequence number of message in thread, e.g., 0, 1, 2, ..., |
+ or None if the message is not part of a thread. |
+ project_from_addr: address that the message will be sent from. |
+ |
+ Returns: |
+ A string Message-ID that does not correspond to any actual email |
+ message that was ever sent, but it does serve to unite all the |
+ messages that belong togther in a thread. |
+ """ |
+ if seq_num is not None: |
+ return MakeMessageID(to_addr, subject, project_from_addr) |
+ else: |
+ return '' |
+ |
+ |
+def ValidateReferencesHeader(message_ref, project, from_addr, subject): |
+ """Check that the References header is one that we could have sent. |
+ |
+ Args: |
+ message_ref: one of the References header values from the inbound email. |
+ project: Project PB for the affected project. |
+ from_addr: string email address that inbound email was sent from. |
+ subject: string base subject line of inbound email. |
+ |
+ Returns: |
+ True if it looks like this is a reply to a message that we sent |
+ to the same address that replied. Otherwise, False. |
+ """ |
+ sender = '%s@%s' % (project.project_name, MailDomain()) |
+ expected_ref = MakeMessageID(from_addr, subject, sender) |
+ |
+ # TODO(jrobbins): project option to not check from_addr. |
+ # TODO(jrobbins): project inbound auth token. |
+ return expected_ref == message_ref |
+ |
+ |
+PROJECT_EMAIL_RE = re.compile( |
+ r'(?P<project>[-a-z0-9]+)' |
+ r'@(?P<domain>[-a-z0-9.]+)') |
+ |
+ISSUE_CHANGE_SUMMARY_RE = re.compile( |
+ r'Issue (?P<local_id>[0-9]+) in ' |
+ r'(?P<project>[-a-z0-9]+): ' |
+ r'(?P<summary>.+)') |
+ |
+ |
+def IdentifyProjectAndIssue(project_addr, subject): |
+ """Parse the domain name, project name, and artifact id from a reply. |
+ |
+ Args: |
+ project_addr: string email address that the email was delivered to, |
+ it must match the Reply-To: header sent in the notification message. |
+ subject: string email subject line received, it must match the one |
+ sent. Leading prefixes like "Re:" should already have been stripped. |
+ |
+ Returns: |
+ A 2-tuple: (project_name, local_id). If either or both are |
+ None, they could not be determined. |
+ """ |
+ # Ignore any inbound email sent to a "no_reply@" address. |
+ if project_addr.startswith('no_reply@'): |
+ return None, None |
+ |
+ project_name = None |
+ |
+ m = PROJECT_EMAIL_RE.match(project_addr.lower()) |
+ if m: |
+ project_name = m.group('project') |
+ |
+ issue_project_name, local_id_str = _MatchSubject(subject) |
+ |
+ if project_name != issue_project_name: |
+ # Something is wrong with the project name. |
+ project_name = None |
+ |
+ logging.info('project_name = %r', project_name) |
+ logging.info('local_id_str = %r', local_id_str) |
+ |
+ try: |
+ local_id = int(local_id_str) |
+ except ValueError: |
+ local_id = None |
+ |
+ return project_name, local_id |
+ |
+ |
+def _MatchSubject(subject): |
+ """Parse the project, artifact type, and artifact id from a subject line.""" |
+ m = ISSUE_CHANGE_SUMMARY_RE.match(subject) |
+ if m: |
+ return m.group('project'), m.group('local_id') |
+ |
+ return None, None |
+ |
+ |
+# TODO(jrobbins): For now, we strip out lines that look like quoted |
+# text and then will give the user the option to see the whole email. |
+# For 2.0 of this feature, we should change the Comment PB to have |
+# runs of text with different properties so that the UI can present |
+# "- Show quoted text -" and expand it in-line. |
+ |
+# TODO(jrobbins): For now, we look for lines that indicate quoted |
+# text (e.g., they start with ">"). But, we should also collapse |
+# multiple lines that are identical to other lines in previous |
+# non-deleted comments on the same issue, regardless of quote markers. |
+ |
+ |
+# We cut off the message if we see something that looks like a signature and |
+# it is near the bottom of the message. |
+SIGNATURE_BOUNDARY_RE = re.compile( |
+ r'^(([-_=]+ ?)+|' |
+ r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|' |
+ r'Sent from my i?Phone|Sent from my iPod)' |
+ r',? *$', re.I) |
+ |
+MAX_SIGNATURE_LINES = 8 |
+ |
+FORWARD_OR_EXPLICIT_SIG_PATS = [ |
+ r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$', |
+ r'Updates:\s*$', |
+ r'Comment #\d+ on issue \d+ by \S+:', |
+ # If we see this anywhere in the message, treat the rest as a signature. |
+ r'--\s*$', |
+ ] |
+FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile( |
+ r'^(%s)(.|\n)*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS), |
+ flags=re.MULTILINE | re.IGNORECASE) |
+ |
+# This handles gmail well, and it's pretty broad without seeming like |
+# it would cause false positives. |
+QUOTE_PATS = [ |
+ r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$', |
+ r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$', |
+ r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$', |
+ r'\S+?@[-a-z0-9]+.appspotmail.com\s.*wrote:\s*$', |
+ r'\S+?@[-a-z0-9]+.appspotmail.com\s+.*a\s+\xc3\xa9crit\s*:\s*$', |
+ r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$', |
+ r'^>.*$', |
+ ] |
+QUOTED_BLOCKS_RE = re.compile( |
+ r'(^\s*\n)*((%s)\n?)+(^\s*\n)*' % '|'.join(QUOTE_PATS), |
+ flags=re.MULTILINE | re.IGNORECASE) |
+ |
+ |
+def StripQuotedText(description): |
+ """Strip all quoted text lines out of the given comment text.""" |
+ # If the rest of message is forwared text, we're done. |
+ description = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub('', description) |
+ # Replace each quoted block of lines and surrounding blank lines with at |
+ # most one blank line. |
+ description = QUOTED_BLOCKS_RE.sub('\n', description) |
+ |
+ new_lines = description.strip().split('\n') |
+ # Make another pass over the last few lines to strip out signatures. |
+ sig_zone_start = max(0, len(new_lines) - MAX_SIGNATURE_LINES) |
+ for idx in range(sig_zone_start, len(new_lines)): |
+ line = new_lines[idx] |
+ if SIGNATURE_BOUNDARY_RE.match(line): |
+ # We found the likely start of a signature, just keep the lines above it. |
+ new_lines = new_lines[:idx] |
+ break |
+ |
+ return '\n'.join(new_lines).strip() |