OLD | NEW |
(Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style |
| 3 # license that can be found in the LICENSE file or at |
| 4 # https://developers.google.com/open-source/licenses/bsd |
| 5 |
| 6 """Functions that format or parse email messages in Monorail. |
| 7 |
| 8 Specifically, this module has the logic for generating various email |
| 9 header lines that help match inbound and outbound email to the project |
| 10 and artifact that generated it. |
| 11 """ |
| 12 |
| 13 import hmac |
| 14 import logging |
| 15 import re |
| 16 import rfc822 |
| 17 |
| 18 from google.appengine.api import app_identity |
| 19 |
| 20 import settings |
| 21 from framework import framework_constants |
| 22 from services import client_config_svc |
| 23 from services import secrets_svc |
| 24 |
# TODO(jrobbins): Parsing very large messages is slow, and we are not going
# to handle attachments at first, so there is no reason to consider large
# emails.
# Largest len(body) that IsBodyTooBigToParse() still accepts.
MAX_BODY_SIZE = 100 * 1024
# NormalizeHeader() keeps at most this many characters of a header value.
MAX_HEADER_CHARS_CONSIDERED = 255
| 30 |
| 31 |
| 32 |
def IsBodyTooBigToParse(body):
  """Tell whether an email body exceeds the size we are willing to parse.

  Args:
    body: the email message body; only its len() is examined.

  Returns:
    True if len(body) is strictly greater than MAX_BODY_SIZE.
  """
  body_size = len(body)
  return body_size > MAX_BODY_SIZE
| 36 |
| 37 |
def IsProjectAddressOnToLine(project_addr, to_addrs):
  """Tell whether the project address was listed explicitly on the To: line.

  Args:
    project_addr: string email address of the project.
    to_addrs: collection of addresses taken from the To: header.

  Returns:
    True if project_addr is contained in to_addrs, otherwise False.
  """
  addressed_directly = project_addr in to_addrs
  return addressed_directly
| 41 |
| 42 |
def ParseEmailMessage(msg):
  """Pull the fields we care about out of an inbound email message.

  Args:
    msg: email.message.Message object for the email message sent to us.

  Returns:
    A tuple: from_addr, to_addrs, cc_addrs, references, subject, body.
    references is a de-duplicated list of the non-empty In-Reply-To and
    References values, in no particular order.  The subject has any reply
    or forward prefixes stripped.  The body is the decoded payload of the
    first text/plain part, or '' if there is none.  For messages marked
    with a bulk or junk precedence, the tuple '', [], [], '', '', '' is
    returned instead.
  """
  # Ignore messages that are probably not from humans, see:
  # http://google.com/search?q=precedence+bulk+junk
  precedence = msg.get('precedence', '')
  if precedence.lower() in ('bulk', 'junk'):
    logging.info('Precedence: %r indicates an autoresponder', precedence)
    return '', [], [], '', '', ''

  senders = _ExtractAddrs(msg.get('from', ''))
  from_addr = senders[0] if senders else ''

  to_addrs = _ExtractAddrs(msg.get('to', ''))
  cc_addrs = _ExtractAddrs(msg.get('cc', ''))

  # Merge In-Reply-To with the whitespace-separated References values,
  # dropping empty strings and duplicates.
  candidate_refs = [msg.get('in-reply-to', '')]
  candidate_refs.extend(msg.get('references', '').split())
  references = list(set(ref for ref in candidate_refs if ref))
  subject = _StripSubjectPrefixes(msg.get('subject', ''))

  body = ''
  for part in msg.walk():
    # We only process plain text emails.
    if part.get_content_type() != 'text/plain':
      continue
    body = part.get_payload(decode=True)
    break  # Only consider the first text part.

  return from_addr, to_addrs, cc_addrs, references, subject, body
| 81 |
| 82 |
def _ExtractAddrs(header_value):
  """Return the bare email addresses found in an address header value.

  Args:
    header_value: string value of a header such as From:, To:, or Cc:.

  Returns:
    A list of address strings, one per parsed entry, with the friendly
    display names discarded.
  """
  addrs = []
  for _display_name, addr in list(rfc822.AddressList(header_value)):
    addrs.append(addr)
  return addrs
| 87 |
| 88 |
def _StripSubjectPrefixes(subject):
  """Remove every leading 'Re:', 'Fwd:', etc. prefix from a subject line.

  Args:
    subject: string subject line, possibly with stacked prefixes.

  Returns:
    The subject with all leading prefixes removed.  Whitespace is stripped
    after each removal; a subject with no prefix is returned unchanged.
  """
  while True:
    found = _FindSubjectPrefix(subject)
    if not found:
      return subject
    # Only the length of the prefix matters, so its letter case is irrelevant.
    subject = subject[len(found):].strip()
| 97 |
| 98 |
| 99 def _FindSubjectPrefix(subject): |
| 100 """If the given subject starts with a prefix, return that prefix.""" |
| 101 for prefix in ['re:', 'aw:', 'fwd:', 'fw:']: |
| 102 if subject.lower().startswith(prefix): |
| 103 return prefix |
| 104 |
| 105 return None |
| 106 |
| 107 |
def MailDomain():
  """Return the domain name where this app can receive email.

  Returns:
    A fixed test domain in unit test mode, else settings.mail_domain when
    it is set, else '<app id>.appspotmail.com' built from the App Engine
    application ID with any 'prefix:' portion removed.
  """
  if settings.unit_test_mode:
    return 'testbed-test.appspotmail.com'

  # If running on a GAFYD domain, you must define an app alias on the
  # Application Settings admin web page.  If you cannot reserve the matching
  # APP_ID for the alias, then specify it in settings.mail_domain.
  if settings.mail_domain:
    return settings.mail_domain

  # Keep only the text after the last ':'; an ID without ':' is unchanged.
  bare_app_id = app_identity.get_application_id().split(':')[-1]
  return '%s.appspotmail.com' % bare_app_id
| 124 |
| 125 |
def FormatFriendly(commenter_view, sender, reveal_addr):
  """Format the From: line to include the commenter's friendly name if given.

  Args:
    commenter_view: optional view of the commenting user; its email and
        display_name attributes are read.  If falsy, sender is returned.
    sender: string email address to put on the From: line.
    reveal_addr: bool.  If True, the commenter's email is used as the
        friendly name, unless that email is a known service account.

  Returns:
    Either sender itself, or '<friendly> via <site name> <<sender>>'.
  """
  if not commenter_view:
    return sender

  site_name = settings.site_name
  is_service_account = (
      commenter_view.email in client_config_svc.GetServiceAccountMap())
  # Service accounts are always shown by display name, even when addresses
  # may be revealed.
  if reveal_addr and not is_service_account:
    friendly = commenter_view.email
  else:
    friendly = commenter_view.display_name
  return '%s via %s <%s>' % (friendly, site_name, sender)
| 139 |
| 140 |
def NoReplyAddress(commenter_view=None, reveal_addr=False):
  """Return a From: value whose address ignores all messages sent to it.

  Args:
    commenter_view: optional view of the commenting user, passed through to
        FormatFriendly().
    reveal_addr: optional bool, passed through to FormatFriendly().

  Returns:
    The 'no_reply@<mail domain>' address as formatted by FormatFriendly().
  """
  # Note: We use "no_reply" with an underscore to avoid potential conflict
  # with any project name.  Project names cannot have underscores.
  return FormatFriendly(
      commenter_view, 'no_reply@%s' % MailDomain(), reveal_addr)
| 147 |
| 148 |
def FormatFromAddr(_project, commenter_view=None, reveal_addr=False,
                   can_reply_to=True):
  """Return a string to be used on the email From: line.

  Args:
    _project: Project PB for the project that the email is sent from.
        This function does not read it.
    commenter_view: Optional UserView of the user who made a comment.  We use
        the user's (potentially obscured) email address as their friendly name.
    reveal_addr: Optional bool.  If False then the address is obscured.
    can_reply_to: Optional bool.  If True then settings.send_email_as is used,
        otherwise settings.send_noreply_email_as is used.

  Returns:
    A string that should be used in the From: line of outbound email
    notifications for the given project.
  """
  if can_reply_to:
    from_addr = settings.send_email_as
  else:
    from_addr = settings.send_noreply_email_as
  return FormatFriendly(commenter_view, from_addr, reveal_addr)
| 168 |
| 169 |
def NormalizeHeader(s):
  """Make our message-ids robust against mail client spacing and truncation.

  Args:
    s: string header value, e.g., a subject line.

  Returns:
    The value with reply/forward prefixes stripped, every run of whitespace
    collapsed to a single space, and the result cut to at most
    MAX_HEADER_CHARS_CONSIDERED characters.
  """
  # split() with no argument splits on any run of whitespace.
  collapsed = ' '.join(_StripSubjectPrefixes(s).split())
  return collapsed[:MAX_HEADER_CHARS_CONSIDERED]
| 176 |
| 177 |
def MakeMessageID(to_addr, subject, from_addr):
  """Make a unique (but deterministic) email Message-Id: value.

  Args:
    to_addr: string address that the message is sent to.
    subject: string subject line; it is normalized before being hashed.
    from_addr: string address that the message is sent from; only the part
        before the first '@' appears in the result.

  Returns:
    A string of the form '<0=HMAC(to_addr)=HMAC(subject)=LOCALPART@DOMAIN>',
    where the HMACs are hex digests keyed with the email secret and DOMAIN
    is MailDomain().
  """
  subject_to_hash = NormalizeHeader(subject)
  if isinstance(subject_to_hash, unicode):
    subject_to_hash = subject_to_hash.encode('utf-8')

  key = secrets_svc.GetEmailKey()
  to_digest = hmac.new(key, to_addr).hexdigest()
  subject_digest = hmac.new(key, subject_to_hash).hexdigest()
  local_part = from_addr.split('@')[0]
  return '<0=%s=%s=%s@%s>' % (
      to_digest, subject_digest, local_part, MailDomain())
| 189 |
| 190 |
def GetReferences(to_addr, subject, seq_num, project_from_addr):
  """Make a References: header to make this message thread properly.

  Args:
    to_addr: address that email message will be sent to.
    subject: subject line of email message.
    seq_num: sequence number of message in thread, e.g., 0, 1, 2, ...,
        or None if the message is not part of a thread.
    project_from_addr: address that the message will be sent from.

  Returns:
    A string Message-ID that does not correspond to any actual email
    message that was ever sent, but it does serve to unite all the
    messages that belong together in a thread.  If seq_num is None, the
    empty string is returned.
  """
  # Note that 0 is a valid sequence number, so compare against None.
  if seq_num is None:
    return ''
  return MakeMessageID(to_addr, subject, project_from_addr)
| 210 |
| 211 |
def ValidateReferencesHeader(message_ref, project, from_addr, subject):
  """Check that the References header is one that we could have sent.

  Args:
    message_ref: one of the References header values from the inbound email.
    project: Project PB for the affected project.
    from_addr: string email address that inbound email was sent from.
    subject: string base subject line of inbound email.

  Returns:
    True if it looks like this is a reply to a message that we sent
    to the same address that replied.  Otherwise, False.
  """
  # Rebuild the Message-ID we would have generated for a notification sent
  # from the project's address to the replying user.
  project_sender = '%s@%s' % (project.project_name, MailDomain())

  # TODO(jrobbins): project option to not check from_addr.
  # TODO(jrobbins): project inbound auth token.
  return MakeMessageID(from_addr, subject, project_sender) == message_ref
| 231 |
| 232 |
# Matches a lowercase "<project>@<domain>" address.  The project part may
# contain only lowercase letters, digits, and hyphens.
PROJECT_EMAIL_RE = re.compile(
    r'(?P<project>[-a-z0-9]+)'
    r'@(?P<domain>[-a-z0-9.]+)')

# Matches a subject of the form "Issue <local_id> in <project>: <summary>".
ISSUE_CHANGE_SUMMARY_RE = re.compile(
    r'Issue (?P<local_id>[0-9]+) in '
    r'(?P<project>[-a-z0-9]+): '
    r'(?P<summary>.+)')
| 241 |
| 242 |
def IdentifyProjectAndIssue(project_addr, subject):
  """Parse the project name and issue local ID from a reply.

  Args:
    project_addr: string email address that the email was delivered to,
        it must match the Reply-To: header sent in the notification message.
    subject: string email subject line received, it must match the one
        sent.  Leading prefixes like "Re:" should already have been stripped.

  Returns:
    A 2-tuple: (project_name, local_id).  If either or both are
    None, they could not be determined.  project_name is None unless the
    project in the address and the project in the subject agree.
  """
  # Ignore any inbound email sent to a "no_reply@" address.
  if project_addr.startswith('no_reply@'):
    return None, None

  project_name = None

  m = PROJECT_EMAIL_RE.match(project_addr.lower())
  if m:
    project_name = m.group('project')

  issue_project_name, local_id_str = _MatchSubject(subject)

  if project_name != issue_project_name:
    # Something is wrong with the project name.
    project_name = None

  logging.info('project_name = %r', project_name)
  logging.info('local_id_str = %r', local_id_str)

  try:
    local_id = int(local_id_str)
  except (TypeError, ValueError):
    # local_id_str is None when the subject line did not match, and
    # int(None) raises TypeError rather than ValueError.
    local_id = None

  return project_name, local_id
| 281 |
| 282 |
def _MatchSubject(subject):
  """Parse the project name and issue local ID string from a subject line.

  Args:
    subject: string subject line with any "Re:"-style prefixes removed.

  Returns:
    A 2-tuple (project_name, local_id_str) taken from a subject of the form
    "Issue N in PROJECT: SUMMARY", or (None, None) if it does not match.
  """
  match = ISSUE_CHANGE_SUMMARY_RE.match(subject)
  if not match:
    return None, None
  return match.group('project', 'local_id')
| 290 |
| 291 |
| 292 # TODO(jrobbins): For now, we strip out lines that look like quoted |
| 293 # text and then will give the user the option to see the whole email. |
| 294 # For 2.0 of this feature, we should change the Comment PB to have |
| 295 # runs of text with different properties so that the UI can present |
| 296 # "- Show quoted text -" and expand it in-line. |
| 297 |
| 298 # TODO(jrobbins): For now, we look for lines that indicate quoted |
| 299 # text (e.g., they start with ">"). But, we should also collapse |
| 300 # multiple lines that are identical to other lines in previous |
| 301 # non-deleted comments on the same issue, regardless of quote markers. |
| 302 |
| 303 |
# We cut off the message if we see something that looks like a signature and
# it is near the bottom of the message.
# The separator alternative is written as "run( run)* ?" instead of the
# nested-quantifier form "(run ?)+".  Both accept exactly the same lines,
# but the nested form backtracks exponentially on a long run of separator
# characters followed by other text, and this runs on untrusted email.
SIGNATURE_BOUNDARY_RE = re.compile(
    r'^([-_=]+( [-_=]+)* ?|'
    r'cheers|(best |warm |kind )?regards|thx|thanks|thank you|'
    r'Sent from my i?Phone|Sent from my iPod)'
    r',? *$', re.I)

# Only this many lines at the end of a message are checked for a signature.
MAX_SIGNATURE_LINES = 8
| 313 |
# Line patterns that mark the start of forwarded text, a quoted notification,
# or an explicit signature.
FORWARD_OR_EXPLICIT_SIG_PATS = [
    r'[^0-9a-z]+(forwarded|original) message[^0-9a-z]+\s*$',
    r'Updates:\s*$',
    r'Comment #\d+ on issue \d+ by \S+:',
    # If we see this anywhere in the message, treat the rest as a signature.
    r'--\s*$',
    ]
# Matches from the start of the first line that begins with any of the
# patterns above through the end of the text, since (.|\n)* also consumes
# newlines.
FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE = re.compile(
    r'^(%s)(.|\n)*' % '|'.join(FORWARD_OR_EXPLICIT_SIG_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
| 324 |
# This handles gmail well, and it's pretty broad without seeming like
# it would cause false positives.
# Each pattern matches a single line: either a "... wrote:" style attribution
# line or a line starting with ">".
# NOTE(review): the dots in ".appspotmail.com" are not escaped, so each one
# matches any character -- confirm whether that is intended.
QUOTE_PATS = [
    r'^On .*\s+<\s*\S+?@[-a-z0-9.]+>\s*wrote:\s*$',
    r'^On .* \S+?@[-a-z0-9.]+\s*wrote:\s*$',
    r'^\S+?@[-a-z0-9.]+ \(\S+?@[-a-z0-9.]+\)\s*wrote:\s*$',
    r'\S+?@[-a-z0-9]+.appspotmail.com\s.*wrote:\s*$',
    # \xc3\xa9 is the UTF-8 byte sequence for e-acute, as in "a ecrit :".
    r'\S+?@[-a-z0-9]+.appspotmail.com\s+.*a\s+\xc3\xa9crit\s*:\s*$',
    r'^\d+/\d+/\d+ +<\S+@[-a-z0-9.]+>:?\s*$',
    r'^>.*$',
    ]
# Matches a run of one or more consecutive quote lines together with any
# blank lines immediately before and after the run.
QUOTED_BLOCKS_RE = re.compile(
    r'(^\s*\n)*((%s)\n?)+(^\s*\n)*' % '|'.join(QUOTE_PATS),
    flags=re.MULTILINE | re.IGNORECASE)
| 339 |
| 340 |
def StripQuotedText(description):
  """Strip all quoted text lines out of the given comment text.

  Args:
    description: string text of an inbound email comment.

  Returns:
    The text with forwarded or explicitly-signed trailing content removed,
    quoted blocks collapsed, and any signature found within the last
    MAX_SIGNATURE_LINES lines cut off, stripped of surrounding whitespace.
  """
  # If the rest of message is forwarded text, we're done.
  unforwarded = FORWARD_OR_EXPLICIT_SIG_PATS_AND_REST_RE.sub('', description)
  # Replace each quoted block of lines and surrounding blank lines with at
  # most one blank line.
  unquoted = QUOTED_BLOCKS_RE.sub('\n', unforwarded)

  kept_lines = unquoted.strip().split('\n')
  # Make another pass over the last few lines to strip out signatures.
  zone_start = max(0, len(kept_lines) - MAX_SIGNATURE_LINES)
  for offset, line in enumerate(kept_lines[zone_start:]):
    if SIGNATURE_BOUNDARY_RE.match(line):
      # We found the likely start of a signature, just keep the lines above it.
      kept_lines = kept_lines[:zone_start + offset]
      break

  return '\n'.join(kept_lines).strip()
OLD | NEW |