OLD | NEW |
(Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
# Use of this source code is governed by a BSD-style
| 3 # license that can be found in the LICENSE file or at |
| 4 # https://developers.google.com/open-source/licenses/bsd |
| 5 |
| 6 """A set of functions that integrate the GAE search index with Monorail.""" |
| 7 |
| 8 import collections |
| 9 import datetime |
| 10 import logging |
| 11 import re |
| 12 from services import fulltext_helpers |
| 13 import time |
| 14 |
| 15 from proto import ast_pb2 |
| 16 from proto import tracker_pb2 |
| 17 |
| 18 |
# TODO(jrobbins): Consider re-implementing this whole file by using a
# BNF syntax specification and a parser generator or library.

# encodings
UTF8 = 'utf-8'

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
TEXT_MATCHES = ast_pb2.QueryOp.TEXT_MATCHES
NOT_TEXT_MATCHES = ast_pb2.QueryOp.NOT_TEXT_MATCHES
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
    }

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <.  Longer ones listed first so that the
# shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(OPS.keys(), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value.  E.g., [stars>10] or [is:open] or [summary:"memory leak"].  The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-,.@>/_\*\w]+|"[^"]+"))$' %
        OPS_PATTERN,
    flags=re.UNICODE)


# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('id', NUM),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

# Built-in fields that hold timestamps; they get DATE type below and also
# support the legacy "-after"/"-before" suffix syntax (see
# _DATE_FIELD_SUFFIX_TO_OP).
_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    )

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

# Legacy codesite-style date terms, e.g. [opened-after:2010/1/1], map to
# ordinary comparison operators on the base field name.
_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
    }

# Dict {field_name: FieldDef} for all predefined issue fields above.
BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}
| 136 |
def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None):
  """Parse a user query and return a set of structure terms.

  Args:
    query: string with user's query.  E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the
        query should be executed.  They are expressed in the same
        user query language.  E.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
         - custom_fields = {field_name: [fd, ...]}
         - query build needs to OR each possible interpretation
         - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values.  E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
        Conjunction(Condition(EQ, [open_fd], [], [1]),
                    Condition(EQ, [label_fd], ['priority-high'], []),
                    Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
        Conjunction(Condition(EQ, [open_fd], [], [1]),
                    Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []
  if _HasParens(query):
    warnings.append('Parentheses are ignored in user queries.')

  if _HasParens(scope):
    warnings.append('Parentheses are ignored in saved queries.')

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = query.split(' OR ')

  if len(subqueries) > 1:  # TODO(jrobbins): temporary limitation just for now.
    raise InvalidQueryError('Logical operator OR is not supported yet.')

  # Make a dictionary of all fields: built-in + custom in each project.
  # Note: use items() rather than the Python 2-only iteritems() so that this
  # code also runs under Python 3; behavior is identical here.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only do non-enum fields because enums are stored as labels
      combined_fields[fd.field_name.lower()].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)
| 197 |
| 198 |
| 199 def _HasParens(s): |
| 200 """Return True if there are parentheses in the given string.""" |
| 201 # Monorail cannot handle parenthesized expressions, so we tell the |
| 202 # user that immediately. Even inside a quoted string, the GAE search |
| 203 # engine will not handle parens in TEXT-type fields. |
| 204 return '(' in s or ')' in s |
| 205 |
| 206 |
def _ParseConjunction(subquery, scope, fields, warnings):
  """Parse part of a user query into a Conjunction PB."""
  logging.info('Parsing sub query: %r in scope %r', subquery, scope)
  # The scope terms are prepended so they apply to every subquery, and the
  # whole thing is lowercased before term extraction.
  scoped_query = '%s %s' % (scope, subquery)
  scoped_query = scoped_query.lower()
  conds = []
  for cond_str in _ExtractConds(scoped_query):
    conds.append(_ParseCond(cond_str, fields, warnings))
  return ast_pb2.Conjunction(conds=conds)
| 214 |
| 215 |
def _ParseCond(cond_str, fields, warnings):
  """Parse one user query condition string into a Condition PB.

  Args:
    cond_str: one condition term, e.g., 'stars>10' or '-workaround'.
    fields: dict {name_lower: [FieldDef, ...]} of built-in and custom fields.
    warnings: list to accumulate warning messages.

  Returns:
    A Condition PB: either a structured comparison or a full-text term.
  """
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # codesite. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Remove the suffix by slicing.  The previous str.rstrip() call
          # strips a *character set*, not a suffix, so it could also eat
          # trailing field-name characters that happen to be in the suffix.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
          break  # Only one suffix can match; stop before prefix is re-tested.
    return _ParseStructuredTerm(prefix, op, val, fields)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
| 249 |
| 250 |
def _ParseStructuredTerm(prefix, op_str, value, fields):
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name.  E.g., summary. It can
        also be special operators like "is" to test boolean fields.
    op_str: the comparison operator.  Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership.  E.g., [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',')]

  # [is:open]/[-is:open] etc. test a boolean field: int_value 1 requires the
  # field to be true, 0 requires it to be false.
  if ((prefix == 'is' or prefix == '-is') and
      unquoted_value in ['open', 'blocked', 'spam']):
    return ast_pb2.MakeCond(
        EQ, fields[unquoted_value], [], [int(prefix == 'is')])

  op = OPS[op_str]
  negate = False
  if prefix.startswith('-'):
    # A leading "-" on the prefix negates the comparison, e.g., [-summary:foo].
    negate = True
    if op == EQ:
      op = NE
    elif op == TEXT_HAS:
      op = NOT_TEXT_HAS
    prefix = prefix[1:]

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  if prefix in fields:  # search built-in and custom fields. E.g., summary.
    # Note: if first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_value = _ParseDateValue(unquoted_value)
      return ast_pb2.MakeCond(op, fields[prefix], [], [date_value])
    else:
      # Keep both string values and any values that parse as ints, since a
      # same-named field could be text in one project and numeric in another.
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, quick_or_ints)

  # Since it is not a field, treat it as labels, E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
| 315 |
| 316 |
def _ExtractConds(query):
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.

  Returns:
    A list of query condition strings.
  """
  # Convert to unicode then search for distinct terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        if special_prefixes_match:
          # Do not include quotes if any of the special prefixes match because
          # we do not want to treat the label as key:value search terms.
          terms.append('%s%s%s' % (label, op, word))
        else:
          terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search.  It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:
      # logging.warn() is a deprecated alias; use logging.warning().
      logging.warning('Unexpected search term in %r', query)

  return terms
| 364 |
| 365 |
| 366 def _ParseDateValue(val): |
| 367 """Convert the user-entered date into timestamp.""" |
| 368 # Support timestamp value such as opened>1437671476 |
| 369 try: |
| 370 return int(val) |
| 371 except ValueError: |
| 372 pass |
| 373 |
| 374 # TODO(jrobbins): future: take timezones into account. |
| 375 # TODO(jrobbins): for now, explain to users that "today" is |
| 376 # actually now: the current time, not 12:01am in their timezone. |
| 377 # In fact, it is not very useful because everything in the system |
| 378 # happened before the current time. |
| 379 if val == 'today': |
| 380 return _CalculatePastDate(0) |
| 381 elif val.startswith('today-'): |
| 382 try: |
| 383 days_ago = int(val.split('-')[1]) |
| 384 except ValueError: |
| 385 days_ago = 0 |
| 386 return _CalculatePastDate(days_ago) |
| 387 |
| 388 if '/' in val: |
| 389 year, month, day = [int(x) for x in val.split('/')] |
| 390 elif '-' in val: |
| 391 year, month, day = [int(x) for x in val.split('-')] |
| 392 |
| 393 try: |
| 394 return int(time.mktime(datetime.datetime(year, month, day).timetuple())) |
| 395 except ValueError: |
| 396 raise InvalidQueryError('Could not parse date') |
| 397 |
| 398 |
| 399 def _CalculatePastDate(days_ago, now=None): |
| 400 """Calculates the timestamp N days ago from now.""" |
| 401 if now is None: |
| 402 now = int(time.time()) |
| 403 ts = now - days_ago * 24 * 60 * 60 |
| 404 return ts |
| 405 |
| 406 |
def CheckSyntax(query, harmonized_config, warnings=None):
  """Parse the given query and report the first error or None.

  Args:
    query: string with the user's query.
    harmonized_config: config for all the projects being searched.
    warnings: optional list to accumulate warning messages.

  Returns:
    The message of the first InvalidQueryError raised while parsing, or
    None if the query parsed cleanly.
  """
  try:
    ParseUserQuery(
        query, '', BUILTIN_ISSUE_FIELDS, harmonized_config, warnings=warnings)
  except InvalidQueryError as e:
    # Exception.message is Python 2-only (removed in Python 3); str(e)
    # yields the same text for single-argument exceptions.
    return str(e)

  return None
| 416 |
| 417 |
class Error(Exception):
  """Base exception class for this package."""
| 421 |
| 422 |
class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
OLD | NEW |