| Index: appengine/monorail/search/query2ast.py
|
| diff --git a/appengine/monorail/search/query2ast.py b/appengine/monorail/search/query2ast.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..6c7b617d1e05008434bfd5f66a78de11b12347fa
|
| --- /dev/null
|
| +++ b/appengine/monorail/search/query2ast.py
|
| @@ -0,0 +1,425 @@
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style
|
| +# license that can be found in the LICENSE file or at
|
| +# https://developers.google.com/open-source/licenses/bsd
|
| +
|
| +"""A set of functions that integrate the GAE search index with Monorail."""
|
| +
|
| +import collections
|
| +import datetime
|
| +import logging
|
| +import re
|
| +from services import fulltext_helpers
|
| +import time
|
| +
|
| +from proto import ast_pb2
|
| +from proto import tracker_pb2
|
| +
|
| +
|
| +# TODO(jrobbins): Consider re-implementing this whole file by using a
|
| +# BNF syntax specification and a parser generator or library.
|
| +
|
# Encoding used for user-supplied query text.
UTF8 = 'utf-8'

# Short aliases for the tracker field types used throughout this module.
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE

# Short aliases for the query AST comparison operators.
EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
TEXT_MATCHES = ast_pb2.QueryOp.TEXT_MATCHES
NOT_TEXT_MATCHES = ast_pb2.QueryOp.NOT_TEXT_MATCHES
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
}
|
| +
|
# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones listed first so that the
# shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(OPS.keys(), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-,.@>/_\*\w]+|"[^"]+"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)
|
| +
|
| +
|
# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('id', NUM),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

# Built-in fields that hold dates.  These also support the legacy
# -after/-before suffix forms handled in _ParseCond.
_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
)

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

# Mapping from legacy codesite date-field suffixes to comparison operators.
_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
}

# Dict {field_name: FieldDef} for all built-in issue fields.
BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}
|
| +
|
| +
|
def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None):
  """Parse a user query string and return a QueryAST PB.

  Args:
    query: string with the user's query, e.g., 'Priority=High'.
    scope: string of search terms that define the scope in which the query
      should be executed.  They are expressed in the same user query
      language, e.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
      mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
      NOTE: a custom field name is not unique in cross-project search; it
      could be a label in one project and a custom field in another, so
      each name maps to a list of FieldDefs and the query builder must OR
      each possible interpretation.
    warnings: optional list to accumulate warning messages.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list
    of Condition PBs with op, fields, str_values and int_values.  E.g., the
    query [priority=high leak OR stars>100] over open issues would return
    QueryAST(
        Conjunction(Condition(EQ, [open_fd], [], [1]),
                    Condition(EQ, [label_fd], ['priority-high'], []),
                    Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
        Conjunction(Condition(EQ, [open_fd], [], [1]),
                    Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []

  # Monorail cannot handle parenthesized expressions; warn immediately.
  if _HasParens(query):
    warnings.append('Parentheses are ignored in user queries.')
  if _HasParens(scope):
    warnings.append('Parentheses are ignored in saved queries.')

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = query.split(' OR ')
  if len(subqueries) > 1:  # TODO(jrobbins): temporary limitation just for now.
    raise InvalidQueryError('Logical operator OR is not supported yet.')

  # Make a dictionary of all fields: built-in + custom in each project.
  combined_fields = collections.defaultdict(list)
  for field_name, field_def in builtin_fields.iteritems():
    combined_fields[field_name].append(field_def)
  for fd in harmonized_config.field_defs:
    # Only do non-enum fields because enums are stored as labels.
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      combined_fields[fd.field_name.lower()].append(fd)

  return ast_pb2.QueryAST(conjunctions=[
      _ParseConjunction(sq, scope, combined_fields, warnings)
      for sq in subqueries])
|
| +
|
| +
|
| +def _HasParens(s):
|
| + """Return True if there are parentheses in the given string."""
|
| + # Monorail cannot handle parenthesized expressions, so we tell the
|
| + # user that immediately. Even inside a quoted string, the GAE search
|
| + # engine will not handle parens in TEXT-type fields.
|
| + return '(' in s or ')' in s
|
| +
|
| +
|
def _ParseConjunction(subquery, scope, fields, warnings):
  """Parse one OR-separated part of a user query into a Conjunction PB."""
  logging.info('Parsing sub query: %r in scope %r', subquery, scope)
  # The scope terms are prepended so they constrain every subquery alike.
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  return ast_pb2.Conjunction(conds=[
      _ParseCond(cond_str, fields, warnings)
      for cond_str in _ExtractConds(scoped_query)])
|
| +
|
| +
|
def _ParseCond(cond_str, fields, warnings):
  """Parse one user query condition string into a Condition PB.

  Args:
    cond_str: one term of the query, e.g., 'stars>10' or 'leak'.
    fields: dict {field_name_lower: [FieldDef, ...]} of known fields.
    warnings: list to accumulate warning messages.

  Returns:
    A Condition PB: a structured comparison when the term looks like
    key:value, otherwise a (possibly negated) full-text search term.
  """
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes
  # match, e.g., URLs like 'http://...' should be full-text searched.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # codesite. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Remove the suffix exactly.  str.rstrip() would be wrong here:
          # it strips a *set* of characters, not a suffix, and could eat
          # trailing characters of the field name itself.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
          break
    return _ParseStructuredTerm(prefix, op, val, fields)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Flag a potential user misunderstanding: only all-caps OR is supported.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
|
| +
|
| +
|
def _ParseStructuredTerm(prefix, op_str, value, fields):
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: the query operator, usually a field name, e.g., summary.  It can
      also be a special operator like "is" to test boolean fields, and it
      may carry a leading "-" to negate the comparison.
    op_str: the comparison operator.  Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one
  # of multiple values, like set membership.  E.g., [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',')]

  # Handle [is:open]-style terms on boolean fields, possibly negated.
  if prefix in ('is', '-is') and unquoted_value in ['open', 'blocked', 'spam']:
    return ast_pb2.MakeCond(
        EQ, fields[unquoted_value], [], [int(prefix == 'is')])

  op = OPS[op_str]
  negate = prefix.startswith('-')
  if negate:
    prefix = prefix[1:]
    if op == EQ:
      op = NE
    elif op == TEXT_HAS:
      op = NOT_TEXT_HAS

  if prefix == 'has':
    # Search entries with or without any value in the specified field.
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if unquoted_value in fields:
      # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    # Otherwise, look for any label with that prefix.
    return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  if prefix in fields:
    # Search built-in and custom fields.  E.g., summary.
    # Note: if the first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for the rare case where multiple
    # projects define the same custom field name, one a date, another not.
    if fields[prefix][0].field_type == DATE:
      return ast_pb2.MakeCond(
          op, fields[prefix], [], [_ParseDateValue(unquoted_value)])
    # Keep the values that parse as integers so numeric fields can match.
    quick_or_ints = []
    for qov in quick_or_vals:
      try:
        quick_or_ints.append(int(qov))
      except ValueError:
        pass
    return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, quick_or_ints)

  # Since it is not a field, treat it as labels, e.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
|
| +
|
| +
|
def _ExtractConds(query):
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.

  Returns:
    A list of query condition strings.
  """
  terms = []
  for groups in TERM_RE.findall(query):
    phrase, word_label, _op1, phrase_label, _op2, word = groups

    if phrase_label or phrase:
      # Case 1: Quoted phrases, e.g., ["hot dog"].
      terms.append(phrase_label or phrase)

    elif word_label:
      # Case 2: Comparisons, e.g., [summary:leak].
      special_prefixes_match = any(
          word_label.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
      op_match = OP_RE.match(word_label)
      if op_match:
        label = op_match.group('prefix')
        op = op_match.group('op')
        word = op_match.group('value')
        if special_prefixes_match:
          # Do not include quotes if any of the special prefixes match
          # because we do not want key:value treatment for the term.
          terms.append('%s%s%s' % (label, op, word))
        else:
          terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search.  It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    elif word:
      # Case 3: Simple words.
      terms.append(word)

    else:
      logging.warn('Unexpected search term in %r', query)

  return terms
|
| +
|
| +
|
| +def _ParseDateValue(val):
|
| + """Convert the user-entered date into timestamp."""
|
| + # Support timestamp value such as opened>1437671476
|
| + try:
|
| + return int(val)
|
| + except ValueError:
|
| + pass
|
| +
|
| + # TODO(jrobbins): future: take timezones into account.
|
| + # TODO(jrobbins): for now, explain to users that "today" is
|
| + # actually now: the current time, not 12:01am in their timezone.
|
| + # In fact, it is not very useful because everything in the system
|
| + # happened before the current time.
|
| + if val == 'today':
|
| + return _CalculatePastDate(0)
|
| + elif val.startswith('today-'):
|
| + try:
|
| + days_ago = int(val.split('-')[1])
|
| + except ValueError:
|
| + days_ago = 0
|
| + return _CalculatePastDate(days_ago)
|
| +
|
| + if '/' in val:
|
| + year, month, day = [int(x) for x in val.split('/')]
|
| + elif '-' in val:
|
| + year, month, day = [int(x) for x in val.split('-')]
|
| +
|
| + try:
|
| + return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
|
| + except ValueError:
|
| + raise InvalidQueryError('Could not parse date')
|
| +
|
| +
|
| +def _CalculatePastDate(days_ago, now=None):
|
| + """Calculates the timestamp N days ago from now."""
|
| + if now is None:
|
| + now = int(time.time())
|
| + ts = now - days_ago * 24 * 60 * 60
|
| + return ts
|
| +
|
| +
|
def CheckSyntax(query, harmonized_config, warnings=None):
  """Parse the given query and report the first error message, or None."""
  try:
    ParseUserQuery(
        query, '', BUILTIN_ISSUE_FIELDS, harmonized_config, warnings=warnings)
  except InvalidQueryError as e:
    return e.message  # Python 2 exception message attribute.
  else:
    return None
|
| +
|
| +
|
class Error(Exception):
  """Base exception class for this package."""
|
| +
|
| +
|
class InvalidQueryError(Error):
  """Raised when the user's query cannot be parsed."""
|
|
|