Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1354)

Unified Diff: appengine/monorail/search/query2ast.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « appengine/monorail/search/frontendsearchpipeline.py ('k') | appengine/monorail/search/searchpipeline.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: appengine/monorail/search/query2ast.py
diff --git a/appengine/monorail/search/query2ast.py b/appengine/monorail/search/query2ast.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c7b617d1e05008434bfd5f66a78de11b12347fa
--- /dev/null
+++ b/appengine/monorail/search/query2ast.py
@@ -0,0 +1,425 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""A set of functions that parse user search queries into an abstract syntax tree."""
+
+import collections
+import datetime
+import logging
+import re
+from services import fulltext_helpers
+import time
+
+from proto import ast_pb2
+from proto import tracker_pb2
+
+
+# TODO(jrobbins): Consider re-implementing this whole file by using a
+# BNF syntax specification and a parser generator or library.
+
# encodings
UTF8 = 'utf-8'

# Shorthand aliases for the field types and comparison operators defined in
# the tracker and AST protocol buffers.
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
TEXT_MATCHES = ast_pb2.QueryOp.TEXT_MATCHES
NOT_TEXT_MATCHES = ast_pb2.QueryOp.NOT_TEXT_MATCHES
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
}

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones listed first so that the
# shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(OPS.keys(), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|'  # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|'  # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|'  # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)'  # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-,.@>/_\*\w]+|"[^"]+"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)


# Predefined issue fields passed to the query parser.
# Each entry is (field_name, field_type); "*_id" variants hold numeric IDs
# while the plain names hold user-visible text.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT),  # attachment file names
    ('attachments', NUM),  # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('id', NUM),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

# Built-in fields that hold dates; also used to recognize legacy
# "<field>-before"/"<field>-after" query terms in _ParseCond.
_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
)

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

# Legacy codesite date suffixes and the comparison operator each one implies.
_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
}

# Dict {field_name: FieldDef} for all predefined issue fields.
BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}
+
+
def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None):
  """Parse a user query string into a QueryAST of structured terms.

  Args:
    query: string with the user's query. E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the query
      should be executed, expressed in the same user query language.
      E.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
      mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.  Note
      that a custom field name is not unique in a cross-project search:
      it could be a label in one project and a field in another, so each
      name maps to a list of FieldDefs and the query builder needs to OR
      each possible interpretation.
    warnings: optional list to accumulate warning messages.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list
    of Condition PBs with op, fields, str_values and int_values.  E.g., the
    query [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  warnings = [] if warnings is None else warnings

  # Monorail cannot handle parenthesized expressions, so warn about them.
  for text, source in ((query, 'user'), (scope, 'saved')):
    if _HasParens(text):
      warnings.append('Parentheses are ignored in %s queries.' % source)

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = query.split(' OR ')
  if len(subqueries) > 1:  # TODO(jrobbins): temporary limitation just for now.
    raise InvalidQueryError('Logical operator OR is not supported yet.')

  # Make a dictionary of all fields: built-in + custom in each project.
  # Only non-enum custom fields are included because enums are stored
  # as labels.
  combined_fields = collections.defaultdict(list)
  for field_name, field_def in builtin_fields.iteritems():
    combined_fields[field_name].append(field_def)
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      combined_fields[fd.field_name.lower()].append(fd)

  return ast_pb2.QueryAST(conjunctions=[
      _ParseConjunction(sq, scope, combined_fields, warnings)
      for sq in subqueries])
+
+
+def _HasParens(s):
+ """Return True if there are parentheses in the given string."""
+ # Monorail cannot handle parenthesized expressions, so we tell the
+ # user that immediately. Even inside a quoted string, the GAE search
+ # engine will not handle parens in TEXT-type fields.
+ return '(' in s or ')' in s
+
+
def _ParseConjunction(subquery, scope, fields, warnings):
  """Parse part of a user query into a Conjunction PB.

  The scope terms are prepended to the subquery so they apply to every
  conjunction, and the whole thing is lowercased before extraction.
  """
  logging.info('Parsing sub query: %r in scope %r', subquery, scope)
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  return ast_pb2.Conjunction(conds=[
      _ParseCond(cond_str, fields, warnings)
      for cond_str in _ExtractConds(scoped_query)])
+
+
def _ParseCond(cond_str, fields, warnings):
  """Parse one user query condition string into a Condition PB.

  Args:
    cond_str: one condition term from the user's query, e.g., [stars>10],
      [-has:owner], or a bare full-text word such as [leak].
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.
    warnings: list to accumulate warning messages shown to the user.

  Returns:
    A Condition PB for this term: a structured comparison when the term
    matches OP_RE, otherwise a (possibly negated) full-text search term.
  """
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # codesite. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix, date_op in _DATE_FIELD_SUFFIX_TO_OP.items():
        if prefix.endswith(date_suffix):
          # Remove the literal suffix by slicing.  Note: str.rstrip() would
          # be wrong here because it strips any characters in the given set,
          # not the exact suffix, and could eat trailing characters of the
          # field name itself.
          prefix = prefix[:-len(date_suffix)]
          op = date_op
          break
    return _ParseStructuredTerm(prefix, op, val, fields)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Flag a potential user misunderstanding: only all-caps OR is an operator.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
+
+
def _ParseStructuredTerm(prefix, op_str, value, fields):
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
      also be special operators like "is" to test boolean fields.  A leading
      "-" negates the comparison.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
      May be surrounded by double quotes.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.

  Returns:
    A Condition PB.
  """
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',')]

  # Special case: [is:open], [is:blocked], [is:spam] (and their negations)
  # become equality tests on the corresponding boolean field, with int value
  # 1 for "is" and 0 for "-is".
  if ((prefix == 'is' or prefix == '-is') and
      unquoted_value in ['open', 'blocked', 'spam']):
    return ast_pb2.MakeCond(
        EQ, fields[unquoted_value], [], [int(prefix == 'is')])

  op = OPS[op_str]
  # A leading "-" on the prefix negates the operator; only EQ and TEXT_HAS
  # have negated counterparts here.
  negate = False
  if prefix.startswith('-'):
    negate = True
    if op == EQ:
      op = NE
    elif op == TEXT_HAS:
      op = NOT_TEXT_HAS
    prefix = prefix[1:]

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if unquoted_value in fields:  # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else:  # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  if prefix in fields:  # search built-in and custom fields. E.g., summary.
    # Note: if first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_value = _ParseDateValue(unquoted_value)
      return ast_pb2.MakeCond(op, fields[prefix], [], [date_value])
    else:
      # Keep both string and int interpretations: values that parse as ints
      # are also offered as int_values; non-numeric values are skipped there.
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, quick_or_ints)

  # Since it is not a field, treat it as labels, E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
+
+
def _ExtractConds(query):
  """Parse a query string into a list of individual condition strings.

  Args:
    query: search query string; assumed UTF-8 encoded.

  Returns:
    A list of query condition strings.
  """
  # Find all distinct search terms.  The unpacking below must match the
  # group order of TERM_RE: (phrase, word_label, op1, phrase_label, op2, word).
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        if special_prefixes_match:
          # Do not include quotes if any of the special prefixes match because
          # we do not want to treat the label as key:value search terms.
          terms.append('%s%s%s' % (label, op, word))
        else:
          terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search. It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:
      # TERM_RE guarantees at least one group is non-empty, so this branch
      # should be unreachable; log it rather than fail the whole query.
      logging.warn('Unexpected search term in %r', query)

  return terms
+
+
+def _ParseDateValue(val):
+ """Convert the user-entered date into timestamp."""
+ # Support timestamp value such as opened>1437671476
+ try:
+ return int(val)
+ except ValueError:
+ pass
+
+ # TODO(jrobbins): future: take timezones into account.
+ # TODO(jrobbins): for now, explain to users that "today" is
+ # actually now: the current time, not 12:01am in their timezone.
+ # In fact, it is not very useful because everything in the system
+ # happened before the current time.
+ if val == 'today':
+ return _CalculatePastDate(0)
+ elif val.startswith('today-'):
+ try:
+ days_ago = int(val.split('-')[1])
+ except ValueError:
+ days_ago = 0
+ return _CalculatePastDate(days_ago)
+
+ if '/' in val:
+ year, month, day = [int(x) for x in val.split('/')]
+ elif '-' in val:
+ year, month, day = [int(x) for x in val.split('-')]
+
+ try:
+ return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
+ except ValueError:
+ raise InvalidQueryError('Could not parse date')
+
+
+def _CalculatePastDate(days_ago, now=None):
+ """Calculates the timestamp N days ago from now."""
+ if now is None:
+ now = int(time.time())
+ ts = now - days_ago * 24 * 60 * 60
+ return ts
+
+
def CheckSyntax(query, harmonized_config, warnings=None):
  """Parse the given query and report the first error message, or None."""
  try:
    ParseUserQuery(
        query, '', BUILTIN_ISSUE_FIELDS, harmonized_config, warnings=warnings)
    return None
  except InvalidQueryError as e:
    return e.message
+
+
class Error(Exception):
  """Base exception class for all errors raised by this module."""
  pass
+
+
class InvalidQueryError(Error):
  """Error raised when the user's query cannot be parsed."""
  pass
« no previous file with comments | « appengine/monorail/search/frontendsearchpipeline.py ('k') | appengine/monorail/search/searchpipeline.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698