| Index: appengine/monorail/search/ast2ast.py
|
| diff --git a/appengine/monorail/search/ast2ast.py b/appengine/monorail/search/ast2ast.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..19c93ae7757258979a2685a5ad325f79eb74b555
|
| --- /dev/null
|
| +++ b/appengine/monorail/search/ast2ast.py
|
| @@ -0,0 +1,411 @@
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is govered by a BSD-style
|
| +# license that can be found in the LICENSE file or at
|
| +# https://developers.google.com/open-source/licenses/bsd
|
| +
|
| +"""Convert a user's issue search AST into a simplified AST.
|
| +
|
| +This phase of query processing simplifies the user's query by looking up
|
| +the int IDs of any labels, statuses, or components that are mentioned by
|
| +name in the original query. The data needed for lookups is typically cached
|
| +in RAM in each backend job, so this will not put much load on the DB. The
|
| +simplified ASTs are later converted into SQL which is simpler and has
|
| +fewer joins.
|
| +
|
| +The simplified main query is better because:
|
| + + It is clearly faster, especially in the most common case where config
|
| + data is in RAM.
|
| + + Since less RAM is used to process the main query on each shard, query
|
| + execution time is more consistent with less variability under load. Less
|
| + variability is good because the user must wait for the slowest shard.
|
| + + The config tables (LabelDef, StatusDef, etc.) exist only on the master, so
|
| + they cannot be mentioned in a query that runs on a shard.
|
| + + The query string itself is shorter when numeric IDs are substituted, which
|
| + means that we can handle user queries with long lists of labels in a
|
| + reasonable-sized query.
|
| + + It bisects the complexity of the operation: it's easier to test and debug
|
| + the lookup and simplification logic plus the main query logic this way
|
| + than it would be to deal with an even more complex SQL main query.
|
| +"""
|
| +
|
| +import logging
|
| +import re
|
| +
|
| +from proto import ast_pb2
|
| +from proto import tracker_pb2
|
| +# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
|
| +# remove this dep.
|
| +from search import query2ast
|
| +from services import user_svc
|
| +from tracker import tracker_bizobj
|
| +
|
| +
|
| +def PreprocessAST(
|
| + cnxn, query_ast, project_ids, services, harmonized_config):
|
| + """Preprocess the query by doing lookups so that the SQL query is simpler.
|
| +
|
| + Args:
|
| + cnxn: connection to SQL database.
|
| + query_ast: user query abstract syntax tree parsed by query2ast.py.
|
| + project_ids: collection of int project IDs to use to look up status values
|
| + and labels.
|
| + services: Connections to persistence layer for users and configs.
|
| + harmonized_config: harmonized config for all projects being searched.
|
| +
|
| + Returns:
|
| + A new QueryAST PB with simplified conditions. Specifically, string values
|
| + for labels, statuses, and components are replaced with the int IDs of
|
| + those items. Also, is:open is distilled down to
|
| + status_id != closed_status_ids.
|
| + """
|
| + new_conjs = []
|
| + for conj in query_ast.conjunctions:
|
| + new_conds = [
|
| + _PreprocessCond(
|
| + cnxn, cond, project_ids, services, harmonized_config)
|
| + for cond in conj.conds]
|
| + new_conjs.append(ast_pb2.Conjunction(conds=new_conds))
|
| +
|
| + return ast_pb2.QueryAST(conjunctions=new_conjs)
|
| +
|
| +
|
| +def _PreprocessIsOpenCond(
|
| + cnxn, cond, project_ids, services, _harmonized_config):
|
| + """Preprocess an is:open cond into status_id != closed_status_ids."""
|
| + if project_ids:
|
| + closed_status_ids = []
|
| + for project_id in project_ids:
|
| + closed_status_ids.extend(services.config.LookupClosedStatusIDs(
|
| + cnxn, project_id))
|
| + else:
|
| + closed_status_ids = services.config.LookupClosedStatusIDsAnyProject(cnxn)
|
| +
|
| + is_closed = not bool(cond.int_values[0])
|
| + return ast_pb2.Condition(
|
| + op=ast_pb2.QueryOp.EQ if is_closed else ast_pb2.QueryOp.NE,
|
| + field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
|
| + int_values=closed_status_ids)
|
| +
|
| +
|
| +def _PreprocessIsBlockedCond(
|
| + _cnxn, cond, _project_ids, _services, _harmonized_config):
|
| + """Preprocess an is:blocked cond into issues that are blocked."""
|
| + op = (ast_pb2.QueryOp.IS_DEFINED if bool(cond.int_values[0])
|
| + else ast_pb2.QueryOp.IS_NOT_DEFINED)
|
| + return ast_pb2.Condition(
|
| + op=op, field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']])
|
| +
|
| +
|
| +def _PreprocessBlockedOnCond(
|
| + cnxn, cond, project_ids, services, _harmonized_config):
|
| + """Preprocess blockedon=xyz and has:blockedon conds.
|
| +
|
| + Preprocesses blockedon=xyz cond into blockedon_id:issue_ids.
|
| + Preprocesses has:blockedon cond into issues that are blocked on other issues.
|
| + """
|
| + issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
|
| + return ast_pb2.Condition(
|
| + op=_TextOpToIntOp(cond.op),
|
| + field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']],
|
| + int_values=issue_ids)
|
| +
|
| +
|
| +def _PreprocessBlockingCond(
|
| + cnxn, cond, project_ids, services, _harmonized_config):
|
| + """Preprocess blocking=xyz and has:blocking conds.
|
| +
|
| + Preprocesses blocking=xyz cond into blocking_id:issue_ids.
|
| + Preprocesses has:blocking cond into issues that are blocking other issues.
|
| + """
|
| + issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
|
| + return ast_pb2.Condition(
|
| + op=_TextOpToIntOp(cond.op),
|
| + field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']],
|
| + int_values=issue_ids)
|
| +
|
| +
|
| +def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
|
| + """Returns global IDs from the local IDs provided in the cond."""
|
| + # Get {project_name: project} for all projects in project_ids.
|
| + ids_to_projects = services.project.GetProjects(cnxn, project_ids)
|
| + ref_projects = {pb.project_name: pb for pb in ids_to_projects.itervalues()}
|
| + # Populate default_project_name if there is only one project id provided.
|
| + default_project_name = None
|
| + if len(ref_projects) == 1:
|
| + default_project_name = ref_projects.values()[0].project_name
|
| +
|
| + # Populate refs with (project_name, local_id) pairs.
|
| + refs = []
|
| + for val in cond.str_values:
|
| + project_name, local_id = tracker_bizobj.ParseIssueRef(val)
|
| + if not project_name:
|
| + if not default_project_name:
|
| + # TODO(rmistry): Support the below.
|
| + raise ValueError(
|
| + 'Searching for issues accross multiple/all projects without '
|
| + 'project prefixes is ambiguous and is currently not supported.')
|
| + project_name = default_project_name
|
| + refs.append((project_name, int(local_id)))
|
| +
|
| + return services.issue.ResolveIssueRefs(
|
| + cnxn, ref_projects, default_project_name, refs)
|
| +
|
| +
|
| +def _PreprocessStatusCond(
|
| + cnxn, cond, project_ids, services, _harmonized_config):
|
| + """Preprocess a status=names cond into status_id=IDs."""
|
| + if project_ids:
|
| + status_ids = []
|
| + for project_id in project_ids:
|
| + status_ids.extend(services.config.LookupStatusIDs(
|
| + cnxn, project_id, cond.str_values))
|
| + else:
|
| + status_ids = services.config.LookupStatusIDsAnyProject(
|
| + cnxn, cond.str_values)
|
| +
|
| + return ast_pb2.Condition(
|
| + op=_TextOpToIntOp(cond.op),
|
| + field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
|
| + int_values=status_ids)
|
| +
|
| +
|
| +def _IsEqualityOp(op):
|
| + """Return True for EQ and NE."""
|
| + return op in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE)
|
| +
|
| +
|
| +def _IsDefinedOp(op):
|
| + """Return True for IS_DEFINED and IS_NOT_DEFINED."""
|
| + return op in (ast_pb2.QueryOp.IS_DEFINED, ast_pb2.QueryOp.IS_NOT_DEFINED)
|
| +
|
| +
|
| +def _TextOpToIntOp(op):
|
| + """If a query is optimized from string to ID matching, use an equality op."""
|
| + if op == ast_pb2.QueryOp.TEXT_HAS or op == ast_pb2.QueryOp.KEY_HAS:
|
| + return ast_pb2.QueryOp.EQ
|
| + elif op == ast_pb2.QueryOp.NOT_TEXT_HAS:
|
| + return ast_pb2.QueryOp.NE
|
| + return op
|
| +
|
| +
|
| +def _MakePrefixRegex(cond):
|
| + """Return a regex to match strings that start with cond values."""
|
| + all_prefixes = '|'.join(map(re.escape, cond.str_values))
|
| + return re.compile(r'(%s)-.+' % all_prefixes, re.I)
|
| +
|
| +
|
| +def _MakeKeyValueRegex(cond):
|
| + """Return a regex to match the first token and remaining text separately."""
|
| + keys, values = zip(*map(lambda x: x.split('-', 1), cond.str_values))
|
| + if len(set(keys)) != 1:
|
| + raise ValueError(
|
| + "KeyValue query with multiple different keys: %r" % cond.str_values)
|
| + all_values = '|'.join(map(re.escape, values))
|
| + return re.compile(r'%s-.*\b(%s)\b.*' % (keys[0], all_values), re.I)
|
| +
|
| +
|
| +def _MakeWordBoundaryRegex(cond):
|
| + """Return a regex to match the cond values as whole words."""
|
| + all_words = '|'.join(map(re.escape, cond.str_values))
|
| + return re.compile(r'.*\b(%s)\b.*' % all_words, re.I)
|
| +
|
| +
|
| +def _PreprocessLabelCond(
|
| + cnxn, cond, project_ids, services, _harmonized_config):
|
| + """Preprocess a label=names cond into label_id=IDs."""
|
| + if project_ids:
|
| + label_ids = []
|
| + for project_id in project_ids:
|
| + if _IsEqualityOp(cond.op):
|
| + label_ids.extend(services.config.LookupLabelIDs(
|
| + cnxn, project_id, cond.str_values))
|
| + elif _IsDefinedOp(cond.op):
|
| + label_ids.extend(services.config.LookupIDsOfLabelsMatching(
|
| + cnxn, project_id, _MakePrefixRegex(cond)))
|
| + elif cond.op == ast_pb2.QueryOp.KEY_HAS:
|
| + label_ids.extend(services.config.LookupIDsOfLabelsMatching(
|
| + cnxn, project_id, _MakeKeyValueRegex(cond)))
|
| + else:
|
| + label_ids.extend(services.config.LookupIDsOfLabelsMatching(
|
| + cnxn, project_id, _MakeWordBoundaryRegex(cond)))
|
| + else:
|
| + if _IsEqualityOp(cond.op):
|
| + label_ids = services.config.LookupLabelIDsAnyProject(
|
| + cnxn, cond.str_values)
|
| + elif _IsDefinedOp(cond.op):
|
| + label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
|
| + cnxn, _MakePrefixRegex(cond))
|
| + elif cond.op == ast_pb2.QueryOp.KEY_HAS:
|
| + label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
|
| + cnxn, _MakeKeyValueRegex(cond))
|
| + else:
|
| + label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
|
| + cnxn, _MakeWordBoundaryRegex(cond))
|
| +
|
| + return ast_pb2.Condition(
|
| + op=_TextOpToIntOp(cond.op),
|
| + field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
|
| + int_values=label_ids)
|
| +
|
| +
|
| +def _PreprocessComponentCond(
|
| + cnxn, cond, project_ids, services, harmonized_config):
|
| + """Preprocess a component= or component:name cond into component_id=IDs."""
|
| + exact = _IsEqualityOp(cond.op)
|
| + component_ids = []
|
| + if project_ids:
|
| + # We are searching within specific projects, so harmonized_config
|
| + # holds the config data for all those projects.
|
| + for comp_path in cond.str_values:
|
| + component_ids.extend(tracker_bizobj.FindMatchingComponentIDs(
|
| + comp_path, harmonized_config, exact=exact))
|
| + else:
|
| + # We are searching across the whole site, so we have no harmonized_config
|
| + # to use.
|
| + component_ids = services.config.FindMatchingComponentIDsAnyProject(
|
| + cnxn, cond.str_values, exact=exact)
|
| +
|
| + return ast_pb2.Condition(
|
| + op=_TextOpToIntOp(cond.op),
|
| + field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['component_id']],
|
| + int_values=component_ids)
|
| +
|
| +
|
| +def _PreprocessExactUsers(cnxn, cond, user_service, id_fields):
|
| + """Preprocess a foo=emails cond into foo_id=IDs, if exact user match.
|
| +
|
| + This preprocesing step converts string conditions to int ID conditions.
|
| + E.g., [owner=email] to [owner_id=ID]. It only does it in cases
|
| + where (a) the email was "me", so it was already converted to an string of
|
| + digits in the search pipeline, or (b) it is "user@domain" which resolves to
|
| + a known Monorail user. It is also possible to search for, e.g.,
|
| + [owner:substring], but such searches remain 'owner' field searches rather
|
| + than 'owner_id', and they cannot be combined with the "me" keyword.
|
| +
|
| + Args:
|
| + cnxn: connection to the DB.
|
| + cond: original parsed query Condition PB.
|
| + user_service: connection to user persistence layer.
|
| + id_fields: list of the search fields to use if the conversion to IDs
|
| + succeeds.
|
| +
|
| + Returns:
|
| + A new Condition PB that checks the id_field. Or, the original cond.
|
| + """
|
| + op = _TextOpToIntOp(cond.op)
|
| + if _IsDefinedOp(op):
|
| + # No need to look up any IDs if we are just testing for any defined value.
|
| + return ast_pb2.Condition(op=op, field_defs=id_fields)
|
| +
|
| + # This preprocessing step is only for ops that compare whole values, not
|
| + # substrings.
|
| + if not _IsEqualityOp(op):
|
| + logging.info('could not convert to IDs because op is %r', op)
|
| + return cond
|
| +
|
| + user_ids = []
|
| + for val in cond.str_values:
|
| + try:
|
| + user_ids.append(int(val))
|
| + except ValueError:
|
| + try:
|
| + user_ids.append(user_service.LookupUserID(cnxn, val))
|
| + except user_svc.NoSuchUserException:
|
| + logging.info('could not convert user %r to int ID', val)
|
| + return cond # preprocessing failed, stick with the original cond.
|
| +
|
| + return ast_pb2.Condition(op=op, field_defs=id_fields, int_values=user_ids)
|
| +
|
| +
|
| +def _PreprocessOwnerCond(
|
| + cnxn, cond, _project_ids, services, _harmonized_config):
|
| + """Preprocess a owner=emails cond into owner_id=IDs, if exact user match."""
|
| + return _PreprocessExactUsers(
|
| + cnxn, cond, services.user, [query2ast.BUILTIN_ISSUE_FIELDS['owner_id']])
|
| +
|
| +
|
| +def _PreprocessCcCond(
|
| + cnxn, cond, _project_ids, services, _harmonized_config):
|
| + """Preprocess a cc=emails cond into cc_id=IDs, if exact user match."""
|
| + return _PreprocessExactUsers(
|
| + cnxn, cond, services.user, [query2ast.BUILTIN_ISSUE_FIELDS['cc_id']])
|
| +
|
| +
|
| +def _PreprocessReporterCond(
|
| + cnxn, cond, _project_ids, services, _harmonized_config):
|
| + """Preprocess a reporter=emails cond into reporter_id=IDs, if exact."""
|
| + return _PreprocessExactUsers(
|
| + cnxn, cond, services.user,
|
| + [query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']])
|
| +
|
| +
|
| +def _PreprocessStarredByCond(
|
| + cnxn, cond, _project_ids, services, _harmonized_config):
|
| + """Preprocess a starredby=emails cond into starredby_id=IDs, if exact."""
|
| + return _PreprocessExactUsers(
|
| + cnxn, cond, services.user,
|
| + [query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']])
|
| +
|
| +
|
| +def _PreprocessCommentByCond(
|
| + cnxn, cond, _project_ids, services, _harmonized_config):
|
| + """Preprocess a commentby=emails cond into commentby_id=IDs, if exact."""
|
| + return _PreprocessExactUsers(
|
| + cnxn, cond, services.user,
|
| + [query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']])
|
| +
|
| +
|
| +def _PreprocessCustomCond(cnxn, cond, services):
|
| + """Preprocess a custom_user_field=emails cond into IDs, if exact matches."""
|
| + # TODO(jrobbins): better support for ambiguous fields.
|
| + # For now, if any field is USER_TYPE and the value being searched
|
| + # for is the email address of an existing account, it will convert
|
| + # to a user ID and we go with exact ID matching. Otherwise, we
|
| + # leave the cond as-is for ast2select to do string matching on.
|
| + user_field_defs = [fd for fd in cond.field_defs
|
| + if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE]
|
| + if user_field_defs:
|
| + return _PreprocessExactUsers(cnxn, cond, services.user, user_field_defs)
|
| + else:
|
| + return cond
|
| +
|
| +
|
| +_PREPROCESSORS = {
|
| + 'open': _PreprocessIsOpenCond,
|
| + 'blocked': _PreprocessIsBlockedCond,
|
| + 'blockedon': _PreprocessBlockedOnCond,
|
| + 'blocking': _PreprocessBlockingCond,
|
| + 'status': _PreprocessStatusCond,
|
| + 'label': _PreprocessLabelCond,
|
| + 'component': _PreprocessComponentCond,
|
| + 'owner': _PreprocessOwnerCond,
|
| + 'cc': _PreprocessCcCond,
|
| + 'reporter': _PreprocessReporterCond,
|
| + 'starredby': _PreprocessStarredByCond,
|
| + 'commentby': _PreprocessCommentByCond,
|
| + }
|
| +
|
| +
|
| +def _PreprocessCond(
|
| + cnxn, cond, project_ids, services, harmonized_config):
|
| + """Preprocess query by looking up status, label and component IDs."""
|
| + # All the fields in a cond share the same name because they are parsed
|
| + # from a user query term, and the term syntax allows just one field name.
|
| + field_name = cond.field_defs[0].field_name
|
| + assert all(fd.field_name == field_name for fd in cond.field_defs)
|
| +
|
| + # Case 1: The user is searching custom fields.
|
| + if any(fd.field_id for fd in cond.field_defs):
|
| + # There can't be a mix of custom and built-in fields because built-in
|
| + # field names are reserved and take priority over any conflicting ones.
|
| + assert all(fd.field_id for fd in cond.field_defs)
|
| + return _PreprocessCustomCond(cnxn, cond, services)
|
| +
|
| + # Case 2: The user is searching a built-in field.
|
| + preproc = _PREPROCESSORS.get(field_name)
|
| + if preproc:
|
| + # We have a preprocessor for that built-in field.
|
| + return preproc(cnxn, cond, project_ids, services, harmonized_config)
|
| + else:
|
| + # We don't have a preprocessor for it.
|
| + return cond
|
|
|