Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1781)

Unified Diff: appengine/monorail/search/ast2ast.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « appengine/monorail/search/__init__.py ('k') | appengine/monorail/search/ast2select.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: appengine/monorail/search/ast2ast.py
diff --git a/appengine/monorail/search/ast2ast.py b/appengine/monorail/search/ast2ast.py
new file mode 100644
index 0000000000000000000000000000000000000000..19c93ae7757258979a2685a5ad325f79eb74b555
--- /dev/null
+++ b/appengine/monorail/search/ast2ast.py
@@ -0,0 +1,411 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file or at
+# https://developers.google.com/open-source/licenses/bsd
+
+"""Convert a user's issue search AST into a simplified AST.
+
+This phase of query processing simplifies the user's query by looking up
+the int IDs of any labels, statuses, or components that are mentioned by
+name in the original query. The data needed for lookups is typically cached
+in RAM in each backend job, so this will not put much load on the DB. The
+simplified ASTs are later converted into SQL which is simpler and has
+fewer joins.
+
+The simplified main query is better because:
+ + It is clearly faster, especially in the most common case where config
+ data is in RAM.
+ + Since less RAM is used to process the main query on each shard, query
+ execution time is more consistent with less variability under load. Less
+ variability is good because the user must wait for the slowest shard.
+ + The config tables (LabelDef, StatusDef, etc.) exist only on the master, so
+ they cannot be mentioned in a query that runs on a shard.
+ + The query string itself is shorter when numeric IDs are substituted, which
+ means that we can handle user queries with long lists of labels in a
+ reasonable-sized query.
+ + It bisects the complexity of the operation: it's easier to test and debug
+ the lookup and simplification logic plus the main query logic this way
+ than it would be to deal with an even more complex SQL main query.
+"""
+
+import logging
+import re
+
+from proto import ast_pb2
+from proto import tracker_pb2
+# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
+# remove this dep.
+from search import query2ast
+from services import user_svc
+from tracker import tracker_bizobj
+
+
+def PreprocessAST(
+ cnxn, query_ast, project_ids, services, harmonized_config):
+ """Preprocess the query by doing lookups so that the SQL query is simpler.
+
+ Args:
+ cnxn: connection to SQL database.
+ query_ast: user query abstract syntax tree parsed by query2ast.py.
+ project_ids: collection of int project IDs to use to look up status values
+ and labels.
+ services: Connections to persistence layer for users and configs.
+ harmonized_config: harmonized config for all projects being searched.
+
+ Returns:
+ A new QueryAST PB with simplified conditions. Specifically, string values
+ for labels, statuses, and components are replaced with the int IDs of
+ those items. Also, is:open is distilled down to
+ status_id != closed_status_ids.
+ """
+ new_conjs = []
+ for conj in query_ast.conjunctions:
+ new_conds = [
+ _PreprocessCond(
+ cnxn, cond, project_ids, services, harmonized_config)
+ for cond in conj.conds]
+ new_conjs.append(ast_pb2.Conjunction(conds=new_conds))
+
+ return ast_pb2.QueryAST(conjunctions=new_conjs)
+
+
+def _PreprocessIsOpenCond(
+ cnxn, cond, project_ids, services, _harmonized_config):
+ """Preprocess an is:open cond into status_id != closed_status_ids."""
+ if project_ids:
+ closed_status_ids = []
+ for project_id in project_ids:
+ closed_status_ids.extend(services.config.LookupClosedStatusIDs(
+ cnxn, project_id))
+ else:
+ closed_status_ids = services.config.LookupClosedStatusIDsAnyProject(cnxn)
+
+ is_closed = not bool(cond.int_values[0])
+ return ast_pb2.Condition(
+ op=ast_pb2.QueryOp.EQ if is_closed else ast_pb2.QueryOp.NE,
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
+ int_values=closed_status_ids)
+
+
+def _PreprocessIsBlockedCond(
+ _cnxn, cond, _project_ids, _services, _harmonized_config):
+ """Preprocess an is:blocked cond into issues that are blocked."""
+ op = (ast_pb2.QueryOp.IS_DEFINED if bool(cond.int_values[0])
+ else ast_pb2.QueryOp.IS_NOT_DEFINED)
+ return ast_pb2.Condition(
+ op=op, field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']])
+
+
+def _PreprocessBlockedOnCond(
+ cnxn, cond, project_ids, services, _harmonized_config):
+ """Preprocess blockedon=xyz and has:blockedon conds.
+
+ Preprocesses blockedon=xyz cond into blockedon_id:issue_ids.
+ Preprocesses has:blockedon cond into issues that are blocked on other issues.
+ """
+ issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']],
+ int_values=issue_ids)
+
+
+def _PreprocessBlockingCond(
+ cnxn, cond, project_ids, services, _harmonized_config):
+ """Preprocess blocking=xyz and has:blocking conds.
+
+ Preprocesses blocking=xyz cond into blocking_id:issue_ids.
+ Preprocesses has:blocking cond into issues that are blocking other issues.
+ """
+ issue_ids = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']],
+ int_values=issue_ids)
+
+
+def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
+ """Returns global IDs from the local IDs provided in the cond."""
+ # Get {project_name: project} for all projects in project_ids.
+ ids_to_projects = services.project.GetProjects(cnxn, project_ids)
+ ref_projects = {pb.project_name: pb for pb in ids_to_projects.itervalues()}
+ # Populate default_project_name if there is only one project id provided.
+ default_project_name = None
+ if len(ref_projects) == 1:
+ default_project_name = ref_projects.values()[0].project_name
+
+ # Populate refs with (project_name, local_id) pairs.
+ refs = []
+ for val in cond.str_values:
+ project_name, local_id = tracker_bizobj.ParseIssueRef(val)
+ if not project_name:
+ if not default_project_name:
+ # TODO(rmistry): Support the below.
+ raise ValueError(
+ 'Searching for issues accross multiple/all projects without '
+ 'project prefixes is ambiguous and is currently not supported.')
+ project_name = default_project_name
+ refs.append((project_name, int(local_id)))
+
+ return services.issue.ResolveIssueRefs(
+ cnxn, ref_projects, default_project_name, refs)
+
+
+def _PreprocessStatusCond(
+ cnxn, cond, project_ids, services, _harmonized_config):
+ """Preprocess a status=names cond into status_id=IDs."""
+ if project_ids:
+ status_ids = []
+ for project_id in project_ids:
+ status_ids.extend(services.config.LookupStatusIDs(
+ cnxn, project_id, cond.str_values))
+ else:
+ status_ids = services.config.LookupStatusIDsAnyProject(
+ cnxn, cond.str_values)
+
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
+ int_values=status_ids)
+
+
+def _IsEqualityOp(op):
+ """Return True for EQ and NE."""
+ return op in (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE)
+
+
+def _IsDefinedOp(op):
+ """Return True for IS_DEFINED and IS_NOT_DEFINED."""
+ return op in (ast_pb2.QueryOp.IS_DEFINED, ast_pb2.QueryOp.IS_NOT_DEFINED)
+
+
+def _TextOpToIntOp(op):
+ """If a query is optimized from string to ID matching, use an equality op."""
+ if op == ast_pb2.QueryOp.TEXT_HAS or op == ast_pb2.QueryOp.KEY_HAS:
+ return ast_pb2.QueryOp.EQ
+ elif op == ast_pb2.QueryOp.NOT_TEXT_HAS:
+ return ast_pb2.QueryOp.NE
+ return op
+
+
+def _MakePrefixRegex(cond):
+ """Return a regex to match strings that start with cond values."""
+ all_prefixes = '|'.join(map(re.escape, cond.str_values))
+ return re.compile(r'(%s)-.+' % all_prefixes, re.I)
+
+
+def _MakeKeyValueRegex(cond):
+ """Return a regex to match the first token and remaining text separately."""
+ keys, values = zip(*map(lambda x: x.split('-', 1), cond.str_values))
+ if len(set(keys)) != 1:
+ raise ValueError(
+ "KeyValue query with multiple different keys: %r" % cond.str_values)
+ all_values = '|'.join(map(re.escape, values))
+ return re.compile(r'%s-.*\b(%s)\b.*' % (keys[0], all_values), re.I)
+
+
+def _MakeWordBoundaryRegex(cond):
+ """Return a regex to match the cond values as whole words."""
+ all_words = '|'.join(map(re.escape, cond.str_values))
+ return re.compile(r'.*\b(%s)\b.*' % all_words, re.I)
+
+
+def _PreprocessLabelCond(
+ cnxn, cond, project_ids, services, _harmonized_config):
+ """Preprocess a label=names cond into label_id=IDs."""
+ if project_ids:
+ label_ids = []
+ for project_id in project_ids:
+ if _IsEqualityOp(cond.op):
+ label_ids.extend(services.config.LookupLabelIDs(
+ cnxn, project_id, cond.str_values))
+ elif _IsDefinedOp(cond.op):
+ label_ids.extend(services.config.LookupIDsOfLabelsMatching(
+ cnxn, project_id, _MakePrefixRegex(cond)))
+ elif cond.op == ast_pb2.QueryOp.KEY_HAS:
+ label_ids.extend(services.config.LookupIDsOfLabelsMatching(
+ cnxn, project_id, _MakeKeyValueRegex(cond)))
+ else:
+ label_ids.extend(services.config.LookupIDsOfLabelsMatching(
+ cnxn, project_id, _MakeWordBoundaryRegex(cond)))
+ else:
+ if _IsEqualityOp(cond.op):
+ label_ids = services.config.LookupLabelIDsAnyProject(
+ cnxn, cond.str_values)
+ elif _IsDefinedOp(cond.op):
+ label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
+ cnxn, _MakePrefixRegex(cond))
+ elif cond.op == ast_pb2.QueryOp.KEY_HAS:
+ label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
+ cnxn, _MakeKeyValueRegex(cond))
+ else:
+ label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
+ cnxn, _MakeWordBoundaryRegex(cond))
+
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
+ int_values=label_ids)
+
+
+def _PreprocessComponentCond(
+ cnxn, cond, project_ids, services, harmonized_config):
+ """Preprocess a component= or component:name cond into component_id=IDs."""
+ exact = _IsEqualityOp(cond.op)
+ component_ids = []
+ if project_ids:
+ # We are searching within specific projects, so harmonized_config
+ # holds the config data for all those projects.
+ for comp_path in cond.str_values:
+ component_ids.extend(tracker_bizobj.FindMatchingComponentIDs(
+ comp_path, harmonized_config, exact=exact))
+ else:
+ # We are searching across the whole site, so we have no harmonized_config
+ # to use.
+ component_ids = services.config.FindMatchingComponentIDsAnyProject(
+ cnxn, cond.str_values, exact=exact)
+
+ return ast_pb2.Condition(
+ op=_TextOpToIntOp(cond.op),
+ field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['component_id']],
+ int_values=component_ids)
+
+
+def _PreprocessExactUsers(cnxn, cond, user_service, id_fields):
+ """Preprocess a foo=emails cond into foo_id=IDs, if exact user match.
+
+ This preprocesing step converts string conditions to int ID conditions.
+ E.g., [owner=email] to [owner_id=ID]. It only does it in cases
+ where (a) the email was "me", so it was already converted to an string of
+ digits in the search pipeline, or (b) it is "user@domain" which resolves to
+ a known Monorail user. It is also possible to search for, e.g.,
+ [owner:substring], but such searches remain 'owner' field searches rather
+ than 'owner_id', and they cannot be combined with the "me" keyword.
+
+ Args:
+ cnxn: connection to the DB.
+ cond: original parsed query Condition PB.
+ user_service: connection to user persistence layer.
+ id_fields: list of the search fields to use if the conversion to IDs
+ succeeds.
+
+ Returns:
+ A new Condition PB that checks the id_field. Or, the original cond.
+ """
+ op = _TextOpToIntOp(cond.op)
+ if _IsDefinedOp(op):
+ # No need to look up any IDs if we are just testing for any defined value.
+ return ast_pb2.Condition(op=op, field_defs=id_fields)
+
+ # This preprocessing step is only for ops that compare whole values, not
+ # substrings.
+ if not _IsEqualityOp(op):
+ logging.info('could not convert to IDs because op is %r', op)
+ return cond
+
+ user_ids = []
+ for val in cond.str_values:
+ try:
+ user_ids.append(int(val))
+ except ValueError:
+ try:
+ user_ids.append(user_service.LookupUserID(cnxn, val))
+ except user_svc.NoSuchUserException:
+ logging.info('could not convert user %r to int ID', val)
+ return cond # preprocessing failed, stick with the original cond.
+
+ return ast_pb2.Condition(op=op, field_defs=id_fields, int_values=user_ids)
+
+
+def _PreprocessOwnerCond(
+ cnxn, cond, _project_ids, services, _harmonized_config):
+ """Preprocess a owner=emails cond into owner_id=IDs, if exact user match."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user, [query2ast.BUILTIN_ISSUE_FIELDS['owner_id']])
+
+
+def _PreprocessCcCond(
+ cnxn, cond, _project_ids, services, _harmonized_config):
+ """Preprocess a cc=emails cond into cc_id=IDs, if exact user match."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user, [query2ast.BUILTIN_ISSUE_FIELDS['cc_id']])
+
+
+def _PreprocessReporterCond(
+ cnxn, cond, _project_ids, services, _harmonized_config):
+ """Preprocess a reporter=emails cond into reporter_id=IDs, if exact."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user,
+ [query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']])
+
+
+def _PreprocessStarredByCond(
+ cnxn, cond, _project_ids, services, _harmonized_config):
+ """Preprocess a starredby=emails cond into starredby_id=IDs, if exact."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user,
+ [query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']])
+
+
+def _PreprocessCommentByCond(
+ cnxn, cond, _project_ids, services, _harmonized_config):
+ """Preprocess a commentby=emails cond into commentby_id=IDs, if exact."""
+ return _PreprocessExactUsers(
+ cnxn, cond, services.user,
+ [query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']])
+
+
+def _PreprocessCustomCond(cnxn, cond, services):
+ """Preprocess a custom_user_field=emails cond into IDs, if exact matches."""
+ # TODO(jrobbins): better support for ambiguous fields.
+ # For now, if any field is USER_TYPE and the value being searched
+ # for is the email address of an existing account, it will convert
+ # to a user ID and we go with exact ID matching. Otherwise, we
+ # leave the cond as-is for ast2select to do string matching on.
+ user_field_defs = [fd for fd in cond.field_defs
+ if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE]
+ if user_field_defs:
+ return _PreprocessExactUsers(cnxn, cond, services.user, user_field_defs)
+ else:
+ return cond
+
+
+# Maps the name of a built-in search field to the function that preprocesses
+# conds on that field. _PreprocessCond looks a cond's field name up here and
+# calls the function with
+# (cnxn, cond, project_ids, services, harmonized_config).
+# Built-in fields with no entry here are left as-is.
+_PREPROCESSORS = {
+ 'open': _PreprocessIsOpenCond,
+ 'blocked': _PreprocessIsBlockedCond,
+ 'blockedon': _PreprocessBlockedOnCond,
+ 'blocking': _PreprocessBlockingCond,
+ 'status': _PreprocessStatusCond,
+ 'label': _PreprocessLabelCond,
+ 'component': _PreprocessComponentCond,
+ 'owner': _PreprocessOwnerCond,
+ 'cc': _PreprocessCcCond,
+ 'reporter': _PreprocessReporterCond,
+ 'starredby': _PreprocessStarredByCond,
+ 'commentby': _PreprocessCommentByCond,
+ }
+
+
+def _PreprocessCond(
+ cnxn, cond, project_ids, services, harmonized_config):
+ """Preprocess query by looking up status, label and component IDs."""
+ # All the fields in a cond share the same name because they are parsed
+ # from a user query term, and the term syntax allows just one field name.
+ field_name = cond.field_defs[0].field_name
+ assert all(fd.field_name == field_name for fd in cond.field_defs)
+
+ # Case 1: The user is searching custom fields.
+ if any(fd.field_id for fd in cond.field_defs):
+ # There can't be a mix of custom and built-in fields because built-in
+ # field names are reserved and take priority over any conflicting ones.
+ assert all(fd.field_id for fd in cond.field_defs)
+ return _PreprocessCustomCond(cnxn, cond, services)
+
+ # Case 2: The user is searching a built-in field.
+ preproc = _PREPROCESSORS.get(field_name)
+ if preproc:
+ # We have a preprocessor for that built-in field.
+ return preproc(cnxn, cond, project_ids, services, harmonized_config)
+ else:
+ # We don't have a preprocessor for it.
+ return cond
« no previous file with comments | « appengine/monorail/search/__init__.py ('k') | appengine/monorail/search/ast2select.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698