Index: appengine/monorail/search/ast2ast.py |
diff --git a/appengine/monorail/search/ast2ast.py b/appengine/monorail/search/ast2ast.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..19c93ae7757258979a2685a5ad325f79eb74b555 |
--- /dev/null |
+++ b/appengine/monorail/search/ast2ast.py |
@@ -0,0 +1,411 @@ |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style |
+# license that can be found in the LICENSE file or at |
+# https://developers.google.com/open-source/licenses/bsd |
+ |
+"""Convert a user's issue search AST into a simplified AST. |
+ |
+This phase of query processing simplifies the user's query by looking up |
+the int IDs of any labels, statuses, or components that are mentioned by |
+name in the original query. The data needed for lookups is typically cached |
+in RAM in each backend job, so this will not put much load on the DB. The |
+simplified ASTs are later converted into SQL which is simpler and has |
+fewer joins. |
+ |
+The simplified main query is better because: |
+ + It is clearly faster, especially in the most common case where config |
+ data is in RAM. |
+ + Since less RAM is used to process the main query on each shard, query |
+ execution time is more consistent with less variability under load. Less |
+ variability is good because the user must wait for the slowest shard. |
+ + The config tables (LabelDef, StatusDef, etc.) exist only on the master, so |
+ they cannot be mentioned in a query that runs on a shard. |
+ + The query string itself is shorter when numeric IDs are substituted, which |
+ means that we can handle user queries with long lists of labels in a |
+ reasonable-sized query. |
+ + It bisects the complexity of the operation: it's easier to test and debug |
+ the lookup and simplification logic plus the main query logic this way |
+ than it would be to deal with an even more complex SQL main query. |
+""" |
+ |
+import logging |
+import re |
+ |
+from proto import ast_pb2 |
+from proto import tracker_pb2 |
+# TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could |
+# remove this dep. |
+from search import query2ast |
+from services import user_svc |
+from tracker import tracker_bizobj |
+ |
+ |
+def PreprocessAST( |
+    cnxn, query_ast, project_ids, services, harmonized_config): |
+  """Return a simplified copy of query_ast with names resolved to int IDs. |
+ |
+  Every condition of every conjunction is passed through _PreprocessCond, |
+  and the results are assembled into a new QueryAST that has the same |
+  conjunction structure as the input. |
+ |
+  Args: |
+    cnxn: connection to SQL database. |
+    query_ast: user query abstract syntax tree parsed by query2ast.py. |
+    project_ids: collection of int project IDs used when looking up status |
+        values and labels. |
+    services: connections to persistence layer for users and configs. |
+    harmonized_config: harmonized config for all projects being searched. |
+ |
+  Returns: |
+    A new QueryAST PB with simplified conditions.  String values for labels, |
+    statuses, and components are replaced with the int IDs of those items, |
+    and is:open is reduced to a comparison of status_id against the closed |
+    status IDs. |
+  """ |
+  def _Simplify(cond): |
+    # Bind the arguments that are the same for every condition. |
+    return _PreprocessCond( |
+        cnxn, cond, project_ids, services, harmonized_config) |
+ |
+  simplified_conjs = [ |
+      ast_pb2.Conjunction(conds=[_Simplify(cond) for cond in conj.conds]) |
+      for conj in query_ast.conjunctions] |
+  return ast_pb2.QueryAST(conjunctions=simplified_conjs) |
+ |
+ |
+def _PreprocessIsOpenCond( |
+    cnxn, cond, project_ids, services, _harmonized_config): |
+  """Rewrite an is:open cond as a status_id test against closed statuses. |
+ |
+  When the first int value of the cond is true the result is |
+  status_id != closed_status_ids, otherwise it is |
+  status_id == closed_status_ids. |
+  """ |
+  config_svc = services.config |
+  if not project_ids: |
+    # No specific projects: use the closed statuses of any project. |
+    closed_status_ids = config_svc.LookupClosedStatusIDsAnyProject(cnxn) |
+  else: |
+    closed_status_ids = [] |
+    for pid in project_ids: |
+      closed_status_ids.extend( |
+          config_svc.LookupClosedStatusIDs(cnxn, pid)) |
+ |
+  if bool(cond.int_values[0]): |
+    status_op = ast_pb2.QueryOp.NE |
+  else: |
+    status_op = ast_pb2.QueryOp.EQ |
+  status_id_field = query2ast.BUILTIN_ISSUE_FIELDS['status_id'] |
+  return ast_pb2.Condition( |
+      op=status_op, field_defs=[status_id_field], |
+      int_values=closed_status_ids) |
+ |
+ |
+def _PreprocessIsBlockedCond( |
+    _cnxn, cond, _project_ids, _services, _harmonized_config): |
+  """Rewrite an is:blocked cond as a defined-ness test on blockedon_id. |
+ |
+  A true first int value yields IS_DEFINED, a false one IS_NOT_DEFINED. |
+  """ |
+  if bool(cond.int_values[0]): |
+    defined_op = ast_pb2.QueryOp.IS_DEFINED |
+  else: |
+    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED |
+  blockedon_field = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id'] |
+  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_field]) |
+ |
+ |
+def _PreprocessBlockedOnCond( |
+    cnxn, cond, project_ids, services, _harmonized_config): |
+  """Rewrite blockedon=xyz and has:blockedon conds onto blockedon_id. |
+ |
+  The issue refs in the cond are resolved to global issue IDs, which become |
+  the int values of a blockedon_id cond.  The op is passed through |
+  _TextOpToIntOp, so text matching becomes equality while other ops (such |
+  as the one used for has:blockedon) are kept. |
+  """ |
+  resolved_ids = _GetIssueIDsFromLocalIdsCond( |
+      cnxn, cond, project_ids, services) |
+  int_op = _TextOpToIntOp(cond.op) |
+  blockedon_field = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id'] |
+  return ast_pb2.Condition( |
+      op=int_op, field_defs=[blockedon_field], int_values=resolved_ids) |
+ |
+ |
+def _PreprocessBlockingCond( |
+    cnxn, cond, project_ids, services, _harmonized_config): |
+  """Rewrite blocking=xyz and has:blocking conds onto blocking_id. |
+ |
+  The issue refs in the cond are resolved to global issue IDs, which become |
+  the int values of a blocking_id cond.  The op is passed through |
+  _TextOpToIntOp, so text matching becomes equality while other ops (such |
+  as the one used for has:blocking) are kept. |
+  """ |
+  resolved_ids = _GetIssueIDsFromLocalIdsCond( |
+      cnxn, cond, project_ids, services) |
+  int_op = _TextOpToIntOp(cond.op) |
+  blocking_field = query2ast.BUILTIN_ISSUE_FIELDS['blocking_id'] |
+  return ast_pb2.Condition( |
+      op=int_op, field_defs=[blocking_field], int_values=resolved_ids) |
+ |
+ |
+def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services): |
+  """Returns global IDs from the local IDs provided in the cond. |
+ |
+  Args: |
+    cnxn: connection to SQL database. |
+    cond: Condition PB whose str_values are issue refs, each parsed with |
+        tracker_bizobj.ParseIssueRef into (project_name, local_id). |
+    project_ids: IDs of the projects being searched. |
+    services: connections to the persistence layer; the project and issue |
+        services are used. |
+ |
+  Returns: |
+    Whatever services.issue.ResolveIssueRefs returns for the parsed refs. |
+ |
+  Raises: |
+    ValueError: if a ref has no project prefix and the search is not limited |
+        to exactly one project.  int() also raises ValueError if a parsed |
+        local ID is not numeric. |
+  """ |
+  # Get {project_name: project} for all projects in project_ids. |
+  ids_to_projects = services.project.GetProjects(cnxn, project_ids) |
+  # values() rather than itervalues() so this works on Python 2 and 3. |
+  ref_projects = {pb.project_name: pb for pb in ids_to_projects.values()} |
+  # Populate default_project_name if there is only one project id provided. |
+  default_project_name = None |
+  if len(ref_projects) == 1: |
+    # The only key is the name of the only project; a dict view cannot be |
+    # indexed on Python 3, so take it through an iterator instead. |
+    default_project_name = next(iter(ref_projects)) |
+ |
+  # Populate refs with (project_name, local_id) pairs. |
+  refs = [] |
+  for val in cond.str_values: |
+    project_name, local_id = tracker_bizobj.ParseIssueRef(val) |
+    if not project_name: |
+      if not default_project_name: |
+        # TODO(rmistry): Support the below. |
+        raise ValueError( |
+            'Searching for issues accross multiple/all projects without ' |
+            'project prefixes is ambiguous and is currently not supported.') |
+      project_name = default_project_name |
+    refs.append((project_name, int(local_id))) |
+ |
+  return services.issue.ResolveIssueRefs( |
+      cnxn, ref_projects, default_project_name, refs) |
+ |
+ |
+def _PreprocessStatusCond( |
+    cnxn, cond, project_ids, services, _harmonized_config): |
+  """Rewrite a status=names cond as a status_id=IDs cond. |
+ |
+  Status names are looked up per project when project_ids is given, and |
+  across any project otherwise. |
+  """ |
+  config_svc = services.config |
+  if not project_ids: |
+    status_ids = config_svc.LookupStatusIDsAnyProject(cnxn, cond.str_values) |
+  else: |
+    status_ids = [] |
+    for pid in project_ids: |
+      status_ids.extend( |
+          config_svc.LookupStatusIDs(cnxn, pid, cond.str_values)) |
+ |
+  int_op = _TextOpToIntOp(cond.op) |
+  status_id_field = query2ast.BUILTIN_ISSUE_FIELDS['status_id'] |
+  return ast_pb2.Condition( |
+      op=int_op, field_defs=[status_id_field], int_values=status_ids) |
+ |
+ |
+def _IsEqualityOp(op): |
+  """Tell whether op compares whole values, i.e., is EQ or NE.""" |
+  equality_ops = (ast_pb2.QueryOp.EQ, ast_pb2.QueryOp.NE) |
+  return op in equality_ops |
+ |
+ |
+def _IsDefinedOp(op): |
+  """Tell whether op is IS_DEFINED or IS_NOT_DEFINED.""" |
+  defined_ops = ( |
+      ast_pb2.QueryOp.IS_DEFINED, ast_pb2.QueryOp.IS_NOT_DEFINED) |
+  return op in defined_ops |
+ |
+ |
+def _TextOpToIntOp(op): |
+  """Map a text-matching op onto the equality op used for ID matching. |
+ |
+  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and any other op |
+  is returned unchanged. |
+  """ |
+  if op in (ast_pb2.QueryOp.TEXT_HAS, ast_pb2.QueryOp.KEY_HAS): |
+    return ast_pb2.QueryOp.EQ |
+  if op == ast_pb2.QueryOp.NOT_TEXT_HAS: |
+    return ast_pb2.QueryOp.NE |
+  return op |
+ |
+ |
+def _MakePrefixRegex(cond): |
+  """Build a case-insensitive regex for strings prefixed by a cond value. |
+ |
+  The pattern is any one of cond.str_values (escaped), then a dash, then at |
+  least one more character. |
+  """ |
+  escaped_prefixes = [re.escape(prefix) for prefix in cond.str_values] |
+  pattern = r'(%s)-.+' % '|'.join(escaped_prefixes) |
+  return re.compile(pattern, re.IGNORECASE) |
+ |
+ |
+def _MakeKeyValueRegex(cond): |
+  """Return a regex to match the first token and remaining text separately. |
+ |
+  Each of cond.str_values is split at its first dash into a key and a value. |
+  The regex matches, ignoring case, a string that starts with the shared key |
+  and a dash and that contains any one of the values as a whole word. |
+ |
+  Args: |
+    cond: object whose str_values are 'key-value' strings sharing one key. |
+ |
+  Returns: |
+    A compiled case-insensitive regex; group 1 is the value that matched. |
+ |
+  Raises: |
+    ValueError: if there are no values, if a value contains no dash, or if |
+        the values do not all have the same key. |
+  """ |
+  pairs = [val.split('-', 1) for val in cond.str_values] |
+  # Report malformed input explicitly rather than through an opaque |
+  # tuple-unpacking error. |
+  if not pairs or any(len(pair) != 2 for pair in pairs): |
+    raise ValueError( |
+        'KeyValue query needs key-value terms: %r' % cond.str_values) |
+  keys = [key for key, _ in pairs] |
+  if len(set(keys)) != 1: |
+    raise ValueError( |
+        "KeyValue query with multiple different keys: %r" % cond.str_values) |
+  all_values = '|'.join(re.escape(value) for _, value in pairs) |
+  # The key is user text too, so escape it to match it literally. |
+  return re.compile( |
+      r'%s-.*\b(%s)\b.*' % (re.escape(keys[0]), all_values), re.I) |
+ |
+ |
+def _MakeWordBoundaryRegex(cond): |
+  """Build a case-insensitive regex that finds a cond value as a whole word. |
+ |
+  The pattern allows any text before and after the word, and group 1 is the |
+  word that was found. |
+  """ |
+  escaped_words = [re.escape(word) for word in cond.str_values] |
+  pattern = r'.*\b(%s)\b.*' % '|'.join(escaped_words) |
+  return re.compile(pattern, re.IGNORECASE) |
+ |
+ |
+def _MakeLabelRegex(cond): |
+  """Return the regex that selects labels for a non-equality label cond.""" |
+  if _IsDefinedOp(cond.op): |
+    return _MakePrefixRegex(cond) |
+  if cond.op == ast_pb2.QueryOp.KEY_HAS: |
+    return _MakeKeyValueRegex(cond) |
+  return _MakeWordBoundaryRegex(cond) |
+ |
+ |
+def _PreprocessLabelCond( |
+    cnxn, cond, project_ids, services, _harmonized_config): |
+  """Preprocess a label=names cond into label_id=IDs. |
+ |
+  EQ and NE conds look labels up by exact name.  Every other op looks labels |
+  up with a regex: a prefix regex for IS_DEFINED and IS_NOT_DEFINED, a |
+  key-value regex for KEY_HAS, and a whole-word regex otherwise. |
+ |
+  Args: |
+    cnxn: connection to SQL database. |
+    cond: original parsed query Condition PB on the label field. |
+    project_ids: IDs of the projects being searched; when empty, labels are |
+        looked up across any project. |
+    services: connections to the persistence layer; only config is used. |
+    _harmonized_config: unused. |
+ |
+  Returns: |
+    A new Condition PB on label_id with the matching label IDs, using the op |
+    given by _TextOpToIntOp. |
+  """ |
+  exact = _IsEqualityOp(cond.op) |
+  # The regex depends only on the cond, so build it once rather than once |
+  # for every project. |
+  label_regex = None if exact else _MakeLabelRegex(cond) |
+  config_svc = services.config |
+ |
+  if project_ids: |
+    label_ids = [] |
+    for project_id in project_ids: |
+      if exact: |
+        label_ids.extend(config_svc.LookupLabelIDs( |
+            cnxn, project_id, cond.str_values)) |
+      else: |
+        label_ids.extend(config_svc.LookupIDsOfLabelsMatching( |
+            cnxn, project_id, label_regex)) |
+  elif exact: |
+    label_ids = config_svc.LookupLabelIDsAnyProject(cnxn, cond.str_values) |
+  else: |
+    label_ids = config_svc.LookupIDsOfLabelsMatchingAnyProject( |
+        cnxn, label_regex) |
+ |
+  return ast_pb2.Condition( |
+      op=_TextOpToIntOp(cond.op), |
+      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']], |
+      int_values=label_ids) |
+ |
+ |
+def _PreprocessComponentCond( |
+    cnxn, cond, project_ids, services, harmonized_config): |
+  """Rewrite a component= or component:name cond as component_id=IDs. |
+ |
+  Matching is exact for EQ and NE conds and non-exact otherwise. |
+  """ |
+  exact = _IsEqualityOp(cond.op) |
+  if not project_ids: |
+    # Site-wide search: there is no harmonized_config to consult, so ask |
+    # the config service instead. |
+    component_ids = services.config.FindMatchingComponentIDsAnyProject( |
+        cnxn, cond.str_values, exact=exact) |
+  else: |
+    # Search within specific projects: harmonized_config holds the config |
+    # data for all of them. |
+    component_ids = [] |
+    for comp_path in cond.str_values: |
+      matching_ids = tracker_bizobj.FindMatchingComponentIDs( |
+          comp_path, harmonized_config, exact=exact) |
+      component_ids.extend(matching_ids) |
+ |
+  int_op = _TextOpToIntOp(cond.op) |
+  component_id_field = query2ast.BUILTIN_ISSUE_FIELDS['component_id'] |
+  return ast_pb2.Condition( |
+      op=int_op, field_defs=[component_id_field], |
+      int_values=component_ids) |
+ |
+ |
+def _PreprocessExactUsers(cnxn, cond, user_service, id_fields): |
+  """Rewrite a foo=emails cond as foo_id=IDs when every user is resolvable. |
+ |
+  This preprocessing step turns string conditions into int ID conditions, |
+  e.g., [owner=email] into [owner_id=ID].  A value is converted when it is |
+  (a) a string of digits, which is what the "me" keyword has already been |
+  replaced with earlier in the search pipeline, or (b) something like |
+  "user@domain" that resolves to a known Monorail user.  Substring searches |
+  such as [owner:substring] are also possible, but they remain searches on |
+  the string field rather than the ID field, and they cannot be combined |
+  with the "me" keyword. |
+ |
+  Args: |
+    cnxn: connection to the DB. |
+    cond: original parsed query Condition PB. |
+    user_service: connection to user persistence layer. |
+    id_fields: list of the search fields to use if the conversion to IDs |
+        succeeds. |
+ |
+  Returns: |
+    A new Condition PB that checks id_fields, or the original cond if the |
+    op does not compare whole values or some user could not be looked up. |
+  """ |
+  int_op = _TextOpToIntOp(cond.op) |
+ |
+  # Testing for any defined value needs no ID lookups at all. |
+  if _IsDefinedOp(int_op): |
+    return ast_pb2.Condition(op=int_op, field_defs=id_fields) |
+ |
+  # Only ops that compare whole values, not substrings, can use IDs. |
+  if not _IsEqualityOp(int_op): |
+    logging.info('could not convert to IDs because op is %r', int_op) |
+    return cond |
+ |
+  resolved_ids = [] |
+  for user_str in cond.str_values: |
+    try: |
+      user_id = int(user_str) |
+    except ValueError: |
+      # Not a string of digits, so look it up as a user instead. |
+      try: |
+        user_id = user_service.LookupUserID(cnxn, user_str) |
+      except user_svc.NoSuchUserException: |
+        logging.info('could not convert user %r to int ID', user_str) |
+        return cond  # preprocessing failed, stick with the original cond. |
+    resolved_ids.append(user_id) |
+ |
+  return ast_pb2.Condition( |
+      op=int_op, field_defs=id_fields, int_values=resolved_ids) |
+ |
+ |
+def _PreprocessOwnerCond( |
+    cnxn, cond, _project_ids, services, _harmonized_config): |
+  """Rewrite an owner=emails cond as owner_id=IDs, if exact user match.""" |
+  user_service = services.user |
+  owner_id_field = query2ast.BUILTIN_ISSUE_FIELDS['owner_id'] |
+  return _PreprocessExactUsers(cnxn, cond, user_service, [owner_id_field]) |
+ |
+ |
+def _PreprocessCcCond( |
+    cnxn, cond, _project_ids, services, _harmonized_config): |
+  """Rewrite a cc=emails cond as cc_id=IDs, if exact user match.""" |
+  user_service = services.user |
+  cc_id_field = query2ast.BUILTIN_ISSUE_FIELDS['cc_id'] |
+  return _PreprocessExactUsers(cnxn, cond, user_service, [cc_id_field]) |
+ |
+ |
+def _PreprocessReporterCond( |
+    cnxn, cond, _project_ids, services, _harmonized_config): |
+  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact.""" |
+  user_service = services.user |
+  reporter_id_field = query2ast.BUILTIN_ISSUE_FIELDS['reporter_id'] |
+  return _PreprocessExactUsers( |
+      cnxn, cond, user_service, [reporter_id_field]) |
+ |
+ |
+def _PreprocessStarredByCond( |
+    cnxn, cond, _project_ids, services, _harmonized_config): |
+  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact.""" |
+  user_service = services.user |
+  starredby_id_field = query2ast.BUILTIN_ISSUE_FIELDS['starredby_id'] |
+  return _PreprocessExactUsers( |
+      cnxn, cond, user_service, [starredby_id_field]) |
+ |
+ |
+def _PreprocessCommentByCond( |
+    cnxn, cond, _project_ids, services, _harmonized_config): |
+  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact.""" |
+  user_service = services.user |
+  commentby_id_field = query2ast.BUILTIN_ISSUE_FIELDS['commentby_id'] |
+  return _PreprocessExactUsers( |
+      cnxn, cond, user_service, [commentby_id_field]) |
+ |
+ |
+def _PreprocessCustomCond(cnxn, cond, services): |
+  """Rewrite a custom_user_field=emails cond as IDs, if exact matches. |
+ |
+  Conds that name no USER_TYPE field are returned unchanged. |
+  """ |
+  # TODO(jrobbins): better support for ambiguous fields. |
+  # For now, if any field is USER_TYPE and the value being searched |
+  # for is the email address of an existing account, it will convert |
+  # to a user ID and we go with exact ID matching.  Otherwise, we |
+  # leave the cond as-is for ast2select to do string matching on. |
+  user_field_defs = [] |
+  for fd in cond.field_defs: |
+    if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE: |
+      user_field_defs.append(fd) |
+ |
+  if not user_field_defs: |
+    return cond |
+  return _PreprocessExactUsers(cnxn, cond, services.user, user_field_defs) |
+ |
+ |
+# Maps the name of a built-in search field to the function that preprocesses |
+# conds on that field.  Each function is called with |
+# (cnxn, cond, project_ids, services, harmonized_config). |
+_PREPROCESSORS = { |
+    # Conds rewritten onto status, label, and component IDs. |
+    'open': _PreprocessIsOpenCond, |
+    'status': _PreprocessStatusCond, |
+    'label': _PreprocessLabelCond, |
+    'component': _PreprocessComponentCond, |
+    # Conds about issues that block one another. |
+    'blocked': _PreprocessIsBlockedCond, |
+    'blockedon': _PreprocessBlockedOnCond, |
+    'blocking': _PreprocessBlockingCond, |
+    # Conds rewritten onto user IDs. |
+    'owner': _PreprocessOwnerCond, |
+    'cc': _PreprocessCcCond, |
+    'reporter': _PreprocessReporterCond, |
+    'starredby': _PreprocessStarredByCond, |
+    'commentby': _PreprocessCommentByCond, |
+    } |
+ |
+ |
+def _PreprocessCond( |
+    cnxn, cond, project_ids, services, harmonized_config): |
+  """Preprocess one cond by looking up the IDs of the things it names. |
+ |
+  Conds on custom fields go to _PreprocessCustomCond.  Conds on built-in |
+  fields go to the matching entry of _PREPROCESSORS, and are returned |
+  unchanged when there is no such entry. |
+  """ |
+  # A cond is parsed from a single user query term, and the term syntax |
+  # allows just one field name, so all of its fields share that name. |
+  term_name = cond.field_defs[0].field_name |
+  assert all(fd.field_name == term_name for fd in cond.field_defs) |
+ |
+  # Case 1: The user is searching custom fields. |
+  has_field_id = [bool(fd.field_id) for fd in cond.field_defs] |
+  if any(has_field_id): |
+    # Built-in field names are reserved and take priority over any |
+    # conflicting ones, so custom and built-in fields are never mixed. |
+    assert all(has_field_id) |
+    return _PreprocessCustomCond(cnxn, cond, services) |
+ |
+  # Case 2: The user is searching a built-in field. |
+  preprocessor = _PREPROCESSORS.get(term_name) |
+  if not preprocessor: |
+    # We don't have a preprocessor for it. |
+    return cond |
+  # We have a preprocessor for that built-in field. |
+  return preprocessor( |
+      cnxn, cond, project_ids, services, harmonized_config) |