OLD | NEW |
(Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style |
| 3 # license that can be found in the LICENSE file or at |
| 4 # https://developers.google.com/open-source/licenses/bsd |
| 5 |
| 6 """Convert a user's issue search AST into a simplified AST. |
| 7 |
| 8 This phase of query processing simplifies the user's query by looking up |
| 9 the int IDs of any labels, statuses, or components that are mentioned by |
| 10 name in the original query. The data needed for lookups is typically cached |
| 11 in RAM in each backend job, so this will not put much load on the DB. The |
| 12 simplified ASTs are later converted into SQL which is simpler and has |
| 13 fewer joins. |
| 14 |
| 15 The simplified main query is better because: |
| 16 + It is clearly faster, especially in the most common case where config |
| 17 data is in RAM. |
| 18 + Since less RAM is used to process the main query on each shard, query |
| 19 execution time is more consistent with less variability under load. Less |
| 20 variability is good because the user must wait for the slowest shard. |
| 21 + The config tables (LabelDef, StatusDef, etc.) exist only on the master, so |
| 22 they cannot be mentioned in a query that runs on a shard. |
| 23 + The query string itself is shorter when numeric IDs are substituted, which |
| 24 means that we can handle user queries with long lists of labels in a |
| 25 reasonable-sized query. |
| 26 + It bisects the complexity of the operation: it's easier to test and debug |
| 27 the lookup and simplification logic plus the main query logic this way |
| 28 than it would be to deal with an even more complex SQL main query. |
| 29 """ |
| 30 |
| 31 import logging |
| 32 import re |
| 33 |
| 34 from proto import ast_pb2 |
| 35 from proto import tracker_pb2 |
| 36 # TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could |
| 37 # remove this dep. |
| 38 from search import query2ast |
| 39 from services import user_svc |
| 40 from tracker import tracker_bizobj |
| 41 |
| 42 |
def PreprocessAST(
    cnxn, query_ast, project_ids, services, harmonized_config):
  """Build a simplified copy of the user's query AST.

  Every condition of every conjunction is passed through _PreprocessCond,
  and the results are assembled into a new QueryAST with the same shape.

  Args:
    cnxn: connection to SQL database.
    query_ast: user query abstract syntax tree parsed by query2ast.py.
    project_ids: collection of int project IDs to use to look up status values
        and labels.
    services: Connections to persistence layer for users and configs.
    harmonized_config: harmonized config for all projects being searched.

  Returns:
    A new QueryAST PB with simplified conditions.  Specifically, string values
    for labels, statuses, and components are replaced with the int IDs of
    those items.  Also, is:open is distilled down to
    status_id != closed_status_ids.
  """
  simplified_conjs = []
  for conjunction in query_ast.conjunctions:
    simplified_conds = []
    for original_cond in conjunction.conds:
      simplified_conds.append(_PreprocessCond(
          cnxn, original_cond, project_ids, services, harmonized_config))
    simplified_conjs.append(ast_pb2.Conjunction(conds=simplified_conds))

  return ast_pb2.QueryAST(conjunctions=simplified_conjs)
| 70 |
| 71 |
def _PreprocessIsOpenCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Rewrite an is:open cond as a status_id comparison with closed IDs."""
  config_svc = services.config
  if not project_ids:
    # No specific projects: use the closed statuses of any project.
    closed_status_ids = config_svc.LookupClosedStatusIDsAnyProject(cnxn)
  else:
    closed_status_ids = []
    for pid in project_ids:
      closed_status_ids.extend(config_svc.LookupClosedStatusIDs(cnxn, pid))

  # A truthy first int value selects issues whose status is NOT a closed one;
  # a falsy value selects issues whose status IS a closed one.
  if cond.int_values[0]:
    status_op = ast_pb2.QueryOp.NE
  else:
    status_op = ast_pb2.QueryOp.EQ

  status_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=status_op, field_defs=[status_id_fd], int_values=closed_status_ids)
| 88 |
| 89 |
def _PreprocessIsBlockedCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config):
  """Rewrite an is:blocked cond as a definedness test on blockedon_id."""
  # Truthy first int value: blockedon_id must be defined; otherwise it must
  # not be defined.
  if cond.int_values[0]:
    defined_op = ast_pb2.QueryOp.IS_DEFINED
  else:
    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED

  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_fd])
| 97 |
| 98 |
def _PreprocessBlockedOnCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Rewrite blockedon=xyz and has:blockedon conds on blockedon_id.

  A blockedon=xyz cond becomes blockedon_id:issue_ids.
  A has:blockedon cond becomes a test for issues that are blocked on other
  issues.
  """
  resolved_issue_ids = _GetIssueIDsFromLocalIdsCond(
      cnxn, cond, project_ids, services)
  int_op = _TextOpToIntOp(cond.op)
  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[blockedon_fd], int_values=resolved_issue_ids)
| 111 |
| 112 |
def _PreprocessBlockingCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Rewrite blocking=xyz and has:blocking conds on blocking_id.

  A blocking=xyz cond becomes blocking_id:issue_ids.
  A has:blocking cond becomes a test for issues that are blocking other
  issues.
  """
  resolved_issue_ids = _GetIssueIDsFromLocalIdsCond(
      cnxn, cond, project_ids, services)
  int_op = _TextOpToIntOp(cond.op)
  blocking_fd = query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[blocking_fd], int_values=resolved_issue_ids)
| 125 |
| 126 |
| 127 def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services): |
| 128 """Returns global IDs from the local IDs provided in the cond.""" |
| 129 # Get {project_name: project} for all projects in project_ids. |
| 130 ids_to_projects = services.project.GetProjects(cnxn, project_ids) |
| 131 ref_projects = {pb.project_name: pb for pb in ids_to_projects.itervalues()} |
| 132 # Populate default_project_name if there is only one project id provided. |
| 133 default_project_name = None |
| 134 if len(ref_projects) == 1: |
| 135 default_project_name = ref_projects.values()[0].project_name |
| 136 |
| 137 # Populate refs with (project_name, local_id) pairs. |
| 138 refs = [] |
| 139 for val in cond.str_values: |
| 140 project_name, local_id = tracker_bizobj.ParseIssueRef(val) |
| 141 if not project_name: |
| 142 if not default_project_name: |
| 143 # TODO(rmistry): Support the below. |
| 144 raise ValueError( |
| 145 'Searching for issues accross multiple/all projects without ' |
| 146 'project prefixes is ambiguous and is currently not supported.') |
| 147 project_name = default_project_name |
| 148 refs.append((project_name, int(local_id))) |
| 149 |
| 150 return services.issue.ResolveIssueRefs( |
| 151 cnxn, ref_projects, default_project_name, refs) |
| 152 |
| 153 |
def _PreprocessStatusCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Rewrite a status=names cond as status_id=IDs."""
  config_svc = services.config
  if not project_ids:
    # No specific projects: look the names up across any project.
    status_ids = config_svc.LookupStatusIDsAnyProject(cnxn, cond.str_values)
  else:
    status_ids = []
    for pid in project_ids:
      status_ids.extend(
          config_svc.LookupStatusIDs(cnxn, pid, cond.str_values))

  int_op = _TextOpToIntOp(cond.op)
  status_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[status_id_fd], int_values=status_ids)
| 170 |
| 171 |
def _IsEqualityOp(op):
  """Tell whether op is EQ or NE."""
  return op == ast_pb2.QueryOp.EQ or op == ast_pb2.QueryOp.NE
| 175 |
| 176 |
def _IsDefinedOp(op):
  """Tell whether op is IS_DEFINED or IS_NOT_DEFINED."""
  return (op == ast_pb2.QueryOp.IS_DEFINED or
          op == ast_pb2.QueryOp.IS_NOT_DEFINED)
| 180 |
| 181 |
def _TextOpToIntOp(op):
  """Map a text-matching op to the equality op used for ID matching.

  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and every other
  op is returned unchanged.
  """
  query_op = ast_pb2.QueryOp
  if op in (query_op.TEXT_HAS, query_op.KEY_HAS):
    return query_op.EQ
  if op == query_op.NOT_TEXT_HAS:
    return query_op.NE
  return op
| 189 |
| 190 |
| 191 def _MakePrefixRegex(cond): |
| 192 """Return a regex to match strings that start with cond values.""" |
| 193 all_prefixes = '|'.join(map(re.escape, cond.str_values)) |
| 194 return re.compile(r'(%s)-.+' % all_prefixes, re.I) |
| 195 |
| 196 |
| 197 def _MakeKeyValueRegex(cond): |
| 198 """Return a regex to match the first token and remaining text separately.""" |
| 199 keys, values = zip(*map(lambda x: x.split('-', 1), cond.str_values)) |
| 200 if len(set(keys)) != 1: |
| 201 raise ValueError( |
| 202 "KeyValue query with multiple different keys: %r" % cond.str_values) |
| 203 all_values = '|'.join(map(re.escape, values)) |
| 204 return re.compile(r'%s-.*\b(%s)\b.*' % (keys[0], all_values), re.I) |
| 205 |
| 206 |
| 207 def _MakeWordBoundaryRegex(cond): |
| 208 """Return a regex to match the cond values as whole words.""" |
| 209 all_words = '|'.join(map(re.escape, cond.str_values)) |
| 210 return re.compile(r'.*\b(%s)\b.*' % all_words, re.I) |
| 211 |
| 212 |
def _PreprocessLabelCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Preprocess a label=names cond into label_id=IDs.

  The lookup strategy depends on cond.op:
    + EQ / NE: look up the IDs of the exactly named labels.
    + IS_DEFINED / IS_NOT_DEFINED: labels matching the prefix regex.
    + KEY_HAS: labels matching the key-value regex.
    + anything else: labels matching the whole-word regex.

  Args:
    cnxn: connection to SQL database.
    cond: original parsed query Condition PB on the label field.
    project_ids: collection of int project IDs being searched; when empty,
        labels are looked up across any project.
    services: connections to persistence layer; only services.config is used.
    _harmonized_config: unused.

  Returns:
    A new Condition PB on label_id whose int_values are the label IDs found.
  """
  # Decide once how labels are matched.  The regex depends only on cond, so
  # it is built a single time instead of once per project.  None means
  # "look up by exact name".
  if _IsEqualityOp(cond.op):
    label_regex = None
  elif _IsDefinedOp(cond.op):
    label_regex = _MakePrefixRegex(cond)
  elif cond.op == ast_pb2.QueryOp.KEY_HAS:
    label_regex = _MakeKeyValueRegex(cond)
  else:
    label_regex = _MakeWordBoundaryRegex(cond)

  config_svc = services.config
  if project_ids:
    label_ids = []
    for project_id in project_ids:
      if label_regex is None:
        label_ids.extend(config_svc.LookupLabelIDs(
            cnxn, project_id, cond.str_values))
      else:
        label_ids.extend(config_svc.LookupIDsOfLabelsMatching(
            cnxn, project_id, label_regex))
  elif label_regex is None:
    label_ids = config_svc.LookupLabelIDsAnyProject(cnxn, cond.str_values)
  else:
    label_ids = config_svc.LookupIDsOfLabelsMatchingAnyProject(
        cnxn, label_regex)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
      int_values=label_ids)
| 249 |
| 250 |
def _PreprocessComponentCond(
    cnxn, cond, project_ids, services, harmonized_config):
  """Rewrite a component= or component:name cond as component_id=IDs."""
  # EQ and NE ask for exact path matches; other ops do not.
  exact = _IsEqualityOp(cond.op)

  if not project_ids:
    # We are searching across the whole site, so we have no harmonized_config
    # to use.
    component_ids = services.config.FindMatchingComponentIDsAnyProject(
        cnxn, cond.str_values, exact=exact)
  else:
    # We are searching within specific projects, so harmonized_config
    # holds the config data for all those projects.
    component_ids = []
    for path in cond.str_values:
      matching_ids = tracker_bizobj.FindMatchingComponentIDs(
          path, harmonized_config, exact=exact)
      component_ids.extend(matching_ids)

  int_op = _TextOpToIntOp(cond.op)
  component_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['component_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[component_id_fd], int_values=component_ids)
| 272 |
| 273 |
def _PreprocessExactUsers(cnxn, cond, user_service, id_fields):
  """Rewrite a foo=emails cond as foo_id=IDs, if it is an exact user match.

  This preprocessing step converts string conditions to int ID conditions.
  E.g., [owner=email] to [owner_id=ID].  It only does it in cases
  where (a) the email was "me", so it was already converted to a string of
  digits in the search pipeline, or (b) it is "user@domain" which resolves to
  a known Monorail user.  It is also possible to search for, e.g.,
  [owner:substring], but such searches remain 'owner' field searches rather
  than 'owner_id', and they cannot be combined with the "me" keyword.

  Args:
    cnxn: connection to the DB.
    cond: original parsed query Condition PB.
    user_service: connection to user persistence layer.
    id_fields: list of the search fields to use if the conversion to IDs
        succeeds.

  Returns:
    A new Condition PB that checks the id_field.  Or, the original cond.
  """
  int_op = _TextOpToIntOp(cond.op)

  # Testing for any defined value needs no ID lookups at all.
  if _IsDefinedOp(int_op):
    return ast_pb2.Condition(op=int_op, field_defs=id_fields)

  # Only ops that compare whole values can be converted; substring ops
  # keep the original cond.
  if not _IsEqualityOp(int_op):
    logging.info('could not convert to IDs because op is %r', int_op)
    return cond

  resolved_ids = []
  for email_or_id in cond.str_values:
    try:
      # A string of digits is already a user ID.
      resolved = int(email_or_id)
    except ValueError:
      try:
        resolved = user_service.LookupUserID(cnxn, email_or_id)
      except user_svc.NoSuchUserException:
        logging.info('could not convert user %r to int ID', email_or_id)
        return cond  # preprocessing failed, stick with the original cond.
    resolved_ids.append(resolved)

  return ast_pb2.Condition(
      op=int_op, field_defs=id_fields, int_values=resolved_ids)
| 318 |
| 319 |
def _PreprocessOwnerCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite an owner=emails cond as owner_id=IDs, if exact user match."""
  owner_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['owner_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [owner_id_fd])
| 325 |
| 326 |
def _PreprocessCcCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a cc=emails cond as cc_id=IDs, if exact user match."""
  cc_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['cc_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [cc_id_fd])
| 332 |
| 333 |
def _PreprocessReporterCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact."""
  reporter_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [reporter_id_fd])
| 340 |
| 341 |
def _PreprocessStarredByCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact."""
  starredby_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [starredby_id_fd])
| 348 |
| 349 |
def _PreprocessCommentByCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact."""
  commentby_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [commentby_id_fd])
| 356 |
| 357 |
def _PreprocessCustomCond(cnxn, cond, services):
  """Rewrite a custom_user_field=emails cond to use IDs, if exact matches."""
  # TODO(jrobbins): better support for ambiguous fields.
  # For now, if any field is USER_TYPE and the value being searched
  # for is the email address of an existing account, it will convert
  # to a user ID and we go with exact ID matching.  Otherwise, we
  # leave the cond as-is for ast2select to do string matching on.
  user_field_defs = []
  for fd in cond.field_defs:
    if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE:
      user_field_defs.append(fd)

  if not user_field_defs:
    return cond
  return _PreprocessExactUsers(cnxn, cond, services.user, user_field_defs)
| 371 |
| 372 |
# Maps a built-in field name to the function that preprocesses conds on that
# field.  Each function is called by _PreprocessCond as
# f(cnxn, cond, project_ids, services, harmonized_config).
_PREPROCESSORS = {
    # Issue state and blocking relationships.
    'open': _PreprocessIsOpenCond,
    'blocked': _PreprocessIsBlockedCond,
    'blockedon': _PreprocessBlockedOnCond,
    'blocking': _PreprocessBlockingCond,

    # Names that are looked up as status, label and component IDs.
    'status': _PreprocessStatusCond,
    'label': _PreprocessLabelCond,
    'component': _PreprocessComponentCond,

    # User fields, converted to user IDs on exact matches.
    'owner': _PreprocessOwnerCond,
    'cc': _PreprocessCcCond,
    'reporter': _PreprocessReporterCond,
    'starredby': _PreprocessStarredByCond,
    'commentby': _PreprocessCommentByCond,
    }
| 387 |
| 388 |
def _PreprocessCond(
    cnxn, cond, project_ids, services, harmonized_config):
  """Simplify one cond by dispatching on the field it searches.

  Conds on custom fields go to _PreprocessCustomCond.  Conds on built-in
  fields go to the matching entry of _PREPROCESSORS, if there is one.  Any
  other cond is returned unchanged.
  """
  field_defs = cond.field_defs
  # All the fields in a cond share the same name because they are parsed
  # from a user query term, and the term syntax allows just one field name.
  field_name = field_defs[0].field_name
  assert all(fd.field_name == field_name for fd in field_defs)

  # Case 1: The user is searching custom fields.
  if any(fd.field_id for fd in field_defs):
    # There can't be a mix of custom and built-in fields because built-in
    # field names are reserved and take priority over any conflicting ones.
    assert all(fd.field_id for fd in field_defs)
    return _PreprocessCustomCond(cnxn, cond, services)

  # Case 2: The user is searching a built-in field.
  preproc = _PREPROCESSORS.get(field_name)
  if not preproc:
    # We don't have a preprocessor for it.
    return cond
  # We have a preprocessor for that built-in field.
  return preproc(cnxn, cond, project_ids, services, harmonized_config)
OLD | NEW |