Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(288)

Side by Side Diff: appengine/monorail/search/ast2ast.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « appengine/monorail/search/__init__.py ('k') | appengine/monorail/search/ast2select.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style
3 # license that can be found in the LICENSE file or at
4 # https://developers.google.com/open-source/licenses/bsd
5
6 """Convert a user's issue search AST into a simplified AST.
7
8 This phase of query processing simplifies the user's query by looking up
9 the int IDs of any labels, statuses, or components that are mentioned by
10 name in the original query. The data needed for lookups is typically cached
11 in RAM in each backend job, so this will not put much load on the DB. The
12 simplified ASTs are later converted into SQL which is simpler and has
13 fewer joins.
14
15 The simplified main query is better because:
16 + It is clearly faster, especially in the most common case where config
17 data is in RAM.
18 + Since less RAM is used to process the main query on each shard, query
19 execution time is more consistent with less variability under load. Less
20 variability is good because the user must wait for the slowest shard.
21 + The config tables (LabelDef, StatusDef, etc.) exist only on the master, so
22 they cannot be mentioned in a query that runs on a shard.
23 + The query string itself is shorter when numeric IDs are substituted, which
24 means that we can handle user queries with long lists of labels in a
25 reasonable-sized query.
26 + It bisects the complexity of the operation: it's easier to test and debug
27 the lookup and simplification logic plus the main query logic this way
28 than it would be to deal with an even more complex SQL main query.
29 """
30
31 import logging
32 import re
33
34 from proto import ast_pb2
35 from proto import tracker_pb2
36 # TODO(jrobbins): if BUILTIN_ISSUE_FIELDS was passed through, I could
37 # remove this dep.
38 from search import query2ast
39 from services import user_svc
40 from tracker import tracker_bizobj
41
42
def PreprocessAST(
    cnxn, query_ast, project_ids, services, harmonized_config):
  """Build a simplified copy of the user's parsed query.

  Each condition of each conjunction in query_ast is run through
  _PreprocessCond, and the results are assembled into a new QueryAST that
  has one Conjunction per input conjunction, with conditions kept in
  their original order.

  Args:
    cnxn: connection to SQL database.
    query_ast: user query abstract syntax tree parsed by query2ast.py.
    project_ids: collection of int project IDs to use to look up status
        values and labels.
    services: Connections to persistence layer for users and configs.
    harmonized_config: harmonized config for all projects being searched.

  Returns:
    A new QueryAST PB with simplified conditions. Specifically, string
    values for labels, statuses, and components are replaced with the int
    IDs of those items. Also, is:open is distilled down to
    status_id != closed_status_ids.
  """
  def _SimplifyConjunction(conj):
    """Return a new Conjunction holding the preprocessed conds of conj."""
    simplified_conds = []
    for cond in conj.conds:
      simplified_conds.append(_PreprocessCond(
          cnxn, cond, project_ids, services, harmonized_config))
    return ast_pb2.Conjunction(conds=simplified_conds)

  return ast_pb2.QueryAST(
      conjunctions=[_SimplifyConjunction(conj)
                    for conj in query_ast.conjunctions])
70
71
def _PreprocessIsOpenCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Rewrite an is:open cond as a status_id test against closed statuses.

  The closed status IDs are gathered per project when project_ids is
  non-empty, and via the any-project lookup otherwise. A true first int
  value (is:open) yields status_id != closed IDs; a false one yields
  status_id == closed IDs.
  """
  if not project_ids:
    closed_ids = services.config.LookupClosedStatusIDsAnyProject(cnxn)
  else:
    closed_ids = []
    for pid in project_ids:
      closed_ids.extend(
          services.config.LookupClosedStatusIDs(cnxn, pid))

  if cond.int_values[0]:
    # Open issues are the ones whose status is NOT one of the closed ones.
    status_op = ast_pb2.QueryOp.NE
  else:
    status_op = ast_pb2.QueryOp.EQ

  return ast_pb2.Condition(
      op=status_op,
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['status_id']],
      int_values=closed_ids)
88
89
def _PreprocessIsBlockedCond(
    _cnxn, cond, _project_ids, _services, _harmonized_config):
  """Rewrite an is:blocked cond as a defined-ness test on blockedon_id.

  A true first int value becomes IS_DEFINED; a false one becomes
  IS_NOT_DEFINED.
  """
  if cond.int_values[0]:
    defined_op = ast_pb2.QueryOp.IS_DEFINED
  else:
    defined_op = ast_pb2.QueryOp.IS_NOT_DEFINED

  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(op=defined_op, field_defs=[blockedon_fd])
97
98
def _PreprocessBlockedOnCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Preprocess blockedon=xyz and has:blockedon conds.

  A blockedon=xyz cond becomes blockedon_id:issue_ids, where the issue
  IDs are resolved from the local-ID references in the cond.
  A has:blockedon cond becomes a test for issues that are blocked on
  other issues.
  """
  global_ids = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  int_op = _TextOpToIntOp(cond.op)
  blockedon_fd = query2ast.BUILTIN_ISSUE_FIELDS['blockedon_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[blockedon_fd], int_values=global_ids)
111
112
def _PreprocessBlockingCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Preprocess blocking=xyz and has:blocking conds.

  A blocking=xyz cond becomes blocking_id:issue_ids, where the issue IDs
  are resolved from the local-ID references in the cond.
  A has:blocking cond becomes a test for issues that are blocking other
  issues.
  """
  global_ids = _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services)
  int_op = _TextOpToIntOp(cond.op)
  blocking_fd = query2ast.BUILTIN_ISSUE_FIELDS['blocking_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[blocking_fd], int_values=global_ids)
125
126
def _GetIssueIDsFromLocalIdsCond(cnxn, cond, project_ids, services):
  """Resolve the issue references in cond.str_values to global issue IDs.

  Args:
    cnxn: connection to SQL database.
    cond: Condition PB whose str_values are issue references that
        tracker_bizobj.ParseIssueRef can split into a project name and a
        local ID.
    project_ids: IDs of the projects being searched.
    services: connections to the persistence layer; the project and issue
        services are used here.

  Returns:
    Whatever services.issue.ResolveIssueRefs returns for the parsed
    (project_name, local_id) pairs.

  Raises:
    ValueError: a reference has no project prefix and there is not exactly
        one project to default to.
  """
  # Index the searched projects by name.
  projects_by_id = services.project.GetProjects(cnxn, project_ids)
  projects_by_name = {}
  for project in projects_by_id.itervalues():
    projects_by_name[project.project_name] = project

  # Unprefixed refs can only default to a project when exactly one is given.
  if len(projects_by_name) == 1:
    default_project_name = projects_by_name.values()[0].project_name
  else:
    default_project_name = None

  # Collect (project_name, local_id) pairs.
  refs = []
  for ref_str in cond.str_values:
    project_name, local_id = tracker_bizobj.ParseIssueRef(ref_str)
    if not project_name and not default_project_name:
      # TODO(rmistry): Support the below.
      raise ValueError(
          'Searching for issues accross multiple/all projects without '
          'project prefixes is ambiguous and is currently not supported.')
    refs.append((project_name or default_project_name, int(local_id)))

  return services.issue.ResolveIssueRefs(
      cnxn, projects_by_name, default_project_name, refs)
152
153
def _PreprocessStatusCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Rewrite a status=names cond as status_id=IDs.

  The status names in cond.str_values are looked up per project when
  project_ids is non-empty, and via the any-project lookup otherwise.
  """
  if not project_ids:
    matching_ids = services.config.LookupStatusIDsAnyProject(
        cnxn, cond.str_values)
  else:
    matching_ids = []
    for pid in project_ids:
      matching_ids.extend(
          services.config.LookupStatusIDs(cnxn, pid, cond.str_values))

  int_op = _TextOpToIntOp(cond.op)
  status_fd = query2ast.BUILTIN_ISSUE_FIELDS['status_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[status_fd], int_values=matching_ids)
170
171
def _IsEqualityOp(op):
  """Tell whether op is one of the whole-value comparisons, EQ or NE."""
  return op == ast_pb2.QueryOp.EQ or op == ast_pb2.QueryOp.NE
175
176
def _IsDefinedOp(op):
  """Tell whether op is IS_DEFINED or IS_NOT_DEFINED."""
  return (op == ast_pb2.QueryOp.IS_DEFINED or
          op == ast_pb2.QueryOp.IS_NOT_DEFINED)
180
181
def _TextOpToIntOp(op):
  """Map a text-matching op to the equality op used for ID matching.

  TEXT_HAS and KEY_HAS become EQ, NOT_TEXT_HAS becomes NE, and any other
  op is returned unchanged.
  """
  if op in (ast_pb2.QueryOp.TEXT_HAS, ast_pb2.QueryOp.KEY_HAS):
    return ast_pb2.QueryOp.EQ
  if op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    return ast_pb2.QueryOp.NE
  return op
189
190
def _MakePrefixRegex(cond):
  """Return a regex for strings that begin with a cond value and a dash.

  The compiled pattern is case-insensitive and requires at least one
  character after the dash that follows the prefix.
  """
  escaped_prefixes = [re.escape(prefix) for prefix in cond.str_values]
  pattern = r'(%s)-.+' % '|'.join(escaped_prefixes)
  return re.compile(pattern, re.I)
195
196
def _MakeKeyValueRegex(cond):
  """Return a regex to match the first token and remaining text separately.

  Each string in cond.str_values is split at its first '-' into a key and
  a value. All strings must share a single key. The resulting
  case-insensitive pattern matches text that starts with that key and a
  dash, and that contains any one of the values as a whole word later on.

  Args:
    cond: Condition PB (or any object) with a str_values sequence of
        'key-value' strings.

  Returns:
    A compiled regular expression object.

  Raises:
    ValueError: a term has no '-' separator, or the terms do not all use
        exactly one key (this includes an empty str_values).
  """
  keys = []
  values = []
  for term in cond.str_values:
    key, dash, value = term.partition('-')
    if not dash:
      # Without this check the failure would surface as an opaque
      # tuple-unpacking ValueError.
      raise ValueError(
          "KeyValue query term has no key-value separator: %r" % term)
    keys.append(key)
    values.append(value)

  if len(set(keys)) != 1:
    raise ValueError(
        "KeyValue query with multiple different keys: %r" % cond.str_values)

  all_values = '|'.join(map(re.escape, values))
  # The key is user-supplied text just like the values, so it must be
  # escaped too; otherwise a key like 'a.b' would also match 'axb'.
  return re.compile(
      r'%s-.*\b(%s)\b.*' % (re.escape(keys[0]), all_values), re.I)
205
206
def _MakeWordBoundaryRegex(cond):
  """Return a regex for text containing any cond value as a whole word.

  The compiled pattern is case-insensitive.
  """
  escaped_words = [re.escape(word) for word in cond.str_values]
  pattern = r'.*\b(%s)\b.*' % '|'.join(escaped_words)
  return re.compile(pattern, re.I)
211
212
def _PreprocessLabelCond(
    cnxn, cond, project_ids, services, _harmonized_config):
  """Preprocess a label=names cond into label_id=IDs.

  How label IDs are found depends on cond.op:
    + EQ / NE: the label names in cond.str_values are looked up directly.
    + IS_DEFINED / IS_NOT_DEFINED: labels matching the prefix regex.
    + KEY_HAS: labels matching the key-value regex.
    + anything else: labels matching the whole-word regex.
  Lookups are done per project when project_ids is non-empty, and across
  any project otherwise.

  Args:
    cnxn: connection to SQL database.
    cond: original parsed query Condition PB for the label field.
    project_ids: collection of int project IDs being searched, possibly
        empty.
    services: connections to the persistence layer; only services.config
        is used.
    _harmonized_config: unused.

  Returns:
    A new Condition PB on the label_id field, whose op is the int
    equivalent of cond.op and whose int_values are the label IDs found.

  Raises:
    ValueError: propagated from _MakeKeyValueRegex for a malformed KEY_HAS
        cond.
  """
  if _IsEqualityOp(cond.op):
    if project_ids:
      label_ids = []
      for project_id in project_ids:
        label_ids.extend(services.config.LookupLabelIDs(
            cnxn, project_id, cond.str_values))
    else:
      label_ids = services.config.LookupLabelIDsAnyProject(
          cnxn, cond.str_values)
  else:
    # The regex depends only on cond, so build and compile it once rather
    # than once per project.
    if _IsDefinedOp(cond.op):
      label_regex = _MakePrefixRegex(cond)
    elif cond.op == ast_pb2.QueryOp.KEY_HAS:
      label_regex = _MakeKeyValueRegex(cond)
    else:
      label_regex = _MakeWordBoundaryRegex(cond)

    if project_ids:
      label_ids = []
      for project_id in project_ids:
        label_ids.extend(services.config.LookupIDsOfLabelsMatching(
            cnxn, project_id, label_regex))
    else:
      label_ids = services.config.LookupIDsOfLabelsMatchingAnyProject(
          cnxn, label_regex)

  return ast_pb2.Condition(
      op=_TextOpToIntOp(cond.op),
      field_defs=[query2ast.BUILTIN_ISSUE_FIELDS['label_id']],
      int_values=label_ids)
249
250
def _PreprocessComponentCond(
    cnxn, cond, project_ids, services, harmonized_config):
  """Rewrite a component= or component:name cond as component_id=IDs.

  Matching is exact when cond.op is EQ or NE, and non-exact otherwise.
  """
  exact = _IsEqualityOp(cond.op)

  if not project_ids:
    # Site-wide search: there is no harmonized_config to consult, so ask
    # the config service instead.
    matching_ids = services.config.FindMatchingComponentIDsAnyProject(
        cnxn, cond.str_values, exact=exact)
  else:
    # Search within specific projects: harmonized_config holds the config
    # data for all of them.
    matching_ids = []
    for comp_path in cond.str_values:
      found = tracker_bizobj.FindMatchingComponentIDs(
          comp_path, harmonized_config, exact=exact)
      matching_ids.extend(found)

  int_op = _TextOpToIntOp(cond.op)
  component_fd = query2ast.BUILTIN_ISSUE_FIELDS['component_id']
  return ast_pb2.Condition(
      op=int_op, field_defs=[component_fd], int_values=matching_ids)
272
273
def _PreprocessExactUsers(cnxn, cond, user_service, id_fields):
  """Preprocess a foo=emails cond into foo_id=IDs, if exact user match.

  This preprocessing step converts string conditions to int ID conditions.
  E.g., [owner=email] to [owner_id=ID]. It only does it in cases
  where (a) the email was "me", so it was already converted to a string of
  digits in the search pipeline, or (b) it is "user@domain" which resolves
  to a known Monorail user. It is also possible to search for, e.g.,
  [owner:substring], but such searches remain 'owner' field searches rather
  than 'owner_id', and they cannot be combined with the "me" keyword.

  Args:
    cnxn: connection to the DB.
    cond: original parsed query Condition PB.
    user_service: connection to user persistence layer.
    id_fields: list of the search fields to use if the conversion to IDs
        succeeds.

  Returns:
    A new Condition PB that checks the id_field. Or, the original cond.
  """
  int_op = _TextOpToIntOp(cond.op)

  # Testing for any defined value needs no ID lookups at all.
  if _IsDefinedOp(int_op):
    return ast_pb2.Condition(op=int_op, field_defs=id_fields)

  # Only whole-value comparisons can be converted; substring ops cannot.
  if not _IsEqualityOp(int_op):
    logging.info('could not convert to IDs because op is %r', int_op)
    return cond

  resolved_ids = []
  for val in cond.str_values:
    try:
      user_id = int(val)
    except ValueError:
      # Not a string of digits, so treat it as an email to look up.
      try:
        user_id = user_service.LookupUserID(cnxn, val)
      except user_svc.NoSuchUserException:
        logging.info('could not convert user %r to int ID', val)
        return cond  # preprocessing failed, stick with the original cond.
    resolved_ids.append(user_id)

  return ast_pb2.Condition(
      op=int_op, field_defs=id_fields, int_values=resolved_ids)
318
319
def _PreprocessOwnerCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite an owner=emails cond as owner_id=IDs, if exact user match."""
  owner_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['owner_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [owner_id_fd])
325
326
def _PreprocessCcCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a cc=emails cond as cc_id=IDs, if exact user match."""
  cc_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['cc_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [cc_id_fd])
332
333
def _PreprocessReporterCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a reporter=emails cond as reporter_id=IDs, if exact."""
  reporter_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['reporter_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [reporter_id_fd])
340
341
def _PreprocessStarredByCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a starredby=emails cond as starredby_id=IDs, if exact."""
  starredby_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['starredby_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [starredby_id_fd])
348
349
def _PreprocessCommentByCond(
    cnxn, cond, _project_ids, services, _harmonized_config):
  """Rewrite a commentby=emails cond as commentby_id=IDs, if exact."""
  commentby_id_fd = query2ast.BUILTIN_ISSUE_FIELDS['commentby_id']
  return _PreprocessExactUsers(cnxn, cond, services.user, [commentby_id_fd])
356
357
def _PreprocessCustomCond(cnxn, cond, services):
  """Rewrite a custom_user_field=emails cond as IDs, if exact matches.

  Only the USER_TYPE field defs of the cond are considered. When there are
  none, the cond is returned untouched.
  """
  # TODO(jrobbins): better support for ambiguous fields.
  # For now, if any field is USER_TYPE and the value being searched
  # for is the email address of an existing account, it will convert
  # to a user ID and we go with exact ID matching. Otherwise, we
  # leave the cond as-is for ast2select to do string matching on.
  user_fds = []
  for fd in cond.field_defs:
    if fd.field_type == tracker_pb2.FieldTypes.USER_TYPE:
      user_fds.append(fd)

  if not user_fds:
    return cond
  return _PreprocessExactUsers(cnxn, cond, services.user, user_fds)
371
372
# Dispatch table from built-in field name to the function that
# preprocesses conds on that field. Every entry is called with
# (cnxn, cond, project_ids, services, harmonized_config).
_PREPROCESSORS = dict(
    open=_PreprocessIsOpenCond,
    blocked=_PreprocessIsBlockedCond,
    blockedon=_PreprocessBlockedOnCond,
    blocking=_PreprocessBlockingCond,
    status=_PreprocessStatusCond,
    label=_PreprocessLabelCond,
    component=_PreprocessComponentCond,
    owner=_PreprocessOwnerCond,
    cc=_PreprocessCcCond,
    reporter=_PreprocessReporterCond,
    starredby=_PreprocessStarredByCond,
    commentby=_PreprocessCommentByCond,
)
387
388
def _PreprocessCond(
    cnxn, cond, project_ids, services, harmonized_config):
  """Preprocess one cond by looking up status, label and component IDs.

  Conds on custom fields go to _PreprocessCustomCond. Conds on built-in
  fields go to the matching entry of _PREPROCESSORS, and are returned
  unchanged when there is no such entry.
  """
  field_defs = cond.field_defs
  # All the fields in a cond share the same name because they are parsed
  # from a user query term, and the term syntax allows just one field name.
  field_name = field_defs[0].field_name
  assert all(fd.field_name == field_name for fd in field_defs)

  # Case 1: The user is searching custom fields.
  if any(fd.field_id for fd in field_defs):
    # There can't be a mix of custom and built-in fields because built-in
    # field names are reserved and take priority over any conflicting ones.
    assert all(fd.field_id for fd in field_defs)
    return _PreprocessCustomCond(cnxn, cond, services)

  # Case 2: The user is searching a built-in field.
  preproc = _PREPROCESSORS.get(field_name)
  if not preproc:
    # We don't have a preprocessor for it.
    return cond

  # We have a preprocessor for that built-in field.
  return preproc(cnxn, cond, project_ids, services, harmonized_config)
OLDNEW
« no previous file with comments | « appengine/monorail/search/__init__.py ('k') | appengine/monorail/search/ast2select.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698