Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(107)

Side by Side Diff: appengine/monorail/search/query2ast.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style
3 # license that can be found in the LICENSE file or at
4 # https://developers.google.com/open-source/licenses/bsd
5
6 """A set of functions that integrate the GAE search index with Monorail."""
7
8 import collections
9 import datetime
10 import logging
11 import re
12 from services import fulltext_helpers
13 import time
14
15 from proto import ast_pb2
16 from proto import tracker_pb2
17
18
19 # TODO(jrobbins): Consider re-implementing this whole file by using a
20 # BNF syntax specification and a parser generator or library.
21
# encodings
UTF8 = 'utf-8'

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
TEXT_MATCHES = ast_pb2.QueryOp.TEXT_MATCHES
NOT_TEXT_MATCHES = ast_pb2.QueryOp.NOT_TEXT_MATCHES
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
    }

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones listed first so that the
# shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(OPS.keys(), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|' # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|' # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|' # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)' # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-,.@>/_\*\w]+|"[^"]+"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)


# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT), # attachment file names
    ('attachments', NUM), # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('id', NUM),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

# Built-in fields that hold dates; they get DATE-type FieldDefs below and
# special suffix handling (e.g., [opened-after:...]) in _ParseCond().
_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    )

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

# Old codesite-style date term suffixes, mapped to the comparison operator
# that each one implies. E.g., [opened-after:2015/1/1] means [opened>2015/1/1].
_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
    }

# Dict {field_name: FieldDef} for every predefined issue field above.
BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}
136
def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None):
  """Parse a user query and return a set of structure terms.

  Args:
    query: string with user's query. E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language. E.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
         - custom_fields = {field_name: [fd, ...]}
         - query build needs to OR each possible interpretation
         - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []
  if _HasParens(query):
    warnings.append('Parentheses are ignored in user queries.')

  if _HasParens(scope):
    warnings.append('Parentheses are ignored in saved queries.')

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = query.split(' OR ')

  if len(subqueries) > 1:  # TODO(jrobbins): temporary limitation just for now.
    raise InvalidQueryError('Logical operator OR is not supported yet.')

  # Make a dictionary of all fields: built-in + custom in each project.
  # Note: use items() rather than the Python-2-only iteritems() so that this
  # code runs unchanged under both Python 2 and Python 3.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only do non-enum fields because enums are stored as labels
      combined_fields[fd.field_name.lower()].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)
197
198
199 def _HasParens(s):
200 """Return True if there are parentheses in the given string."""
201 # Monorail cannot handle parenthesized expressions, so we tell the
202 # user that immediately. Even inside a quoted string, the GAE search
203 # engine will not handle parens in TEXT-type fields.
204 return '(' in s or ')' in s
205
206
def _ParseConjunction(subquery, scope, fields, warnings):
  """Parse part of a user query into a Conjunction PB."""
  logging.info('Parsing sub query: %r in scope %r', subquery, scope)
  # Prepend the scope terms, then lowercase the whole query before parsing.
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  parsed_conds = [
      _ParseCond(cond_text, fields, warnings)
      for cond_text in _ExtractConds(scoped_query)]
  return ast_pb2.Conjunction(conds=parsed_conds)
214
215
def _ParseCond(cond_str, fields, warnings):
  """Parse one user query condition string into a Condition PB.

  Args:
    cond_str: one (already lowercased) term of the query, e.g., 'stars>10',
        'summary:"memory leak"', or a plain full-text word like '-workaround'.
    fields: dict {name_lower: [FieldDef, ...]} of built-in and custom fields.
    warnings: list to accumulate warning message strings.

  Returns:
    A Condition PB: either a structured comparison or a full-text search term.
  """
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # codesite. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Remove the suffix by slicing. Note: str.rstrip() would be wrong
          # here because it strips a *set of characters*, not a suffix, and
          # could eat trailing characters of the field name itself.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
          break  # At most one suffix can match.
    return _ParseStructuredTerm(prefix, op, val, fields)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
249
250
def _ParseStructuredTerm(prefix, op_str, value, fields):
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
      also be special operators like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.

  Returns:
    A Condition PB.
  """
  # Strip surrounding quotes from quoted values, e.g., summary:"memory leak".
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',')]

  # Boolean field tests: [is:open] matches open issues, [-is:open] closed ones.
  if ((prefix == 'is' or prefix == '-is') and
      unquoted_value in ['open', 'blocked', 'spam']):
    # int(prefix == 'is') is 1 for [is:...] and 0 for the negated [-is:...].
    return ast_pb2.MakeCond(
        EQ, fields[unquoted_value], [], [int(prefix == 'is')])

  op = OPS[op_str]
  negate = False
  # A leading "-" on the prefix negates the comparison; only EQ and TEXT_HAS
  # have negated counterparts here.
  if prefix.startswith('-'):
    negate = True
    if op == EQ:
      op = NE
    elif op == TEXT_HAS:
      op = NOT_TEXT_HAS
    prefix = prefix[1:]

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if unquoted_value in fields: # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else: # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  if prefix in fields: # search built-in and custom fields. E.g., summary.
    # Note: if first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_value = _ParseDateValue(unquoted_value)
      return ast_pb2.MakeCond(op, fields[prefix], [], [date_value])
    else:
      # Collect the int interpretation of each quick-OR value; values that
      # are not integers are silently kept only as strings.
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, quick_or_ints)

  # Since it is not a field, treat it as labels, E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
315
316
def _ExtractConds(query):
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.

  Returns:
    A list of query condition strings, each either a quoted phrase, a
    key-op-value comparison, or a simple word.
  """
  # Convert to unicode then search for distinct terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        if special_prefixes_match:
          # Do not include quotes if any of the special prefixes match because
          # we do not want to treat the label as key:value search terms.
          terms.append('%s%s%s' % (label, op, word))
        else:
          terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search.  It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:
      # logging.warn() is deprecated; logging.warning() behaves identically.
      logging.warning('Unexpected search term in %r', query)

  return terms
364
365
def _ParseDateValue(val):
  """Convert the user-entered date into a timestamp.

  Args:
    val: user-entered date string: a raw timestamp (e.g., '1437671476'),
        'today', 'today-N', or a Y/M/D or Y-M-D calendar date.

  Returns:
    An int number of seconds since the epoch.

  Raises:
    InvalidQueryError: the value could not be interpreted as a date.
  """
  # Support timestamp value such as opened>1437671476
  try:
    return int(val)
  except ValueError:
    pass

  # TODO(jrobbins): future: take timezones into account.
  # TODO(jrobbins): for now, explain to users that "today" is
  # actually now: the current time, not 12:01am in their timezone.
  # In fact, it is not very useful because everything in the system
  # happened before the current time.
  if val == 'today':
    return _CalculatePastDate(0)
  elif val.startswith('today-'):
    try:
      days_ago = int(val.split('-')[1])
    except ValueError:
      days_ago = 0
    return _CalculatePastDate(days_ago)

  if '/' in val:
    parts = val.split('/')
  elif '-' in val:
    parts = val.split('-')
  else:
    # Previously this fell through with year/month/day unbound, producing a
    # NameError; report it as a query error instead.
    raise InvalidQueryError('Could not parse date')

  try:
    # Non-integer components, a wrong number of components, and out-of-range
    # dates all raise ValueError, which we surface as a query error.
    year, month, day = [int(x) for x in parts]
    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
  except ValueError:
    raise InvalidQueryError('Could not parse date')
397
398
399 def _CalculatePastDate(days_ago, now=None):
400 """Calculates the timestamp N days ago from now."""
401 if now is None:
402 now = int(time.time())
403 ts = now - days_ago * 24 * 60 * 60
404 return ts
405
406
def CheckSyntax(query, harmonized_config, warnings=None):
  """Parse the given query and report the first error or None.

  Args:
    query: string with the user's query.
    harmonized_config: config for all the projects being searched.
    warnings: optional list to accumulate warning messages.

  Returns:
    The message of the first InvalidQueryError raised while parsing the
    query, or None if the query parsed cleanly.
  """
  try:
    ParseUserQuery(
        query, '', BUILTIN_ISSUE_FIELDS, harmonized_config, warnings=warnings)
  except InvalidQueryError as e:
    # str(e) works on both Python 2 and 3; Exception.message was removed in
    # Python 3 and would raise AttributeError there.
    return str(e)

  return None
416
417
class Error(Exception):
  """Base exception class for this package."""
421
422
class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
OLDNEW
« no previous file with comments | « appengine/monorail/search/frontendsearchpipeline.py ('k') | appengine/monorail/search/searchpipeline.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698