Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(107)

Side by Side Diff: appengine/monorail/search/query2ast.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style
3 # license that can be found in the LICENSE file or at
4 # https://developers.google.com/open-source/licenses/bsd
5
6 """A set of functions that integrate the GAE search index with Monorail."""
7
8 import collections
9 import datetime
10 import logging
11 import re
12 from services import fulltext_helpers
13 import time
14
15 from proto import ast_pb2
16 from proto import tracker_pb2
17
18
19 # TODO(jrobbins): Consider re-implementing this whole file by using a
20 # BNF syntax specification and a parser generator or library.
21
# encodings
UTF8 = 'utf-8'

# Field types and operators
BOOL = tracker_pb2.FieldTypes.BOOL_TYPE
DATE = tracker_pb2.FieldTypes.DATE_TYPE
NUM = tracker_pb2.FieldTypes.INT_TYPE
TXT = tracker_pb2.FieldTypes.STR_TYPE

EQ = ast_pb2.QueryOp.EQ
NE = ast_pb2.QueryOp.NE
LT = ast_pb2.QueryOp.LT
GT = ast_pb2.QueryOp.GT
LE = ast_pb2.QueryOp.LE
GE = ast_pb2.QueryOp.GE
TEXT_HAS = ast_pb2.QueryOp.TEXT_HAS
NOT_TEXT_HAS = ast_pb2.QueryOp.NOT_TEXT_HAS
TEXT_MATCHES = ast_pb2.QueryOp.TEXT_MATCHES
NOT_TEXT_MATCHES = ast_pb2.QueryOp.NOT_TEXT_MATCHES
IS_DEFINED = ast_pb2.QueryOp.IS_DEFINED
IS_NOT_DEFINED = ast_pb2.QueryOp.IS_NOT_DEFINED
KEY_HAS = ast_pb2.QueryOp.KEY_HAS

# Mapping from user query comparison operators to our internal representation.
OPS = {
    ':': TEXT_HAS,
    '=': EQ,
    '!=': NE,
    '<': LT,
    '>': GT,
    '<=': LE,
    '>=': GE,
    }

# This is a partial regular expression that matches all of our comparison
# operators, such as =, !=, >, and <. Longer ones listed first so that the
# shorter ones don't cause premature matches.
OPS_PATTERN = '|'.join(
    map(re.escape, sorted(OPS.keys(), key=lambda op: -len(op))))

# This RE extracts search terms from a subquery string.
TERM_RE = re.compile(
    r'(-?"[^"]+")|' # E.g., ["division by zero"]
    r'(\S+(%s)[^ "]+)|' # E.g., [stars>10]
    r'(\w+(%s)"[^"]+")|' # E.g., [summary:"memory leak"]
    r'(-?[._\*\w][-._\*\w]+)' # E.g., [-workaround]
    % (OPS_PATTERN, OPS_PATTERN), flags=re.UNICODE)

# This RE is used to further decompose a comparison term into prefix, op, and
# value. E.g., [stars>10] or [is:open] or [summary:"memory leak"]. The prefix
# can include a leading "-" to negate the comparison.
OP_RE = re.compile(
    r'^(?P<prefix>[-_\w]*?)'
    r'(?P<op>%s)'
    r'(?P<value>([-,.@>/_\*\w]+|"[^"]+"))$' %
    OPS_PATTERN,
    flags=re.UNICODE)


# Predefined issue fields passed to the query parser.
_ISSUE_FIELDS_LIST = [
    (ast_pb2.ANY_FIELD, TXT),
    ('attachment', TXT), # attachment file names
    ('attachments', NUM), # number of attachment files
    ('blocked', BOOL),
    ('blockedon', TXT),
    ('blockedon_id', NUM),
    ('blocking', TXT),
    ('blocking_id', NUM),
    ('cc', TXT),
    ('cc_id', NUM),
    ('comment', TXT),
    ('commentby', TXT),
    ('commentby_id', NUM),
    ('component', TXT),
    ('component_id', NUM),
    ('description', TXT),
    ('id', NUM),
    ('label', TXT),
    ('label_id', NUM),
    ('mergedinto', NUM),
    ('open', BOOL),
    ('owner', TXT),
    ('owner_id', NUM),
    ('project', TXT),
    ('reporter', TXT),
    ('reporter_id', NUM),
    ('spam', BOOL),
    ('stars', NUM),
    ('starredby', TXT),
    ('starredby_id', NUM),
    ('status', TXT),
    ('status_id', NUM),
    ('summary', TXT),
    ]

# Built-in fields that hold dates; they get DATE-type FieldDefs below and
# special suffix handling (e.g., [opened-after:...]) in _ParseCond().
_DATE_FIELDS = (
    'closed',
    'modified',
    'opened',
    )

# Add all _DATE_FIELDS to _ISSUE_FIELDS_LIST.
_ISSUE_FIELDS_LIST.extend((date_field, DATE) for date_field in _DATE_FIELDS)

# Old codesite-style date term suffixes, mapped to the comparison operator
# that each one implies. E.g., [opened-after:2015/1/1] means [opened>2015/1/1].
_DATE_FIELD_SUFFIX_TO_OP = {
    '-after': '>',
    '-before': '<',
    }

# Dict {field_name: FieldDef} for every predefined issue field above.
BUILTIN_ISSUE_FIELDS = {
    f_name: tracker_pb2.FieldDef(field_name=f_name, field_type=f_type)
    for f_name, f_type in _ISSUE_FIELDS_LIST}
136
def ParseUserQuery(
    query, scope, builtin_fields, harmonized_config, warnings=None):
  """Parse a user query and return a set of structure terms.

  Args:
    query: string with user's query. E.g., 'Priority=High'.
    scope: string search terms that define the scope in which the
        query should be executed. They are expressed in the same
        user query language. E.g., adding the canned query.
    builtin_fields: dict {field_name: FieldDef(field_name, type)}
        mapping field names to FieldDef objects for built-in fields.
    harmonized_config: config for all the projects being searched.
        @@@ custom field name is not unique in cross project search.
         - custom_fields = {field_name: [fd, ...]}
         - query build needs to OR each possible interpretation
         - could be label in one project and field in another project.
        @@@ what about searching across all projects?
    warnings: optional list to accumulate warning messages.

  Returns:
    A QueryAST with conjunctions (usually just one), where each has a list of
    Condition PBs with op, fields, str_values and int_values. E.g., the query
    [priority=high leak OR stars>100] over open issues would return
    QueryAST(
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(EQ, [label_fd], ['priority-high'], []),
                  Condition(TEXT_HAS, any_field_fd, ['leak'], [])),
      Conjunction(Condition(EQ, [open_fd], [], [1]),
                  Condition(GT, [stars_fd], [], [100])))

  Raises:
    InvalidQueryError: If a problem was detected in the user's query.
  """
  if warnings is None:
    warnings = []
  if _HasParens(query):
    warnings.append('Parentheses are ignored in user queries.')

  if _HasParens(scope):
    warnings.append('Parentheses are ignored in saved queries.')

  # Convert the overall query into one or more OR'd subqueries.
  subqueries = query.split(' OR ')

  if len(subqueries) > 1:  # TODO(jrobbins): temporary limitation just for now.
    raise InvalidQueryError('Logical operator OR is not supported yet.')

  # Make a dictionary of all fields: built-in + custom in each project.
  # Note: use items() rather than the Python-2-only iteritems() so that this
  # code runs unchanged under both Python 2 and Python 3.
  combined_fields = collections.defaultdict(
      list, {field_name: [field_def]
             for field_name, field_def in builtin_fields.items()})
  for fd in harmonized_config.field_defs:
    if fd.field_type != tracker_pb2.FieldTypes.ENUM_TYPE:
      # Only do non-enum fields because enums are stored as labels
      combined_fields[fd.field_name.lower()].append(fd)

  conjunctions = [
      _ParseConjunction(sq, scope, combined_fields, warnings)
      for sq in subqueries]
  return ast_pb2.QueryAST(conjunctions=conjunctions)
197
198
199 def _HasParens(s):
200 """Return True if there are parentheses in the given string."""
201 # Monorail cannot handle parenthesized expressions, so we tell the
202 # user that immediately. Even inside a quoted string, the GAE search
203 # engine will not handle parens in TEXT-type fields.
204 return '(' in s or ')' in s
205
206
def _ParseConjunction(subquery, scope, fields, warnings):
  """Parse part of a user query into a Conjunction PB."""
  logging.info('Parsing sub query: %r in scope %r', subquery, scope)
  # Prepend the scope terms, then lowercase the whole query before parsing.
  scoped_query = ('%s %s' % (scope, subquery)).lower()
  parsed_conds = [
      _ParseCond(cond_text, fields, warnings)
      for cond_text in _ExtractConds(scoped_query)]
  return ast_pb2.Conjunction(conds=parsed_conds)
214
215
def _ParseCond(cond_str, fields, warnings):
  """Parse one user query condition string into a Condition PB.

  Args:
    cond_str: one (already lowercased) term of the query, e.g., 'stars>10',
        'summary:"memory leak"', or a plain full-text word like '-workaround'.
    fields: dict {name_lower: [FieldDef, ...]} of built-in and custom fields.
    warnings: list to accumulate warning message strings.

  Returns:
    A Condition PB: either a structured comparison or a full-text search term.
  """
  op_match = OP_RE.match(cond_str)
  # Do not treat as key:value search terms if any of the special prefixes match.
  special_prefixes_match = any(
      cond_str.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
  if op_match and not special_prefixes_match:
    prefix = op_match.group('prefix')
    op = op_match.group('op')
    val = op_match.group('value')
    # Special case handling to continue to support old date query terms from
    # codesite. See monorail:151 for more details.
    if prefix.startswith(_DATE_FIELDS):
      for date_suffix in _DATE_FIELD_SUFFIX_TO_OP:
        if prefix.endswith(date_suffix):
          # Remove the suffix by slicing. Note: str.rstrip() would be wrong
          # here because it strips a *set of characters*, not a suffix, and
          # could eat trailing characters of the field name itself.
          prefix = prefix[:-len(date_suffix)]
          op = _DATE_FIELD_SUFFIX_TO_OP[date_suffix]
          break  # At most one suffix can match.
    return _ParseStructuredTerm(prefix, op, val, fields)

  # Treat the cond as a full-text search term, which might be negated.
  if cond_str.startswith('-'):
    op = NOT_TEXT_HAS
    cond_str = cond_str[1:]
  else:
    op = TEXT_HAS

  # Flag a potential user misunderstanding.
  if cond_str.lower() in ('and', 'or', 'not'):
    warnings.append(
        'The only supported boolean operator is OR (all capitals).')

  return ast_pb2.MakeCond(
      op, [BUILTIN_ISSUE_FIELDS[ast_pb2.ANY_FIELD]], [cond_str], [])
249
250
def _ParseStructuredTerm(prefix, op_str, value, fields):
  """Parse one user structured query term into an internal representation.

  Args:
    prefix: The query operator, usually a field name. E.g., summary. It can
      also be special operators like "is" to test boolean fields.
    op_str: the comparison operator. Usually ":" or "=", but can be any OPS.
    value: the value to compare against, e.g., term to find in that field.
    fields: dict {name_lower: [FieldDef, ...]} for built-in and custom fields.

  Returns:
    A Condition PB.
  """
  # Strip surrounding quotes from quoted values, e.g., summary:"memory leak".
  unquoted_value = value.strip('"')
  # Quick-OR is a convenient way to write one condition that matches any one of
  # multiple values, like set membership. E.g., [Priority=High,Critical].
  quick_or_vals = [v.strip() for v in unquoted_value.split(',')]

  # Boolean field tests: [is:open] matches open issues, [-is:open] closed ones.
  if ((prefix == 'is' or prefix == '-is') and
      unquoted_value in ['open', 'blocked', 'spam']):
    # int(prefix == 'is') is 1 for [is:...] and 0 for the negated [-is:...].
    return ast_pb2.MakeCond(
        EQ, fields[unquoted_value], [], [int(prefix == 'is')])

  op = OPS[op_str]
  negate = False
  # A leading "-" on the prefix negates the comparison; only EQ and TEXT_HAS
  # have negated counterparts here.
  if prefix.startswith('-'):
    negate = True
    if op == EQ:
      op = NE
    elif op == TEXT_HAS:
      op = NOT_TEXT_HAS
    prefix = prefix[1:]

  # Search entries with or without any value in the specified field.
  if prefix == 'has':
    op = IS_NOT_DEFINED if negate else IS_DEFINED
    if unquoted_value in fields: # Look for that field with any value.
      return ast_pb2.MakeCond(op, fields[unquoted_value], [], [])
    else: # Look for any label with that prefix.
      return ast_pb2.MakeCond(op, fields['label'], [unquoted_value], [])

  if prefix in fields: # search built-in and custom fields. E.g., summary.
    # Note: if first matching field is date-type, we assume they all are.
    # TODO(jrobbins): better handling for rare case where multiple projects
    # define the same custom field name, and one is a date and another is not.
    first_field = fields[prefix][0]
    if first_field.field_type == DATE:
      date_value = _ParseDateValue(unquoted_value)
      return ast_pb2.MakeCond(op, fields[prefix], [], [date_value])
    else:
      # Collect the int interpretation of each quick-OR value; values that
      # are not integers are silently kept only as strings.
      quick_or_ints = []
      for qov in quick_or_vals:
        try:
          quick_or_ints.append(int(qov))
        except ValueError:
          pass
      return ast_pb2.MakeCond(op, fields[prefix], quick_or_vals, quick_or_ints)

  # Since it is not a field, treat it as labels, E.g., Priority.
  quick_or_labels = ['%s-%s' % (prefix, v) for v in quick_or_vals]
  # Convert substring match to key-value match if user typed 'foo:bar'.
  if op == TEXT_HAS:
    op = KEY_HAS
  return ast_pb2.MakeCond(op, fields['label'], quick_or_labels, [])
315
316
def _ExtractConds(query):
  """Parse a query string into a list of individual condition strings.

  Args:
    query: UTF-8 encoded search query string.

  Returns:
    A list of query condition strings, each either a quoted phrase, a
    key-op-value comparison, or a simple word.
  """
  # Convert to unicode then search for distinct terms.
  term_matches = TERM_RE.findall(query)

  terms = []
  for (phrase, word_label, _op1, phrase_label, _op2,
       word) in term_matches:
    # Case 1: Quoted phrases, e.g., ["hot dog"].
    if phrase_label or phrase:
      terms.append(phrase_label or phrase)

    # Case 2: Comparisons
    elif word_label:
      special_prefixes_match = any(
          word_label.startswith(p) for p in fulltext_helpers.NON_OP_PREFIXES)
      match = OP_RE.match(word_label)
      if match:
        label = match.group('prefix')
        op = match.group('op')
        word = match.group('value')
        if special_prefixes_match:
          # Do not include quotes if any of the special prefixes match because
          # we do not want to treat the label as key:value search terms.
          terms.append('%s%s%s' % (label, op, word))
        else:
          terms.append('%s%s"%s"' % (label, op, word))
      else:
        # It looked like a key:value cond, but not exactly, so treat it
        # as fulltext search.  It is probably a tiny bit of source code.
        terms.append('"%s"' % word_label)

    # Case 3: Simple words.
    elif word:
      terms.append(word)

    else:
      # logging.warn() is deprecated; logging.warning() behaves identically.
      logging.warning('Unexpected search term in %r', query)

  return terms
364
365
def _ParseDateValue(val):
  """Convert the user-entered date into a timestamp.

  Args:
    val: user-entered date string: a raw timestamp (e.g., '1437671476'),
        'today', 'today-N', or a Y/M/D or Y-M-D calendar date.

  Returns:
    An int number of seconds since the epoch.

  Raises:
    InvalidQueryError: the value could not be interpreted as a date.
  """
  # Support timestamp value such as opened>1437671476
  try:
    return int(val)
  except ValueError:
    pass

  # TODO(jrobbins): future: take timezones into account.
  # TODO(jrobbins): for now, explain to users that "today" is
  # actually now: the current time, not 12:01am in their timezone.
  # In fact, it is not very useful because everything in the system
  # happened before the current time.
  if val == 'today':
    return _CalculatePastDate(0)
  elif val.startswith('today-'):
    try:
      days_ago = int(val.split('-')[1])
    except ValueError:
      days_ago = 0
    return _CalculatePastDate(days_ago)

  if '/' in val:
    parts = val.split('/')
  elif '-' in val:
    parts = val.split('-')
  else:
    # Previously this fell through with year/month/day unbound, producing a
    # NameError; report it as a query error instead.
    raise InvalidQueryError('Could not parse date')

  try:
    # Non-integer components, a wrong number of components, and out-of-range
    # dates all raise ValueError, which we surface as a query error.
    year, month, day = [int(x) for x in parts]
    return int(time.mktime(datetime.datetime(year, month, day).timetuple()))
  except ValueError:
    raise InvalidQueryError('Could not parse date')
397
398
399 def _CalculatePastDate(days_ago, now=None):
400 """Calculates the timestamp N days ago from now."""
401 if now is None:
402 now = int(time.time())
403 ts = now - days_ago * 24 * 60 * 60
404 return ts
405
406
def CheckSyntax(query, harmonized_config, warnings=None):
  """Parse the given query and report the first error or None.

  Args:
    query: string with the user's query.
    harmonized_config: config for all the projects being searched.
    warnings: optional list to accumulate warning messages.

  Returns:
    The message of the first InvalidQueryError raised while parsing the
    query, or None if the query parsed cleanly.
  """
  try:
    ParseUserQuery(
        query, '', BUILTIN_ISSUE_FIELDS, harmonized_config, warnings=warnings)
  except InvalidQueryError as e:
    # str(e) works on both Python 2 and 3; Exception.message was removed in
    # Python 3 and would raise AttributeError there.
    return str(e)

  return None
416
417
class Error(Exception):
  """Base exception class for this package."""
421
422
class InvalidQueryError(Error):
  """Error raised when an invalid query is requested."""
OLDNEW
« no previous file with comments | « appengine/monorail/search/frontendsearchpipeline.py ('k') | appengine/monorail/search/searchpipeline.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698