OLD | NEW |
(Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style |
| 3 # license that can be found in the LICENSE file or at |
| 4 # https://developers.google.com/open-source/licenses/bsd |
| 5 |
| 6 """A set of helper functions for fulltext search.""" |
| 7 |
| 8 import logging |
| 9 |
| 10 from google.appengine.api import search |
| 11 |
| 12 import settings |
| 13 from proto import ast_pb2 |
| 14 from proto import tracker_pb2 |
| 15 |
# The GAE search API can only respond with 500 results per call, so results
# are requested in chunks of this size.
_SEARCH_RESULT_CHUNK_SIZE = 500

# Search values that start with one of these prefixes are not treated as
# key:value search terms.
# See bugs.chromium.org/p/monorail/issues/detail?id=419 for more detail.
NON_OP_PREFIXES = ('http:', 'https:')
| 25 |
| 26 |
def BuildFTSQuery(query_ast_conj, fulltext_fields):
  """Render a Monorail query AST conjunction as a GAE search query string.

  Args:
    query_ast_conj: a Conjunction PB holding a list of Comparison PBs, each
        with an operator, field definitions, string values, and int values.
        All of the conditions are AND'd together.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.  E.g., issue fulltext documents have a "summary"
        field.

  Returns:
    A string that can be passed to AppEngine's search API, or None when no
    condition produced a fulltext term, meaning no fulltext search is needed.
  """
  rendered_conds = []
  for cond in query_ast_conj.conds:
    rendered_conds.append(_BuildFTSCondition(cond, fulltext_fields))

  if not any(rendered_conds):
    return None

  # Conditions that are not fulltext render as '' and are joined as-is.
  return ' '.join(rendered_conds)
| 49 |
| 50 |
def _BuildFTSCondition(cond, fulltext_fields):
  """Render a single query AST condition as a GAE search query fragment.

  Args:
    cond: a Comparison PB with an operator, field definitions, and string
        values.
    fulltext_fields: a list of string names of fields that may exist in the
        fulltext documents.

  Returns:
    A parenthesized OR-expression, prefixed with 'NOT ' for NOT_TEXT_HAS.
    Returns '' if the operator is neither TEXT_HAS nor NOT_TEXT_HAS, or if
    none of the condition's fields produced a fulltext term.
  """
  if cond.op == ast_pb2.QueryOp.TEXT_HAS:
    negation = ''
  elif cond.op == ast_pb2.QueryOp.NOT_TEXT_HAS:
    negation = 'NOT '
  else:
    # FTS only looks at TEXT_HAS and NOT_TEXT_HAS.
    return ''

  terms = []
  for field_def in cond.field_defs:
    name = field_def.field_name
    if name in fulltext_fields:
      template = name + ':"%s"'
    elif name == ast_pb2.ANY_FIELD:
      template = '"%s"'
    elif (field_def.field_id and
          field_def.field_type == tracker_pb2.FieldTypes.STR_TYPE):
      template = 'custom_' + str(field_def.field_id) + ':"%s"'
    else:
      # This issue field is searched via SQL.
      continue

    for raw_value in cond.str_values:
      # Drop any quotes wrapped around the value.
      text = raw_value.strip('"')
      keeps_colons = any(
          text.startswith(prefix) for prefix in NON_OP_PREFIXES)
      if not keeps_colons:
        text = text.replace(':', ' ')
      assert ('"' not in text), 'Value %r has a quote in it' % text
      terms.append(template % text)

  if not terms:
    # None of the fields were fulltext fields.
    return ''
  return negation + '(%s)' % ' OR '.join(terms)
| 85 |
| 86 |
def ComprehensiveSearch(fulltext_query, index_name):
  """Call the GAE search API, and keep calling it to get all results.

  Results are requested in chunks of _SEARCH_RESULT_CHUNK_SIZE, following the
  cursor of each response.  Fetching stops when a response has no cursor or
  once enough chunks have been requested to cover
  settings.fulltext_limit_per_shard results.

  Args:
    fulltext_query: string in the GAE search API query language.
    index_name: string name of the GAE fulltext index to hit.

  Returns:
    A list of integer issue IIDs or project IDs.
  """
  search_index = search.Index(name=index_name)

  response = search_index.search(search.Query(
      fulltext_query,
      options=search.QueryOptions(
          limit=_SEARCH_RESULT_CHUNK_SIZE, returned_fields=[], ids_only=True,
          cursor=search.Cursor())))
  logging.info('got %d initial results', len(response.results))
  ids = [int(result.doc_id) for result in response]

  # The first chunk has already been fetched, so this is the number of
  # follow-up calls needed to reach fulltext_limit_per_shard.  The parentheses
  # are required: without them `1 / _SEARCH_RESULT_CHUNK_SIZE` binds first and
  # the bound becomes roughly the result limit itself, not a chunk count.
  remaining_iterations = int(
      (settings.fulltext_limit_per_shard - 1) // _SEARCH_RESULT_CHUNK_SIZE)
  for _ in range(remaining_iterations):
    if not response.cursor:
      break  # The previous response was the last page of results.
    response = search_index.search(search.Query(
        fulltext_query,
        options=search.QueryOptions(
            limit=_SEARCH_RESULT_CHUNK_SIZE, returned_fields=[], ids_only=True,
            cursor=response.cursor)))
    logging.info(
        'got %d more results: %r', len(response.results), response.results)
    ids.extend(int(result.doc_id) for result in response)

  logging.info('FTS result ids %d', len(ids))
  return ids
OLD | NEW |