# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file or at
# https://developers.google.com/open-source/licenses/bsd

| 6 """A set of functions that provide fulltext search for issues.""" |
| 7 |
| 8 import collections |
| 9 import logging |
| 10 import time |
| 11 |
| 12 from google.appengine.api import search |
| 13 |
| 14 import settings |
| 15 from framework import framework_constants |
| 16 from framework import framework_helpers |
| 17 from framework import framework_views |
| 18 from services import fulltext_helpers |
| 19 from tracker import tracker_bizobj |
| 20 |
| 21 |
| 22 # When updating and re-indexing all issues in a project, work in batches |
| 23 # of this size to manage memory usage and avoid rpc timeouts. |
| 24 _INDEX_BATCH_SIZE = 40 |
| 25 |
| 26 |
| 27 # The user can search for text that occurs specifically in these |
| 28 # parts of an issue. |
| 29 ISSUE_FULLTEXT_FIELDS = ['summary', 'description', 'comment'] |
| 30 # Note: issue documents also contain a "metadata" field, but we do not |
| 31 # expose that to users. Issue metadata can be searched in a structured way |
| 32 # by giving a specific field name such as "owner:" or "status:". The metadata |
| 33 # search field exists only for fulltext queries that do not specify any field. |
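# For example, a query term such as summary:memory is matched only against
# the summary field, while a bare term is matched against these fields and
# the metadata field noted above (the field-to-query mapping is handled by
# fulltext_helpers.BuildFTSQuery).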


def IndexIssues(cnxn, issues, user_service, issue_service, config_service):
  """(Re)index all the given issues.

  Args:
    cnxn: connection to SQL database.
    issues: list of Issue PBs to index.
    user_service: interface to user data storage.
    issue_service: interface to issue data storage.
    config_service: interface to configuration data storage.
  """
  issues = list(issues)
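  # Look up each project's config once up front so every batch below reuses
  # the same config_dict rather than re-fetching configs per batch.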
  config_dict = config_service.GetProjectConfigs(
      cnxn, {issue.project_id for issue in issues})
  for start in xrange(0, len(issues), _INDEX_BATCH_SIZE):
    logging.info('indexing issues: %d remaining', len(issues) - start)
    _IndexIssueBatch(
        cnxn, issues[start:start + _INDEX_BATCH_SIZE], user_service,
        issue_service, config_dict)


def _IndexIssueBatch(cnxn, issues, user_service, issue_service, config_dict):
  """Internal method to (re)index the given batch of issues.

  Args:
    cnxn: connection to SQL database.
    issues: list of Issue PBs to index.
    user_service: interface to user data storage.
    issue_service: interface to issue data storage.
    config_dict: dict {project_id: config} for all the projects that
        the given issues are in.
  """
  user_ids = tracker_bizobj.UsersInvolvedInIssues(issues)
  comments_dict = issue_service.GetCommentsForIssues(
      cnxn, [issue.issue_id for issue in issues])
  for comments in comments_dict.itervalues():
    user_ids.update([ic.user_id for ic in comments])

  users_by_id = framework_views.MakeAllUserViews(
      cnxn, user_service, user_ids)
  _CreateIssueSearchDocuments(issues, comments_dict, users_by_id, config_dict)


def _CreateIssueSearchDocuments(
    issues, comments_dict, users_by_id, config_dict):
  """Make the GAE search index documents for the given issue batch.

  Args:
    issues: list of issues to index.
    comments_dict: prefetched dictionary of comments on those issues.
    users_by_id: dictionary {user_id: UserView} so that the email
        addresses of users who left comments can be found via search.
    config_dict: dict {project_id: config} for all the projects that
        the given issues are in.
  """
  documents_by_shard = collections.defaultdict(list)
  for issue in issues:
    comments = comments_dict.get(issue.issue_id, [])
    comments = _IndexableComments(comments, users_by_id)
    summary = issue.summary
    # TODO(jrobbins): allow search specifically on explicit vs derived
    # fields.
    owner_id = tracker_bizobj.GetOwnerId(issue)
    owner_email = users_by_id[owner_id].email
    config = config_dict[issue.project_id]
    component_paths = []
    for component_id in issue.component_ids:
      cd = tracker_bizobj.FindComponentDefByID(component_id, config)
      if cd:
        component_paths.append(cd.path)

    field_values = [str(tracker_bizobj.GetFieldValue(fv, users_by_id))
                    for fv in issue.field_values]
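    # Note: the cc emails below are interpolated via the list's repr(); the
    # extra brackets and quotes are assumed to be harmless here because the
    # metadata text field is tokenized by the search service.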
    metadata = '%s %s %s %s %s %s' % (
        tracker_bizobj.GetStatus(issue),
        owner_email,
        [users_by_id[cc_id].email for cc_id in
         tracker_bizobj.GetCcIds(issue)],
        ' '.join(component_paths),
        ' '.join(field_values),
        ' '.join(tracker_bizobj.GetLabels(issue)))
    assert comments, 'issues should always have at least the description'
    description = _ExtractCommentText(comments[0], users_by_id)
    description = description[:framework_constants.MAX_FTS_FIELD_SIZE]
    all_comments = ' '.join(
        _ExtractCommentText(c, users_by_id) for c in comments[1:])
    all_comments = all_comments[:framework_constants.MAX_FTS_FIELD_SIZE]

    custom_fields = _BuildCustomFTSFields(issue)
    doc = search.Document(
        doc_id=str(issue.issue_id),
        fields=[
            search.NumberField(name='project_id', value=issue.project_id),
            search.TextField(name='summary', value=summary),
            search.TextField(name='metadata', value=metadata),
            search.TextField(name='description', value=description),
            search.TextField(name='comment', value=all_comments),
            ] + custom_fields)

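    # Assign each document to a logical shard, deterministically by issue ID,
    # so the documents can be written (and later searched) shard by shard.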
    shard_id = issue.issue_id % settings.num_logical_shards
    documents_by_shard[shard_id].append(doc)

  start_time = time.time()
  promises = []
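  # Write each shard's documents from its own Promise so the shards can be
  # indexed in parallel, then wait for all of them to finish
  # (framework_helpers.Promise is assumed to run the callable asynchronously).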
  for shard_id, documents in documents_by_shard.iteritems():
    if documents:
      promises.append(framework_helpers.Promise(
          _IndexDocsInShard, shard_id, documents))

  for promise in promises:
    promise.WaitAndGetValue()

  logging.info(
      'Finished indexing %d shards in %d ms',
      len(documents_by_shard), int((time.time() - start_time) * 1000))


def _IndexableComments(comments, users_by_id):
  """Filter out comments that are deleted or that were made by banned users.

  Args:
    comments: list of Comment PBs for one issue.
    users_by_id: Dict of (user_id -> UserView) for all users.

  Returns:
    A list of comments filtered to not have any deleted comments or
    comments from banned users. If the issue has a huge number of
    comments, only a certain number of the first and last comments
    are actually indexed.
  """
  allowed_comments = []
  for comment in comments:
    user_view = users_by_id.get(comment.user_id)
    if not (comment.deleted_by or (user_view and user_view.banned)):
      allowed_comments.append(comment)

  reasonable_size = (framework_constants.INITIAL_COMMENTS_TO_INDEX +
                     framework_constants.FINAL_COMMENTS_TO_INDEX)
  if len(allowed_comments) <= reasonable_size:
    return allowed_comments

  candidates = (  # Prioritize the description and recent comments.
      allowed_comments[0:1] +
      allowed_comments[-framework_constants.FINAL_COMMENTS_TO_INDEX:] +
      allowed_comments[1:framework_constants.INITIAL_COMMENTS_TO_INDEX])
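  # Cap the total amount of comment text: once the running total reaches
  # MAX_FTS_FIELD_SIZE, the remaining candidates are dropped.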
  total_length = 0
  result = []
  for comment in candidates:
    total_length += len(comment.content)
    if total_length < framework_constants.MAX_FTS_FIELD_SIZE:
      result.append(comment)

  return result


def _IndexDocsInShard(shard_id, documents):
  search_index = search.Index(
      name=settings.search_index_name_format % shard_id)
  search_index.put(documents)
  logging.info('FTS indexed %d docs in shard %d', len(documents), shard_id)
  # TODO(jrobbins): catch OverQuotaError and add the issues to the
  # ReindexQueue table instead.


def _ExtractCommentText(comment, users_by_id):
  """Return a string with all the searchable text of the given Comment PB."""
  commenter_email = users_by_id[comment.user_id].email
  return '%s %s %s' % (
      commenter_email,
      comment.content,
      ' '.join(attach.filename
               for attach in comment.attachments
               if not attach.deleted))


def _BuildCustomFTSFields(issue):
  """Return a list of FTS Fields to index string-valued custom fields."""
  fts_fields = []
  for fv in issue.field_values:
    if fv.str_value:
      # TODO(jrobbins): also indicate which were derived vs. explicit.
      # TODO(jrobbins): also toss in the email addresses of any users in
      # user-valued custom fields, ints for int-valued fields, etc.
      fts_field = search.TextField(
          name='custom_%d' % fv.field_id, value=fv.str_value)
      fts_fields.append(fts_field)

  return fts_fields


def UnindexIssues(issue_ids):
  """Remove many issues from the sharded search indexes."""
  iids_by_shard = {}
  for issue_id in issue_ids:
    shard_id = issue_id % settings.num_logical_shards
    iids_by_shard.setdefault(shard_id, [])
    iids_by_shard[shard_id].append(issue_id)

  for shard_id, iids_in_shard in iids_by_shard.iteritems():
    try:
      logging.info(
          'unindexing %d issue_ids in shard %d', len(iids_in_shard), shard_id)
      search_index = search.Index(
          name=settings.search_index_name_format % shard_id)
      search_index.delete([str(iid) for iid in iids_in_shard])
    except search.Error:
      logging.exception('FTS deletion failed')


def SearchIssueFullText(project_ids, query_ast_conj, shard_id):
  """Do full-text search in GAE FTS.

  Args:
    project_ids: list of project ID numbers to consider.
    query_ast_conj: One conjunctive clause from the AST parsed
        from the user's query.
    shard_id: int shard ID for the shard to consider.

  Returns:
    A pair (issue_ids, capped) where issue_ids is a list of the IDs of
    matching issues, and capped is True if the results were capped due to
    an implementation limitation. Returns (None, False) if the given AST
    conjunction contains no full-text conditions.
  """
  fulltext_query = fulltext_helpers.BuildFTSQuery(
      query_ast_conj, ISSUE_FULLTEXT_FIELDS)
  if fulltext_query is None:
    return None, False

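  # Scope the query to the given projects. In the GAE search query language,
  # space-separated clauses are ANDed by default, so prepending the project
  # clause restricts the fulltext match to those projects' documents.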
  if project_ids:
    project_clause = ' or '.join(
        'project_id:%d' % pid for pid in project_ids)
    fulltext_query = '(%s) %s' % (project_clause, fulltext_query)

  # TODO(jrobbins): it would be good to also include some other
  # structured search terms to narrow down the set of index
  # documents considered. E.g., most queries are only over the
  # open issues.
  logging.info('FTS query is %r', fulltext_query)
  issue_ids = fulltext_helpers.ComprehensiveSearch(
      fulltext_query, settings.search_index_name_format % shard_id)
  capped = len(issue_ids) >= settings.fulltext_limit_per_shard
  return issue_ids, capped