Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(246)

Side by Side Diff: appengine/monorail/services/tracker_fulltext.py

Issue 1868553004: Open Source Monorail (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Rebase Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2016 The Chromium Authors. All rights reserved.
2 # Use of this source code is govered by a BSD-style
3 # license that can be found in the LICENSE file or at
4 # https://developers.google.com/open-source/licenses/bsd
5
6 """A set of functions that provide fulltext search for issues."""
7
8 import collections
9 import logging
10 import time
11
12 from google.appengine.api import search
13
14 import settings
15 from framework import framework_constants
16 from framework import framework_helpers
17 from framework import framework_views
18 from services import fulltext_helpers
19 from tracker import tracker_bizobj
20
21
22 # When updating and re-indexing all issues in a project, work in batches
23 # of this size to manage memory usage and avoid rpc timeouts.
24 _INDEX_BATCH_SIZE = 40
25
26
27 # The user can search for text that occurs specifically in these
28 # parts of an issue.
29 ISSUE_FULLTEXT_FIELDS = ['summary', 'description', 'comment']
30 # Note: issue documents also contain a "metadata" field, but we do not
31 # expose that to users. Issue metadata can be searched in a structured way
32 # by giving a specific field name such as "owner:" or "status:". The metadata
33 # search field exists only for fulltext queries that do not specify any field.
34
35
36 def IndexIssues(cnxn, issues, user_service, issue_service, config_service):
37 """(Re)index all the given issues.
38
39 Args:
40 cnxn: connection to SQL database.
41 issues: list of Issue PBs to index.
42 user_service: interface to user data storage.
43 issue_service: interface to issue data storage.
44 config_service: interface to configuration data storage.
45 """
46 issues = list(issues)
47 config_dict = config_service.GetProjectConfigs(
48 cnxn, {issue.project_id for issue in issues})
49 for start in xrange(0, len(issues), _INDEX_BATCH_SIZE):
50 logging.info('indexing issues: %d remaining', len(issues) - start)
51 _IndexIssueBatch(
52 cnxn, issues[start:start + _INDEX_BATCH_SIZE], user_service,
53 issue_service, config_dict)
54
55
56 def _IndexIssueBatch(cnxn, issues, user_service, issue_service, config_dict):
57 """Internal method to (re)index the given batch of issues.
58
59 Args:
60 cnxn: connection to SQL database.
61 issues: list of Issue PBs to index.
62 user_service: interface to user data storage.
63 issue_service: interface to issue data storage.
64 config_dict: dict {project_id: config} for all the projects that
65 the given issues are in.
66 """
67 user_ids = tracker_bizobj.UsersInvolvedInIssues(issues)
68 comments_dict = issue_service.GetCommentsForIssues(
69 cnxn, [issue.issue_id for issue in issues])
70 for comments in comments_dict.itervalues():
71 user_ids.update([ic.user_id for ic in comments])
72
73 users_by_id = framework_views.MakeAllUserViews(
74 cnxn, user_service, user_ids)
75 _CreateIssueSearchDocuments(issues, comments_dict, users_by_id, config_dict)
76
77
78 def _CreateIssueSearchDocuments(
79 issues, comments_dict, users_by_id, config_dict):
80 """Make the GAE search index documents for the given issue batch.
81
82 Args:
83 issues: list of issues to index.
84 comments_dict: prefetched dictionary of comments on those issues.
85 users_by_id: dictionary {user_id: UserView} so that the email
86 addresses of users who left comments can be found via search.
87 config_dict: dict {project_id: config} for all the projects that
88 the given issues are in.
89 """
90 documents_by_shard = collections.defaultdict(list)
91 for issue in issues:
92 comments = comments_dict.get(issue.issue_id, [])
93 comments = _IndexableComments(comments, users_by_id)
94 summary = issue.summary
95 # TODO(jrobbins): allow search specifically on explicit vs derived
96 # fields.
97 owner_id = tracker_bizobj.GetOwnerId(issue)
98 owner_email = users_by_id[owner_id].email
99 config = config_dict[issue.project_id]
100 component_paths = []
101 for component_id in issue.component_ids:
102 cd = tracker_bizobj.FindComponentDefByID(component_id, config)
103 if cd:
104 component_paths.append(cd.path)
105
106 field_values = [str(tracker_bizobj.GetFieldValue(fv, users_by_id))
107 for fv in issue.field_values]
108 metadata = '%s %s %s %s %s %s' % (
109 tracker_bizobj.GetStatus(issue),
110 owner_email,
111 [users_by_id[cc_id].email for cc_id in
112 tracker_bizobj.GetCcIds(issue)],
113 ' '.join(component_paths),
114 ' '.join(field_values),
115 ' '.join(tracker_bizobj.GetLabels(issue)))
116 assert comments, 'issues should always have at least the description'
117 description = _ExtractCommentText(comments[0], users_by_id)
118 description = description[:framework_constants.MAX_FTS_FIELD_SIZE]
119 all_comments = ' '. join(
120 _ExtractCommentText(c, users_by_id) for c in comments[1:])
121 all_comments = all_comments[:framework_constants.MAX_FTS_FIELD_SIZE]
122
123 custom_fields = _BuildCustomFTSFields(issue)
124 doc = search.Document(
125 doc_id=str(issue.issue_id),
126 fields=[
127 search.NumberField(name='project_id', value=issue.project_id),
128 search.TextField(name='summary', value=summary),
129 search.TextField(name='metadata', value=metadata),
130 search.TextField(name='description', value=description),
131 search.TextField(name='comment', value=all_comments),
132 ] + custom_fields)
133
134 shard_id = issue.issue_id % settings.num_logical_shards
135 documents_by_shard[shard_id].append(doc)
136
137 start_time = time.time()
138 promises = []
139 for shard_id, documents in documents_by_shard.iteritems():
140 if documents:
141 promises.append(framework_helpers.Promise(
142 _IndexDocsInShard, shard_id, documents))
143
144 for promise in promises:
145 promise.WaitAndGetValue()
146
147 logging.info('Finished %d indexing in shards in %d ms',
148 len(documents_by_shard), int((time.time() - start_time) * 1000))
149
150
151 def _IndexableComments(comments, users_by_id):
152 """We only index the comments that are not deleted or banned.
153
154 Args:
155 comments: list of Comment PBs for one issue.
156 users_by_id: Dict of (user_id -> UserView) for all users.
157
158 Returns:
159 A list of comments filtered to not have any deleted comments or
160 comments from banned users. If the issue has a huge number of
161 comments, only a certain number of the first and last comments
162 are actually indexed.
163 """
164 allowed_comments = []
165 for comment in comments:
166 user_view = users_by_id.get(comment.user_id)
167 if not (comment.deleted_by or (user_view and user_view.banned)):
168 allowed_comments.append(comment)
169
170 reasonable_size = (framework_constants.INITIAL_COMMENTS_TO_INDEX +
171 framework_constants.FINAL_COMMENTS_TO_INDEX)
172 if len(allowed_comments) <= reasonable_size:
173 return allowed_comments
174
175 candidates = ( # Prioritize the description and recent comments.
176 allowed_comments[0:1] +
177 allowed_comments[-framework_constants.FINAL_COMMENTS_TO_INDEX:] +
178 allowed_comments[1:framework_constants.INITIAL_COMMENTS_TO_INDEX])
179 total_length = 0
180 result = []
181 for comment in candidates:
182 total_length += len(comment.content)
183 if total_length < framework_constants.MAX_FTS_FIELD_SIZE:
184 result.append(comment)
185
186 return result
187
188
189 def _IndexDocsInShard(shard_id, documents):
190 search_index = search.Index(
191 name=settings.search_index_name_format % shard_id)
192 search_index.put(documents)
193 logging.info('FTS indexed %d docs in shard %d', len(documents), shard_id)
194 # TODO(jrobbins): catch OverQuotaError and add the issues to the
195 # ReindexQueue table instead.
196
197
198 def _ExtractCommentText(comment, users_by_id):
199 """Return a string with all the searchable text of the given Comment PB."""
200 commenter_email = users_by_id[comment.user_id].email
201 return '%s %s %s' % (
202 commenter_email,
203 comment.content,
204 ' '.join(attach.filename
205 for attach in comment.attachments
206 if not attach.deleted))
207
208
209 def _BuildCustomFTSFields(issue):
210 """Return a list of FTS Fields to index string-valued custom fields."""
211 fts_fields = []
212 for fv in issue.field_values:
213 if fv.str_value:
214 # TODO(jrobbins): also indicate which were derived vs. explicit.
215 # TODO(jrobbins): also toss in the email addresses of any users in
216 # user-valued custom fields, ints for int-valued fields, etc.
217 fts_field = search.TextField(
218 name='custom_%d' % fv.field_id, value=fv.str_value)
219 fts_fields.append(fts_field)
220
221 return fts_fields
222
223
224 def UnindexIssues(issue_ids):
225 """Remove many issues from the sharded search indexes."""
226 iids_by_shard = {}
227 for issue_id in issue_ids:
228 shard_id = issue_id % settings.num_logical_shards
229 iids_by_shard.setdefault(shard_id, [])
230 iids_by_shard[shard_id].append(issue_id)
231
232 for shard_id, iids_in_shard in iids_by_shard.iteritems():
233 try:
234 logging.info(
235 'unindexing %r issue_ids in %r', len(iids_in_shard), shard_id)
236 search_index = search.Index(
237 name=settings.search_index_name_format % shard_id)
238 search_index.delete([str(iid) for iid in iids_in_shard])
239 except search.Error:
240 logging.exception('FTS deletion failed')
241
242
243 def SearchIssueFullText(project_ids, query_ast_conj, shard_id):
244 """Do full-text search in GAE FTS.
245
246 Args:
247 project_ids: list of project ID numbers to consider.
248 query_ast_conj: One conjuctive clause from the AST parsed
249 from the user's query.
250 shard_id: int shard ID for the shard to consider.
251
252 Returns:
253 (issue_ids, capped) where issue_ids is a list of issue issue_ids that match
254 the full-text query. And, capped is True if the results were capped due to
255 an implementation limitation. Or, return (None, False) if the given AST
256 conjunction contains no full-text conditions.
257 """
258 fulltext_query = fulltext_helpers.BuildFTSQuery(
259 query_ast_conj, ISSUE_FULLTEXT_FIELDS)
260 if fulltext_query is None:
261 return None, False
262
263 if project_ids:
264 project_clause = ' or '.join(
265 'project_id:%d' % pid for pid in project_ids)
266 fulltext_query = '(%s) %s' % (project_clause, fulltext_query)
267
268 # TODO(jrobbins): it would be good to also include some other
269 # structured search terms to narrow down the set of index
270 # documents considered. E.g., most queries are only over the
271 # open issues.
272 logging.info('FTS query is %r', fulltext_query)
273 issue_ids = fulltext_helpers.ComprehensiveSearch(
274 fulltext_query, settings.search_index_name_format % shard_id)
275 capped = len(issue_ids) >= settings.fulltext_limit_per_shard
276 return issue_ids, capped
OLDNEW
« no previous file with comments | « appengine/monorail/services/test/usergroup_svc_test.py ('k') | appengine/monorail/services/user_svc.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698