third_party/gsutil/gslib/wildcard_iterator.py - Issue 2280023003: depot_tools: Remove third_party/gsutil

Side by Side Diff: third_party/gsutil/gslib/wildcard_iterator.py

Issue 2280023003: depot_tools: Remove third_party/gsutil (Closed)

Patch Set: Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 # Copyright 2010 Google Inc. All Rights Reserved.

2 #

3 # Permission is hereby granted, free of charge, to any person obtaining a

4 # copy of this software and associated documentation files (the

5 # "Software"), to deal in the Software without restriction, including

6 # without limitation the rights to use, copy, modify, merge, publish, dis-

7 # tribute, sublicense, and/or sell copies of the Software, and to permit

8 # persons to whom the Software is furnished to do so, subject to the fol-

9 # lowing conditions:

10 #

11 # The above copyright notice and this permission notice shall be included

12 # in all copies or substantial portions of the Software.

13 #

14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS

15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-

16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT

17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,

18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS

20 # IN THE SOFTWARE.

21

22 """Implementation of wildcarding over StorageUris.

23

24 StorageUri is an abstraction that Google introduced in the boto library,

25 for representing storage provider-independent bucket and object names with

26 a shorthand URI-like syntax (see boto/boto/storage_uri.py) The current

27 class provides wildcarding support for StorageUri objects (including both

28 bucket and file system objects), allowing one to express collections of

29 objects with syntax like the following:

30 gs://mybucket/images/*.png

31 file:///tmp/???abc???

32

33 We provide wildcarding support as part of gsutil rather than as part

34 of boto because wildcarding is really part of shell command-like

35 functionality.

36

37 A comment about wildcard semantics: We support both single path component

38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both

39 file and cloud URIs. For example,

40 gs://bucket/doc/*/*.html

41 would enumerate HTML files one directory down from gs://bucket/doc, while

42 gs://bucket/**/*.html

43 would enumerate HTML files in all objects contained in the bucket.

44

45 Note also that if you use file system wildcards it's likely your shell

46 interprets the wildcarding before passing the command to gsutil. For example:

47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse

48 would likely be expanded by the shell into the following before running gsutil:

49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse

50

51 Note also that most shells don't support '**' wildcarding (I think only

52 zsh does). If you want to use '**' wildcarding with such a shell you can

53 single quote each wildcarded string, so it gets passed uninterpreted by the

54 shell to gsutil (at which point gsutil will perform the wildcarding expansion):

55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse

56 """

57

58 import boto

59 import fnmatch

60 import glob

61 import os

62 import re

63 import sys

64 import urllib

65

66 from boto.s3.prefix import Prefix

67 from boto.storage_uri import BucketStorageUri

68 from bucket_listing_ref import BucketListingRef

69

# Regex to determine if a string contains any wildcards: '*', '?', '[' or ']'.
# A raw string is used so the backslashes reach the regex engine unchanged
# instead of relying on Python passing unknown string escapes through.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')

# Identifiers passed to ProjectIdHandler.FillInProjectHeaderIfNeeded() to say
# which kind of wildcard expansion is being performed.
WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'

75

76

class WildcardIterator(object):
  """Base class for wildcarding over StorageUris.

  This class implements support for iterating over StorageUris that
  contain wildcards.

  The base class is abstract; you should instantiate using the module-level
  wildcard_iterator() factory function, which chooses the right
  implementation depending on the StorageUri.

  Subclasses are expected to set self.wildcard_uri, which __repr__ reads.
  """

  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    return 'WildcardIterator(%s)' % self.wildcard_uri

91

92

class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets and objects.

  Iterates over BucketListingRef matching the StorageUri wildcard. It's
  much more efficient to request the Key from the BucketListingRef (via
  GetKey()) than to request the StorageUri and then call uri.get_key()
  to retrieve the key, for cases where you want to get metadata that's
  available in the Bucket (for example to get the name and size of
  each object), because that information is available in the bucket GET
  results. If you were to iterate over URIs for such cases and then get
  the name and size info from each resulting StorageUri, it would cause
  an additional object GET request for each of the result URIs.
  """

  def __init__(self, wildcard_uri, proj_id_handler,
               bucket_storage_uri_class=BucketStorageUri, all_versions=False,
               headers=None, debug=0):
    """Instantiates an iterator over BucketListingRef matching a wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      proj_id_handler: ProjectIdHandler to use for current command.
      bucket_storage_uri_class: BucketStorageUri interface.
                                Settable for testing/mocking.
      all_versions: Passed through to list_bucket(). When false, object
                    results are yielded wrapped around a URI rebuilt from
                    the object name only (a version-less URI).
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    # Make a copy of the headers so any updates we make during wildcard
    # expansion aren't left in the input params (specifically, so we don't
    # include the x-goog-project-id header needed by a subset of cases, in
    # the data returned to caller, which could then be used in other cases
    # where that header must not be passed).
    if headers is None:
      self.headers = {}
    else:
      self.headers = headers.copy()
    self.proj_id_handler = proj_id_handler
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.all_versions = all_versions
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
      bucket_regex = re.compile(
          fnmatch.translate(self.wildcard_uri.bucket_name))
      bucket_uris = []
      self.proj_id_handler.FillInProjectHeaderIfNeeded(
          WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
      for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
        if bucket_regex.match(b.name):
          # Use str(b.name) because get_all_buckets() returns Unicode
          # string, which when used to construct x-goog-copy-src metadata
          # requests for object-to-object copies causes pathname '/' chars
          # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
          # which causes the request to fail.
          uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                 urllib.quote_plus(str(b.name)))
          bucket_uris.append(
              boto.storage_uri(
                  uri_str, debug=self.debug,
                  bucket_storage_uri_class=self.bucket_storage_uri_class,
                  suppress_consec_slashes=False))
    else:
      bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(
        WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
    for bucket_uri in bucket_uris:
      if self.wildcard_uri.names_bucket():
        # Bucket-only URI.
        yield BucketListingRef(bucket_uri, key=None, prefix=None,
                               headers=self.headers)
        continue

      # URI contains an object name. If there's no wildcard just yield
      # the needed URI.
      if not ContainsWildcard(self.wildcard_uri.object_name):
        uri_to_yield = bucket_uri.clone_replace_name(
            self.wildcard_uri.object_name)
        yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                               headers=self.headers)
        continue

      # URI contains a wildcard. Expand iteratively by building
      # prefix/delimiter bucket listing request, filtering the results per
      # the current level's wildcard, and continuing with the next component
      # of the wildcard. See _BuildBucketFilterStrings() documentation
      # for details.
      #
      # Initialize the iteration with bucket name from bucket_uri but
      # object name from self.wildcard_uri. This is needed to handle cases
      # where both the bucket and object names contain wildcards.
      uris_needing_expansion = [
          bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
      while uris_needing_expansion:
        # Pop from the front so URIs are expanded in the order discovered.
        uri = uris_needing_expansion.pop(0)
        (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
            self._BuildBucketFilterStrings(uri.object_name))
        prog = re.compile(fnmatch.translate(prefix_wildcard))
        # List bucket for objects matching prefix up to delimiter.
        for key in bucket_uri.list_bucket(prefix=prefix,
                                          delimiter=delimiter,
                                          headers=self.headers,
                                          all_versions=self.all_versions):
          # Check that the prefix regex matches rstripped key.name (to
          # correspond with the rstripped prefix_wildcard from
          # _BuildBucketFilterStrings()).
          stripped_name = key.name.rstrip('/')
          if not prog.match(stripped_name):
            continue
          if suffix_wildcard and stripped_name != suffix_wildcard:
            if isinstance(key, Prefix):
              # There's more wildcard left to expand.
              uris_needing_expansion.append(
                  uri.clone_replace_name(stripped_name + '/'
                                         + suffix_wildcard))
            # Non-Prefix results at this level cannot match the remaining
            # wildcard components, so they are dropped.
            continue

          # Done expanding.
          expanded_uri = uri.clone_replace_key(key)
          if isinstance(key, Prefix):
            yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                   headers=self.headers)
          elif self.all_versions:
            yield BucketListingRef(expanded_uri, key=key, prefix=None,
                                   headers=self.headers)
          else:
            # Yield BLR wrapping version-less URI.
            yield BucketListingRef(
                expanded_uri.clone_replace_name(expanded_uri.object_name),
                key=key, prefix=None, headers=self.headers)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings for querying a bucket and filtering the results.

    Together these implement wildcard object name matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request (None when the
            wildcard starts with a wildcard character).
        delimiter is the delimiter to be sent in bucket GET request (None
            when prefix_wildcard contains the recursive '**' wildcard).
        prefix_wildcard is the wildcard to be used to filter bucket GET
            results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
            results for next wildcard expansion iteration.
      For example, given the object-name wildcard abc/d*e/f*.txt (from
      gs://bucket/abc/d*e/f*.txt) we would build prefix='abc/d',
      delimiter='/', prefix_wildcard='abc/d*e', and suffix_wildcard='f*.txt'.
      Using this prefix and delimiter for a bucket listing request will then
      produce a listing result set that can be filtered using this
      prefix_wildcard; and we'd use this suffix_wildcard to feed into the
      next call(s) to _BuildBucketFilterStrings(), for the next iteration of
      listing/filtering.

      If the input contains no wildcard characters it is returned unchanged
      as both prefix and prefix_wildcard, with delimiter '/' and an empty
      suffix_wildcard.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      # Keep only the path component that holds the first wildcard char.
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
      # Everything after the next '/' following the first wildcard char is
      # left for later expansion iterations.
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard contains
      # '**' don't send a delimiter, and combine suffix_wildcard at end of
      # prefix_wildcard.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard = prefix_wildcard + suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def IterKeys(self):
    """Runs the underlying iterator and yields the Key of each result.

    Results whose HasKey() is false are skipped.

    Yields:
      Subclass of boto.s3.key.Key, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetKey()

  def IterUris(self):
    """Runs the underlying iterator and yields the StorageUri of each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()

  def IterUrisForKeys(self):
    """Runs the underlying iterator and yields URIs of results that have a Key.

    Yields:
      StorageUri for each iterated BucketListingRef whose HasKey() is true,
      or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetUri()

351

352

class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_uri, headers=None, debug=0):
    """Instantiates an iterator over BucketListingRefs matching a wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    self.headers = headers
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over file wildcard.

    Yields:
      BucketListingRef wrapping a URI for each matching file path, or empty
      iterator if no matches.

    Raises:
      WildcardException: if the wildcard has more than 2 consecutive '*'s.
    """
    wildcard = self.wildcard_uri.object_name
    # Look for the literal two-character recursive wildcard '**'.
    match = re.search(r'\*\*', wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      leading = wildcard[:match.start()]
      # Drop the single character (normally the path separator) preceding
      # '**'. If that would leave nothing, keep what precedes '**' as-is
      # (e.g. a bare root '/'), and walk the current directory when the
      # wildcard starts with '**'.
      base_dir = leading[:-1] or leading or os.curdir
      remaining_wildcard = wildcard[match.start()+2:]
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = []
      for dirpath, unused_dirnames, filenames in os.walk(base_dir):
        # The remaining wildcard is matched against bare file names only.
        filepaths.extend(
            os.path.join(dirpath, f)
            for f in fnmatch.filter(filenames, remaining_wildcard))
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.glob(wildcard)
    for filepath in filepaths:
      expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
      yield BucketListingRef(expanded_uri)

  def IterKeys(self):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Keys not possible for file wildcards')

  def IterUris(self):
    """Runs the underlying iterator and yields the StorageUri of each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()

429

430

class WildcardException(StandardError):
  """Exception thrown for invalid wildcard URIs.

  The human-readable explanation is available as the `reason` attribute and
  is included in both str() and repr() of the exception.
  """

  def __init__(self, reason):
    """Stores the reason and forwards it to the base exception.

    Args:
      reason: Text describing why the wildcard operation failed.
    """
    # Pass reason to the base class so that self.args is populated too.
    StandardError.__init__(self, reason)
    self.reason = reason

  def __repr__(self):
    return 'WildcardException: %s' % self.reason

  def __str__(self):
    return 'WildcardException: %s' % self.reason

443

444

def wildcard_iterator(uri_or_str, proj_id_handler,
                      bucket_storage_uri_class=BucketStorageUri,
                      all_versions=False,
                      headers=None, debug=0):
  """Instantiate a WildcardIterator for the given StorageUri.

  Args:
    uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
    proj_id_handler: ProjectIdHandler to use for current command.
    bucket_storage_uri_class: BucketStorageUri interface.
                              Settable for testing/mocking.
    all_versions: Forwarded to CloudWildcardIterator; not used for file URIs.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).

  Returns:
    A WildcardIterator that handles the requested iteration.

  Raises:
    WildcardException: if the URI is neither a cloud URI nor a file URI.
  """
  if isinstance(uri_or_str, basestring):
    # Disable enforce_bucket_naming, to allow bucket names containing wildcard
    # chars.
    uri = boto.storage_uri(
        uri_or_str, debug=debug, validate=False,
        bucket_storage_uri_class=bucket_storage_uri_class,
        suppress_consec_slashes=False)
  else:
    uri = uri_or_str

  if uri.is_cloud_uri():
    return CloudWildcardIterator(
        uri, proj_id_handler,
        bucket_storage_uri_class=bucket_storage_uri_class,
        all_versions=all_versions,
        headers=headers,
        debug=debug)
  if uri.is_file_uri():
    return FileWildcardIterator(uri, headers=headers, debug=debug)
  raise WildcardException('Unexpected type of StorageUri (%s)' % uri)

484

485

def ContainsWildcard(uri_or_str):
  """Checks whether uri_or_str contains a wildcard.

  Args:
    uri_or_str: StorageUri or URI string to check. For a non-string argument
                its `uri` attribute is the text that gets searched.

  Returns:
    bool indicator: True if any of the characters matched by WILDCARD_REGEX
    occurs in the text.
  """
  if isinstance(uri_or_str, basestring):
    text = uri_or_str
  else:
    text = uri_or_str.uri
  return bool(WILDCARD_REGEX.search(text))

OLD	NEW

« no previous file with comments | « third_party/gsutil/gslib/util.py ('k') | third_party/gsutil/gsutil » ('j') | no next file with comments »