common/dirwalk/walk_nostat.go - Issue 2054763004: luci-go/common/dirwalk: Code for walking a directory tree efficiently

Side by Side Diff: common/dirwalk/walk_nostat.go

Issue 2054763004: luci-go/common/dirwalk: Code for walking a directory tree efficiently Base URL: https://github.com/luci/luci-go@master

Patch Set: Fixes. Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2016 The LUCI Authors. All rights reserved.

	2 // Use of this source code is governed under the Apache License, Version 2.0

	3 // that can be found in the LICENSE file.

	4

	5 // This function works strangely for performance reasons.

	6 //

	7 // File systems have been heavily optimised for doing a directory walk in inode

	8 // order. It can be an order of magnitude faster to walk the directory in this

	9 // order so we do. However, we want out output to be in sorted so it is

	10 // deterministic.

	11 //

	12 // Calling `stat` is also one of the most expensive things you can do (it is

	13 // roughly equivalent to reading 64/128k of data). Hence, if you have a lot of

	14 // small files then just reading their contents directly is more efficient.

	15 // Rather then doing the stat, we assume everything is a file and just try to

	16 // read a chunk. If the file is smaller than the block size, we know that we

	17 // have the entire contents. Otherwise we know the file is bigger and can

	18 // decide to do the stat. If the name turned out to be a directory, then we

	19 // will get an error.

	20

	21 package dirwalk

	22

	23 import (

	24 "io"

	25 "os"

	26 "path/filepath"

	27 "sort"

	28 )

	29

	30 type SmallFile struct {

	31 name string

	32 data []byte

	33 }

	34 type SmallFileByName []SmallFile

	35

	36 func (a SmallFileByName) Len() int { return len(a) }

	37 func (a SmallFileByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

	38 func (a SmallFileByName) Less(i, j int) bool { return a[i].name < a[j].name }

	39

	40 /*

	41 type LargeFile struct {

	42 name string

	43 }

	44 type LargeFileByName []LargeFile

	45 func (a LargeFileByName) Len() int { return len(a) }

	46 func (a LargeFileByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

	47 func (a LargeFileByName) Less(i, j int) bool { return a[i].name < a[j].name }

	48 */

	49

	50 type EntryError struct {

	51 name string

	52 err error

	53 }

	54 type EntryErrorByName []EntryError

	55

	56 func (a EntryErrorByName) Len() int { return len(a) }

	57 func (a EntryErrorByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

	58 func (a EntryErrorByName) Less(i, j int) bool { return a[i].name < a[j].name }

	59

	60 func walkNoStatInternal(base string, files []string, smallfile_limit int64, obs WalkObserver) {

	61 var errors []EntryError

	62 var smallfiles []SmallFile

	63 var largefiles []string

	64 var dirs []EntryError

	65

	66 for _, name := range files {

	67 fname := filepath.Join(base, name)

	68 file, err := os.Open(fname)

	69

	70 if err != nil {

	71 errors = append(errors, EntryError{fname, err})

	72 continue

	73 }

	74

	75 block := make([]byte, smallfile_limit)

	76 count, err := file.Read(block)

	77 if err != io.EOF && err != nil {

	78 // Its probably a directory

	79 dirs = append(dirs, EntryError{fname, err})

	80 continue

	81 }

	82

	83 // This file was bigger than the block size, stat it

	84 if int64(count) == smallfile_limit {

	85 /*

	86 stat, err := file.Stat()

	87 if err != nil {

	88 errors = append(errors, EntryError{fname , err})

	89 continue

	90 }

	91 */

	92 largefiles = append(largefiles, fname) //LargeFile{name: fname, stat: &stat})

	93

	94 // This file was smaller than the block size

	95 } else {

	96 smallfiles = append(smallfiles, SmallFile{name: fname, d ata: block[:count]})

	97 }

	98 file.Close()

	99 }

	100

	101 sort.Sort(SmallFileByName(smallfiles))
	M-A Ruel 2016/09/23 01:48:17 Why pass the file names in order? Why pass the file names in order?
	102 for _, f := range smallfiles {

	103 obs.SmallFile(f.name, f.data)

	104 }

	105

	106 sort.Strings(largefiles)

	107 for _, fname := range largefiles {

	108 obs.LargeFile(fname)

	109 }

	110

	111 sort.Sort(EntryErrorByName(dirs))

	112 for _, d := range dirs {

	113 file, err := os.Open(d.name)

	114 if err != nil {

	115 obs.Error(d.name, d.err)

	116 continue

	117 }

	118

	119 names, err := file.Readdirnames(0)

	120 if err != nil {

	121 obs.Error(d.name, d.err)

	122 continue

	123 }

	124 walkNoStatInternal(d.name, names, smallfile_limit, obs)

	125 }

	126 }

	127

	128 func WalkNoStat(root string, smallfile_limit int64, obs WalkObserver) {

	129 paths := []string{root}

	130 walkNoStatInternal("", paths, smallfile_limit, obs)

	131 obs.Finished()

	132 }

OLD	NEW

« common/dirwalk/tests/tools/walkdir/walkers_phash.go ('K') | « common/dirwalk/walk_basic.go ('k') | common/dirwalk/walk_parallel.go » ('j') | common/dirwalk/walk_parallel.go » ('J')