common/dirwalk/walk_nostat.go - Issue 2054763004: luci-go/common/dirwalk: Code for walking a directory tree efficiently

Unified Diff: common/dirwalk/walk_nostat.go

Issue 2054763004: luci-go/common/dirwalk: Code for walking a directory tree efficiently Base URL: https://github.com/luci/luci-go@master

Patch Set: Fixes. Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: common/dirwalk/walk_nostat.go

diff --git a/common/dirwalk/walk_nostat.go b/common/dirwalk/walk_nostat.go

new file mode 100644

index 0000000000000000000000000000000000000000..e5182b09a77d3e5add2dc75e963100590ac7a6dc

--- /dev/null

+++ b/common/dirwalk/walk_nostat.go

@@ -0,0 +1,132 @@

+// Use of this source code is governed under the Apache License, Version 2.0

+// that can be found in the LICENSE file.

+// This function works strangely for performance reasons.

+//

+// File systems have been heavily optimised for doing a directory walk in inode

+// order. It can be an order of magnitude faster to walk the directory in this

+// order so we do. *However*, we want out output to be in sorted so it is

+// deterministic.

+//

+// Calling `stat` is also one of the most expensive things you can do (it is

+// roughly equivalent to reading 64/128k of data). Hence, if you have a lot of

+// small files then just reading their contents directly is more efficient.

+// Rather then doing the stat, we assume everything is a file and just try to

+// read a chunk. If the file is smaller than the block size, we know that we

+// have the entire contents. Otherwise we know the file is bigger and can

+// decide to do the stat. If the name turned out to be a directory, then we

+// will get an error.

+package dirwalk

+import (

+ "io"

+ "os"

+ "path/filepath"

+ "sort"

+type SmallFile struct {

+ name string

+ data []byte

+type SmallFileByName []SmallFile

+func (a SmallFileByName) Len() int { return len(a) }

+func (a SmallFileByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

+func (a SmallFileByName) Less(i, j int) bool { return a[i].name < a[j].name }

+/*

+type LargeFile struct {

+ name string

+type LargeFileByName []LargeFile

+func (a LargeFileByName) Len() int { return len(a) }

+func (a LargeFileByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

+func (a LargeFileByName) Less(i, j int) bool { return a[i].name < a[j].name }

+*/

+type EntryError struct {

+ name string

+ err error

+type EntryErrorByName []EntryError

+func (a EntryErrorByName) Len() int { return len(a) }

+func (a EntryErrorByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

+func (a EntryErrorByName) Less(i, j int) bool { return a[i].name < a[j].name }

+func walkNoStatInternal(base string, files []string, smallfile_limit int64, obs WalkObserver) {

+ var errors []EntryError

+ var smallfiles []SmallFile

+ var largefiles []string

+ var dirs []EntryError

+ for _, name := range files {

+ fname := filepath.Join(base, name)

+ file, err := os.Open(fname)

+ if err != nil {

+ errors = append(errors, EntryError{fname, err})

+ continue

+ }

+ block := make([]byte, smallfile_limit)

+ count, err := file.Read(block)

+ if err != io.EOF && err != nil {

+ // Its probably a directory

+ dirs = append(dirs, EntryError{fname, err})

+ continue

+ }

+ // This file was bigger than the block size, stat it

+ if int64(count) == smallfile_limit {

+ /*

+ stat, err := file.Stat()

+ if err != nil {

+ errors = append(errors, EntryError{fname, err})

+ continue

+ }

+ */

+ largefiles = append(largefiles, fname) //LargeFile{name: fname, stat: &stat})

+ // This file was smaller than the block size

+ } else {

+ smallfiles = append(smallfiles, SmallFile{name: fname, data: block[:count]})

+ }

+ file.Close()

+ }

+ sort.Sort(SmallFileByName(smallfiles))

M-A Ruel 2016/09/23 01:48:17 Why pass the file names in order?

+ for _, f := range smallfiles {

+ obs.SmallFile(f.name, f.data)

+ }

+ sort.Strings(largefiles)

+ for _, fname := range largefiles {

+ obs.LargeFile(fname)

+ }

+ sort.Sort(EntryErrorByName(dirs))

+ for _, d := range dirs {

+ file, err := os.Open(d.name)

+ if err != nil {

+ obs.Error(d.name, d.err)

+ continue

+ }

+ names, err := file.Readdirnames(0)

+ if err != nil {

+ obs.Error(d.name, d.err)

+ continue

+ }

+ walkNoStatInternal(d.name, names, smallfile_limit, obs)

+ }

+func WalkNoStat(root string, smallfile_limit int64, obs WalkObserver) {

+ paths := []string{root}

+ walkNoStatInternal("", paths, smallfile_limit, obs)

+ obs.Finished()

« common/dirwalk/tests/tools/walkdir/walkers_phash.go ('K') | « common/dirwalk/walk_basic.go ('k') | common/dirwalk/walk_parallel.go » ('j') | common/dirwalk/walk_parallel.go » ('J')