Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(76)

Unified Diff: common/dirwalk/walk_nostat.go

Issue 2054763004: luci-go/common/dirwalk: Code for walking a directory tree efficiently Base URL: https://github.com/luci/luci-go@master
Patch Set: Fixes. Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: common/dirwalk/walk_nostat.go
diff --git a/common/dirwalk/walk_nostat.go b/common/dirwalk/walk_nostat.go
new file mode 100644
index 0000000000000000000000000000000000000000..e5182b09a77d3e5add2dc75e963100590ac7a6dc
--- /dev/null
+++ b/common/dirwalk/walk_nostat.go
@@ -0,0 +1,132 @@
+// Copyright 2016 The LUCI Authors. All rights reserved.
+// Use of this source code is governed under the Apache License, Version 2.0
+// that can be found in the LICENSE file.
+
+// This function works strangely for performance reasons.
+//
+// File systems have been heavily optimised for doing a directory walk in inode
+// order. It can be an order of magnitude faster to walk the directory in this
+// order so we do. *However*, we want out output to be in sorted so it is
+// deterministic.
+//
+// Calling `stat` is also one of the most expensive things you can do (it is
+// roughly equivalent to reading 64/128k of data). Hence, if you have a lot of
+// small files then just reading their contents directly is more efficient.
+// Rather then doing the stat, we assume everything is a file and just try to
+// read a chunk. If the file is smaller than the block size, we know that we
+// have the entire contents. Otherwise we know the file is bigger and can
+// decide to do the stat. If the name turned out to be a directory, then we
+// will get an error.
+
+package dirwalk
+
+import (
+ "io"
+ "os"
+ "path/filepath"
+ "sort"
+)
+
+type SmallFile struct {
+ name string
+ data []byte
+}
+type SmallFileByName []SmallFile
+
+func (a SmallFileByName) Len() int { return len(a) }
+func (a SmallFileByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a SmallFileByName) Less(i, j int) bool { return a[i].name < a[j].name }
+
+/*
+type LargeFile struct {
+ name string
+}
+type LargeFileByName []LargeFile
+func (a LargeFileByName) Len() int { return len(a) }
+func (a LargeFileByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a LargeFileByName) Less(i, j int) bool { return a[i].name < a[j].name }
+*/
+
+type EntryError struct {
+ name string
+ err error
+}
+type EntryErrorByName []EntryError
+
+func (a EntryErrorByName) Len() int { return len(a) }
+func (a EntryErrorByName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
+func (a EntryErrorByName) Less(i, j int) bool { return a[i].name < a[j].name }
+
+func walkNoStatInternal(base string, files []string, smallfile_limit int64, obs WalkObserver) {
+ var errors []EntryError
+ var smallfiles []SmallFile
+ var largefiles []string
+ var dirs []EntryError
+
+ for _, name := range files {
+ fname := filepath.Join(base, name)
+ file, err := os.Open(fname)
+
+ if err != nil {
+ errors = append(errors, EntryError{fname, err})
+ continue
+ }
+
+ block := make([]byte, smallfile_limit)
+ count, err := file.Read(block)
+ if err != io.EOF && err != nil {
+ // Its probably a directory
+ dirs = append(dirs, EntryError{fname, err})
+ continue
+ }
+
+ // This file was bigger than the block size, stat it
+ if int64(count) == smallfile_limit {
+ /*
+ stat, err := file.Stat()
+ if err != nil {
+ errors = append(errors, EntryError{fname, err})
+ continue
+ }
+ */
+ largefiles = append(largefiles, fname) //LargeFile{name: fname, stat: &stat})
+
+ // This file was smaller than the block size
+ } else {
+ smallfiles = append(smallfiles, SmallFile{name: fname, data: block[:count]})
+ }
+ file.Close()
+ }
+
+ sort.Sort(SmallFileByName(smallfiles))
M-A Ruel 2016/09/23 01:48:17 Why pass the file names in order?
+ for _, f := range smallfiles {
+ obs.SmallFile(f.name, f.data)
+ }
+
+ sort.Strings(largefiles)
+ for _, fname := range largefiles {
+ obs.LargeFile(fname)
+ }
+
+ sort.Sort(EntryErrorByName(dirs))
+ for _, d := range dirs {
+ file, err := os.Open(d.name)
+ if err != nil {
+ obs.Error(d.name, d.err)
+ continue
+ }
+
+ names, err := file.Readdirnames(0)
+ if err != nil {
+ obs.Error(d.name, d.err)
+ continue
+ }
+ walkNoStatInternal(d.name, names, smallfile_limit, obs)
+ }
+}
+
+func WalkNoStat(root string, smallfile_limit int64, obs WalkObserver) {
+ paths := []string{root}
+ walkNoStatInternal("", paths, smallfile_limit, obs)
+ obs.Finished()
+}

Powered by Google App Engine
This is Rietveld 408576698