Index: fuzzer/go/frontend/gsloader/gsloader.go |
diff --git a/fuzzer/go/frontend/gsloader/gsloader.go b/fuzzer/go/frontend/gsloader/gsloader.go |
index ef8249bcd29776cffb74bafdb9c2b50718836a22..b8393ae62916ecdfc9ef3229c66661ab19b2a203 100644 |
--- a/fuzzer/go/frontend/gsloader/gsloader.go |
+++ b/fuzzer/go/frontend/gsloader/gsloader.go |
@@ -9,6 +9,7 @@ import ( |
"github.com/skia-dev/glog" |
"go.skia.org/infra/fuzzer/go/common" |
"go.skia.org/infra/fuzzer/go/config" |
+ "go.skia.org/infra/fuzzer/go/deduplicator" |
"go.skia.org/infra/fuzzer/go/frontend/data" |
"go.skia.org/infra/fuzzer/go/fuzzcache" |
"go.skia.org/infra/go/gs" |
@@ -18,7 +19,8 @@ import ( |
// LoadFromBoltDB loads the data.FuzzReport from FuzzReportCache associated with the given hash. |
// The FuzzReport is first put into the staging fuzz cache, and then into the current. |
// If a cache for the commit does not exist, or there are other problems with the retrieval, |
-// an error is returned. |
+// an error is returned. We do not need to deduplicate on extraction because |
+// the fuzzes were deduplicated on storage. |
func LoadFromBoltDB(cache *fuzzcache.FuzzReportCache) error { |
glog.Infof("Looking into cache for revision %s", config.FrontEnd.SkiaVersion.Hash) |
for _, category := range common.FUZZ_CATEGORIES { |
@@ -37,6 +39,7 @@ func LoadFromBoltDB(cache *fuzzcache.FuzzReportCache) error { |
type GSLoader struct { |
storageClient *storage.Client |
Cache *fuzzcache.FuzzReportCache |
+ deduplicator *deduplicator.Deduplicator |
// completedCounter is the number of fuzzes that have been downloaded from GCS, used for logging. |
completedCounter int32 |
@@ -47,6 +50,7 @@ func New(s *storage.Client, c *fuzzcache.FuzzReportCache) *GSLoader { |
return &GSLoader{ |
storageClient: s, |
Cache: c, |
+ deduplicator: deduplicator.New(), |
} |
} |
@@ -58,6 +62,7 @@ func New(s *storage.Client, c *fuzzcache.FuzzReportCache) *GSLoader { |
func (g *GSLoader) LoadFreshFromGoogleStorage() error { |
revision := config.FrontEnd.SkiaVersion.Hash |
data.ClearStaging() |
+ g.deduplicator.Clear() |
fuzzNames := make([]string, 0, 100) |
for _, cat := range common.FUZZ_CATEGORIES { |
badPath := fmt.Sprintf("%s/%s/bad", cat, revision) |
@@ -65,16 +70,23 @@ func (g *GSLoader) LoadFreshFromGoogleStorage() error { |
if err != nil { |
return err |
} |
- n := 0 |
+ b := 0 |
+ d := 0 |
for report := range reports { |
- data.NewFuzzFound(cat, report) |
+ // We always add the fuzzName, to avoid redownloading duplicates over and over again. |
fuzzNames = append(fuzzNames, report.FuzzName) |
- n++ |
+ if g.deduplicator.IsUnique(report) { |
+ data.NewFuzzFound(cat, report) |
+ b++ |
+ } else { |
+ d++ |
+ } |
+ |
} |
- glog.Infof("%d bad fuzzes freshly loaded from gs://%s/%s", n, config.GS.Bucket, badPath) |
+ glog.Infof("%d bad fuzzes (%d duplicate) freshly loaded from gs://%s/%s", b, d, config.GS.Bucket, badPath) |
+ data.StagingToCurrent() |
} |
- data.StagingToCurrent() |
for _, category := range common.FUZZ_CATEGORIES { |
if err := g.Cache.StoreTree(data.StagingCopy(category), category, revision); err != nil { |
glog.Errorf("Problem storing category %s to boltDB: %s", category, err) |
@@ -98,13 +110,19 @@ func (g *GSLoader) LoadBinaryFuzzesFromGoogleStorage(whitelist []string) error { |
if err != nil { |
return err |
} |
- n := 0 |
+ b := 0 |
+ d := 0 |
for report := range reports { |
- data.NewFuzzFound(cat, report) |
+ // We always add the fuzzName, to avoid redownloading duplicates over and over again. |
fuzzNames = append(fuzzNames, report.FuzzName) |
- n++ |
+ if g.deduplicator.IsUnique(report) { |
+ data.NewFuzzFound(cat, report) |
+ b++ |
+ } else { |
+ d++ |
+ } |
} |
- glog.Infof("%d bad fuzzes freshly loaded from gs://%s/%s", n, config.GS.Bucket, badPath) |
+ glog.Infof("%d bad fuzzes (%d duplicate) incrementally loaded from gs://%s/%s", b, d, config.GS.Bucket, badPath) |
} |
data.StagingToCurrent() |