Chromium Code Reviews

Unified Diff: perf/server/data.go

Issue 335833002: Start loading the BigQuery data and serving it to the UI. (Closed) Base URL: https://skia.googlesource.com/buildbot.git@master
Patch Set: y Created 6 years, 6 months ago
Index: perf/server/data.go
diff --git a/perf/server/data.go b/perf/server/data.go
new file mode 100644
index 0000000000000000000000000000000000000000..404332e0b0c5adee8bbc542a66d6f9d9f44ceb91
--- /dev/null
+++ b/perf/server/data.go
@@ -0,0 +1,505 @@
+// Copyright (c) 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be found
+// in the LICENSE file.
+
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "net/http"
+ "os/exec"
+ "reflect"
+ "strconv"
+ "strings"
+ "time"
+)
+
+import (
+ "code.google.com/p/goauth2/oauth"
+ "code.google.com/p/google-api-go-client/bigquery/v2"
+ "github.com/oxtoacart/webbrowser"
+)
+
+const (
+ // The missing data sentinel must be a valid float64 that survives JSON
+ // encoding; JSON has no representation for +Inf, -Inf, or NaN, so a large
+ // finite value is used instead.
+ MISSING_DATA_SENTINEL = 1e100
mtklein 2014/06/15 15:12:10 Cute. Go has +Inf, -Inf, and NaN too I think.
kelvinly 2014/06/15 15:49:26 Tested those, Go doesn't seem to marshal them correctly.
jcgregorio 2014/06/15 16:25:39 Go may have +-Inf and NaN, but JSON doesn't, thus the sentinel value.
jcgregorio 2014/06/16 02:46:25 Added a note explaining the requirements for the sentinel value.
+)
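The constraint discussed in the thread above is easy to verify: encoding/json refuses the IEEE-754 special values, which is what forces a large-but-finite sentinel. A minimal standalone sketch (not part of this CL):

    package main

    import (
        "encoding/json"
        "fmt"
        "math"
    )

    func main() {
        // JSON has no representation for +Inf, -Inf, or NaN, so Marshal
        // returns an UnsupportedValueError for them.
        if _, err := json.Marshal(math.Inf(1)); err != nil {
            fmt.Println(err) // json: unsupported value: +Inf
        }
        if _, err := json.Marshal(math.NaN()); err != nil {
            fmt.Println(err) // json: unsupported value: NaN
        }
        // A large finite sentinel round-trips cleanly.
        b, _ := json.Marshal(1e100)
        fmt.Println(string(b)) // 1e+100
    }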
+
+// Shouldn't need auth when running from GCE, but will need it for local dev.
+var config = &oauth.Config{
+ ClientId: "470362608618-nlbqngfl87f4b3mhqqe9ojgaoe11vrld.apps.googleusercontent.com",
+ ClientSecret: "J4YCkfMXFJISGyuBuVEiH60T",
mtklein 2014/06/15 15:12:12 Um... secret?
kelvinly 2014/06/15 15:49:26 Time to get new keys!
jcgregorio 2014/06/16 02:46:24 In a future CL I will move this to reading the client secret from a file that isn't checked in.
+ Scope: bigquery.BigqueryScope,
+ AuthURL: "https://accounts.google.com/o/oauth2/auth",
+ TokenURL: "https://accounts.google.com/o/oauth2/token",
+ RedirectURL: "urn:ietf:wg:oauth:2.0:oob",
+ TokenCache: oauth.CacheFile("bqtoken.data"),
+}
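Following up on the secret-in-source thread above, a sketch of how the client secret could instead be read from a file kept out of version control; the helper and file name are hypothetical, and "io/ioutil" would need to be added to the imports ("strings" is already imported):

    // clientSecretFromFile is a hypothetical helper, not part of this CL.
    func clientSecretFromFile(filename string) (string, error) {
        b, err := ioutil.ReadFile(filename)
        if err != nil {
            return "", err
        }
        return strings.TrimSpace(string(b)), nil
    }

    // Usage, e.g. at startup:
    //   config.ClientSecret, err = clientSecretFromFile("client_secret.txt")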
+
+// runFlow runs through a 3LO OAuth 2.0 flow to get credentials for BigQuery.
+func runFlow(config *oauth.Config) (*http.Client, error) {
+ transport := &oauth.Transport{Config: config}
+ if _, err := config.TokenCache.Token(); err != nil {
+ url := config.AuthCodeURL("")
+ fmt.Printf(`Your browser has been opened to visit:
+
+ %s
+
+Enter the verification code:`, url)
+ webbrowser.Open(url)
+ var code string
+ fmt.Scan(&code)
+ if _, err := transport.Exchange(code); err != nil {
+ return nil, err
+ }
+ }
+
+ return transport.Client(), nil
+}
+
+// Trace represents all the values of a single measurement over time.
+type Trace struct {
+ Key string `json:"key"`
+ Values []float64 `json:"values"`
+ Params map[string]string `json:"params"`
+ Trybot bool `json:"trybot"`
+}
+
+// NewTrace allocates a new Trace set up for the given number of samples.
+//
+// The Trace Values are pre-filled with the missing data sentinel since not
+// all tests will be run on all commits.
+func NewTrace(numSamples int) *Trace {
+ t := &Trace{
+ Values: make([]float64, numSamples),
+ Params: make(map[string]string),
+ Trybot: false,
+ }
+ for i := range t.Values {
+ t.Values[i] = MISSING_DATA_SENTINEL
+ }
+ return t
+}
+
+// Annotations for commits.
+//
+// ID and Type map to rows in the MySQL database where annotations are stored.
+type Annotation struct {
+ ID int `json:"id"`
mtklein 2014/06/15 15:12:10 Might help to explain ID and Type here a bit.
jcgregorio 2014/06/16 02:46:24 Added note about this mapping to the MySQL database.
+ Notes string `json:"notes"`
+ Author string `json:"author"`
mtklein 2014/06/15 15:12:10 This is author-of-annotation right? Not author-of-commit?
jcgregorio 2014/06/16 02:46:25 Same as above.
+ Type int `json:"type"`
+}
+
+// Commit is information about each Git commit.
+type Commit struct {
+ CommitTime time.Time `json:"commit_time"`
+ Hash string `json:"hash"`
+ GitNumber int `json:"git_number"`
+ CommitMessage string `json:"commit_msg"`
+ Annotations []Annotation `json:"annotations,omitempty"`
+}
+
+// Choices is the list of possible values for a single param, such as the
+// gpu a test was run against.
+type Choices []string
mtklein 2014/06/15 15:12:10 Explain this a bit?
jcgregorio 2014/06/16 02:46:25 Done.
+
+// AllData is the top level struct we return via JSON to the UI.
+//
+// The length of the Commits array is the same length as all of the Values
+// arrays in all of the Traces.
+type AllData struct {
+ Traces []Trace `json:"traces"`
+ ParamSet map[string]Choices `json:"param_set"`
+ Commits []Commit `json:"commits"`
+}
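To make the wire format concrete, a throwaway snippet (runnable inside this package, since fmt, encoding/json, and time are already imported) that marshals a minimal AllData; every key, param, and hash below is made up for illustration:

    all := AllData{
        Traces: []Trace{{
            Key:    "desk_amazon.skp:GTX660", // illustrative key only
            Values: []float64{MISSING_DATA_SENTINEL, 0.0123},
            Params: map[string]string{"benchName": "desk_amazon.skp"},
        }},
        ParamSet: map[string]Choices{"benchName": {"desk_amazon.skp"}},
        // Two commits, matching the two entries in each Values array.
        Commits: []Commit{
            {Hash: "aaaa1111", CommitTime: time.Unix(1402790973, 0)},
            {Hash: "bbbb2222", CommitTime: time.Unix(1402877373, 0)},
        },
    }
    b, _ := json.MarshalIndent(all, "", "  ")
    fmt.Println(string(b)) // prints the indented JSON payload the UI receives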
+
+// gitCommitsWithTestData returns the list of commits that have perf data
+// associated with them.
+//
+// Not all commits will have perf data; the builders don't necessarily run
+// for each commit.
+func gitCommitsWithTestData(service *bigquery.Service) (map[string]bool, error) {
+ query := `
+SELECT
+ gitHash
+FROM
+ (TABLE_DATE_RANGE(perf_skps_v2.skpbench,
+ DATE_ADD(CURRENT_TIMESTAMP(),
+ -2,
+ 'DAY'),
+ CURRENT_TIMESTAMP()))
+GROUP BY
+ gitHash;
+ `
+ iter, err := NewRowIter(service, query)
+ if err != nil {
+ return nil, fmt.Errorf("Failed to query for the Git hashes used: %s", err)
+ }
+
+ hashes := make(map[string]bool)
+ for iter.Next() {
+ h := &struct {
+ Hash string `bq:"gitHash"`
mtklein 2014/06/15 15:12:11 Neat.
+ }{}
+ err := iter.Decode(h)
+ if err != nil {
+ return nil, fmt.Errorf("Failed reading hashes from BigQuery: %s", err)
+ }
+ hashes[h.Hash] = true
+ }
+ return hashes, nil
+}
+
+// GitHash represents information on a single Git commit.
mtklein 2014/06/15 15:12:11 Seems like a subset of Commit above? TimeStamp == CommitTime?
jcgregorio 2014/06/16 02:46:24 Yeah, but this code may be changing soon, so I'm leaving the duplication for now.
+type GitHash struct {
+ Hash string
+ TimeStamp time.Time
+}
+
+// readCommitsFromGit reads the commit history from a Git repository.
+func readCommitsFromGit(dir string) ([]GitHash, error) {
+ cmd := exec.Command("git", strings.Split("log --format=%H%x20%ci", " ")...)
+ cmd.Dir = dir
+ b, err := cmd.Output()
+ if err != nil {
+ return nil, fmt.Errorf("Failed to run Git: %s", err)
+ }
+ lines := strings.Split(string(b), "\n")
+ hashes := make([]GitHash, 0, len(lines))
+ for _, line := range lines {
+ parts := strings.SplitN(line, " ", 2)
+ if len(parts) == 2 {
+ t, err := time.Parse("2006-01-02 15:04:05 -0700", parts[1])
+ if err != nil {
+ return nil, fmt.Errorf("Failed parsing Git log timestamp: %s", err)
+ }
+ hashes = append(hashes, GitHash{Hash: parts[0], TimeStamp: t})
+ }
+ }
+ return hashes, nil
+}
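Since --format=%H%x20%ci emits one `<hash> <committer date>` pair per line, the reference layout passed to time.Parse above has to mirror that date shape. A standalone sketch of the parsing step (hash and date are made up):

    package main

    import (
        "fmt"
        "strings"
        "time"
    )

    func main() {
        line := "da39a3ee5e6b4b0d3255bfef95601890afd80709 2014-06-14 18:09:33 -0400"
        parts := strings.SplitN(line, " ", 2)
        // Go layouts are written in terms of the reference time
        // Mon Jan 2 15:04:05 -0700 MST 2006.
        t, err := time.Parse("2006-01-02 15:04:05 -0700", parts[1])
        if err != nil {
            panic(err)
        }
        fmt.Println(parts[0], t.UTC()) // ... 2014-06-14 22:09:33 +0000 UTC
    }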
+
+// RowIter is a utility for reading data from a BigQuery query response.
+//
+// RowIter will iterate over all the results, even if they span more than one
+// page of results. It automatically uses page tokens to iterate over all the
+// pages to retrieve all results.
+type RowIter struct {
+ response *bigquery.GetQueryResultsResponse
+ jobId string
+ service *bigquery.Service
+ nextPageToken string
+ row int
+}
+
+// poll waits until the BigQuery job is complete and then loads the current
+// page of results.
+func (r *RowIter) poll() error {
mtklein 2014/06/15 15:12:10 Odd that bigquery doesn't provide this sort of thing.
kelvinly 2014/06/15 15:49:26 I think the Go library was autogenerated from some API description.
jcgregorio 2014/06/16 02:46:25 Yeah, that is true, but I was able to provide paging on top of what it generates.
+ var queryResponse *bigquery.GetQueryResultsResponse
+ for {
+ var err error
+ queryCall := r.service.Jobs.GetQueryResults("google.com:chrome-skia", r.jobId)
+ if r.nextPageToken != "" {
+ queryCall.PageToken(r.nextPageToken)
+ }
+ queryResponse, err = queryCall.Do()
+ if err != nil {
+ return err
+ }
+ if queryResponse.JobComplete {
+ break
+ }
+ time.Sleep(time.Second)
+ }
+ r.nextPageToken = queryResponse.PageToken
+ r.response = queryResponse
+ return nil
+}
+
+// NewRowIter starts a query and returns a RowIter for iterating through the
+// results.
+func NewRowIter(service *bigquery.Service, query string) (*RowIter, error) {
+ job := &bigquery.Job{
+ Configuration: &bigquery.JobConfiguration{
mtklein 2014/06/15 15:12:11 Can't you elide the nested type names in this sort of literal?
jcgregorio 2014/06/16 02:46:25 AFAIK only if I depend on the order and supply all the fields.
+ Query: &bigquery.JobConfigurationQuery{
+ Query: query,
+ },
+ },
+ }
+ jobResponse, err := service.Jobs.Insert("google.com:chrome-skia", job).Do()
+ if err != nil {
+ return nil, err
+ }
+
+ r := &RowIter{
+ jobId: jobResponse.JobReference.JobId,
+ service: service,
+ row: -1, // Start at -1 so the first call to Next() puts us at the 0th Row.
+ }
+ return r, r.poll()
+}
+
+// Next moves to the next row in the response and returns true as long as data
+// is available, returning false when the end of the results is reached.
+//
+// Calling Next() the first time actually points the iterator at the first row,
+// which makes it possible to use Next in a for loop:
+//
+// for iter.Next() { ... }
+//
+func (r *RowIter) Next() bool {
+ r.row++
+ if r.row >= len(r.response.Rows) {
+ if r.nextPageToken == "" {
+ return false
+ }
+ // Don't advance into a stale page if fetching the next one failed.
+ if err := r.poll(); err != nil {
+ return false
+ }
+ r.row = 0
+ return len(r.response.Rows) > 0
+ }
+ return true
+}
+
+// DecodeParams pulls all the values in the params record out as a map[string]string.
+//
+// The schema for each table has a nested record called 'params' that contains
+// various axes along which queries could be built, such as the gpu the test was
+// run against. Pull out the entire record as a generic map[string]string.
+func (r *RowIter) DecodeParams() map[string]string {
+ row := r.response.Rows[r.row]
+ schema := r.response.Schema
+ params := map[string]string{}
+ for i, cell := range row.F {
+ if cell.V != nil {
+ name := schema.Fields[i].Name
+ if strings.HasPrefix(name, "params_") {
+ params[strings.TrimPrefix(name, "params_")] = cell.V.(string)
+ }
+ }
+ }
+ return params
+}
+
+// Decode uses struct tags to decode a single row into a struct.
+//
+// For example, given a struct:
+//
+// type A struct {
+// Name string `bq:"name"`
+// Value float64 `bq:"measurement"`
+// }
+//
+// and a BigQuery table that contains two columns named "name" and
+// "measurement", calling Decode as follows parses the column values for
+// "name" and "measurement" and places them in the Name and Value fields
+// respectively:
+//
+// a = &A{}
+// iter.Decode(a)
+//
+// Implementation Details:
+//
+// If a tag names a column that doesn't exist, the field is merely ignored,
+// i.e. it is left unchanged from when it was passed into Decode.
+//
+// Not all columns need to be tagged in the struct.
+//
+// The decoder doesn't handle nested structs, only the top level fields are decoded.
+//
+// The decoder only handles struct fields of type string, int, int32, int64,
+// float, float32 and float64.
+func (r *RowIter) Decode(s interface{}) error {
+ row := r.response.Rows[r.row]
+ schema := r.response.Schema
+ // Collapse the data in the row into a map[string]string.
+ rowMap := map[string]string{}
+ for i, cell := range row.F {
+ if cell.V != nil {
+ rowMap[schema.Fields[i].Name] = cell.V.(string)
+ }
+ }
+
+ // Then iterate over the fields of 's' and set them from the row data.
+ sv := reflect.ValueOf(s).Elem()
+ st := sv.Type()
+ for i := 0; i < sv.NumField(); i++ {
+ columnName := st.Field(i).Tag.Get("bq")
+ if columnValue, ok := rowMap[columnName]; ok {
+ switch sv.Field(i).Kind() {
+ case reflect.String:
+ sv.Field(i).SetString(columnValue)
+ case reflect.Float32, reflect.Float64:
+ f, err := strconv.ParseFloat(columnValue, 64)
+ if err != nil {
+ return err
+ }
+ sv.Field(i).SetFloat(f)
+ case reflect.Int32, reflect.Int64:
+ parsedInt, err := strconv.ParseInt(columnValue, 10, 64)
+ if err != nil {
+ return err
+ }
+ sv.Field(i).SetInt(parsedInt)
+ default:
+ return fmt.Errorf("can't decode into field of type: %s", sv.Field(i).Kind())
+ }
+ }
+ }
+ return nil
+}
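The decoder above is ordinary reflect plumbing; a self-contained toy that exercises the same tag-driven pattern against a hand-built row map, with no BigQuery involved (the struct and values are invented for the example):

    package main

    import (
        "fmt"
        "reflect"
        "strconv"
    )

    type sample struct {
        Name  string  `bq:"name"`
        Value float64 `bq:"measurement"`
    }

    // decodeRow sets tagged fields of *s from column-name -> string-value pairs.
    func decodeRow(rowMap map[string]string, s interface{}) error {
        sv := reflect.ValueOf(s).Elem()
        st := sv.Type()
        for i := 0; i < sv.NumField(); i++ {
            v, ok := rowMap[st.Field(i).Tag.Get("bq")]
            if !ok {
                continue // untagged or missing columns are left unchanged
            }
            switch sv.Field(i).Kind() {
            case reflect.String:
                sv.Field(i).SetString(v)
            case reflect.Float32, reflect.Float64:
                f, err := strconv.ParseFloat(v, 64)
                if err != nil {
                    return err
                }
                sv.Field(i).SetFloat(f)
            }
        }
        return nil
    }

    func main() {
        s := &sample{}
        _ = decodeRow(map[string]string{"name": "desk_amazon.skp", "measurement": "0.0123"}, s)
        fmt.Printf("%+v\n", s) // &{Name:desk_amazon.skp Value:0.0123}
    }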
+
+// populateTraces reads the measurement data from BigQuery and populates the Traces.
+func populateTraces(service *bigquery.Service, all *AllData, hashToIndex map[string]int, numSamples int) error {
+ type Measurement struct {
+ Value float64 `bq:"value"`
+ Key string `bq:"key"`
+ Hash string `bq:"gitHash"`
+ }
+
+ // Now query the actual samples.
+ query := `
+ SELECT
+ *
+ FROM
+ (TABLE_DATE_RANGE(perf_skps_v2.skpbench,
+ DATE_ADD(CURRENT_TIMESTAMP(),
+ -2,
+ 'DAY'),
+ CURRENT_TIMESTAMP()))
+ WHERE
+ params.benchName="tabl_worldjournal.skp"
+ OR
+ params.benchName="desk_amazon.skp"
+ ORDER BY
+ key DESC,
+ timestamp DESC;
+ `
+ iter, err := NewRowIter(service, query)
+ if err != nil {
+ return fmt.Errorf("Failed to query data from BigQuery: %s", err)
+ }
+ var trace *Trace
+ currentKey := ""
+ for iter.Next() {
+ m := &Measurement{}
+ if err := iter.Decode(m); err != nil {
+ return fmt.Errorf("Failed to decode Measurement from BigQuery: %s", err)
+ }
+ if m.Key != currentKey {
+ if trace != nil {
+ all.Traces = append(all.Traces, *trace)
+ }
+ currentKey = m.Key
+ trace = NewTrace(numSamples)
+ trace.Params = iter.DecodeParams()
+ trace.Key = m.Key
+ }
+ if index, ok := hashToIndex[m.Hash]; ok {
+ trace.Values[index] = m.Value
+ }
+ }
+ // Append the last trace, guarding against the query returning no rows.
+ if trace != nil {
+ all.Traces = append(all.Traces, *trace)
+ }
+
+ return nil
+}
+
+// Data is the full set of traces for the last N days, all parsed into structs.
+type Data struct {
+ all *AllData
+}
+
+// AsJSON serializes the data as JSON.
+func (d *Data) AsJSON(w io.Writer) error {
+ // TODO(jcgregorio) Keep a cache of the gzipped JSON around and serve that as long as it's fresh.
mtklein 2014/06/15 15:12:11 How slow is the JSON encoding?
jcgregorio 2014/06/16 02:46:24 Not so much the JSON encoding as the gzip. The ungzipped and gzipped payloads are both sizable.
jcgregorio 2014/06/16 12:59:55 Sorry, wrong numbers; that should be 50MB and 5MB respectively.
+ return json.NewEncoder(w).Encode(d.all)
+}
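A sketch of what the TODO could look like: encode once into a gzip-compressed buffer and hand the cached bytes to a handler that serves them with Content-Encoding: gzip. The method name and caching policy are assumptions, and "bytes" and "compress/gzip" would need importing:

    // gzippedJSON is a hypothetical cache fill, not part of this CL.
    func (d *Data) gzippedJSON() ([]byte, error) {
        var buf bytes.Buffer
        zw := gzip.NewWriter(&buf)
        if err := json.NewEncoder(zw).Encode(d.all); err != nil {
            return nil, err
        }
        // Close flushes the gzip trailer; without it the stream is truncated.
        if err := zw.Close(); err != nil {
            return nil, err
        }
        return buf.Bytes(), nil
    }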
+
+// populateParamSet fills in all.ParamSet with the set of all possible values
+// for all the 'params' in AllData.
+func populateParamSet(all *AllData) {
+ // First pull the data out into a map of sets.
+ type ChoiceSet map[string]bool
+ c := make(map[string]ChoiceSet)
+ for _, t := range all.Traces {
+ for k, v := range t.Params {
+ if set, ok := c[k]; !ok {
+ c[k] = make(map[string]bool)
+ c[k][v] = true
+ } else {
+ set[v] = true
+ }
+ }
+ }
+ // Now flatten the sets into []string and populate all.ParamSet with that.
+ for k, v := range c {
+ allOptions := []string{}
+ for option := range v {
+ allOptions = append(allOptions, option)
+ }
+ all.ParamSet[k] = allOptions
+ }
+}
+
+// NewData loads the data the first time and then starts a go routine to
+// periodically refresh the data.
+//
+// TODO(jcgregorio) Actually do the bit where we start a go routine.
mtklein 2014/06/15 15:12:11 So Data is going to keep itself updated asynchronously?
jcgregorio 2014/06/16 02:46:25 Once I add the go routine for updating I will also add locking around the data.
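A sketch of the shape the refresh could take once the go routine exists, per the thread above; the lockedData type, the interval, and the loadAllData helper are all assumptions rather than code from this CL:

    // lockedData is a hypothetical future version of Data with a lock, since
    // the refresh goroutine and the HTTP handlers would share it.
    type lockedData struct {
        mu  sync.RWMutex
        all *AllData
    }

    func (d *lockedData) startRefresh(service *bigquery.Service, every time.Duration) {
        go func() {
            for range time.Tick(every) {
                fresh, err := loadAllData(service) // hypothetical re-load helper
                if err != nil {
                    continue // keep serving the previous copy on failure
                }
                d.mu.Lock()
                d.all = fresh
                d.mu.Unlock()
            }
        }()
    }

Readers such as AsJSON would then take mu.RLock() before encoding d.all.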
+func NewData(doOauth bool, gitRepoDir string) (*Data, error) {
+ var err error
+ var client *http.Client
+ if doOauth {
+ client, err = runFlow(config)
+ if err != nil {
+ return nil, fmt.Errorf("Failed to auth: %s", err)
+ }
+ } else {
+ client = http.DefaultClient
+ }
+ service, err := bigquery.New(client)
+ if err != nil {
+ return nil, fmt.Errorf("Failed to create a new BigQuery service object: %s", err)
+ }
+
+ // First query and get the list of hashes we are interested in and use that
+ // and the git log results to fill in the Commits.
+ allGitHashes, err := readCommitsFromGit(gitRepoDir)
+ if err != nil {
+ return nil, fmt.Errorf("Failed to read hashes from Git log: %s", err)
+ }
+
+ hashesTested, err := gitCommitsWithTestData(service)
+ if err != nil {
+ return nil, fmt.Errorf("Failed to read hashes from BigQuery: %s", err)
+ }
+
+ // Order the git hashes by commit log order.
+ commits := make([]Commit, 0, len(hashesTested))
+ for i := len(allGitHashes) - 1; i >= 0; i-- {
+ h := allGitHashes[i]
+ if _, ok := hashesTested[h.Hash]; ok {
+ commits = append(commits, Commit{Hash: h.Hash, CommitTime: h.TimeStamp})
+ }
+ }
+
+ // The number of samples that appear in each trace.
+ numSamples := len(commits)
+
+ // A mapping of Git hashes to where they appear in the Commits array, also the index
+ // at which a measurement gets stored in the Values array.
+ hashToIndex := make(map[string]int)
+ for i, commit := range commits {
+ hashToIndex[commit.Hash] = i
+ }
+
+ all := &AllData{
+ Traces: make([]Trace, 0),
+ ParamSet: make(map[string]Choices),
+ Commits: commits,
+ }
+
+ if err := populateTraces(service, all, hashToIndex, numSamples); err != nil {
+ panic(err)
mtklein 2014/06/15 15:12:10 Seems like we'd want something a bit more robust?
jcgregorio 2014/06/16 02:46:24 This is only done once, at startup, and I don't want the server to start serving without data.
+ }
+
+ populateParamSet(all)
+
+ return &Data{all: all}, nil
+}