Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(18)

Unified Diff: golden/go/pdfxform/main.go

Issue 1216483002: golden/pdfxform a pdf rasterization server (Closed) Base URL: https://skia.googlesource.com/buildbot@master
Patch Set: 2015-06-26 (Friday) 11:26:48 EDT Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « golden/go/pdfxform/data/45aa8af265d16839402583df5756a7c6.png ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: golden/go/pdfxform/main.go
diff --git a/golden/go/pdfxform/main.go b/golden/go/pdfxform/main.go
new file mode 100644
index 0000000000000000000000000000000000000000..92b93b3e5ce096ee14f9f76d2c17b119b33aa5bd
--- /dev/null
+++ b/golden/go/pdfxform/main.go
@@ -0,0 +1,526 @@
+// pdfxform is a server that rasterizes PDF documents into PNG
+package main
+
+import (
+ "bytes"
+ "crypto/md5"
+ "encoding/hex"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net/http"
+ "os"
+ "os/user"
+ "path"
+ "path/filepath"
+ "runtime"
+ "strings"
+ "time"
+
+ "github.com/skia-dev/glog"
+ "go.skia.org/infra/go/auth"
+ "go.skia.org/infra/go/gs"
+ "go.skia.org/infra/go/pdf"
+ "go.skia.org/infra/go/util"
+ "google.golang.org/api/storage/v1"
+)
+
+////////////////////////////////////////////////////////////////////////////////
+
+const (
+ pngExt = "png" // extension used for rasterized output images and their result entries
+ pdfExt = "pdf" // extension that marks a result as a PDF to be rasterized
+)
+
+////////////////////////////////////////////////////////////////////////////////
+
+type md5digest [md5.Size]byte // binary MD5 checksum; a fixed-size array, so it can be used as a map key
stephana 2015/06/26 17:25:06 I think treating MD5 hashes as byte slices (instea
hal.canary 2015/06/26 18:44:12 The pdfXformer.results map can grow without constr
+
+// String renders the digest as a lowercase hexadecimal string.
+func (digest md5digest) String() string {
+	return fmt.Sprintf("%x", digest[:])
+}
+
+// decodeMd5 turns a hexadecimal string into a fixed-size binary array
+// for more efficient storage.
+//
+// If md5Str is not valid hexadecimal, or does not decode to exactly
+// md5.Size bytes, the problem is logged and the zero-valued digest is
+// returned.
+func decodeMd5(md5Str string) md5digest {
+	var ret md5digest
+	md5ByteSlice, err := hex.DecodeString(md5Str)
+	if err != nil {
+		glog.Errorf("Unable to decode hex-encoded MD5 %q: %s", md5Str, err)
+		return ret
+	}
+	// Report a wrong length separately: err is nil on this path, so it
+	// must not be formatted into the message.
+	if len(md5ByteSlice) != md5.Size {
+		glog.Errorf("Unable to decode hex-encoded MD5 %q: got %d bytes, want %d", md5Str, len(md5ByteSlice), md5.Size)
+		return ret
+	}
+	copy(ret[:], md5ByteSlice)
+	return ret
+}
+
+// md5OfFile streams the file at path through an MD5 hash and returns
+// the resulting checksum. If the file cannot be opened or read, the
+// zero digest and the error are returned.
+func md5OfFile(path string) (sum md5digest, err error) {
+	hasher := md5.New()
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return sum, openErr
+	}
+	defer util.Close(file)
+	if _, copyErr := io.Copy(hasher, file); copyErr != nil {
+		return sum, copyErr
+	}
+	copy(sum[:], hasher.Sum(nil))
+	return sum, nil
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// removeIf deletes the file at path. It is like util.Remove, except
+// that a file which does not exist is silently accepted; any other
+// failure is logged.
+func removeIf(path string) {
+	err := os.Remove(path)
+	if err == nil || os.IsNotExist(err) {
+		return
+	}
+	glog.Errorf("Failed to Remove(%s): %v", path, err)
+}
+
+// isPDF reports whether the file at path begins with the PDF magic
+// bytes "%PDF". A file that cannot be opened, or that is shorter than
+// four bytes, is reported as not being a PDF.
+func isPDF(path string) bool {
+	f, err := os.Open(path)
+	if err != nil {
+		return false
+	}
+	defer util.Close(f)
+	var header [4]byte
+	// io.ReadFull keeps reading until the buffer is full; a single Read
+	// call is allowed to return fewer bytes than were asked for.
+	if _, err := io.ReadFull(f, header[:]); err != nil {
+		return false
+	}
+	return bytes.Equal(header[:], []byte("%PDF"))
+}
+
+// writeTo creates (or truncates) the file at path and copies the
+// contents of *reader into it. The reader is always closed before
+// writeTo returns. The first error encountered while creating,
+// writing or closing the file is returned.
+func writeTo(path string, reader *io.ReadCloser) error {
+	defer util.Close(*reader)
+	file, err := os.Create(path)
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(file, *reader); err != nil {
+		// The copy error is the one worth reporting; still release the
+		// file descriptor.
+		util.Close(file)
+		return err
+	}
+	// Close explicitly (rather than in a defer) so that a failure to
+	// flush the written data reaches the caller.
+	return file.Close()
+}
+
+// assertNil logs the err and exits if it is not nil.
+func assertNil(err error) {
+ if err != nil {
+ errMsg := ""
+ if _, fileName, line, ok := runtime.Caller(1); ok {
+ errMsg = fmt.Sprintf("-called from: %s:%d", fileName, line)
+ }
+ glog.Fatalf("Unexpected error %s: %s", errMsg, err)
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// The dm struct mirrors the structure of the json files that will be
+// parsed and produced.
stephana 2015/06/26 17:25:06 Instead of the defining it's own structures to rec
hal.canary 2015/06/26 18:44:11 Done. Thanks.
+
+type dm struct { // top-level layout of a json file, used both for decoding the input and encoding the output
+ Results []result `json:"results"` // the individual results listed in the file
+ BuildNumber string `json:"build_number"` // copied unchanged from input to output
+ GitHash string `json:"gitHash"` // copied unchanged from input to output
+ Key map[string]string `json:"key"` // file-level key, copied unchanged from input to output
+ MaxRSSMB int `json:"max_rss_MB"` // copied unchanged from input to output
+}
+type keyType struct { // per-result key; every field is dropped from the json when empty
+ Config string `json:"config,omitempty"`
+ Name string `json:"name,omitempty"`
+ Source_options string `json:"source_options,omitempty"`
+ Source_type string `json:"source_type,omitempty"`
+ Rasterizer string `json:"rasterizer,omitempty"` // set to the rasterizer's String() on rasterized results
+}
+type optionsType struct { // per-result options
+ Ext string `json:"ext,omitempty"` // file extension; pdfExt selects a result for rasterizing, pngExt marks rasterized output
+}
+type result struct { // one entry of dm.Results
+ Key keyType `json:"key"`
+ Md5 string `json:"md5"` // hexadecimal MD5; also names the object in the images directory
+ Options optionsType `json:"options"`
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// storageClient pairs an authorized http.Client with the storage.Service built on top of it.
+type storageClient struct {
+ httpClient *http.Client // used by gsFetch to download object data
+ storageService *storage.Service // used for object get/insert/patch/list and ACL calls
+}
+
+// getClient builds an OAuth configuration from cacheFilePath with the
+// full-control scope, runs the auth flow, and wraps the resulting
+// http.Client together with a storage.Service created from it. Any
+// failure along the way is fatal.
+func getClient(cacheFilePath string) storageClient {
+	oauthConfig := auth.OAuthConfig(cacheFilePath, auth.SCOPE_FULL_CONTROL)
+	httpClient, err := auth.RunFlow(oauthConfig)
+	assertNil(err)
+	service, err := storage.New(httpClient)
+	assertNil(err)
+	return storageClient{
+		httpClient:     httpClient,
+		storageService: service,
+	}
+}
+
+// gsFetch downloads the object's data from Google Storage via its
+// media link. On success it returns the response body (which the
+// caller must close) and the reported content length. On any failure,
+// including a non-200 status, it returns a nil body, -1 and an error.
+func gsFetch(object *storage.Object, sc storageClient) (io.ReadCloser, int64, error) {
+	mediaLink := object.MediaLink
+	request, err := gs.RequestForStorageURL(mediaLink)
+	if err != nil {
+		return nil, -1, err
+	}
+	response, err := sc.httpClient.Do(request)
+	if err != nil {
+		return nil, -1, err
+	}
+	if response.StatusCode == http.StatusOK {
+		return response.Body, response.ContentLength, nil
+	}
+	response.Body.Close()
+	return nil, -1, fmt.Errorf("Failed to retrieve: %s %d %s", mediaLink, response.StatusCode, response.Status)
+}
+
+// uploadFile stores the contents of input as storagePath in
+// storageBucket and then grants accessControlEntity the READER role on
+// it. If an object with that name can already be fetched, nothing is
+// written. The boolean result reports whether an upload took place.
+func uploadFile(sc storageClient, input io.Reader, storageBucket, storagePath, accessControlEntity string) (bool, error) {
+	service := sc.storageService
+	// Any error from the lookup is ignored; only a non-nil object
+	// counts as "already present".
+	if existing, _ := service.Objects.Get(storageBucket, storagePath).Do(); existing != nil {
+		return false, nil // noclobber
+	}
+	fullPath := fmt.Sprintf("gs://%s/%s", storageBucket, storagePath)
+	_, err := service.Objects.Insert(storageBucket, &storage.Object{Name: storagePath}).Media(input).Do()
+	if err != nil {
+		return false, fmt.Errorf("Objects.Insert(%s) failed: %s", fullPath, err)
+	}
+	acl := &storage.ObjectAccessControl{
+		Bucket: storageBucket,
+		Entity: accessControlEntity,
+		Object: storagePath,
+		Role:   "READER",
+	}
+	_, err = service.ObjectAccessControls.Insert(storageBucket, storagePath, acl).Do()
+	if err != nil {
+		return false, fmt.Errorf("Could not update ACL of %s: %s", fullPath, err)
+	}
+	return true, nil
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// The pdfXformer struct holds state (results, counter, identifier) and constants (bucket, directories, rasterizers)
+type pdfXformer struct {
+ client storageClient // Google Storage access
+ storageBucket string // bucket that holds both the json files and the images
+ storageJsonDirectory string // prefix under which json files are read and written
+ storageImagesDirectory string // prefix under which PDFs are read and PNGs are written
+ accessControlEntity string // entity granted the READER role on uploaded objects
+ rasterizers []pdf.Rasterizer // rasterizers applied to every PDF, in order
+ results map[md5digest]map[int]md5digest // cache: PDF digest -> rasterizer index -> PNG digest
+ counter int // number of json objects seen so far; only used in log messages
+ identifier string // temp-directory name prefix, built on first use by makeTmpDir
+}
+
+var errorImageMd5 = md5digest{ // digest that rasterizeOnce reports when a rasterizer fails
stephana 2015/06/26 17:25:06 Another argument to use strings instead of md5dige
hal.canary 2015/06/26 18:44:12 Done.
+ 0x45, 0xAA, 0x8A, 0xF2,
+ 0x65, 0xD1, 0x68, 0x39,
+ 0x40, 0x25, 0x83, 0xDF,
+ 0x57, 0x56, 0xA7, 0xC6,
+} // data/45aa8af265d16839402583df5756a7c6.png
+
+// rasterizeOnce applies a single rasterizer to the given pdf file.
+//
+// The PNG is written next to the PDF and removed again before
+// returning. If the rasterizer fails, a warning is logged and the
+// digest of the error image is returned with a nil error. If
+// everything succeeds, the PNG is uploaded to the images directory
+// under its own MD5 and that digest is returned. A failure to hash,
+// open or upload the PNG is returned as an error with a zero digest.
+func (xformer *pdfXformer) rasterizeOnce(pdfPath string, rasterizerIndex int) (md5digest, error) {
+	rasterizer := xformer.rasterizers[rasterizerIndex]
+	tempdir := filepath.Dir(pdfPath)
+	// filepath.Join, not path.Join: this is a path on the local file
+	// system, not a slash-separated storage path.
+	pngPath := filepath.Join(tempdir, fmt.Sprintf("%s.%s", rasterizer.String(), pngExt))
+	defer removeIf(pngPath)
+	glog.Infof("> > > > rasterizing with %s", rasterizer)
+	if err := rasterizer.Rasterize(pdfPath, pngPath); err != nil {
+		glog.Warningf("rasterizing %s with %s failed: %s", filepath.Base(pdfPath), rasterizer.String(), err)
+		return errorImageMd5, nil
+	}
+	digest, err := md5OfFile(pngPath)
+	if err != nil {
+		return md5digest{}, err
+	}
+	f, err := os.Open(pngPath)
+	if err != nil {
+		return md5digest{}, err
+	}
+	defer util.Close(f)
+	// Storage object names always use forward slashes.
+	pngUploadPath := fmt.Sprintf("%s/%s.%s", xformer.storageImagesDirectory, digest.String(), pngExt)
+	didUpload, err := uploadFile(xformer.client, f, xformer.storageBucket, pngUploadPath, xformer.accessControlEntity)
+	if err != nil {
+		return md5digest{}, err
+	}
+	if didUpload {
+		glog.Infof("> > > > uploaded %s", pngUploadPath)
+	}
+	return digest, nil
+}
+
+// makeTmpDir returns a nicely-named directory for temp files in $TMPDIR
+func (xformer *pdfXformer) makeTmpDir() string {
+ if xformer.identifier == "" {
+ var host, userName string
+ if h, err := os.Hostname(); err == nil {
+ host = h
+ if i := strings.Index(host, "."); i >= 0 {
+ host = host[:i]
+ }
+ }
+ if currentUser, err := user.Current(); err == nil {
+ userName = currentUser.Username
+ }
+ userName = strings.Replace(userName, `\`, "_", -1)
+ xformer.identifier = fmt.Sprintf("%s.%s.%s.tmp.%d.", filepath.Base(os.Args[0]), host, userName, os.Getpid())
+ }
+ tempdir, err := ioutil.TempDir("", xformer.identifier)
+ assertNil(err)
+ return tempdir
+}
+
+// processResult rasterizes a single PDF result and returns a set of
+// new results, one per rasterizer that produced a digest.
+//
+// Digests for PDFs that were already rasterized are served from
+// xformer.results. Otherwise the PDF is downloaded into a fresh temp
+// directory, checked for the PDF magic bytes, run through every
+// rasterizer, and the resulting digests are cached. If the PDF cannot
+// be found, downloaded, saved or recognized, the problem is logged and
+// an empty slice is returned.
+func (xformer *pdfXformer) processResult(res result) []result {
+	rasterizedResults := []result{}
+	pdfDigest := decodeMd5(res.Md5)
+	if cached, found := xformer.results[pdfDigest]; found {
+		// Skip rasterization steps: big win.
+		for index, rasterizer := range xformer.rasterizers {
+			pngDigest, ok := cached[index]
+			if !ok {
+				glog.Errorf("missing rasterizer %s on %s", rasterizer.String(), res.Md5)
+				continue
+			}
+			keyCopy := res.Key
+			keyCopy.Rasterizer = rasterizer.String()
+			rasterizedResults = append(rasterizedResults, result{Key: keyCopy, Md5: pngDigest.String(), Options: optionsType{Ext: pngExt}})
+		}
+		return rasterizedResults
+	}
+
+	tempdir := xformer.makeTmpDir()
+	defer util.RemoveAll(tempdir)
+	// Local file path: filepath.Join. Storage object name: plain "/".
+	pdfPath := filepath.Join(tempdir, fmt.Sprintf("%s.pdf", res.Md5))
+	objectName := fmt.Sprintf("%s/%s.pdf", xformer.storageImagesDirectory, res.Md5)
+	storageURL := fmt.Sprintf("gs://%s/%s", xformer.storageBucket, objectName)
+	object, err := xformer.client.storageService.Objects.Get(xformer.storageBucket, objectName).Do()
+	if err != nil {
+		glog.Errorf("unable to find %s: %s", storageURL, err)
+		return []result{}
+	}
+	pdfData, _, err := gsFetch(object, xformer.client)
+	if err != nil {
+		glog.Errorf("unable to retrieve %s: %s", storageURL, err)
+		return []result{}
+	}
+	// A failed or partial download must not be handed to the
+	// rasterizers, so the write error is checked here.
+	if err := writeTo(pdfPath, &pdfData); err != nil {
+		glog.Errorf("unable to save %s to %s: %s", storageURL, pdfPath, err)
+		return []result{}
+	}
+	if !isPDF(pdfPath) {
+		glog.Errorf("%s is not a PDF", objectName)
+		return []result{}
+	}
+	resultMap := map[int]md5digest{}
+	for index, rasterizer := range xformer.rasterizers {
+		pngDigest, err := xformer.rasterizeOnce(pdfPath, index)
+		if err != nil {
+			glog.Errorf("rasterizer %s failed on %s.pdf: %s", rasterizer, res.Md5, err)
+			continue
+		}
+		keyCopy := res.Key
+		keyCopy.Rasterizer = rasterizer.String()
+		rasterizedResults = append(rasterizedResults, result{Key: keyCopy, Md5: pngDigest.String(), Options: optionsType{Ext: pngExt}})
+		resultMap[index] = pngDigest
+	}
+	xformer.results[pdfDigest] = resultMap
+	return rasterizedResults
+}
+
+// processJsonFile reads a json file and produces a new json file
+// with rasterized results.
+func (xformer *pdfXformer) processJsonFile(jsonFileObject *storage.Object) {
+ jsonURL := fmt.Sprintf("gs://%s/%s", xformer.storageBucket, jsonFileObject.Name)
+ // if jsonFileObject.Metadata["rasterized"] == "true" {
+ // glog.Infof("> > skipping %s (already processed) {%d}", jsonURL, xformer.counter)
+ // return
+ // }
+ body, length, err := gsFetch(jsonFileObject, xformer.client)
+ if err != nil {
+ glog.Errorf("Failed to fetch %s", jsonURL)
+ return
+ }
+ if 0 == length {
+ util.Close(body)
+ glog.Infof("> > skipping %s (empty file) {%d}", jsonURL, xformer.counter)
+ return
+ }
+ dmstruct := dm{}
+ err = json.NewDecoder(body).Decode(&dmstruct)
+ util.Close(body)
+ if err != nil {
+ glog.Errorf("Failed to parse %s", jsonURL)
+ return
+ }
+ countPdfResults := 0
+ for _, res := range dmstruct.Results {
+ if res.Options.Ext == pdfExt {
+ countPdfResults++
+ }
+ }
+ if 0 == countPdfResults {
+ glog.Infof("> > 0 PDFs found %s {%d}", jsonURL, xformer.counter)
+ xformer.setRasterized(jsonFileObject)
+ return
+ }
+
+ glog.Infof("> > processing %d pdfs of %d results {%d}", countPdfResults, len(dmstruct.Results), xformer.counter)
+ rasterizedResults := []result{}
+ i := 0
+ for _, res := range dmstruct.Results {
+ if res.Options.Ext == pdfExt {
+ i++
+ glog.Infof("> > > processing %s.pdf [%d/%d] {%d}", res.Md5, i, countPdfResults, xformer.counter)
+ rasterizedResults = append(rasterizedResults, xformer.processResult(res)...)
+ }
+ }
+ newDMStruct := dm{
+ BuildNumber: dmstruct.BuildNumber,
+ GitHash: dmstruct.GitHash,
+ Key: dmstruct.Key,
+ MaxRSSMB: dmstruct.MaxRSSMB,
+ Results: rasterizedResults,
+ }
+ newJson, err := json.Marshal(newDMStruct)
+ assertNil(err)
+
+ now := time.Now()
+ // Change the date; leave most of the rest of the path components.
+ jsonPathComponents := strings.Split(jsonFileObject.Name, "/") // []string
+ if len(jsonPathComponents) < 4 {
+ fmt.Errorf("unexpected number of path components %q", jsonPathComponents)
+ return
+ }
+ jsonPathComponents = jsonPathComponents[len(jsonPathComponents)-4:]
+ jsonPathComponents[1] += "-pdfxformer"
+ jsonUploadPath := fmt.Sprintf("%s/%d/%02d/%02d/%02d/%s",
+ xformer.storageJsonDirectory,
+ now.Year(),
+ int(now.Month()),
+ now.Day(),
+ now.Hour(),
+ strings.Join(jsonPathComponents, "/"))
+
+ _, err = uploadFile(xformer.client, bytes.NewReader(newJson), xformer.storageBucket, jsonUploadPath, xformer.accessControlEntity)
+ glog.Infof("> > wrote gs://%s/%s", xformer.storageBucket, jsonUploadPath)
+ newJsonFileObject, err := xformer.client.storageService.Objects.Get(xformer.storageBucket, jsonUploadPath).Do()
+ if err != nil {
+ glog.Errorf("Failed to find %s: %s", jsonUploadPath, err)
+ } else {
+ xformer.setRasterized(newJsonFileObject)
+ }
+ xformer.setRasterized(jsonFileObject)
+}
+
+// setRasterized marks the given storage.Object as processed by setting
+// its "rasterized" metadata entry to "true" and patching the object in
+// the storage bucket. The outcome is logged either way.
+func (xformer *pdfXformer) setRasterized(jsonFileObject *storage.Object) {
+	if jsonFileObject.Metadata == nil {
+		jsonFileObject.Metadata = make(map[string]string)
+	}
+	jsonFileObject.Metadata["rasterized"] = "true"
+	patch := xformer.client.storageService.Objects.Patch(xformer.storageBucket, jsonFileObject.Name, jsonFileObject)
+	if _, err := patch.Do(); err != nil {
+		glog.Errorf("Failed to update metadata of %s: %s", jsonFileObject.Name, err)
+		return
+	}
+	glog.Infof("> > Updated metadata of %s", jsonFileObject.Name)
+}
+
+// processTimeRange calls gs.GetLatestGSDirs to get the list of json
+// directories for the interval from start to end, lists the objects
+// under each directory page by page, and hands every object to
+// processJsonFile. xformer.counter is incremented once per object.
+//
+// If listing a page fails, the error is logged and the rest of that
+// directory is skipped.
+func (xformer *pdfXformer) processTimeRange(start time.Time, end time.Time) {
+	glog.Infof("Processing time range: (%s, %s)", start.Truncate(time.Second), end.Truncate(time.Second))
+	for _, dir := range gs.GetLatestGSDirs(start.Unix(), end.Unix(), xformer.storageJsonDirectory) {
+		glog.Infof("> Reading gs://%s/%s\n", xformer.storageBucket, dir)
+		requestedObjects := xformer.client.storageService.Objects.List(xformer.storageBucket).Prefix(dir).Fields(
+			"nextPageToken", "items/updated", "items/md5Hash", "items/mediaLink", "items/name", "items/metadata")
+		for requestedObjects != nil {
+			responseObjects, err := requestedObjects.Do()
+			if err != nil {
+				glog.Errorf("request %#v failed: %s", requestedObjects, err)
+				// Give up on this directory. Re-issuing the identical
+				// request immediately would spin forever if the
+				// failure is persistent.
+				break
+			}
+			for _, jsonObject := range responseObjects.Items {
+				xformer.counter++
+				glog.Infof("> > Processing object: gs://%s/%s {%d}", xformer.storageBucket, jsonObject.Name, xformer.counter)
+				xformer.processJsonFile(jsonObject)
+			}
+			if len(responseObjects.NextPageToken) > 0 {
+				requestedObjects.PageToken(responseObjects.NextPageToken)
+			} else {
+				requestedObjects = nil
+			}
+		}
+	}
+	glog.Infof("finished time range.")
+}
+
+// uploadErrorImage uploads the error image to the images directory,
+// unless an object of that name is already there. The image is read
+// from the "data" directory next to this source file, which is located
+// with runtime.Caller. Any failure is fatal.
+// NOTE(review): the original comment ("should be run once to verify
+// that the ...") was cut off; presumably the intent is to guarantee
+// that the image referenced by errorImageMd5 exists in storage.
+func (xformer *pdfXformer) uploadErrorImage() {
+	_, thisFile, _, ok := runtime.Caller(0)
+	if !ok {
+		glog.Fatalf("Could not find data dir: runtime.Caller() failed.")
+	}
+	filename := fmt.Sprintf("%s.png", errorImageMd5.String())
+	// Local file path: use filepath, not the slash-only path package.
+	f, err := os.Open(filepath.Join(filepath.Dir(thisFile), "data", filename))
+	if err != nil {
+		glog.Fatalf("Could not open data/%s: %s", filename, err)
+	}
+	defer util.Close(f)
+	errorImagePath := fmt.Sprintf("%s/%s", xformer.storageImagesDirectory, filename)
+	_, err = uploadFile(xformer.client, f, xformer.storageBucket, errorImagePath, xformer.accessControlEntity)
+	assertNil(err) // If we can't upload this, we can't upload anything.
+}
+
+// Environment variables: we respect $TMPDIR
+// Arguments: glog uses -logtostderr, -log_dir
+//
+// main sets up the storage client and the enabled rasterizers, makes
+// sure the error image is uploaded, processes the last 72 hours, and
+// then processes the newly elapsed interval once per minute, forever.
+func main() {
+	flag.Parse()
+
+	// TODO(halcanary): where should this file exist?
+	configDir := path.Join(os.Getenv("HOME"), ".config")
+	assertNil(os.MkdirAll(configDir, 0700))
+	tokenPath := path.Join(configDir, "google_storage_token.data")
+
+	xformer := pdfXformer{
+		client: getClient(tokenPath),
+		// storageBucket: "chromium-skia-gm",
+		// storageJsonDirectory: "dm-json-v1",
+		// storageImagesDirectory: "dm-images-v1",
+		accessControlEntity:    "domain-google.com",
+		results:                map[md5digest]map[int]md5digest{},
+		storageBucket:          "skia-infra-testdata",
+		storageJsonDirectory:   "pdfxformer-testdata/json",
+		storageImagesDirectory: "pdfxformer-testdata/img",
+	}
+
+	xformer.uploadErrorImage()
+
+	candidates := []pdf.Rasterizer{pdf.Pdfium{}, pdf.Poppler{}}
+	for _, rasterizer := range candidates {
+		if !rasterizer.Enabled() {
+			glog.Infof("rasterizer %s is disabled", rasterizer.String())
+			continue
+		}
+		xformer.rasterizers = append(xformer.rasterizers, rasterizer)
+	}
+	if len(xformer.rasterizers) == 0 {
+		glog.Fatalf("no rasterizers found")
+	}
+
+	end := time.Now()
+	start := end.Add(-72 * time.Hour)
+	timeTicker := time.Tick(time.Minute)
+	for {
+		xformer.processTimeRange(start, end)
+		glog.Flush() // Flush before waiting for next tick; it may be a while.
+		<-timeTicker
+		// The next range begins where the previous one ended.
+		start, end = end, time.Now()
+	}
+}
« no previous file with comments | « golden/go/pdfxform/data/45aa8af265d16839402583df5756a7c6.png ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698