Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(455)

Side by Side Diff: golden/go/pdfxform/main.go

Issue 1216483002: golden/pdfxform a pdf rasterization server (Closed) Base URL: https://skia.googlesource.com/buildbot@master
Patch Set: 2015-06-26 (Friday) 18:37:58 EDT Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « go/pdf/testdata/minimal.pdf ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // pdfxform is a server that rasterizes PDF documents into PNG
2 package main
3
4 import (
5 "bytes"
6 "crypto/md5"
7 "encoding/hex"
8 "encoding/json"
9 "flag"
10 "fmt"
11 "io"
12 "io/ioutil"
13 "net/http"
14 "os"
15 "os/user"
16 "path"
17 "path/filepath"
18 "runtime"
19 "strings"
20 "time"
21
22 "github.com/skia-dev/glog"
23 "go.skia.org/infra/go/auth"
24 "go.skia.org/infra/go/common"
25 "go.skia.org/infra/go/gs"
26 "go.skia.org/infra/go/pdf"
27 "go.skia.org/infra/go/util"
28 "go.skia.org/infra/perf/go/goldingester"
29 "google.golang.org/api/storage/v1"
30 )
31
32 ////////////////////////////////////////////////////////////////////////////////
33
// PNG_EXT is the extension (without a dot) given to rasterized images:
// it names the PNG files and objects and is stored as the "ext" option
// of every result this program produces.
const PNG_EXT = "png"

// PDF_EXT is the "ext" option value that marks an incoming result as a
// PDF document that needs rasterizing.
const PDF_EXT = "pdf"
38
39 ////////////////////////////////////////////////////////////////////////////////
40
41 // md5OfFile calculates the MD5 checksum of a file.
42 func md5OfFile(path string) (string, error) {
43 md5 := md5.New()
44 f, err := os.Open(path)
45 if err != nil {
46 return "", err
47 }
48 defer util.Close(f)
49 if _, err = io.Copy(md5, f); err != nil {
50 return "", err
51 }
52 return hex.EncodeToString(md5.Sum(nil)), nil
53 }
54
55 // removeIfExists is like util.Remove, but logs no error if the file does not ex ist.
56 func removeIfExists(path string) {
57 if err := os.Remove(path); err != nil {
58 if !os.IsNotExist(err) {
59 glog.Errorf("Failed to Remove(%s): %v", path, err)
60 }
61 }
62 }
63
64 // isPDF returns true if the path appears to point to a PDF file.
65 func isPDF(path string) bool {
66 f, err := os.Open(path)
67 if err != nil {
68 return false
69 }
70 defer util.Close(f)
71 var buffer [4]byte
72 if n, err := f.Read(buffer[:]); n != 4 || err != nil {
73 return false
74 }
75 var magic = [4]byte{'%', 'P', 'D', 'F'}
76 return bytes.Equal(magic[:], buffer[:])
77 }
78
79 // writeTo opens a file and dumps the contents of the reader into it.
80 func writeTo(path string, reader *io.ReadCloser) error {
81 defer util.Close(*reader)
82 file, err := os.Create(path)
83 if err == nil {
84 _, err = io.Copy(file, *reader)
85 }
86 return err
87 }
88
89 // assertNil logs the err and exits if it is not nil.
stephana 2015/07/07 17:28:27 This is kind of a go antipattern. See https://gol
hal.canary 2015/07/07 18:59:52 Done.
90 func assertNil(err error) {
91 if err != nil {
92 errMsg := ""
93 if _, fileName, line, ok := runtime.Caller(1); ok {
94 errMsg = fmt.Sprintf("-called from: %s:%d", fileName, li ne)
95 }
96 glog.Fatalf("Unexpected error %s: %s", errMsg, err)
97 }
98 }
99
100 ////////////////////////////////////////////////////////////////////////////////
101
102 // storageClient struct is used for uploading to cloud storage
103 type storageClient struct {
104 httpClient *http.Client
105 storageService *storage.Service
106 }
107
108 // getClient returns an authorized storage.Service and the
109 // corresponding http.Client; if anything goes wrong, it logs a fatal
110 // error.
111 func getClient() storageClient {
112 var client *http.Client
113 var err error
114 if *local {
115 client, err = auth.RunFlow(auth.OAuthConfig(*oauthCacheFile, aut h.SCOPE_FULL_CONTROL))
116 // TODO(stephana): Replace auth.RunFlow with auth.NewClient
117 // client, err = auth.NewClient(true, *oauthCacheFile, auth.SCOP E_FULL_CONTROL, auth.SCOPE_GCE)
118 } else {
119 client = auth.GCEServiceAccountClient(&http.Transport{Dial: util .DialTimeout})
120 }
121 if err != nil {
122 glog.Fatalf("Failed to create authenticated HTTP client: %s", er r)
123 }
124 gsService, err := storage.New(client)
125 assertNil(err)
stephana 2015/07/07 17:28:27 Should return error instead.
hal.canary 2015/07/07 18:59:52 Done.
126 return storageClient{httpClient: client, storageService: gsService}
127 }
128
129 // gsFetch fetch the object's data from google storage
130 func gsFetch(object *storage.Object, sc storageClient) (io.ReadCloser, int64, er ror) {
131 request, err := gs.RequestForStorageURL(object.MediaLink)
132 if err != nil {
133 return nil, -1, err
134 }
135 resp, err := sc.httpClient.Do(request)
136 if err != nil {
137 return nil, -1, err
138 }
139 if resp.StatusCode != 200 {
140 resp.Body.Close()
141 return nil, -1, fmt.Errorf("Failed to retrieve: %s %d %s", objec t.MediaLink, resp.StatusCode, resp.Status)
142 }
143 return resp.Body, resp.ContentLength, nil
144 }
145
146 // uploadFile uploads the specified file to the remote dir in Google
147 // Storage. It also sets the appropriate ACLs on the uploaded file.
148 // If the file already exists on the server, do nothing.
149 func uploadFile(sc storageClient, input io.Reader, storageBucket, storagePath, a ccessControlEntity string) (bool, error) {
150 obj, _ := sc.storageService.Objects.Get(storageBucket, storagePath).Do()
151 if obj != nil {
152 return false, nil // noclobber
153 }
154 fullPath := fmt.Sprintf("gs://%s/%s", storageBucket, storagePath)
155 object := &storage.Object{Name: storagePath}
156 if _, err := sc.storageService.Objects.Insert(storageBucket, object).Med ia(input).Do(); err != nil {
157 return false, fmt.Errorf("Objects.Insert(%s) failed: %s", fullPa th, err)
158 }
159 objectAcl := &storage.ObjectAccessControl{
160 Bucket: storageBucket, Entity: accessControlEntity, Object: stor agePath, Role: "READER",
161 }
162 if _, err := sc.storageService.ObjectAccessControls.Insert(storageBucket , storagePath, objectAcl).Do(); err != nil {
163 return false, fmt.Errorf("Could not update ACL of %s: %s", fullP ath, err)
164 }
165 return true, nil
166 }
167
168 ////////////////////////////////////////////////////////////////////////////////
169
// Command-line flags. Every flag carries a usage string so that the
// program's -help output is self-explanatory.
var (
	local                  = flag.Bool("local", false, "True if not running in prod")
	oauthCacheFile         = flag.String("oauth_cache_file", "oauth_cache.dat", "path to look for and store an OAuth token")
	dataDir                = flag.String("data_dir", "", "Directory to store data in.")
	failureImage           = flag.String("failure_image", "", "Location of a PNG image. Must be set")
	storageBucket          = flag.String("storage_bucket", "chromium-skia-gm", "Google Storage bucket to read from and upload to.")
	storageJsonDirectory   = flag.String("storage_json_directory", "dm-json-v1", "Directory in the bucket that holds the JSON result files.")
	storageImagesDirectory = flag.String("storage_images_directory", "dm-images-v1", "Directory in the bucket that holds the PDFs and receives the rasterized PNGs.")
	accessControlEntity    = flag.String("access_control_entity", "domain-google.com", "Entity that is granted READER access to uploaded objects.")
	graphiteServer         = flag.String("graphite_server", "", "Graphite server handed to the metrics initialization.")
)
181
182 // The pdfXformer struct holds state
183 type pdfXformer struct {
184 client storageClient
185 rasterizers []pdf.Rasterizer
186 results map[string]map[int]string
187 counter int
188 identifier string
189 errorImageMd5 string
190 }
191
192 // rasterizeOnce applies a single rastetizer to the given pdf file.
193 // If the rasterizer fails, use the errorImage. If everything
194 // succeeds, upload the PNG.
195 func (xformer *pdfXformer) rasterizeOnce(pdfPath string, rasterizerIndex int) (s tring, error) {
196 rasterizer := xformer.rasterizers[rasterizerIndex]
197 tempdir := filepath.Dir(pdfPath)
198 pngPath := path.Join(tempdir, fmt.Sprintf("%s.%s", rasterizer.String(), PNG_EXT))
199 defer removeIfExists(pngPath)
200 glog.Infof("> > > > rasterizing with %s", rasterizer)
201 err := rasterizer.Rasterize(pdfPath, pngPath)
202 if err != nil {
203 glog.Warningf("rasterizing %s with %s failed: %s", filepath.Base (pdfPath), rasterizer.String(), err)
204 return xformer.errorImageMd5, nil
205 }
206 md5, err := md5OfFile(pngPath)
207 if err != nil {
208 return "", err
209 }
210 f, err := os.Open(pngPath)
211 if err != nil {
212 return "", err
213 }
214 defer util.Close(f)
215 pngUploadPath := fmt.Sprintf("%s/%s.%s", *storageImagesDirectory, md5, P NG_EXT)
216 didUpload, err := uploadFile(xformer.client, f, *storageBucket, pngUploa dPath, *accessControlEntity)
217 if err != nil {
218 return "", err
219 }
220 if didUpload {
221 glog.Infof("> > > > uploaded %s", pngUploadPath)
222 }
223 return md5, nil
224 }
225
226 // makeTmpDir returns a nicely-named directory for temp files in $TMPDIR
227 func (xformer *pdfXformer) makeTmpDir() string {
228 if xformer.identifier == "" {
229 var host, userName string
230 if h, err := os.Hostname(); err == nil {
231 host = h
232 if i := strings.Index(host, "."); i >= 0 {
233 host = host[:i]
234 }
235 }
236 if currentUser, err := user.Current(); err == nil {
237 userName = currentUser.Username
238 }
239 userName = strings.Replace(userName, `\`, "_", -1)
240 xformer.identifier = fmt.Sprintf("%s.%s.%s.tmp.%d.", filepath.Ba se(os.Args[0]), host, userName, os.Getpid())
241 }
242 tempdir, err := ioutil.TempDir(*dataDir, xformer.identifier)
243 assertNil(err)
stephana 2015/07/07 17:28:27 Should return error instead.
hal.canary 2015/07/07 18:59:52 Done.
244 return tempdir
245 }
246
247 func newResult(key map[string]string, rasterizerName, digest string) goldingeste r.Result {
248 keyCopy := map[string]string{}
249 for k, v := range key {
250 keyCopy[k] = v
251 }
252 keyCopy["rasterizer"] = rasterizerName
253 options := map[string]string{"ext": PNG_EXT}
254 return goldingester.Result{Key: keyCopy, Digest: digest, Options: option s}
255 }
256
257 // processResult rasterizes a single PDF result and returns a set of new results .
258 func (xformer *pdfXformer) processResult(res goldingester.Result) []goldingester .Result {
259 rasterizedResults := []goldingester.Result{}
260 resultMap, found := xformer.results[res.Digest]
261 if found {
262 // Skip rasterizion steps: big win.
263 for index, rasterizer := range xformer.rasterizers {
264 digest, ok := resultMap[index]
265 if ok {
266 rasterizedResults = append(rasterizedResults,
267 newResult(res.Key, rasterizer.String(), digest))
268 } else {
269 glog.Errorf("missing rasterizer %s on %s", raste rizer.String(), res.Digest)
270 }
271 }
272 return rasterizedResults
273 }
274
275 tempdir := xformer.makeTmpDir()
276 defer util.RemoveAll(tempdir)
277 pdfPath := path.Join(tempdir, fmt.Sprintf("%s.pdf", res.Digest))
278 objectName := fmt.Sprintf("%s/%s.pdf", *storageImagesDirectory, res.Dige st)
279 storageURL := fmt.Sprintf("gs://%s/%s", *storageBucket, objectName)
280 object, err := xformer.client.storageService.Objects.Get(*storageBucket, objectName).Do()
281 if err != nil {
282 glog.Errorf("unable to find %s: %s", storageURL, err)
283 return []goldingester.Result{}
284 }
285 pdfData, _, err := gsFetch(object, xformer.client)
286 if err != nil {
287 glog.Errorf("unable to retrieve %s: %s", storageURL, err)
288 return []goldingester.Result{}
289 }
290 writeTo(pdfPath, &pdfData)
291 if !isPDF(pdfPath) {
292 glog.Errorf("%s is not a PDF", objectName)
293 return []goldingester.Result{}
294 }
295 resultMap = map[int]string{}
296 for index, rasterizer := range xformer.rasterizers {
297 digest, err := xformer.rasterizeOnce(pdfPath, index)
298 if err != nil {
299 glog.Errorf("rasterizer %s failed on %s.pdf: %s", raster izer, res.Digest, err)
300 continue
301 }
302 rasterizedResults = append(rasterizedResults,
303 newResult(res.Key, rasterizer.String(), digest))
304 resultMap[index] = digest
305 }
306 xformer.results[res.Digest] = resultMap
307 return rasterizedResults
308 }
309
310 // processJsonFile reads a json file and produces a new json file
311 // with rasterized results.
312 func (xformer *pdfXformer) processJsonFile(jsonFileObject *storage.Object) {
313 jsonURL := fmt.Sprintf("gs://%s/%s", *storageBucket, jsonFileObject.Name )
314 if jsonFileObject.Metadata["rasterized"] == "true" {
315 glog.Infof("> > skipping %s (already processed) {%d}", jsonURL, xformer.counter)
316 return
317 }
318 body, length, err := gsFetch(jsonFileObject, xformer.client)
319 if err != nil {
320 glog.Errorf("Failed to fetch %s", jsonURL)
321 return
322 }
323 if 0 == length {
324 util.Close(body)
325 glog.Infof("> > skipping %s (empty file) {%d}", jsonURL, xformer .counter)
326 return
327 }
328 dmstruct := goldingester.DMResults{}
329 err = json.NewDecoder(body).Decode(&dmstruct)
330 util.Close(body)
331 if err != nil {
332 glog.Errorf("Failed to parse %s", jsonURL)
333 return
334 }
335 countPdfResults := 0
336 for _, res := range dmstruct.Results {
337 if res.Options["ext"] == PDF_EXT {
338 countPdfResults++
339 }
340 }
341 if 0 == countPdfResults {
342 glog.Infof("> > 0 PDFs found %s {%d}", jsonURL, xformer.counter)
343 xformer.setRasterized(jsonFileObject)
344 return
345 }
346
347 glog.Infof("> > processing %d pdfs of %d results {%d}", countPdfResults, len(dmstruct.Results), xformer.counter)
348 rasterizedResults := []*goldingester.Result{}
349 i := 0
350 for _, res := range dmstruct.Results {
351 if res.Options["ext"] == PDF_EXT {
352 i++
353 glog.Infof("> > > processing %s.pdf [%d/%d] {%d}", res.D igest, i, countPdfResults, xformer.counter)
354 for _, rasterizedResult := range xformer.processResult(* res) {
355 rasterizedResults = append(rasterizedResults, &r asterizedResult)
356 }
357 }
358 }
359 newDMStruct := goldingester.DMResults{
360 BuildNumber: dmstruct.BuildNumber,
361 GitHash: dmstruct.GitHash,
362 Key: dmstruct.Key,
363 Results: rasterizedResults,
364 }
365 newJson, err := json.Marshal(newDMStruct)
366 assertNil(err)
367
368 now := time.Now()
369 // Change the date; leave most of the rest of the path components.
370 jsonPathComponents := strings.Split(jsonFileObject.Name, "/") // []strin g
371 if len(jsonPathComponents) < 4 {
372 fmt.Errorf("unexpected number of path components %q", jsonPathCo mponents)
373 return
374 }
375 jsonPathComponents = jsonPathComponents[len(jsonPathComponents)-4:]
376 jsonPathComponents[1] += "-pdfxformer"
377 jsonUploadPath := fmt.Sprintf("%s/%d/%02d/%02d/%02d/%s",
378 *storageJsonDirectory,
379 now.Year(),
380 int(now.Month()),
381 now.Day(),
382 now.Hour(),
383 strings.Join(jsonPathComponents, "/"))
384
385 _, err = uploadFile(xformer.client, bytes.NewReader(newJson), *storageBu cket, jsonUploadPath, *accessControlEntity)
386 glog.Infof("> > wrote gs://%s/%s", *storageBucket, jsonUploadPath)
387 newJsonFileObject, err := xformer.client.storageService.Objects.Get(*sto rageBucket, jsonUploadPath).Do()
388 if err != nil {
389 glog.Errorf("Failed to find %s: %s", jsonUploadPath, err)
390 } else {
391 xformer.setRasterized(newJsonFileObject)
392 }
393 xformer.setRasterized(jsonFileObject)
394 }
395
396 // setRasterized sets the rasterized metadata flag of the given storage.Object
397 func (xformer *pdfXformer) setRasterized(jsonFileObject *storage.Object) {
398 if nil == jsonFileObject.Metadata {
399 jsonFileObject.Metadata = map[string]string{}
400 }
401 jsonFileObject.Metadata["rasterized"] = "true"
402 _, err := xformer.client.storageService.Objects.Patch(*storageBucket, js onFileObject.Name, jsonFileObject).Do()
403 if err != nil {
404 glog.Errorf("Failed to update metadata of %s: %s", jsonFileObjec t.Name, err)
405 } else {
406 glog.Infof("> > Updated metadata of %s", jsonFileObject.Name)
407 }
408 }
409
410 // processTimeRange calls gs.GetLatestGSDirs to get a list of
411 func (xformer *pdfXformer) processTimeRange(start time.Time, end time.Time) {
412 glog.Infof("Processing time range: (%s, %s)", start.Truncate(time.Second ), end.Truncate(time.Second))
413 for _, dir := range gs.GetLatestGSDirs(start.Unix(), end.Unix(), *storag eJsonDirectory) {
414 glog.Infof("> Reading gs://%s/%s\n", *storageBucket, dir)
415 requestedObjects := xformer.client.storageService.Objects.List(* storageBucket).Prefix(dir).Fields(
416 "nextPageToken", "items/updated", "items/md5Hash", "item s/mediaLink", "items/name", "items/metadata")
417 for requestedObjects != nil {
418 responseObjects, err := requestedObjects.Do()
419 if err != nil {
420 glog.Errorf("request %#v failed: %s", requestedO bjects, err)
421 } else {
422 for _, jsonObject := range responseObjects.Items {
423 xformer.counter++
424 glog.Infof("> > Processing object: gs:/ /%s/%s {%d}", *storageBucket, jsonObject.Name, xformer.counter)
425 xformer.processJsonFile(jsonObject)
426 }
427 }
428 if len(responseObjects.NextPageToken) > 0 {
429 requestedObjects.PageToken(responseObjects.NextP ageToken)
430 } else {
431 requestedObjects = nil
432 }
433 }
434 }
435 glog.Infof("finished time range.")
436 }
437
438 // uploadErrorImage should be run once to verify that the image is there
439 func (xformer *pdfXformer) uploadErrorImage(path string) {
440 if "" == path {
441 glog.Fatalf("Missing --path argument")
442 }
443 errorImageMd5, err := md5OfFile(path)
444 if err != nil {
445 glog.Fatalf("Bad --path argument")
446 }
447 errorImageFileReader, err := os.Open(path)
448 assertNil(err)
449 defer util.Close(errorImageFileReader)
450 errorImagePath := fmt.Sprintf("%s/%s.png", *storageImagesDirectory, erro rImageMd5)
451 _, err = uploadFile(xformer.client, errorImageFileReader, *storageBucket , errorImagePath, *accessControlEntity)
452 assertNil(err) // If we can't upload this, we can't upload anything.
453 xformer.errorImageMd5 = errorImageMd5
454 }
455
456 func main() {
457 flag.Parse()
458 common.InitWithMetrics("pdfxform", graphiteServer)
459
460 // *storageBucket = "skia-infra-testdata"
stephana 2015/07/07 17:28:27 Should this be removed ?
hal.canary 2015/07/07 18:59:52 Done.
461 // *storageJsonDirectory = "pdfxformer-testdata/json"
462 // *storageImagesDirectory = "pdfxformer-testdata/img"
463 // *failureImage = "golden/go/pdfxform/45aa8af265d16839402583df5756a7c6. png"
464
465 xformer := pdfXformer{
466 client: getClient(),
467 results: map[string]map[int]string{},
468 }
469
470 xformer.uploadErrorImage(*failureImage)
471
472 for _, rasterizer := range []pdf.Rasterizer{pdf.Pdfium{}, pdf.Poppler{}} {
473 if rasterizer.Enabled() {
474 xformer.rasterizers = append(xformer.rasterizers, raster izer)
475 } else {
476 glog.Infof("rasterizer %s is disabled", rasterizer.Strin g())
477 }
478 }
479 if len(xformer.rasterizers) == 0 {
480 glog.Fatalf("no rasterizers found")
481 }
482
483 end := time.Now()
484 start := end.Add(-172 * time.Hour)
485 xformer.processTimeRange(start, end)
486 glog.Flush() // Flush before waiting for next tick; it may be a while.
487 for _ = range time.Tick(time.Minute) {
488 start, end = end, time.Now()
489 xformer.processTimeRange(start, end)
490 glog.Flush()
491 }
492 }
OLDNEW
« no previous file with comments | « go/pdf/testdata/minimal.pdf ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698