Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(165)

Side by Side Diff: golden/go/pdfxform/pdfxform.go

Issue 1216483002: golden/pdfxform a pdf rasterization server (Closed) Base URL: https://skia.googlesource.com/buildbot@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « golden/go/pdfxform/data/45aa8af265d16839402583df5756a7c6.png ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // pdfxform is a server that rasterizes PDF documents into PNG
2 package main
3
4 import (
5 "bytes"
6 "crypto/md5"
7 "encoding/hex"
8 "encoding/json"
9 "flag"
10 "fmt"
11 "io"
12 "io/ioutil"
13 "net/http"
14 "os"
15 "os/user"
16 "path"
17 "path/filepath"
18 "runtime"
19 "strings"
20 "time"
21
22 "github.com/skia-dev/glog"
23 "go.skia.org/infra/go/auth"
24 "go.skia.org/infra/go/gs"
25 "go.skia.org/infra/go/pdf"
26 "go.skia.org/infra/go/util"
27 "google.golang.org/api/storage/v1"
28 )
29
30 ////////////////////////////////////////////////////////////////////////////////
31
// File extensions, without the leading dot, of the artifacts handled here.
const (
	// pngExt is the extension given to rasterized output images.
	pngExt = "png"
	// pdfExt is the extension that marks a result as a PDF to rasterize.
	pdfExt = "pdf"
)
36
37 ////////////////////////////////////////////////////////////////////////////////
38
// md5digest is a raw MD5 checksum held in a fixed-size array, which keeps it
// comparable and therefore usable as a map key.
type md5digest [md5.Size]byte

// String renders the digest as lowercase hexadecimal text.
func (d md5digest) String() string {
	return fmt.Sprintf("%x", d[:])
}
44
45 // decodeMd5 turns a hexadecimal string into a fixed-size binary array for more efficient storage.
46 func decodeMd5(md5Str string) md5digest {
47 md5ByteSlice, err := hex.DecodeString(md5Str)
48 if err != nil || len(md5ByteSlice) != md5.Size {
49 glog.Errorf("Unable to decode base64-encoded MD5: %v: %s", md5St r, err)
50 return md5digest{}
51 }
52 var ret md5digest
53 copy(ret[:], md5ByteSlice)
54 return ret
55 }
56
57 // md5OfFile calculates the MD5 checksum of a file.
58 func md5OfFile(path string) (sum md5digest, err error) {
59 md5 := md5.New()
60 f, err := os.Open(path)
61 if err != nil {
62 return
63 }
64 defer util.Close(f)
65 if _, err = io.Copy(md5, f); err != nil {
66 return
67 }
68 copy(sum[:], md5.Sum(nil))
69 return
70 }
71
72 ////////////////////////////////////////////////////////////////////////////////
73
74 // removeIf is like util.Remove, but logs no error if the file does not exist.
75 func removeIf(path string) {
76 if err := os.Remove(path); err != nil {
77 if !os.IsNotExist(err) {
78 glog.Errorf("Failed to Remove(%s): %v", path, err)
79 }
80 }
81 }
82
83 // isPDF returns true if the path appears to point to a PDF file.
84 func isPDF(path string) bool {
85 f, err := os.Open(path)
86 if err != nil {
87 return false
88 }
89 defer util.Close(f)
90 var buffer [4]byte
91 if n, err := f.Read(buffer[:]); n != 4 || err != nil {
92 return false
93 }
94 var magic = [4]byte{'%', 'P', 'D', 'F'}
95 return bytes.Equal(magic[:], buffer[:])
96 }
97
98 // writeTo opens a file and dumps the contents of the reader into it.
99 func writeTo(path string, reader *io.ReadCloser) error {
100 defer util.Close(*reader)
101 file, err := os.Create(path)
102 if err == nil {
103 _, err = io.Copy(file, *reader)
104 }
105 return err
106 }
107
108 // assertNil logs the err and exits if it is not nil.
109 func assertNil(err error) {
110 if err != nil {
111 errMsg := ""
112 if _, fileName, line, ok := runtime.Caller(1); ok {
113 errMsg = fmt.Sprintf("-called from: %s:%d", fileName, li ne)
114 }
115 glog.Fatalf("Unexpected error %s: %s", errMsg, err)
116 }
117 }
118
119 ////////////////////////////////////////////////////////////////////////////////
120
// The types below mirror the structure of the JSON files that are parsed and
// produced.
type (
	// dm is the top-level JSON document: a list of results plus build
	// metadata.
	dm struct {
		Results     []result          `json:"results"`
		BuildNumber string            `json:"build_number"`
		GitHash     string            `json:"gitHash"`
		Key         map[string]string `json:"key"`
		MaxRSSMB    int               `json:"max_rss_MB"`
	}

	// keyType is the per-result key. Empty fields are omitted when encoding.
	keyType struct {
		Config         string `json:"config,omitempty"`
		Name           string `json:"name,omitempty"`
		Source_options string `json:"source_options,omitempty"`
		Source_type    string `json:"source_type,omitempty"`
		Rasterizer     string `json:"rasterizer,omitempty"`
	}

	// optionsType holds per-result options; Ext is the file extension of the
	// result (compared against pdfExt / set to pngExt elsewhere in this file).
	optionsType struct {
		Ext string `json:"ext,omitempty"`
	}

	// result is a single entry of dm.Results: a key, the MD5 of the produced
	// file as a hex string, and its options.
	result struct {
		Key     keyType     `json:"key"`
		Md5     string      `json:"md5"`
		Options optionsType `json:"options"`
	}
)
146
147 ////////////////////////////////////////////////////////////////////////////////
148
149 // storageClient struct is used for uploading to cloud storage
150 type storageClient struct {
151 httpClient *http.Client
152 storageService *storage.Service
153 }
154
155 // getClient returns an authorized storage.Service and the
156 // corresponding http.Client; if anything goes wrong, it logs a fatal
157 // error.
158 func getClient(cacheFilePath string) storageClient {
159 config := auth.OAuthConfig(cacheFilePath, auth.SCOPE_FULL_CONTROL)
160 client, err := auth.RunFlow(config)
161 assertNil(err)
162 gsService, err := storage.New(client)
163 assertNil(err)
164 return storageClient{httpClient: client, storageService: gsService}
165 }
166
167 // gsFetch fetch the object's data from google storage
168 func gsFetch(object *storage.Object, sc storageClient) (io.ReadCloser, int64, er ror) {
169 request, err := gs.RequestForStorageURL(object.MediaLink)
170 if err != nil {
171 return nil, -1, err
172 }
173 resp, err := sc.httpClient.Do(request)
174 if err != nil {
175 return nil, -1, err
176 }
177 if resp.StatusCode != 200 {
178 resp.Body.Close()
179 return nil, -1, fmt.Errorf("Failed to retrieve: %s %d %s", objec t.MediaLink, resp.StatusCode, resp.Status)
180 }
181 return resp.Body, resp.ContentLength, nil
182 }
183
184 // uploadFile uploads the specified file to the remote dir in Google
185 // Storage. It also sets the appropriate ACLs on the uploaded file.
186 // If the file already exists on the server, do nothing.
187 func uploadFile(sc storageClient, input io.Reader, storageBucket, storagePath, a ccessControlEntity string) (bool, error) {
188 obj, _ := sc.storageService.Objects.Get(storageBucket, storagePath).Do()
189 if obj != nil {
190 return false, nil // noclobber
191 }
192 fullPath := fmt.Sprintf("gs://%s/%s", storageBucket, storagePath)
193 object := &storage.Object{Name: storagePath}
194 if _, err := sc.storageService.Objects.Insert(storageBucket, object).Med ia(input).Do(); err != nil {
195 return false, fmt.Errorf("Objects.Insert(%s) failed: %s", fullPa th, err)
196 }
197 objectAcl := &storage.ObjectAccessControl{
198 Bucket: storageBucket, Entity: accessControlEntity, Object: stor agePath, Role: "READER",
199 }
200 if _, err := sc.storageService.ObjectAccessControls.Insert(storageBucket , storagePath, objectAcl).Do(); err != nil {
201 return false, fmt.Errorf("Could not update ACL of %s: %s", fullP ath, err)
202 }
203 return true, nil
204 }
205
206 ////////////////////////////////////////////////////////////////////////////////
207
208 // The pdfXformer struct holds state (results, counter) and constants (bucket, d irectories)
209 type pdfXformer struct {
210 client storageClient
211 storageBucket string
212 storageJsonDirectory string
213 storageImagesDirectory string
214 accessControlEntity string
215 rasterizers []pdf.Rasterizer
216 results map[md5digest]map[int]md5digest
217 counter int
218 identifier string
219 }
220
221 var errorImageMd5 = md5digest{
222 0x45, 0xAA, 0x8A, 0xF2,
223 0x65, 0xD1, 0x68, 0x39,
224 0x40, 0x25, 0x83, 0xDF,
225 0x57, 0x56, 0xA7, 0xC6,
226 } // data/45aa8af265d16839402583df5756a7c6.png
jcgregorio 2015/06/26 15:09:23 How does this image get deployed to the server, an
hal.canary 2015/06/26 17:27:09 That would make sense if I had a lot of binary res
jcgregorio 2015/06/26 17:35:11 If this is the only external resource pdfxform nee
227
228 // rasterizeOnce applies a single rastetizer to the given pdf file.
229 // If the rasterizer fails, use the errorImage. If everything
230 // succeeds, upload the PNG.
231 func (xformer *pdfXformer) rasterizeOnce(pdfPath string, rasterizerIndex int) (m d5digest, error) {
232 rasterizer := xformer.rasterizers[rasterizerIndex]
233 tempdir := filepath.Dir(pdfPath)
234 pngPath := path.Join(tempdir, fmt.Sprintf("%s.%s", rasterizer.String(), pngExt))
235 defer removeIf(pngPath)
236 glog.Infof("> > > > rasterizing with %s", rasterizer)
237 err := rasterizer.Rasterize(pdfPath, pngPath)
238 if err != nil {
239 glog.Warningf("rasterizing %s with %s failed: %s", filepath.Base (pdfPath), rasterizer.String(), err)
240 return errorImageMd5, nil
241 }
242 md5, err := md5OfFile(pngPath)
243 if err != nil {
244 return md5digest{}, err
245 }
246 f, err := os.Open(pngPath)
247 if err != nil {
248 return md5digest{}, err
249 }
250 defer util.Close(f)
251 pngUploadPath := fmt.Sprintf("%s/%s.%s", xformer.storageImagesDirectory, md5.String(), pngExt)
252 didUpload, err := uploadFile(xformer.client, f, xformer.storageBucket, p ngUploadPath, xformer.accessControlEntity)
253 if err != nil {
254 return md5digest{}, err
255 }
256 if didUpload {
257 glog.Infof("> > > > uploaded %s", pngUploadPath)
258 }
259 return md5, nil
260 }
261
262 // makeTmpDir returns a nicely-named directory for temp files in $TMPDIR
263 func (xformer *pdfXformer) makeTmpDir() string {
264 if xformer.identifier == "" {
265 var host, userName string
266 if h, err := os.Hostname(); err == nil {
267 host = h
268 if i := strings.Index(host, "."); i >= 0 {
269 host = host[:i]
270 }
271 }
272 if currentUser, err := user.Current(); err == nil {
273 userName = currentUser.Username
274 }
275 userName = strings.Replace(userName, `\`, "_", -1)
276 xformer.identifier = fmt.Sprintf("%s.%s.%s.tmp.%d.", filepath.Ba se(os.Args[0]), host, userName, os.Getpid())
277 }
278 tempdir, err := ioutil.TempDir("", xformer.identifier)
279 assertNil(err)
280 return tempdir
281 }
282
283 // processResult rasterizes a single PDF result and returns a set of new results .
284 func (xformer *pdfXformer) processResult(res result) []result {
285 rasterizedResults := []result{}
286 resultMap, found := xformer.results[decodeMd5(res.Md5)]
287 if found {
288 // Skip rasterizion steps: big win.
289 for index, rasterizer := range xformer.rasterizers {
290 keyCopy := res.Key
291 keyCopy.Rasterizer = rasterizer.String()
292 md5, ok := resultMap[index]
293 if ok {
294 rasterizedResults = append(rasterizedResults, re sult{Key: keyCopy, Md5: md5.String(), Options: optionsType{pngExt}})
295 } else {
296 glog.Errorf("missing rasterizer %s on %s", raste rizer.String(), res.Md5)
297 }
298 }
299 return rasterizedResults
300 }
301
302 tempdir := xformer.makeTmpDir()
303 defer util.RemoveAll(tempdir)
304 pdfPath := path.Join(tempdir, fmt.Sprintf("%s.pdf", res.Md5))
305 objectName := fmt.Sprintf("%s/%s.pdf", xformer.storageImagesDirectory, r es.Md5)
306 storageURL := fmt.Sprintf("gs://%s/%s", xformer.storageBucket, objectNam e)
307 object, err := xformer.client.storageService.Objects.Get(xformer.storage Bucket, objectName).Do()
308 if err != nil {
309 glog.Errorf("unable to find %s: %s", storageURL, err)
310 return []result{}
311 }
312 pdfData, _, err := gsFetch(object, xformer.client)
313 if err != nil {
314 glog.Errorf("unable to retrieve %s: %s", storageURL, err)
315 return []result{}
316 }
317 writeTo(pdfPath, &pdfData)
318 if !isPDF(pdfPath) {
319 glog.Errorf("%s is not a PDF", objectName)
320 return []result{}
321 }
322 resultMap = map[int]md5digest{}
323 for index, rasterizer := range xformer.rasterizers {
324 md5, err := xformer.rasterizeOnce(pdfPath, index)
325 if err != nil {
326 glog.Errorf("rasterizer %s failed on %s.pdf: %s", raster izer, res.Md5, err)
327 continue
328 }
329 keyCopy := res.Key
330 keyCopy.Rasterizer = rasterizer.String()
331 rasterizedResults = append(rasterizedResults, result{Key: keyCop y, Md5: md5.String(), Options: optionsType{pngExt}})
332 resultMap[index] = md5
333 }
334 xformer.results[decodeMd5(res.Md5)] = resultMap
335 return rasterizedResults
336 }
337
338 // processJsonFile reads a json file and produces a new json file
339 // with rasterized results.
340 func (xformer *pdfXformer) processJsonFile(jsonFileObject *storage.Object) {
341 jsonURL := fmt.Sprintf("gs://%s/%s", xformer.storageBucket, jsonFileObje ct.Name)
342 // if jsonFileObject.Metadata["rasterized"] == "true" {
343 // glog.Infof("> > skipping %s (already processed) {%d}", jsonURL, xformer.counter)
344 // return
345 // }
346 body, length, err := gsFetch(jsonFileObject, xformer.client)
347 if err != nil {
348 glog.Errorf("Failed to fetch %s", jsonURL)
349 return
350 }
351 if 0 == length {
352 util.Close(body)
353 glog.Infof("> > skipping %s (empty file) {%d}", jsonURL, xformer .counter)
354 return
355 }
356 dmstruct := dm{}
357 err = json.NewDecoder(body).Decode(&dmstruct)
358 util.Close(body)
359 if err != nil {
360 glog.Errorf("Failed to parse %s", jsonURL)
361 return
362 }
363 countPdfResults := 0
364 for _, res := range dmstruct.Results {
365 if res.Options.Ext == pdfExt {
366 countPdfResults++
367 }
368 }
369 if 0 == countPdfResults {
370 glog.Infof("> > 0 PDFs found %s {%d}", jsonURL, xformer.counter)
371 xformer.setRasterized(jsonFileObject)
372 return
373 }
374
375 glog.Infof("> > processing %d pdfs of %d results {%d}", countPdfResults, len(dmstruct.Results), xformer.counter)
376 rasterizedResults := []result{}
377 i := 0
378 for _, res := range dmstruct.Results {
379 if res.Options.Ext == pdfExt {
380 i++
381 glog.Infof("> > > processing %s.pdf [%d/%d] {%d}", res.M d5, i, countPdfResults, xformer.counter)
382 rasterizedResults = append(rasterizedResults, xformer.pr ocessResult(res)...)
383 }
384 }
385 newDMStruct := dm{
386 BuildNumber: dmstruct.BuildNumber,
387 GitHash: dmstruct.GitHash,
388 Key: dmstruct.Key,
389 MaxRSSMB: dmstruct.MaxRSSMB,
390 Results: rasterizedResults,
391 }
392 newJson, err := json.Marshal(newDMStruct)
393 assertNil(err)
394
395 now := time.Now()
396 // Change the date; leave most of the rest of the path components.
397 jsonPathComponents := strings.Split(jsonFileObject.Name, "/") // []strin g
398 if len(jsonPathComponents) < 4 {
399 fmt.Errorf("unexpected number of path components %q", jsonPathCo mponents)
400 return
401 }
402 jsonPathComponents = jsonPathComponents[len(jsonPathComponents)-4:]
403 jsonPathComponents[1] += "-pdfxformer"
404 jsonUploadPath := fmt.Sprintf("%s/%d/%02d/%02d/%02d/%s",
405 xformer.storageJsonDirectory,
406 now.Year(),
407 int(now.Month()),
408 now.Day(),
409 now.Hour(),
410 strings.Join(jsonPathComponents, "/"))
411
412 _, err = uploadFile(xformer.client, bytes.NewReader(newJson), xformer.st orageBucket, jsonUploadPath, xformer.accessControlEntity)
413 glog.Infof("> > wrote gs://%s/%s", xformer.storageBucket, jsonUploadPath )
414 newJsonFileObject, err := xformer.client.storageService.Objects.Get(xfor mer.storageBucket, jsonUploadPath).Do()
415 if err != nil {
416 glog.Errorf("Failed to find %s: %s", jsonUploadPath, err)
417 } else {
418 xformer.setRasterized(newJsonFileObject)
419 }
420 xformer.setRasterized(jsonFileObject)
421 }
422
423 // setRasterized sets the rasterized metadata flag of the given storage.Object
424 func (xformer *pdfXformer) setRasterized(jsonFileObject *storage.Object) {
425 if nil == jsonFileObject.Metadata {
426 jsonFileObject.Metadata = map[string]string{}
427 }
428 jsonFileObject.Metadata["rasterized"] = "true"
429 _, err := xformer.client.storageService.Objects.Patch(xformer.storageBuc ket, jsonFileObject.Name, jsonFileObject).Do()
430 if err != nil {
431 glog.Errorf("Failed to update metadata of %s: %s", jsonFileObjec t.Name, err)
432 } else {
433 glog.Infof("> > Updated metadata of %s", jsonFileObject.Name)
434 }
435 }
436
437 // processTimeRange calls gs.GetLatestGSDirs to get a list of
438 func (xformer *pdfXformer) processTimeRange(start time.Time, end time.Time) {
439 glog.Infof("Processing time range: (%s, %s)", start.Truncate(time.Second ), end.Truncate(time.Second))
440 for _, dir := range gs.GetLatestGSDirs(start.Unix(), end.Unix(), xformer .storageJsonDirectory) {
441 glog.Infof("> Reading gs://%s/%s\n", xformer.storageBucket, dir)
442 requestedObjects := xformer.client.storageService.Objects.List(x former.storageBucket).Prefix(dir).Fields(
443 "nextPageToken", "items/updated", "items/md5Hash", "item s/mediaLink", "items/name", "items/metadata")
444 for requestedObjects != nil {
445 responseObjects, err := requestedObjects.Do()
446 if err != nil {
447 glog.Errorf("request %#v failed: %s", requestedO bjects, err)
448 continue
449 }
450 for _, jsonObject := range responseObjects.Items {
451 xformer.counter++
452 glog.Infof("> > Processing object: gs://%s/%s { %d}", xformer.storageBucket, jsonObject.Name, xformer.counter)
453 xformer.processJsonFile(jsonObject)
454 }
455 if len(responseObjects.NextPageToken) > 0 {
456 requestedObjects.PageToken(responseObjects.NextP ageToken)
457 } else {
458 requestedObjects = nil
459 }
460 }
461 }
462 glog.Infof("finished time range.")
463 }
464
465 // uploadErrorImage should be run once to verify that the
466 func (xformer *pdfXformer) uploadErrorImage() {
467 _, thisFile, _, ok := runtime.Caller(0)
468 if !ok {
469 glog.Fatalf("Could not find data dir: runtime.Caller() failed.")
470 }
471 filename := fmt.Sprintf("%s.png", errorImageMd5.String())
472 f, err := os.Open(path.Join(path.Dir(thisFile), "data", filename))
473 if err != nil {
474 glog.Fatalf("Could not open data/%s", filename)
475 }
476 defer util.Close(f)
477 errorImagePath := fmt.Sprintf("%s/%s", xformer.storageImagesDirectory, f ilename)
478 _, err = uploadFile(xformer.client, f, xformer.storageBucket, errorImage Path, xformer.accessControlEntity)
479 assertNil(err) // If we can't upload this, we can't upload anything.
480 }
481
482 // Environment variables: we respect $TMPDIR
483 // Arguments: glog uses -logtostderr, -log_dir
484 func main() {
485 flag.Parse()
486
487 // TODO(halcanary): where should this file exist?
488 configDir := path.Join(os.Getenv("HOME"), ".config")
489 assertNil(os.MkdirAll(configDir, 0700))
490
491 xformer := pdfXformer{
492 client: getClient(path.Join(configDir, "google_storage_token.dat a")),
493 // storageBucket: "chromium-skia-gm",
494 // storageJsonDirectory: "dm-json-v1",
495 // storageImagesDirectory: "dm-images-v1",
496 accessControlEntity: "domain-google.com",
497 results: map[md5digest]map[int]md5digest{},
498 storageBucket: "skia-infra-testdata",
499 storageJsonDirectory: "pdfxformer-testdata/json",
500 storageImagesDirectory: "pdfxformer-testdata/img",
501 }
502
503 xformer.uploadErrorImage()
504
505 for _, rasterizer := range []pdf.Rasterizer{pdf.Pdfium{}, pdf.Poppler{}} {
506 if rasterizer.Enabled() {
507 xformer.rasterizers = append(xformer.rasterizers, raster izer)
508 } else {
509 glog.Infof("rasterizer %s is disabled", rasterizer.Strin g())
510 }
511 }
512 if len(xformer.rasterizers) == 0 {
513 glog.Fatalf("no rasterizers found")
514 }
515
516 end := time.Now()
517 start := end.Add(-72 * time.Hour)
518 timeTicker := time.Tick(time.Minute)
519 for {
520 xformer.processTimeRange(start, end)
521 glog.Flush() // Flush before waiting for next tick; it may be a while.
522 _ = <-timeTicker
523 start = end
524 end = time.Now()
525 }
526 }
OLDNEW
« no previous file with comments | « golden/go/pdfxform/data/45aa8af265d16839402583df5756a7c6.png ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698