Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(231)

Side by Side Diff: experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.cpp

Issue 23020003: pdfviewer: debug code for drawText (show magenta background for text, to show text even when we fai… (Closed) Base URL: http://skia.googlecode.com/svn/trunk/
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 #include "SkNativeParsedPDF.h"
2 #include "SkPdfNativeTokenizer.h"
3 #include "SkPdfBasics.h"
4 #include "SkPdfObject.h"
5
6 #include <stdio.h>
7 #include <string.h>
8 #include <sys/types.h>
9 #include <sys/stat.h>
10
11 #include "SkPdfFileTrailerDictionary_autogen.h"
12 #include "SkPdfCatalogDictionary_autogen.h"
13 #include "SkPdfPageObjectDictionary_autogen.h"
14 #include "SkPdfPageTreeNodeDictionary_autogen.h"
15 #include "SkPdfMapper_autogen.h"
16
17 #include "SkStream.h"
18
19
// Returns the size in bytes that stat() reports for |filename|, or -1 when
// stat() fails (for example because the file does not exist).
static long getFileSize(const char* filename)
{
    struct stat info;
    if (stat(filename, &info) != 0) {
        return -1;
    }
    return (long)info.st_size;
}
26
27 static const unsigned char* lineHome(const unsigned char* start, const unsigned char* current) {
28 while (current > start && !isPdfEOL(*(current - 1))) {
29 current--;
30 }
31 return current;
32 }
33
34 static const unsigned char* previousLineHome(const unsigned char* start, const u nsigned char* current) {
35 if (current > start && isPdfEOL(*(current - 1))) {
36 current--;
37 }
38
39 // allows CR+LF, LF+CR but not two CR+CR or LF+LF
40 if (current > start && isPdfEOL(*(current - 1)) && *current != *(current - 1 )) {
41 current--;
42 }
43
44 while (current > start && !isPdfEOL(*(current - 1))) {
45 current--;
46 }
47
48 return current;
49 }
50
51 static const unsigned char* ignoreLine(const unsigned char* current, const unsig ned char* end) {
52 while (current < end && !isPdfEOL(*current)) {
53 current++;
54 }
55 current++;
56 if (current < end && isPdfEOL(*current) && *current != *(current - 1)) {
57 current++;
58 }
59 return current;
60 }
61
// Global pointer to a parsed document. Within this file it is only assigned
// by the SkNativeParsedPDF(const char* path) constructor, which stores |this|.
62 SkNativeParsedPDF* gDoc = NULL;
63
64 // TODO(edisonn): NYI
65 // TODO(edisonn): 3 constructors: from URL, from stream, from file ...
66 // TODO(edisonn): write one that accepts errors in the file and ignores/fixes them
67 // TODO(edisonn): testing:
68 // 1) run on a lot of files
69 // 2) recoverable corrupt file: remove endobj, endstream, remove other keywords, use other white spaces, insert comments randomly, ...
70 // 3) irrecoverable corrupt file
71
72 SkNativeParsedPDF::SkNativeParsedPDF(SkStream* stream)
73 : fAllocator(new SkPdfAllocator())
74 , fFileContent(NULL)
75 , fContentLength(0)
76 , fRootCatalogRef(NULL)
77 , fRootCatalog(NULL) {
78 size_t size = stream->getLength();
79 void* ptr = sk_malloc_throw(size);
80 stream->read(ptr, size);
81
82 init(ptr, size);
83 }
84
85 SkNativeParsedPDF::SkNativeParsedPDF(const char* path)
86 : fAllocator(new SkPdfAllocator())
87 , fFileContent(NULL)
88 , fContentLength(0)
89 , fRootCatalogRef(NULL)
90 , fRootCatalog(NULL) {
91 gDoc = this;
92 FILE* file = fopen(path, "r");
93 // TODO(edisonn): put this in a function that can return NULL
94 if (file) {
95 size_t size = getFileSize(path);
96 void* content = sk_malloc_throw(size);
97 bool ok = (0 != fread(content, size, 1, file));
98 fclose(file);
99 if (!ok) {
100 sk_free(content);
101 // TODO(edisonn): report read error
102 // TODO(edisonn): not nice to return like this from constructor, cre ate a static
103 // function that can report NULL for failures.
104 return; // Doc will have 0 pages
105 }
106
107 init(content, size);
108 }
109 }
110
111 void SkNativeParsedPDF::init(const void* bytes, size_t length) {
112 fFileContent = (const unsigned char*)bytes;
113 fContentLength = length;
114 const unsigned char* eofLine = lineHome(fFileContent, fFileContent + fConten tLength - 1);
115 const unsigned char* xrefByteOffsetLine = previousLineHome(fFileContent, eof Line);
116 const unsigned char* xrefstartKeywordLine = previousLineHome(fFileContent, x refByteOffsetLine);
117
118 if (strcmp((char*)xrefstartKeywordLine, "startxref") != 0) {
119 // TODO(edisonn): report/issue
120 }
121
122 long xrefByteOffset = atol((const char*)xrefByteOffsetLine);
123
124 bool storeCatalog = true;
125 while (xrefByteOffset >= 0) {
126 const unsigned char* trailerStart = readCrossReferenceSection(fFileConte nt + xrefByteOffset, xrefstartKeywordLine);
127 xrefByteOffset = -1;
128 if (trailerStart < xrefstartKeywordLine) {
129 readTrailer(trailerStart, xrefstartKeywordLine, storeCatalog, &xrefB yteOffset, false);
130 storeCatalog = false;
131 }
132 }
133
134 // TODO(edisonn): warn/error expect fObjects[fRefCatalogId].fGeneration == f RefCatalogGeneration
135 // TODO(edisonn): security, verify that SkPdfCatalogDictionary is indeed usi ng mapper
136 // load catalog
137
138 if (fRootCatalogRef) {
139 fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef );
140 if (fRootCatalog->isDictionary() && fRootCatalog->valid()) {
141 SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
142 if (tree && tree->isDictionary() && tree->valid()) {
143 fillPages(tree);
144 }
145 }
146 }
147
148 // TODO(edisonn): clean up this doc, or better, let the caller call again an d build a new doc
149 // caller should be a static function.
150 if (pages() == 0) {
151 loadWithoutXRef();
152 }
153
154 // TODO(edisonn): corrupted pdf, read it from beginning and rebuild (xref, t railer, or just reall all objects)
155 // 0 pages
156
157 // now actually read all objects if we want, or do it lazyly
158 // and resolve references?... or not ...
159 }
160
161 void SkNativeParsedPDF::loadWithoutXRef() {
162 const unsigned char* current = fFileContent;
163 const unsigned char* end = fFileContent + fContentLength;
164
165 // TODO(edisonn): read pdf version
166 current = ignoreLine(current, end);
167
168 current = skipPdfWhiteSpaces(0, current, end);
169 while (current < end) {
170 SkPdfObject token;
171 current = nextObject(0, current, end, &token, NULL, NULL);
172 if (token.isInteger()) {
173 int id = (int)token.intValue();
174
175 token.reset();
176 current = nextObject(0, current, end, &token, NULL, NULL);
177 // int generation = (int)token.intValue(); // TODO(edisonn): ignore d for now
178
179 token.reset();
180 current = nextObject(0, current, end, &token, NULL, NULL);
181 // TODO(edisonn): must be obj, return error if not? ignore ?
182 if (!token.isKeyword("obj")) {
183 continue;
184 }
185
186 while (fObjects.count() < id + 1) {
187 reset(fObjects.append());
188 }
189
190 fObjects[id].fOffset = current - fFileContent;
191
192 SkPdfObject* obj = fAllocator->allocObject();
193 current = nextObject(0, current, end, obj, fAllocator, this);
194
195 fObjects[id].fResolvedReference = obj;
196 fObjects[id].fObj = obj;
197
198 // set objects
199 } else if (token.isKeyword("trailer")) {
200 long dummy;
201 current = readTrailer(current, end, true, &dummy, true);
202 } else if (token.isKeyword("startxref")) {
203 token.reset();
204 current = nextObject(0, current, end, &token, NULL, NULL); // ignor e
205 }
206
207 current = skipPdfWhiteSpaces(0, current, end);
208 }
209
210 // TODO(edisonn): hack, detect root catalog - we need to implement liniarize d support, and remove this hack.
211 if (!fRootCatalogRef) {
212 for (unsigned int i = 0 ; i < objects(); i++) {
213 SkPdfObject* obj = object(i);
214 SkPdfObject* root = (obj && obj->isDictionary()) ? obj->get("Root") : NULL;
215 if (root && root->isReference()) {
216 fRootCatalogRef = root;
217 }
218 }
219 }
220
221
222 if (fRootCatalogRef) {
223 fRootCatalog = (SkPdfCatalogDictionary*)resolveReference(fRootCatalogRef );
224 if (fRootCatalog->isDictionary() && fRootCatalog->valid()) {
225 SkPdfPageTreeNodeDictionary* tree = fRootCatalog->Pages(this);
226 if (tree && tree->isDictionary() && tree->valid()) {
227 fillPages(tree);
228 }
229 }
230 }
231
232
233 }
234
235 // TODO(edisonn): NYI
236 SkNativeParsedPDF::~SkNativeParsedPDF() {
237 sk_free((void*)fFileContent);
238 delete fAllocator;
239 }
240
241 const unsigned char* SkNativeParsedPDF::readCrossReferenceSection(const unsigned char* xrefStart, const unsigned char* trailerEnd) {
242 SkPdfObject xref;
243 const unsigned char* current = nextObject(0, xrefStart, trailerEnd, &xref, N ULL, NULL);
244
245 if (!xref.isKeyword("xref")) {
246 return trailerEnd;
247 }
248
249 SkPdfObject token;
250 while (current < trailerEnd) {
251 token.reset();
252 const unsigned char* previous = current;
253 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
254 if (!token.isInteger()) {
255 return previous;
256 }
257
258 int startId = (int)token.intValue();
259 token.reset();
260 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
261
262 if (!token.isInteger()) {
263 // TODO(edisonn): report/warning
264 return current;
265 }
266
267 int entries = (int)token.intValue();
268
269 for (int i = 0; i < entries; i++) {
270 token.reset();
271 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
272 if (!token.isInteger()) {
273 // TODO(edisonn): report/warning
274 return current;
275 }
276 int offset = (int)token.intValue();
277
278 token.reset();
279 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
280 if (!token.isInteger()) {
281 // TODO(edisonn): report/warning
282 return current;
283 }
284 int generation = (int)token.intValue();
285
286 token.reset();
287 current = nextObject(0, current, trailerEnd, &token, NULL, NULL);
288 if (!token.isKeyword() || token.lenstr() != 1 || (*token.c_str() != 'f' && *token.c_str() != 'n')) {
289 // TODO(edisonn): report/warning
290 return current;
291 }
292
293 addCrossSectionInfo(startId + i, generation, offset, *token.c_str() == 'f');
294 }
295 }
296 // TODO(edisonn): it should never get here? there is no trailer?
297 return current;
298 }
299
300 const unsigned char* SkNativeParsedPDF::readTrailer(const unsigned char* trailer Start, const unsigned char* trailerEnd, bool storeCatalog, long* prev, bool skip Keyword) {
301 *prev = -1;
302
303 const unsigned char* current = trailerStart;
304 if (!skipKeyword) {
305 SkPdfObject trailerKeyword;
306 // TODO(edisonn): use null allocator, and let it just fail if memory
307 // needs allocated (but no crash)!
308 current = nextObject(0, current, trailerEnd, &trailerKeyword, NULL, NULL );
309
310 if (!trailerKeyword.isKeyword() || strlen("trailer") != trailerKeyword.l enstr() ||
311 strncmp(trailerKeyword.c_str(), "trailer", strlen("trailer")) != 0) {
312 // TODO(edisonn): report warning, rebuild trailer from objects.
313 return current;
314 }
315 }
316
317 SkPdfObject token;
318 current = nextObject(0, current, trailerEnd, &token, fAllocator, NULL);
319 if (!token.isDictionary()) {
320 return current;
321 }
322 SkPdfFileTrailerDictionary* trailer = (SkPdfFileTrailerDictionary*)&token;
323 if (!trailer->valid()) {
324 return current;
325 }
326
327 if (storeCatalog) {
328 SkPdfObject* ref = trailer->Root(NULL);
329 if (ref == NULL || !ref->isReference()) {
330 // TODO(edisonn): oops, we have to fix the corrup pdf file
331 return current;
332 }
333 fRootCatalogRef = ref;
334 }
335
336 if (trailer->has_Prev()) {
337 *prev = (long)trailer->Prev(NULL);
338 }
339
340 return current;
341 }
342
343 void SkNativeParsedPDF::addCrossSectionInfo(int id, int generation, int offset, bool isFreed) {
344 // TODO(edisonn): security here
345 while (fObjects.count() < id + 1) {
346 reset(fObjects.append());
347 }
348
349 fObjects[id].fOffset = offset;
350 fObjects[id].fObj = NULL;
351 fObjects[id].fResolvedReference = NULL;
352 }
353
354 SkPdfObject* SkNativeParsedPDF::readObject(int id/*, int expectedGeneration*/) {
355 long startOffset = fObjects[id].fOffset;
356 //long endOffset = fObjects[id].fOffsetEnd;
357 // TODO(edisonn): use hinted endOffset
358 // TODO(edisonn): current implementation will result in a lot of memory usag e
359 // to decrease memory usage, we wither need to be smart and know where objec ts end, and we will
360 // alocate only the chancks needed, or the tokenizer will not make copies, b ut then it needs to
361 // cache the results so it does not go twice on the same buffer
362 const unsigned char* current = fFileContent + startOffset;
363 const unsigned char* end = fFileContent + fContentLength;
364
365 SkPdfNativeTokenizer tokenizer(current, end - current, fMapper, fAllocator, this);
366
367 SkPdfObject idObj;
368 SkPdfObject generationObj;
369 SkPdfObject objKeyword;
370 SkPdfObject* dict = fAllocator->allocObject();
371
372 current = nextObject(0, current, end, &idObj, NULL, NULL);
373 if (current >= end) {
374 // TODO(edisonn): report warning/error
375 return NULL;
376 }
377
378 current = nextObject(0, current, end, &generationObj, NULL, NULL);
379 if (current >= end) {
380 // TODO(edisonn): report warning/error
381 return NULL;
382 }
383
384 current = nextObject(0, current, end, &objKeyword, NULL, NULL);
385 if (current >= end) {
386 // TODO(edisonn): report warning/error
387 return NULL;
388 }
389
390 if (!idObj.isInteger() || !generationObj.isInteger() || id != idObj.intValue ()/* || generation != generationObj.intValue()*/) {
391 // TODO(edisonn): report warning/error
392 }
393
394 if (!objKeyword.isKeyword() || strcmp(objKeyword.c_str(), "obj") != 0) {
395 // TODO(edisonn): report warning/error
396 }
397
398 current = nextObject(1, current, end, dict, fAllocator, this);
399
400 // TODO(edisonn): report warning/error - verify last token is endobj
401
402 return dict;
403 }
404
405 void SkNativeParsedPDF::fillPages(SkPdfPageTreeNodeDictionary* tree) {
406 SkPdfArray* kids = tree->Kids(this);
407 if (kids == NULL) {
408 *fPages.append() = (SkPdfPageObjectDictionary*)tree;
409 return;
410 }
411
412 int cnt = kids->size();
413 for (int i = 0; i < cnt; i++) {
414 SkPdfObject* obj = resolveReference(kids->objAtAIndex(i));
415 if (fMapper->mapPageObjectDictionary(obj) != kPageObjectDictionary_SkPdf ObjectType) {
416 *fPages.append() = (SkPdfPageObjectDictionary*)obj;
417 } else {
418 // TODO(edisonn): verify that it is a page tree indeed
419 fillPages((SkPdfPageTreeNodeDictionary*)obj);
420 }
421 }
422 }
423
424 int SkNativeParsedPDF::pages() const {
425 return fPages.count();
426 }
427
428 SkPdfPageObjectDictionary* SkNativeParsedPDF::page(int page) {
429 SkASSERT(page >= 0 && page < fPages.count());
430 return fPages[page];
431 }
432
433
434 SkPdfResourceDictionary* SkNativeParsedPDF::pageResources(int page) {
435 SkASSERT(page >= 0 && page < fPages.count());
436 return fPages[page]->Resources(this);
437 }
438
439 // TODO(edisonn): Partial implemented. Move the logics directly in the code gene rator for inheritable and default value?
440 SkRect SkNativeParsedPDF::MediaBox(int page) {
441 SkPdfPageObjectDictionary* current = fPages[page];
442 while (!current->has_MediaBox() && current->has_Parent()) {
443 current = (SkPdfPageObjectDictionary*)current->Parent(this);
444 }
445 if (current) {
446 return current->MediaBox(this);
447 }
448 return SkRect::MakeEmpty();
449 }
450
451 // TODO(edisonn): stream or array ... ? for now only array
452 SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfPage(int page,
453 SkPdfAllocator* allocat or) {
454 if (fPages[page]->isContentsAStream(this)) {
455 return tokenizerOfStream(fPages[page]->getContentsAsStream(this), alloca tor);
456 } else {
457 // TODO(edisonn): NYI, we need to concatenate all streams in the array o r make the tokenizer smart
458 // so we don't allocate new memory
459 return NULL;
460 }
461 }
462
463 SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfStream(SkPdfObject* stream,
464 SkPdfAllocator* alloc ator) {
465 if (stream == NULL) {
466 return NULL;
467 }
468
469 return new SkPdfNativeTokenizer(stream, fMapper, allocator, this);
470 }
471
472 // TODO(edisonn): NYI
473 SkPdfNativeTokenizer* SkNativeParsedPDF::tokenizerOfBuffer(const unsigned char* buffer, size_t len,
474 SkPdfAllocator* alloc ator) {
475 // warning does not track two calls in the same buffer! the buffer is update d!
476 // make a clean copy if needed!
477 return new SkPdfNativeTokenizer(buffer, len, fMapper, allocator, this);
478 }
479
480 size_t SkNativeParsedPDF::objects() const {
481 return fObjects.count();
482 }
483
484 SkPdfObject* SkNativeParsedPDF::object(int i) {
485 SkASSERT(!(i < 0 || i > fObjects.count()));
486
487 if (i < 0 || i > fObjects.count()) {
488 return NULL;
489 }
490
491 if (fObjects[i].fObj == NULL) {
492 // TODO(edisonn): when we read the cross reference sections, store the s tart of the next object
493 // and fill fOffsetEnd
494 fObjects[i].fObj = readObject(i);
495 }
496
497 return fObjects[i].fObj;
498 }
499
500 const SkPdfMapper* SkNativeParsedPDF::mapper() const {
501 return fMapper;
502 }
503
504 SkPdfReal* SkNativeParsedPDF::createReal(double value) const {
505 SkPdfObject* obj = fAllocator->allocObject();
506 SkPdfObject::makeReal(value, obj);
507 return (SkPdfReal*)obj;
508 }
509
510 SkPdfInteger* SkNativeParsedPDF::createInteger(int value) const {
511 SkPdfObject* obj = fAllocator->allocObject();
512 SkPdfObject::makeInteger(value, obj);
513 return (SkPdfInteger*)obj;
514 }
515
516 SkPdfString* SkNativeParsedPDF::createString(const unsigned char* sz, size_t len ) const {
517 SkPdfObject* obj = fAllocator->allocObject();
518 SkPdfObject::makeString(sz, len, obj);
519 return (SkPdfString*)obj;
520 }
521
522 SkPdfAllocator* SkNativeParsedPDF::allocator() const {
523 return fAllocator;
524 }
525
526 // TODO(edisonn): fix infinite loop if ref to itself!
527 // TODO(edisonn): perf, fix refs at load, and resolve will simply return fResolv edReference?
528 SkPdfObject* SkNativeParsedPDF::resolveReference(SkPdfObject* ref) {
529 if (ref && ref->isReference()) {
530 int id = ref->referenceId();
531 // TODO(edisonn): generation/updates not supported now
532 //int gen = ref->referenceGeneration();
533
534 // TODO(edisonn): verify id and gen expected
535 if (id < 0 || id >= fObjects.count()) {
536 // TODO(edisonn): report error/warning
537 return NULL;
538 }
539
540 if (fObjects[id].fResolvedReference != NULL) {
541
542 #ifdef PDF_TRACE
543 printf("\nresolve(%s) = %s\n", ref->toString(0).c_str(), fObjects[id ].fResolvedReference->toString(0, ref->toString().size() + 13).c_str());
544 #endif
545
546 return fObjects[id].fResolvedReference;
547 }
548
549 if (fObjects[id].fObj == NULL) {
550 fObjects[id].fObj = readObject(id);
551 }
552
553 if (fObjects[id].fResolvedReference == NULL) {
554 if (!fObjects[id].fObj->isReference()) {
555 fObjects[id].fResolvedReference = fObjects[id].fObj;
556 } else {
557 fObjects[id].fResolvedReference = resolveReference(fObjects[id]. fObj);
558 }
559 }
560
561 #ifdef PDF_TRACE
562 printf("\nresolve(%s) = %s\n", ref->toString(0).c_str(), fObjects[id].fR esolvedReference->toString(0, ref->toString().size() + 13).c_str());
563 #endif
564 return fObjects[id].fResolvedReference;
565 }
566
567
568
569 // TODO(edisonn): fix the mess with const, probably we need to remove it pre tty much everywhere
570 return (SkPdfObject*)ref;
571 }
572
573 size_t SkNativeParsedPDF::bytesUsed() const {
574 return fAllocator->bytesUsed() +
575 fContentLength +
576 fObjects.count() * sizeof(PublicObjectEntry) +
577 fPages.count() * sizeof(SkPdfPageObjectDictionary*) +
578 sizeof(*this);
579 }
OLDNEW
« no previous file with comments | « experimental/PdfViewer/pdfparser/native/SkNativeParsedPDF.h ('k') | experimental/PdfViewer/pdfparser/native/SkPdfNativeDoc.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698