src/pdf/SkPDFMetadata.cpp - Issue 1394263003: SkPDF: Optionally output PDF/A-2b archive format.

Side by Side Diff: src/pdf/SkPDFMetadata.cpp

Issue 1394263003: SkPDF: Optionally output PDF/A-2b archive format. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: works on windows Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 /*

	2 * Copyright 2015 Google Inc.

	3 *

	4 * Use of this source code is governed by a BSD-style license that can be

	5 * found in the LICENSE file.

	6 */

	7

	8 #include "SkPDFMetadata.h"

	9 #include "SkMD5.h"

	10 #include "SkPDFTypes.h"

	11

	12 SkPDFMetadata::UUID SkPDFMetadata::uuid() const {

	13 SkMD5 md5;

	14 const char uuidNamespace[] = "org.skia.pdf\n";
	tomhudson 2015/10/09 15:31:37 The require newline-termination of the namespace? The require newline-termination of the namespace? really? Eww. hal.canary 2015/10/09 19:13:27 // The main requirement is for the UUID to be uniq Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > The require newline-termination of the namespace? really? Eww. // The main requirement is for the UUID to be unique; the exact // format of the data that will be hashed is not important.
	15 md5.write(uuidNamespace, strlen(uuidNamespace));

	16 SkMSec msec = SkTime::GetMSecs();

	17 md5.write(&msec, sizeof(msec));

	18 SkTime::DateTime dateTime;

	19 SkTime::GetDateTime(&dateTime);

	20 md5.write(&dateTime, sizeof(dateTime));

	21 if (fCreation) {

	22 md5.write(fCreation.get(), sizeof(fCreation));

	23 }

	24 if (fModified) {

	25 md5.write(fModified.get(), sizeof(fModified));

	26 }

	27 for (const auto& kv : fInfo) {

	28 md5.write(kv.fKey.c_str(), kv.fKey.size());

	29 md5.write("\037", 1);

	30 md5.write(kv.fValue.c_str(), kv.fValue.size());

	31 md5.write("\036", 1);

	32 }

	33 SkMD5::Digest digest;

	34 md5.finish(digest);

	35 digest.data[6] = (digest.data[6] & 0x0F) \| 0x40;

	36 digest.data[8] = (digest.data[6] & 0x3F) \| 0x80;
	tomhudson 2015/10/09 15:31:37 Where do these magic numbers come from, the spec? Where do these magic numbers come from, the spec? hal.canary 2015/10/09 19:13:27 // See RFC 4122, page 6-7. Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > Where do these magic numbers come from, the spec? // See RFC 4122, page 6-7. tomhudson 2015/10/09 19:21:37 Acknowledged. Show quoted text On 2015/10/09 19:13:27, Hal Canary wrote: > On 2015/10/09 15:31:37, tomhudson wrote: > > Where do these magic numbers come from, the spec? > > // See RFC 4122, page 6-7. Acknowledged.
	37 static_assert(sizeof(digest) == sizeof(UUID), "uuid_size");

	38 SkPDFMetadata::UUID uuid;

	39 memcpy(&uuid, &digest, sizeof(digest));

	40 return uuid;

	41 }

	42

	43 SkPDFObject* SkPDFMetadata::CreatePdfId(const UUID& doc, const UUID& instance) {

	44 // /ID [ <81b14aafa313db63dbd6f981e49f94f4>

	45 // <81b14aafa313db63dbd6f981e49f94f4> ]

	46 SkAutoTUnref<SkPDFArray> array(new SkPDFArray);

	47 static_assert(sizeof(UUID) == 16, "uuid_size");

	48 array->appendString(

	49 SkString(reinterpret_cast<const char*>(&doc), sizeof(UUID)));

	50 array->appendString(

	51 SkString(reinterpret_cast<const char*>(&instance), sizeof(UUID)));

	52 return array.detach();

	53 }

	54

	55 static SkString pdf_date(const SkTime::DateTime& dt) {

	56 int timeZoneMinutes = SkToInt(dt.fTimeZoneMinutes);

	57 char timezoneSign = timeZoneMinutes >= 0 ? '+' : '-';

	58 int timeZoneHours = SkTAbs(timeZoneMinutes) / 60;

	59 timeZoneMinutes = SkTAbs(timeZoneMinutes) % 60;

	60 return SkStringPrintf(

	61 "D:%04u%02u%02u%02u%02u%02u%c%02d'%02d'",

	62 static_cast<unsigned>(dt.fYear), static_cast<unsigned>(dt.fMonth),

	63 static_cast<unsigned>(dt.fDay), static_cast<unsigned>(dt.fHour),

	64 static_cast<unsigned>(dt.fMinute),

	65 static_cast<unsigned>(dt.fSecond), timezoneSign, timeZoneHours,

	66 timeZoneMinutes);

	67 }

	68

	69 SkPDFObject* SkPDFMetadata::createDocumentInformationDict() const {

	70 SkAutoTUnref<SkPDFDict> dict(new SkPDFDict);

	71 static const char* keys[] = {

	72 "Title", "Author", "Subject", "Keywords", "Creator" };

	73 for (const char* key : keys) {

	74 for (const SkDocument::Attribute& keyValue : fInfo) {

	75 if (keyValue.fKey.equals(key)) {

	76 dict->insertString(key, keyValue.fValue);

	77 }

	78 }

	79 }

	80 dict->insertString("Producer", "Skia/PDF");

	81 if (fCreation) {

	82 dict->insertString("CreationDate", pdf_date(*fCreation.get()));

	83 }

	84 if (fModified) {

	85 dict->insertString("ModDate", pdf_date(*fModified.get()));

	86 }

	87 return dict.detach();

	88 }

	89

	90 #ifdef SK_PDF_GENERATE_PDFA

	91 // Improvement on SkStringPrintf to allow for arbitrarily long output.

	92 // TODO: replace SkStringPrintf.

	93 static SkString sk_string_printf(const char* format, ...) {

	94 #ifdef SK_BUILD_FOR_WIN

	95 va_list args;

	96 va_start(args, format);

	97 char buffer[1024];

	98 int length = _vsnprintf_s(buffer, sizeof(buffer), _TRUNCATE, format, args);

	99 va_end(args);

	100 if (length >= 0 && length < (int)sizeof(buffer)) {

	101 return SkString(buffer, length);

	102 }

	103 va_start(args, format);

	104 length = _vscprintf(format, args);

	105 va_end(args);

	106

	107 SkString string((size_t)length);

	108 va_start(args, format);

	109 SkDEBUGCODE(int check =)

	110 _vsnprintf_s(string.writable_str(), length + 1, _TRUNCATE, format, args);

	111 va_end(args);

	112 SkASSERT(check == length);

	113 SkASSERT(string[length] == '\0');

	114 return skstd::move(string);

	115 #else // C99/C++11 standard vsnprintf

	116 // TODO: When all compilers support this, remove windows-specific code.
	tomhudson 2015/10/09 15:31:37 Nit: how far are we from being able to get rid of Nit: how far are we from being able to get rid of the Windows-specific version? I thought we had C++11 everywhere now? hal.canary 2015/10/09 19:13:27 We have C++11 semantics everywhere. The version Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > Nit: how far are we from being able to get rid of the Windows-specific version? > I thought we had C++11 everywhere now? We have C++11 semantics everywhere. The version of the C standard library I have on my windows workstation has vsnprintf, but it does not behave according to the standard. tomhudson 2015/10/09 19:21:37 Oh, ouch. Show quoted text On 2015/10/09 19:13:27, Hal Canary wrote: > We have C++11 semantics everywhere. The version of the C standard library I > have on my windows workstation has vsnprintf, but it does not behave according > to the standard. Oh, ouch.
	117 va_list args;

	118 va_start(args, format);

	119 char buffer[1024];

	120 int length = vsnprintf(buffer, sizeof(buffer), format, args);

	121 va_end(args);

	122 if (length < 0) {

	123 return SkString();

	124 }

	125 if (length < (int)sizeof(buffer)) {

	126 return SkString(buffer, length);

	127 }

	128 SkString string((size_t)length);

	129 va_start(args, format);

	130 SkDEBUGCODE(int check =)

	131 vsnprintf(string.writable_str(), length + 1, format, args);

	132 va_end(args);

	133 SkASSERT(check == length);

	134 SkASSERT(string[length] == '\0');

	135 return skstd::move(string);

	136 #endif

	137 }

	138

	139 static const SkString get(const SkTArray<SkDocument::Attribute>& info,

	140 const char* key) {

	141 for (const auto& keyValue : info) {

	142 if (keyValue.fKey.equals(key)) {

	143 return keyValue.fValue;

	144 }

	145 }

	146 return SkString();

	147 }

	148

	149 #define HEXIFY(INPUT_PTR, OUTPUT_PTR, HEX_STRING, BYTE_COUNT) \

	150 do { \

	151 for (int i = 0; i < (BYTE_COUNT); ++i) { \

	152 uint8_t value = *(INPUT_PTR)++; \

	153 *(OUTPUT_PTR)++ = (HEX_STRING)[value >> 4]; \

	154 *(OUTPUT_PTR)++ = (HEX_STRING)[value & 0xF]; \

	155 } \

	156 } while (false)
	tomhudson 2015/10/09 15:31:37 Nit: if you already have a for() statement with ev Nit: if you already have a for() statement with everything scoped inside, is the do-while() necessary? hal.canary 2015/10/09 19:13:27 It means nothing, but allows me to put a semi-colo Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > Nit: if you already have a for() statement with everything scoped inside, is the > do-while() necessary? It means nothing, but allows me to put a semi-colon after HEXIFY(data, ptr, gHex, 4) tomhudson 2015/10/09 19:21:37 Acknowledged. Show quoted text On 2015/10/09 19:13:27, Hal Canary wrote: > On 2015/10/09 15:31:37, tomhudson wrote: > > Nit: if you already have a for() statement with everything scoped inside, is > the > > do-while() necessary? > > It means nothing, but allows me to put a semi-colon after > HEXIFY(data, ptr, gHex, 4) Acknowledged.
	157 static SkString uuid_to_string(const SkPDFMetadata::UUID& uuid) {

	158 // 8-4-4-4-12

	159 char buffer[36]; // [32 + 4]

	160 static const char gHex[] = "0123456789abcdef";

	161 SkASSERT(strlen(gHex) == 16);

	162 char* ptr = buffer;

	163 const uint8_t* data = uuid.fData;

	164 HEXIFY(data, ptr, gHex, 4);

	165 *ptr++ = '-';

	166 HEXIFY(data, ptr, gHex, 2);

	167 *ptr++ = '-';

	168 HEXIFY(data, ptr, gHex, 2);

	169 *ptr++ = '-';

	170 HEXIFY(data, ptr, gHex, 2);

	171 *ptr++ = '-';

	172 HEXIFY(data, ptr, gHex, 6);

	173 SkASSERT(ptr == buffer + 36);

	174 SkASSERT(data == uuid.fData + 16);

	175 return SkString(buffer, 36);

	176 }

	177 #undef HEXIFY

	178

	179 namespace {

	180 class PDFXMLObject : public SkPDFObject {

	181 public:

	182 PDFXMLObject(SkString xml) : fXML(skstd::move(xml)) {}

	183 void emitObject(SkWStream* stream,

	184 const SkPDFObjNumMap& omap,

	185 const SkPDFSubstituteMap& smap) const override {

	186 SkPDFDict dict("Metadata");

	187 dict.insertName("Subtype", "XML");

	188 dict.insertInt("Length", fXML.size());

	189 dict.emitObject(stream, omap, smap);

	190 static const char streamBegin[] = " stream\n";

	191 stream->write(streamBegin, strlen(streamBegin));

	192 // Do not compress this.
	tomhudson 2015/10/09 15:31:37 WHY, OH, WHY? (And why is compression even a topi WHY, OH, WHY? (And why is compression even a topic? Is it going on somewhere nearby unobviously? (I could easily have missed it if it isn't in code that is part of this review.)) hal.canary 2015/10/09 19:13:27 The standard requires that a program that does not Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > WHY, OH, WHY? The standard requires that a program that does not understand PDF can grep for "<?xpacket" and extracť the entire XML. Show quoted text > (And why is compression even a topic? Is it going on somewhere nearby > unobviously? (I could easily have missed it if it isn't in code that is > part of this review.)) If I wanted to compress, I'd use SkPDFStream. tomhudson 2015/10/09 19:21:37 Just trying to understand why the comment was ther Just trying to understand why the comment was there and how a future maintainer is supposed to be driven to act/understand based on it. hal.canary 2015/10/09 19:28:49 Comment fixed: // Do not compress this. The Show quoted text On 2015/10/09 19:21:37, tomhudson wrote: > Just trying to understand why the comment was there and how a future maintainer > is supposed to be driven to act/understand based on it. Comment fixed: // Do not compress this. The standard requires that a // program that does not understand PDF can grep for // "<?xpacket" and extracť the entire XML.
	193 stream->write(fXML.c_str(), fXML.size());

	194 static const char streamEnd[] = "\nendstream";

	195 stream->write(streamEnd, strlen(streamEnd));

	196 }

	197 private:

	198 const SkString fXML;

	199 };

	200 } // namespace

	201

	202 static int count_xml_escape_size(const SkString& input) {

	203 int extra = 0;

	204 for (size_t i = 0; i < input.size(); ++i) {

	205 if (input[i] == '&') {

	206 extra += 4; // strlen("&") - strlen("&")

	207 } else if (input[i] == '<') {

	208 extra += 3; // strlen("<") - strlen("<")
	tomhudson 2015/10/09 15:31:37 Nit: not obvious why we don't also have to escape Nit: not obvious why we don't also have to escape &gt to preserve correctness hal.canary 2015/10/09 19:13:27 <dc:description> <rdf:Alt> <rdf:li> Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > Nit: not obvious why we don't also have to escape &gt to preserve correctness <dc:description> <rdf:Alt> <rdf:li> My Document is called <<, aka "very much less than" </rdf:li> </rdf:Alt> </dc:description> is not valid XML. tomhudson 2015/10/09 19:21:37 I think the key question is: XML never has user-ge I think the key question is: XML never has user-generated text inside tags <>? (The way HTML can have <foo bar="baz">) Or, we assume that this is valid XML. hal.canary 2015/10/09 19:28:49 We don't use that aspect of XML here everywhere w Show quoted text On 2015/10/09 19:21:37, tomhudson wrote: > I think the key question is: XML never has user-generated text inside tags <>? > (The way HTML can have <foo bar="baz">) > Or, we assume that this is valid XML. > We don't use that aspect of XML here everywhere we have user-provided or client-provided text, it is in the form: <tag>user-provided plain text</tag>
	209 }

	210 }

	211 return extra;

	212 }

	213

	214 const SkString escape_xml(const SkString& input,

	215 const char* before = nullptr,

	216 const char* after = nullptr) {

	217 if (input.size() == 0) {

	218 return input;

	219 }

	220 // "&" --> "&" and "<" --> "<"

	221 // text is assumed to be in UTF-8

	222 // all strings are xml content, not attribute values.

	223 size_t beforeLen = before ? strlen(before) : 0;

	224 size_t afterLen = after ? strlen(after) : 0;

	225 int extra = count_xml_escape_size(input);

	226 SkString output(input.size() + extra + beforeLen + afterLen);

	227 char* out = output.writable_str();

	228 if (before) {

	229 strncpy(out, before, beforeLen);

	230 out += beforeLen;

	231 }

	232 static const char kAmp[] = "&";

	233 static const char kLt[] = "<";

	234 for (size_t i = 0; i < input.size(); ++i) {

	235 if (input[i] == '&') {

	236 strncpy(out, kAmp, strlen(kAmp));

	237 out += strlen(kAmp);

	238 } else if (input[i] == '<') {

	239 strncpy(out, kLt, strlen(kLt));

	240 out += strlen(kLt);

	241 } else {

	242 *out++ = input[i];

	243 }

	244 }

	245 if (after) {

	246 strncpy(out, after, afterLen);

	247 out += afterLen;

	248 }

	249 // Validate that we haven't written outside of our string.

	250 SkASSERT(out == &output.writable_str()[output.size()]);

	251 *out = '\0';

	252 return skstd::move(output);

	253 }

	254

	255 SkPDFObject* SkPDFMetadata::createXMPObject(const UUID& doc,

	256 const UUID& instance) const {

	257 // A PDF-1a file should have

	258 // the integer value 1 for pdfaid:part and
	tomhudson 2015/10/09 15:31:37 Yet you write '2' and 'B' below in templateString? Yet you write '2' and 'B' below in templateString? hal.canary 2015/10/09 19:13:27 The spec was confusing me. But I think I have it Show quoted text On 2015/10/09 15:31:37, tomhudson wrote: > Yet you write '2' and 'B' below in templateString? The spec was confusing me. But I think I have it correct in the code now. Comment removed.
	259 // the value "A" for pdfaid:conformance.

	260 static const char templateString[] =

	261 "<?xpacket begin=\"\" id=\"W5M0MpCehiHzreSzNTczkc9d\"?>\n"

	262 "<x:xmpmeta xmlns:x=\"adobe:ns:meta/\"\n"

	263 " x:xmptk=\"Adobe XMP Core 5.4-c005 78.147326, 2012/08/23-13:03:03\">\n"

	264 "<rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n"

	265 "<rdf:Description rdf:about=\"\"\n"

	266 " xmlns:xmp=\"http://ns.adobe.com/xap/1.0/\"\n"

	267 " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n"

	268 " xmlns:xmpMM=\"http://ns.adobe.com/xap/1.0/mm/\"\n"

	269 " xmlns:pdf=\"http://ns.adobe.com/pdf/1.3/\"\n"

	270 " xmlns:pdfaid=\"http://www.aiim.org/pdfa/ns/id/\">\n"

	271 "<pdfaid:part>2</pdfaid:part>\n"

	272 "<pdfaid:conformance>B</pdfaid:conformance>\n"

	273 "%s" // ModifyDate

	274 "%s" // CreateDate

	275 "%s" // MetadataDate

	276 "%s" // xmp:CreatorTool

	277 "<dc:format>application/pdf</dc:format>\n"

	278 "%s" // dc:title

	279 "%s" // dc:description

	280 "%s" // author

	281 "%s" // keywords

	282 "<xmpMM:DocumentID>uuid:%s</xmpMM:DocumentID>\n"

	283 "<xmpMM:InstanceID>uuid:%s</xmpMM:InstanceID>\n"

	284 "<pdf:Producer>Skia/PDF</pdf:Producer>\n"

	285 "%s" // pdf:Keywords

	286 "</rdf:Description>\n"

	287 "</rdf:RDF>\n"

	288 "</x:xmpmeta>\n" // Note: the standard suggests 4k of padding.

	289 "<?xpacket end=\"w\"?>\n";

	290

	291 SkString creationDate;

	292 SkString modificationDate;

	293 SkString metadataDate;

	294 if (fCreation) {

	295 SkString tmp;

	296 fCreation->toISO8601(&tmp);

	297 SkASSERT(0 == count_xml_escape_size(tmp));

	298 // YYYY-mm-ddTHH:MM:SS[+\|-]ZZ:ZZ; no need to escape

	299 creationDate = sk_string_printf("<xmp:CreateDate>%s</xmp:CreateDate>\n",

	300 tmp.c_str());

	301 }

	302 if (fModified) {

	303 SkString tmp;

	304 fModified->toISO8601(&tmp);

	305 SkASSERT(0 == count_xml_escape_size(tmp));

	306 modificationDate =

	307 sk_string_printf("<xmp:ModifyDate>%s</xmp:ModifyDate>\n",

	308 tmp.c_str());

	309 metadataDate = sk_string_printf("<xmp:MetadataDate>%s</xmp:MetadataDate> \n",

	310 tmp.c_str());

	311 }

	312

	313 SkString title = escape_xml(get(fInfo, "Title"),

	314 "<dc:title><rdf:Alt><rdf:li>",

	315 "</rdf:li></rdf:Alt></dc:title>\n");

	316 SkString author = escape_xml(get(fInfo, "Author"),

	317 "<dc:creator><rdf:Bag><rdf:li>",

	318 "</rdf:li></rdf:Bag></dc:creator>\n");

	319 // TODO: in theory, XMP can support multiple authors. Split on a delimiter?

	320 SkString subject = escape_xml(get(fInfo, "Subject"),

	321 "<dc:description><rdf:Alt><rdf:li>",

	322 "</rdf:li></rdf:Alt></dc:description>\n");

	323 SkString keywords1 = escape_xml(get(fInfo, "Keywords"),

	324 "<dc:subject><rdf:Bag><rdf:li>",

	325 "</rdf:li></rdf:Bag></dc:subject>\n");

	326 SkString keywords2 = escape_xml(get(fInfo, "Keywords"),

	327 "<pdf:Keywords>",

	328 "</pdf:Keywords>\n");

	329

	330 // TODO: in theory, keywords can be a list too.

	331 SkString creator = escape_xml(get(fInfo, "Creator"),

	332 "<xmp:CreatorTool>", "</xmp:CreatorTool>\n");

	333 SkString documentID = uuid_to_string(doc); // no need to escape

	334 SkASSERT(0 == count_xml_escape_size(documentID));

	335 SkString instanceID = uuid_to_string(instance);

	336 SkASSERT(0 == count_xml_escape_size(instanceID));

	337 return new PDFXMLObject(sk_string_printf(templateString,

	338 modificationDate.c_str(),

	339 creationDate.c_str(),

	340 metadataDate.c_str(),

	341 creator.c_str(),

	342 title.c_str(),

	343 subject.c_str(),

	344 author.c_str(),

	345 keywords1.c_str(),

	346 documentID.c_str(),

	347 instanceID.c_str(),

	348 keywords2.c_str()));

	349 }

	350

	351 #endif // SK_PDF_GENERATE_PDFA

OLD	NEW

« src/pdf/SkPDFMetadata.h ('K') | « src/pdf/SkPDFMetadata.h ('k') | src/pdf/SkPDFTypes.h » ('j') | src/pdf/SkPDFTypes.h » ('J')