Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(55)

Side by Side Diff: cgpt/cgpt_common.c

Issue 5025003: The right implementation of CGPT label conversion between UTF8 and UTF16. (Closed) Base URL: ssh://git@gitrw.chromium.org:9222/vboot_reference.git
Patch Set: Fixed per code review opinion. Created 10 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « cgpt/cgpt.h ('k') | cgpt/cmd_add.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* Copyright (c) 2010 The Chromium OS Authors. All rights reserved. 1 /* Copyright (c) 2010 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be 2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file. 3 * found in the LICENSE file.
4 * 4 *
5 * Utility for ChromeOS-specific GPT partitions, Please see corresponding .c 5 * Utility for ChromeOS-specific GPT partitions, Please see corresponding .c
6 * files for more details. 6 * files for more details.
7 */ 7 */
8 8
9 #include "cgpt.h" 9 #include "cgpt.h"
10 10
(...skipping 332 matching lines...) Expand 10 before | Expand all | Expand 10 after
343 le16toh(guid->u.Uuid.time_high_and_version), 343 le16toh(guid->u.Uuid.time_high_and_version),
344 guid->u.Uuid.clock_seq_high_and_reserved, 344 guid->u.Uuid.clock_seq_high_and_reserved,
345 guid->u.Uuid.clock_seq_low, 345 guid->u.Uuid.clock_seq_low,
346 guid->u.Uuid.node[0], guid->u.Uuid.node[1], 346 guid->u.Uuid.node[0], guid->u.Uuid.node[1],
347 guid->u.Uuid.node[2], guid->u.Uuid.node[3], 347 guid->u.Uuid.node[2], guid->u.Uuid.node[3],
348 guid->u.Uuid.node[4], guid->u.Uuid.node[5]) == GUID_STRLEN-1); 348 guid->u.Uuid.node[4], guid->u.Uuid.node[5]) == GUID_STRLEN-1);
349 } 349 }
350 350
351 /* Convert possibly unterminated UTF16 string to UTF8. 351 /* Convert possibly unterminated UTF16 string to UTF8.
352 * Caller must prepare enough space for UTF8, which could be up to 352 * Caller must prepare enough space for UTF8, which could be up to
353 * twice the number of UTF16 chars plus the terminating '\0'. 353 * twice the byte length of UTF16 string plus the terminating '\0'.
354 * FIXME(wfrichar): The original implementation had security issues. As a 354 * See the following table for encoding lengths.
355 * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542 355 *
356 * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix 356 * Code point UTF16 UTF8
357 * this. 357 * 0x0000-0x007F 2 bytes 1 byte
358 * 0x0080-0x07FF 2 bytes 2 bytes
359 * 0x0800-0xFFFF 2 bytes 3 bytes
360 * 0x10000-0x10FFFF 4 bytes 4 bytes
361 *
362 * This function uses a simple state machine to convert UTF-16 code unit(s)
363 * to a code point. Once a code point is parsed out, the state machine emits
364 * the corresponding UTF-8 bytes all at once.
365 *
366 * Return: CGPT_OK --- all characters are converted successfully.
367 * CGPT_FAILED --- conversion error, e.g. output buffer is too short.
358 */ 368 */
359 void UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput, 369 int UTF16ToUTF8(const uint16_t *utf16, unsigned int maxinput,
360 uint8_t *utf8, unsigned int maxoutput) 370 uint8_t *utf8, unsigned int maxoutput)
361 { 371 {
362 size_t s16idx, s8idx; 372 size_t s16idx, s8idx;
363 uint32_t utfchar; 373 uint32_t code_point;
374 int code_point_ready = 1; // code point is ready to output.
375 int retval = CGPT_OK;
364 376
365 if (!utf16 || !maxinput || !utf8 || !maxoutput) 377 if (!utf16 || !maxinput || !utf8 || !maxoutput)
366 return; 378 return CGPT_FAILED;
367 379
368 maxoutput--; /* plan for termination now */ 380 maxoutput--; /* plan for termination now */
369 381
370 for (s16idx = s8idx = 0; 382 for (s16idx = s8idx = 0;
371 s16idx < maxinput && utf16[s16idx] && maxoutput; 383 s16idx < maxinput && utf16[s16idx] && maxoutput;
372 s16idx++, maxoutput--) { 384 s16idx++) {
373 utfchar = le16toh(utf16[s16idx]); 385 uint16_t codeunit = le16toh(utf16[s16idx]);
374 utf8[s8idx++] = utfchar & 0x7F; 386
387 if (code_point_ready) {
388 if (codeunit >= 0xD800 && codeunit <= 0xDBFF) {
389 /* high surrogate, need the low surrogate. */
390 code_point_ready = 0;
391 code_point = (codeunit & 0x03FF) + 0x0040;
392 } else {
393 /* BMP char, output it. */
394 code_point = codeunit;
395 }
396 } else {
397 /* expect the low surrogate */
398 if (codeunit >= 0xDC00 && codeunit <= 0xDFFF) {
399 code_point = (code_point << 10) | (codeunit & 0x03FF);
400 code_point_ready = 1;
401 } else {
402 /* the second code unit is NOT the low surrogate. Unexpected. */
403 retval = CGPT_FAILED;
404 break;
405 }
406 }
407
408 /* If UTF code point is ready, output it. */
409 if (code_point_ready) {
410 require(code_point <= 0x10FFFF);
411 if (code_point <= 0x7F && maxoutput >= 1) {
412 maxoutput -= 1;
413 utf8[s8idx++] = code_point & 0x7F;
414 } else if (code_point <= 0x7FF && maxoutput >= 2) {
415 maxoutput -= 2;
416 utf8[s8idx++] = 0xC0 | (code_point >> 6);
417 utf8[s8idx++] = 0x80 | (code_point & 0x3F);
418 } else if (code_point <= 0xFFFF && maxoutput >= 3) {
419 maxoutput -= 3;
420 utf8[s8idx++] = 0xE0 | (code_point >> 12);
421 utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);
422 utf8[s8idx++] = 0x80 | (code_point & 0x3F);
423 } else if (code_point <= 0x10FFFF && maxoutput >= 4) {
424 maxoutput -= 4;
425 utf8[s8idx++] = 0xF0 | (code_point >> 18);
426 utf8[s8idx++] = 0x80 | ((code_point >> 12) & 0x3F);
427 utf8[s8idx++] = 0x80 | ((code_point >> 6) & 0x3F);
428 utf8[s8idx++] = 0x80 | (code_point & 0x3F);
429 } else {
430 /* output buffer too small for this code point */
431 retval = CGPT_FAILED;
432 break;
433 }
434 }
375 } 435 }
376 utf8[s8idx++] = 0; 436 utf8[s8idx++] = 0;
437 return retval;
377 } 438 }
378 439
379 /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated. 440 /* Convert UTF8 string to UTF16. The UTF8 string must be null-terminated.
380 * Caller must prepare enough space for UTF16, including a terminating 0x0000. 441 * Caller must prepare enough space for UTF16, including a terminating 0x0000.
381 * FIXME(wfrichar): The original implementation had security issues. As a 442 * See the following table for encoding lengths. In any case, the caller
382 * temporary fix, I'm making this ONLY support ASCII codepoints. Bug 7542 443 * just needs to prepare the byte length of UTF8 plus the terminating 0x0000.
383 * (http://code.google.com/p/chromium-os/issues/detail?id=7542) is filed to fix 444 *
384 * this. 445 * Code point UTF16 UTF8
446 * 0x0000-0x007F 2 bytes 1 byte
447 * 0x0080-0x07FF 2 bytes 2 bytes
448 * 0x0800-0xFFFF 2 bytes 3 bytes
449 * 0x10000-0x10FFFF 4 bytes 4 bytes
450 *
451 * This function first decodes UTF8 bytes into a code point, then converts it
452 * to UTF16 code unit(s).
453 *
454 * Return: CGPT_OK --- all characters are converted successfully.
455 * CGPT_FAILED --- conversion error, e.g. output buffer is too short.
385 */ 456 */
386 void UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput) 457 int UTF8ToUTF16(const uint8_t *utf8, uint16_t *utf16, unsigned int maxoutput)
387 { 458 {
388 size_t s16idx, s8idx; 459 size_t s16idx, s8idx;
389 uint32_t utfchar; 460 uint32_t code_point = 0;
461 unsigned int expected_units = 1;
462 unsigned int decoded_units = 1;
463 int retval = CGPT_OK;
390 464
391 if (!utf8 || !utf16 || !maxoutput) 465 if (!utf8 || !utf16 || !maxoutput)
392 return; 466 return CGPT_FAILED;
393 467
394 maxoutput--; /* plan for termination */ 468 maxoutput--; /* plan for termination */
395 469
396 for (s8idx = s16idx = 0; 470 for (s8idx = s16idx = 0;
397 utf8[s8idx] && maxoutput; 471 utf8[s8idx] && maxoutput;
398 s8idx++, maxoutput--) { 472 s8idx++) {
399 utfchar = utf8[s8idx]; 473 uint8_t code_unit;
400 utf16[s16idx++] = utfchar & 0x7F; 474 code_unit = utf8[s8idx];
475
476 if (expected_units != decoded_units) {
477 /* Trailing bytes of multi-byte character */
478 if ((code_unit & 0xC0) == 0x80) {
479 code_point = (code_point << 6) | (code_unit & 0x3F);
480 ++decoded_units;
481 } else {
482 /* Unexpected code unit. */
483 retval = CGPT_FAILED;
484 break;
485 }
486 } else {
487 /* parsing a new code point. */
488 decoded_units = 1;
489 if (code_unit <= 0x7F) {
490 code_point = code_unit;
491 expected_units = 1;
492 } else if (code_unit <= 0xBF) {
493 /* 0x80-0xBF must NOT be the heading byte unit of a new code point. */
494 retval = CGPT_FAILED;
495 break;
496 } else if (code_unit >= 0xC2 && code_unit <= 0xDF) {
497 code_point = code_unit & 0x1F;
498 expected_units = 2;
499 } else if (code_unit >= 0xE0 && code_unit <= 0xEF) {
500 code_point = code_unit & 0x0F;
501 expected_units = 3;
502 } else if (code_unit >= 0xF0 && code_unit <= 0xF4) {
503 code_point = code_unit & 0x07;
504 expected_units = 4;
505 } else {
506 /* illegal code unit: 0xC0-0xC1, 0xF5-0xFF */
507 retval = CGPT_FAILED;
508 break;
509 }
510 }
511
512 /* If no more unit is needed, output the UTF16 unit(s). */
513 if (expected_units == decoded_units) {
514 /* Check if the encoding is the shortest possible UTF-8 sequence. */
515 switch (expected_units) {
516 case 2:
517 if (code_point <= 0x7F) retval = CGPT_FAILED;
518 break;
519 case 3:
520 if (code_point <= 0x7FF) retval = CGPT_FAILED;
521 break;
522 case 4:
523 if (code_point <= 0xFFFF) retval = CGPT_FAILED;
524 break;
525 }
526 if (retval == CGPT_FAILED) break; /* leave immediately */
527
528 if ((code_point <= 0xD7FF) ||
529 (code_point >= 0xE000 && code_point <= 0xFFFF)) {
530 utf16[s16idx++] = code_point;
531 maxoutput -= 1;
532 } else if (code_point >= 0x10000 && code_point <= 0x10FFFF &&
533 maxoutput >= 2) {
534 utf16[s16idx++] = 0xD800 | ((code_point >> 10) - 0x0040);
535 utf16[s16idx++] = 0xDC00 | (code_point & 0x03FF);
536 maxoutput -= 2;
537 } else {
538 /* Three possibilities fall through to here. All are failure cases.
539 * a. surrogate code point (0xD800~0xDFFF), not a valid character
540 * b. invalid code point > 0x10FFFF
541 * c. output buffer too small
542 */
543 retval = CGPT_FAILED;
544 break;
545 }
546 }
401 } 547 }
548
549 /* A null-terminator shows up before the UTF8 sequence ends. */
550 if (expected_units != decoded_units) {
551 retval = CGPT_FAILED;
552 }
553
402 utf16[s16idx++] = 0; 554 utf16[s16idx++] = 0;
555 return retval;
403 } 556 }
404 557
405 struct { 558 struct {
406 Guid type; 559 Guid type;
407 char *name; 560 char *name;
408 char *description; 561 char *description;
409 } supported_types[] = { 562 } supported_types[] = {
410 {GPT_ENT_TYPE_CHROMEOS_KERNEL, "kernel", "ChromeOS kernel"}, 563 {GPT_ENT_TYPE_CHROMEOS_KERNEL, "kernel", "ChromeOS kernel"},
411 {GPT_ENT_TYPE_CHROMEOS_ROOTFS, "rootfs", "ChromeOS rootfs"}, 564 {GPT_ENT_TYPE_CHROMEOS_ROOTFS, "rootfs", "ChromeOS rootfs"},
412 {GPT_ENT_TYPE_LINUX_DATA, "data", "Linux data"}, 565 {GPT_ENT_TYPE_LINUX_DATA, "data", "Linux data"},
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after
694 847
695 void PMBRToStr(struct pmbr *pmbr, char *str, unsigned int buflen) { 848 void PMBRToStr(struct pmbr *pmbr, char *str, unsigned int buflen) {
696 char buf[GUID_STRLEN]; 849 char buf[GUID_STRLEN];
697 if (IsZero(&pmbr->boot_guid)) { 850 if (IsZero(&pmbr->boot_guid)) {
698 require(snprintf(str, buflen, "PMBR") < buflen); 851 require(snprintf(str, buflen, "PMBR") < buflen);
699 } else { 852 } else {
700 GuidToStr(&pmbr->boot_guid, buf, sizeof(buf)); 853 GuidToStr(&pmbr->boot_guid, buf, sizeof(buf));
701 require(snprintf(str, buflen, "PMBR (Boot GUID: %s)", buf) < buflen); 854 require(snprintf(str, buflen, "PMBR (Boot GUID: %s)", buf) < buflen);
702 } 855 }
703 } 856 }
OLDNEW
« no previous file with comments | « cgpt/cgpt.h ('k') | cgpt/cmd_add.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698