Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(131)

Unified Diff: tools/addlatexhash.dart

Issue 646003002: Introduced hash valued location markers in the spec (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« docs/language/dart.sty ('K') | « docs/language/dartLangSpec.tex ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: tools/addlatexhash.dart
diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart
new file mode 100644
index 0000000000000000000000000000000000000000..aa92504a56552366a77ed8a4b3ba472209a2a410
--- /dev/null
+++ b/tools/addlatexhash.dart
@@ -0,0 +1,508 @@
+// ----------------------------------------------------------------------
ricow1 2014/10/13 06:07:03 Add copyright header to this file
eernst 2014/10/13 08:03:27 Done.
+// File 'addlatexhash.dart'
ricow1 2014/10/13 06:07:03 We don't normally have this as a header, just leav
eernst 2014/10/13 08:03:27 Done.
+//
+// This is a very specialized tool which was created in order to support
+// adding hash values used as location markers in the LaTeX source of the
+// language specification. It is intended to be used as a filter from
+// the directory ../docs/language, in commands like the following:
+//
+// dart ../../tools/addlatexhash.dart < dartLangSpec.tex >tmp.tex
ricow1 2014/10/13 06:07:03 If I where you I would simple use command line par
eernst 2014/10/13 08:03:27 OK. But there are myriads of ways to define the k
Lasse Reichstein Nielsen 2014/10/15 09:13:17 I'd favor taking the input name as unnnamed argume
ricow1 2014/10/15 09:23:06 That is already the case, you are looking at the o
eernst 2014/10/15 12:01:10 Actually, using -o for the output and default to s
+//
+// This will yield a variant tmp.tex of the language specification with
+// hash values filled in. For more details, please check the language
+// specification source itself.
+
+import 'dart:io';
+import 'dart:convert';
+
+// ----------------------------------------------------------------------
+// Computation of SHA1 sums
+//
+// NB: To keep this script as independent of installation as possible,
+// this section was copied from crypto-0.9.0.tar.gz on Oct 9, 2014,
+// from https://pub.dartlang.org/packages/crypto, from the source files
+// crypto.dart, hash_utils.dart, and sha1.dart,
+// with the following copyright statement:
+//
+// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
+// for details. All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+/**
+ * Interface for cryptographic hash functions.
+ *
+ * The [add] method is used to add data to the hash. The [close] method
+ * is used to extract the message digest.
+ *
+ * Once the [close] method has been called no more data can be added using the
+ * [add] method. If [add] is called after the first call to [close] an
+ * error is thrown (the _HashBase implementation in this file throws a
+ * StateError).
+ *
+ * If multiple instances of a given Hash are needed the [newInstance]
+ * method can provide a new instance.
+ */
+// TODO(floitsch): make Hash implement Sink, EventSink or similar.
+abstract class Hash {
+ /**
+ * Add a list of bytes to the hash computation.
+ */
+ void add(List<int> data);
+
+ /**
+ * Finish the hash computation and extract the message digest as
+ * a list of bytes.
+ */
+ List<int> close();
+
+ /**
+ * Returns a new instance of this hash function.
+ */
+ Hash newInstance();
+
+ /**
+ * Internal block size of the hash in bytes.
+ *
+ * This is exposed for use by the HMAC class which needs to know the
+ * block size for the [Hash] it is using.
+ */
+ int get blockSize;
+}
+
+// Constants.
+// Bit masks used to truncate a value to its low 8 or low 32 bits.
+const _MASK_8 = 0xff;
+const _MASK_32 = 0xffffffff;
+// Size relations between bits, bytes and the 32-bit words the hashers use.
+const _BITS_PER_BYTE = 8;
+const _BYTES_PER_WORD = 4;
+
+// Helper functions used by more than one hasher.
+
+// Rotates the low 32 bits of [val] left by [shift] positions (taken
+// modulo 32); the result is always within the unsigned 32-bit range.
+int _rotl32(int val, int shift) {
+  final int count = shift & 31;
+  // Bits that stay inside the 32-bit window after shifting left.
+  final int shiftedUp = (val << count) & _MASK_32;
+  // Bits pushed out at the top, wrapped around to the bottom.
+  final int wrapped = (val & _MASK_32) >> (32 - count);
+  return shiftedUp | wrapped;
+}
+
+// Base class encapsulating common behavior for cryptographic hash
+// functions.
+//
+// A subclass passes the chunk size and digest size (both counted in
+// 32-bit words) and the byte order of words to the constructor, fills in
+// the initial values of [_h], and implements the per-chunk step
+// [_updateHash].
+abstract class _HashBase implements Hash {
+ _HashBase(int chunkSizeInWords,
+ int digestSizeInWords,
+ bool this._bigEndianWords)
+ : _pendingData = [],
+ _currentChunk = new List(chunkSizeInWords),
+ _h = new List(digestSizeInWords),
+ _chunkSizeInWords = chunkSizeInWords,
+ _digestSizeInWords = digestSizeInWords;
+
+ // Update the hasher with more data. Throws a StateError if [close] has
+ // already been called.
+ void add(List<int> data) {
+ if (_digestCalled) {
+ throw new StateError(
+ 'Hash update method called after digest was retrieved');
+ }
+ _lengthInBytes += data.length;
+ _pendingData.addAll(data);
+ _iterate();
+ }
+
+ // Finish the hash computation and return the digest as a list of bytes.
+ // Calling this again returns the same digest without further processing.
+ List<int> close() {
+ if (_digestCalled) {
+ return _resultAsBytes();
+ }
+ _digestCalled = true;
+ _finalizeData();
+ _iterate();
+ assert(_pendingData.length == 0);
+ return _resultAsBytes();
+ }
+
+ // Returns the block size of the hash in bytes.
+ int get blockSize {
+ return _chunkSizeInWords * _BYTES_PER_WORD;
+ }
+
+ // One round of the hash computation.
+ void _updateHash(List<int> m);
+
+ // Helper methods.
+ // Addition truncated to 32 bits.
+ int _add32(x, y) => (x + y) & _MASK_32;
+ // Rounds [val] up to a multiple of [n]; the bit trick assumes that [n]
+ // is a power of two.
+ int _roundUp(val, n) => (val + n - 1) & -n;
+
+ // Compute the final result as a list of bytes from the hash words.
+ List<int> _resultAsBytes() {
+ var result = [];
+ for (var i = 0; i < _h.length; i++) {
+ result.addAll(_wordToBytes(_h[i]));
+ }
+ return result;
+ }
+
+ // Converts a list of bytes to a chunk of 32-bit words, reading one full
+ // chunk from [data] starting at [dataIndex] into [_currentChunk].
+ void _bytesToChunk(List<int> data, int dataIndex) {
+ assert((data.length - dataIndex) >= (_chunkSizeInWords * _BYTES_PER_WORD));
+
+ for (var wordIndex = 0; wordIndex < _chunkSizeInWords; wordIndex++) {
+ // w3 is the most significant byte of the word, w0 the least.
+ var w3 = _bigEndianWords ? data[dataIndex] : data[dataIndex + 3];
+ var w2 = _bigEndianWords ? data[dataIndex + 1] : data[dataIndex + 2];
+ var w1 = _bigEndianWords ? data[dataIndex + 2] : data[dataIndex + 1];
+ var w0 = _bigEndianWords ? data[dataIndex + 3] : data[dataIndex];
+ dataIndex += 4;
+ var word = (w3 & _MASK_8) << 24;
+ word |= (w2 & _MASK_8) << 16;
+ word |= (w1 & _MASK_8) << 8;
+ word |= (w0 & _MASK_8);
+ _currentChunk[wordIndex] = word;
+ }
+ }
+
+ // Convert a 32-bit word to four bytes, in the byte order selected by
+ // [_bigEndianWords].
+ List<int> _wordToBytes(int word) {
+ List<int> bytes = new List(_BYTES_PER_WORD);
+ bytes[0] = (word >> (_bigEndianWords ? 24 : 0)) & _MASK_8;
+ bytes[1] = (word >> (_bigEndianWords ? 16 : 8)) & _MASK_8;
+ bytes[2] = (word >> (_bigEndianWords ? 8 : 16)) & _MASK_8;
+ bytes[3] = (word >> (_bigEndianWords ? 0 : 24)) & _MASK_8;
+ return bytes;
+ }
+
+ // Iterate through data updating the hash computation for each
+ // chunk. Bytes that do not fill a whole chunk stay in [_pendingData].
+ void _iterate() {
+ var len = _pendingData.length;
+ var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD;
+ if (len >= chunkSizeInBytes) {
+ var index = 0;
+ for (; (len - index) >= chunkSizeInBytes; index += chunkSizeInBytes) {
+ _bytesToChunk(_pendingData, index);
+ _updateHash(_currentChunk);
+ }
+ _pendingData = _pendingData.sublist(index, len);
+ }
+ }
+
+ // Finalize the data. Add a 1 bit to the end of the message. Expand with
+ // 0 bits and add the length of the message.
+ void _finalizeData() {
+ _pendingData.add(0x80);
+ // 9 = the 0x80 marker byte plus the 8 bytes holding the length.
+ var contentsLength = _lengthInBytes + 9;
+ var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD;
+ var finalizedLength = _roundUp(contentsLength, chunkSizeInBytes);
+ var zeroPadding = finalizedLength - contentsLength;
+ for (var i = 0; i < zeroPadding; i++) {
+ _pendingData.add(0);
+ }
+ var lengthInBits = _lengthInBytes * _BITS_PER_BYTE;
+ // Only the low 32 bits of the length are encoded below. The bound is
+ // expressed with _MASK_32 instead of pow(2, 32) because this file does
+ // not import dart:math, so 'pow' would be an unresolved name.
+ assert(lengthInBits <= _MASK_32);
+ if (_bigEndianWords) {
+ _pendingData.addAll(_wordToBytes(0));
+ _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32));
+ } else {
+ _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32));
+ _pendingData.addAll(_wordToBytes(0));
+ }
+ }
+
+ // Hasher state.
+ final int _chunkSizeInWords;
+ final int _digestSizeInWords;
+ final bool _bigEndianWords;
+ // Total number of bytes passed to [add] so far.
+ int _lengthInBytes = 0;
+ // Bytes received but not yet consumed as a complete chunk.
+ List<int> _pendingData;
+ // Scratch buffer holding the words of the chunk being processed.
+ final List<int> _currentChunk;
+ // Current hash value, one 32-bit word per entry.
+ final List<int> _h;
+ // Set once [close] has been called.
+ bool _digestCalled = false;
+}
+
+/**
+ * SHA1 hash function implementation.
+ *
+ * Configures [_HashBase] for chunks of 16 words, a digest of 5 words and
+ * big-endian word order, and supplies the per-chunk update step.
+ */
+class SHA1 extends _HashBase {
+ // Construct a SHA1 hasher object.
+ // _w is the 80-word work area used by _updateHash; _h receives the
+ // five initial hash words.
+ SHA1() : _w = new List(80), super(16, 5, true) {
+ _h[0] = 0x67452301;
+ _h[1] = 0xEFCDAB89;
+ _h[2] = 0x98BADCFE;
+ _h[3] = 0x10325476;
+ _h[4] = 0xC3D2E1F0;
+ }
+
+ // Returns a new instance of this Hash.
+ SHA1 newInstance() {
+ return new SHA1();
+ }
+
+ // Compute one iteration of the SHA1 algorithm with a chunk of
+ // 16 32-bit pieces.
+ void _updateHash(List<int> m) {
+ assert(m.length == 16);
+
+ // Working variables start out as the current hash value.
+ var a = _h[0];
+ var b = _h[1];
+ var c = _h[2];
+ var d = _h[3];
+ var e = _h[4];
+
+ for (var i = 0; i < 80; i++) {
+ // The first 16 work words are the chunk itself; each later word is
+ // derived from four earlier ones (xor, then rotate left by one).
+ if (i < 16) {
+ _w[i] = m[i];
+ } else {
+ var n = _w[i - 3] ^ _w[i - 8] ^ _w[i - 14] ^ _w[i - 16];
+ _w[i] = _rotl32(n, 1);
+ }
+ var t = _add32(_add32(_rotl32(a, 5), e), _w[i]);
+ // Four groups of 20 rounds, each mixing b, c and d with its own
+ // boolean function and adding its own constant.
+ if (i < 20) {
+ t = _add32(_add32(t, (b & c) | (~b & d)), 0x5A827999);
+ } else if (i < 40) {
+ t = _add32(_add32(t, (b ^ c ^ d)), 0x6ED9EBA1);
+ } else if (i < 60) {
+ t = _add32(_add32(t, (b & c) | (b & d) | (c & d)), 0x8F1BBCDC);
+ } else {
+ t = _add32(_add32(t, b ^ c ^ d), 0xCA62C1D6);
+ }
+
+ // Shift the working variables along for the next round.
+ e = d;
+ d = c;
+ c = _rotl32(b, 30);
+ b = a;
+ a = t & _MASK_32;
+ }
+
+ // Fold the result of this chunk into the running hash value.
+ _h[0] = _add32(a, _h[0]);
+ _h[1] = _add32(b, _h[1]);
+ _h[2] = _add32(c, _h[2]);
+ _h[3] = _add32(d, _h[3]);
+ _h[4] = _add32(e, _h[4]);
+ }
+
+ // Work area of 80 words, allocated once in the constructor and
+ // overwritten by every call to _updateHash.
+ List<int> _w;
+}
+
+// ----------------------------------------------------------------------
+// Normalization of the text, i.e., removal or normalization
+// of elements that do not affect the output from latex
+
+// regexps
+
+// Matches a line whose very first character is '%'.
+var comment_all_re = new RegExp("^%");
ricow1 2014/10/13 06:07:04 camel case variables, these could be const? (in wh
eernst 2014/10/13 08:03:26 Now using camel case, but 'const RegExp(..)' is no
eernst 2014/10/13 08:03:30 CamelCasing done. But it can't be const (can't use
+// A '%' preceded by any character other than a backslash, together with
+// everything after it up to (not including) the newline. The preceding
+// character is part of the match.
+var comment_re = new RegExp("[^\\\\]%[^\\n]*");
+// A line that consists of whitespace only.
+var whitespace_all_re = new RegExp("^\\s+\$");
+// Leading whitespace plus the first following character that is not a
+// newline.
+var whitespace_leading_re = new RegExp("^\\s+[^\\n]");
+// A run of two or more spaces and/or tabs.
+var whitespace_re = new RegExp("[ \\t][ \\t]+");
+
+// normalization steps
+
+// Returns [line] with the region covered by [match] cut out and [glue]
+// put in its place. [start_offset] and [end_offset] are added to the
+// start and end of the match before cutting. A null [match] leaves
+// [line] unchanged.
+cut_match(line, match, {start_offset:0, end_offset:0, glue:""}) {
ricow1 2014/10/13 06:07:03 Camel case method names, here and below
eernst 2014/10/13 08:03:30 Done.
+ if (match == null) return line;
+ var start = match.start + start_offset;
+ var end = match.end + end_offset;
+ var len = line.length;
+ // Clamp only the lower bound of start and the upper bound of end.
+ if (start < 0) start=0;
ricow1 2014/10/13 06:07:03 space around =
eernst 2014/10/13 08:03:29 Done.
+ if (end > len) end=len;
ricow1 2014/10/13 06:07:04 space around =
eernst 2014/10/13 08:03:29 Done.
+ return line.substring(0,start) + glue + line.substring(end);
+}
+
+// Same as cut_match, applied to the first match of [re] in [line]; the
+// line is returned unchanged when [re] does not match.
+cut_regexp(line, re, {start_offset:0, end_offset:0, glue:""}) {
Lasse Reichstein Nielsen 2014/10/15 09:13:17 Is this function used?
Lasse Reichstein Nielsen 2014/10/15 09:13:58 Is this comment still here? Yes it is. Should it b
+ return cut_match(line, re.firstMatch(line),
+ start_offset: start_offset,
+ end_offset: end_offset,
+ glue: glue);
+}
+
+// Returns the part of [line] before position match.start + [offset],
+// followed by [glue]; everything from that position on is dropped.
+// A null [match] leaves [line] unchanged.
+cut_from_match(line, match, {offset:0, glue:""}) {
+ if (match == null) return line;
+ return line.substring(0,match.start+offset) + glue;
ricow1 2014/10/13 06:07:03 space around +
eernst 2014/10/13 08:03:29 Done.
+}
+
+// Truncates [line] at the first match of [re] (shifted by [offset]) and
+// appends [glue]; see cut_from_match for the details.
+cut_from_regexp(line, re, {offset:0, glue:""}) {
+  var firstHit = re.firstMatch(line);
+  return cut_from_match(line, firstHit, offset: offset, glue: glue);
+}
+
+// True when [line] consists of whitespace only.
+is_ws_only(line) => whitespace_all_re.hasMatch(line);
+// True when [line] starts with '%'.
+is_comment_only(line) => comment_all_re.hasMatch(line);
Lasse Reichstein Nielsen 2014/10/15 09:13:17 This would be the non-regexp version: => line.star
eernst 2014/10/15 12:01:10 Done. Used to have a slightly more general commen
+
+// Reduces [line] to its line terminator: "\n" when the last character is
+// a newline, "" otherwise. An empty line is returned as is.
+just_eol(line) {
+ if (line.length == 0) return line;
+ return line[line.length-1] == '\n'? "\n" : "";
ricow1 2014/10/13 06:07:03 space before ?
eernst 2014/10/13 08:03:29 Done.
+}
+
+strip_comment(line) {
+  // Only the *content* of a comment is removed; the '%' itself and the
+  // end of the line are kept. Removing the whole comment, newline
+  // included, looks tempting, but TeX would then add back a character
+  // that depends on its state (S, M, or N), and mirroring that state
+  // faithfully is tricky. Leaving the '%' at the end of the line lets
+  // TeX manage its states exactly as it did for the unmodified file.
+  if (is_comment_only(line)) {
+    return "%\n";
+  }
+  // comment_re also matches the character in front of the '%', so moving
+  // the start of the cut two positions forward keeps both that character
+  // and the '%'.
+  var commentMatch = comment_re.firstMatch(line);
+  return cut_match(line, commentMatch, start_offset: 2);
+}
+
+// reduce a ws_only line to its eol, remove leading ws
+// entirely, and reduce multiple ws chars to one
+normalize_whitespace(line) {
+ if (is_ws_only(line)) return just_eol(line);
+ // whitespace_leading_re also matches the first non-ws character;
+ // end_offset:-1 keeps that character out of the cut.
+ line = cut_regexp(line, whitespace_leading_re, end_offset:-1);
+ var match;
+ // Replace each run of two or more spaces/tabs by a single space until
+ // no such run is left.
+ while ((match = whitespace_re.firstMatch(line)) != null)
+ line = cut_match(line, match, glue:" ");
ricow1 2014/10/13 06:07:03 always encapsulate loop and conditional blocks in
eernst 2014/10/13 08:03:26 Done.
eernst 2014/10/13 08:03:30 Done.
+ return line;
+}
+
+// reduce sequences of >1 ws_only lines to 1, and
ricow1 2014/10/13 06:07:03 reduce -> Reduce
eernst 2014/10/13 08:03:25 Done.
eernst 2014/10/13 08:03:29 Done.
+// and sequences of >1 comment_only lines to 1
ricow1 2014/10/13 06:07:03 and and -> and
eernst 2014/10/13 08:03:30 Done.
+// Takes the single-line-normalized [lines] and returns a new list in which
+// only the first line of each run of ws_only lines is kept and
+// comment_only lines are not copied to the result. The argument itself
+// is not modified. Throws a StateError if the internal state ever becomes
+// inconsistent (which should be impossible).
+multiline_normalize(lines) {
+ var oldlines = lines;
+ var after_blank_lines = false; // does 'line' succeed >0 empty lines?
+ var after_comment_lines = false; // .. succeed >0 comment_only lines?
+ lines = new List();
+ for (var line in oldlines) {
+ if (after_blank_lines && after_comment_lines) {
+ // can never happen
+ // (StateError is used because the Error constructor takes no message)
+ throw new StateError("Bug, please report");
ricow1 2014/10/13 06:07:04 I would do Bug, please report to eernst@
eernst 2014/10/13 08:03:29 Done.
+ }
ricow1 2014/10/13 06:07:04 move else if up on this line
eernst 2014/10/15 12:01:10 Done.
+ else if (after_blank_lines && !after_comment_lines) {
+ // at least one line before 'line' is ws_only
+ if (!is_ws_only(line)) {
+ // blank line block ended
+ after_comment_lines = is_comment_only(line);
+ // special case: it seems to be safe to remove comment_only lines
+ // after ws_only lines, so the TeX state must be predictably right;
+ // next line will then be after_comment_lines and be dropped, so
+ // we drop the entire comment block---which is very useful; we can
+ // also consider this comment line to be an empty line, such that
+ // subsequent empty lines can be considered to be in a block of
+ // empty lines; note that almost all variants of this will break..
+ if (after_comment_lines) {
+ // _current_ 'line' a comment_only here
+ after_blank_lines = true;
+ after_comment_lines = false;
+ // and do not add 'line'
+ }
ricow1 2014/10/13 06:07:03 move else up here
eernst 2014/10/13 08:03:29 Done.
+ else {
+ // after blanks, but current 'line' is neither blank nor comment
+ after_blank_lines = false;
+ lines.add(line);
+ }
+ }
+ else {
+ // blank line block continues, do not add 'line'
+ }
+ }
+ else if (!after_blank_lines && after_comment_lines) {
+ // at least one line before 'line' is comment_only
+ if (!is_comment_only(line)) {
+ // comment line block ended
+ after_blank_lines = is_ws_only(line);
+ after_comment_lines = false;
+ lines.add(line);
+ }
+ else {
+ // comment line block continues, do not add 'line'
+ }
+ }
+ else /* !after_blank_lines && !after_comment_lines */ {
+ // no ws_only or comment_only lines precede 'line'
+ if (is_ws_only(line))
+ after_blank_lines = true;
+ if (is_comment_only(line))
+ after_comment_lines = true;
+ if (!after_comment_lines)
+ lines.add(line);
+ else {
+ // skipping comment_only line after non_ws, non_comment text
+ }
+ }
+ }
+ return lines;
+}
+
+// select the elements in the pipeline
+
+// Full single-line pipeline: strip the comment first, then normalize the
+// whitespace of what is left.
+normalize(line) {
+  var withoutComment = strip_comment(line);
+  return normalize_whitespace(withoutComment);
+}
+
+// Pipeline for lines with significant spacing: comments are stripped but
+// whitespace is left untouched.
+sisp_normalize(line) {
+  return strip_comment(line);
+}
+
+// testing
+
ricow1 2014/10/13 06:07:03 you should add a real test that imports this file
eernst 2014/10/13 08:03:30 Need a bit more input on how to do this.
+// Runs cut_match on [line] with the first match of the pattern [re],
+// logs input and outcome to stdout, and reports whether the outcome
+// equals [expected].
+one_test_cut_match(line,re,expected) {
+  stdout.write("cut_match: ${line} --[${re}]--> ");
+  var pattern = new RegExp(re);
+  var actual = cut_match(line, pattern.firstMatch(line));
+  stdout.write(actual + "\n");
+  return expected == actual;
+}
+
+// Runs the cut_match checks in order, stopping at the first failure, and
+// prints "OK" if all of them passed, "ERROR" otherwise.
+test_cut_match() {
+  var allPassed =
+      one_test_cut_match("test","e","tst") &&
+      one_test_cut_match("test","te","st") &&
+      one_test_cut_match("test","st","te") &&
+      one_test_cut_match("test","","test") &&
+      one_test_cut_match("test","test","");
+  if (allPassed) {
+    print("OK");
+  } else {
+    print("ERROR");
+  }
+}
+
+// ----------------------------------------------------------------------
+// Managing fragments with significant spacing
+
+// Match a line that starts, after optional whitespace, with
+// \begin{dartCode} or \end{dartCode} respectively.
+final dart_code_begin_re = new RegExp("^\\s*\\\\begin{dartCode}");
+final dart_code_end_re = new RegExp ("^\\s*\\\\end{dartCode}");
+
+// True when [target_re] matches somewhere in [line].
+sisp_is(line, target_re) => target_re.hasMatch(line);
+
+// True when [line] opens a dartCode environment.
+sisp_is_dart_begin(line) {
+  return dart_code_begin_re.firstMatch(line) != null;
+}
+// True when [line] closes a dartCode environment.
+sisp_is_dart_end(line) {
+  return dart_code_end_re.firstMatch(line) != null;
+}
+
+// testing
+
+// Applies [sisp_fun] to [line], logs the line and the expected outcome to
+// stdout, and reports whether the actual outcome equals [expectation].
+one_test_sisp(sisp_fun, line, expectation) {
+  var actual = sisp_fun(line);
+  stdout.write("sisp_is_dart_*: ${line}: ${expectation}\n");
+  return actual == expectation;
+}
+
+// Runs the dartCode begin/end recognizer checks in order, stopping at the
+// first failure, and prints "OK" if all passed, "ERROR" otherwise.
+test_sisp() {
+  var allPassed =
+      one_test_sisp(sisp_is_dart_begin,"\\begin{dartCode}\n", true) &&
+      one_test_sisp(sisp_is_dart_begin," \\begin{dartCode}\n", true) &&
+      one_test_sisp(sisp_is_dart_begin,"whatever else ..", false) &&
+      one_test_sisp(sisp_is_dart_end,"\\end{dartCode}", true) &&
+      one_test_sisp(sisp_is_dart_end," \\end{dartCode}\t \n", true) &&
+      one_test_sisp(sisp_is_dart_end,"whatever else ..", false);
+  if (allPassed) {
+    print("OK");
+  } else {
+    print("ERROR");
+  }
+}
+
+// ----------------------------------------------------------------------
+// io
+
+// Reads the next line from stdin, keeping its line terminator and
+// decoding the input as ASCII.
+// NOTE(review): main stops reading when this returns null, so
+// readLineSync presumably yields null at end of input -- confirm against
+// the dart:io documentation.
+rl() => stdin.readLineSync(
ricow1 2014/10/13 06:07:03 we don't normally shorten names, i.e., rl should b
eernst 2014/10/13 08:03:30 Done.
+ retainNewlines: true,
+ encoding: const AsciiCodec());
+
+// ----------------------------------------------------------------------
+// main
+
+// Entry point. Reads lines from stdin until rl() returns null and
+// normalizes each one: lines from a \begin{dartCode} line up to, but not
+// including, the next \end{dartCode} line only get their comments
+// stripped (sisp_normalize); all other lines go through normalize. The
+// collected lines then pass through multiline_normalize and are written
+// to stdout.
+main () {
ricow1 2014/10/13 06:07:03 As stated in the top this becomes much easier if y
+ var lines = new List(), line;
ricow1 2014/10/13 06:07:03 I would do the definition of line on a separate li
eernst 2014/10/13 08:03:30 Done.
+
+ // single-line normalization
+ var in_dart_code = false;
+ while ((line = rl()) != null) {
+ if (sisp_is_dart_begin(line))
+ in_dart_code = true;
ricow1 2014/10/13 06:07:03 block in {}
eernst 2014/10/13 08:03:28 Already changed this to single line as a result of
eernst 2014/10/13 08:03:29 Changed this to single line already when you descr
+ else if (sisp_is_dart_end(line))
+ in_dart_code = false;
ricow1 2014/10/13 06:07:03 block in {}
eernst 2014/10/13 08:03:27 Same situation as l.496.
+ if (in_dart_code) lines.add(sisp_normalize(line));
ricow1 2014/10/13 06:07:04 always use {} blocks when you have anything but si
eernst 2014/10/13 08:03:30 Ah, so you're saying that the presence of 'else' (
+ else lines.add(normalize(line));
+ }
+
+ // multi-line normalization
+ lines = multiline_normalize(lines);
+
+ // output result
+ for (var line in lines) stdout.write(line);
+}
« docs/language/dart.sty ('K') | « docs/language/dartLangSpec.tex ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698