Index: tools/addlatexhash.dart |
diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart |
new file mode 100644 |
index 0000000000000000000000000000000000000000..aa92504a56552366a77ed8a4b3ba472209a2a410 |
--- /dev/null |
+++ b/tools/addlatexhash.dart |
@@ -0,0 +1,508 @@ |
+// ---------------------------------------------------------------------- |
ricow1
2014/10/13 06:07:03
Add copyright header to this file
eernst
2014/10/13 08:03:27
Done.
|
+// File 'addlatexhash.dart' |
ricow1
2014/10/13 06:07:03
We don't normally have this as a header, just leav
eernst
2014/10/13 08:03:27
Done.
|
+// |
+// This is a very specialized tool, created to support adding hash values |
+// used as location markers in the LaTeX source of the language |
+// specification. It is intended to be run as a filter from the directory |
+// ../docs/language, in commands like the following: |
+// |
+// dart ../../tools/addlatexhash.dart < dartLangSpec.tex >tmp.tex |
ricow1
2014/10/13 06:07:03
If I were you I would simply use command line par
eernst
2014/10/13 08:03:27
OK. But there are myriads of ways to define the k
Lasse Reichstein Nielsen
2014/10/15 09:13:17
I'd favor taking the input name as unnamed argume
ricow1
2014/10/15 09:23:06
That is already the case, you are looking at the o
eernst
2014/10/15 12:01:10
Actually, using -o for the output and default to s
|
+// |
+// This will yield a variant tmp.tex of the language specification with |
+// hash values filled in. For more details, please check the language |
+// specification source itself. |
+ |
+import 'dart:io'; |
+import 'dart:convert'; |
+import 'dart:math';  // for pow(), used in _HashBase._finalizeData |
+ |
+// ---------------------------------------------------------------------- |
+// Computation of SHA1 sums |
+// |
+// NB: To keep this script as independent of installation as possible, |
+// this section was copied from crypto-0.9.0.tar.gz on Oct 9, 2014, |
+// from https://pub.dartlang.org/packages/crypto, from the source files |
+// crypto.dart, hash_utils.dart, and sha1.dart, |
+// with the following copyright statement: |
+// |
+// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
+// for details. All rights reserved. Use of this source code is governed by a |
+// BSD-style license that can be found in the LICENSE file. |
+ |
+/** |
+ * Interface for cryptographic hash functions. |
+ * |
+ * The [add] method is used to add data to the hash. The [close] method |
+ * is used to extract the message digest. |
+ * |
+ * Once the [close] method has been called no more data can be added using the |
+ * [add] method. If [add] is called after the first call to [close], a |
+ * [StateError] is thrown. |
+ * |
+ * If multiple instances of a given Hash are needed, the [newInstance] |
+ * method can provide a new instance. |
+ */ |
+// TODO(floitsch): make Hash implement Sink, EventSink or similar. |
+abstract class Hash { |
+ /** |
+ * Add a list of bytes to the hash computation. |
+ */ |
+ void add(List<int> data); |
+ |
+ /** |
+ * Finish the hash computation and extract the message digest as |
+ * a list of bytes. |
+ */ |
+ List<int> close(); |
+ |
+ /** |
+ * Returns a new instance of this hash function. |
+ */ |
+ Hash newInstance(); |
+ |
+ /** |
+ * Internal block size of the hash in bytes. |
+ * |
+ * This is exposed for use by the HMAC class which needs to know the |
+ * block size for the [Hash] it is using. |
+ */ |
+ int get blockSize; |
+} |
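As a reading aid for this copied interface, here is a minimal usage sketch; the byte list is a hypothetical input, and SHA1 is the concrete implementation defined further down in this file:

    var hasher = new SHA1();
    hasher.add([0x61, 0x62, 0x63]);    // feed data incrementally ('abc' in ASCII)
    List<int> digest = hasher.close(); // 20-byte SHA-1 digest (5 words x 4 bytes)
    // Calling add() again after close() throws a StateError (see _HashBase.add).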
+ |
+// Constants. |
+const _MASK_8 = 0xff; |
+const _MASK_32 = 0xffffffff; |
+const _BITS_PER_BYTE = 8; |
+const _BYTES_PER_WORD = 4; |
+ |
+// Helper functions used by more than one hasher. |
+ |
+// Rotate left limiting to unsigned 32-bit values. |
+int _rotl32(int val, int shift) { |
+ var mod_shift = shift & 31; |
+ return ((val << mod_shift) & _MASK_32) | |
+ ((val & _MASK_32) >> (32 - mod_shift)); |
+} |
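A small worked example of the masked rotation (hypothetical operands):

    // (0x80000001 << 1) & 0xffffffff == 0x00000002
    // (0x80000001 & 0xffffffff) >> 31 == 0x00000001
    assert(_rotl32(0x80000001, 1) == 0x00000003);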
+ |
+// Base class encapsulating common behavior for cryptographic hash |
+// functions. |
+abstract class _HashBase implements Hash { |
+ _HashBase(int chunkSizeInWords, |
+ int digestSizeInWords, |
+ bool this._bigEndianWords) |
+ : _pendingData = [], |
+ _currentChunk = new List(chunkSizeInWords), |
+ _h = new List(digestSizeInWords), |
+ _chunkSizeInWords = chunkSizeInWords, |
+ _digestSizeInWords = digestSizeInWords; |
+ |
+ // Update the hasher with more data. |
+ void add(List<int> data) { |
+ if (_digestCalled) { |
+ throw new StateError( |
+ 'Hash update method called after digest was retrieved'); |
+ } |
+ _lengthInBytes += data.length; |
+ _pendingData.addAll(data); |
+ _iterate(); |
+ } |
+ |
+ // Finish the hash computation and return the digest string. |
+ List<int> close() { |
+ if (_digestCalled) { |
+ return _resultAsBytes(); |
+ } |
+ _digestCalled = true; |
+ _finalizeData(); |
+ _iterate(); |
+ assert(_pendingData.length == 0); |
+ return _resultAsBytes(); |
+ } |
+ |
+ // Returns the block size of the hash in bytes. |
+ int get blockSize { |
+ return _chunkSizeInWords * _BYTES_PER_WORD; |
+ } |
+ |
+ // One round of the hash computation. |
+ void _updateHash(List<int> m); |
+ |
+ // Helper methods. |
+ int _add32(x, y) => (x + y) & _MASK_32; |
+ int _roundUp(val, n) => (val + n - 1) & -n; |
+ |
+ // Compute the final result as a list of bytes from the hash words. |
+ List<int> _resultAsBytes() { |
+ var result = []; |
+ for (var i = 0; i < _h.length; i++) { |
+ result.addAll(_wordToBytes(_h[i])); |
+ } |
+ return result; |
+ } |
+ |
+ // Converts a list of bytes to a chunk of 32-bit words. |
+ void _bytesToChunk(List<int> data, int dataIndex) { |
+ assert((data.length - dataIndex) >= (_chunkSizeInWords * _BYTES_PER_WORD)); |
+ |
+ for (var wordIndex = 0; wordIndex < _chunkSizeInWords; wordIndex++) { |
+ var w3 = _bigEndianWords ? data[dataIndex] : data[dataIndex + 3]; |
+ var w2 = _bigEndianWords ? data[dataIndex + 1] : data[dataIndex + 2]; |
+ var w1 = _bigEndianWords ? data[dataIndex + 2] : data[dataIndex + 1]; |
+ var w0 = _bigEndianWords ? data[dataIndex + 3] : data[dataIndex]; |
+ dataIndex += 4; |
+ var word = (w3 & 0xff) << 24; |
+ word |= (w2 & _MASK_8) << 16; |
+ word |= (w1 & _MASK_8) << 8; |
+ word |= (w0 & _MASK_8); |
+ _currentChunk[wordIndex] = word; |
+ } |
+ } |
+ |
+ // Convert a 32-bit word to four bytes. |
+ List<int> _wordToBytes(int word) { |
+ List<int> bytes = new List(_BYTES_PER_WORD); |
+ bytes[0] = (word >> (_bigEndianWords ? 24 : 0)) & _MASK_8; |
+ bytes[1] = (word >> (_bigEndianWords ? 16 : 8)) & _MASK_8; |
+ bytes[2] = (word >> (_bigEndianWords ? 8 : 16)) & _MASK_8; |
+ bytes[3] = (word >> (_bigEndianWords ? 0 : 24)) & _MASK_8; |
+ return bytes; |
+ } |
+ |
+ // Iterate through data updating the hash computation for each |
+ // chunk. |
+ void _iterate() { |
+ var len = _pendingData.length; |
+ var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD; |
+ if (len >= chunkSizeInBytes) { |
+ var index = 0; |
+ for (; (len - index) >= chunkSizeInBytes; index += chunkSizeInBytes) { |
+ _bytesToChunk(_pendingData, index); |
+ _updateHash(_currentChunk); |
+ } |
+ _pendingData = _pendingData.sublist(index, len); |
+ } |
+ } |
+ |
+ // Finalize the data. Add a 1 bit to the end of the message. Expand with |
+ // 0 bits and add the length of the message. |
+ void _finalizeData() { |
+ _pendingData.add(0x80); |
+ var contentsLength = _lengthInBytes + 9; |
+ var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD; |
+ var finalizedLength = _roundUp(contentsLength, chunkSizeInBytes); |
+ var zeroPadding = finalizedLength - contentsLength; |
+ for (var i = 0; i < zeroPadding; i++) { |
+ _pendingData.add(0); |
+ } |
+ var lengthInBits = _lengthInBytes * _BITS_PER_BYTE; |
+ assert(lengthInBits < pow(2, 32)); |
+ if (_bigEndianWords) { |
+ _pendingData.addAll(_wordToBytes(0)); |
+ _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32)); |
+ } else { |
+ _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32)); |
+ _pendingData.addAll(_wordToBytes(0)); |
+ } |
+ } |
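To make the padding arithmetic concrete, consider a hypothetical 3-byte message with the 64-byte SHA-1 chunk size used below:

    // contentsLength  = 3 + 9 = 12   (one 0x80 byte plus an 8-byte length field)
    // finalizedLength = _roundUp(12, 64) = (12 + 63) & -64 = 64
    // zeroPadding     = 64 - 12 = 52 zero bytes
    // The final 8 bytes hold the bit length (24) as two 32-bit words,
    // high word first, since SHA-1 uses big-endian words.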
+ |
+ // Hasher state. |
+ final int _chunkSizeInWords; |
+ final int _digestSizeInWords; |
+ final bool _bigEndianWords; |
+ int _lengthInBytes = 0; |
+ List<int> _pendingData; |
+ final List<int> _currentChunk; |
+ final List<int> _h; |
+ bool _digestCalled = false; |
+} |
+ |
+/** |
+ * SHA1 hash function implementation. |
+ */ |
+class SHA1 extends _HashBase { |
+ // Construct a SHA1 hasher object. |
+ SHA1() : _w = new List(80), super(16, 5, true) { |
+ _h[0] = 0x67452301; |
+ _h[1] = 0xEFCDAB89; |
+ _h[2] = 0x98BADCFE; |
+ _h[3] = 0x10325476; |
+ _h[4] = 0xC3D2E1F0; |
+ } |
+ |
+ // Returns a new instance of this Hash. |
+ SHA1 newInstance() { |
+ return new SHA1(); |
+ } |
+ |
+ // Compute one iteration of the SHA1 algorithm with a chunk of |
+ // 16 32-bit pieces. |
+ void _updateHash(List<int> m) { |
+ assert(m.length == 16); |
+ |
+ var a = _h[0]; |
+ var b = _h[1]; |
+ var c = _h[2]; |
+ var d = _h[3]; |
+ var e = _h[4]; |
+ |
+ for (var i = 0; i < 80; i++) { |
+ if (i < 16) { |
+ _w[i] = m[i]; |
+ } else { |
+ var n = _w[i - 3] ^ _w[i - 8] ^ _w[i - 14] ^ _w[i - 16]; |
+ _w[i] = _rotl32(n, 1); |
+ } |
+ var t = _add32(_add32(_rotl32(a, 5), e), _w[i]); |
+ if (i < 20) { |
+ t = _add32(_add32(t, (b & c) | (~b & d)), 0x5A827999); |
+ } else if (i < 40) { |
+ t = _add32(_add32(t, (b ^ c ^ d)), 0x6ED9EBA1); |
+ } else if (i < 60) { |
+ t = _add32(_add32(t, (b & c) | (b & d) | (c & d)), 0x8F1BBCDC); |
+ } else { |
+ t = _add32(_add32(t, b ^ c ^ d), 0xCA62C1D6); |
+ } |
+ |
+ e = d; |
+ d = c; |
+ c = _rotl32(b, 30); |
+ b = a; |
+ a = t & _MASK_32; |
+ } |
+ |
+ _h[0] = _add32(a, _h[0]); |
+ _h[1] = _add32(b, _h[1]); |
+ _h[2] = _add32(c, _h[2]); |
+ _h[3] = _add32(d, _h[3]); |
+ _h[4] = _add32(e, _h[4]); |
+ } |
+ |
+ List<int> _w; |
+} |
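A quick sanity check for the copied implementation is the standard FIPS 180-1 'abc' test vector; the sketch below assumes the SHA1 class above, and the hex conversion is an ad hoc helper, not part of this file:

    checkSha1() {
      var digest = (new SHA1()..add("abc".codeUnits)).close();
      var hex = digest.map((b) => b.toRadixString(16).padLeft(2, '0')).join();
      print(hex);  // expected: a9993e364706816aba3e25717850c26c9cd0d89d
    }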
+ |
+// ---------------------------------------------------------------------- |
+// Normalization of the text, i.e., removal or normalization |
+// of elements that do not affect the output from latex |
+ |
+// regexps |
+ |
+var comment_all_re = new RegExp("^%"); |
ricow1
2014/10/13 06:07:04
camel case variables, these could be const? (in wh
eernst
2014/10/13 08:03:26
Now using camel case, but 'const RegExp(..)' is no
eernst
2014/10/13 08:03:30
CamelCasing done.
But it can't be const (can't use
|
+var comment_re = new RegExp("[^\\\\]%[^\\n]*"); |
+var whitespace_all_re = new RegExp("^\\s+\$"); |
+var whitespace_leading_re = new RegExp("^\\s+[^\\n]"); |
+var whitespace_re = new RegExp("[ \\t][ \\t]+"); |
+ |
+// normalization steps |
+ |
+cut_match(line, match, {start_offset:0, end_offset:0, glue:""}) { |
ricow1
2014/10/13 06:07:03
Camel case method names, here and below
eernst
2014/10/13 08:03:30
Done.
|
+ if (match == null) return line; |
+ var start = match.start + start_offset; |
+ var end = match.end + end_offset; |
+ var len = line.length; |
+ if (start < 0) start=0; |
ricow1
2014/10/13 06:07:03
space around =
eernst
2014/10/13 08:03:29
Done.
|
+ if (end > len) end=len; |
ricow1
2014/10/13 06:07:04
space around =
eernst
2014/10/13 08:03:29
Done.
|
+ return line.substring(0,start) + glue + line.substring(end); |
+} |
+ |
+cut_regexp(line, re, {start_offset:0, end_offset:0, glue:""}) { |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Is this function used?
Lasse Reichstein Nielsen
2014/10/15 09:13:58
Is this comment still here? Yes it is. Should it b
|
+ return cut_match(line, re.firstMatch(line), |
+ start_offset: start_offset, |
+ end_offset: end_offset, |
+ glue: glue); |
+} |
+ |
+cut_from_match(line, match, {offset:0, glue:""}) { |
+ if (match == null) return line; |
+ return line.substring(0,match.start+offset) + glue; |
ricow1
2014/10/13 06:07:03
space around +
eernst
2014/10/13 08:03:29
Done.
|
+} |
+ |
+cut_from_regexp(line, re, {offset:0, glue:""}) { |
+ return cut_from_match(line, re.firstMatch(line), offset:offset, glue:glue); |
+} |
+ |
+is_ws_only(line) => whitespace_all_re.firstMatch(line) != null; |
+is_comment_only(line) => comment_all_re.firstMatch(line) != null; |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
This would be the non-regexp version: => line.star
eernst
2014/10/15 12:01:10
Done.
Used to have a slightly more general commen
|
+ |
+just_eol(line) { |
+ if (line.length == 0) return line; |
+ return line[line.length-1] == '\n'? "\n" : ""; |
ricow1
2014/10/13 06:07:03
space before ?
eernst
2014/10/13 08:03:29
Done.
|
+} |
+ |
+strip_comment(line) { |
+ // NB: it is tempting to remove everything from the '%' and out, |
+ // including the final newline, if any, but this does not work. |
+ // The problem is that TeX will do exactly this, but then it will |
+ // add back a character that depends on its state (S, M, or N), |
+ // and it is tricky to maintain a similar state that matches the |
+ // state of TeX faithfully. Hence, we remove the content of |
+ // comments but not the comments themselves: we just leave the '%' at |
+ // the end of the line and let TeX manage its states exactly as it |
+ // would for the file before strip_comment was applied. |
+ if (is_comment_only(line)) return "%\n"; |
+ return cut_regexp(line, comment_re, start_offset:2); |
+} |
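Two concrete cases of the behavior described above (hypothetical input lines):

    // strip_comment("foo % bar\n")     == "foo %\n"  -- comment text removed,
    //                                                    '%' and newline kept
    // strip_comment("% just a note\n") == "%\n"      -- comment-only line reduced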
+ |
+// reduce a ws_only line to its eol, remove leading ws |
+// entirely, and reduce multiple ws chars to one |
+normalize_whitespace(line) { |
+ if (is_ws_only(line)) return just_eol(line); |
+ line = cut_regexp(line, whitespace_leading_re, end_offset:-1); |
+ var match; |
+ while ((match = whitespace_re.firstMatch(line)) != null) |
+ line = cut_match(line, match, glue:" "); |
ricow1
2014/10/13 06:07:03
always encapsulate loop and conditional blocks in
eernst
2014/10/13 08:03:26
Done.
eernst
2014/10/13 08:03:30
Done.
|
+ return line; |
+} |
+ |
+// reduce sequences of >1 ws_only lines to 1, and |
ricow1
2014/10/13 06:07:03
reduce -> Reduce
eernst
2014/10/13 08:03:25
Done.
eernst
2014/10/13 08:03:29
Done.
|
+// and sequences of >1 comment_only lines to 1 |
ricow1
2014/10/13 06:07:03
and and -> and
eernst
2014/10/13 08:03:30
Done.
|
+multiline_normalize(lines) { |
+ var oldlines = lines; |
+ var after_blank_lines = false; // does 'line' succeed >0 empty lines? |
+ var after_comment_lines = false; // .. succeed >0 comment_only lines? |
+ lines = new List(); |
+ for (var line in oldlines) { |
+ if (after_blank_lines && after_comment_lines) { |
+ // can never happen |
+ throw new StateError("Bug, please report"); |
ricow1
2014/10/13 06:07:04
I would do Bug, please report to eernst@
eernst
2014/10/13 08:03:29
Done.
|
+ } |
ricow1
2014/10/13 06:07:04
move else if up on this line
eernst
2014/10/15 12:01:10
Done.
|
+ else if (after_blank_lines && !after_comment_lines) { |
+ // at least one line before 'line' is ws_only |
+ if (!is_ws_only(line)) { |
+ // blank line block ended |
+ after_comment_lines = is_comment_only(line); |
+ // special case: it seems to be safe to remove comment_only lines |
+ // after ws_only lines, so the TeX state must be predictably right; |
+ // next line will then be after_comment_lines and be dropped, so |
+ // we drop the entire comment block---which is very useful; we can |
+ // also consider this comment line to be an empty line, such that |
+ // subsequent empty lines can be considered to be in a block of |
+ // empty lines; note that almost all variants of this will break.. |
+ if (after_comment_lines) { |
+ // _current_ 'line' a comment_only here |
+ after_blank_lines = true; |
+ after_comment_lines = false; |
+ // and do not add 'line' |
+ } |
ricow1
2014/10/13 06:07:03
move else up here
eernst
2014/10/13 08:03:29
Done.
|
+ else { |
+ // after blanks, but current 'line' is neither blank nor comment |
+ after_blank_lines = false; |
+ lines.add(line); |
+ } |
+ } |
+ else { |
+ // blank line block continues, do not add 'line' |
+ } |
+ } |
+ else if (!after_blank_lines && after_comment_lines) { |
+ // at least one line before 'line' is comment_only |
+ if (!is_comment_only(line)) { |
+ // comment line block ended |
+ after_blank_lines = is_ws_only(line); |
+ after_comment_lines = false; |
+ lines.add(line); |
+ } |
+ else { |
+ // comment line block continues, do not add 'line' |
+ } |
+ } |
+ else /* !after_blank_lines && !after_comment_lines */ { |
+ // no ws_only or comment_only lines precede 'line' |
+ if (is_ws_only(line)) |
+ after_blank_lines = true; |
+ if (is_comment_only(line)) |
+ after_comment_lines = true; |
+ if (!after_comment_lines) |
+ lines.add(line); |
+ else { |
+ // skipping comment_only line after non_ws, non_comment text |
+ } |
+ } |
+ } |
+ return lines; |
+} |
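An illustration of the blank-line collapsing on a hypothetical, already line-normalized input:

    // multiline_normalize(["x\n", "\n", "\n", "\n", "y\n"]) == ["x\n", "\n", "y\n"]
    // The first ws_only line is kept; the rest of the blank block is dropped.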
+ |
+// select the elements in the pipeline |
+ |
+normalize(line) => normalize_whitespace(strip_comment(line)); |
+ |
+sisp_normalize(line) => strip_comment(line); |
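Putting the single-line steps together on a hypothetical line with leading whitespace, repeated blanks, and a comment:

    // normalize("  x   y % note\n")
    //   == normalize_whitespace(strip_comment("  x   y % note\n"))
    //   == normalize_whitespace("  x   y %\n")
    //   == "x y %\n"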
+ |
+// testing |
+ |
ricow1
2014/10/13 06:07:03
you should add a real test that imports this file
eernst
2014/10/13 08:03:30
Need a bit more input on how to do this.
|
+one_test_cut_match(line,re,expected) { |
+ stdout.write("cut_match: ${line} --[${re}]--> "); |
+ var result = cut_match(line,new RegExp(re).firstMatch(line)); |
+ stdout.write(result+"\n"); |
+ return expected == result; |
+} |
+ |
+test_cut_match() { |
+ one_test_cut_match("test","e","tst") && |
+ one_test_cut_match("test","te","st") && |
+ one_test_cut_match("test","st","te") && |
+ one_test_cut_match("test","","test") && |
+ one_test_cut_match("test","test","") |
+ ? print("OK") : print("ERROR"); |
+} |
+ |
+// ---------------------------------------------------------------------- |
+// Managing fragments with significant spacing |
+ |
+final dart_code_begin_re = new RegExp("^\\s*\\\\begin{dartCode}"); |
+final dart_code_end_re = new RegExp("^\\s*\\\\end{dartCode}"); |
+ |
+sisp_is(line, target_re) { |
+ return target_re.firstMatch(line) != null; |
+} |
+ |
+sisp_is_dart_begin(line) => sisp_is(line, dart_code_begin_re); |
+sisp_is_dart_end(line) => sisp_is(line, dart_code_end_re); |
+ |
+// testing |
+ |
+one_test_sisp(sisp_fun, line, expectation) { |
+ var result = sisp_fun(line) == expectation; |
+ stdout.write("sisp_is_dart_*: ${line}: ${expectation}\n"); |
+ return result; |
+} |
+ |
+test_sisp() { |
+ one_test_sisp(sisp_is_dart_begin,"\\begin{dartCode}\n", true) && |
+ one_test_sisp(sisp_is_dart_begin," \\begin{dartCode}\n", true) && |
+ one_test_sisp(sisp_is_dart_begin,"whatever else ..", false) && |
+ one_test_sisp(sisp_is_dart_end,"\\end{dartCode}", true) && |
+ one_test_sisp(sisp_is_dart_end," \\end{dartCode}\t \n", true) && |
+ one_test_sisp(sisp_is_dart_end,"whatever else ..", false) |
+ ? print("OK") : print("ERROR"); |
+} |
+ |
+// ---------------------------------------------------------------------- |
+// io |
+ |
+rl() => stdin.readLineSync( |
ricow1
2014/10/13 06:07:03
we don't normally shorten names, i.e., rl should b
eernst
2014/10/13 08:03:30
Done.
|
+ retainNewlines: true, |
+ encoding: const AsciiCodec()); |
+ |
+// ---------------------------------------------------------------------- |
+// main |
+ |
+main () { |
ricow1
2014/10/13 06:07:03
As stated in the top this becomes much easier if y
|
+ var lines = new List(), line; |
ricow1
2014/10/13 06:07:03
I would do the definition of line on a separate li
eernst
2014/10/13 08:03:30
Done.
|
+ |
+ // single-line normalization |
+ var in_dart_code = false; |
+ while ((line = rl()) != null) { |
+ if (sisp_is_dart_begin(line)) |
+ in_dart_code = true; |
ricow1
2014/10/13 06:07:03
block in {}
eernst
2014/10/13 08:03:28
Already changed this to single line as a result of
eernst
2014/10/13 08:03:29
Changed this to single line already when you descr
|
+ else if (sisp_is_dart_end(line)) |
+ in_dart_code = false; |
ricow1
2014/10/13 06:07:03
block in {}
eernst
2014/10/13 08:03:27
Same situation as l.496.
|
+ if (in_dart_code) lines.add(sisp_normalize(line)); |
ricow1
2014/10/13 06:07:04
always use {} blocks when you have anything but si
eernst
2014/10/13 08:03:30
Ah, so you're saying that the presence of 'else' (
|
+ else lines.add(normalize(line)); |
+ } |
+ |
+ // multi-line normalization |
+ lines = multiline_normalize(lines); |
+ |
+ // output result |
+ for (var line in lines) stdout.write(line); |
+} |