Index: tools/addlatexhash.dart |
diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart |
new file mode 100644 |
index 0000000000000000000000000000000000000000..aa92504a56552366a77ed8a4b3ba472209a2a410 |
--- /dev/null |
+++ b/tools/addlatexhash.dart |
@@ -0,0 +1,508 @@ |
+// ---------------------------------------------------------------------- |
ricow1
2014/10/13 06:07:03
Add copyright header to this file
eernst
2014/10/13 08:03:27
Done.
|
+// File 'addlatexhash.dart' |
ricow1
2014/10/13 06:07:03
We don't normally have this as a header, just leav
eernst
2014/10/13 08:03:27
Done.
|
+// |
+// This is a very specialized tool, created to support adding hash values |
+// used as location markers in the LaTeX source of the language |
+// specification. It is intended to be run as a filter from the directory |
+// ../docs/language, in commands like the following: |
+// |
+// dart ../../tools/addlatexhash.dart < dartLangSpec.tex >tmp.tex |
ricow1
2014/10/13 06:07:03
If I were you I would simply use command line par
eernst
2014/10/13 08:03:27
OK. But there are myriads of ways to define the k
Lasse Reichstein Nielsen
2014/10/15 09:13:17
I'd favor taking the input name as unnamed argume
ricow1
2014/10/15 09:23:06
That is already the case, you are looking at the o
eernst
2014/10/15 12:01:10
Actually, using -o for the output and default to s
|
+// |
+// This will yield a variant tmp.tex of the language specification with |
+// hash values filled in. For more details, please check the language |
+// specification source itself. |
+ |
+import 'dart:io'; |
+import 'dart:convert'; |
+import 'dart:math';  // for pow(), used in _HashBase._finalizeData |
+ |
+// ---------------------------------------------------------------------- |
+// Computation of SHA1 sums |
+// |
+// NB: To keep this script as independent of installation as possible, |
+// this section was copied from crypto-0.9.0.tar.gz on Oct 9, 2014, |
+// from https://pub.dartlang.org/packages/crypto, from the source files |
+// crypto.dart, hash_utils.dart, and sha1.dart, |
+// with the following copyright statement: |
+// |
+// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
+// for details. All rights reserved. Use of this source code is governed by a |
+// BSD-style license that can be found in the LICENSE file. |
+ |
+/** |
+ * Interface for cryptographic hash functions. |
+ * |
+ * The [add] method is used to add data to the hash. The [close] method |
+ * is used to extract the message digest. |
+ * |
+ * Once the [close] method has been called no more data can be added using the |
+ * [add] method. If [add] is called after the first call to [close], a |
+ * [StateError] is thrown. |
+ * |
+ * If multiple instances of a given Hash are needed, the [newInstance] |
+ * method can provide a new instance. |
+ */ |
+// TODO(floitsch): make Hash implement Sink, EventSink or similar. |
+abstract class Hash { |
+ /** |
+ * Add a list of bytes to the hash computation. |
+ */ |
+ void add(List<int> data); |
+ |
+ /** |
+ * Finish the hash computation and extract the message digest as |
+ * a list of bytes. |
+ */ |
+ List<int> close(); |
+ |
+ /** |
+ * Returns a new instance of this hash function. |
+ */ |
+ Hash newInstance(); |
+ |
+ /** |
+ * Internal block size of the hash in bytes. |
+ * |
+ * This is exposed for use by the HMAC class which needs to know the |
+ * block size for the [Hash] it is using. |
+ */ |
+ int get blockSize; |
+} |
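As a reading aid for this copied interface, here is a minimal usage sketch; the byte list is a hypothetical input, and SHA1 is the concrete implementation defined further down in this file:

    var hasher = new SHA1();
    hasher.add([0x61, 0x62, 0x63]);    // feed data incrementally ('abc' in ASCII)
    List<int> digest = hasher.close(); // 20-byte SHA-1 digest (5 words x 4 bytes)
    // Calling add() again after close() throws a StateError (see _HashBase.add).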
+ |
+// Constants. |
+const _MASK_8 = 0xff; |
+const _MASK_32 = 0xffffffff; |
+const _BITS_PER_BYTE = 8; |
+const _BYTES_PER_WORD = 4; |
+ |
+// Helper functions used by more than one hasher. |
+ |
+// Rotate left limiting to unsigned 32-bit values. |
+int _rotl32(int val, int shift) { |
+ var mod_shift = shift & 31; |
+ return ((val << mod_shift) & _MASK_32) | |
+ ((val & _MASK_32) >> (32 - mod_shift)); |
+} |
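A small worked example of the masked rotation (hypothetical operands):

    // (0x80000001 << 1) & 0xffffffff == 0x00000002
    // (0x80000001 & 0xffffffff) >> 31 == 0x00000001
    assert(_rotl32(0x80000001, 1) == 0x00000003);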
+ |
+// Base class encapsulating common behavior for cryptographic hash |
+// functions. |
+abstract class _HashBase implements Hash { |
+ _HashBase(int chunkSizeInWords, |
+ int digestSizeInWords, |
+ bool this._bigEndianWords) |
+ : _pendingData = [], |
+ _currentChunk = new List(chunkSizeInWords), |
+ _h = new List(digestSizeInWords), |
+ _chunkSizeInWords = chunkSizeInWords, |
+ _digestSizeInWords = digestSizeInWords; |
+ |
+ // Update the hasher with more data. |
+ void add(List<int> data) { |
+ if (_digestCalled) { |
+ throw new StateError( |
+ 'Hash update method called after digest was retrieved'); |
+ } |
+ _lengthInBytes += data.length; |
+ _pendingData.addAll(data); |
+ _iterate(); |
+ } |
+ |
+ // Finish the hash computation and return the digest string. |
+ List<int> close() { |
+ if (_digestCalled) { |
+ return _resultAsBytes(); |
+ } |
+ _digestCalled = true; |
+ _finalizeData(); |
+ _iterate(); |
+ assert(_pendingData.length == 0); |
+ return _resultAsBytes(); |
+ } |
+ |
+ // Returns the block size of the hash in bytes. |
+ int get blockSize { |
+ return _chunkSizeInWords * _BYTES_PER_WORD; |
+ } |
+ |
+ // One round of the hash computation. |
+ void _updateHash(List<int> m); |
+ |
+ // Helper methods. |
+ int _add32(x, y) => (x + y) & _MASK_32; |
+ int _roundUp(val, n) => (val + n - 1) & -n; |
+ |
+ // Compute the final result as a list of bytes from the hash words. |
+ List<int> _resultAsBytes() { |
+ var result = []; |
+ for (var i = 0; i < _h.length; i++) { |
+ result.addAll(_wordToBytes(_h[i])); |
+ } |
+ return result; |
+ } |
+ |
+ // Converts a list of bytes to a chunk of 32-bit words. |
+ void _bytesToChunk(List<int> data, int dataIndex) { |
+ assert((data.length - dataIndex) >= (_chunkSizeInWords * _BYTES_PER_WORD)); |
+ |
+ for (var wordIndex = 0; wordIndex < _chunkSizeInWords; wordIndex++) { |
+ var w3 = _bigEndianWords ? data[dataIndex] : data[dataIndex + 3]; |
+ var w2 = _bigEndianWords ? data[dataIndex + 1] : data[dataIndex + 2]; |
+ var w1 = _bigEndianWords ? data[dataIndex + 2] : data[dataIndex + 1]; |
+ var w0 = _bigEndianWords ? data[dataIndex + 3] : data[dataIndex]; |
+ dataIndex += 4; |
+ var word = (w3 & 0xff) << 24; |
+ word |= (w2 & _MASK_8) << 16; |
+ word |= (w1 & _MASK_8) << 8; |
+ word |= (w0 & _MASK_8); |
+ _currentChunk[wordIndex] = word; |
+ } |
+ } |
+ |
+ // Convert a 32-bit word to four bytes. |
+ List<int> _wordToBytes(int word) { |
+ List<int> bytes = new List(_BYTES_PER_WORD); |
+ bytes[0] = (word >> (_bigEndianWords ? 24 : 0)) & _MASK_8; |
+ bytes[1] = (word >> (_bigEndianWords ? 16 : 8)) & _MASK_8; |
+ bytes[2] = (word >> (_bigEndianWords ? 8 : 16)) & _MASK_8; |
+ bytes[3] = (word >> (_bigEndianWords ? 0 : 24)) & _MASK_8; |
+ return bytes; |
+ } |
+ |
+ // Iterate through data updating the hash computation for each |
+ // chunk. |
+ void _iterate() { |
+ var len = _pendingData.length; |
+ var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD; |
+ if (len >= chunkSizeInBytes) { |
+ var index = 0; |
+ for (; (len - index) >= chunkSizeInBytes; index += chunkSizeInBytes) { |
+ _bytesToChunk(_pendingData, index); |
+ _updateHash(_currentChunk); |
+ } |
+ _pendingData = _pendingData.sublist(index, len); |
+ } |
+ } |
+ |
+ // Finalize the data. Add a 1 bit to the end of the message. Expand with |
+ // 0 bits and add the length of the message. |
+ void _finalizeData() { |
+ _pendingData.add(0x80); |
+ var contentsLength = _lengthInBytes + 9; |
+ var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD; |
+ var finalizedLength = _roundUp(contentsLength, chunkSizeInBytes); |
+ var zeroPadding = finalizedLength - contentsLength; |
+ for (var i = 0; i < zeroPadding; i++) { |
+ _pendingData.add(0); |
+ } |
+ var lengthInBits = _lengthInBytes * _BITS_PER_BYTE; |
+ assert(lengthInBits < pow(2, 32)); |
+ if (_bigEndianWords) { |
+ _pendingData.addAll(_wordToBytes(0)); |
+ _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32)); |
+ } else { |
+ _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32)); |
+ _pendingData.addAll(_wordToBytes(0)); |
+ } |
+ } |
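To make the padding arithmetic concrete, consider a hypothetical 3-byte message with the 64-byte SHA-1 chunk size used below:

    // contentsLength  = 3 + 9 = 12   (one 0x80 byte plus an 8-byte length field)
    // finalizedLength = _roundUp(12, 64) = (12 + 63) & -64 = 64
    // zeroPadding     = 64 - 12 = 52 zero bytes
    // The final 8 bytes hold the bit length (24) as two 32-bit words,
    // high word first, since SHA-1 uses big-endian words.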
+ |
+ // Hasher state. |
+ final int _chunkSizeInWords; |
+ final int _digestSizeInWords; |
+ final bool _bigEndianWords; |
+ int _lengthInBytes = 0; |
+ List<int> _pendingData; |
+ final List<int> _currentChunk; |
+ final List<int> _h; |
+ bool _digestCalled = false; |
+} |
+ |
+/** |
+ * SHA1 hash function implementation. |
+ */ |
+class SHA1 extends _HashBase { |
+ // Construct a SHA1 hasher object. |
+ SHA1() : _w = new List(80), super(16, 5, true) { |
+ _h[0] = 0x67452301; |
+ _h[1] = 0xEFCDAB89; |
+ _h[2] = 0x98BADCFE; |
+ _h[3] = 0x10325476; |
+ _h[4] = 0xC3D2E1F0; |
+ } |
+ |
+ // Returns a new instance of this Hash. |
+ SHA1 newInstance() { |
+ return new SHA1(); |
+ } |
+ |
+ // Compute one iteration of the SHA1 algorithm with a chunk of |
+ // 16 32-bit pieces. |
+ void _updateHash(List<int> m) { |
+ assert(m.length == 16); |
+ |
+ var a = _h[0]; |
+ var b = _h[1]; |
+ var c = _h[2]; |
+ var d = _h[3]; |
+ var e = _h[4]; |
+ |
+ for (var i = 0; i < 80; i++) { |
+ if (i < 16) { |
+ _w[i] = m[i]; |
+ } else { |
+ var n = _w[i - 3] ^ _w[i - 8] ^ _w[i - 14] ^ _w[i - 16]; |
+ _w[i] = _rotl32(n, 1); |
+ } |
+ var t = _add32(_add32(_rotl32(a, 5), e), _w[i]); |
+ if (i < 20) { |
+ t = _add32(_add32(t, (b & c) | (~b & d)), 0x5A827999); |
+ } else if (i < 40) { |
+ t = _add32(_add32(t, (b ^ c ^ d)), 0x6ED9EBA1); |
+ } else if (i < 60) { |
+ t = _add32(_add32(t, (b & c) | (b & d) | (c & d)), 0x8F1BBCDC); |
+ } else { |
+ t = _add32(_add32(t, b ^ c ^ d), 0xCA62C1D6); |
+ } |
+ |
+ e = d; |
+ d = c; |
+ c = _rotl32(b, 30); |
+ b = a; |
+ a = t & _MASK_32; |
+ } |
+ |
+ _h[0] = _add32(a, _h[0]); |
+ _h[1] = _add32(b, _h[1]); |
+ _h[2] = _add32(c, _h[2]); |
+ _h[3] = _add32(d, _h[3]); |
+ _h[4] = _add32(e, _h[4]); |
+ } |
+ |
+ List<int> _w; |
+} |
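A quick sanity check for the copied implementation is the standard FIPS 180-1 'abc' test vector; the sketch below assumes the SHA1 class above, and the hex conversion is an ad hoc helper, not part of this file:

    checkSha1() {
      var digest = (new SHA1()..add("abc".codeUnits)).close();
      var hex = digest.map((b) => b.toRadixString(16).padLeft(2, '0')).join();
      print(hex);  // expected: a9993e364706816aba3e25717850c26c9cd0d89d
    }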
+ |
+// ---------------------------------------------------------------------- |
+// Normalization of the text, i.e., removal or normalization |
+// of elements that do not affect the output from latex |
+ |
+// regexps |
+ |
+var comment_all_re = new RegExp("^%"); |
ricow1
2014/10/13 06:07:04
camel case variables, these could be const? (in wh
eernst
2014/10/13 08:03:26
Now using camel case, but 'const RegExp(..)' is no
eernst
2014/10/13 08:03:30
CamelCasing done.
But it can't be const (can't use
|
+var comment_re = new RegExp("[^\\\\]%[^\\n]*"); |
+var whitespace_all_re = new RegExp("^\\s+\$"); |
+var whitespace_leading_re = new RegExp("^\\s+[^\\n]"); |
+var whitespace_re = new RegExp("[ \\t][ \\t]+"); |
+ |
+// normalization steps |
+ |
+cut_match(line, match, {start_offset:0, end_offset:0, glue:""}) { |
ricow1
2014/10/13 06:07:03
Camel case method names, here and below
eernst
2014/10/13 08:03:30
Done.
|
+ if (match == null) return line; |
+ var start = match.start + start_offset; |
+ var end = match.end + end_offset; |
+ var len = line.length; |
+ if (start < 0) start=0; |
ricow1
2014/10/13 06:07:03
space around =
eernst
2014/10/13 08:03:29
Done.
|
+ if (end > len) end=len; |
ricow1
2014/10/13 06:07:04
space around =
eernst
2014/10/13 08:03:29
Done.
|
+ return line.substring(0,start) + glue + line.substring(end); |
+} |
+ |
+cut_regexp(line, re, {start_offset:0, end_offset:0, glue:""}) { |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Is this function used?
Lasse Reichstein Nielsen
2014/10/15 09:13:58
Is this comment still here? Yes it is. Should it b
|
+ return cut_match(line, re.firstMatch(line), |
+ start_offset: start_offset, |
+ end_offset: end_offset, |
+ glue: glue); |
+} |
+ |
+cut_from_match(line, match, {offset:0, glue:""}) { |
+ if (match == null) return line; |
+ return line.substring(0,match.start+offset) + glue; |
ricow1
2014/10/13 06:07:03
space around +
eernst
2014/10/13 08:03:29
Done.
|
+} |
+ |
+cut_from_regexp(line, re, {offset:0, glue:""}) { |
+ return cut_from_match(line, re.firstMatch(line), offset:offset, glue:glue); |
+} |
+ |
+is_ws_only(line) => whitespace_all_re.firstMatch(line) != null; |
+is_comment_only(line) => comment_all_re.firstMatch(line) != null; |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
This would be the non-regexp version: => line.star
eernst
2014/10/15 12:01:10
Done.
Used to have a slightly more general commen
|
+ |
+just_eol(line) { |
+ if (line.length == 0) return line; |
+ return line[line.length-1] == '\n'? "\n" : ""; |
ricow1
2014/10/13 06:07:03
space before ?
eernst
2014/10/13 08:03:29
Done.
|
+} |
+ |
+strip_comment(line) { |
+ // NB: it is tempting to remove everything from the '%' and out, |
+ // including the final newline, if any, but this does not work. |
+ // The problem is that TeX will do exactly this, but then it will |
+ // add back a character that depends on its state (S, M, or N), |
+ // and it is tricky to maintain a similar state that matches the |
+ // state of TeX faithfully. Hence, we remove the content of |
+ // comments but not the comments themselves: we just leave the '%' at |
+ // the end of the line and let TeX manage its states exactly as it |
+ // would for the file before strip_comment was applied. |
+ if (is_comment_only(line)) return "%\n"; |
+ return cut_regexp(line, comment_re, start_offset:2); |
+} |
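Two concrete cases of the behavior described above (hypothetical input lines):

    // strip_comment("foo % bar\n")     == "foo %\n"  -- comment text removed,
    //                                                    '%' and newline kept
    // strip_comment("% just a note\n") == "%\n"      -- comment-only line reduced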
+ |
+// reduce a ws_only line to its eol, remove leading ws |
+// entirely, and reduce multiple ws chars to one |
+normalize_whitespace(line) { |
+ if (is_ws_only(line)) return just_eol(line); |
+ line = cut_regexp(line, whitespace_leading_re, end_offset:-1); |
+ var match; |
+ while ((match = whitespace_re.firstMatch(line)) != null) |
+ line = cut_match(line, match, glue:" "); |
ricow1
2014/10/13 06:07:03
always encapsulate loop and conditional blocks in
eernst
2014/10/13 08:03:26
Done.
eernst
2014/10/13 08:03:30
Done.
|
+ return line; |
+} |
+ |
+// reduce sequences of >1 ws_only lines to 1, and |
ricow1
2014/10/13 06:07:03
reduce -> Reduce
eernst
2014/10/13 08:03:25
Done.
eernst
2014/10/13 08:03:29
Done.
|
+// and sequences of >1 comment_only lines to 1 |
ricow1
2014/10/13 06:07:03
and and -> and
eernst
2014/10/13 08:03:30
Done.
|
+multiline_normalize(lines) { |
+ var oldlines = lines; |
+ var after_blank_lines = false; // does 'line' succeed >0 empty lines? |
+ var after_comment_lines = false; // .. succeed >0 comment_only lines? |
+ lines = new List(); |
+ for (var line in oldlines) { |
+ if (after_blank_lines && after_comment_lines) { |
+ // can never happen |
+ throw new StateError("Bug, please report"); |
ricow1
2014/10/13 06:07:04
I would do Bug, please report to eernst@
eernst
2014/10/13 08:03:29
Done.
|
+ } |
ricow1
2014/10/13 06:07:04
move else if up on this line
eernst
2014/10/15 12:01:10
Done.
|
+ else if (after_blank_lines && !after_comment_lines) { |
+ // at least one line before 'line' is ws_only |
+ if (!is_ws_only(line)) { |
+ // blank line block ended |
+ after_comment_lines = is_comment_only(line); |
+ // special case: it seems to be safe to remove comment_only lines |
+ // after ws_only lines, so the TeX state must be predictably right; |
+ // next line will then be after_comment_lines and be dropped, so |
+ // we drop the entire comment block---which is very useful; we can |
+ // also consider this comment line to be an empty line, such that |
+ // subsequent empty lines can be considered to be in a block of |
+ // empty lines; note that almost all variants of this will break.. |
+ if (after_comment_lines) { |
+ // _current_ 'line' a comment_only here |
+ after_blank_lines = true; |
+ after_comment_lines = false; |
+ // and do not add 'line' |
+ } |
ricow1
2014/10/13 06:07:03
move else up here
eernst
2014/10/13 08:03:29
Done.
|
+ else { |
+ // after blanks, but current 'line' is neither blank nor comment |
+ after_blank_lines = false; |
+ lines.add(line); |
+ } |
+ } |
+ else { |
+ // blank line block continues, do not add 'line' |
+ } |
+ } |
+ else if (!after_blank_lines && after_comment_lines) { |
+ // at least one line before 'line' is comment_only |
+ if (!is_comment_only(line)) { |
+ // comment line block ended |
+ after_blank_lines = is_ws_only(line); |
+ after_comment_lines = false; |
+ lines.add(line); |
+ } |
+ else { |
+ // comment line block continues, do not add 'line' |
+ } |
+ } |
+ else /* !after_blank_lines && !after_comment_lines */ { |
+ // no ws_only or comment_only lines precede 'line' |
+ if (is_ws_only(line)) |
+ after_blank_lines = true; |
+ if (is_comment_only(line)) |
+ after_comment_lines = true; |
+ if (!after_comment_lines) |
+ lines.add(line); |
+ else { |
+ // skipping comment_only line after non_ws, non_comment text |
+ } |
+ } |
+ } |
+ return lines; |
+} |
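An illustration of the blank-line collapsing on a hypothetical, already line-normalized input:

    // multiline_normalize(["x\n", "\n", "\n", "\n", "y\n"]) == ["x\n", "\n", "y\n"]
    // The first ws_only line is kept; the rest of the blank block is dropped.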
+ |
+// select the elements in the pipeline |
+ |
+normalize(line) => normalize_whitespace(strip_comment(line)); |
+ |
+sisp_normalize(line) => strip_comment(line); |
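Putting the single-line steps together on a hypothetical line with leading whitespace, repeated blanks, and a comment:

    // normalize("  x   y % note\n")
    //   == normalize_whitespace(strip_comment("  x   y % note\n"))
    //   == normalize_whitespace("  x   y %\n")
    //   == "x y %\n"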
+ |
+// testing |
+ |
ricow1
2014/10/13 06:07:03
you should add a real test that imports this file
eernst
2014/10/13 08:03:30
Need a bit more input on how to do this.
|
+one_test_cut_match(line,re,expected) { |
+ stdout.write("cut_match: ${line} --[${re}]--> "); |
+ var result = cut_match(line,new RegExp(re).firstMatch(line)); |
+ stdout.write(result+"\n"); |
+ return expected == result; |
+} |
+ |
+test_cut_match() { |
+ one_test_cut_match("test","e","tst") && |
+ one_test_cut_match("test","te","st") && |
+ one_test_cut_match("test","st","te") && |
+ one_test_cut_match("test","","test") && |
+ one_test_cut_match("test","test","") |
+ ? print("OK") : print("ERROR"); |
+} |
+ |
+// ---------------------------------------------------------------------- |
+// Managing fragments with significant spacing |
+ |
+final dart_code_begin_re = new RegExp("^\\s*\\\\begin{dartCode}"); |
+final dart_code_end_re = new RegExp("^\\s*\\\\end{dartCode}"); |
+ |
+sisp_is(line, target_re) { |
+ return target_re.firstMatch(line) != null; |
+} |
+ |
+sisp_is_dart_begin(line) => sisp_is(line, dart_code_begin_re); |
+sisp_is_dart_end(line) => sisp_is(line, dart_code_end_re); |
+ |
+// testing |
+ |
+one_test_sisp(sisp_fun, line, expectation) { |
+ var result = sisp_fun(line) == expectation; |
+ stdout.write("sisp_is_dart_*: ${line}: ${expectation}\n"); |
+ return result; |
+} |
+ |
+test_sisp() { |
+ one_test_sisp(sisp_is_dart_begin,"\\begin{dartCode}\n", true) && |
+ one_test_sisp(sisp_is_dart_begin," \\begin{dartCode}\n", true) && |
+ one_test_sisp(sisp_is_dart_begin,"whatever else ..", false) && |
+ one_test_sisp(sisp_is_dart_end,"\\end{dartCode}", true) && |
+ one_test_sisp(sisp_is_dart_end," \\end{dartCode}\t \n", true) && |
+ one_test_sisp(sisp_is_dart_end,"whatever else ..", false) |
+ ? print("OK") : print("ERROR"); |
+} |
+ |
+// ---------------------------------------------------------------------- |
+// io |
+ |
+rl() => stdin.readLineSync( |
ricow1
2014/10/13 06:07:03
we don't normally shorten names, i.e., rl should b
eernst
2014/10/13 08:03:30
Done.
|
+ retainNewlines: true, |
+ encoding: const AsciiCodec()); |
+ |
+// ---------------------------------------------------------------------- |
+// main |
+ |
+main () { |
ricow1
2014/10/13 06:07:03
As stated in the top this becomes much easier if y
|
+ var lines = new List(), line; |
ricow1
2014/10/13 06:07:03
I would do the definition of line on a separate li
eernst
2014/10/13 08:03:30
Done.
|
+ |
+ // single-line normalization |
+ var in_dart_code = false; |
+ while ((line = rl()) != null) { |
+ if (sisp_is_dart_begin(line)) |
+ in_dart_code = true; |
ricow1
2014/10/13 06:07:03
block in {}
eernst
2014/10/13 08:03:28
Already changed this to single line as a result of
eernst
2014/10/13 08:03:29
Changed this to single line already when you descr
|
+ else if (sisp_is_dart_end(line)) |
+ in_dart_code = false; |
ricow1
2014/10/13 06:07:03
block in {}
eernst
2014/10/13 08:03:27
Same situation as l.496.
|
+ if (in_dart_code) lines.add(sisp_normalize(line)); |
ricow1
2014/10/13 06:07:04
always use {} blocks when you have anything but si
eernst
2014/10/13 08:03:30
Ah, so you're saying that the presence of 'else' (
|
+ else lines.add(normalize(line)); |
+ } |
+ |
+ // multi-line normalization |
+ lines = multiline_normalize(lines); |
+ |
+ // output result |
+ for (var line in lines) stdout.write(line); |
+} |