Chromium Code Reviews| Index: tools/addlatexhash.dart |
| diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..aa92504a56552366a77ed8a4b3ba472209a2a410 |
| --- /dev/null |
| +++ b/tools/addlatexhash.dart |
| @@ -0,0 +1,508 @@ |
| +// ---------------------------------------------------------------------- |
|
ricow1
2014/10/13 06:07:03
Add copyright header to this file
eernst
2014/10/13 08:03:27
Done.
|
| +// File 'addlatexhash.dart' |
|
ricow1
2014/10/13 06:07:03
We don't normally have this as a header, just leav
eernst
2014/10/13 08:03:27
Done.
|
| +// |
| +// This is a very specialized tool which was created in order to support |
| +// adding hash values used as location markers in the LaTeX source of the |
| +// language specification. It is intended to be used as a filter from |
| +// the directory ../docs/language, in commands like the following: |
| +// |
| +// dart ../../tools/addlatexhash.dart < dartLangSpec.tex >tmp.tex |
|
ricow1
2014/10/13 06:07:03
If I were you I would simply use command line par
eernst
2014/10/13 08:03:27
OK. But there are myriads of ways to define the k
Lasse Reichstein Nielsen
2014/10/15 09:13:17
I'd favor taking the input name as unnamed argume
ricow1
2014/10/15 09:23:06
That is already the case, you are looking at the o
eernst
2014/10/15 12:01:10
Actually, using -o for the output and default to s
|
| +// |
| +// This will yield a variant tmp.tex of the language specification with |
| +// hash values filled in. For more details, please check the language |
| +// specification source itself. |
| + |
| +import 'dart:io'; |
| +import 'dart:convert'; |
| + |
| +// ---------------------------------------------------------------------- |
| +// Computation of SHA1 sums |
| +// |
| +// NB: To keep this script as independent of installation as possible, |
| +// this section was copied from crypto-0.9.0.tar.gz on Oct 9, 2014, |
| +// from https://pub.dartlang.org/packages/crypto, from the source files |
| +// crypto.dart, hash_utils.dart, and sha1.dart, |
| +// with the following copyright statement: |
| +// |
| +// Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| +// for details. All rights reserved. Use of this source code is governed by a |
| +// BSD-style license that can be found in the LICENSE file. |
| + |
| +/** |
| + * Interface for cryptographic hash functions. |
| + * |
| + * The [add] method is used to add data to the hash. The [close] method |
| + * is used to extract the message digest. |
| + * |
| + * Once the [close] method has been called no more data can be added using the |
| + * [add] method. If [add] is called after the first call to [close], an error |
| + * is thrown; the [_HashBase] implementation in this file throws a [StateError]. |
| + * |
| + * If multiple instances of a given Hash are needed, the [newInstance] |
| + * method can provide a new instance. |
| + */ |
| +// TODO(floitsch): make Hash implement Sink, EventSink or similar. |
| +abstract class Hash { |
| + /** |
| + * Add a list of bytes to the hash computation. |
| + */ |
| + void add(List<int> data); |
| + |
| + /** |
| + * Finish the hash computation and extract the message digest as |
| + * a list of bytes. |
| + */ |
| + List<int> close(); |
| + |
| + /** |
| + * Returns a new instance of this hash function. |
| + */ |
| + Hash newInstance(); |
| + |
| + /** |
| + * Internal block size of the hash in bytes. |
| + * |
| + * This is exposed for use by the HMAC class which needs to know the |
| + * block size for the [Hash] it is using. |
| + */ |
| + int get blockSize; |
| +} |
| + |
| +// Constants. |
| +// Bit masks selecting the low 8 and the low 32 bits of an integer. |
| +const _MASK_8 = 0xff; |
| +const _MASK_32 = 0xffffffff; |
| +// Size relationships between bits, bytes and 32-bit words. |
| +const _BITS_PER_BYTE = 8; |
| +const _BYTES_PER_WORD = 4; |
| + |
| +// Helper functions used by more than one hasher. |
| + |
| +// Rotates the low 32 bits of [val] left by [shift] positions (the shift is |
| +// taken modulo 32) and returns the result as an unsigned 32-bit value. |
| +int _rotl32(int val, int shift) { |
| +  final distance = shift & 31; |
| +  final upperPart = (val << distance) & _MASK_32; |
| +  final lowerPart = (val & _MASK_32) >> (32 - distance); |
| +  return upperPart | lowerPart; |
| +} |
| + |
| +// Base class encapsulating common behavior for cryptographic hash |
| +// functions: it buffers incoming bytes, packs them into fixed-size chunks |
| +// of 32-bit words, hands each chunk to [_updateHash], and appends the |
| +// final padding when the digest is requested. |
| +abstract class _HashBase implements Hash { |
| + // [chunkSizeInWords] and [digestSizeInWords] are counted in 32-bit words; |
| + // [_bigEndianWords] selects the byte order used when packing bytes into |
| + // words and when unpacking words into bytes. |
| + _HashBase(int chunkSizeInWords, |
| + int digestSizeInWords, |
| + bool this._bigEndianWords) |
| + : _pendingData = [], |
| + _currentChunk = new List(chunkSizeInWords), |
| + _h = new List(digestSizeInWords), |
| + _chunkSizeInWords = chunkSizeInWords, |
| + _digestSizeInWords = digestSizeInWords; |
| + |
| + // Update the hasher with more data. Throws a StateError if the digest |
| + // has already been retrieved with [close]. |
| + void add(List<int> data) { |
| + if (_digestCalled) { |
| + throw new StateError( |
| + 'Hash update method called after digest was retrieved'); |
| + } |
| + _lengthInBytes += data.length; |
| + _pendingData.addAll(data); |
| + _iterate(); |
| + } |
| + |
| + // Finish the hash computation and return the digest as a list of bytes. |
| + // Further calls return the digest again without redoing the finalization. |
| + List<int> close() { |
| + if (_digestCalled) { |
| + return _resultAsBytes(); |
| + } |
| + _digestCalled = true; |
| + _finalizeData(); |
| + _iterate(); |
| + assert(_pendingData.length == 0); |
| + return _resultAsBytes(); |
| + } |
| + |
| + // Returns the block size of the hash in bytes. |
| + int get blockSize { |
| + return _chunkSizeInWords * _BYTES_PER_WORD; |
| + } |
| + |
| + // One round of the hash computation. |
| + void _updateHash(List<int> m); |
| + |
| + // Helper methods. |
| + // Addition truncated to 32 bits. |
| + int _add32(x, y) => (x + y) & _MASK_32; |
| + // Rounds val up to a multiple of n; the masking only works when n is a |
| + // power of two. |
| + int _roundUp(val, n) => (val + n - 1) & -n; |
| + |
| + // Compute the final result as a list of bytes from the hash words. |
| + List<int> _resultAsBytes() { |
| + var result = []; |
| + for (var i = 0; i < _h.length; i++) { |
| + result.addAll(_wordToBytes(_h[i])); |
| + } |
| + return result; |
| + } |
| + |
| + // Converts a list of bytes to a chunk of 32-bit words, reading one full |
| + // chunk from data starting at dataIndex and storing it in _currentChunk. |
| + void _bytesToChunk(List<int> data, int dataIndex) { |
| + assert((data.length - dataIndex) >= (_chunkSizeInWords * _BYTES_PER_WORD)); |
| + |
| + for (var wordIndex = 0; wordIndex < _chunkSizeInWords; wordIndex++) { |
| + // w3 is the most significant byte of the word, w0 the least. |
| + var w3 = _bigEndianWords ? data[dataIndex] : data[dataIndex + 3]; |
| + var w2 = _bigEndianWords ? data[dataIndex + 1] : data[dataIndex + 2]; |
| + var w1 = _bigEndianWords ? data[dataIndex + 2] : data[dataIndex + 1]; |
| + var w0 = _bigEndianWords ? data[dataIndex + 3] : data[dataIndex]; |
| + dataIndex += 4; |
| + var word = (w3 & _MASK_8) << 24; |
| + word |= (w2 & _MASK_8) << 16; |
| + word |= (w1 & _MASK_8) << 8; |
| + word |= (w0 & _MASK_8); |
| + _currentChunk[wordIndex] = word; |
| + } |
| + } |
| + |
| + // Convert a 32-bit word to four bytes. |
| + List<int> _wordToBytes(int word) { |
| + List<int> bytes = new List(_BYTES_PER_WORD); |
| + bytes[0] = (word >> (_bigEndianWords ? 24 : 0)) & _MASK_8; |
| + bytes[1] = (word >> (_bigEndianWords ? 16 : 8)) & _MASK_8; |
| + bytes[2] = (word >> (_bigEndianWords ? 8 : 16)) & _MASK_8; |
| + bytes[3] = (word >> (_bigEndianWords ? 0 : 24)) & _MASK_8; |
| + return bytes; |
| + } |
| + |
| + // Iterate through data updating the hash computation for each |
| + // chunk. Bytes that do not fill a whole chunk stay in _pendingData. |
| + void _iterate() { |
| + var len = _pendingData.length; |
| + var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD; |
| + if (len >= chunkSizeInBytes) { |
| + var index = 0; |
| + for (; (len - index) >= chunkSizeInBytes; index += chunkSizeInBytes) { |
| + _bytesToChunk(_pendingData, index); |
| + _updateHash(_currentChunk); |
| + } |
| + _pendingData = _pendingData.sublist(index, len); |
| + } |
| + } |
| + |
| + // Finalize the data. Add a 1 bit to the end of the message. Expand with |
| + // 0 bits and add the length of the message. |
| + void _finalizeData() { |
| + _pendingData.add(0x80); |
| + // 9 = the 0x80 marker byte plus the 8 bytes of the length field. |
| + var contentsLength = _lengthInBytes + 9; |
| + var chunkSizeInBytes = _chunkSizeInWords * _BYTES_PER_WORD; |
| + var finalizedLength = _roundUp(contentsLength, chunkSizeInBytes); |
| + var zeroPadding = finalizedLength - contentsLength; |
| + for (var i = 0; i < zeroPadding; i++) { |
| + _pendingData.add(0); |
| + } |
| + var lengthInBits = _lengthInBytes * _BITS_PER_BYTE; |
| + // Only the low 32 bits of the length field are filled in below, so the |
| + // bit length must fit in 32 bits. The bound is written with _MASK_32 |
| + // because this file does not import 'dart:math', which provides pow. |
| + assert(lengthInBits <= _MASK_32); |
| + if (_bigEndianWords) { |
| + _pendingData.addAll(_wordToBytes(0)); |
| + _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32)); |
| + } else { |
| + _pendingData.addAll(_wordToBytes(lengthInBits & _MASK_32)); |
| + _pendingData.addAll(_wordToBytes(0)); |
| + } |
| + } |
| + |
| + // Hasher state. |
| + final int _chunkSizeInWords; |
| + final int _digestSizeInWords; |
| + final bool _bigEndianWords; |
| + int _lengthInBytes = 0; |
| + List<int> _pendingData; |
| + final List<int> _currentChunk; |
| + final List<int> _h; |
| + bool _digestCalled = false; |
| +} |
| + |
| +/** |
| + * SHA1 hash function implementation. |
| + * |
| + * Processes the input in chunks of 16 32-bit words and keeps a digest of |
| + * five 32-bit words, using big-endian byte order (see the arguments passed |
| + * to the [_HashBase] constructor). |
| + */ |
| +class SHA1 extends _HashBase { |
| + // Construct a SHA1 hasher object and set the five hash words to their |
| + // initial values. |
| + SHA1() : _w = new List(80), super(16, 5, true) { |
| + _h[0] = 0x67452301; |
| + _h[1] = 0xEFCDAB89; |
| + _h[2] = 0x98BADCFE; |
| + _h[3] = 0x10325476; |
| + _h[4] = 0xC3D2E1F0; |
| + } |
| + |
| + // Returns a new instance of this Hash. |
| + SHA1 newInstance() { |
| + return new SHA1(); |
| + } |
| + |
| + // Compute one iteration of the SHA1 algorithm with a chunk of |
| + // 16 32-bit pieces, updating the five hash words in _h. |
| + void _updateHash(List<int> m) { |
| + assert(m.length == 16); |
| + |
| + // Working copies of the current hash words. |
| + var a = _h[0]; |
| + var b = _h[1]; |
| + var c = _h[2]; |
| + var d = _h[3]; |
| + var e = _h[4]; |
| + |
| + for (var i = 0; i < 80; i++) { |
| + // The first 16 entries of _w are the chunk itself; each later entry is |
| + // the xor of four earlier entries, rotated left by one bit. |
| + if (i < 16) { |
| + _w[i] = m[i]; |
| + } else { |
| + var n = _w[i - 3] ^ _w[i - 8] ^ _w[i - 14] ^ _w[i - 16]; |
| + _w[i] = _rotl32(n, 1); |
| + } |
| + // The function of b, c and d and the constant added to t below change |
| + // every 20 rounds. |
| + var t = _add32(_add32(_rotl32(a, 5), e), _w[i]); |
| + if (i < 20) { |
| + t = _add32(_add32(t, (b & c) | (~b & d)), 0x5A827999); |
| + } else if (i < 40) { |
| + t = _add32(_add32(t, (b ^ c ^ d)), 0x6ED9EBA1); |
| + } else if (i < 60) { |
| + t = _add32(_add32(t, (b & c) | (b & d) | (c & d)), 0x8F1BBCDC); |
| + } else { |
| + t = _add32(_add32(t, b ^ c ^ d), 0xCA62C1D6); |
| + } |
| + |
| + // Shift the working variables along by one position. |
| + e = d; |
| + d = c; |
| + c = _rotl32(b, 30); |
| + b = a; |
| + a = t & _MASK_32; |
| + } |
| + |
| + // Fold the result of this chunk into the running hash words. |
| + _h[0] = _add32(a, _h[0]); |
| + _h[1] = _add32(b, _h[1]); |
| + _h[2] = _add32(c, _h[2]); |
| + _h[3] = _add32(d, _h[3]); |
| + _h[4] = _add32(e, _h[4]); |
| + } |
| + |
| + // Scratch list holding the 80 words computed for the chunk in progress. |
| + List<int> _w; |
| +} |
| + |
| +// ---------------------------------------------------------------------- |
| +// Normalization of the text, i.e., removal or normalization |
| +// of elements that do not affect the output from latex |
| + |
| +// regexps |
| + |
| +// Matches a string whose very first character is '%', i.e., a line that |
| +// is a LaTeX comment from its first column. |
| +var comment_all_re = new RegExp("^%"); |
|
ricow1
2014/10/13 06:07:04
camel case variables, these could be const? (in wh
eernst
2014/10/13 08:03:26
Now using camel case, but 'const RegExp(..)' is no
eernst
2014/10/13 08:03:30
CamelCasing done.
But it can't be const (can't use
|
| +// Matches a character other than a backslash, followed by '%' and |
| +// everything up to (but not including) the next newline. |
| +var comment_re = new RegExp("[^\\\\]%[^\\n]*"); |
| +// Matches a string that consists of one or more whitespace characters only. |
| +var whitespace_all_re = new RegExp("^\\s+\$"); |
| +// Matches leading whitespace plus the first following non-newline character. |
| +var whitespace_leading_re = new RegExp("^\\s+[^\\n]"); |
| +// Matches a run of two or more spaces and/or tabs. |
| +var whitespace_re = new RegExp("[ \\t][ \\t]+"); |
| + |
| +// normalization steps |
| + |
| +// Returns [line] with the text covered by [match] removed and [glue] |
| +// inserted in its place; returns [line] unchanged when [match] is null. |
| +// [start_offset] and [end_offset] are added to the start and end of the |
| +// match before cutting. A resulting start below 0 is raised to 0 and a |
| +// resulting end beyond the length of [line] is lowered to that length. |
| +cut_match(line, match, {start_offset:0, end_offset:0, glue:""}) { |
|
ricow1
2014/10/13 06:07:03
Camel case method names, here and below
eernst
2014/10/13 08:03:30
Done.
|
| + if (match == null) return line; |
| + var start = match.start + start_offset; |
| + var end = match.end + end_offset; |
| + var len = line.length; |
| + if (start < 0) start=0; |
|
ricow1
2014/10/13 06:07:03
space around =
eernst
2014/10/13 08:03:29
Done.
|
| + if (end > len) end=len; |
|
ricow1
2014/10/13 06:07:04
space around =
eernst
2014/10/13 08:03:29
Done.
|
| + return line.substring(0,start) + glue + line.substring(end); |
| +} |
| + |
| +// Applies cut_match to the first match of [re] in [line], passing |
| +// [start_offset], [end_offset] and [glue] through unchanged; [line] is |
| +// returned as is when [re] does not match. |
| +cut_regexp(line, re, {start_offset:0, end_offset:0, glue:""}) { |
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Is this function used?
Lasse Reichstein Nielsen
2014/10/15 09:13:58
Is this comment still here? Yes it is. Should it b
|
| + return cut_match(line, re.firstMatch(line), |
| + start_offset: start_offset, |
| + end_offset: end_offset, |
| + glue: glue); |
| +} |
| + |
| +// Returns the part of [line] that precedes position match.start + offset, |
| +// followed by [glue]; everything from that position on is dropped. |
| +// Returns [line] unchanged when [match] is null. |
| +cut_from_match(line, match, {offset:0, glue:""}) { |
| + if (match == null) return line; |
| + return line.substring(0,match.start+offset) + glue; |
|
ricow1
2014/10/13 06:07:03
space around +
eernst
2014/10/13 08:03:29
Done.
|
| +} |
| + |
| +// Applies cut_from_match to the first match of [re] in [line], passing |
| +// [offset] and [glue] through unchanged. |
| +cut_from_regexp(line, re, {offset:0, glue:""}) { |
| +  var firstHit = re.firstMatch(line); |
| +  return cut_from_match(line, firstHit, offset: offset, glue: glue); |
| +} |
| + |
| +// True iff [line] consists of whitespace only (at least one character). |
| +is_ws_only(line) => whitespace_all_re.hasMatch(line); |
| +// True iff [line] starts with '%'. |
| +is_comment_only(line) => comment_all_re.hasMatch(line); |
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
This would be the non-regexp version: => line.star
eernst
2014/10/15 12:01:10
Done.
Used to have a slightly more general commen
|
| + |
| +// Reduces [line] to its line terminator: returns "\n" when [line] ends |
| +// with a newline and "" otherwise (an empty [line] is returned as is). |
| +just_eol(line) { |
| + if (line.length == 0) return line; |
| + return line[line.length-1] == '\n'? "\n" : ""; |
|
ricow1
2014/10/13 06:07:03
space before ?
eernst
2014/10/13 08:03:29
Done.
|
| +} |
| + |
| +// Removes the text of a LaTeX comment from [line] while keeping the '%' |
| +// that introduces it. |
| +strip_comment(line) { |
| +  // NB: Dropping everything from the '%' onwards, including a final |
| +  // newline, looks attractive but does not work: TeX does exactly that, but |
| +  // then adds back a character that depends on its state (S, M, or N), |
| +  // and maintaining a state that faithfully mirrors the one of TeX is |
| +  // tricky. So only the content of a comment is removed; the '%' stays at |
| +  // the end of the line and TeX manages its states just as it did for the |
| +  // file before strip_comment. |
| +  return is_comment_only(line) |
| +      ? "%\n" |
| +      : cut_regexp(line, comment_re, start_offset: 2); |
| +} |
| + |
| +// Reduce a ws_only line to its eol, remove leading ws |
| +// entirely, and reduce each run of two or more spaces/tabs to |
| +// a single space; returns the normalized line. |
| +normalize_whitespace(line) { |
| + if (is_ws_only(line)) return just_eol(line); |
| + // end_offset:-1 keeps the first non-whitespace character, which is part |
| + // of the match of whitespace_leading_re. |
| + line = cut_regexp(line, whitespace_leading_re, end_offset:-1); |
| + var match; |
| + while ((match = whitespace_re.firstMatch(line)) != null) |
| + line = cut_match(line, match, glue:" "); |
|
ricow1
2014/10/13 06:07:03
always encapsulate loop and conditional blocks in
eernst
2014/10/13 08:03:26
Done.
eernst
2014/10/13 08:03:30
Done.
|
| + return line; |
| +} |
| + |
| +// reduce sequences of >1 ws_only lines to 1, and |
|
ricow1
2014/10/13 06:07:03
reduce -> Reduce
eernst
2014/10/13 08:03:25
Done.
eernst
2014/10/13 08:03:29
Done.
|
| +// sequences of >1 comment_only lines to 1 |
|
ricow1
2014/10/13 06:07:03
and and -> and
eernst
2014/10/13 08:03:30
Done.
|
| +// Takes the list [lines] of single-line-normalized lines and returns a new |
| +// list; the list passed in is not modified. As written, a run of ws_only |
| +// lines (possibly with comment_only lines in between) is reduced to its |
| +// first ws_only line, and comment_only lines are not copied to the result. |
| +multiline_normalize(lines) { |
| + var oldlines = lines; |
| + var after_blank_lines = false; // does 'line' succeed >0 empty lines? |
| + var after_comment_lines = false; // .. succeed >0 comment_only lines? |
| + lines = new List(); |
| + for (var line in oldlines) { |
| + if (after_blank_lines && after_comment_lines) { |
| + // can never happen |
| + // StateError is used because Error has no constructor taking a message |
| + throw new StateError("Bug, please report"); |
|
ricow1
2014/10/13 06:07:04
I would do Bug, please report to eernst@
eernst
2014/10/13 08:03:29
Done.
|
| + } |
|
ricow1
2014/10/13 06:07:04
move else if up on this line
eernst
2014/10/15 12:01:10
Done.
|
| + else if (after_blank_lines && !after_comment_lines) { |
| + // at least one line before 'line' is ws_only |
| + if (!is_ws_only(line)) { |
| + // blank line block ended |
| + after_comment_lines = is_comment_only(line); |
| + // special case: it seems to be safe to remove comment_only lines |
| + // after ws_only lines, so the TeX state must be predictably right; |
| + // next line will then be after_comment_lines and be dropped, so |
| + // we drop the entire comment block---which is very useful; we can |
| + // also consider this comment line to be an empty line, such that |
| + // subsequent empty lines can be considered to be in a block of |
| + // empty lines; note that almost all variants of this will break.. |
| + if (after_comment_lines) { |
| + // _current_ 'line' a comment_only here |
| + after_blank_lines = true; |
| + after_comment_lines = false; |
| + // and do not add 'line' |
| + } |
|
ricow1
2014/10/13 06:07:03
move else up here
eernst
2014/10/13 08:03:29
Done.
|
| + else { |
| + // after blanks, but current 'line' is neither blank nor comment |
| + after_blank_lines = false; |
| + lines.add(line); |
| + } |
| + } |
| + else { |
| + // blank line block continues, do not add 'line' |
| + } |
| + } |
| + else if (!after_blank_lines && after_comment_lines) { |
| + // at least one line before 'line' is comment_only |
| + if (!is_comment_only(line)) { |
| + // comment line block ended |
| + after_blank_lines = is_ws_only(line); |
| + after_comment_lines = false; |
| + lines.add(line); |
| + } |
| + else { |
| + // comment line block continues, do not add 'line' |
| + } |
| + } |
| + else /* !after_blank_lines && !after_comment_lines */ { |
| + // no ws_only or comment_only lines precede 'line' |
| + if (is_ws_only(line)) |
| + after_blank_lines = true; |
| + if (is_comment_only(line)) |
| + after_comment_lines = true; |
| + if (!after_comment_lines) |
| + lines.add(line); |
| + else { |
| + // skipping comment_only line after non_ws, non_comment text |
| + } |
| + } |
| + } |
| + return lines; |
| +} |
| + |
| +// select the elements in the pipeline |
| + |
| +// Full single-line pipeline: strip the comment text first, then |
| +// normalize the whitespace of what is left. |
| +normalize(line) { |
| +  var withoutComment = strip_comment(line); |
| +  return normalize_whitespace(withoutComment); |
| +} |
| + |
| +// Pipeline for lines with significant spacing: only the comment text is |
| +// stripped, whitespace is left untouched. |
| +sisp_normalize(line) { |
| +  return strip_comment(line); |
| +} |
| + |
| +// testing |
| + |
|
ricow1
2014/10/13 06:07:03
you should add a real test that imports this file
eernst
2014/10/13 08:03:30
Need a bit more input on how to do this.
|
| +// Runs cut_match on [line] with the first match of the pattern [re], |
| +// prints the input, the pattern and the outcome to stdout, and returns |
| +// whether the outcome equals [expected]. |
| +one_test_cut_match(line, re, expected) { |
| +  stdout.write("cut_match: ${line} --[${re}]--> "); |
| +  var pattern = new RegExp(re); |
| +  var actual = cut_match(line, pattern.firstMatch(line)); |
| +  stdout.write(actual + "\n"); |
| +  return expected == actual; |
| +} |
| + |
| +// Runs the cut_match test cases in order, stopping at the first failure, |
| +// and prints "OK" if all of them passed and "ERROR" otherwise. |
| +test_cut_match() { |
| +  var cases = [ |
| +    ["test", "e", "tst"], |
| +    ["test", "te", "st"], |
| +    ["test", "st", "te"], |
| +    ["test", "", "test"], |
| +    ["test", "test", ""] |
| +  ]; |
| +  var allPassed = |
| +      cases.every((c) => one_test_cut_match(c[0], c[1], c[2])); |
| +  print(allPassed ? "OK" : "ERROR"); |
| +} |
| + |
| +// ---------------------------------------------------------------------- |
| +// Managing fragments with significant spacing |
| + |
| +// Match a string that starts, after optional whitespace, with |
| +// \begin{dartCode} and \end{dartCode}, respectively. |
| +final dart_code_begin_re = new RegExp("^\\s*\\\\begin{dartCode}"); |
| +final dart_code_end_re = new RegExp ("^\\s*\\\\end{dartCode}"); |
| + |
| +// True iff [target_re] has a match in [line]. |
| +sisp_is(line, target_re) => target_re.firstMatch(line) != null; |
| + |
| +// True iff [line] matches dart_code_begin_re. |
| +sisp_is_dart_begin(line) { |
| +  return sisp_is(line, dart_code_begin_re); |
| +} |
| +// True iff [line] matches dart_code_end_re. |
| +sisp_is_dart_end(line) { |
| +  return sisp_is(line, dart_code_end_re); |
| +} |
| + |
| +// testing |
| + |
| +// Applies [sisp_fun] to [line], prints the line together with the |
| +// expected value to stdout, and returns whether the value computed by |
| +// [sisp_fun] equals [expectation]. |
| +one_test_sisp(sisp_fun, line, expectation) { |
| +  var actual = sisp_fun(line); |
| +  stdout.write("sisp_is_dart_*: ${line}: ${expectation}\n"); |
| +  return actual == expectation; |
| +} |
| + |
| +// Runs the sisp_is_dart_* test cases in order, stopping at the first |
| +// failure, and prints "OK" if all of them passed and "ERROR" otherwise. |
| +test_sisp() { |
| +  var cases = [ |
| +    [sisp_is_dart_begin, "\\begin{dartCode}\n", true], |
| +    [sisp_is_dart_begin, " \\begin{dartCode}\n", true], |
| +    [sisp_is_dart_begin, "whatever else ..", false], |
| +    [sisp_is_dart_end, "\\end{dartCode}", true], |
| +    [sisp_is_dart_end, " \\end{dartCode}\t \n", true], |
| +    [sisp_is_dart_end, "whatever else ..", false] |
| +  ]; |
| +  var allPassed = cases.every((c) => one_test_sisp(c[0], c[1], c[2])); |
| +  print(allPassed ? "OK" : "ERROR"); |
| +} |
| + |
| +// ---------------------------------------------------------------------- |
| +// io |
| + |
| +// Reads the next line from stdin, keeping its line terminator and |
| +// decoding the input as ASCII. The read loop in main stops when this |
| +// returns null. |
| +rl() => stdin.readLineSync( |
|
ricow1
2014/10/13 06:07:03
we don't normally shorten names, i.e., rl should b
eernst
2014/10/13 08:03:30
Done.
|
| + retainNewlines: true, |
| + encoding: const AsciiCodec()); |
| + |
| +// ---------------------------------------------------------------------- |
| +// main |
| + |
| +// Reads the LaTeX source from stdin line by line and normalizes each line: |
| +// lines from a \begin{dartCode} line up to, but not including, the next |
| +// \end{dartCode} line only get their comment text stripped, all other |
| +// lines also get their whitespace normalized. The multi-line normalization |
| +// is then applied to the collected lines and the result is written to |
| +// stdout. |
| +main () { |
|
ricow1
2014/10/13 06:07:03
As stated in the top this becomes much easier if y
|
| + var lines = new List(), line; |
|
ricow1
2014/10/13 06:07:03
I would do the definition of line on a separate li
eernst
2014/10/13 08:03:30
Done.
|
| + |
| + // single-line normalization |
| + // in_dart_code is true while the current line is inside a dartCode block |
| + var in_dart_code = false; |
| + while ((line = rl()) != null) { |
| + if (sisp_is_dart_begin(line)) |
| + in_dart_code = true; |
|
ricow1
2014/10/13 06:07:03
block in {}
eernst
2014/10/13 08:03:28
Already changed this to single line as a result of
eernst
2014/10/13 08:03:29
Changed this to single line already when you descr
|
| + else if (sisp_is_dart_end(line)) |
| + in_dart_code = false; |
|
ricow1
2014/10/13 06:07:03
block in {}
eernst
2014/10/13 08:03:27
Same situation as l.496.
|
| + if (in_dart_code) lines.add(sisp_normalize(line)); |
|
ricow1
2014/10/13 06:07:04
always use {} blocks when you have anything but si
eernst
2014/10/13 08:03:30
Ah, so you're saying that the presence of 'else' (
|
| + else lines.add(normalize(line)); |
| + } |
| + |
| + // multi-line normalization |
| + lines = multiline_normalize(lines); |
| + |
| + // output result |
| + for (var line in lines) stdout.write(line); |
| +} |