Chromium Code Reviews| Index: tools/addlatexhash.dart |
| diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..6f1b446df46b4f8f85fa9820f2d905b775cfdbdc |
| --- /dev/null |
| +++ b/tools/addlatexhash.dart |
| @@ -0,0 +1,215 @@ |
| +// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
| +// for details. All rights reserved. Use of this source code is governed by a |
| +// BSD-style license that can be found in the LICENSE file. |
| +// |
| +// ---------------------------------------------------------------------- |
| +// This is a very specialized tool which was created in order to support |
| +// adding hash values used as location markers in the LaTeX source of the |
| +// language specification. It is intended to take its input file as the |
| +// first argument and the output file name as the second argument. From |
| +// docs/language a typical usage would be as follows: |
| +// |
| +// dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex |
| +// |
| +// This will yield a normalized variant tmp.tex of the language |
| +// specification with hash values filled in. For more details, please |
| +// check the language specification source itself. |
| +// |
| +// NB: This utility assumes UN*X style line endings, \n, in the LaTeX |
| +// source file received as input; it will not work with other styles. |
| +// |
| +// TODO: The current version does not fill in hash values, it only |
| +// standardizes the LaTeX source by removing comments and normalizing |
| +// white space. |
| + |
| +import 'dart:io'; |
| +import 'dart:convert'; |
| +import '../pkg/crypto/lib/crypto.dart'; |
| + |
| +// ---------------------------------------------------------------------- |
| +// Normalization of the text, i.e., removal or normalization |
| +// of elements that do not affect the output from latex |
| + |
// Matches any text whose very first character is '%'.
final commentAllRe = new RegExp(r"^%");
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Using a RegExp for this is overkill, just do strin
eernst
2014/10/15 13:19:27
Looks more meaningful with "RE". Done.
|
// Matches a '%' that is preceded by some character other than a backslash,
// together with that preceding character and everything after the '%' up
// to, but not including, the next newline. A '%' in the very first
// position has no preceding character and is therefore never matched here.
// Written as a raw string so that the regexp source needs no double
// escaping.
final commentRe = new RegExp(r"[^\\]%[^\n]*");
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
I recommend using raw strings for RegExp sources:
eernst
2014/10/15 13:19:27
Indeed; added the final "*", done.
Lasse Reichstein Nielsen
2014/10/15 14:09:57
Also just noticed that as a regexp, "[^n]" is equi
eernst
2014/10/15 14:26:41
Cool! Done + Added a comment, just in case someon
|
// Matches text that consists of one or more white space characters and
// nothing else (the empty string does not match).
final whitespaceAllRe = new RegExp(r"^\s+$");

// Matches leading white space plus the first following character that is
// not a newline.
final whitespaceLeadingRe = new RegExp(r"^\s+[^\n]");

// Matches a run of two or more spaces and/or tabs.
final whitespaceRe = new RegExp(r"[ \t]{2,}");
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Shorter regexp possible:
final whitespaceRE = ne
eernst
2014/10/15 13:19:27
Done.
|
| + |
| +// normalization steps |
| + |
// Returns 'line' with the region covered by 'match' cut out and 'glue'
// inserted in its place; returns 'line' unchanged when 'match' is null.
//
// 'startOffset' and 'endOffset' are added to the start and end of the
// match before cutting, which allows keeping some of the matched text
// (positive startOffset, negative endOffset) or cutting beyond the match.
// The adjusted region is clamped to the bounds of 'line', and a region
// whose end would lie before its start is treated as empty, so that odd
// offsets never raise a RangeError or duplicate text.
cutMatch(line, match, {startOffset = 0, endOffset = 0, glue = ""}) {
  if (match == null) return line;
  var len = line.length;
  var start = match.start + startOffset;
  var end = match.end + endOffset;
  if (start < 0) start = 0;
  if (start > len) start = len;
  if (end > len) end = len;
  if (end < start) end = start;
  return line.substring(0, start) + glue + line.substring(end);
}
| + |
// Cuts the first match of 're' out of 'line'; the named arguments are
// passed on unchanged to cutMatch. Returns 'line' unchanged when 're'
// does not match.
cutRegexp(line, re, {startOffset = 0, endOffset = 0, glue = ""}) =>
    cutMatch(line, re.firstMatch(line),
        startOffset: startOffset, endOffset: endOffset, glue: glue);
| + |
// Returns the part of 'line' that precedes 'match', followed by 'glue';
// everything from the start of the match to the end of 'line' is dropped.
// Returns 'line' unchanged when 'match' is null.
//
// 'offset' is added to the start of the match before cutting; the
// resulting position is clamped to the bounds of 'line' (as cutMatch does
// for its region) so that it can never raise a RangeError.
cutFromMatch(line, match, {offset = 0, glue = ""}) {
  if (match == null) return line;
  var cutAt = match.start + offset;
  if (cutAt < 0) cutAt = 0;
  if (cutAt > line.length) cutAt = line.length;
  return line.substring(0, cutAt) + glue;
}
| + |
// Drops everything in 'line' from the first match of 're' onwards; the
// named arguments are passed on unchanged to cutFromMatch. Returns 'line'
// unchanged when 're' does not match.
cutFromRegexp(line, re, {offset = 0, glue = ""}) =>
    cutFromMatch(line, re.firstMatch(line), offset: offset, glue: glue);
| + |
// True iff 'line' matches whitespaceAllRe.
isWsOnly(line) {
  return whitespaceAllRe.hasMatch(line);
}
// True iff the very first character of 'line' is '%'. A plain prefix test
// is all that is needed here; no regular expression is involved.
isCommentOnly(line) => line.startsWith("%");
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
This would be the non-regexp version: => line.star
eernst
2014/10/15 13:19:27
Arg null is treated also as an error by firstMatch
|
| + |
// Reduces 'line' to its line ending: returns "\n" when 'line' ends with a
// newline, and the empty string otherwise (including for an empty 'line').
justEol(line) => line.endsWith("\n") ? "\n" : "";
| + |
stripComment(line) {
  // Only the text of a comment is dropped; the '%' that introduces it and
  // the end of the line stay in place. Deleting the whole comment,
  // including the final newline, is tempting because that is what TeX does
  // first, but TeX then adds back a character that depends on its state
  // (S, M, or N), and it is tricky to maintain a similar state that matches
  // the state of TeX faithfully. Leaving the bare '%' at the end of the
  // line lets TeX manage its states exactly as it did for the file from
  // before stripComment.
  var startsWithPercent = isCommentOnly(line);
  if (!startsWithPercent) {
    // commentRe also matches the character in front of the '%', so the cut
    // starts two characters into the match: that character and the '%'
    // itself are both kept.
    return cutRegexp(line, commentRe, startOffset: 2);
  }
  return "%\n";
}
| + |
// Normalizes the white space of a single line: a wsOnly line is reduced
// to its eol, leading white space is removed entirely, and every run of
// two or more spaces/tabs is reduced to a single space.
normalizeWhitespace(line) {
  if (isWsOnly(line)) return justEol(line);
  // whitespaceLeadingRe also matches the first character after the leading
  // white space; endOffset -1 keeps that character in the line.
  line = cutRegexp(line, whitespaceLeadingRe, endOffset: -1);
  // Each match of whitespaceRe is a maximal run, so replacing a run by one
  // space can never create a new run; a single pass over the line gives
  // the same result as cutting the first match repeatedly.
  return line.replaceAll(whitespaceRe, " ");
}
| + |
// Multi-line normalization of 'lines'; returns a new list and leaves
// 'lines' itself untouched.
//
// - Every commentOnly line is dropped.
// - Every sequence of >1 wsOnly lines is reduced to its first line;
//   commentOnly lines occurring inside such a sequence do not end it, so
//   "blank, comment, blank" yields a single blank line.
// - All other lines are kept, in order.
//
// NB: this is deliberately a very conservative transformation of the
// TeX source; the original author noted that almost all variants of the
// treatment of commentOnly lines next to wsOnly lines will break.
multilineNormalize(lines) {
  var kept = [];
  // Is the most recently kept line the start of the current block of
  // wsOnly lines? Dropped commentOnly lines never change this.
  var inBlankBlock = false;
  for (var line in lines) {
    // A line cannot be both: commentOnly requires a leading '%', which is
    // not white space.
    if (isCommentOnly(line)) continue;
    var blank = isWsOnly(line);
    // Blank line block continues, do not add 'line'.
    if (blank && inBlankBlock) continue;
    kept.add(line);
    inBlankBlock = blank;
  }
  return kept;
}
| + |
| +// select the elements in the pipeline |
| + |
// Single-line pipeline for ordinary text: first the comment text is
// stripped, then the white space of the result is normalized.
normalize(line) {
  var withoutComment = stripComment(line);
  return normalizeWhitespace(withoutComment);
}
| + |
// Single-line pipeline for fragments with significant spacing: only the
// comment text is stripped, white space is left exactly as it is.
sispNormalize(line) {
  return stripComment(line);
}
| + |
| +// ---------------------------------------------------------------------- |
| +// Managing fragments with significant spacing |
| + |
// Matches a line that starts, after optional white space, with
// \begin{dartCode}. The braces are escaped because '{' and '}' are
// regexp syntax characters; not every regexp engine tolerates them bare.
final dartCodeBeginRe = new RegExp(r"^\s*\\begin\{dartCode\}");
|
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Need to escape '{' and '}' in RegExp:
new RegEx
eernst
2014/10/15 13:19:27
OK. Note that we get no exceptions for this one,
Lasse Reichstein Nielsen
2014/10/15 14:09:57
RegExp in browsers have traditionally been very fo
eernst
2014/10/15 14:26:41
What's the smart way to install a wakeup call to a
|
// Matches a line that starts, after optional white space, with
// \end{dartCode}. The braces are escaped because '{' and '}' are regexp
// syntax characters; not every regexp engine tolerates them bare.
final dartCodeEndRe = new RegExp(r"^\s*\\end\{dartCode\}");
| + |
// True iff 'targetRe' has a match somewhere in 'line'.
sispIs(line, targetRe) => targetRe.hasMatch(line);
| + |
// True iff 'line' matches dartCodeBeginRe.
sispIsDartBegin(line) {
  return sispIs(line, dartCodeBeginRe);
}
// True iff 'line' matches dartCodeEndRe.
sispIsDartEnd(line) {
  return sispIs(line, dartCodeEndRe);
}
| + |
| +// ---------------------------------------------------------------------- |
| +// main |
| + |
// Entry point: reads the LaTeX source named by args[0], normalizes it, and
// writes the result to the file named by args[1].
//
// Throws when the number of arguments is not exactly two (after printing
// a usage message) and when the input file does not exist.
main([args]) {
  // 'args' is optional, so guard against it being absent altogether.
  var argCount = args == null ? 0 : args.length;
  if (argCount != 2) {
    print("Usage: addlatexhash.dart <input-file> <output-file>");
    throw "Received $argCount arguments, expected two";
  }

  var inputFile = new File(args[0]);
  var outputFile = new File(args[1]);
  // Checked explicitly rather than with assert: assertions are skipped
  // unless they have been enabled, and this is validation of user input.
  if (!inputFile.existsSync()) {
    throw new FileSystemException("Input file does not exist", inputFile.path);
  }

  // single-line normalization
  var inDartCode = false;
  var newLines = [];
  for (var line in inputFile.readAsLinesSync()) {
    // The state is updated before the line is processed, so the line that
    // begins a dartCode fragment is treated as part of the fragment and
    // the line that ends it is treated as ordinary text.
    if (sispIsDartBegin(line)) {
      inDartCode = true;
    } else if (sispIsDartEnd(line)) {
      inDartCode = false;
    }
    // The normalization functions work on lines that carry their eol.
    var withEol = line + "\n";
    newLines.add(inDartCode ? sispNormalize(withEol) : normalize(withEol));
  }

  // multi-line normalization
  newLines = multilineNormalize(newLines);

  // output result
  outputFile.writeAsStringSync(newLines.join());
}