Chromium Code Reviews| Index: tools/addlatexhash.dart |
| diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart |
| index f79a0b32296cffdb983eeb279a2930d41389b341..86802d8a6df7966a69fb203fba0b58762fa606c0 100644 |
| --- a/tools/addlatexhash.dart |
| +++ b/tools/addlatexhash.dart |
| @@ -6,35 +6,36 @@ |
| // This is a very specialized tool which was created in order to support |
| // adding hash values used as location markers in the LaTeX source of the |
| // language specification. It is intended to take its input file as the |
| -// first argument and the output file name as the second argument. From |
| -// docs/language a typical usage would be as follows: |
| +// first argument, an output file name as the second argument, and a |
| +// hash listing file name as the third argument. From docs/language a |
| +// typical usage would be as follows: |
| // |
| -// dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex |
| +// dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt |
|
ricow1
2014/10/27 10:08:03
hash.txt - what is this, windows 3.11? :-)
eernst
2014/11/03 14:17:46
What's wrong with that? ;-) The short file names
|
| // |
| -// This will yield a normalized variant tmp.tex of the language |
| -// specification with hash values filled in. For more details, please |
| -// check the language specification source itself. |
| +// This will produce a normalized variant out.tex of the language |
| +// specification with hash values filled in, and a listing hash.txt of |
| +// all the hash values along with the label of their textual context |
| +// (section, subsection, subsubsection, paragraph). For more details, |
| +// please check the language specification source itself. |
| // |
| // NB: This utility assumes UN*X style line endings, \n, in the LaTeX |
| // source file received as input; it will not work with other styles. |
| -// |
| -// TODO: The current version does not fill in hash values, it only |
| -// standardizes the LaTeX source by removing comments and normalizing |
| -// white space. |
| import 'dart:io'; |
| import 'dart:convert'; |
| +import '../pkg/utf/lib/utf.dart'; |
| import '../pkg/crypto/lib/crypto.dart'; |
| -// Normalization of the text, i.e., removal or normalization |
| -// of elements that do not affect the output from latex |
| +// ---------------------------------------------------------------------- |
| +// Normalization of the text: removal or normalization of parts that |
| +// do not affect the output from latex, such as white space |
| final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n |
| final whitespaceAllRE = new RegExp(r"^\s+$"); |
| final whitespaceRE = new RegExp(r"[ \t]{2,}"); |
| -// normalization steps |
| - |
| +// Remove 'match'ing part of 'line', possibly with given offsets |
| +// and inserting the given 'glue' to replace the match |
| cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { |
| if (match == null) return line; |
| var start = match.start + startOffset; |
| @@ -149,39 +150,207 @@ multilineNormalize(lines) { |
| return newLines; |
| } |
| -// Selecting the elements in the pipeline |
| +// Select the elements in the pipeline |
| normalize(line) => normalizeWhitespace(stripComment(line)); |
| sispNormalize(line) => stripComment(line); |
| -// Managing fragments with significant spacing |
| +// Manage fragments with significant spacing |
| final dartCodeBeginRE = new RegExp(r"^\s*\\begin\{dartCode\}"); |
| final dartCodeEndRE = new RegExp (r"^\s*\\end\{dartCode\}"); |
| -sispIs(line, targetRE) { |
| - return targetRE.firstMatch(line) != null; |
| -} |
| +// Recognize begin/end line of a Dart code block |
| +sispIs(line, targetRE) => targetRE.firstMatch(line) != null; |
| sispIsDartBegin(line) => sispIs(line, dartCodeBeginRE); |
| sispIsDartEnd(line) => sispIs(line, dartCodeEndRE); |
| -// Transform input file into output file |
| +// ---------------------------------------------------------------------- |
| +// Removal of non-normative elements of the text (rationale, commentary) |
| + |
| +final hashMarkRE = new RegExp(r"^\\LMHash{.*}\s*$"); |
| +final hashBlockTerminatorRE = new RegExp(r"\\((|sub(|sub))section|paragraph)"); |
| + |
| +// Recognize begin/end line of each block of lines getting a hash value |
| +isArg(argRE, line) => argRE.firstMatch(line) != null; |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:12
=> line.contains(argRE);
Probably reduces to the
eernst
2014/11/03 14:17:46
Done.
|
| +isHashMarker(line) => isArg(hashMarkRE, line); |
| +isHashBlockTerminator(line) => isArg(hashBlockTerminatorRE, line); |
| + |
| +// Return the indices of lines satisfying the given test |
| +findLineNumbers(lines, test()) { |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Type of `test` is incorrect, it's typed to be null
eernst
2014/11/03 14:17:46
Interesting! In fact, having worked with types fo
|
| + var lineNumbers = new List(); |
| + var lineNumber = 0; |
| + for (var line in lines) { |
| + if (test(line)) lineNumbers.add(lineNumber); |
| + lineNumber++; |
| + } |
| + return lineNumbers; |
| +} |
| + |
| +findHashLineNumbers(lines) => findLineNumbers(lines, isHashMarker); |
| + |
| +// Return 'line' without the "\\cmdName\s*{..}" command starting at |
|
ricow1
2014/10/27 10:08:03
use doc style comments for method comments (///) s
eernst
2014/11/03 14:17:46
Done.
|
| +// 'startIndex'; note that it is assumed but not checked that 'line' |
| +// contains "\\cmdName\s*{..", and note that the end of the {..} block |
| +// is found via brace matching (i.e., nested {..} blocks are handled), |
| +// but it may break if '{' is made an active character, etc. |
| +removeCommand(line, cmdName, startIndex) { |
| + const BACKSLASH = 92; // char code for '\\' |
| + const BRACE_BEGIN = 123; // char code for '{' |
| + const BRACE_END = 125; // char code for '}' |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:12
Pedantry: Two spaces before '//' comments, and you
Lasse Reichstein Nielsen
2014/11/03 11:34:35
Obviously, to align it, it needs to be two *or mor
eernst
2014/11/03 14:17:46
Done.
|
| + |
| + var blockStartIndex = startIndex + cmdName.length + 1; |
| + while (blockStartIndex < line.length && |
| + line.codeUnitAt(blockStartIndex) != BRACE_BEGIN) { |
| + blockStartIndex++; |
| + } |
| + blockStartIndex++; |
| + if (blockStartIndex > line.length) { |
| + // caller's fault |
| + throw "Bug, please report to eernst@"; |
| + } |
| + // blockStartIndex just after '{' |
| + |
| + var afterEscape = false; // actually after '{' |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:11
Two space before '//'. I think that is a general s
eernst
2014/11/03 14:17:47
Couldn't find it in the style guide, but done anyw
|
| + var braceLevel = 1; // number of '{' minus number of '}' seen |
| + |
| + for (var index = blockStartIndex; index < line.length; index++) { |
| + switch (line.codeUnitAt(index)) { |
| + case BRACE_BEGIN: |
| + if (afterEscape) afterEscape = false; else braceLevel++; break; |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:13
An if-with-an-else should always be put on multipl
eernst
2014/11/03 14:17:46
Done.
|
| + case BRACE_END: |
| + if (afterEscape) afterEscape = false; else braceLevel--; break; |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Consider checking if braceLevel goes negative.
Oth
eernst
2014/11/03 14:17:47
Actually, the latex command will be used on the so
|
| + case BACKSLASH: |
| + afterEscape = true; break; |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:11
Newline after ';'
eernst
2014/11/03 14:17:46
Done. Presumably this does not apply to 'for'?
Lasse Reichstein Nielsen
2014/11/03 16:32:04
Not to 'for' no. It's a matter of "only one statem
|
| + default: |
| + afterEscape = false; |
| + } |
| + if (braceLevel == 0) { |
| + return line.substring(0, startIndex) + line.substring(index + 1); |
| + } |
| + } |
| + // removal failed; we consider this to mean that the input is ill-formed |
| + throw "Unmatched braces"; |
| +} |
| + |
| +final commentaryRE = new RegExp(r"\\commentary\s*{"); |
| +final rationaleRE = new RegExp(r"\\rationale\s*{"); |
| + |
| +removeCommentary(line) { |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:13
You are actively removing commentary code from the
eernst
2014/11/03 14:17:46
That wouldn't work in this case, because the strin
|
| + var match = commentaryRE.firstMatch(line); |
| + if (match == null) return line; |
| + return removeCommentary(removeCommand(line, r"commentary", match.start)); |
| +} |
| + |
| +removeRationale(line) { |
| + var match = rationaleRE.firstMatch(line); |
| + if (match == null) return line; |
| + return removeRationale(removeCommand(line, r"rationale", match.start)); |
| +} |
| + |
| +// Remove commentary and rationale from 'line' |
| +simplifyLine(line) { |
| + var simplerLine = removeCommentary(line); |
| + simplerLine = removeRationale(simplerLine); |
| + simplerLine = normalizeWhitespace(simplerLine); |
| + return simplerLine; |
| +} |
| + |
| +// ---------------------------------------------------------------------- |
| +// Recognition of line blocks, insertion of block hash into \LMHash{} |
| + |
| +final hashMarkArgumentRE = new RegExp(r"{.*}"); |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Escape '{' characters.
eernst
2014/11/03 14:17:46
Done, here and in several other similar locations.
|
| + |
| +cleanupLine(line) => cutRegexp(line, commentRE, startOffset: 1).trimRight(); |
| + |
| +gatherLines(lines, startIndex, nextIndex) { |
| + var gatheredLine = ""; |
| + var isFirst = true; |
| + for (var index = startIndex; index < nextIndex; index++) { |
| + var line = lines[index]; |
| + if (isHashBlockTerminator(line)) break; |
| + if (isFirst) { |
| + gatheredLine += cleanupLine(line); |
| + isFirst = false; |
| + } else { |
| + gatheredLine += " " + cleanupLine(line); |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:13
This will take time O(lines * chars-in-lines), so
Lasse Reichstein Nielsen
2014/10/31 13:52:39
Even more "functional":
lines.getRange(startInde
eernst
2014/11/03 14:17:46
Done, with some adjustments. Nice! ;)
|
| + } |
| + } |
| + return gatheredLine; |
| +} |
| + |
| +// Compute the hash value for the line block starting at 'startIndex' |
| +// in 'lines' and stopping just before 'nextIndex'; SIDE EFFECT: |
| +// output the simplified text to 'listSink' (the caller adds the hash) |
| +computeHashValue(lines, startIndex, nextIndex, listSink) { |
| + final hashEncoder = new SHA1(); |
| + final gatheredLine = gatherLines(lines, startIndex, nextIndex); |
| + final simplifiedLine = simplifyLine(gatheredLine); |
| + listSink.write(" % $simplifiedLine\n"); |
| + hashEncoder.add(encodeUtf8(simplifiedLine)); |
| + return hashEncoder.close(); |
| +} |
| + |
| +computeHashString(lines, startIndex, nextIndex, listSink) => |
| + CryptoUtils.bytesToHex(computeHashValue(lines, |
| + startIndex, |
| + nextIndex, |
| + listSink)); |
| + |
| +// Compute and add hashes to \LMHash{} lines (which must be on the |
| +// indices 'hashLineNumbers' of 'lines'), and emit the simplified |
| +// text and hash values to 'listSink' |
| +addHashMarks(lines, hashLineNumbers, listSink) { |
| + if (hashLineNumbers.length == 0) return lines; // noop |
| + for (var n = 0; n < hashLineNumbers.length - 1; n++) { |
| + final hashIndex = hashLineNumbers[n]; |
| + final nextIndex = hashLineNumbers[n + 1]; |
| + final hashValue = computeHashString(lines, |
| + hashIndex + 1, |
| + nextIndex, |
| + listSink); |
| + lines[hashIndex] = |
| + lines[hashIndex].replaceAll(hashMarkArgumentRE, "{" + hashValue + "}"); |
| + listSink.write(" $hashValue\n"); |
| + } |
| + |
| + final hashIndex = hashLineNumbers[hashLineNumbers.length - 1]; |
| + final nextIndex = lines.length; |
| + final hashValue = computeHashString(lines, |
| + hashIndex + 1, |
| + nextIndex, |
| + listSink); |
| + lines[hashIndex] = |
| + lines[hashIndex].replaceAll(hashMarkArgumentRE, "{" + hashValue + "}"); |
| + listSink.write(" $hashValue\n"); |
| + return lines; |
| +} |
| + |
| +// ---------------------------------------------------------------------- |
| +// Transformation of input file to output file |
| main ([args]) { |
| - if (args.length != 2) { |
| - print("Usage: addlatexhash.dart <input-file> <output-file>"); |
| - throw "Received ${args.length} arguments, expected two"; |
| + if (args.length != 3) { |
| + print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>"); |
| + throw "Received ${args.length} arguments, expected three"; |
| } |
| + // latex source |
|
Lasse Reichstein Nielsen
2014/10/28 10:12:12
Pedantry: It's "LaTeX" :)
(Yes, I also insist on A
eernst
2014/11/03 14:17:47
Done. ;-)
|
| var inputFile = new File(args[0]); |
| - var outputFile = new File(args[1]); |
| assert(inputFile.existsSync()); |
| - |
| var lines = inputFile.readAsLinesSync(); |
| + |
| + // latex source with 'normalized' spacing etc., and with hash values |
| + var outputFile = new File(args[1]); |
| + |
| + // hierarchical list of hash values |
| + var listFile = new File(args[2]); |
| + var listSink = listFile.openWrite(); |
| + |
| // single-line normalization |
| var inDartCode = false; |
| - var newLines = new List(); |
| + var normalizedLines = new List(); |
| for (var line in lines) { |
| if (sispIsDartBegin(line)) { |
| @@ -190,15 +359,20 @@ main ([args]) { |
| inDartCode = false; |
| } |
| if (inDartCode) { |
| - newLines.add(sispNormalize(line + "\n")); |
| + normalizedLines.add(sispNormalize(line + "\n")); |
| } else { |
| - newLines.add(normalize(line + "\n")); |
| + normalizedLines.add(normalize(line + "\n")); |
| } |
| } |
| // multi-line normalization |
| - newLines = multilineNormalize(newLines); |
| + normalizedLines = multilineNormalize(normalizedLines); |
| + |
| + // insertion of hash values |
| + var hashLineNumbers = findHashLineNumbers(normalizedLines); |
| + var hashMarkedLines = addHashMarks(normalizedLines,hashLineNumbers,listSink); |
| - // output result |
| - outputFile.writeAsStringSync(newLines.join()); |
| + // output |
| + outputFile.writeAsStringSync(hashMarkedLines.join()); |
| + listSink.close(); |
| } |