Index: tools/addlatexhash.dart |
diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart |
index f79a0b32296cffdb983eeb279a2930d41389b341..86802d8a6df7966a69fb203fba0b58762fa606c0 100644 |
--- a/tools/addlatexhash.dart |
+++ b/tools/addlatexhash.dart |
@@ -6,35 +6,36 @@ |
// This is a very specialized tool which was created in order to support |
// adding hash values used as location markers in the LaTeX source of the |
// language specification. It is intended to take its input file as the |
-// first argument and the output file name as the second argument. From |
-// docs/language a typical usage would be as follows: |
+// first argument, an output file name as the second argument, and a |
+// hash listing file name as the third argument. From docs/language a |
+// typical usage would be as follows: |
// |
-// dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex |
+// dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt |
ricow1
2014/10/27 10:08:03
hash.txt - what is this, windows 3.11? :-)
eernst
2014/11/03 14:17:46
What's wrong with that? ;-) The short file names
|
// |
-// This will yield a normalized variant tmp.tex of the language |
-// specification with hash values filled in. For more details, please |
-// check the language specification source itself. |
+// This will produce a normalized variant out.tex of the language |
+// specification with hash values filled in, and a listing hash.txt of |
+// all the hash values along with the label of their textual context |
+// (section, subsection, subsubsection, paragraph). For more details, |
+// please check the language specification source itself. |
// |
// NB: This utility assumes UN*X style line endings, \n, in the LaTeX |
// source file received as input; it will not work with other styles. |
-// |
-// TODO: The current version does not fill in hash values, it only |
-// standardizes the LaTeX source by removing comments and normalizing |
-// white space. |
import 'dart:io'; |
import 'dart:convert'; |
+import '../pkg/utf/lib/utf.dart'; |
import '../pkg/crypto/lib/crypto.dart'; |
-// Normalization of the text, i.e., removal or normalization |
-// of elements that do not affect the output from latex |
+// ---------------------------------------------------------------------- |
+// Normalization of the text: removal or normalization of parts that |
+// do not affect the output from latex, such as white space |
final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n |
final whitespaceAllRE = new RegExp(r"^\s+$"); |
final whitespaceRE = new RegExp(r"[ \t]{2,}"); |
-// normalization steps |
- |
+// Remove 'match'ing part of 'line', possibly with given offsets |
+// and inserting the given 'glue' to replace the match |
cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { |
if (match == null) return line; |
var start = match.start + startOffset; |
@@ -149,39 +150,207 @@ multilineNormalize(lines) { |
return newLines; |
} |
-// Selecting the elements in the pipeline |
+// Select the elements in the pipeline |
// Pipeline for ordinary lines (main uses it for every line that is not |
// inside a dartCode block): stripComment is applied first, and then |
// normalizeWhitespace is applied to its result. |
normalize(line) => normalizeWhitespace(stripComment(line)); |
// Pipeline for lines with significant spacing (main uses it while inside |
// a dartCode block): only stripComment is applied; normalizeWhitespace |
// is deliberately left out of this pipeline. |
sispNormalize(line) => stripComment(line); |
-// Managing fragments with significant spacing |
+// Manage fragments with significant spacing |
final dartCodeBeginRE = new RegExp(r"^\s*\\begin\{dartCode\}"); |
final dartCodeEndRE = new RegExp (r"^\s*\\end\{dartCode\}"); |
-sispIs(line, targetRE) { |
- return targetRE.firstMatch(line) != null; |
-} |
+// Recognize begin/end line of a Dart code block |
+sispIs(line, targetRE) => targetRE.firstMatch(line) != null; |
sispIsDartBegin(line) => sispIs(line, dartCodeBeginRE); |
sispIsDartEnd(line) => sispIs(line, dartCodeEndRE); |
-// Transform input file into output file |
+// ---------------------------------------------------------------------- |
+// Removal of non-normative elements of the text (rationale, commentary) |
+ |
+// A hash marker line: "\LMHash{..}" starting in the first column and |
+// followed by nothing but white space. The literal braces are escaped, |
+// as they are in dartCodeBeginRE and dartCodeEndRE. |
+final hashMarkRE = new RegExp(r"^\\LMHash\{.*\}\s*$"); |
+// Any of \section, \subsection, \subsubsection, \paragraph, anywhere in |
+// the line; such a command ends the block of lines that gets a hash. |
+final hashBlockTerminatorRE = new RegExp(r"\\((|sub(|sub))section|paragraph)"); |
+ |
+// Recognize begin/end line of each block of lines getting a hash value |
+isArg(argRE, line) => argRE.firstMatch(line) != null; |
Lasse Reichstein Nielsen
2014/10/28 10:12:12
=> line.contains(argRE);
Probably reduces to the
eernst
2014/11/03 14:17:46
Done.
|
+isHashMarker(line) => isArg(hashMarkRE, line); |
+isHashBlockTerminator(line) => isArg(hashBlockTerminatorRE, line); |
+ |
+// Return the indices of lines satisfying the given test |
+findLineNumbers(lines, test()) { |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Type of `test` is incorrect, it's typed to be null
eernst
2014/11/03 14:17:46
Interesting! In fact, having worked with types fo
|
+ var lineNumbers = new List(); |
+ var lineNumber = 0; |
+ for (var line in lines) { |
+ if (test(line)) lineNumbers.add(lineNumber); |
+ lineNumber++; |
+ } |
+ return lineNumbers; |
+} |
+ |
+findHashLineNumbers(lines) => findLineNumbers(lines, isHashMarker); |
+ |
+// Return 'line' without the "\\cmdName\s*{..}" command starting at |
ricow1
2014/10/27 10:08:03
use doc style comments for method comments (///) s
eernst
2014/11/03 14:17:46
Done.
|
+// 'startIndex'; note that it is assumed but not checked that 'line' |
+// contains "\\cmdName\s*{..", and note that the end of the {..} block |
+// is found via brace matching (i.e., nested {..} blocks are handled), |
+// but it may break if '{' is made an active character etc. |
+removeCommand(line, cmdName, startIndex) { |
+ const BACKSLASH = 92; // char code for '\\' |
+ const BRACE_BEGIN = 123; // char code for '{' |
+ const BRACE_END = 125; // char code for '}' |
Lasse Reichstein Nielsen
2014/10/28 10:12:12
Pedantry: Two spaces before '//' comments, and you
Lasse Reichstein Nielsen
2014/11/03 11:34:35
Obviously, to align it, it needs to be two *or mor
eernst
2014/11/03 14:17:46
Done.
|
+ |
+ var blockStartIndex = startIndex + cmdName.length + 1; |
+ while (blockStartIndex < line.length && |
+ line.codeUnitAt(blockStartIndex) != BRACE_BEGIN) { |
+ blockStartIndex++; |
+ } |
+ blockStartIndex++; |
+ if (blockStartIndex > line.length) { |
+ // caller's fault |
+ throw "Bug, please report to eernst@"; |
+ } |
+ // blockStartIndex just after '{' |
+ |
+ var afterEscape = false; // actually after '{' |
Lasse Reichstein Nielsen
2014/10/28 10:12:11
Two space before '//'. I think that is a general s
eernst
2014/11/03 14:17:47
Couldn't find it in the style guide, but done anyw
|
+ var braceLevel = 1; // number of '{' minus number of '}' seen |
+ |
+ for (var index = blockStartIndex; index < line.length; index++) { |
+ switch (line.codeUnitAt(index)) { |
+ case BRACE_BEGIN: |
+ if (afterEscape) afterEscape = false; else braceLevel++; break; |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
An if-with-an-else should always be put on multipl
eernst
2014/11/03 14:17:46
Done.
|
+ case BRACE_END: |
+ if (afterEscape) afterEscape = false; else braceLevel--; break; |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Consider checking if braceLevel goes negative.
Oth
eernst
2014/11/03 14:17:47
Actually, the latex command will be used on the so
|
+ case BACKSLASH: |
+ afterEscape = true; break; |
Lasse Reichstein Nielsen
2014/10/28 10:12:11
Newline after ';'
eernst
2014/11/03 14:17:46
Done. Presumably this does not apply to 'for'?
Lasse Reichstein Nielsen
2014/11/03 16:32:04
Not to 'for' no. It's a matter of "only one statem
|
+ default: |
+ afterEscape = false; |
+ } |
+ if (braceLevel == 0) { |
+ return line.substring(0, startIndex) + line.substring(index + 1); |
+ } |
+ } |
+ // removal failed; we consider this to mean that the input is ill-formed |
+ throw "Unmatched braces"; |
+} |
+ |
+// Start of a "\commentary{" resp. "\rationale{" command; white space is |
+// allowed between the command name and the opening brace. The literal |
+// brace is escaped, as it is in dartCodeBeginRE and dartCodeEndRE. |
+final commentaryRE = new RegExp(r"\\commentary\s*\{"); |
+final rationaleRE = new RegExp(r"\\rationale\s*\{"); |
+ |
+removeCommentary(line) { |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
You are actively removing commentary code from the
eernst
2014/11/03 14:17:46
That wouldn't work in this case, because the strin
|
+ var match = commentaryRE.firstMatch(line); |
+ if (match == null) return line; |
+ return removeCommentary(removeCommand(line, r"commentary", match.start)); |
+} |
+ |
+// Strip every "\rationale{..}" command from 'line': as long as |
+// rationaleRE still matches, the first match is removed by means of |
+// removeCommand; the text left over at the end is returned. |
+removeRationale(line) { |
+  var remaining = line; |
+  var hit = rationaleRE.firstMatch(remaining); |
+  while (hit != null) { |
+    remaining = removeCommand(remaining, r"rationale", hit.start); |
+    hit = rationaleRE.firstMatch(remaining); |
+  } |
+  return remaining; |
+} |
+ |
+// Reduce 'line' to the text that gets hashed: remove all commentary |
+// (removeCommentary), then all rationale (removeRationale), and finally |
+// apply normalizeWhitespace to what is left. |
+simplifyLine(line) => |
+    normalizeWhitespace(removeRationale(removeCommentary(line))); |
+ |
+// ---------------------------------------------------------------------- |
+// Recognition of line blocks, insertion of block hash into \LMHash{} |
+ |
+// The "{..}" argument of a \LMHash command, from the first '{' to the |
+// last '}' on the line. The literal braces are escaped, as they are in |
+// dartCodeBeginRE and dartCodeEndRE. |
+final hashMarkArgumentRE = new RegExp(r"\{.*\}"); |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Escape '{' characters.
eernst
2014/11/03 14:17:46
Done, here and in several other similar locations.
|
+ |
+// Pass 'line' through cutRegexp with commentRE and a start offset of 1, |
+// then drop trailing white space from the result. NOTE(review): the |
+// offset presumably keeps the non-backslash character that commentRE |
+// matches in front of '%' -- confirm against cutRegexp. |
+cleanupLine(line) { |
+  final uncommented = cutRegexp(line, commentRE, startOffset: 1); |
+  return uncommented.trimRight(); |
+} |
+ |
+gatherLines(lines, startIndex, nextIndex) { |
+ var gatheredLine = ""; |
+ var isFirst = true; |
+ for (var index = startIndex; index < nextIndex; index++) { |
+ var line = lines[index]; |
+ if (isHashBlockTerminator(line)) break; |
+ if (isFirst) { |
+ gatheredLine += cleanupLine(line); |
+ isFirst = false; |
+ } else { |
+ gatheredLine += " " + cleanupLine(line); |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
This will take time O(lines * chars-in-lines), so
Lasse Reichstein Nielsen
2014/10/31 13:52:39
Even more "functional":
lines.getRange(startInde
eernst
2014/11/03 14:17:46
Done, with some adjustments. Nice! ;)
|
+ } |
+ } |
+ return gatheredLine; |
+} |
+ |
+// Compute the hash value for the line block of 'lines' that starts at |
+// 'startIndex' and is gathered by gatherLines up to (not including) |
+// 'nextIndex': the gathered text is simplified by simplifyLine, encoded |
+// with encodeUtf8, and fed to a SHA1 hasher whose close() result is |
+// returned. SIDE EFFECT: the simplified text is written to 'listSink' |
+// as a line of the form " % <text>". |
+computeHashValue(lines, startIndex, nextIndex, listSink) { |
+  final text = simplifyLine(gatherLines(lines, startIndex, nextIndex)); |
+  listSink.write(" % $text\n"); |
+  final sha1 = new SHA1(); |
+  sha1.add(encodeUtf8(text)); |
+  return sha1.close(); |
+} |
+ |
+// Same as computeHashValue (including its side effect on 'listSink'), |
+// but with the result converted by CryptoUtils.bytesToHex. |
+computeHashString(lines, startIndex, nextIndex, listSink) { |
+  final digest = computeHashValue(lines, startIndex, nextIndex, listSink); |
+  return CryptoUtils.bytesToHex(digest); |
+} |
+ |
+// Compute and add hashes to \LMHash{} lines (which must be on the |
+// indices 'hashLineNumbers' of 'lines'), and emit the simplified |
+// text and hash values to 'listSink'. The block of a hash line starts |
+// on the line after it and is bounded by the next hash line, or by the |
+// end of 'lines' for the last hash line. 'lines' is updated in place |
+// and returned; with no hash lines at all it is returned unchanged. |
+addHashMarks(lines, hashLineNumbers, listSink) { |
+  final count = hashLineNumbers.length; |
+  for (var n = 0; n < count; n++) { |
+    final hashIndex = hashLineNumbers[n]; |
+    // the last hash line has no successor: its block runs to the end |
+    final nextIndex = |
+        (n + 1 < count) ? hashLineNumbers[n + 1] : lines.length; |
+    final hashValue = |
+        computeHashString(lines, hashIndex + 1, nextIndex, listSink); |
+    // replace the {..} argument of \LMHash with the fresh hash value |
+    lines[hashIndex] = |
+        lines[hashIndex].replaceAll(hashMarkArgumentRE, "{$hashValue}"); |
+    listSink.write(" $hashValue\n"); |
+  } |
+  return lines; |
+} |
+ |
+// ---------------------------------------------------------------------- |
+// Transformation of input file to output file |
main ([args]) { |
- if (args.length != 2) { |
- print("Usage: addlatexhash.dart <input-file> <output-file>"); |
- throw "Received ${args.length} arguments, expected two"; |
+ if (args.length != 3) { |
+ print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>"); |
+ throw "Received ${args.length} arguments, expected three"; |
} |
+ // latex source |
Lasse Reichstein Nielsen
2014/10/28 10:12:12
Pedantry: It's "LaTeX" :)
(Yes, I also insist on A
eernst
2014/11/03 14:17:47
Done. ;-)
|
var inputFile = new File(args[0]); |
- var outputFile = new File(args[1]); |
assert(inputFile.existsSync()); |
- |
var lines = inputFile.readAsLinesSync(); |
+ |
+ // latex source with 'normalized' spacing etc., and with hash values |
+ var outputFile = new File(args[1]); |
+ |
+ // hierarchical list of hash values |
+ var listFile = new File(args[2]); |
+ var listSink = listFile.openWrite(); |
+ |
// single-line normalization |
var inDartCode = false; |
- var newLines = new List(); |
+ var normalizedLines = new List(); |
for (var line in lines) { |
if (sispIsDartBegin(line)) { |
@@ -190,15 +359,20 @@ main ([args]) { |
inDartCode = false; |
} |
if (inDartCode) { |
- newLines.add(sispNormalize(line + "\n")); |
+ normalizedLines.add(sispNormalize(line + "\n")); |
} else { |
- newLines.add(normalize(line + "\n")); |
+ normalizedLines.add(normalize(line + "\n")); |
} |
} |
// multi-line normalization |
- newLines = multilineNormalize(newLines); |
+ normalizedLines = multilineNormalize(normalizedLines); |
+ |
+ // insertion of hash values |
+ var hashLineNumbers = findHashLineNumbers(normalizedLines); |
+ var hashMarkedLines = addHashMarks(normalizedLines,hashLineNumbers,listSink); |
- // output result |
- outputFile.writeAsStringSync(newLines.join()); |
+ // output |
+ outputFile.writeAsStringSync(hashMarkedLines.join()); |
+ listSink.close(); |
} |