OLD | NEW |
---|---|
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 // | 4 // |
5 // ---------------------------------------------------------------------- | 5 // ---------------------------------------------------------------------- |
6 // This is a very specialized tool which was created in order to support | 6 // This is a very specialized tool which was created in order to support |
7 // adding hash values used as location markers in the LaTeX source of the | 7 // adding hash values used as location markers in the LaTeX source of the |
8 // language specification. It is intended to take its input file as the | 8 // language specification. It is intended to take its input file as the |
9 // first argument and the output file name as the second argument. From | 9 // first argument, an output file name as the second argument, and a |
10 // docs/language a typical usage would be as follows: | 10 // hash listing file name as the third argument. From docs/language a |
11 // typical usage would be as follows: | |
11 // | 12 // |
12 // dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex | 13 // dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt |
ricow1
2014/10/27 10:08:03
hash.txt - what is this, windows 3.11? :-)
eernst
2014/11/03 14:17:46
What's wrong with that? ;-) The short file names
| |
13 // | 14 // |
14 // This will yield a normalized variant tmp.tex of the language | 15 // This will produce a normalized variant out.tex of the language |
15 // specification with hash values filled in. For more details, please | 16 // specification with hash values filled in, and a listing hash.txt of |
16 // check the language specification source itself. | 17 // all the hash values along with the label of their textual context |
18 // (section, subsection, subsubsection, paragraph). For more details, | |
19 // please check the language specification source itself. | |
17 // | 20 // |
18 // NB: This utility assumes UN*X style line endings, \n, in the LaTeX | 21 // NB: This utility assumes UN*X style line endings, \n, in the LaTeX |
19 // source file received as input; it will not work with other styles. | 22 // source file received as input; it will not work with other styles. |
20 // | |
21 // TODO: The current version does not fill in hash values, it only | |
22 // standardizes the LaTeX source by removing comments and normalizing | |
23 // white space. | |
24 | 23 |
25 import 'dart:io'; | 24 import 'dart:io'; |
26 import 'dart:convert'; | 25 import 'dart:convert'; |
26 import '../pkg/utf/lib/utf.dart'; | |
27 import '../pkg/crypto/lib/crypto.dart'; | 27 import '../pkg/crypto/lib/crypto.dart'; |
28 | 28 |
29 // Normalization of the text, i.e., removal or normalization | 29 // ---------------------------------------------------------------------- |
30 // of elements that do not affect the output from latex | 30 // Normalization of the text: removal or normalization of parts that |
31 // do not affect the output from latex, such as white space | |
31 | 32 |
32 final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n | 33 final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n |
33 final whitespaceAllRE = new RegExp(r"^\s+$"); | 34 final whitespaceAllRE = new RegExp(r"^\s+$"); |
34 final whitespaceRE = new RegExp(r"[ \t]{2,}"); | 35 final whitespaceRE = new RegExp(r"[ \t]{2,}"); |
35 | 36 |
36 // normalization steps | 37 // Remove 'match'ing part of 'line', possibly with given offsets |
37 | 38 // and inserting the given 'glue' to replace the match |
38 cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { | 39 cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { |
39 if (match == null) return line; | 40 if (match == null) return line; |
40 var start = match.start + startOffset; | 41 var start = match.start + startOffset; |
41 var end = match.end + endOffset; | 42 var end = match.end + endOffset; |
42 var len = line.length; | 43 var len = line.length; |
43 if (start < 0) start = 0; | 44 if (start < 0) start = 0; |
44 if (end > len) end = len; | 45 if (end > len) end = len; |
45 return line.substring(0, start) + glue + line.substring(end); | 46 return line.substring(0, start) + glue + line.substring(end); |
46 } | 47 } |
47 | 48 |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
142 // no wsOnly or commentOnly lines precede 'line' | 143 // no wsOnly or commentOnly lines precede 'line' |
143 afterBlankLines = isWsOnly(line); | 144 afterBlankLines = isWsOnly(line); |
144 afterCommentLines = isCommentOnly(line); | 145 afterCommentLines = isCommentOnly(line); |
145 if (!afterCommentLines) newLines.add(line); | 146 if (!afterCommentLines) newLines.add(line); |
146 // else skipping commentOnly line after nonWs, nonComment text | 147 // else skipping commentOnly line after nonWs, nonComment text |
147 } | 148 } |
148 } | 149 } |
149 return newLines; | 150 return newLines; |
150 } | 151 } |
151 | 152 |
152 // Selecting the elements in the pipeline | 153 // Select the elements in the pipeline |
153 | 154 |
154 normalize(line) => normalizeWhitespace(stripComment(line)); | 155 normalize(line) => normalizeWhitespace(stripComment(line)); |
155 sispNormalize(line) => stripComment(line); | 156 sispNormalize(line) => stripComment(line); |
156 | 157 |
157 // Managing fragments with significant spacing | 158 // Manage fragments with significant spacing |
158 | 159 |
159 final dartCodeBeginRE = new RegExp(r"^\s*\\begin\{dartCode\}"); | 160 final dartCodeBeginRE = new RegExp(r"^\s*\\begin\{dartCode\}"); |
160 final dartCodeEndRE = new RegExp (r"^\s*\\end\{dartCode\}"); | 161 final dartCodeEndRE = new RegExp (r"^\s*\\end\{dartCode\}"); |
161 | 162 |
162 sispIs(line, targetRE) { | 163 // Recognize begin/end line of a Dart code block |
163 return targetRE.firstMatch(line) != null; | 164 |
164 } | 165 sispIs(line, targetRE) => targetRE.firstMatch(line) != null; |
165 | |
166 sispIsDartBegin(line) => sispIs(line, dartCodeBeginRE); | 166 sispIsDartBegin(line) => sispIs(line, dartCodeBeginRE); |
167 sispIsDartEnd(line) => sispIs(line, dartCodeEndRE); | 167 sispIsDartEnd(line) => sispIs(line, dartCodeEndRE); |
168 | 168 |
169 // Transform input file into output file | 169 // ---------------------------------------------------------------------- |
170 // Removal of non-normative elements of the text (rationale, commentary) | |
171 | |
172 final hashMarkRE = new RegExp(r"^\\LMHash{.*}\s*$"); | |
173 final hashBlockTerminatorRE = new RegExp(r"\\((|sub(|sub))section|paragraph)"); | |
174 | |
175 // Recognize begin/end line of each block of lines getting a hash value | |
176 isArg(argRE, line) => argRE.firstMatch(line) != null; | |
Lasse Reichstein Nielsen
2014/10/28 10:12:12
=> line.contains(argRE);
Probably reduces to the
eernst
2014/11/03 14:17:46
Done.
| |
// True iff 'line' matches hashMarkRE, i.e., it is an \LMHash{..} marker line
isHashMarker(line) {
  return hashMarkRE.firstMatch(line) != null;
}

// True iff 'line' contains a match of hashBlockTerminatorRE, i.e., a
// sectioning command that ends the current hash block
isHashBlockTerminator(line) {
  return hashBlockTerminatorRE.firstMatch(line) != null;
}
179 | |
180 // Return the indices of lines satisfying the given test | |
181 findLineNumbers(lines, test()) { | |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Type of `test` is incorrect, it's typed to be null
eernst
2014/11/03 14:17:46
Interesting! In fact, having worked with types fo
| |
182 var lineNumbers = new List(); | |
183 var lineNumber = 0; | |
184 for (var line in lines) { | |
185 if (test(line)) lineNumbers.add(lineNumber); | |
186 lineNumber++; | |
187 } | |
188 return lineNumbers; | |
189 } | |
190 | |
// Return the indices of the lines in 'lines' for which isHashMarker holds
findHashLineNumbers(lines) {
  var markerIndices = new List();
  var position = 0;
  for (var candidate in lines) {
    if (isHashMarker(candidate)) markerIndices.add(position);
    position++;
  }
  return markerIndices;
}
192 | |
193 // Return 'line' without the "\\cmdName\s*{..}" command starting at | |
ricow1
2014/10/27 10:08:03
use doc style comments for method comments (///) s
eernst
2014/11/03 14:17:46
Done.
| |
194 // 'startIndex'; note that it is assumed but not checked that 'line' | |
195 // contains "\\cmdType\s*{..", and note that the end of the {..} block | |
196 // is found via brace matching (i.e., nested {..} blocks are handled), | |
197 // but it may break if '{' is made an active character, etc. | |
198 removeCommand(line, cmdName, startIndex) { | |
199 const BACKSLASH = 92; // char code for '\\' | |
200 const BRACE_BEGIN = 123; // char code for '{' | |
201 const BRACE_END = 125; // char code for '}' | |
Lasse Reichstein Nielsen
2014/10/28 10:12:12
Pedantry: Two spaces before '//' comments, and you
Lasse Reichstein Nielsen
2014/11/03 11:34:35
Obviously, to align it, it needs to be two *or mor
eernst
2014/11/03 14:17:46
Done.
| |
202 | |
203 var blockStartIndex = startIndex + cmdName.length + 1; | |
204 while (blockStartIndex < line.length && | |
205 line.codeUnitAt(blockStartIndex) != BRACE_BEGIN) { | |
206 blockStartIndex++; | |
207 } | |
208 blockStartIndex++; | |
209 if (blockStartIndex > line.length) { | |
210 // caller's fault | |
211 throw "Bug, please report to eernst@"; | |
212 } | |
213 // blockStartIndex just after '{' | |
214 | |
215 var afterEscape = false; // actually after '{' | |
Lasse Reichstein Nielsen
2014/10/28 10:12:11
Two space before '//'. I think that is a general s
eernst
2014/11/03 14:17:47
Couldn't find it in the style guide, but done anyw
| |
216 var braceLevel = 1; // number of '{' minus number of '}' seen | |
217 | |
218 for (var index = blockStartIndex; index < line.length; index++) { | |
219 switch (line.codeUnitAt(index)) { | |
220 case BRACE_BEGIN: | |
221 if (afterEscape) afterEscape = false; else braceLevel++; break; | |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
An if-with-an-else should always be put on multipl
eernst
2014/11/03 14:17:46
Done.
| |
222 case BRACE_END: | |
223 if (afterEscape) afterEscape = false; else braceLevel--; break; | |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Consider checking if braceLevel goes negative.
Oth
eernst
2014/11/03 14:17:47
Actually, the latex command will be used on the so
| |
224 case BACKSLASH: | |
225 afterEscape = true; break; | |
Lasse Reichstein Nielsen
2014/10/28 10:12:11
Newline after ';'
eernst
2014/11/03 14:17:46
Done. Presumably this does not apply to 'for'?
Lasse Reichstein Nielsen
2014/11/03 16:32:04
Not to 'for' no. It's a matter of "only one statem
| |
226 default: | |
227 afterEscape = false; | |
228 } | |
229 if (braceLevel == 0) { | |
230 return line.substring(0, startIndex) + line.substring(index + 1); | |
231 } | |
232 } | |
233 // removal failed; we consider this to mean that the input is ill-formed | |
234 throw "Unmatched braces"; | |
235 } | |
236 | |
237 final commentaryRE = new RegExp(r"\\commentary\s*{"); | |
238 final rationaleRE = new RegExp(r"\\rationale\s*{"); | |
239 | |
240 removeCommentary(line) { | |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
You are actively removing commentary code from the
eernst
2014/11/03 14:17:46
That wouldn't work in this case, because the strin
| |
241 var match = commentaryRE.firstMatch(line); | |
242 if (match == null) return line; | |
243 return removeCommentary(removeCommand(line, r"commentary", match.start)); | |
244 } | |
245 | |
// Return 'line' with every "\rationale{..}" command removed; commands are
// removed one at a time, leftmost match of rationaleRE first, until no
// match remains
removeRationale(line) {
  var remaining = line;
  for (var hit = rationaleRE.firstMatch(remaining);
       hit != null;
       hit = rationaleRE.firstMatch(remaining)) {
    remaining = removeCommand(remaining, r"rationale", hit.start);
  }
  return remaining;
}
251 | |
// Strip commentary first, then rationale, from 'line', and finally
// normalize the white space of what is left
simplifyLine(line) =>
    normalizeWhitespace(removeRationale(removeCommentary(line)));
259 | |
260 // ---------------------------------------------------------------------- | |
261 // Recognition of line blocks, insertion of block hash into \LMHash{} | |
262 | |
263 final hashMarkArgumentRE = new RegExp(r"{.*}"); | |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
Escape '{' characters.
eernst
2014/11/03 14:17:46
Done, here and in several other similar locations.
| |
264 | |
265 cleanupLine(line) => cutRegexp(line, commentRE, startOffset: 1).trimRight(); | |
266 | |
267 gatherLines(lines, startIndex, nextIndex) { | |
268 var gatheredLine = ""; | |
269 var isFirst = true; | |
270 for (var index = startIndex; index < nextIndex; index++) { | |
271 var line = lines[index]; | |
272 if (isHashBlockTerminator(line)) break; | |
273 if (isFirst) { | |
274 gatheredLine += cleanupLine(line); | |
275 isFirst = false; | |
276 } else { | |
277 gatheredLine += " " + cleanupLine(line); | |
Lasse Reichstein Nielsen
2014/10/28 10:12:13
This will take time O(lines * chars-in-lines), so
Lasse Reichstein Nielsen
2014/10/31 13:52:39
Even more "functional":
lines.getRange(startInde
eernst
2014/11/03 14:17:46
Done, with some adjustments. Nice! ;)
| |
278 } | |
279 } | |
280 return gatheredLine; | |
281 } | |
282 | |
// Hash the block of lines in 'lines' from 'startIndex' up to (but not
// including) 'nextIndex': the lines are gathered into one string,
// simplified, UTF-8 encoded and fed to SHA1; the digest is returned.
// SIDE EFFECT: the simplified text is written to 'listSink' as a
// '%'-prefixed line
computeHashValue(lines, startIndex, nextIndex, listSink) {
  final blockText = simplifyLine(gatherLines(lines, startIndex, nextIndex));
  listSink.write(" % $blockText\n");
  final digester = new SHA1()..add(encodeUtf8(blockText));
  return digester.close();
}
294 | |
// Same as computeHashValue (including its output to 'listSink'), but
// return the hash as a hexadecimal string
computeHashString(lines, startIndex, nextIndex, listSink) {
  final digest = computeHashValue(lines, startIndex, nextIndex, listSink);
  return CryptoUtils.bytesToHex(digest);
}
300 | |
// Compute and add hashes to \LMHash{} lines (which must be on the
// indices 'hashLineNumbers' of 'lines'), and emit the simplified
// text and hash values to 'listSink'.
//
// Each marker at index h covers the lines h+1 .. next-1, where 'next'
// is the index of the following marker, or lines.length for the last
// marker. 'lines' is updated in place and also returned; when
// 'hashLineNumbers' is empty nothing is changed or written.
addHashMarks(lines, hashLineNumbers, listSink) {
  final markerCount = hashLineNumbers.length;
  for (var n = 0; n < markerCount; n++) {
    final hashIndex = hashLineNumbers[n];
    // the last block has no following marker and runs to the end of input
    final nextIndex =
        (n + 1 < markerCount) ? hashLineNumbers[n + 1] : lines.length;
    final hashValue = computeHashString(lines,
                                        hashIndex + 1,
                                        nextIndex,
                                        listSink);
    // fill the {..} argument of the marker line with the computed hash
    lines[hashIndex] =
        lines[hashIndex].replaceAll(hashMarkArgumentRE, "{" + hashValue + "}");
    listSink.write(" $hashValue\n");
  }
  return lines;
}
329 | |
330 // ---------------------------------------------------------------------- | |
331 // Transformation of input file to output file | |
170 | 332 |
171 main ([args]) { | 333 main ([args]) { |
172 if (args.length != 2) { | 334 if (args.length != 3) { |
173 print("Usage: addlatexhash.dart <input-file> <output-file>"); | 335 print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>"); |
174 throw "Received ${args.length} arguments, expected two"; | 336 throw "Received ${args.length} arguments, expected three"; |
175 } | 337 } |
176 | 338 |
339 // LaTeX source | |
Lasse Reichstein Nielsen
2014/10/28 10:12:12
Pedantry: It's "LaTeX" :)
(Yes, I also insist on A
eernst
2014/11/03 14:17:47
Done. ;-)
| |
177 var inputFile = new File(args[0]); | 340 var inputFile = new File(args[0]); |
341 assert(inputFile.existsSync()); | |
342 var lines = inputFile.readAsLinesSync(); | |
343 | |
344 // LaTeX source with 'normalized' spacing etc., and with hash values | |
178 var outputFile = new File(args[1]); | 345 var outputFile = new File(args[1]); |
179 assert(inputFile.existsSync()); | 346 |
180 | 347 // hierarchical list of hash values |
181 var lines = inputFile.readAsLinesSync(); | 348 var listFile = new File(args[2]); |
349 var listSink = listFile.openWrite(); | |
350 | |
182 // single-line normalization | 351 // single-line normalization |
183 var inDartCode = false; | 352 var inDartCode = false; |
184 var newLines = new List(); | 353 var normalizedLines = new List(); |
185 | 354 |
186 for (var line in lines) { | 355 for (var line in lines) { |
187 if (sispIsDartBegin(line)) { | 356 if (sispIsDartBegin(line)) { |
188 inDartCode = true; | 357 inDartCode = true; |
189 } else if (sispIsDartEnd(line)) { | 358 } else if (sispIsDartEnd(line)) { |
190 inDartCode = false; | 359 inDartCode = false; |
191 } | 360 } |
192 if (inDartCode) { | 361 if (inDartCode) { |
193 newLines.add(sispNormalize(line + "\n")); | 362 normalizedLines.add(sispNormalize(line + "\n")); |
194 } else { | 363 } else { |
195 newLines.add(normalize(line + "\n")); | 364 normalizedLines.add(normalize(line + "\n")); |
196 } | 365 } |
197 } | 366 } |
198 | 367 |
199 // multi-line normalization | 368 // multi-line normalization |
200 newLines = multilineNormalize(newLines); | 369 normalizedLines = multilineNormalize(normalizedLines); |
201 | 370 |
202 // output result | 371 // insertion of hash values |
203 outputFile.writeAsStringSync(newLines.join()); | 372 var hashLineNumbers = findHashLineNumbers(normalizedLines); |
373 var hashMarkedLines = addHashMarks(normalizedLines,hashLineNumbers,listSink); | |
374 | |
375 // output | |
376 outputFile.writeAsStringSync(hashMarkedLines.join()); | |
377 listSink.close(); | |
204 } | 378 } |
OLD | NEW |