OLD | NEW |
---|---|
1 #!/usr/bin/env dart | |
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file | 2 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 3 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 4 // BSD-style license that can be found in the LICENSE file. |
4 // | 5 // |
5 // ---------------------------------------------------------------------- | 6 // ---------------------------------------------------------------------- |
6 // This is a very specialized tool which was created in order to support | 7 // This is a very specialized tool which was created in order to support |
7 // adding hash values used as location markers in the LaTeX source of the | 8 // adding hash values used as location markers in the LaTeX source of the |
8 // language specification. It is intended to take its input file as the | 9 // language specification. It is intended to take its input file as the |
9 // first argument and the output file name as the second argument. From | 10 // first argument, an output file name as the second argument, and a |
10 // docs/language a typical usage would be as follows: | 11 // hash listing file name as the third argument. From docs/language a |
12 // typical usage would be as follows: | |
11 // | 13 // |
12 // dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex | 14 // dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt |
13 // | 15 // |
14 // This will yield a normalized variant tmp.tex of the language | 16 // This will produce a normalized variant out.tex of the language |
15 // specification with hash values filled in. For more details, please | 17 // specification with hash values filled in, and a listing hash.txt of |
16 // check the language specification source itself. | 18 // all the hash values along with the label of their textual context |
19 // (section, subsection, subsubsection, paragraph). For more details, | |
20 // please check the language specification source itself. | |
17 // | 21 // |
18 // NB: This utility assumes UN*X style line endings, \n, in the LaTeX | 22 // NB: This utility assumes UN*X style line endings, \n, in the LaTeX |
19 // source file receieved as input; it will not work with other styles. | 23 // source file received as input; it will not work with other styles. |
20 // | |
21 // TODO: The current version does not fill in hash values, it only | |
22 // standardizes the LaTeX source by removing comments and normalizing | |
23 // white space. | |
24 | 24 |
25 import 'dart:io'; | 25 import 'dart:io'; |
26 import 'dart:convert'; | 26 import 'dart:convert'; |
27 import '../pkg/utf/lib/utf.dart'; | |
27 import '../pkg/crypto/lib/crypto.dart'; | 28 import '../pkg/crypto/lib/crypto.dart'; |
28 | 29 |
29 // Normalization of the text, i.e., removal or normalization | 30 // ---------------------------------------------------------------------- |
30 // of elements that do not affect the output from latex | 31 // Normalization of the text: removal or normalization of parts that |
32 // do not affect the output from latex, such as white space. | |
31 | 33 |
32 final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n | 34 final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n. |
33 final whitespaceAllRE = new RegExp(r"^\s+$"); | 35 final whitespaceAllRE = new RegExp(r"^\s+$"); |
34 final whitespaceRE = new RegExp(r"[ \t]{2,}"); | 36 final whitespaceRE = new RegExp(r"[ \t]{2,}"); |
Lasse Reichstein Nielsen
2014/11/11 08:13:48
Why is one "whitespace" using \s and the other [ \
eernst
2014/11/11 09:04:54
That's indeed wrong --- the intention is to match
Lasse Reichstein Nielsen
2014/11/11 09:13:23
There is no special recommendation in a RegExp con
| |
35 | 37 |
36 // normalization steps | 38 /// Removes [match]ing part of [line], adjusting that part with the |
37 | 39 /// given [startOffset] and [endOffset], bounded to be valid indices |
40 /// into the string if needed, then inserts [glue] where text was | |
41 /// removed. If there is no match then [line] is returned. | |
38 cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { | 42 cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { |
39 if (match == null) return line; | 43 if (match == null) return line; |
40 var start = match.start + startOffset; | 44 var start = match.start + startOffset; |
41 var end = match.end + endOffset; | 45 var end = match.end + endOffset; |
42 var len = line.length; | 46 var len = line.length; |
43 if (start < 0) start = 0; | 47 if (start < 0) start = 0; |
44 if (end > len) end = len; | 48 if (end > len) end = len; |
45 return line.substring(0, start) + glue + line.substring(end); | 49 return line.substring(0, start) + glue + line.substring(end); |
46 } | 50 } |
47 | 51 |
48 cutRegexp(line, re, {startOffset: 0, endOffset: 0, glue: ""}) { | 52 cutRegexp(line, re, {startOffset: 0, endOffset: 0, glue: ""}) { |
49 return cutMatch(line, re.firstMatch(line), | 53 return cutMatch(line, re.firstMatch(line), |
50 startOffset: startOffset, | 54 startOffset: startOffset, |
51 endOffset: endOffset, | 55 endOffset: endOffset, |
52 glue: glue); | 56 glue: glue); |
53 } | 57 } |
54 | 58 |
59 /// Removes the rest of [line] starting from the beginning of the | |
60 /// given [match], and adjusting with the given [offset]. If there | |
61 /// is no match then [line] is returned. | |
55 cutFromMatch(line, match, {offset: 0, glue: ""}) { | 62 cutFromMatch(line, match, {offset: 0, glue: ""}) { |
56 if (match == null) return line; | 63 if (match == null) return line; |
57 return line.substring(0, match.start + offset) + glue; | 64 return line.substring(0, match.start + offset) + glue; |
58 } | 65 } |
59 | 66 |
60 cutFromRegexp(line, re, {offset: 0, glue: ""}) { | 67 cutFromRegexp(line, re, {offset: 0, glue: ""}) { |
61 return cutFromMatch(line, re.firstMatch(line), offset: offset, glue: glue); | 68 return cutFromMatch(line, re.firstMatch(line), offset: offset, glue: glue); |
62 } | 69 } |
63 | 70 |
64 isWsOnly(line) => whitespaceAllRE.firstMatch(line) != null; | 71 isWsOnly(line) => whitespaceAllRE.firstMatch(line) != null; |
65 isCommentOnly(line) => line.startsWith("%"); | 72 isCommentOnly(line) => line.startsWith("%"); |
66 | 73 |
74 /// Returns the end-of-line character at the end of [line], if any, | |
75 /// otherwise returns the empty string. | |
67 justEol(line) { | 76 justEol(line) { |
68 return line.endsWith("\n") ? "\n" : ""; | 77 return line.endsWith("\n") ? "\n" : ""; |
69 } | 78 } |
70 | 79 |
80 /// Removes the contents of the comment at the end of [line], | |
81 /// leaving the "%" in place. If no comment is present, | |
82 /// return [line]. | |
83 /// | |
84 /// NB: it is tempting to remove everything from the '%' and out, | |
85 /// including the final newline, if any, but this does not work. | |
86 /// The problem is that TeX will do exactly this, but then it will | |
87 /// add back a character that depends on its state (S, M, or N), | |
88 /// and it is tricky to maintain a similar state that matches the | |
89 /// state of TeX faithfully. Hence, we remove the content of | |
90 /// comments but do not remove the comments themselves, we just | |
91 /// leave the '%' at the end of the line and let TeX manage its | |
92 /// states in a way that does not differ from the file from before | |
93 /// stripComment. | |
71 stripComment(line) { | 94 stripComment(line) { |
72 // NB: it is tempting to remove everything from the '%' and out, | |
73 // including the final newline, if any, but this does not work. | |
74 // The problem is that TeX will do exactly this, but then it will | |
75 // add back a character that depends on its state (S, M, or N), | |
76 // and it is tricky to maintain a similar state that matches the | |
77 // state of TeX faithfully. Hence, we remove the content of | |
78 // comments but do not remove the comments themselves, we just | |
79 // leave the '%' at the end of the line and let TeX manage its | |
80 // states in a way that does not differ from the file from before | |
81 // stripComment | |
82 if (isCommentOnly(line)) return "%\n"; | 95 if (isCommentOnly(line)) return "%\n"; |
83 return cutRegexp(line, commentRE, startOffset: 2); | 96 return cutRegexp(line, commentRE, startOffset: 2); |
84 } | 97 } |
85 | 98 |
86 // Reduce a wsOnly line to its eol, remove leading ws | 99 /// Reduces a white-space-only [line] to its eol character, |
87 // entirely, and reduce multiple ws chars to one | 100 /// removes leading ws entirely, and reduces multiple |
101 /// white-space chars to one. | |
88 normalizeWhitespace(line) { | 102 normalizeWhitespace(line) { |
89 var trimLine = line.trimLeft(); | 103 var trimLine = line.trimLeft(); |
90 if (trimLine.isEmpty) return justEol(line); | 104 if (trimLine.isEmpty) return justEol(line); |
91 return trimLine.replaceAll(whitespaceRE, " "); | 105 return trimLine.replaceAll(whitespaceRE, " "); |
92 } | 106 } |
93 | 107 |
94 // Reduce sequences of >1 wsOnly lines to 1, and sequences of >1 | 108 /// Reduces sequences of >1 white-space-only lines in [lines] to 1, |
95 // commentOnly lines to 1; moreover, treat commentOnly lines as | 109 /// and sequences of >1 comment-only lines to 1. Treats comment-only |
96 // wsOnly when occurring in wsOnly line blocks | 110 /// lines as white-space-only when they occur in white-space-only |
111 /// line blocks. | |
97 multilineNormalize(lines) { | 112 multilineNormalize(lines) { |
98 var afterBlankLines = false; // does 'line' succeed >0 empty lines? | 113 var afterBlankLines = false; // Does [line] succeed >0 empty lines? |
99 var afterCommentLines = false; // .. succeed >0 commentOnly lines? | 114 var afterCommentLines = false; // Does [line] succeed >0 commentOnly lines? |
100 var newLines = new List(); | 115 var newLines = new List(); |
101 for (var line in lines) { | 116 for (var line in lines) { |
102 if (afterBlankLines && afterCommentLines) { | 117 if (afterBlankLines && afterCommentLines) { |
103 // can never happen | 118 // Previous line was both blank and a comment: not possible. |
104 throw "Bug, please report to eernst@"; | 119 throw "Bug, please report to eernst@"; |
105 } else if (afterBlankLines && !afterCommentLines) { | 120 } else if (afterBlankLines && !afterCommentLines) { |
106 // at least one line before 'line' is wsOnly | 121 // At least one line before [line] is wsOnly. |
107 if (!isWsOnly(line)) { | 122 if (!isWsOnly(line)) { |
108 // blank line block ended | 123 // Blank line block ended. |
109 afterCommentLines = isCommentOnly(line); | 124 afterCommentLines = isCommentOnly(line); |
110 // special case: it seems to be safe to remove commentOnly lines | 125 // Special case: It seems to be safe to remove commentOnly lines |
111 // after wsOnly lines, so the TeX state must be predictably right; | 126 // after wsOnly lines, so the TeX state must be predictably right; |
112 // next line will then be afterCommentLines and be dropped, so | 127 // next line will then be afterCommentLines and be dropped, so |
113 // we drop the entire comment block---which is very useful; we can | 128 // we drop the entire comment block---which is very useful. We can |
114 // also consider this comment line to be an empty line, such that | 129 // also consider this comment line to be an empty line, such that |
115 // subsequent empty lines can be considered to be in a block of | 130 // subsequent empty lines can be considered to be in a block of |
116 // empty lines; note that almost all variants of this will break.. | 131 // empty lines. Note that almost all variants of this break. |
117 if (afterCommentLines) { | 132 if (afterCommentLines) { |
118 // _current_ 'line' a commentOnly here | 133 // _Current_ 'line' is a commentOnly here. |
119 afterBlankLines = true; | 134 afterBlankLines = true; |
120 afterCommentLines = false; | 135 afterCommentLines = false; |
121 // and do not add 'line' | 136 // Omit addition of [line]. |
122 } else { | 137 } else { |
123 // after blanks, but current 'line' is neither blank nor comment | 138 // After blanks, but current 'line' is neither blank nor comment. |
124 afterBlankLines = false; | 139 afterBlankLines = false; |
125 newLines.add(line); | 140 newLines.add(line); |
126 } | 141 } |
127 } else { | 142 } else { |
128 // blank line block continues, do not add 'line' | 143 // Blank line block continues, omit addition of [line]. |
129 } | 144 } |
130 } else if (!afterBlankLines && afterCommentLines) { | 145 } else if (!afterBlankLines && afterCommentLines) { |
131 // at least one line before 'line' is commentOnly | 146 // At least one line before [line] is commentOnly. |
132 if (!isCommentOnly(line)) { | 147 if (!isCommentOnly(line)) { |
133 // comment line block ended | 148 // Comment block ended. |
134 afterBlankLines = isWsOnly(line); | 149 afterBlankLines = isWsOnly(line); |
135 afterCommentLines = false; | 150 afterCommentLines = false; |
136 newLines.add(line); | 151 newLines.add(line); |
137 } else { | 152 } else { |
138 // comment line block continues, do not add 'line' | 153 // Comment block continues, do not add [line]. |
139 } | 154 } |
140 } else { | 155 } else { |
141 assert(!afterBlankLines && !afterCommentLines); | 156 assert(!afterBlankLines && !afterCommentLines); |
142 // no wsOnly or commentOnly lines preceed 'line' | 157 // No wsOnly or commentOnly lines precede [line]. |
143 afterBlankLines = isWsOnly(line); | 158 afterBlankLines = isWsOnly(line); |
144 afterCommentLines = isCommentOnly(line); | 159 afterCommentLines = isCommentOnly(line); |
145 if (!afterCommentLines) newLines.add(line); | 160 if (!afterCommentLines) { |
146 // else skipping commentOnly line after nonWs, nonComment text | 161 newLines.add(line); |
162 } else { | |
163 // skip commentOnly line after nonWs/nonComment text. | |
164 } | |
147 } | 165 } |
148 } | 166 } |
149 return newLines; | 167 return newLines; |
150 } | 168 } |
151 | 169 |
152 // Selecting the elements in the pipeline | 170 /// Selects the elements in the normalization pipeline. |
153 | |
154 normalize(line) => normalizeWhitespace(stripComment(line)); | 171 normalize(line) => normalizeWhitespace(stripComment(line)); |
172 | |
173 /// Selects the elements in the significant-spacing block | |
174 /// normalization pipeline. | |
155 sispNormalize(line) => stripComment(line); | 175 sispNormalize(line) => stripComment(line); |
156 | 176 |
157 // Managing fragments with significant spacing | 177 // Managing fragments with significant spacing. |
158 | 178 |
159 final dartCodeBeginRE = new RegExp(r"^\s*\\begin\{dartCode\}"); | 179 final dartCodeBeginRE = new RegExp(r"^\s*\\begin\s*\{dartCode\}"); |
160 final dartCodeEndRE = new RegExp (r"^\s*\\end\{dartCode\}"); | 180 final dartCodeEndRE = new RegExp (r"^\s*\\end\s*\{dartCode\}"); |
161 | 181 |
162 sispIs(line, targetRE) { | 182 /// Recognizes beginning of dartCode block. |
163 return targetRE.firstMatch(line) != null; | 183 sispIsDartBegin(line) => line.contains(dartCodeBeginRE); |
164 } | 184 |
165 | 185 /// Recognizes end of dartCode block. |
166 sispIsDartBegin(line) => sispIs(line, dartCodeBeginRE); | 186 sispIsDartEnd(line) => line.contains(dartCodeEndRE); |
167 sispIsDartEnd(line) => sispIs(line, dartCodeEndRE); | 187 |
168 | 188 // ---------------------------------------------------------------------- |
169 // Transform input file into output file | 189 // Analyzing the input to point out "interesting" lines |
170 | 190 |
191 /// Returns the event information for [lines] as determined by the | |
192 /// given [analyzer]. The method [analyzer.analyze] indicates that a | |
193 /// line is "uninteresting" by returning null (i.e., no events here), | |
194 /// and "interesting" lines may be characterized by [analysisFunc] via | |
195 /// the returned event object. | |
196 findEvents(lines, analyzer) { | |
197 var events = new List(); | |
198 for (var line in lines) { | |
199 var event = analyzer.analyze(line); | |
200 if (event != null) events.add(event); | |
201 } | |
202 return events; | |
203 } | |
204 | |
205 /// Returns RegExp text for recognizing a command occupying a line | |
206 /// of its own, given the part of the RegExp that recognizes the | |
207 /// command name, [cmdNameRE] | |
208 lineCommandRE(cmdNameRE) => | |
209 new RegExp(r"^\s*\\" + cmdNameRE + r"\s*\{.*\}\s*$"); | |
210 | |
211 final hashLabelStartRE = new RegExp(r"^\s*\\LMLabel\s*\{"); | |
212 final hashLabelEndRE = new RegExp(r"\}\s*$"); | |
213 | |
214 final hashMarkRE = lineCommandRE("LMHash"); | |
215 final hashLabelRE = lineCommandRE("LMLabel"); | |
216 final sectioningRE = lineCommandRE("((|sub(|sub))section|paragraph)"); | |
217 final sectionRE = lineCommandRE("section"); | |
218 final subsectionRE = lineCommandRE("subsection"); | |
219 final subsubsectionRE = lineCommandRE("subsubsection"); | |
220 final paragraphRE = lineCommandRE("paragraph"); | |
221 | |
222 /// Returns true iff [line] begins a block of lines that gets a hash value. | |
223 isHashMarker(line) => line.contains(hashMarkRE); | |
224 | |
225 /// Returns true iff [line] defines a sectioning label. | |
226 isHashLabel(line) => line.contains(hashLabelRE); | |
227 | |
228 /// Returns true iff [line] is a sectioning command resp. one of its | |
229 /// more specific forms; note that it is assumed that sectioning commands | |
230 /// do not contain a newline between the command name and the '{'. | |
231 isSectioningCommand(line) => line.contains(sectioningRE); | |
232 isSectionCommand(line) => line.contains(sectionRE); | |
233 isSubsectionCommand(line) => line.contains(subsectionRE); | |
234 isSubsubsectionCommand(line) => line.contains(subsubsectionRE); | |
235 isParagraphCommand(line) => line.contains(paragraphRE); | |
236 | |
237 /// Returns true iff [line] does not end a block of lines that gets | |
238 /// a hash value. | |
239 isntHashBlockTerminator(line) => !isSectioningCommand(line); | |
240 | |
241 /// Returns the label text part from [line], based on the assumption | |
242 /// that isHashLabel(line) returns true. | |
243 extractHashLabel(line) { | |
244 var startMatch = hashLabelStartRE.firstMatch(line); | |
245 var endMatch = hashLabelEndRE.firstMatch(line); | |
246 assert(startMatch != null && endMatch != null); | |
247 return line.substring(startMatch.end, endMatch.start); | |
248 } | |
249 | |
250 // Event classes: Keep track of relevant information about the LaTeX | |
251 // source code lines, such as where \LMHash and \LMLabel commands are | |
252 // used, and how they are embedded in the sectioning structure. | |
253 | |
254 /// Abstract events, enabling us to [setEndLineNumber] on all events. | |
255 abstract class HashEvent { | |
256 /// For events that have an endLineNumber, set it; otherwise ignore. | |
257 /// The endLineNumber specifies the end of the block of lines | |
258 /// associated with a given event, for event types concerned with | |
259 /// blocks of lines rather than single lines. | |
260 setEndLineNumber(n) {} | |
261 | |
262 /// Returns null except for \LMHash{} events, where it returns | |
263 /// the startLineNumber. This serves to specify a boundary because | |
264 /// the preceding \LMHash{} block should stop before the line of | |
265 /// this \LMHash{} command. Note that hash blocks may stop earlier, | |
266 /// because they cannot contain sectioning commands. | |
267 getStartLineNumber() => null; | |
268 } | |
269 | |
270 class HashMarkerEvent extends HashEvent { | |
271 | |
272 // Line number of first line in block that gets hashed. | |
273 var startLineNumber; | |
274 | |
275 // Highest possible number of first line after block that gets | |
276 // hashed (where the next \LMHash{} occurs). Note that this value | |
277 // is not known initially (because that line has not yet been | |
278 // reached), so [endLineNumber] will be initialized in a separate | |
279 // scan. Also note that the block may end earlier, because a block | |
280 // ends if it would otherwise include a sectioning command. | |
281 var endLineNumber; | |
282 | |
283 HashMarkerEvent(this.startLineNumber); | |
284 | |
285 setEndLineNumber(n) { endLineNumber = n; } | |
286 getStartLineNumber() => startLineNumber; | |
287 } | |
288 | |
289 class HashLabelEvent extends HashEvent { | |
290 var labelText; | |
291 HashLabelEvent(this.labelText); | |
292 } | |
293 | |
294 class HashAnalyzer { | |
295 // List of kinds of pending (= most recently seen) sectioning command. | |
296 // When updating this list, also update sectioningPrefix below. | |
297 static const PENDING_IS_NONE = 0; | |
298 static const PENDING_IS_SECTION = 1; | |
299 static const PENDING_IS_SUBSECTION = 2; | |
300 static const PENDING_IS_SUBSUBSECTION = 3; | |
301 static const PENDING_IS_PARAGRAPH = 4; | |
302 | |
303 var lineNumber = 0; | |
304 var pendingSectioning = PENDING_IS_NONE; | |
305 | |
306 HashAnalyzer(); | |
307 | |
308 setPendingToSection() { | |
309 pendingSectioning = PENDING_IS_SECTION; | |
310 } | |
311 | |
312 setPendingToSubsection() { | |
313 pendingSectioning = PENDING_IS_SUBSECTION; | |
314 } | |
315 | |
316 setPendingToSubsubsection() { | |
317 pendingSectioning = PENDING_IS_SUBSUBSECTION; | |
318 } | |
319 | |
320 setPendingToParagraph() { | |
321 pendingSectioning = PENDING_IS_PARAGRAPH; | |
322 } | |
323 | |
324 clearPending() { | |
325 pendingSectioning = PENDING_IS_NONE; | |
326 } | |
327 | |
328 sectioningPrefix() { | |
329 switch (pendingSectioning) { | |
330 case PENDING_IS_SECTION: return "sec:"; | |
331 case PENDING_IS_SUBSECTION: return "subsec:"; | |
332 case PENDING_IS_SUBSUBSECTION: return "subsubsec:"; | |
333 case PENDING_IS_PARAGRAPH: return "par:"; | |
334 case PENDING_IS_NONE: | |
335 throw | |
336 "\\LMHash{..} should only be used after a sectioning command " + | |
337 "(\\section, \\subsection, \\subsubsection, \\paragraph)"; | |
338 default: | |
339 // set of PENDING_IS_.. was extended, but updates here omitted | |
340 throw "Bug, please report to eernst@"; | |
341 } | |
342 } | |
343 | |
344 analyze(line) { | |
345 var currentLineNumber = lineNumber++; | |
346 if (isHashMarker(line)) { | |
347 return new HashMarkerEvent(currentLineNumber); | |
348 } else if (isHashLabel(line)) { | |
349 var labelText = sectioningPrefix() + extractHashLabel(line); | |
350 return new HashLabelEvent(labelText); | |
351 } else { | |
352 // No events to emit, but we may need to note state changes | |
353 if (isSectionCommand(line)) { | |
354 setPendingToSection(); | |
355 } else if (isSubsectionCommand(line)) { | |
356 setPendingToSubsection(); | |
357 } else if (isSubsubsectionCommand(line)) { | |
358 setPendingToSubsubsection(); | |
359 } else if (isParagraphCommand(line)) { | |
360 setPendingToParagraph(); | |
361 } else { | |
362 // No state changes. | |
363 } | |
364 return null; | |
365 } | |
366 } | |
367 } | |
368 | |
369 findHashEvents(lines) { | |
370 // Create the list of events, omitting endLineNumbers. | |
371 var events = findEvents(lines, new HashAnalyzer()); | |
372 // Set the endLineNumbers. | |
373 var currentEndLineNumber = lines.length; | |
374 for (var event in events.reversed) { | |
375 event.setEndLineNumber(currentEndLineNumber); | |
376 var nextEndLineNumber = event.getStartLineNumber(); | |
377 if (nextEndLineNumber != null) currentEndLineNumber = nextEndLineNumber; | |
378 } | |
379 return events; | |
380 } | |
381 | |
382 // ---------------------------------------------------------------------- | |
383 // Removal of non-normative elements of the text (rationale, commentary). | |
384 | |
385 /// Returns [line] without the command [cmdName] (based on a match | |
386 /// on "\\cmdName\s*{..}") starting at [startIndex]; note that it is | |
387 /// assumed but not checked that [line] contains "\\cmdName\s*{..", | |
388 /// and note that the end of the {..} block is found via brace matching | |
389 /// (i.e., nested {..} blocks are handled), but it may break if '{' is | |
390 /// made an active character, etc. | |
391 removeCommand(line, cmdName, startIndex) { | |
392 const BACKSLASH = 92; // char code for '\\'. | |
393 const BRACE_BEGIN = 123; // char code for '{'. | |
394 const BRACE_END = 125; // char code for '}'. | |
395 | |
396 var blockStartIndex = startIndex + cmdName.length + 1; | |
397 while (blockStartIndex < line.length && | |
398 line.codeUnitAt(blockStartIndex) != BRACE_BEGIN) { | |
399 blockStartIndex++; | |
400 } | |
401 blockStartIndex++; | |
402 if (blockStartIndex > line.length) { | |
403 throw "Bug, please report to eernst@"; | |
404 } | |
405 // [blockStartIndex] has index just after '{'. | |
406 | |
407 var afterEscape = false; // Is true iff [index] is just after a backslash. | |
408 var braceLevel = 1; // Have seen so many '{'s minus so many '}'s. | |
409 | |
410 for (var index = blockStartIndex; index < line.length; index++) { | |
411 switch (line.codeUnitAt(index)) { | |
412 case BRACE_BEGIN: | |
413 if (afterEscape) { | |
414 afterEscape = false; | |
415 } else { | |
416 braceLevel++; | |
417 } | |
418 break; | |
419 case BRACE_END: | |
420 if (afterEscape) { | |
421 afterEscape = false; | |
422 } else { | |
423 braceLevel--; | |
424 } | |
425 break; | |
426 case BACKSLASH: | |
427 afterEscape = true; | |
428 break; | |
429 default: | |
430 afterEscape = false; | |
431 } | |
432 if (braceLevel == 0) { | |
433 return line.substring(0, startIndex) + line.substring(index + 1); | |
434 } | |
435 } | |
436 // Removal failed; we consider this to mean that the input is ill-formed. | |
437 throw "Unmatched braces"; | |
438 } | |
439 | |
440 final commentaryRE = new RegExp(r"\\commentary\s*\{"); | |
441 final rationaleRE = new RegExp(r"\\rationale\s*\{"); | |
442 | |
443 /// Removes {}-balanced '\commentary{..}' commands from [line]. | |
444 removeCommentary(line) { | |
445 var match = commentaryRE.firstMatch(line); | |
446 if (match == null) return line; | |
447 return removeCommentary(removeCommand(line, r"commentary", match.start)); | |
448 } | |
449 | |
450 /// Removes {}-balanced '\rationale{..}' commands from [line]. | |
451 removeRationale(line) { | |
452 var match = rationaleRE.firstMatch(line); | |
453 if (match == null) return line; | |
454 return removeRationale(removeCommand(line, r"rationale", match.start)); | |
455 } | |
456 | |
457 /// Removes {}-balanced '\commentary{..}' and '\rationale{..}' | |
458 /// commands from [line], then normalizes its white-space. | |
459 simplifyLine(line) { | |
460 var simplerLine = removeCommentary(line); | |
461 simplerLine = removeRationale(simplerLine); | |
462 simplerLine = normalizeWhitespace(simplerLine); | |
463 return simplerLine; | |
464 } | |
465 | |
466 // ---------------------------------------------------------------------- | |
467 // Recognition of line blocks, insertion of block hash into \LMHash{}. | |
468 | |
469 final latexArgumentRE = new RegExp(r"\{.*\}"); | |
470 | |
471 cleanupLine(line) => cutRegexp(line, commentRE, startOffset: 1).trimRight(); | |
472 | |
473 /// Returns concatenation of all lines from [startIndex] in [lines] until | |
474 /// a hash block terminator is encountered or [nextIndex] reached (if so, | |
475 /// the line lines[nextIndex] itself is not included); each line is cleaned | |
476 /// up using [cleanupLine], and " " is inserted between the lines gathered. | |
477 gatherLines(lines, startIndex, nextIndex) => | |
478 lines.getRange(startIndex, nextIndex) | |
479 .takeWhile(isntHashBlockTerminator) | |
480 .map(cleanupLine) | |
481 .join(" "); | |
482 | |
483 /// Computes the hash value for the line block starting at [startIndex] | |
484 /// in [lines], stopping just before [nextIndex]. SIDE EFFECT: | |
485 /// Outputs the simplified text and its hash value to [listSink]. | |
486 computeHashValue(lines, startIndex, nextIndex, listSink) { | |
487 final hashEncoder = new SHA1(); | |
488 final gatheredLine = gatherLines(lines, startIndex, nextIndex); | |
489 final simplifiedLine = simplifyLine(gatheredLine); | |
490 listSink.write(" % $simplifiedLine\n"); | |
491 hashEncoder.add(encodeUtf8(simplifiedLine)); | |
492 return hashEncoder.close(); | |
493 } | |
494 | |
495 computeHashString(lines, startIndex, nextIndex, listSink) => | |
496 CryptoUtils.bytesToHex(computeHashValue(lines, | |
497 startIndex, | |
498 nextIndex, | |
499 listSink)); | |
500 | |
501 /// Computes and adds hashes to \LMHash{} lines in [lines] (which | |
502 /// must be on the line numbers specified in [hashEvents]), and emits | |
503 /// sectioning markers and hash values to [listSink], along with | |
504 /// "comments" containing the simplified text (using the format | |
505 /// ' % <text>', where the text is one, long line, for easy grepping | |
506 /// etc.). | |
507 addHashMarks(lines, hashEvents, listSink) { | |
508 for (var hashEvent in hashEvents) { | |
509 if (hashEvent is HashMarkerEvent) { | |
510 var start = hashEvent.startLineNumber; | |
511 var end = hashEvent.endLineNumber; | |
512 final hashValue = computeHashString(lines, start + 1, end, listSink); | |
513 lines[start] = | |
514 lines[start].replaceAll(latexArgumentRE, "{" + hashValue + "}"); | |
515 listSink.write(" $hashValue\n"); | |
516 } else if (hashEvent is HashLabelEvent) { | |
517 listSink.write("${hashEvent.labelText}\n"); | |
518 } | |
519 } | |
520 } | |
521 | |
522 /// Transforms LaTeX input to LaTeX output plus hash value list file. | |
171 main ([args]) { | 523 main ([args]) { |
172 if (args.length != 2) { | 524 if (args.length != 3) { |
173 print("Usage: addlatexhash.dart <input-file> <output-file>"); | 525 print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>"); |
174 throw "Received ${args.length} arguments, expected two"; | 526 throw "Received ${args.length} arguments, expected three"; |
175 } | 527 } |
176 | 528 |
529 // Get LaTeX source. | |
177 var inputFile = new File(args[0]); | 530 var inputFile = new File(args[0]); |
531 assert(inputFile.existsSync()); | |
532 var lines = inputFile.readAsLinesSync(); | |
533 | |
534 // Will hold LaTeX source with normalized spacing etc., plus hash values. | |
178 var outputFile = new File(args[1]); | 535 var outputFile = new File(args[1]); |
179 assert(inputFile.existsSync()); | 536 |
180 | 537 // Will hold hierarchical list of hash values. |
181 var lines = inputFile.readAsLinesSync(); | 538 var listFile = new File(args[2]); |
182 // single-line normalization | 539 var listSink = listFile.openWrite(); |
540 | |
541 // Perform single-line normalization. | |
183 var inDartCode = false; | 542 var inDartCode = false; |
184 var newLines = new List(); | 543 var normalizedLines = new List(); |
185 | 544 |
186 for (var line in lines) { | 545 for (var line in lines) { |
187 if (sispIsDartBegin(line)) { | 546 if (sispIsDartBegin(line)) { |
188 inDartCode = true; | 547 inDartCode = true; |
189 } else if (sispIsDartEnd(line)) { | 548 } else if (sispIsDartEnd(line)) { |
190 inDartCode = false; | 549 inDartCode = false; |
191 } | 550 } |
192 if (inDartCode) { | 551 if (inDartCode) { |
193 newLines.add(sispNormalize(line + "\n")); | 552 normalizedLines.add(sispNormalize(line + "\n")); |
194 } else { | 553 } else { |
195 newLines.add(normalize(line + "\n")); | 554 normalizedLines.add(normalize(line + "\n")); |
196 } | 555 } |
197 } | 556 } |
198 | 557 |
199 // multi-line normalization | 558 // Perform multi-line normalization. |
200 newLines = multilineNormalize(newLines); | 559 normalizedLines = multilineNormalize(normalizedLines); |
201 | 560 |
202 // output result | 561 // Insert hash values. |
203 outputFile.writeAsStringSync(newLines.join()); | 562 var hashEvents = findHashEvents(normalizedLines); |
563 addHashMarks(normalizedLines, hashEvents, listSink); | |
564 | |
565 // Produce/finalize output. | |
566 outputFile.writeAsStringSync(normalizedLines.join()); | |
567 listSink.close(); | |
204 } | 568 } |
OLD | NEW |