#!/usr/bin/env dart
// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
//
// ----------------------------------------------------------------------
// This is a very specialized tool which was created in order to support
// adding hash values used as location markers in the LaTeX source of the
// language specification. It is intended to take its input file as the
// first argument, an output file name as the second argument, and a
// hash listing file name as the third argument. From docs/language a
// typical usage would be as follows:
//
//   dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt
//
// This will produce a normalized variant out.tex of the language
// specification with hash values filled in, and a listing hash.txt of
// all the hash values along with the label of their textual context
// (section, subsection, subsubsection, paragraph). For more details,
// please check the language specification source itself.
//
// NB: This utility assumes UN*X style line endings, \n, in the LaTeX
// source file received as input; it will not work with other styles.

import 'dart:convert';
import 'dart:io';

import '../pkg/crypto/lib/crypto.dart';
import '../pkg/utf/lib/utf.dart';

// ----------------------------------------------------------------------
// Normalization of the text: removal or normalization of parts that
// do not affect the output from latex, such as white space.

/// Matches one non-backslash character followed by '%' and the rest of
/// the line.  NB: '.' does not match '\n', so a trailing newline is never
/// part of the match.
final commentRE = new RegExp(r"[^\\]%.*");

/// Matches a non-empty string that consists of white space only.
final whitespaceAllRE = new RegExp(r"^\s+$");

/// Matches a run of two or more space and/or tab characters.
final whitespaceRE = new RegExp(r"[ \t]{2,}");

/// Removes the [match]ing part of [line] and inserts [glue] where text
/// was removed.
///
/// The removed span runs from `match.start + startOffset` up to, but not
/// including, `match.end + endOffset`.  A start below zero is raised to
/// zero and an end beyond the end of [line] is lowered to `line.length`;
/// no other adjustment is made.  If [match] is null then [line] is
/// returned unchanged.
cutMatch(line, match, {startOffset = 0, endOffset = 0, glue = ""}) {
  if (match == null) return line;
  var start = match.start + startOffset;
  var end = match.end + endOffset;
  var len = line.length;
  if (start < 0) start = 0;
  if (end > len) end = len;
  return line.substring(0, start) + glue + line.substring(end);
}

/// Removes the first match of [re] in [line] by passing it to [cutMatch]
/// together with [startOffset], [endOffset] and [glue].  If [re] does not
/// match then [line] is returned.
cutRegexp(line, re, {startOffset = 0, endOffset = 0, glue = ""}) {
  return cutMatch(line, re.firstMatch(line),
                  startOffset: startOffset,
                  endOffset: endOffset,
                  glue: glue);
}

/// Removes the rest of [line] starting from the beginning of the
/// given [match], adjusted by the given [offset], and appends [glue]
/// to what is kept.  If there is no match then [line] is returned.
cutFromMatch(line, match, {offset = 0, glue = ""}) {
  if (match == null) return line;
  return line.substring(0, match.start + offset) + glue;
}

/// Removes the rest of [line] from the first match of [re] by passing
/// that match to [cutFromMatch] together with [offset] and [glue].  If
/// [re] does not match then [line] is returned.
cutFromRegexp(line, re, {offset = 0, glue = ""}) {
  return cutFromMatch(line, re.firstMatch(line), offset: offset, glue: glue);
}

/// Returns true iff [line] is non-empty and consists of white space only.
isWsOnly(line) => whitespaceAllRE.hasMatch(line);

/// Returns true iff [line] starts with '%', i.e., the entire line is a
/// comment.
isCommentOnly(line) => line.startsWith("%");

/// Returns the end-of-line character at the end of [line], if any,
/// otherwise returns the empty string.
justEol(line) {
  return line.endsWith("\n") ? "\n" : "";
}

/// Removes the contents of the comment at the end of [line],
/// leaving the "%" in place.  If no comment is present,
/// returns [line].  A line that starts with '%' is returned as "%\n".
///
/// NB: it is tempting to remove everything from the '%' and out,
/// including the final newline, if any, but this does not work.
/// The problem is that TeX will do exactly this, but then it will
/// add back a character that depends on its state (S, M, or N),
/// and it is tricky to maintain a similar state that matches the
/// state of TeX faithfully.  Hence, we remove the content of
/// comments but do not remove the comments themselves, we just
/// leave the '%' at the end of the line and let TeX manage its
/// states in a way that does not differ from the file from before
/// stripComment.
stripComment(line) {
  if (isCommentOnly(line)) return "%\n";
  // The match starts one character before the '%'; skipping two
  // characters keeps both that character and the '%' itself.
  return cutRegexp(line, commentRE, startOffset: 2);
}

/// Reduces a white-space-only [line] to its eol character,
/// removes leading white space entirely, and reduces each run of
/// multiple space/tab characters to a single space.
normalizeWhitespace(line) {
  var trimLine = line.trimLeft();
  if (trimLine.isEmpty) return justEol(line);
  return trimLine.replaceAll(whitespaceRE, " ");
}

/// Returns a new list with the lines of [lines] after multi-line
/// normalization.
///
/// Sequences of >1 white-space-only lines are reduced to 1.
/// Comment-only lines are never added to the result: they are skipped
/// after ordinary text, skipped while a comment block continues, and
/// treated as white-space-only lines when they occur in a block of
/// white-space-only lines.
multilineNormalize(lines) {
  var afterBlankLines = false; // Does [line] succeed >0 empty lines?
  var afterCommentLines = false; // Does [line] succeed >0 commentOnly lines?
  var newLines = [];
  for (var line in lines) {
    if (afterBlankLines && afterCommentLines) {
      // Previous line was both blank and a comment: not possible.
      throw "Bug, please report to eernst@";
    } else if (afterBlankLines && !afterCommentLines) {
      // At least one line before [line] is wsOnly.
      if (!isWsOnly(line)) {
        // Blank line block ended.
        afterCommentLines = isCommentOnly(line);
        // Special case: It seems to be safe to remove commentOnly lines
        // after wsOnly lines, so the TeX state must be predictably right;
        // next line will then be afterCommentLines and be dropped, so
        // we drop the entire comment block---which is very useful.  We can
        // also consider this comment line to be an empty line, such that
        // subsequent empty lines can be considered to be in a block of
        // empty lines.  Note that almost all variants of this break.
        if (afterCommentLines) {
          // _Current_ [line] is a commentOnly here.
          afterBlankLines = true;
          afterCommentLines = false;
          // Omit addition of [line].
        } else {
          // After blanks, but current [line] is neither blank nor comment.
          afterBlankLines = false;
          newLines.add(line);
        }
      } else {
        // Blank line block continues, omit addition of [line].
      }
    } else if (!afterBlankLines && afterCommentLines) {
      // At least one line before [line] is commentOnly.
      if (!isCommentOnly(line)) {
        // Comment block ended.
        afterBlankLines = isWsOnly(line);
        afterCommentLines = false;
        newLines.add(line);
      } else {
        // Comment block continues, do not add [line].
      }
    } else {
      assert(!afterBlankLines && !afterCommentLines);
      // No wsOnly or commentOnly lines precede [line].
      afterBlankLines = isWsOnly(line);
      afterCommentLines = isCommentOnly(line);
      if (!afterCommentLines) {
        newLines.add(line);
      } else {
        // Skip commentOnly line after nonWs/nonComment text.
      }
    }
  }
  return newLines;
}

/// Selects the elements in the normalization pipeline: comment contents
/// are stripped first, then white space is normalized.
normalize(line) => normalizeWhitespace(stripComment(line));

/// Selects the elements in the significant-spacing block
/// normalization pipeline: only comment contents are stripped, white
/// space is left untouched.
sispNormalize(line) => stripComment(line);

// Managing fragments with significant spacing.

/// Matches a line that starts (after optional white space) with
/// '\begin{dartCode}'.
final dartCodeBeginRE = new RegExp(r"^\s*\\begin\s*\{dartCode\}");

/// Matches a line that starts (after optional white space) with
/// '\end{dartCode}'.
final dartCodeEndRE = new RegExp(r"^\s*\\end\s*\{dartCode\}");

/// Recognizes beginning of dartCode block.
sispIsDartBegin(line) => line.contains(dartCodeBeginRE);

/// Recognizes end of dartCode block.
sispIsDartEnd(line) => line.contains(dartCodeEndRE);

// ----------------------------------------------------------------------
// Analyzing the input to point out "interesting" lines

/// Returns the event information for [lines] as determined by the
/// given [analyzer].
///
/// `analyzer.analyze` is called once per line, in order.  It indicates
/// that a line is "uninteresting" by returning null (i.e., no events
/// here); every non-null result is added to the returned list.
findEvents(lines, analyzer) {
  var events = [];
  for (var line in lines) {
    var event = analyzer.analyze(line);
    if (event != null) events.add(event);
  }
  return events;
}

/// Returns a RegExp for recognizing a command occupying a line
/// of its own, given the part of the RegExp source that recognizes the
/// command name, [cmdNameRE].
lineCommandRE(cmdNameRE) =>
    new RegExp(r"^\s*\\" + cmdNameRE + r"\s*\{.*\}\s*$");

/// Matches the part of an \LMLabel line up to and including the '{'.
final hashLabelStartRE = new RegExp(r"^\s*\\LMLabel\s*\{");

/// Matches a '}' followed only by white space up to the end of the line.
final hashLabelEndRE = new RegExp(r"\}\s*$");

final hashMarkRE = lineCommandRE("LMHash");
final hashLabelRE = lineCommandRE("LMLabel");
final sectioningRE = lineCommandRE("((|sub(|sub))section|paragraph)");
final sectionRE = lineCommandRE("section");
final subsectionRE = lineCommandRE("subsection");
final subsubsectionRE = lineCommandRE("subsubsection");
final paragraphRE = lineCommandRE("paragraph");

/// Returns true iff [line] begins a block of lines that gets a hash value.
isHashMarker(line) => line.contains(hashMarkRE);

/// Returns true iff [line] defines a sectioning label.
isHashLabel(line) => line.contains(hashLabelRE);

/// Returns true iff [line] is a sectioning command resp. one of its
/// more specific forms; note that it is assumed that sectioning commands
/// do not contain a newline between the command name and the '{'.
isSectioningCommand(line) => line.contains(sectioningRE);
isSectionCommand(line) => line.contains(sectionRE);
isSubsectionCommand(line) => line.contains(subsectionRE);
isSubsubsectionCommand(line) => line.contains(subsubsectionRE);
isParagraphCommand(line) => line.contains(paragraphRE);

/// Returns true iff [line] does not end a block of lines that gets
/// a hash value.
isntHashBlockTerminator(line) => !isSectioningCommand(line);

/// Returns the label text part from [line], i.e., the text between the
/// '{' that follows '\LMLabel' and the final '}'.
///
/// Expects that isHashLabel(line) is true; throws an [ArgumentError] if
/// the start or the end of the label cannot be found in [line].
extractHashLabel(line) {
  var startMatch = hashLabelStartRE.firstMatch(line);
  var endMatch = hashLabelEndRE.firstMatch(line);
  // Checked explicitly (not with assert) so that a violated precondition
  // is also reported when assertions are disabled.
  if (startMatch == null || endMatch == null) {
    throw new ArgumentError("Not an \\LMLabel{..} line: $line");
  }
  return line.substring(startMatch.end, endMatch.start);
}

// Event classes: Keep track of relevant information about the LaTeX
// source code lines, such as where \LMHash and \LMLabel commands are
// used, and how they are embedded in the sectioning structure.

/// Abstract events, enabling us to [setEndLineNumber] on all events.
abstract class HashEvent {
  /// For events that have an endLineNumber, set it; otherwise ignore.
  /// The endLineNumber specifies the end of the block of lines
  /// associated with a given event, for event types concerned with
  /// blocks of lines rather than single lines.
  setEndLineNumber(n) {}

  /// Returns null except for \LMHash{} events, where it returns
  /// the startLineNumber.  This serves to specify a boundary because
  /// the preceding \LMHash{} block should stop before the line of
  /// this \LMHash{} command.  Note that hash blocks may stop earlier,
  /// because they cannot contain sectioning commands.
  getStartLineNumber() => null;
}

/// Event for an \LMHash{} line, i.e., the start of a hashed block.
class HashMarkerEvent extends HashEvent {
  // Line number of the \LMHash{} line itself; the block that gets
  // hashed starts on the line after it.
  var startLineNumber;

  // Highest possible number of first line after block that gets
  // hashed (where the next \LMHash{} occurs).  Note that this value
  // is not known initially (because that line has not yet been
  // reached), so [endLineNumber] will be initialized in a separate
  // scan.  Also note that the block may end earlier, because a block
  // ends if it would otherwise include a sectioning command.
  var endLineNumber;

  HashMarkerEvent(this.startLineNumber);

  @override
  setEndLineNumber(n) {
    endLineNumber = n;
  }

  @override
  getStartLineNumber() => startLineNumber;
}

/// Event for an \LMLabel{..} line; carries the prefixed label text.
class HashLabelEvent extends HashEvent {
  var labelText;
  HashLabelEvent(this.labelText);
}

/// Line-by-line analyzer that emits a [HashMarkerEvent] for each
/// \LMHash{} line and a [HashLabelEvent] for each \LMLabel{..} line,
/// while remembering the kind of the most recently seen sectioning
/// command so that label texts can be prefixed accordingly.
class HashAnalyzer {
  // List of kinds of pending (= most recently seen) sectioning command.
  // When updating this list, also update sectioningPrefix below.
  // The values must be pairwise distinct: they are used as case labels
  // in one switch, where only the first of two equal labels is reachable.
  static const PENDING_IS_NONE = 0;
  static const PENDING_IS_SECTION = 1;
  static const PENDING_IS_SUBSECTION = 2;
  static const PENDING_IS_SUBSUBSECTION = 3;
  static const PENDING_IS_PARAGRAPH = 4;

  // Zero-based number of the next line to be passed to [analyze].
  var lineNumber = 0;
  var pendingSectioning = PENDING_IS_NONE;

  HashAnalyzer();

  setPendingToSection() {
    pendingSectioning = PENDING_IS_SECTION;
  }

  setPendingToSubsection() {
    pendingSectioning = PENDING_IS_SUBSECTION;
  }

  setPendingToSubsubsection() {
    pendingSectioning = PENDING_IS_SUBSUBSECTION;
  }

  setPendingToParagraph() {
    pendingSectioning = PENDING_IS_PARAGRAPH;
  }

  clearPending() {
    pendingSectioning = PENDING_IS_NONE;
  }

  /// Returns the label prefix for the pending sectioning command.
  /// Throws if no sectioning command has been seen yet.
  sectioningPrefix() {
    switch (pendingSectioning) {
      case PENDING_IS_SECTION: return "sec:";
      case PENDING_IS_SUBSECTION: return "subsec:";
      case PENDING_IS_SUBSUBSECTION: return "subsubsec:";
      case PENDING_IS_PARAGRAPH: return "par:";
      case PENDING_IS_NONE:
        throw
            "\\LMHash{..} should only be used after a sectioning command " +
            "(\\section, \\subsection, \\subsubsection, \\paragraph)";
      default:
        // set of PENDING_IS_.. was extended, but updates here omitted
        throw "Bug, please report to eernst@";
    }
  }

  /// Returns the event for [line], or null if [line] yields no event.
  /// Every call advances [lineNumber] by one; sectioning command lines
  /// yield no event but update the pending sectioning kind.
  analyze(line) {
    var currentLineNumber = lineNumber++;
    if (isHashMarker(line)) {
      return new HashMarkerEvent(currentLineNumber);
    } else if (isHashLabel(line)) {
      var labelText = sectioningPrefix() + extractHashLabel(line);
      return new HashLabelEvent(labelText);
    } else {
      // No events to emit, but we may need to note state changes.
      if (isSectionCommand(line)) {
        setPendingToSection();
      } else if (isSubsectionCommand(line)) {
        setPendingToSubsection();
      } else if (isSubsubsectionCommand(line)) {
        setPendingToSubsubsection();
      } else if (isParagraphCommand(line)) {
        setPendingToParagraph();
      } else {
        // No state changes.
      }
      return null;
    }
  }
}

/// Returns the list of hash events for [lines], with the end line
/// number of each event filled in.
///
/// The events are scanned from last to first: each event gets the start
/// line number of the nearest following \LMHash{} event as its end line
/// number, or `lines.length` if no \LMHash{} event follows it.
findHashEvents(lines) {
  // Create the list of events, omitting endLineNumbers.
  var events = findEvents(lines, new HashAnalyzer());
  // Set the endLineNumbers.
  var currentEndLineNumber = lines.length;
  for (var event in events.reversed) {
    event.setEndLineNumber(currentEndLineNumber);
    var nextEndLineNumber = event.getStartLineNumber();
    if (nextEndLineNumber != null) currentEndLineNumber = nextEndLineNumber;
  }
  return events;
}

// ----------------------------------------------------------------------
// Removal of non-normative elements of the text (rationale, commentary).

/// Returns [line] without the command [cmdName] (based on a match
/// on "\\cmdName\s*{..}") starting at [startIndex]; note that it is
/// assumed but not checked that [line] contains "\\cmdName\s*{..",
/// and note that the end of the {..} block is found via brace matching
/// (i.e., nested {..} blocks are handled), but it may break if '{' is
/// made an active character etc.etc.
///
/// Braces preceded by an escaping backslash ('\{', '\}') are not
/// counted.  A backslash that is itself escaped ('\\') does not escape
/// the character after it.  Throws "Unmatched braces" if the block is
/// not closed within [line].
removeCommand(line, cmdName, startIndex) {
  const BACKSLASH = 92; // char code for '\\'.
  const BRACE_BEGIN = 123; // char code for '{'.
  const BRACE_END = 125; // char code for '}'.

  // Skip the backslash and the command name, then scan to the '{'.
  var blockStartIndex = startIndex + cmdName.length + 1;
  while (blockStartIndex < line.length &&
         line.codeUnitAt(blockStartIndex) != BRACE_BEGIN) {
    blockStartIndex++;
  }
  blockStartIndex++;
  if (blockStartIndex > line.length) {
    // No '{' found at all, contrary to the stated assumption.
    throw "Bug, please report to eernst@";
  }
  // [blockStartIndex] has index just after '{'.

  var afterEscape = false; // Is true iff [index] is just after an
                           // escaping backslash.
  var braceLevel = 1; // Have seen so many '{'s minus so many '}'s.

  for (var index = blockStartIndex; index < line.length; index++) {
    switch (line.codeUnitAt(index)) {
      case BRACE_BEGIN:
        if (afterEscape) {
          afterEscape = false;
        } else {
          braceLevel++;
        }
        break;
      case BRACE_END:
        if (afterEscape) {
          afterEscape = false;
        } else {
          braceLevel--;
        }
        break;
      case BACKSLASH:
        // The second backslash of '\\' completes that command, so it
        // must not escape whatever follows it.
        afterEscape = !afterEscape;
        break;
      default:
        afterEscape = false;
    }
    if (braceLevel == 0) {
      return line.substring(0, startIndex) + line.substring(index + 1);
    }
  }
  // Removal failed; we consider this to mean that the input is ill-formed.
  throw "Unmatched braces";
}

/// Matches the start of a '\commentary{' command.
final commentaryRE = new RegExp(r"\\commentary\s*\{");

/// Matches the start of a '\rationale{' command.
final rationaleRE = new RegExp(r"\\rationale\s*\{");

/// Removes {}-balanced '\commentary{..}' commands from [line].
removeCommentary(line) {
  // Remove one occurrence at a time until none is left.
  var match = commentaryRE.firstMatch(line);
  while (match != null) {
    line = removeCommand(line, r"commentary", match.start);
    match = commentaryRE.firstMatch(line);
  }
  return line;
}

/// Removes {}-balanced '\rationale{..}' commands from [line].
removeRationale(line) {
  // Remove one occurrence at a time until none is left.
  var match = rationaleRE.firstMatch(line);
  while (match != null) {
    line = removeCommand(line, r"rationale", match.start);
    match = rationaleRE.firstMatch(line);
  }
  return line;
}

/// Removes {}-balanced '\commentary{..}' and '\rationale{..}'
/// commands from [line], then normalizes its white-space.
simplifyLine(line) {
  var simplerLine = removeCommentary(line);
  simplerLine = removeRationale(simplerLine);
  simplerLine = normalizeWhitespace(simplerLine);
  return simplerLine;
}

// ----------------------------------------------------------------------
// Recognition of line blocks, insertion of block hash into \LMHash{}.

/// Matches a '{..}' argument: from the first '{' to the last '}' on
/// the line.
final latexArgumentRE = new RegExp(r"\{.*\}");

/// Returns [line] with its comment (the '%' and everything after it,
/// up to the end of the line) cut out and trailing white space removed.
cleanupLine(line) => cutRegexp(line, commentRE, startOffset: 1).trimRight();

/// Returns concatenation of all lines from [startIndex] in [lines] until
/// a hash block terminator is encountered or [nextIndex] reached (if so,
/// the line lines[nextIndex] itself is not included); each line is cleaned
/// up using [cleanupLine], and " " is inserted between the lines gathered.
gatherLines(lines, startIndex, nextIndex) =>
    lines.getRange(startIndex, nextIndex)
        .takeWhile(isntHashBlockTerminator)
        .map(cleanupLine)
        .join(" ");

/// Computes the hash value for the line block starting at [startIndex]
/// in [lines], stopping just before [nextIndex].  SIDE EFFECT:
/// Outputs the simplified text to [listSink] as a ' % <text>' line.
computeHashValue(lines, startIndex, nextIndex, listSink) {
  final hashEncoder = new SHA1();
  final gatheredLine = gatherLines(lines, startIndex, nextIndex);
  final simplifiedLine = simplifyLine(gatheredLine);
  listSink.write(" % $simplifiedLine\n");
  hashEncoder.add(encodeUtf8(simplifiedLine));
  return hashEncoder.close();
}

/// Returns the result of [computeHashValue] for the same arguments,
/// converted to a hex string.  Has the same side effect on [listSink].
computeHashString(lines, startIndex, nextIndex, listSink) =>
    CryptoUtils.bytesToHex(computeHashValue(lines,
                                            startIndex,
                                            nextIndex,
                                            listSink));

/// Computes and adds hashes to \LMHash{} lines in [lines] (which
/// must be on the line numbers specified in [hashEvents]), and emits
/// sectioning markers and hash values to [listSink], along with
/// "comments" containing the simplified text (using the format
/// ' % <text>', where the text is one, long line, for easy grepping
/// etc.).
///
/// SIDE EFFECT: the \LMHash{} entries of [lines] are replaced in place.
addHashMarks(lines, hashEvents, listSink) {
  for (var hashEvent in hashEvents) {
    if (hashEvent is HashMarkerEvent) {
      var start = hashEvent.startLineNumber;
      var end = hashEvent.endLineNumber;
      // The hashed block starts on the line after the \LMHash{} line.
      final hashValue = computeHashString(lines, start + 1, end, listSink);
      lines[start] =
          lines[start].replaceAll(latexArgumentRE, "{" + hashValue + "}");
      listSink.write(" $hashValue\n");
    } else if (hashEvent is HashLabelEvent) {
      listSink.write("${hashEvent.labelText}\n");
    }
  }
}

/// Transforms LaTeX input to LaTeX output plus hash value list file.
///
/// Expects exactly three arguments in [args]: the input file name, the
/// output file name and the hash list file name; prints a usage message
/// and throws otherwise.
main([args]) {
  if (args.length != 3) {
    print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>");
    throw "Received ${args.length} arguments, expected three";
  }

  // Get LaTeX source.
  var inputFile = new File(args[0]);
  assert(inputFile.existsSync());
  var lines = inputFile.readAsLinesSync();

  // Will hold LaTeX source with normalized spacing etc., plus hash values.
  var outputFile = new File(args[1]);

  // Will hold hierarchical list of hash values.
  var listFile = new File(args[2]);
  var listSink = listFile.openWrite();

  try {
    // Perform single-line normalization.
    var inDartCode = false;
    var normalizedLines = [];

    for (var line in lines) {
      if (sispIsDartBegin(line)) {
        inDartCode = true;
      } else if (sispIsDartEnd(line)) {
        inDartCode = false;
      }
      // readAsLinesSync strips line terminators; add "\n" back before
      // normalizing.
      if (inDartCode) {
        normalizedLines.add(sispNormalize(line + "\n"));
      } else {
        normalizedLines.add(normalize(line + "\n"));
      }
    }

    // Perform multi-line normalization.
    normalizedLines = multilineNormalize(normalizedLines);

    // Insert hash values.
    var hashEvents = findHashEvents(normalizedLines);
    addHashMarks(normalizedLines, hashEvents, listSink);

    // Produce/finalize output.
    outputFile.writeAsStringSync(normalizedLines.join());
  } finally {
    // Close the list file even if processing failed part-way.
    listSink.close();
  }
}