OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 // | |
5 // ---------------------------------------------------------------------- | |
6 // This is a very specialized tool which was created in order to support | |
7 // adding hash values used as location markers in the LaTeX source of the | |
8 // language specification. It is intended to take its input file as the | |
9 // first argument and the output file name as the second argument. From | |
10 // docs/language a typical usage would be as follows: | |
11 // | |
12 // dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex | |
13 // | |
14 // This will yield a normalized variant tmp.tex of the language | |
15 // specification with hash values filled in. For more details, please | |
16 // check the language specification source itself. | |
17 // | |
18 // NB: This utility assumes UN*X style line endings, \n, in the LaTeX | |
19 // source file received as input; it will not work with other styles. | |
20 // | |
21 // TODO: The current version does not fill in hash values, it only | |
22 // standardizes the LaTeX source by removing comments and normalizing | |
23 // white space. | |
24 | |
25 import 'dart:io'; | |
26 import 'dart:convert'; | |
27 import '../pkg/crypto/lib/crypto.dart'; | |
28 | |
29 // ---------------------------------------------------------------------- | |
30 // Normalization of the text, i.e., removal or normalization | |
31 // of elements that do not affect the output from latex | |
32 | |
33 final commentAllRe = new RegExp("^%"); | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Using a RegExp for this is overkill, just do strin
eernst
2014/10/15 13:19:27
Looks more meaningful with "RE". Done.
| |
34 final commentRe = new RegExp("[^\\\\]%[^\\n]*"); | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
I recommend using raw strings for RegExp sources:
eernst
2014/10/15 13:19:27
Indeed; added the final "*", done.
Lasse Reichstein Nielsen
2014/10/15 14:09:57
Also just noticed that as a regexp, "[^n]" is equi
eernst
2014/10/15 14:26:41
Cool! Done + Added a comment, just in case someon
| |
35 final whitespaceAllRe = new RegExp("^\\s+\$"); | |
36 final whitespaceLeadingRe = new RegExp("^\\s+[^\\n]"); | |
37 final whitespaceRe = new RegExp("[ \\t][ \\t]+"); | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Shorter regexp possible:
final whitespaceRE = ne
eernst
2014/10/15 13:19:27
Done.
| |
38 | |
39 // normalization steps | |
40 | |
41 cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) { | |
42 if (match == null) return line; | |
43 var start = match.start + startOffset; | |
44 var end = match.end + endOffset; | |
45 var len = line.length; | |
46 if (start < 0) start = 0; | |
47 if (end > len) end = len; | |
48 return line.substring(0, start) + glue + line.substring(end); | |
49 } | |
50 | |
51 cutRegexp(line, re, {startOffset: 0, endOffset: 0, glue: ""}) { | |
52 return cutMatch(line, re.firstMatch(line), | |
53 startOffset: startOffset, | |
54 endOffset: endOffset, | |
55 glue: glue); | |
56 } | |
57 | |
58 cutFromMatch(line, match, {offset: 0, glue: ""}) { | |
59 if (match == null) return line; | |
60 return line.substring(0, match.start + offset) + glue; | |
61 } | |
62 | |
63 cutFromRegexp(line, re, {offset: 0, glue: ""}) { | |
64 return cutFromMatch(line, re.firstMatch(line), offset: offset, glue: glue); | |
65 } | |
66 | |
67 isWsOnly(line) => whitespaceAllRe.firstMatch(line) != null; | |
68 isCommentOnly(line) => commentAllRe.firstMatch(line) != null; | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
This would be the non-regexp version: => line.star
eernst
2014/10/15 13:19:27
Arg null is treated also as an error by firstMatch
| |
69 | |
70 justEol(line) { | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
I'd prefer return types, and parameter types, in g
eernst
2014/10/15 13:19:27
I decided to use a typeless style in this particul
| |
71 if (line.length == 0) return line; | |
72 return line[line.length-1] == "\n" ? "\n" : ""; | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
return line.endsWith("\n") ? "\n" : "";
eernst
2014/10/15 13:19:27
Done.
| |
73 } | |
74 | |
75 stripComment(line) { | |
76 // NB: it is tempting to remove everything from the '%' onward, | |
77 // including the final newline, if any, but this does not work. | |
78 // The problem is that TeX will do exactly this, but then it will | |
79 // add back a character that depends on its state (S, M, or N), | |
80 // and it is tricky to maintain a similar state that matches the | |
81 // state of TeX faithfully. Hence, we remove the content of | |
82 // comments but do not remove the comments themselves, we just | |
83 // leave the '%' at the end of the line and let TeX manage its | |
84 // states in a way that does not differ from the file from before | |
85 // stripComment | |
86 if (isCommentOnly(line)) return "%\n"; | |
87 return cutRegexp(line, commentRe, startOffset: 2); | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Doesn't this lose the trailing '\n'?
eernst
2014/10/15 13:19:27
No, commentRE avoids matching a trailing '\n', so
| |
88 } | |
89 | |
90 // Reduce a wsOnly line to its eol, remove leading ws | |
91 // entirely, and reduce multiple ws chars to one | |
92 normalizeWhitespace(line) { | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
To remove leading WS (including empty lines), try:
eernst
2014/10/15 13:19:27
Done.
| |
93 if (isWsOnly(line)) return justEol(line); | |
94 line = cutRegexp(line, whitespaceLeadingRe, endOffset: -1); | |
95 var match; | |
96 while ((match = whitespaceRe.firstMatch(line)) != null) { | |
97 line = cutMatch(line, match, glue: " "); | |
98 } | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Maybe just:
line = line.replaceAll(whitespaceRe,
eernst
2014/10/15 13:19:27
Entire method much nicer now! Done.
| |
99 return line; | |
100 } | |
101 | |
102 // Reduce sequences of >1 wsOnly lines to 1, and sequences of >1 | |
103 // commentOnly lines to 1; moreover, treat commentOnly lines as | |
104 // wsOnly when occurring in wsOnly line blocks | |
105 multilineNormalize(lines) { | |
106 var afterBlankLines = false; // does 'line' follow >0 empty lines? | |
107 var afterCommentLines = false; // .. follow >0 commentOnly lines? | |
108 var newLines = new List(); | |
109 for (var line in lines) { | |
110 if (afterBlankLines && afterCommentLines) { | |
111 // can never happen | |
112 throw "Bug, please report to eernst@"; | |
113 } else if (afterBlankLines && !afterCommentLines) { | |
114 // at least one line before 'line' is wsOnly | |
115 if (!isWsOnly(line)) { | |
116 // blank line block ended | |
117 afterCommentLines = isCommentOnly(line); | |
118 // special case: it seems to be safe to remove commentOnly lines | |
119 // after wsOnly lines, so the TeX state must be predictably right; | |
120 // next line will then be afterCommentLines and be dropped, so | |
121 // we drop the entire comment block---which is very useful; we can | |
122 // also consider this comment line to be an empty line, such that | |
123 // subsequent empty lines can be considered to be in a block of | |
124 // empty lines; note that almost all variants of this will break.. | |
125 if (afterCommentLines) { | |
126 // _current_ 'line' is a commentOnly line here | |
127 afterBlankLines = true; | |
128 afterCommentLines = false; | |
129 // and do not add 'line' | |
130 } else { | |
131 // after blanks, but current 'line' is neither blank nor comment | |
132 afterBlankLines = false; | |
133 newLines.add(line); | |
134 } | |
135 } else { | |
136 // blank line block continues, do not add 'line' | |
137 } | |
138 } else if (!afterBlankLines && afterCommentLines) { | |
139 // at least one line before 'line' is commentOnly | |
140 if (!isCommentOnly(line)) { | |
141 // comment line block ended | |
142 afterBlankLines = isWsOnly(line); | |
143 afterCommentLines = false; | |
144 newLines.add(line); | |
145 } else { | |
146 // comment line block continues, do not add 'line' | |
147 } | |
148 } else { | |
149 assert(!afterBlankLines && !afterCommentLines); | |
150 // no wsOnly or commentOnly lines precede 'line' | |
151 if (isWsOnly(line)) afterBlankLines = true; | |
152 if (isCommentOnly(line)) afterCommentLines = true; | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Maybe:
if (isCommentOnly(line)) {
afterCOmmentL
eernst
2014/10/15 13:19:27
Actually that was because of the following pattern
| |
153 if (!afterCommentLines) newLines.add(line); | |
154 // else skipping commentOnly line after nonWs, nonComment text | |
155 } | |
156 } | |
157 return newLines; | |
158 } | |
159 | |
160 // select the elements in the pipeline | |
161 | |
162 normalize(line) => normalizeWhitespace(stripComment(line)); | |
163 | |
164 sispNormalize(line) => stripComment(line); | |
165 | |
166 // ---------------------------------------------------------------------- | |
167 // Managing fragments with significant spacing | |
168 | |
169 final dartCodeBeginRe = new RegExp("^\\s*\\\\begin{dartCode}"); | |
Lasse Reichstein Nielsen
2014/10/15 09:13:17
Need to escape '{' and '}' in RegExp:
new RegEx
eernst
2014/10/15 13:19:27
OK. Note that we get no exceptions for this one,
Lasse Reichstein Nielsen
2014/10/15 14:09:57
RegExp in browsers have traditionally been very fo
eernst
2014/10/15 14:26:41
What's the smart way to install a wakeup call to a
| |
170 final dartCodeEndRe = new RegExp ("^\\s*\\\\end{dartCode}"); | |
171 | |
172 sispIs(line, targetRe) { | |
173 return targetRe.firstMatch(line) != null; | |
174 } | |
175 | |
176 sispIsDartBegin(line) => sispIs(line, dartCodeBeginRe); | |
177 sispIsDartEnd(line) => sispIs(line, dartCodeEndRe); | |
178 | |
179 // ---------------------------------------------------------------------- | |
180 // main | |
181 | |
182 main ([args]) { | |
183 if (args.length != 2) { | |
184 print("Usage: addlatexhash.dart <input-file> <output-file>"); | |
185 throw "Received ${args.length} arguments, expected two"; | |
186 } | |
187 | |
188 var inputFile = new File(args[0]); | |
189 var outputFile = new File(args[1]); | |
190 assert(inputFile.existsSync()); | |
191 | |
192 var lines = inputFile.readAsLinesSync(); | |
193 // single-line normalization | |
194 var inDartCode = false; | |
195 var newLines = new List(); | |
196 | |
197 for (var line in lines) { | |
198 if (sispIsDartBegin(line)) { | |
199 inDartCode = true; | |
200 } else if (sispIsDartEnd(line)) { | |
201 inDartCode = false; | |
202 } | |
203 if (inDartCode) { | |
204 newLines.add(sispNormalize(line + "\n")); | |
205 } else { | |
206 newLines.add(normalize(line + "\n")); | |
207 } | |
208 } | |
209 | |
210 // multi-line normalization | |
211 newLines = multilineNormalize(newLines); | |
212 | |
213 // output result | |
214 outputFile.writeAsStringSync(newLines.join()); | |
215 } | |
OLD | NEW |