tools/addlatexhash.dart - Issue 652993005: Working insertion of hash values; added a few labels in spec

Unified Diff: tools/addlatexhash.dart

Issue 652993005: Working insertion of hash values; added a few labels in spec (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: tools/addlatexhash.dart

diff --git a/tools/addlatexhash.dart b/tools/addlatexhash.dart

index f79a0b32296cffdb983eeb279a2930d41389b341..86802d8a6df7966a69fb203fba0b58762fa606c0 100644

--- a/tools/addlatexhash.dart

+++ b/tools/addlatexhash.dart

@@ -6,35 +6,36 @@

// This is a very specialized tool which was created in order to support

// adding hash values used as location markers in the LaTeX source of the

// language specification. It is intended to take its input file as the

-// first argument and the output file name as the second argument. From

-// docs/language a typical usage would be as follows:

+// first argument, an output file name as the second argument, and a

+// hash listing file name as the third argument. From docs/language a

+// typical usage would be as follows:

-// dart ../../tools/addlatexhash.dart dartLangSpec.tex tmp.tex

+// dart ../../tools/addlatexhash.dart dartLangSpec.tex out.tex hash.txt

ricow1 2014/10/27 10:08:03 hash.txt - what is this, windows 3.11? :-)

eernst 2014/11/03 14:17:46 What's wrong with that? ;-) The short file names

-// This will yield a normalized variant tmp.tex of the language

-// specification with hash values filled in. For more details, please

-// check the language specification source itself.

+// This will produce a normalized variant out.tex of the language

+// specification with hash values filled in, and a listing hash.txt of

+// all the hash values along with the label of their textual context

+// (section, subsection, subsubsection, paragraph). For more details,

+// please check the language specification source itself.

// NB: This utility assumes UN*X style line endings, \n, in the LaTeX

// source file received as input; it will not work with other styles.

-//

-// TODO: The current version does not fill in hash values, it only

-// standardizes the LaTeX source by removing comments and normalizing

-// white space.

import 'dart:io';

import 'dart:convert';

+import '../pkg/utf/lib/utf.dart';

import '../pkg/crypto/lib/crypto.dart';

-// Normalization of the text, i.e., removal or normalization

-// of elements that do not affect the output from latex

+// ----------------------------------------------------------------------

+// Normalization of the text: removal or normalization of parts that

+// do not affect the output from latex, such as white space

final commentRE = new RegExp(r"[^\\]%.*"); // NB: . does not match \n

final whitespaceAllRE = new RegExp(r"^\s+$");

final whitespaceRE = new RegExp(r"[ \t]{2,}");

-// normalization steps

+// Remove 'match'ing part of 'line', possibly with given offsets

+// and inserting the given 'glue' to replace the match

cutMatch(line, match, {startOffset: 0, endOffset: 0, glue: ""}) {

if (match == null) return line;

var start = match.start + startOffset;

@@ -149,39 +150,207 @@ multilineNormalize(lines) {

return newLines;

}

-// Selecting the elements in the pipeline

+// Select the elements in the pipeline

normalize(line) => normalizeWhitespace(stripComment(line));

sispNormalize(line) => stripComment(line);

-// Managing fragments with significant spacing

+// Manage fragments with significant spacing

final dartCodeBeginRE = new RegExp(r"^\s*\\begin\{dartCode\}");

final dartCodeEndRE = new RegExp (r"^\s*\\end\{dartCode\}");

-sispIs(line, targetRE) {

- return targetRE.firstMatch(line) != null;

+// Recognize begin/end line of a Dart code block

+sispIs(line, targetRE) => targetRE.firstMatch(line) != null;

sispIsDartBegin(line) => sispIs(line, dartCodeBeginRE);

sispIsDartEnd(line) => sispIs(line, dartCodeEndRE);

-// Transform input file into output file

+// ----------------------------------------------------------------------

+// Removal of non-normative elements of the text (rationale, commentary)

+final hashMarkRE = new RegExp(r"^\\LMHash{.*}\s*$");

+final hashBlockTerminatorRE = new RegExp(r"\\((|sub(|sub))section|paragraph)");

+// Recognize begin/end line of each block of lines getting a hash value

+isArg(argRE, line) => argRE.firstMatch(line) != null;

Lasse Reichstein Nielsen 2014/10/28 10:12:12 => line.contains(argRE); Probably reduces to the

eernst 2014/11/03 14:17:46 Done.

+isHashMarker(line) => isArg(hashMarkRE, line);

+isHashBlockTerminator(line) => isArg(hashBlockTerminatorRE, line);

+// Return the indices of lines satisfying the given test

+findLineNumbers(lines, test()) {

Lasse Reichstein Nielsen 2014/10/28 10:12:13 Type of `test` is incorrect, it's typed to be null

eernst 2014/11/03 14:17:46 Interesting! In fact, having worked with types fo

+ var lineNumbers = new List();

+ var lineNumber = 0;

+ for (var line in lines) {

+ if (test(line)) lineNumbers.add(lineNumber);

+ lineNumber++;

+ }

+ return lineNumbers;

+findHashLineNumbers(lines) => findLineNumbers(lines, isHashMarker);

+// Return 'line' without the "\\cmdName\s*{..}" command starting at

ricow1 2014/10/27 10:08:03 use doc style comments for method comments (///) s

eernst 2014/11/03 14:17:46 Done.

+// 'startIndex'; note that it is assumed but not checked that 'line'

+// contains "\\cmdName\s*{..", and note that the end of the {..} block

+// is found via brace matching (i.e., nested {..} blocks are handled),

+// but it may break if '{' is made an active character, etc.

+removeCommand(line, cmdName, startIndex) {

+ const BACKSLASH = 92; // char code for '\\'

+ const BRACE_BEGIN = 123; // char code for '{'

+ const BRACE_END = 125; // char code for '}'

Lasse Reichstein Nielsen 2014/10/28 10:12:12 Pedantry: Two spaces before '//' comments, and you

Lasse Reichstein Nielsen 2014/11/03 11:34:35 Obviously, to align it, it needs to be two *or mor

eernst 2014/11/03 14:17:46 Done.

+ var blockStartIndex = startIndex + cmdName.length + 1;

+ while (blockStartIndex < line.length &&

+ line.codeUnitAt(blockStartIndex) != BRACE_BEGIN) {

+ blockStartIndex++;

+ }

+ blockStartIndex++;

+ if (blockStartIndex > line.length) {

+ // caller's fault

+ throw "Bug, please report to eernst@";

+ }

+ // blockStartIndex just after '{'

+ var afterEscape = false; // actually after '{'

Lasse Reichstein Nielsen 2014/10/28 10:12:11 Two space before '//'. I think that is a general s

eernst 2014/11/03 14:17:47 Couldn't find it in the style guide, but done anyw

+ var braceLevel = 1; // number of '{' minus number of '}' seen

+ for (var index = blockStartIndex; index < line.length; index++) {

+ switch (line.codeUnitAt(index)) {

+ case BRACE_BEGIN:

+ if (afterEscape) afterEscape = false; else braceLevel++; break;

Lasse Reichstein Nielsen 2014/10/28 10:12:13 An if-with-an-else should always be put on multipl

eernst 2014/11/03 14:17:46 Done.

+ case BRACE_END:

+ if (afterEscape) afterEscape = false; else braceLevel--; break;

Lasse Reichstein Nielsen 2014/10/28 10:12:13 Consider checking if braceLevel goes negative. Oth

eernst 2014/11/03 14:17:47 Actually, the latex command will be used on the so

+ case BACKSLASH:

+ afterEscape = true; break;

Lasse Reichstein Nielsen 2014/10/28 10:12:11 Newline after ';'

eernst 2014/11/03 14:17:46 Done. Presumably this does not apply to 'for'?

Lasse Reichstein Nielsen 2014/11/03 16:32:04 Not to 'for' no. It's a matter of "only one statem

+ default:

+ afterEscape = false;

+ }

+ if (braceLevel == 0) {

+ return line.substring(0, startIndex) + line.substring(index + 1);

+ }

+ // removal failed; we consider this to mean that the input is ill-formed

+ throw "Unmatched braces";

+final commentaryRE = new RegExp(r"\\commentary\s*{");

+final rationaleRE = new RegExp(r"\\rationale\s*{");

+removeCommentary(line) {

Lasse Reichstein Nielsen 2014/10/28 10:12:13 You are actively removing commentary code from the

eernst 2014/11/03 14:17:46 That wouldn't work in this case, because the strin

+ var match = commentaryRE.firstMatch(line);

+ if (match == null) return line;

+ return removeCommentary(removeCommand(line, r"commentary", match.start));

+removeRationale(line) {

+ var match = rationaleRE.firstMatch(line);

+ if (match == null) return line;

+ return removeRationale(removeCommand(line, r"rationale", match.start));

+// Remove commentary and rationale from 'line'

+simplifyLine(line) {

+ var simplerLine = removeCommentary(line);

+ simplerLine = removeRationale(simplerLine);

+ simplerLine = normalizeWhitespace(simplerLine);

+ return simplerLine;

+// ----------------------------------------------------------------------

+// Recognition of line blocks, insertion of block hash into \LMHash{}

+final hashMarkArgumentRE = new RegExp(r"{.*}");

Lasse Reichstein Nielsen 2014/10/28 10:12:13 Escape '{' characters.

eernst 2014/11/03 14:17:46 Done, here and in several other similar locations.

+cleanupLine(line) => cutRegexp(line, commentRE, startOffset: 1).trimRight();

+gatherLines(lines, startIndex, nextIndex) {

+ var gatheredLine = "";

+ var isFirst = true;

+ for (var index = startIndex; index < nextIndex; index++) {

+ var line = lines[index];

+ if (isHashBlockTerminator(line)) break;

+ if (isFirst) {

+ gatheredLine += cleanupLine(line);

+ isFirst = false;

+ } else {

+ gatheredLine += " " + cleanupLine(line);

Lasse Reichstein Nielsen 2014/10/28 10:12:13 This will take time O(lines * chars-in-lines), so

Lasse Reichstein Nielsen 2014/10/31 13:52:39 Even more "functional": lines.getRange(startInde

eernst 2014/11/03 14:17:46 Done, with some adjustments. Nice! ;)

+ }

+ return gatheredLine;

+// Compute the hash value for the line block starting at 'startIndex'

+// in 'lines' and stopping just before 'nextIndex'; SIDE EFFECT:

+// output the simplified text and its hash value to 'listSink'

+computeHashValue(lines, startIndex, nextIndex, listSink) {

+ final hashEncoder = new SHA1();

+ final gatheredLine = gatherLines(lines, startIndex, nextIndex);

+ final simplifiedLine = simplifyLine(gatheredLine);

+ listSink.write(" % $simplifiedLine\n");

+ hashEncoder.add(encodeUtf8(simplifiedLine));

+ return hashEncoder.close();

+computeHashString(lines, startIndex, nextIndex, listSink) =>

+ CryptoUtils.bytesToHex(computeHashValue(lines,

+ startIndex,

+ nextIndex,

+ listSink));

+// Compute and add hashes to \LMHash{} lines (which must be on the

+// indices 'hashLineNumbers' of 'lines'), and emit the simplified

+// text and hash values to 'listSink'

+addHashMarks(lines, hashLineNumbers, listSink) {

+ if (hashLineNumbers.length == 0) return lines; // noop

+ for (var n = 0; n < hashLineNumbers.length - 1; n++) {

+ final hashIndex = hashLineNumbers[n];

+ final nextIndex = hashLineNumbers[n + 1];

+ final hashValue = computeHashString(lines,

+ hashIndex + 1,

+ nextIndex,

+ listSink);

+ lines[hashIndex] =

+ lines[hashIndex].replaceAll(hashMarkArgumentRE, "{" + hashValue + "}");

+ listSink.write(" $hashValue\n");

+ }

+ final hashIndex = hashLineNumbers[hashLineNumbers.length - 1];

+ final nextIndex = lines.length;

+ final hashValue = computeHashString(lines,

+ hashIndex + 1,

+ nextIndex,

+ listSink);

+ lines[hashIndex] =

+ lines[hashIndex].replaceAll(hashMarkArgumentRE, "{" + hashValue + "}");

+ listSink.write(" $hashValue\n");

+ return lines;

+// ----------------------------------------------------------------------

+// Transformation of input file to output file

main ([args]) {

- if (args.length != 2) {

- print("Usage: addlatexhash.dart <input-file> <output-file>");

- throw "Received ${args.length} arguments, expected two";

+ if (args.length != 3) {

+ print("Usage: addlatexhash.dart <input-file> <output-file> <list-file>");

+ throw "Received ${args.length} arguments, expected three";

}

+ // LaTeX source

Lasse Reichstein Nielsen 2014/10/28 10:12:12 Pedantry: It's "LaTeX" :) (Yes, I also insist on A

eernst 2014/11/03 14:17:47 Done. ;-)

var inputFile = new File(args[0]);

- var outputFile = new File(args[1]);

assert(inputFile.existsSync());

var lines = inputFile.readAsLinesSync();

+ // LaTeX source with 'normalized' spacing etc., and with hash values

+ var outputFile = new File(args[1]);

+ // hierarchical list of hash values

+ var listFile = new File(args[2]);

+ var listSink = listFile.openWrite();

// single-line normalization

var inDartCode = false;

- var newLines = new List();

+ var normalizedLines = new List();

for (var line in lines) {

if (sispIsDartBegin(line)) {

@@ -190,15 +359,20 @@ main ([args]) {

inDartCode = false;

}

if (inDartCode) {

- newLines.add(sispNormalize(line + "\n"));

+ normalizedLines.add(sispNormalize(line + "\n"));

} else {

- newLines.add(normalize(line + "\n"));

+ normalizedLines.add(normalize(line + "\n"));

}

// multi-line normalization

- newLines = multilineNormalize(newLines);

+ normalizedLines = multilineNormalize(normalizedLines);

+ // insertion of hash values

+ var hashLineNumbers = findHashLineNumbers(normalizedLines);

+ var hashMarkedLines = addHashMarks(normalizedLines,hashLineNumbers,listSink);

- // output result

- outputFile.writeAsStringSync(newLines.join());

+ // output

+ outputFile.writeAsStringSync(hashMarkedLines.join());

+ listSink.close();

}

« no previous file with comments | « tests/standalone/io/addlatexhash_test_src.tex ('k') | no next file » | no next file with comments »