View Javadoc

1   ////////////////////////////////////////////////////////////////////////////////
2   // MillScript: an Open Spice interpreter and batch website creation tool
3   // Copyright (C) 2001-2004 Open World Ltd
4   // Copyright (C) 2004-2005 Kevin Rogers
5   // Copyright (C) 2004 Stephen F. K. Leach
6   //
7   // This file is part of MillScript.
8   //
9   // MillScript is free software; you can redistribute it and/or modify it under
10  // the terms of the GNU General Public License as published by the Free
11  // Software Foundation; either version 2 of the License, or (at your option)
12  // any later version.
13  //
14  // MillScript is distributed in the hope that it will be useful, but WITHOUT
15  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16  // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
17  // more details.
18  //
19  // You should have received a copy of the GNU General Public License along with
20  // MillScript; if not, write to the Free Software Foundation, Inc., 59 Temple
21  // Place, Suite 330, Boston, MA  02111-1307  USA
22  ////////////////////////////////////////////////////////////////////////////////
23  package org.millscript.millscript.syntax;
24  
25  import org.millscript.commons.util.list.ELinkedList;
26  import org.millscript.commons.util.map.EHashMap;
27  import org.millscript.millscript.alert.Alerts;
28  import org.millscript.millscript.conf.Configuration;
29  
30  import java.io.IOException;
31  import java.io.LineNumberReader;
32  import java.io.Reader;
33  import java.util.regex.Pattern;
34  
35  /**
36   * This class performs the basic tokenisation of the MillScript language. A
37   * single instance of this class would be used to tokenise one file/source.
38   */
39  public class TokenizerImpl implements Tokenizer {
40  
41      /**
42       * The buffer used while constructing a token.
43       */
44      private StringBuffer buff = new StringBuffer();
45  
46      /**
47       * The current configuration.
48       */
49      protected Configuration config;
50  
51      /**
52       * Flag indicating if end of line comments are available in this tokenizer.
53       * This is required because <code>#</code> could be in use as the alternate
54       * string quote within a template.
55       */
56      private boolean eolc;
57  
58      /**
59       * Used to store interpolation values, mapping the position in the
60       * interpolated string to the value to be interpolated.
61       */
62      protected EHashMap< Integer, CharSequence > interpolationMap = new EHashMap< Integer, CharSequence >();
63  
64      /**
65       * The origin message for this tokenizer's input.
66       */
67      private String origin;
68  
69      /**
70       * The line number reader which provides the character stream to tokenize.
71       */
72      private LineNumberReader reader;
73  
74      /**
75       * Storage for a saved character. This allows the tokenizer to
76       * <em>unread</em> a read character, as the read character may be the start
77       * of the next token.
78       *
79       * <dl>
80       * <dt>-2</dt>
81       * <dd>No pushback character available</dd>
82       * <dt>-1</dt>
83       * <dd>End of file</dd>
84       * <dt>&gt;=0</dt>
85       * <dd>the saved character</dd>
86       * </dl>
87       */
88      private int savedChar = -2;
89  
90      /**
91       * Used to indicate the type of token that has just been read.
92       */
93      private TokenType ttype = TokenType.NEED_NEW;
94  
95      /**
96       * Used to determine where the tokenizer is during the tokenizing process.
97       * We have to be able to distinguish whether we are tokenizing an XML tag or
98       * attribute name, or the contents of an XML comment. In these situations
99       * the tokenizer must behave differently.
100      *
101      * <dl>
102      * <dt>'n'</dt>
103      * <dd>tag or attribute name</dd>
104      * <dt>'c'</dt>
105      * <dd>comment</dd>
106      * <dt>'?'</dt>
107      * <dd>somewhere else</dd>
108      * </dl>
109      *
110      * @todo    this should probably be handled with different classes of
111      *          tokenizer
112      */
113      private int where = '?';
114 
/**
 * Constructs a new <code>Tokenizer</code> with the specified origin,
 * source, end of line comment status and configuration.
 *
 * @param o the origin message for this tokenizer
 * @param r the character source to tokenize; it is wrapped in a
 *          <code>LineNumberReader</code> so line numbers can be reported
 * @param e flag indicating if end of line comments are supported
 * @param c the configuration
 */
TokenizerImpl( final String o, final Reader r, final boolean e, final Configuration c ) {
    this.config = c;
    this.eolc = e;
    this.origin = o;
    this.reader = new LineNumberReader( r );
}
130 
131     /**
132      * @see org.millscript.millscript.syntax.Tokenizer#getOrigin()
133      */
134     public String getOrigin() {
135         return origin;
136     }
137 
138     /**
139      * @see org.millscript.millscript.syntax.Tokenizer#getLineNumber()
140      */
141     public int getLineNumber() {
142         //    Adjust for 0-indexing.
143         return reader.getLineNumber() + 1;
144     }
145 
146     /**
147      * @see org.millscript.millscript.syntax.Tokenizer#setWhere(char)
148      */
149     public void setWhere( final char ch ) {
150         if ( "nc?".indexOf( ch ) >= 0 ) {
151             where = ch;
152         }
153     }
154 
155     /**
156      * @see org.millscript.millscript.syntax.Tokenizer#checkWhere(char)
157      */
158     public boolean checkWhere( final char ch ) {
159         return where == ch;
160     }
161 
162     /**
163      * Returns the integer value of the next character from this tokenizers
164      * input. If a character has previously been pushed back, it will be
165      * returned, otherwise another character will be read from this tokenizers
166      * reader.
167      *
168      * @return  the <code>int</code> value of the next character
169      */
170     private int getIntChar() {
171         try {
172             if ( savedChar == -2 ) {
173                 int c = reader.read();
174                 return c;
175             } else {
176                 int x = savedChar;
177                 savedChar = -2;
178                 return x;
179             }
180         } catch ( IOException ex ) {
181             throw(
182                 Alerts.parse(
183                     "Could not read from input source",
184                     null
185                 ).culprit( "message", ex.getMessage() ).mishap()
186             );
187         }
188     }
189 
190     /**
191      * Returns the <code>char</code> value of the next character from this
192      * tokenizers input.
193      *
194      * @return  the <code>char</code> value of the next character
195      */
196     private char getChar() {
197         int ch = getIntChar();
198         if ( ch == -1 ) {
199             throw(
200                 Alerts.parse(
201                     "Unexpected end of file",
202                     null
203                 ).mishap()
204             );
205         }
206         return (char)ch;
207     }
208 
209     /**
210      * Pushes back the specified character. This will be the next character
211      * returned by <code>getIntChar</code> or <code>getChar</code>. This method
212      * would be used when a read character is not part of the current token.
213      *
214      * @param ch    the character to push back
215      */
216     private void pushBackChar( final int ch ) {
217         savedChar = ch;
218     }
219 
220     /**
221      * Adds the specified character to the token being constructed.
222      *
223      * @param ch    the character to be appended to the current token
224      */
225     private void addChar( final int ch ) {
226         buff.append( (char)ch );
227     }
228 
229     /**
230      * Accepts the specified character as part of the current token and returns
231      * the next character from the input.
232      *
233      * @param   ch  the character to append to the current token
234      * @return  the <code>int</code> value of the next character
235      */
236     private int acceptChar( final int ch ) {
237         addChar( ch );
238         return getIntChar();
239     }
240 
241     /**
242      * Accepts the specified character as part of the current token and returns
243      * the next character from the input.
244      *
245      * @param   ch  the character to append to the current token
246      * @return  the <code>char</code> value of the next character
247      */
248     private char okChar( final int ch ) {
249         addChar( ch );
250         return getChar();
251     }
252 
253     /**
254      * Determines if the specified character is a MillScript sign character. The
255      * following characters are MillScript signs:
256      *
257      * <ul>
258      * <li>&lt;&gt;!$%^&#38;*-+=|:?/~</li>
259      * </ul>
260      *
261      * @param   ch  the character to test.
262      * @return  <code>true</code> if the character is a MillScript sign,
263      *          <code>false</code> otherwise.
264      */
265     private static boolean isSign( final char ch ) {
266         return "<>!$%^&*-+=|:?/~".indexOf( ch ) >= 0;
267     }
268 
269     /**
270      * Returns the <code>char</code> value of the last character in the current
271      * token, without removing it.
272      *
273      * @return  the <code>char</code> value of the last character in the current
274      *          token.
275      * @see #cantStick
276      */
277     private char lastCharInBuff() {
278         return buff.charAt( buff.length() - 1 );
279     }
280 
281     /**
282      * Determines if the specified character cannot be stuck onto the current
283      * token. This is required to support the tokenization bodge for REXML i.e.
284      * to unstick sequences such as <code>&gt;&lt;/</code> into
285      * <code>&gt;</code> and <code>&lt;/</code>
286      *
287      * @param ch    the character to test
288      * @return  <code>true</code> if the character cannot be stuck onto the
289      * last character in the buffer, <code>false</code> otherwise
290      */
291     private boolean cantStick( final char ch ) {
292         return (
293             ch == '<' && lastCharInBuff() == '>' ||
294             ch == '>' && lastCharInBuff() == '<'
295         );
296     }
297 
298     /**
299      * Accepts repetitions of the specified character into the current token.
300      * This is currently used to read sequences of periods or <code>@</code>
301      * symbols.
302      *
303      * @param   orig    the character to read repetitions of
304      */
305     private void readRepetitions( final int orig ) {
306         int ich = orig;
307         while ( orig == ( ich = acceptChar( ich ) ) ) {
308         }
309         pushBackChar( ich );
310     }
311 
312     /**
313      * Reads an interpolated section of a string. This method is called when an
314      * escaped <code>(</code> is found in an input string. This handles nested
315      * strings and parenthesis within the interpolated section.
316      *
317      * @return a CharSequence for the interpolated section
318      */
319     private CharSequence readInterpolatedExpr() {
320         // buffer for the interpolated section
321         final StringBuffer interp = new StringBuffer();
322         int state = 0;
323         int parenLevel = 0;
324         final ELinkedList< Character > nestingQuotes = new ELinkedList< Character >();
325         char quote = ' ';
326         for (;;) {
327             final char ch = this.getChar();
328             if ( ch == '\n' || ch == '\r' ) {
329                 throw(
330                     Alerts.parse(
331                         "Line break before end of string",
332                         null
333                     ).mishap()
334                 );
335             }
336             switch ( state ) {
337                 case 0:
338                     switch ( ch ) {
339                         case '"':
340                         case '\'':
341                         case '`':
342                             state = 1;
343                             quote = ch;
344                             break;
345                         case ')':
346                             if ( parenLevel == 0 ) {
347                                 if ( nestingQuotes.isEmtpy() ) {
348                                     return interp.toString();
349                                 } else {
350                                     quote = nestingQuotes.last().charValue();
351                                     nestingQuotes.deleteLast();
352                                     state = 1;
353                                 }
354                             } else {
355                                 parenLevel -= 1;
356                             }
357                             break;
358                         case '(':
359                             parenLevel += 1;
360                             break;
361                         default :
362                             break;
363                     }
364                     break;
365                 case 1:
366                     if ( ch == quote ) {
367                         state = 0;
368                     } else if ( ch == '\\' ) {
369                         state = 2;
370                     }
371                     break;
372                 case 2:
373                     int count = 0;
374                     switch ( ch ) {
375                         case '(':
376                             state = 0;
377                             nestingQuotes.addLast( new Character( quote ) );
378                             break;
379                         case '^':
380                             count = 1;
381                             break;
382                         case '0':
383                             count = 3;
384                             break;
385                         case 'u':
386                             count = 4;
387                             break;
388                     }
389                     for ( int i = 0; i < count; i++ ) {
390                         final char x = this.getChar();
391                         if ( !Character.isLetterOrDigit( ch ) ) {
392                             this.config.reportAlertAsWarning(
393                                 Alerts.parse(
394                                     "Unexpected backquote sequence",
395                                     null
396                                 ).culprit( "char", new Character( x ) )
397                             );
398                         }
399                     }
400                     break;
401                 default :
402                     break;
403             }
404             interp.append( ch );
405         }
406     }
407 
408     /**
409      * Adds the specified character sequence as the interpolated section for
410      * the current buffer position. The current buffer position represents the
411      * position in the string that the interpolated section begins.
412      *
413      * @param cs    the CharSequence to add as the interpolated section for the
414      * current buffer position.
415      */
416     private void addInterpolation( final CharSequence cs ) {
417         this.interpolationMap.insert( new Integer( this.buff.length() - 1 ), cs );
418     }
419 
420     /**
421      * Tokenizes a fat string with the specified character as the string
422      * quotes. The string terminates at the next triple quote sequence.
423      *
424      * @param   endChar <code>char</code> value of this strings string quotes.
425      */
426     private void readFatString( final char endChar ) {
427         addChar( endChar );
428         char ch = getChar();
429         while( true ) {
430             if ( ch == '\r' || ch == '\n' ) {
431                 // We're at the end of line, so throw away all whitespace
432                 while ( Character.isWhitespace( ch ) ) {
433                     ch = getChar();
434                 }
435             } else if ( ch == '|' ) {
436                 // We've just read the marker character, throw it away and read
437                 // up to the end of line
438                 ch = getChar();
439                 while ( ch != '\r' && ch != '\n' ) {
440                     ch = okChar( ch );
441                 }
442             } else if ( ch == endChar ) {
443                 final char ch2 = getChar();
444                 if ( ch2 == endChar ) {
445                     final char ch3 = getChar();
446                     if ( ch3 == endChar ) {
447                         // We've just read the trailing triple quote sequence,
448                         // append them all to the token
449                         addChar( endChar );
450                         break;
451                     } else {
452                         throw(
453                             Alerts.compile(
454                                 "Unexpected fat string close quote",
455                                 "Closing triple-quote must appear on a line by itself"
456                             ).culprit(
457                                 "partial string",
458                                 getErrorString()
459                             ).culprit(
460                                 "Unexpected close quote",
461                                 String.copyValueOf( new char[] { ch, ch2, ch3 } )
462                             ).mishap()
463                         );
464                     }
465                 } else {
466                     throw(
467                         Alerts.compile(
468                             "Unexpected fat string close quote",
469                             "Closing triple-quote must appear on a line by itself"
470                         ).culprit(
471                             "partial string",
472                             getErrorString()
473                         ).culprit(
474                             "Unexpected close quote",
475                             String.copyValueOf( new char[] { ch, ch2 } )
476                         ).mishap()
477                     );
478                 }
479             } else {
480                 throw(
481                     Alerts.compile(
482                         "Unexpected character begining a fat string text line",
483                         "Each text line must start with a vertical pipe or the end-string triple-quote"
484                     ).culprit(
485                         "partial string",
486                         getErrorString()
487                     ).culprit(
488                         "Unexpected character",
489                         ch
490                     ).mishap()
491                 );
492             }
493         }
494     }
495 
496     /**
497      * Tokenizes a string with the specified character as the string quotes. The
498      * string terminates at the first unescaped quote character, i.e. by
499      * preceeding the quote character by <code>\</code> you can include it in
500      * the tokenized string.
501      *
502      * @param   endChar <code>char</code> value of this strings string quotes.
503      */
504     private void readString( final char endChar ) {
505         addChar( endChar );
506         markReader();
507         char ch = getChar();
508 
509         // flag to indicate we keep going.
510         // This is required for XML comment parsing.
511         boolean keepReading = ( ch != endChar );
512 
513         while ( keepReading ) {
514             // Check if we're reading the end XML comment tag.
515             if ( ch == '-' ) {
516                 ch = okChar( ch );
517                 if ( ch == '-' ) {
518                     ch = okChar( ch );
519                     if ( ch == '>' && where == 'c' ) {
520                         resetReader();
521                         keepReading = false;
522                     }
523                 }
524             } else if ( where != 'c' && ch == endChar ) {
525                 // If we're in a comment we MUST ignore the endChar variable.
526                 // We're in the comment until we've read the end XML comment tag "-->"
527                 keepReading = false;
528             } else if ( ( ch == '\r' || ch == '\n' ) && where != 'c' ) {
529                 // Newlines and carriage returns are valid in XML comments.
530                 throw(
531                     Alerts.compile(
532                         "Unterminated string",
533                         "A newline or carriage return was encountered before the closing quotes"
534                     ).culprit( "partial string", getErrorString() ).mishap()
535                 );
536             } else if ( ch == '\\' ) {
537                 ch = getChar();
538                 if ( ch == endChar ) {
539                     ch = okChar( ch );
540                 } else if ( ch == '\\' ) {
541                     ch = okChar( '\\' );
542                 } else if ( ch == 't' ) {
543                     ch = okChar( '\t' );
544                 } else if ( ch == 'n' ) {
545                     ch = okChar( '\n' );
546                 } else if ( ch == 'r' ) {
547                     ch = okChar( '\r' );
548                 } else if ( ch == '&' ) {
549                     StringBuffer b = new StringBuffer();
550                     ch = getChar();
551                     if ( ch == '#' ) {
552                         int radix = 10;
553                         ch = getChar();
554                         if ( ch == 'x' ) {
555                             //    hex
556                             ch = getChar();
557                             radix = 16;
558                         }
559                         for (;;) {
560                             int n = Character.digit( ch, radix );
561                             if ( n == -1 ) {
562                                 break;
563                             }
564                             b.append( ch );
565                             ch = getChar();
566                         }
567                         buff.append( (char) Integer.parseInt( b.toString(), radix ) );
568                     } else {
569                         for (;;) {
570                             if ( !Character.isLetter( ch ) ) {
571                                 break;
572                             }
573                             b.append( ch );
574                             ch = getChar();
575                         }
576                         String s = b.toString().intern();
577                         Character tmp = config.getHTMLCharacterEntity().getCharacterFor( s );
578                         if ( tmp == null ) {
579                             throw(
580                                 Alerts.parse(
581                                     "Unrecognized HTML entity in string",
582                                     "Not all entities are recognized yet"
583                                 ).culprit( "entity name", s ).mishap()
584                             );
585                         } else {
586                             buff.append( tmp );
587                         }
588                     }
589                     if ( ch != ';' ) {
590                         b.append( ch );
591                         String culprit = "&" + b.toString();
592                         throw(
593                             Alerts.parse(
594                                 "Unexpected HTML entity sequence in string",
595                                 "Entities are a sequence of letters terminated by a semi-colon"
596                             ).culprit( "entity", culprit ).mishap()
597                         );
598                     }
599                     ch = getChar();
600                 } else if ( ch == '(' ) {
601                     // Read the interpolated section
602                     final CharSequence cs = this.readInterpolatedExpr();
603                     this.addInterpolation( cs );
604                     ch = this.getChar();
605                 } else {
606                     throw(
607                         Alerts.parse(
608                             "Unexpected escape sequence in string",
609                             null
610                         ).
611                         culprit( "sequence", "\\" + Character.toString( ch ) ).
612                         mishap()
613                     );
614                 }
615             } else {
616                 addChar( ch );
617                 // Mark the reader in case we read the start of the end XML comment tag.
618                 markReader();
619                 ch = getChar();
620             }
621         }
622         addChar( ch );
623     }
624 
625     /**
626      * Tokenizes a traditional regular expression with the specified character
627      * as the terminator character. The regular expression terminates at the
628      * first unescaped end character, i.e. by preceeding the quote character by
629      * <code>\</code> you can include it in the tokenized string.
630      *
631      * @param   endChar <code>char</code> value of this traditional regular
632      * expression end quote
633      */
634     private void readTraditionalRegex( final char endChar ) {
635         // We start with "//"
636         addChar( endChar );
637         addChar( endChar );
638         // Get the next character
639         char ch = getChar();
640 
641         while ( ch != endChar ) {
642             if ( ch == '\r' || ch == '\n' ) {
643                 // Newlines and carriage returns are not valid in any regular
644                 // expression
645                 throw(
646                     Alerts.compile(
647                         "Unterminated regular expression",
648                         "A newline or carriage return was encountered before the closing quote"
649                     ).culprit( "partial regular expression", getErrorString() ).mishap()
650                 );
651             } else if ( ch == endChar ) {
652                 // Break out of the loop
653                 break;
654             } else if ( ch == '\\' ) {
655                 // Just pass the escape character straight through
656                 ch = okChar( ch );
657                 if ( ch == endChar ) {
658                     // The escaped character was the end character, so just
659                     // pass that through too
660                     ch = okChar( ch );
661                 }
662             } else {
663                 // Normal traditional regex character, accept and get the next
664                 // one
665                 addChar( ch );
666                 ch = getChar();
667             }
668         }
669         // Add the closing quote
670         addChar( ch );
671         // Now check for any further regular expression flags. Get the next
672         // character after the regex, which might not be a flag
673         ch = getChar();
674         while ( "xiumsd".indexOf( ch ) != -1 ) {
675             // Accept the regex flag
676             addChar( ch );
677             // Get the next character to test, which might not be a flag
678             ch = getChar();
679         }
680         // The last character read wasn't a flag. Guaranteed
681         this.pushBackChar( ch );
682     }
683 
684     /**
685      * @see org.millscript.millscript.syntax.Tokenizer#makePattern()
686      */
687     public Pattern makePattern() {
688         // Get the buffer contents
689         final String s = this.buff.toString();
690         // Find the last '/', so we can extract any flags
691         final int n = s.lastIndexOf( '/' );
692         // Get the regular expression part
693         final String regex = s.substring( 2, n );
694         // Get the flags
695         final String flags = s.substring( n + 1 );
696         // Iterate through the specified flags, validate them and make up a bit
697         // mask for all of them
698         int iflags = 0;
699         for ( int i = 0; i < flags.length(); i++ ) {
700             final char f = flags.charAt( i );
701             switch ( f ) {
702                 case 'i' :
703                     iflags |= Pattern.CASE_INSENSITIVE;
704                     break;
705                 case 'x' :
706                     iflags |= Pattern.COMMENTS;
707                     break;
708                 case 's' :
709                     iflags |= Pattern.DOTALL;
710                     break;
711                 case 'm' :
712                     iflags |= Pattern.MULTILINE;
713                     break;
714                 case 'u' :
715                     iflags |= Pattern.UNICODE_CASE;
716                     break;
717                 case 'd' :
718                     iflags |= Pattern.UNIX_LINES;
719                     break;
720                 default :
721                     throw(
722                         Alerts.fault(
723                             "Invalid flag for traditional regex"
724                         ).culprit( "flag", new Character( f ) ).mishap()
725                     );
726             }
727         }
728         // Return the compiled regular expression
729         return Pattern.compile( regex, iflags );
730     }
731 
732     /**
733      * Checks if the next character is the same as the specified one.
734      *
735      * @param ch    the character to compare with the next one from the input
736      * @return  <code>true</code> if the next character in the input is the
737      * same as the specified one
738      */
739     private boolean tryChar( final char ch ) {
740         final char nch = this.getChar();
741         if ( ch == nch ) {
742             return true;
743         } else {
744             this.pushBackChar( nch );
745             return false;
746         }
747     }
748 
749     /**
750      * @see org.millscript.millscript.syntax.Tokenizer#nextToken()
751      */
752     public TokenType nextToken() {
753         if ( ttype == TokenType.NEED_NEW ) {
754             buff.setLength( 0 );
755             this.interpolationMap.removeAll();
756             int ch = getIntChar();
757             // If we're inside an XML comment and the current character is not "-"
758             // (the start of the end XML comment tag?), then read a String from the
759             // input.
760             if ( where == 'c' && ch != '-' ) {
761                 readString( (char)ch );
762                 return TokenType.STRING;
763             }
764             while ( Character.isWhitespace( (char)ch ) ) {
765                 ch = getIntChar();
766             }
767             if ( ch == -1 ) {
768                 return TokenType.EOF;
769             } else if ( ch == '#' && eolc ) {
770                 //    Dispose of end-of-line comments
771                 for (;;) {
772                     ch = getIntChar();
773                     if ( ch == '\n' || ch == -1 ) {
774                         break;
775                     }
776                 }
777                 return nextToken();
778             } else if ( Character.isLetter( (char)ch ) ) {
779                 markReader();
780                 ch = acceptChar( ch );
781                 while  ( Character.isLetterOrDigit( (char)ch ) || ch == '_' || ( ch == '-' && where == 'n' ) ) {
782                     markReader();
783                     ch = acceptChar( ch );
784                 }
785                 if ( ch == ':' ) {
786                     ch = acceptChar( ch );
787                     if ( ch == ':' ) {
788                         resetReader();
789                         buff.deleteCharAt( buff.length() - 1 );
790                         return TokenType.NAME;
791                     }
792                     while ( Character.isLetterOrDigit( (char)ch ) || ch == '_' || ( ch == '-' && where == 'n' ) ) {
793                         ch = acceptChar( ch );
794                     }
795                 }
796                 pushBackChar( ch );
797                 return TokenType.NAME;
798             } else if ( Character.isDigit( (char)ch ) || ( ch == '-' && where != 'c' ) ) {
799                 ch = acceptChar( ch );
800                 while  ( Character.isDigit( (char)ch ) ) {
801                     ch = acceptChar( ch );
802                 }
803                 pushBackChar( ch );
804                 if ( buff.length() == 1 && buff.charAt( 0 ) == '-' ) {
805                     return TokenType.NAME;
806                 } else {
807                     return TokenType.INTEGER;
808                 }
809             } else if ( ch == '"' ) {
810                 markReader();
811                 ch = getChar(); // ch is now the second char in the sequence
812                 if ( ch == '"' ) {
813                     final char ch2 = getChar(); // ch2 is actually the third in sequence
814                     if ( ch2 == '"' ) {
815                         // It's a tripe quote sequence, read as a fat string
816                         readFatString( (char) ch );
817                     } else {
818                         resetReader();
819                         readString( (char) ch );
820                     }
821                 } else {
822                     resetReader();
823                     readString( '"' );
824                 }
825                 return TokenType.STRING;
826             } else if ( ch == '"' || ch == '\'' || ch == '`' || ch == '#' && !eolc ) {
827                 readString( (char)ch );
828                 return TokenType.STRING;
829             } else if ( ch == '/' && this.tryChar( '/') ) {
830                 readTraditionalRegex( (char) ch );
831                 return TokenType.TRADITIONAL_REGEX;
832             } else if ( ch == '.' || ch == '@' ) {
833                 readRepetitions( ch );
834                 return TokenType.NAME;
835             } else if ( isSign( (char)ch ) ) {
836                 ch = acceptChar( ch );
837                 while ( isSign( (char)ch ) && !( cantStick( (char)ch ) ) ) {
838                     ch = acceptChar( ch );
839                 }
840                 pushBackChar( ch );
841                 return TokenType.NAME;
842             } else {
843                 addChar( ch );
844                 return TokenType.NAME;
845             }
846         } else {
847             final TokenType tt = ttype;
848             ttype = TokenType.NEED_NEW;
849             return tt;
850         }
851     }
852 
853     /**
854      * @see org.millscript.millscript.syntax.Tokenizer#peekToken()
855      */
856     public TokenType peekToken() {
857         return ttype = nextToken();
858     }
859 
860     /**
861      * @see org.millscript.millscript.syntax.Tokenizer#dropToken()
862      */
863     public void dropToken() {
864         ttype = TokenType.NEED_NEW;
865     }
866 
867     /**
868      * @see org.millscript.millscript.syntax.Tokenizer#tryRead(java.lang.String)
869      */
870     public boolean tryRead( final String sym ) {
871         if ( peekToken() == TokenType.NAME && getName() == sym ) {
872             dropToken();
873             return true;
874         } else {
875             return false;
876         }
877     }
878 
879     /**
880      * @see org.millscript.millscript.syntax.Tokenizer#markReader()
881      */
882     public void markReader() {
883         try {
884             reader.mark( 2048 );
885         } catch ( IOException ex ) {
886             throw(
887                 Alerts.parse(
888                     "Problem with input source",
889                     null
890                 ).culprit( "message", ex.getMessage() ).mishap()
891             );
892         }
893     }
894 
895     /**
896      * @see org.millscript.millscript.syntax.Tokenizer#resetReader()
897      */
898     public void resetReader() {
899         try {
900             reader.reset();
901         } catch ( IOException ex ) {
902             throw(
903                 Alerts.parse(
904                     "Problem with input source",
905                     null
906                 ).culprit( "message", ex.getMessage() ).mishap()
907             );
908         }
909     }
910 
911     /**
912      * @see org.millscript.millscript.syntax.Tokenizer#peekRead(java.lang.String)
913      */
914     public boolean peekRead( final String sym ) {
915         return peekToken() == TokenType.NAME && getName() == sym;
916     }
917 
918     /**
919      * @see org.millscript.millscript.syntax.Tokenizer#mustRead(java.lang.String)
920      */
921     public void mustRead( final String sym ) {
922         if ( !tryRead( sym ) ) {
923             throw(
924                 Alerts.parse(
925                     "Unexpected symbol",
926                     null
927                 ).
928                 culprit( "found", getErrorString() ).
929                 culprit( "wanted", sym ).
930                 origin( this ).
931                 mishap()
932             );
933         }
934     }
935 
936     /**
937      * @see org.millscript.millscript.syntax.Tokenizer#getString()
938      */
939     public String getString() {
940         return buff.toString();
941     }
942 
943     /**
944      * @see org.millscript.millscript.syntax.Tokenizer#getErrorString()
945      */
946     public String getErrorString() {
947         return (
948             ttype == TokenType.EOF ? "<end of file>" :
949             buff.toString()
950         );
951 
952     }
953 
954     /**
955      * @see org.millscript.millscript.syntax.Tokenizer#getAttributeName()
956      */
957     public String getAttributeName() {
958         int size = buff.length();
959         for ( int i = 0; i < size; i++ ) {
960             char ch = buff.charAt( i );
961             if ( !(
962                 Character.isLetterOrDigit( ch ) ||
963                 "_:-".indexOf( ch ) >= 0
964             ) ) {
965                 throw(
966                     Alerts.parse(
967                         "Invalid attribute name",
968                         null
969                     ).culprit( "name", buff.toString() ).mishap()
970                 );
971             }
972         }
973         return buff.toString();
974     }
975 
976     /**
977      * @see org.millscript.millscript.syntax.Tokenizer#getTagName()
978      */
979     public String getTagName() {
980         int size = buff.length();
981         for ( int i = 0; i < size; i++ ) {
982             char ch = buff.charAt( i );
983             if ( !(
984                 Character.isLetterOrDigit( ch ) ||
985                 "_:-".indexOf( ch ) >= 0
986             ) ) {
987                 throw(
988                     Alerts.parse(
989                         "Invalid tag name",
990                         null
991                     ).culprit( "name", buff.toString() ).mishap()
992                 );
993             }
994         }
995         return buff.toString();
996     }
997 
998 
999     /**
1000      * @see org.millscript.millscript.syntax.Tokenizer#getStringNoQuotes()
1001      */
1002     public String getStringNoQuotes() {
1003         return this.getStringNoQuotes( 1, 1 );
1004     }
1005 
1006     /**
1007      * @see org.millscript.millscript.syntax.Tokenizer#getStringNoQuotes(int, int)
1008      */
1009     public String getStringNoQuotes( final int a, final int b ) {
1010         String s = buff.toString();
1011         return s.substring( a, s.length() - b );
1012     }
1013 
1014     /**
1015      * @see org.millscript.millscript.syntax.Tokenizer#getQuoteChar()
1016      */
1017     public char getQuoteChar() {
1018         return buff.charAt( 0 );
1019     }
1020 
1021     /**
1022      * @see org.millscript.millscript.syntax.Tokenizer#getName()
1023      */
1024     public String getName() {
1025         return buff.toString().intern();
1026     }
1027 
1028     /**
1029      * @see org.millscript.millscript.syntax.Tokenizer#getInt()
1030      */
1031     public int getInt() {
1032         return Integer.parseInt( buff.toString() );
1033     }
1034 
1035     /**
1036      * @see org.millscript.commons.alert.AlertOrigin#setContext(java.lang.String, int)
1037      */
1038     public void setContext( final String s, final int n ) {
1039         this.origin = s;
1040         // You can't set the line number for a tokenizer so ignore them
1041     }
1042 
1043     /**
1044      * @see org.millscript.commons.alert.AlertOrigin#setLineNumber(int)
1045      */
1046     public void setLineNumber( final int n ) {
1047         // You can't set the line number for a tokenizer so ignore them
1048     }
1049 
1050     /**
1051      * @see org.millscript.commons.alert.AlertOrigin#setOrigin(java.lang.String)
1052      */
1053     public void setOrigin( final String o ) {
1054         this.origin = o;
1055     }
1056 
1057 }