View Javadoc

1   ////////////////////////////////////////////////////////////////////////////////
2   // MillScript-XML: an Open Spice interpreter and batch website creation tool
3   // Copyright (C) 2005 Kevin Rogers
4   //
5   // This file is part of MillScript-XML.
6   //
7   // MillScript-XML is free software; you can redistribute it and/or modify it under
8   // the terms of the GNU General Public License as published by the Free
9   // Software Foundation; either version 2 of the License, or (at your option)
10  // any later version.
11  //
12  // MillScript-XML is distributed in the hope that it will be useful, but WITHOUT
13  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  // more details.
16  //
17  // You should have received a copy of the GNU General Public License along with
18  // MillScript-XML; if not, write to the Free Software Foundation, Inc., 59 Temple
19  // Place, Suite 330, Boston, MA  02111-1307  USA
20  ////////////////////////////////////////////////////////////////////////////////
21  package org.millscript.commons.xml.tokenizer;
22  
23  import org.millscript.commons.xml.alerts.XmlErrorAlert;
24  
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.nio.charset.Charset;
28  
29  /**
30   * This class provides an <code>XmlTokenizer</code> implementation for
31   * tokenizing an XML 1.0 document.
32   */
33  public class Xml10Tokenizer extends AbstractXmlTokenizerImpl {
34  
35      /**
36       * Constructs a new XML 1.0 tokenizer to read from the specified input
37       * stream, using the specified character set, with optional namespace
38       * support.
39       *
40       * @param is    the <code>InputStream</code> to read from
41       * @param cs    the <code>Charset</code> to decode the
42       * <code>InputStream</code> with
43       * @param namespaceAware    indicates if the tokenizer should be namespace
44       * aware
45       */
46      public Xml10Tokenizer( final InputStream is, final Charset cs, final boolean namespaceAware ) {
47          super( is, cs, namespaceAware );
48      }
49  
50      /**
51       * Constructs a new XML 1.0 tokenizer to read from the specified reader,
52       * with optional namespace support.
53       *
54       * @param r the <code>Reader</code> to obtain characters from
55       * @param namespaceAware    indicates if the tokenizer should be namespace
56       * aware
57       */
58      public Xml10Tokenizer( final Reader r, final boolean namespaceAware ) {
59          super( r, namespaceAware );
60      }
61  
62      /**
63       * Constructs a new XML 1.0 tokenizer which will copy it's state from the
64       * specified existing tokenizer.
65       *
66       * @param a the existing tokenizer to copy state from
67       */
68      public Xml10Tokenizer( final AbstractXmlTokenizerImpl a ) {
69          super( a );
70      }
71  
72      /**
73       * Constructs a new XML 1.0 tokenizer which will copy it's state from the
74       * specified existing tokenizer, but will use the specified reader instead
75       * of the one from the existing tokenizer.
76       *
77       * @param a the existing tokenizer to copy state from
78       * @param r the new reader this tokenizer should read characters from
79       */
80      public Xml10Tokenizer( final AbstractXmlTokenizerImpl a, final Reader r ) {
81          super( a, r );
82      }
83  
84      /**
85       * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#handleIntChar(int)
86       */
87      @Override
88      public int handleIntChar( final int ch ) {
89          if ( !this.isChar( ch ) ) {
90              throw new XmlErrorAlert( "Illegal Char in document" ).culpritChar( ch ).mishap();
91          } else if ( ch == 0x0D ) {
92              final int ch2 = this.getIntChar();
93              if ( ch2 == -1 ) {
94                  // end of file
95                  return ch;
96              } else if ( ch2 != 0x0A ) {
97                  // The 0x0D is NOT followed by 0x0A so push back the second
98                  // char and continue
99                  this.pushBack( (char) ch2 );
100             }
101             // Reset the counters for a new line
102             this.lineNumber++;
103             this.columnNumber = 1;
104             // Convert any line endings into 0x0A
105             return 0x0A;
106         } else if ( ch == 0x0A ) {
107             // Reset the counters for a new line
108             this.lineNumber++;
109             this.columnNumber = 1;
110             // Convert any line endings into 0x0A
111             return 0x0A;
112         } else {
113             // Increment the column number
114             this.columnNumber++;
115             return ch;
116         }
117     }
118 
119     /**
120      * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isChar(int)
121      */
122     @Override
123     public boolean isChar( final int ch ) {
124         return ch == 0x09 || ch == 0x0A || ch == 0x0D
125         || ( ch >= 0x20 && ch <= 0xD7FF )
126         || ( ch >= 0xE000 && ch <= 0xFFFD )
127         || ( ch >= 0x10000 && ch <= 0x10FFFF );
128     }
129 
130     /**
131      * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameChar(char)
132      */
133     @Override
134     public boolean isNameChar( final char ch ) {
135         // FIXME - This needs to be XML 1.0 NOT XML 1.1
136         return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_' || ch == '-' || ch == '.'
137             || ( ch >= 'A' && ch <= 'Z' )
138             || ( ch >= '0' && ch <= '9' ) || ch == 0xB7
139             || ( ch >= 0xC0 && ch <= 0xD6 )
140             || ( ch >= 0xD8 && ch <= 0xF6 )
141             || ( ch >= 0xF8 && ch <= 0x2FF )
142             || ( ch >= 0x300 && ch <= 0x37D )
143             || ( ch >= 0x37F && ch <= 0x1FFF )
144             || ( ch >= 0x200C && ch <= 0x200D )
145             || ( ch >= 0x203F && ch <= 0x2040 )
146             || ( ch >= 0x2070 && ch <= 0x218F )
147             || ( ch >= 0x2C00 && ch <= 0x2FEF )
148             || ( ch >= 0x3001 && ch <= 0xD7FF )
149             || ( ch >= 0xF900 && ch <= 0xFDCF )
150             || ( ch >= 0xFDF0 && ch <= 0xFFFD )
151             || ( ch >= 0x10000 && ch <= 0xEFFFF );
152     }
153 
154     /**
155      * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameStartChar(char)
156      */
157     @Override
158     public boolean isNameStartChar( final char ch ) {
159         // FIXME - This needs to be XML 1.0 NOT XML 1.1
160         return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_'
161             || ( ch >= 'A' && ch <= 'Z' )
162             || ( ch >= 0xC0 && ch <= 0xD6 )
163             || ( ch >= 0xD8 && ch <= 0xF6 )
164             || ( ch >= 0xF8 && ch <= 0x2FF )
165             || ( ch >= 0x370 && ch <= 0x37D )
166             || ( ch >= 0x37F && ch <= 0x1FFF )
167             || ( ch >= 0x200C && ch <= 0x200D )
168             || ( ch >= 0x2070 && ch <= 0x218F )
169             || ( ch >= 0x2C00 && ch <= 0x2FEF )
170             || ( ch >= 0x3001 && ch <= 0xD7FF )
171             || ( ch >= 0xF900 && ch <= 0xFDCF )
172             || ( ch >= 0xFDF0 && ch <= 0xFFFD )
173             || ( ch >= 0x10000 && ch <= 0xEFFFF );
174     }
175 
176 }