View Javadoc

1   ////////////////////////////////////////////////////////////////////////////////
2   // MillScript-XML: an Open Spice interpreter and batch website creation tool
3   // Copyright (C) 2005 Kevin Rogers
4   //
5   // This file is part of MillScript-XML.
6   //
7   // MillScript-XML is free software; you can redistribute it and/or modify it under
8   // the terms of the GNU General Public License as published by the Free
9   // Software Foundation; either version 2 of the License, or (at your option)
10  // any later version.
11  //
12  // MillScript-XML is distributed in the hope that it will be useful, but WITHOUT
13  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  // more details.
16  //
17  // You should have received a copy of the GNU General Public License along with
18  // MillScript-XML; if not, write to the Free Software Foundation, Inc., 59 Temple
19  // Place, Suite 330, Boston, MA  02111-1307  USA
20  ////////////////////////////////////////////////////////////////////////////////
21  package org.millscript.commons.xml.tokenizer;
22  
23  import org.millscript.commons.xml.alerts.XmlErrorAlert;
24  
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.nio.charset.Charset;
28  
29  /**
30   * This class provides an <code>XmlTokenizer</code> implementation for
31   * tokenizing an XML 1.1 document.
32   */
33  public class Xml11Tokenizer extends AbstractXmlTokenizerImpl {
34  
35      /**
36       * Constructs a new XML 1.1 tokenizer to read from the specified input
37       * stream, using the specified character set, with optional namespace
38       * support.
39       *
40       * @param is    the <code>InputStream</code> to read from
41       * @param cs    the <code>Charset</code> to decode the
42       * <code>InputStream</code> with
43       * @param namespaceAware    indicates if the tokenizer should be namespace
44       * aware
45       */
46      public Xml11Tokenizer( final InputStream is, final Charset cs, final boolean namespaceAware ) {
47          super( is, cs, namespaceAware );
48      }
49  
50      /**
51       * Constructs a new XML 1.0 tokenizer to read from the specified reader,
52       * with optional namespace support.
53       *
54       * @param r the <code>Reader</code> to obtain characters from
55       * @param namespaceAware    indicates if the tokenizer should be namespace
56       * aware
57       */
58      public Xml11Tokenizer( final Reader r, final boolean namespaceAware ) {
59          super( r, namespaceAware );
60      }
61  
62      /**
63       * Constructs a new XML 1.0 tokenizer which will copy it's state from the
64       * specified existing tokenizer.
65       *
66       * @param a the existing tokenizer to copy state from
67       */
68      public Xml11Tokenizer( final AbstractXmlTokenizerImpl a ) {
69          super( a );
70      }
71  
72      /**
73       * Constructs a new XML 1.0 tokenizer which will copy it's state from the
74       * specified existing tokenizer, but will use the specified reader instead
75       * of the one from the existing tokenizer.
76       *
77       * @param a the existing tokenizer to copy state from
78       * @param r the new reader this tokenizer should read characters from
79       */
80      public Xml11Tokenizer( final AbstractXmlTokenizerImpl a, final Reader r ) {
81          super( a, r );
82      }
83  
84      /**
85       * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#handleIntChar(int)
86       */
87      @Override
88      public int handleIntChar( final int ch ) {
89          if ( !this.isChar( ch ) || this.isRestrictedChar( ch ) ) {
90              throw new XmlErrorAlert( "Illegal Char in document" ).culpritChar( ch ).mishap();
91          } else if ( ch == 0x0D ) {
92              final int ch2 = this.getIntChar();
93              if ( ch2 == -1 ) {
94                  // end of file
95                  return ch;
96              } else if ( ch2 != 0x0A && ch2 != 0x85 ) {
97                  // The 0x0D is NOT followed by 0x0A or 0x85 so push back the
98                  // second char and continue
99                  this.pushBack( (char) ch2 );
100             }
101             // Reset the counters for a new line
102             this.lineNumber++;
103             this.columnNumber = 1;
104             // Convert any line endings into 0x0A
105             return 0x0A;
106         } else if ( ch == 0x0A || ch == 0x85 || ch == 0x2028 ) {
107             // Reset the counters for a new line
108             this.lineNumber++;
109             this.columnNumber = 1;
110             // Convert any line endings into 0x0A
111             return 0x0A;
112         } else {
113             // Increment the column number
114             this.columnNumber++;
115             return ch;
116         }
117     }
118 
119     /**
120      * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isChar(int)
121      */
122     @Override
123     public boolean isChar( final int ch ) {
124         return ( ch >= 0x01 && ch <= 0xD7FF )
125         || ( ch >= 0xE000 && ch <= 0xFFFD )
126         || ( ch >= 0x10000 && ch <= 0x10FFFF );
127     }
128 
129     /**
130      * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameChar(char)
131      */
132     @Override
133     public boolean isNameChar( final char ch ) {
134         return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_' || ch == '-' || ch == '.'
135             || ( ch >= 'A' && ch <= 'Z' )
136             || ( ch >= '0' && ch <= '9' ) || ch == 0xB7
137             || ( ch >= 0xC0 && ch <= 0xD6 )
138             || ( ch >= 0xD8 && ch <= 0xF6 )
139             || ( ch >= 0xF8 && ch <= 0x2FF )
140             || ( ch >= 0x300 && ch <= 0x37D )
141             || ( ch >= 0x37F && ch <= 0x1FFF )
142             || ( ch >= 0x200C && ch <= 0x200D )
143             || ( ch >= 0x203F && ch <= 0x2040 )
144             || ( ch >= 0x2070 && ch <= 0x218F )
145             || ( ch >= 0x2C00 && ch <= 0x2FEF )
146             || ( ch >= 0x3001 && ch <= 0xD7FF )
147             || ( ch >= 0xF900 && ch <= 0xFDCF )
148             || ( ch >= 0xFDF0 && ch <= 0xFFFD )
149             || ( ch >= 0x10000 && ch <= 0xEFFFF );
150     }
151 
152     /**
153      * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameStartChar(char)
154      */
155     @Override
156     public boolean isNameStartChar( final char ch ) {
157         return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_'
158             || ( ch >= 'A' && ch <= 'Z' )
159             || ( ch >= 0xC0 && ch <= 0xD6 )
160             || ( ch >= 0xD8 && ch <= 0xF6 )
161             || ( ch >= 0xF8 && ch <= 0x2FF )
162             || ( ch >= 0x370 && ch <= 0x37D )
163             || ( ch >= 0x37F && ch <= 0x1FFF )
164             || ( ch >= 0x200C && ch <= 0x200D )
165             || ( ch >= 0x2070 && ch <= 0x218F )
166             || ( ch >= 0x2C00 && ch <= 0x2FEF )
167             || ( ch >= 0x3001 && ch <= 0xD7FF )
168             || ( ch >= 0xF900 && ch <= 0xFDCF )
169             || ( ch >= 0xFDF0 && ch <= 0xFFFD )
170             || ( ch >= 0x10000 && ch <= 0xEFFFF );
171     }
172 
173     /**
174      * Tests if the specified character matches the <code>RestrictedChar</code>
175      * production in the XML specification.
176      *
177      * @param ch    the character to test
178      * @return  <code>true</code> if the character is a
179      * <code>RestrictedChar</code> and <code>false</code> otherwise
180      */
181     public boolean isRestrictedChar( final int ch ) {
182         return ( ch >= 0x01 && ch <= 0x08 )
183         || ch == 0x0B || ch == 0x0C
184         || ( ch >= 0xE && ch <= 0x1F )
185         || ( ch >= 0x7F && ch <= 0x84 )
186         || ( ch >= 0x86 && ch <= 0x9F );
187     }
188 
189 }