1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.millscript.commons.xml.tokenizer;
22
23 import org.millscript.commons.xml.alerts.XmlErrorAlert;
24
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.nio.charset.Charset;
28
29 /**
30 * This class provides an <code>XmlTokenizer</code> implementation for
31 * tokenizing an XML 1.1 document.
32 */
33 public class Xml11Tokenizer extends AbstractXmlTokenizerImpl {
34
35 /**
36 * Constructs a new XML 1.1 tokenizer to read from the specified input
37 * stream, using the specified character set, with optional namespace
38 * support.
39 *
40 * @param is the <code>InputStream</code> to read from
41 * @param cs the <code>Charset</code> to decode the
42 * <code>InputStream</code> with
43 * @param namespaceAware indicates if the tokenizer should be namespace
44 * aware
45 */
46 public Xml11Tokenizer( final InputStream is, final Charset cs, final boolean namespaceAware ) {
47 super( is, cs, namespaceAware );
48 }
49
50 /**
51 * Constructs a new XML 1.0 tokenizer to read from the specified reader,
52 * with optional namespace support.
53 *
54 * @param r the <code>Reader</code> to obtain characters from
55 * @param namespaceAware indicates if the tokenizer should be namespace
56 * aware
57 */
58 public Xml11Tokenizer( final Reader r, final boolean namespaceAware ) {
59 super( r, namespaceAware );
60 }
61
62 /**
63 * Constructs a new XML 1.0 tokenizer which will copy it's state from the
64 * specified existing tokenizer.
65 *
66 * @param a the existing tokenizer to copy state from
67 */
68 public Xml11Tokenizer( final AbstractXmlTokenizerImpl a ) {
69 super( a );
70 }
71
72 /**
73 * Constructs a new XML 1.0 tokenizer which will copy it's state from the
74 * specified existing tokenizer, but will use the specified reader instead
75 * of the one from the existing tokenizer.
76 *
77 * @param a the existing tokenizer to copy state from
78 * @param r the new reader this tokenizer should read characters from
79 */
80 public Xml11Tokenizer( final AbstractXmlTokenizerImpl a, final Reader r ) {
81 super( a, r );
82 }
83
84 /**
85 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#handleIntChar(int)
86 */
87 @Override
88 public int handleIntChar( final int ch ) {
89 if ( !this.isChar( ch ) || this.isRestrictedChar( ch ) ) {
90 throw new XmlErrorAlert( "Illegal Char in document" ).culpritChar( ch ).mishap();
91 } else if ( ch == 0x0D ) {
92 final int ch2 = this.getIntChar();
93 if ( ch2 == -1 ) {
94
95 return ch;
96 } else if ( ch2 != 0x0A && ch2 != 0x85 ) {
97
98
99 this.pushBack( (char) ch2 );
100 }
101
102 this.lineNumber++;
103 this.columnNumber = 1;
104
105 return 0x0A;
106 } else if ( ch == 0x0A || ch == 0x85 || ch == 0x2028 ) {
107
108 this.lineNumber++;
109 this.columnNumber = 1;
110
111 return 0x0A;
112 } else {
113
114 this.columnNumber++;
115 return ch;
116 }
117 }
118
119 /**
120 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isChar(int)
121 */
122 @Override
123 public boolean isChar( final int ch ) {
124 return ( ch >= 0x01 && ch <= 0xD7FF )
125 || ( ch >= 0xE000 && ch <= 0xFFFD )
126 || ( ch >= 0x10000 && ch <= 0x10FFFF );
127 }
128
129 /**
130 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameChar(char)
131 */
132 @Override
133 public boolean isNameChar( final char ch ) {
134 return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_' || ch == '-' || ch == '.'
135 || ( ch >= 'A' && ch <= 'Z' )
136 || ( ch >= '0' && ch <= '9' ) || ch == 0xB7
137 || ( ch >= 0xC0 && ch <= 0xD6 )
138 || ( ch >= 0xD8 && ch <= 0xF6 )
139 || ( ch >= 0xF8 && ch <= 0x2FF )
140 || ( ch >= 0x300 && ch <= 0x37D )
141 || ( ch >= 0x37F && ch <= 0x1FFF )
142 || ( ch >= 0x200C && ch <= 0x200D )
143 || ( ch >= 0x203F && ch <= 0x2040 )
144 || ( ch >= 0x2070 && ch <= 0x218F )
145 || ( ch >= 0x2C00 && ch <= 0x2FEF )
146 || ( ch >= 0x3001 && ch <= 0xD7FF )
147 || ( ch >= 0xF900 && ch <= 0xFDCF )
148 || ( ch >= 0xFDF0 && ch <= 0xFFFD )
149 || ( ch >= 0x10000 && ch <= 0xEFFFF );
150 }
151
152 /**
153 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameStartChar(char)
154 */
155 @Override
156 public boolean isNameStartChar( final char ch ) {
157 return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_'
158 || ( ch >= 'A' && ch <= 'Z' )
159 || ( ch >= 0xC0 && ch <= 0xD6 )
160 || ( ch >= 0xD8 && ch <= 0xF6 )
161 || ( ch >= 0xF8 && ch <= 0x2FF )
162 || ( ch >= 0x370 && ch <= 0x37D )
163 || ( ch >= 0x37F && ch <= 0x1FFF )
164 || ( ch >= 0x200C && ch <= 0x200D )
165 || ( ch >= 0x2070 && ch <= 0x218F )
166 || ( ch >= 0x2C00 && ch <= 0x2FEF )
167 || ( ch >= 0x3001 && ch <= 0xD7FF )
168 || ( ch >= 0xF900 && ch <= 0xFDCF )
169 || ( ch >= 0xFDF0 && ch <= 0xFFFD )
170 || ( ch >= 0x10000 && ch <= 0xEFFFF );
171 }
172
173 /**
174 * Tests if the specified character matches the <code>RestrictedChar</code>
175 * production in the XML specification.
176 *
177 * @param ch the character to test
178 * @return <code>true</code> if the character is a
179 * <code>RestrictedChar</code> and <code>false</code> otherwise
180 */
181 public boolean isRestrictedChar( final int ch ) {
182 return ( ch >= 0x01 && ch <= 0x08 )
183 || ch == 0x0B || ch == 0x0C
184 || ( ch >= 0xE && ch <= 0x1F )
185 || ( ch >= 0x7F && ch <= 0x84 )
186 || ( ch >= 0x86 && ch <= 0x9F );
187 }
188
189 }