1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.millscript.commons.xml.tokenizer;
22
23 import org.millscript.commons.xml.alerts.XmlErrorAlert;
24
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.nio.charset.Charset;
28
29 /**
30 * This class provides an <code>XmlTokenizer</code> implementation for
31 * tokenizing an XML 1.0 document.
32 */
33 public class Xml10Tokenizer extends AbstractXmlTokenizerImpl {
34
35 /**
36 * Constructs a new XML 1.0 tokenizer to read from the specified input
37 * stream, using the specified character set, with optional namespace
38 * support.
39 *
40 * @param is the <code>InputStream</code> to read from
41 * @param cs the <code>Charset</code> to decode the
42 * <code>InputStream</code> with
43 * @param namespaceAware indicates if the tokenizer should be namespace
44 * aware
45 */
46 public Xml10Tokenizer( final InputStream is, final Charset cs, final boolean namespaceAware ) {
47 super( is, cs, namespaceAware );
48 }
49
50 /**
51 * Constructs a new XML 1.0 tokenizer to read from the specified reader,
52 * with optional namespace support.
53 *
54 * @param r the <code>Reader</code> to obtain characters from
55 * @param namespaceAware indicates if the tokenizer should be namespace
56 * aware
57 */
58 public Xml10Tokenizer( final Reader r, final boolean namespaceAware ) {
59 super( r, namespaceAware );
60 }
61
62 /**
63 * Constructs a new XML 1.0 tokenizer which will copy it's state from the
64 * specified existing tokenizer.
65 *
66 * @param a the existing tokenizer to copy state from
67 */
68 public Xml10Tokenizer( final AbstractXmlTokenizerImpl a ) {
69 super( a );
70 }
71
72 /**
73 * Constructs a new XML 1.0 tokenizer which will copy it's state from the
74 * specified existing tokenizer, but will use the specified reader instead
75 * of the one from the existing tokenizer.
76 *
77 * @param a the existing tokenizer to copy state from
78 * @param r the new reader this tokenizer should read characters from
79 */
80 public Xml10Tokenizer( final AbstractXmlTokenizerImpl a, final Reader r ) {
81 super( a, r );
82 }
83
84 /**
85 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#handleIntChar(int)
86 */
87 @Override
88 public int handleIntChar( final int ch ) {
89 if ( !this.isChar( ch ) ) {
90 throw new XmlErrorAlert( "Illegal Char in document" ).culpritChar( ch ).mishap();
91 } else if ( ch == 0x0D ) {
92 final int ch2 = this.getIntChar();
93 if ( ch2 == -1 ) {
94
95 return ch;
96 } else if ( ch2 != 0x0A ) {
97
98
99 this.pushBack( (char) ch2 );
100 }
101
102 this.lineNumber++;
103 this.columnNumber = 1;
104
105 return 0x0A;
106 } else if ( ch == 0x0A ) {
107
108 this.lineNumber++;
109 this.columnNumber = 1;
110
111 return 0x0A;
112 } else {
113
114 this.columnNumber++;
115 return ch;
116 }
117 }
118
119 /**
120 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isChar(int)
121 */
122 @Override
123 public boolean isChar( final int ch ) {
124 return ch == 0x09 || ch == 0x0A || ch == 0x0D
125 || ( ch >= 0x20 && ch <= 0xD7FF )
126 || ( ch >= 0xE000 && ch <= 0xFFFD )
127 || ( ch >= 0x10000 && ch <= 0x10FFFF );
128 }
129
130 /**
131 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameChar(char)
132 */
133 @Override
134 public boolean isNameChar( final char ch ) {
135
136 return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_' || ch == '-' || ch == '.'
137 || ( ch >= 'A' && ch <= 'Z' )
138 || ( ch >= '0' && ch <= '9' ) || ch == 0xB7
139 || ( ch >= 0xC0 && ch <= 0xD6 )
140 || ( ch >= 0xD8 && ch <= 0xF6 )
141 || ( ch >= 0xF8 && ch <= 0x2FF )
142 || ( ch >= 0x300 && ch <= 0x37D )
143 || ( ch >= 0x37F && ch <= 0x1FFF )
144 || ( ch >= 0x200C && ch <= 0x200D )
145 || ( ch >= 0x203F && ch <= 0x2040 )
146 || ( ch >= 0x2070 && ch <= 0x218F )
147 || ( ch >= 0x2C00 && ch <= 0x2FEF )
148 || ( ch >= 0x3001 && ch <= 0xD7FF )
149 || ( ch >= 0xF900 && ch <= 0xFDCF )
150 || ( ch >= 0xFDF0 && ch <= 0xFFFD )
151 || ( ch >= 0x10000 && ch <= 0xEFFFF );
152 }
153
154 /**
155 * @see org.millscript.commons.xml.tokenizer.AbstractXmlTokenizerImpl#isNameStartChar(char)
156 */
157 @Override
158 public boolean isNameStartChar( final char ch ) {
159
160 return ( ch >= 'a' && ch <= 'z' ) || ch == ':' || ch == '_'
161 || ( ch >= 'A' && ch <= 'Z' )
162 || ( ch >= 0xC0 && ch <= 0xD6 )
163 || ( ch >= 0xD8 && ch <= 0xF6 )
164 || ( ch >= 0xF8 && ch <= 0x2FF )
165 || ( ch >= 0x370 && ch <= 0x37D )
166 || ( ch >= 0x37F && ch <= 0x1FFF )
167 || ( ch >= 0x200C && ch <= 0x200D )
168 || ( ch >= 0x2070 && ch <= 0x218F )
169 || ( ch >= 0x2C00 && ch <= 0x2FEF )
170 || ( ch >= 0x3001 && ch <= 0xD7FF )
171 || ( ch >= 0xF900 && ch <= 0xFDCF )
172 || ( ch >= 0xFDF0 && ch <= 0xFFFD )
173 || ( ch >= 0x10000 && ch <= 0xEFFFF );
174 }
175
176 }