1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.millscript.commons.xml.tokenizer;
22
23 import org.millscript.commons.alert.alerts.Fault;
24 import org.millscript.commons.alert.alerts.IOAlert;
25 import org.millscript.commons.util.EList;
26 import org.millscript.commons.util.EMap;
27 import org.millscript.commons.util.IMap;
28 import org.millscript.commons.util.list.EArrayList;
29 import org.millscript.commons.util.list.ELinkedList;
30 import org.millscript.commons.util.list.ISingletonList;
31 import org.millscript.commons.util.map.EHashMap;
32 import org.millscript.commons.xml.alerts.XmlErrorAlert;
33 import org.millscript.commons.xml.alerts.XmlValidityConstraintAlert;
34 import org.millscript.commons.xml.alerts.XmlWellFormednessAlert;
35 import org.millscript.commons.xml.api.AttDef;
36 import org.millscript.commons.xml.api.Contentspec;
37 import org.millscript.commons.xml.api.Name;
38 import org.millscript.commons.xml.api.contentspec.ChoiceContentspec;
39 import org.millscript.commons.xml.api.contentspec.CpContentspec;
40 import org.millscript.commons.xml.api.contentspec.SeqContentspec;
41 import org.millscript.commons.xml.api.token.AttListDeclToken;
42 import org.millscript.commons.xml.api.token.CharDataToken;
43 import org.millscript.commons.xml.api.token.CommentToken;
44 import org.millscript.commons.xml.api.token.DTDToken;
45 import org.millscript.commons.xml.api.token.ElementDeclToken;
46 import org.millscript.commons.xml.api.token.EndTagToken;
47 import org.millscript.commons.xml.api.token.EntityDeclToken;
48 import org.millscript.commons.xml.api.token.NotationDeclToken;
49 import org.millscript.commons.xml.api.token.PIToken;
50 import org.millscript.commons.xml.api.token.StartTagToken;
51 import org.millscript.commons.xml.api.token.Token;
52 import org.millscript.commons.xml.api.token.TokenVisitor;
53 import org.millscript.commons.xml.api.token.XmlDeclToken;
54 import org.millscript.commons.xml.api.tokenizer.XmlTokenizer;
55 import org.millscript.commons.xml.atttype.CdataAttDefImpl;
56 import org.millscript.commons.xml.atttype.EntitiesAttDefImpl;
57 import org.millscript.commons.xml.atttype.EntityAttDefImpl;
58 import org.millscript.commons.xml.atttype.EnumerationAttDefImpl;
59 import org.millscript.commons.xml.atttype.IdAttDefImpl;
60 import org.millscript.commons.xml.atttype.IdrefAttDefImpl;
61 import org.millscript.commons.xml.atttype.IdrefsAttDefImpl;
62 import org.millscript.commons.xml.atttype.NmtokenAttDefImpl;
63 import org.millscript.commons.xml.atttype.NmtokensAttDefImpl;
64 import org.millscript.commons.xml.atttype.NotationAttDefImpl;
65 import org.millscript.commons.xml.contentspec.AnyContentspecImpl;
66 import org.millscript.commons.xml.contentspec.ChoiceContentspecImpl;
67 import org.millscript.commons.xml.contentspec.EmptyContentspecImpl;
68 import org.millscript.commons.xml.contentspec.MixedContentspecImpl;
69 import org.millscript.commons.xml.contentspec.NameContentspecImpl;
70 import org.millscript.commons.xml.contentspec.SeqContentspecImpl;
71 import org.millscript.commons.xml.entity.CharacterEntityImpl;
72 import org.millscript.commons.xml.entity.EntityImpl;
73 import org.millscript.commons.xml.entity.ExternalGeneralEntity;
74 import org.millscript.commons.xml.entity.ExternalParameterEntity;
75 import org.millscript.commons.xml.entity.InternalGeneralEntity;
76 import org.millscript.commons.xml.entity.InternalParameterEntity;
77 import org.millscript.commons.xml.entity.UnparsedGeneralEntity;
78 import org.millscript.commons.xml.token.AttListDeclTokenImpl;
79 import org.millscript.commons.xml.token.CharDataTokenImpl;
80 import org.millscript.commons.xml.token.CommentTokenImpl;
81 import org.millscript.commons.xml.token.DTDTokenImpl;
82 import org.millscript.commons.xml.token.ElementDeclTokenImpl;
83 import org.millscript.commons.xml.token.EmptyElemTokenImpl;
84 import org.millscript.commons.xml.token.EndTagTokenImpl;
85 import org.millscript.commons.xml.token.EntityDeclTokenImpl;
86 import org.millscript.commons.xml.token.NotationDeclTokenImpl;
87 import org.millscript.commons.xml.token.PITokenImpl;
88 import org.millscript.commons.xml.token.StartTagTokenImpl;
89 import org.millscript.commons.xml.token.XmlDeclTokenImpl;
90
91 import java.io.IOException;
92 import java.io.InputStream;
93 import java.io.InputStreamReader;
94 import java.io.Reader;
95 import java.nio.charset.Charset;
96
97 /**
98 * This class provides an <code>XmlTokenizer</code> implementation that breaks
99 * an XML document into tokens, such as a start tag, end tag, character data,
100 * etc. This tokenizer will only perform a minimum number of well-formedness
101 * checks, such as for illegal characters, attributes, etc. This tokenizer does
102 * not perform checks such as for matching start/end tags, or that a DTD
103 * appears at the start of a document.
104 */
105 public abstract class AbstractXmlTokenizerImpl implements XmlTokenizer {
106
/**
 * The number of the current character on the current line.
 */
protected int columnNumber = 0;

/**
 * This buffer is used when constructing certain tokens, as characters
 * sometimes need to be appended from outside the individual token reading
 * methods.
 */
private final StringBuffer currentTokenData = new StringBuffer();

/**
 * Indicates when we've reached the end of the file.
 */
private boolean endOfFile = false;

/**
 * This map contains the mapping from an entity name to its
 * <code>Entity</code> object.
 */
private final EMap< String, EntityImpl > entities = new EHashMap< String, EntityImpl >();

/**
 * The current line number. Counting starts at one.
 */
protected int lineNumber = 1;

/**
 * NOTE(review): presumably maps a namespace prefix to its namespace name;
 * confirm against <code>PrefixToNamespaceMap</code>.
 */
private IMap< String, String > prefixToNamespace = new PrefixToNamespaceMap();

/**
 * This name tokenizer is used to tokenize qualified names. Depending on
 * whether we are handling namespaces, we can choose between a namespace
 * aware or a namespace ignoring name tokenizer for this.
 */
private final NameTokenizer nameTokenizer;

/**
 * This is the buffer we use when we need to push back characters, which we
 * can then tokenize. It is used as a stack: the most recently pushed
 * character is the next one to be read.
 */
private char[] pushBackBuffer = new char[ 16 ];

/**
 * The current position in the push back buffer, i.e. the index of the most
 * recently pushed character, or <code>-1</code> when the buffer is empty.
 */
private int pushBackPos = -1;

/**
 * The reader we should obtain characters to tokenize from.
 */
private final Reader reader;
159
/**
 * Creates an XML tokenizer that decodes the given input stream with the
 * given character set and tokenizes the decoded characters, with optional
 * namespace support.
 *
 * @param is the <code>InputStream</code> to read from
 * @param cs the <code>Charset</code> used to decode the
 * <code>InputStream</code>
 * @param namespaceAware indicates if the tokenizer should be namespace
 * aware
 */
protected AbstractXmlTokenizerImpl( final InputStream is, final Charset cs, final boolean namespaceAware ) {
    // Wrap the byte stream in a decoding reader and hand over to the reader based constructor.
    this( new InputStreamReader( is, cs ), namespaceAware );
}
176
/**
 * Constructs a new XML tokenizer to read from the specified reader, with
 * optional namespace support. The five predefined XML entities
 * (<code>amp</code>, <code>lt</code>, <code>gt</code>, <code>apos</code> and
 * <code>quot</code>) are registered before anything is read.
 *
 * @param r the <code>Reader</code> to obtain characters from
 * @param namespaceAware indicates if the tokenizer should be namespace
 * aware
 */
protected AbstractXmlTokenizerImpl( final Reader r, final boolean namespaceAware ) {
    // Register the predefined entities.
    this.entities.insert( "amp", new InternalGeneralEntity( this, "amp", "&" ) );
    this.entities.insert( "lt", new InternalGeneralEntity( this, "lt", "<" ) );
    this.entities.insert( "gt", new InternalGeneralEntity( this, "gt", ">" ) );
    this.entities.insert( "apos", new InternalGeneralEntity( this, "apos", "'" ) );
    // The replacement text is a single double quote, which has to be
    // escaped inside a Java string literal.
    this.entities.insert( "quot", new InternalGeneralEntity( this, "quot", "\"" ) );
    this.reader = r;
    // Choose how qualified names are tokenized.
    if ( namespaceAware ) {
        this.nameTokenizer = new NamespaceAwareNameTokenizer( this );
    } else {
        this.nameTokenizer = new NamespaceIgnoreNameTokenizer( this );
    }
}
199
/**
 * Creates an XML tokenizer that takes over the state of an existing
 * tokenizer and keeps reading from that tokenizer's reader.
 *
 * @param axti the existing tokenizer to copy state from
 */
protected AbstractXmlTokenizerImpl( final AbstractXmlTokenizerImpl axti ) {
    // Same as the two argument copy constructor, reusing the existing reader.
    this( axti, axti.reader );
}
209
/**
 * Constructs a new XML tokenizer which will copy its state from the
 * specified existing tokenizer, but will use the specified reader instead
 * of the one from the existing tokenizer.
 * <p>
 * The column and line numbers, the end of file flag, the known entities,
 * the kind of name tokenizer and the pushed back characters are copied.
 * </p>
 *
 * @param axti the existing tokenizer to copy state from
 * @param rr the new reader this tokenizer should read characters from
 */
protected AbstractXmlTokenizerImpl( final AbstractXmlTokenizerImpl axti, final Reader rr ) {
    this.columnNumber = axti.columnNumber;
    this.endOfFile = axti.endOfFile;
    this.entities.insertAll( axti.entities );
    this.lineNumber = axti.lineNumber;
    // A name tokenizer is bound to one tokenizer, so create a fresh one of
    // the same kind rather than sharing the existing instance.
    if ( axti.nameTokenizer instanceof NamespaceAwareNameTokenizer ) {
        this.nameTokenizer = new NamespaceAwareNameTokenizer( this );
    } else {
        this.nameTokenizer = new NamespaceIgnoreNameTokenizer( this );
    }
    // Take a private copy of the pushed back characters. Each tokenizer
    // tracks its own position, so sharing the array would let a push back
    // on one tokenizer overwrite characters the other has not read yet.
    this.pushBackBuffer = axti.pushBackBuffer.clone();
    this.pushBackPos = axti.pushBackPos;
    this.reader = rr;
    // NOTE(review): currentTokenData and prefixToNamespace are not copied
    // from the existing tokenizer - confirm that this is intended.
}
232
233 /**
234 * Appends the specified <code>char</code> to the current token.
235 *
236 * @param ch the <code>char</code> to append
237 */
238 public void appendCurrentTokenData( final char ch ) {
239 this.currentTokenData.append( ch );
240 }
241
242 /**
243 * Calculates the a suitable new size for the push back buffer, based on
244 * the current size and the required free space.
245 *
246 * @param currentLength the length of the push back buffer
247 * @param spaceRequired the required amount of free space
248 * @return an <code>int</code> size for the push back buffer that contains
249 * at least the required free space
250 */
251 private int calcNewSize( final int currentLength, final int spaceRequired ) {
252 if ( currentLength - this.pushBackPos < spaceRequired ) {
253 return this.calcNewSize( currentLength * 2, spaceRequired );
254 } else {
255 return currentLength;
256 }
257 }
258
259 /**
260 * Drops and characters from the input stream that match the <code>S</code>
261 * production in the XML specification.
262 * <pre>
263 * [3] S ::= (#x20 | #x9 | #xD | #xA)+
264 * </pre>
265 */
266 public void dropS() {
267
268 char ch = this.getChar();
269
270 while ( this.isS( ch ) ) {
271 ch = this.getChar();
272 }
273
274 this.pushBack( ch );
275 }
276
277 /**
278 * Returns the next character from the input stream, throwing an alert if
279 * the end of file is reached.
280 *
281 * @return the next <code>char</code> from the input stream
282 */
283 public char getChar() {
284 final int ch = this.getIntChar();
285 if ( ch == -1 ) {
286 throw new IOAlert( "Unexpected end of file" ).mishap();
287 } else {
288 return (char) ch;
289 }
290 }
291
292 /**
293 * Returns the raw <code>int</code> version of the next char, handling any
294 * push back characters and XML version dependencies. This method accounts
295 * for the set of legal characters in an XML document.
296 *
297 * @return the <code>int</code> version of the next char or
298 * <code>-1</code> if there are no more characters
299 */
300 public int getIntChar() {
301
302 if ( this.pushBackPos == -1 ) {
303
304 try {
305 final int ch = this.reader.read();
306 if ( ch == -1 ) {
307 this.endOfFile = true;
308 return ch;
309 }
310 return this.handleIntChar( ch );
311 } catch ( IOException e ) {
312 throw new IOAlert().setParentThrowable( e ).mishap();
313 }
314 } else if ( this.endOfFile ) {
315
316 return -1;
317 } else {
318
319 return this.pushBackBuffer[ this.pushBackPos-- ];
320 }
321 }
322
323 /**
324 * @see org.millscript.commons.xml.api.tokenizer.XmlTokenizer#getLineNumber()
325 */
326 public int getLineNumber() {
327 return this.lineNumber;
328 }
329
330 /**
331 * Returns the next <code>char</code>, checking that it is a legal quote
332 * character.
333 *
334 * @return the next <code>char</code>, if it is a legal quote character
335 */
336 public char getQuoteChar() {
337 final char ch = this.getChar();
338 if ( ch == '"' || ch == '\'' ) {
339 return ch;
340 } else {
341 throw new XmlWellFormednessAlert(
342 "Illegal quote character"
343 ).culpritChar( ch ).mishap();
344 }
345 }
346
/**
 * Handles the specified character, performing any XML version dependent
 * line break conversions and checks on its validity. This is applied to
 * every character obtained from the underlying reader by
 * {@link #getIntChar()} and {@link #hasNextToken()}; characters returned
 * from the push back buffer are not passed through it again.
 *
 * @param ch the character to handle
 * @return the handled character, which may not be the same as that
 * supplied as the argument
 */
public abstract int handleIntChar( final int ch );
356
357 /**
358 * @see org.millscript.commons.xml.api.tokenizer.XmlTokenizer#hasNextToken()
359 */
360 public boolean hasNextToken() {
361
362 if ( this.pushBackPos == -1 ) {
363
364 try {
365 final int ch = this.reader.read();
366 if ( ch == -1 ) {
367 this.endOfFile = true;
368 return false;
369 }
370
371
372 this.pushBack( (char) this.handleIntChar( ch ) );
373
374 return true;
375 } catch ( IOException e ) {
376 throw new IOAlert().setParentThrowable( e ).mishap();
377 }
378 } else {
379
380
381
382 return !this.endOfFile;
383 }
384 }
385
/**
 * Tests if the specified character matches the <code>Char</code>
 * production in the XML specification. The answer is left to the
 * subclass.
 *
 * @param ch the character to test
 * @return <code>true</code> if the character is a <code>Char</code> and
 * <code>false</code> otherwise
 */
public abstract boolean isChar( final int ch );
395
/**
 * Tests if the specified character matches the <code>NameChar</code>
 * production in the XML specification. The answer is left to the
 * subclass; {@link #isNCNameChar(char)} builds on this test.
 *
 * @param ch the character to test
 * @return <code>true</code> if the character is a <code>NameChar</code>
 * and <code>false</code> otherwise
 */
public abstract boolean isNameChar( final char ch );
405
/**
 * Tests if the specified character matches the <code>NameStartChar</code>
 * production in the XML specification. The answer is left to the
 * subclass; {@link #isNCNameStartChar(char)} builds on this test.
 *
 * @param ch the character to test
 * @return <code>true</code> if the character is a
 * <code>NameStartChar</code> and <code>false</code> otherwise
 */
public abstract boolean isNameStartChar( final char ch );
415
416 /**
417 * Tests if the specified character matches the <code>NCNameChar</code>
418 * production in the XML namespace specification.
419 *
420 * @param ch the character to test
421 * @return <code>true</code> if the character is a <code>NCNameChar</code>
422 * and <code>false</code> otherwise
423 */
424 public boolean isNCNameChar( final char ch ) {
425 return ch != ':' && this.isNameChar( ch );
426 }
427
428 /**
429 * Tests if the specified character matches the
430 * <code>NCNameStartChar</code> production in the XML namespace
431 * specification.
432 *
433 * @param ch the character to test
434 * @return <code>true</code> if the character is a
435 * <code>NCNameStartChar</code> and <code>false</code> otherwise
436 */
437 public boolean isNCNameStartChar( final char ch ) {
438 return ch != ':' && this.isNameStartChar( ch );
439 }
440
441 /**
442 * Tests if the specified character matches the <code>S</code> production
443 * in the XML specification.
444 * <pre>
445 * [3] S ::= (#x20 | #x9 | #xD | #xA)+
446 * </pre>
447 *
448 * @param ch the character to test
449 * @return <code>true</code> if the character is a <code>S</code> character
450 * and <code>false</code> otherwise
451 */
452 public boolean isS( final int ch ) {
453 return ch == 0x20 || ch == 0x09 || ch == 0x0D || ch == 0x0A;
454 }
455
456 /**
457 * Tests that the next character is the specified one, otherwise it throws
458 * an Alert.
459 *
460 * @param testch the character we must read next
461 */
462 public void mustRead( final char testch ) {
463 final char ch = this.getChar();
464 if ( ch != testch ) {
465 throw new XmlErrorAlert(
466 "Unexpected character"
467 ).culprit(
468 "found",
469 new Character( ch )
470 ).culprit(
471 "wanted",
472 new Character( testch )
473 ).mishap();
474 }
475 }
476
477 /**
478 * Tests if the next input sequence matches the <code>Eq</code> production
479 * in the XML specification, otherwise it throws an Alert. If the sequence
480 * matches, it will be dropped.
481 * <pre>
482 * [25] Eq ::= S? '=' S?
483 * </pre>
484 */
485 public void mustReadEq() {
486
487 this.dropS();
488
489 char ch = this.getChar();
490 if ( ch != '=' ) {
491 throw new XmlWellFormednessAlert( "'=' character expected" ).culpritChar( ch ).mishap();
492 }
493
494 this.dropS();
495 }
496
497 /**
498 * Tests if the next input sequence matches the <code>S</code> production
499 * in the XML specification, otherwise it throws an Alert. If the sequence
500 * matches, it will be dropped.
501 * <pre>
502 * [25] Eq ::= S? '=' S?
503 * </pre>
504 */
505 public void mustReadS() {
506 final char ch = this.getChar();
507 if ( this.isS( ch ) ) {
508 this.dropS();
509 } else {
510 throw new XmlWellFormednessAlert( "White space expected" ).culpritChar( ch ).mishap();
511 }
512 }
513
514 /**
515 * @see org.millscript.commons.xml.api.tokenizer.XmlTokenizer#nextToken()
516 */
517 public Token nextToken() {
518 if ( this.tryRead( '<' ) ) {
519
520
521
522 final char ch2 = this.getChar();
523 switch ( ch2 ) {
524 case '/':
525
526 return this.readETag();
527 case '!':
528
529
530 if ( this.tryRead( '-', '-' ) ) {
531
532 return this.readComment();
533 } else if ( this.tryRead( '[' ) ) {
534
535 return this.readCDSect();
536 } else {
537 final String name = this.readName();
538 if ( name.equals( "DOCTYPE" ) ) {
539
540 return this.readDoctypeDecl();
541 } else if ( name.equals( "ELEMENT" ) ) {
542
543 return this.readElementDecl();
544 } else if ( name.equals( "ATTLIST" ) ) {
545
546 return this.readAttlistDecl();
547 } else if ( name.equals( "ENTITY" ) ) {
548
549 return this.readEntityDecl();
550 } else if ( name.equals( "NOTATION" ) ) {
551
552 return this.readNotationDecl();
553 } else {
554
555 throw new XmlWellFormednessAlert(
556 "Expected doctype or markup declaration"
557 ).culpritToken( "<!" + name ).mishap();
558 }
559 }
560 case '?':
561
562 return this.readPI();
563 default:
564
565 this.pushBack( ch2 );
566 return this.readSTag();
567 }
568 } else {
569
570 return this.readCharData();
571 }
572 }
573
574 /**
575 * Tests that the next character is the specified one.
576 *
577 * @param testch the character to test for.
578 * @return <code>true</code> if the character is the required one and
579 * <code>false</code> otherwise
580 */
581 public boolean peekRead( final char testch ) {
582 final char ch = this.getChar();
583 this.pushBack( ch );
584 if ( ch == testch ) {
585 return true;
586 }
587 return false;
588 }
589
590 /**
591 * Tests if the next available character matches the <code>S</code>
592 * production in the XML specification.
593 * <pre>
594 * [3] S ::= (#x20 | #x9 | #xD | #xA)+
595 * </pre>
596 *
597 * @return <code>true</code> if the next character is a <code>S</code>
598 * character and <code>false</code> otherwise
599 */
600 public boolean peekS() {
601 final char ch = this.getChar();
602 this.pushBack( ch );
603 return this.isS( ch );
604 }
605
606 /**
607 * Pushes back the specified character so it will be the next one returned
608 * by the {@link #getChar()} method.
609 *
610 * @param ch the <code>char</code> to push back
611 */
612 public void pushBack( final char ch ) {
613 if ( ++this.pushBackPos == this.pushBackBuffer.length ) {
614 final char[] oldBuffer = this.pushBackBuffer;
615
616 this.pushBackBuffer = new char[ this.pushBackBuffer.length * 2 ];
617
618 System.arraycopy( oldBuffer, 0, this.pushBackBuffer, 0, oldBuffer.length );
619 }
620
621 this.pushBackBuffer[ this.pushBackPos ] = ch;
622 }
623
624 /**
625 * Pushes back all the characters in the string, so they will be returned
626 * by subsequent calls to the {@link #getChar()} method. The characters are
627 * pushed in reverse order, so that the first character in the string will
628 * be the first character returned by {@link #getChar()}.
629 *
630 * @param s the <code>String</code> to push back
631 */
632 public void pushBack( final String s ) {
633 final int newSize = this.calcNewSize( this.pushBackBuffer.length, s.length() );
634 if ( this.pushBackBuffer.length != newSize ) {
635 final char[] oldBuffer = this.pushBackBuffer;
636
637 this.pushBackBuffer = new char[ newSize ];
638
639 System.arraycopy( oldBuffer, 0, this.pushBackBuffer, 0, oldBuffer.length );
640 }
641
642 for ( int i = s.length() - 1; i >= 0; i-- ) {
643 this.pushBack( s.charAt( i ) );
644 }
645 }
646
647 /**
648 * Returns the next input sequence as an attribute list declaration token.
649 * This will generate an <code>Alert</code> if the input sequence doesn't
650 * match the <code>AttlistDecl</code> production in the XML specification.
651 * <pre>
652 * [52] AttlistDecl ::='<!ATTLIST' S Name AttDef* S? '>'
653 * [53] AttDef ::= S Name S AttType S DefaultDecl
654 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
655 * [55] StringType ::= 'CDATA'
656 * [56] TokenizedType ::= 'ID' [VC: ID][VC: One ID per Element Type][VC: ID Attribute Default]
657 * | 'IDREF' [VC: IDREF]
658 * | 'IDREFS' [VC: IDREF]
659 * | 'ENTITY' [VC: Entity Name]
660 * | 'ENTITIES' [VC: Entity Name]
661 * | 'NMTOKEN' [VC: Name Token]
662 * | 'NMTOKENS' [VC: Name Token]
663 * [57] EnumeratedType ::= NotationType | Enumeration
664 * [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' [VC: Notation Attributes][VC: One Notation Per Element Type][VC: No Notation on Empty Element][VC: No Duplicate Tokens]
665 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] [VC: No Duplicate Tokens]
666 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue) [VC: Required Attribute][VC: Attribute Default Value Syntactically Correct][WFC: No < in Attribute Values][VC: Fixed Attribute Default]
667 * </pre>
668 * <p>
669 * When this method is called the identifying sequence, i.e.
670 * '<code><!ATTLIST</code>', and it should NOT be expected.
671 * </p>
672 *
673 * @return an <code>AttListDeclToken</code> for the attribute list
674 * declaration
675 */
676 public AttListDeclToken readAttlistDecl() {
677 this.mustReadS();
678 final Name elementName = this.nameTokenizer.readTagName();
679 this.dropS();
680 final EList< AttDef > attDefList = new ELinkedList< AttDef >();
681 while ( !this.tryRead( '>' ) ) {
682 final Name attributeName = this.nameTokenizer.readTagName();
683 if ( this.tryRead( '(' ) ) {
684
685 final EList< String > enumerationNmtokens = new ELinkedList< String >();
686
687 enumerationNmtokens.addLast( this.readNmtoken() );
688
689 this.dropS();
690
691 while ( !this.tryRead( ')' ) ) {
692 this.mustRead( '|' );
693 this.dropS();
694 enumerationNmtokens.addLast( this.readNmtoken() );
695 this.dropS();
696 }
697 attDefList.addLast(
698 new EnumerationAttDefImpl( attributeName, enumerationNmtokens, this )
699 );
700 } else {
701 final String attType = this.readNmtoken();
702 if ( attType.equals( "CDATA" ) ) {
703 attDefList.addLast(
704 new CdataAttDefImpl( attributeName, this )
705 );
706 } else if ( attType.equals( "ID" ) ) {
707 attDefList.addLast(
708 new IdAttDefImpl( attributeName, this )
709 );
710 } else if ( attType.equals( "IDREF" ) ) {
711 attDefList.addLast(
712 new IdrefAttDefImpl( attributeName, this )
713 );
714 } else if ( attType.equals( "IDREFS" ) ) {
715 attDefList.addLast(
716 new IdrefsAttDefImpl( attributeName, this )
717 );
718 } else if ( attType.equals( "ENTITY" ) ) {
719 attDefList.addLast(
720 new EntityAttDefImpl( attributeName, this )
721 );
722 } else if ( attType.equals( "ENTITIES" ) ) {
723 attDefList.addLast(
724 new EntitiesAttDefImpl( attributeName, this )
725 );
726 } else if ( attType.equals( "NMTOKEN" ) ) {
727 attDefList.addLast(
728 new NmtokenAttDefImpl( attributeName, this )
729 );
730 } else if ( attType.equals( "NMTOKENS" ) ) {
731 attDefList.addLast(
732 new NmtokensAttDefImpl( attributeName, this )
733 );
734 } else if ( attType.equals( "NOTATION" ) ) {
735 final EList< String > notationNames = new ELinkedList< String >();
736
737 notationNames.addLast( this.readName() );
738
739 this.dropS();
740
741 while ( !this.tryRead( ')' ) ) {
742 this.mustRead( '|' );
743 this.dropS();
744 notationNames.addLast( this.readName() );
745 this.dropS();
746 }
747 attDefList.addLast(
748 new NotationAttDefImpl( attributeName, notationNames, this )
749 );
750 }
751 }
752 }
753
754 return new AttListDeclTokenImpl( elementName, attDefList );
755 }
756
757 /**
758 * Returns the next input sequence as an attribute value string. This will
759 * generate an <code>Alert</code> if the input sequence doesn't match the
760 * <code>AttValue</code> production in the XML specification.
761 *
762 * @return a <code>String</code> holding the attribute value
763 */
764 public String readAttValue() {
765
766 this.mustReadEq();
767
768 this.currentTokenData.setLength( 0 );
769
770 final char quoteChar = this.getQuoteChar();
771
772 for ( char ch = this.getChar(); ch != quoteChar; ch = this.getChar() ) {
773 switch ( ch ) {
774 case '<':
775 throw new XmlWellFormednessAlert(
776 "Attribute values cannot contain the '<' character"
777 ).mishap();
778 case '&':
779
780 this.readReference().referenceInAttributeValue( quoteChar );
781 break;
782 case ' ':
783 case '\n':
784 case '\r':
785 case '\t':
786
787 this.currentTokenData.append( ' ' );
788 break;
789 default:
790 this.currentTokenData.append( ch );
791 break;
792 }
793 }
794
795
796 return this.currentTokenData.toString();
797 }
798
799 /**
800 * Returns the next input sequence as a CDATA section. This will generate
801 * an <code>Alert</code> if the input sequence doesn't match the
802 * <code>CDSect</code> production in the XML specification.
803 * <pre>
804 * [18] CDSect ::= CDStart CData CDEnd
805 * [19] CDStart ::= '<![CDATA['
806 * [20] CData ::= (Char* - (Char* ']]>' Char*))
807 * [21] CDEnd ::= ']]>'
808 * </pre>
809 * <p>
810 * When this method is called the first three characters
811 * '<code><![</code>' will have already been processed and should NOT be
812 * expected.
813 * </p>
814 *
815 * @return a <code>CharDataToken</code> for the CDATA section
816 */
817 public CharDataToken readCDSect() {
818
819 this.currentTokenData.setLength( 0 );
820 for ( int c = 0; c < 6; c++ ) {
821 this.currentTokenData.append( this.getChar() );
822 }
823 boolean justWhiteSpace = true;
824 if ( this.currentTokenData.lastIndexOf( "CDATA[" ) == 0 ) {
825
826 this.currentTokenData.setLength( 0 );
827 for ( char ch = this.getChar();; ch = this.getChar() ) {
828 if ( ch == ']' && this.tryRead( ']', '>' ) ) {
829
830
831 return new CharDataTokenImpl(
832 this.currentTokenData.toString(),
833 true,
834 justWhiteSpace
835 );
836 } else {
837
838
839
840 if ( justWhiteSpace && !this.isS( ch ) ) {
841
842 justWhiteSpace = false;
843 }
844 this.currentTokenData.append( ch );
845 }
846 }
847 } else {
848 throw new XmlWellFormednessAlert(
849 "Expected CDStart tag"
850 ).culpritToken(
851 this.currentTokenData.insert( 0, "<!" ).toString()
852 ).mishap();
853 }
854 }
855
856 /**
857 * Returns the next input sequence as a character data token. This will
858 * generate an <code>Alert</code> if the input sequence doesn't match the
859 * <code>CharData</code> production in the XML specification.
860 * <pre>
861 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
862 * </pre>
863 *
864 * @return a <code>CharDataToken</code> for the character data
865 */
866 public CharDataToken readCharData() {
867 this.currentTokenData.setLength( 0 );
868 boolean justWhiteSpace = true;
869 for ( int ch = this.getIntChar(); ch != -1; ch = this.getIntChar() ) {
870 if ( ch == '<' ) {
871
872 this.pushBack( (char) ch );
873
874 return new CharDataTokenImpl(
875 this.currentTokenData.toString(),
876 false,
877 justWhiteSpace
878 );
879 } else if ( ch == '&' ) {
880
881 this.readReference().referenceInContent();
882 } else if ( ch == ']' && this.tryRead( ']', '>' ) ) {
883
884 throw new XmlWellFormednessAlert(
885 "']]>' sequence is not allowed in CharData"
886 ).mishap();
887 } else {
888
889
890
891 if ( justWhiteSpace & !this.isS( ch ) ) {
892
893 justWhiteSpace = false;
894 }
895 this.currentTokenData.append( (char) ch );
896 }
897 }
898 if ( this.currentTokenData.length() != 0 ) {
899
900
901 return new CharDataTokenImpl(
902 this.currentTokenData.toString(),
903 false,
904 justWhiteSpace
905 );
906 } else {
907
908 throw new Fault( "This message indicates we need an EOFToken" ).mishap();
909 }
910 }
911
912 /**
913 * Returns the next input sequence as a children content particle model.
914 * This will generate an <code>Alert</code> if the input sequence doesn't
915 * match the <code>children</code> production in the XML specification.
916 * <pre>
917 * [47] children ::= (choice | seq) ('?' | '*' | '+')?
918 * [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' [VC: Proper Group/PE Nesting]
919 * </pre>
920 *
921 * @return a <code>CpContentspec</code> for the children declaration
922 */
923 CpContentspec readChildren() {
924 if ( this.tryRead( '(' ) ) {
925
926
927 this.dropS();
928
929 CpContentspec cs = this.readCp();
930
931 this.dropS();
932
933 if ( this.tryRead( '|' ) ) {
934
935 cs = this.readChoice( cs );
936 } else if ( this.tryRead( ',' ) ) {
937
938 cs = this.readSeq( cs );
939 } else if ( this.tryRead( ')' ) ) {
940
941 cs = new SeqContentspecImpl( new ISingletonList< CpContentspec >( cs ) );
942 } else {
943
944 throw new XmlWellFormednessAlert(
945 "Expected a choice('|') or sequence(',') in the contentspec"
946 ).culpritChar( this.getChar() ).mishap();
947 }
948
949 if ( this.tryRead( '?' ) ) {
950 cs.setMatchMode( CpContentspec.ZERO_OR_ONE );
951 } else if ( this.tryRead( '*' ) ) {
952 cs.setMatchMode( CpContentspec.ZERO_OR_MORE );
953 } else if ( this.tryRead( '+' ) ) {
954 cs.setMatchMode( CpContentspec.ONE_OR_MORE );
955 }
956
957 return cs;
958 } else {
959 throw new XmlWellFormednessAlert(
960 "Expected '(' to start a choice or sequence in the element declaration"
961 ).culpritChar( this.getChar() ).mishap();
962 }
963 }
964
965 /**
966 * Returns the next input sequence as a choice content particle model.
967 * This will generate an <code>Alert</code> if the input sequence doesn't
968 * match the <code>choice</code> production in the XML specification.
969 * <pre>
970 * [49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' [VC: Proper Group/PE Nesting]
971 * </pre>
972 *
973 * @param first the first content particle in the choice
974 * @return a <code>ChoiceContentspec</code> for the choice declaration
975 */
976 ChoiceContentspec readChoice( final CpContentspec first ) {
977 final EList< CpContentspec > list = new ELinkedList< CpContentspec >();
978 list.addLast( first );
979
980 this.dropS();
981
982 list.addLast( this.readCp() );
983
984 while ( !this.tryRead( ')' ) ) {
985 this.mustRead( '|' );
986 this.dropS();
987 list.addLast( this.readCp() );
988 this.dropS();
989 }
990 return new ChoiceContentspecImpl( list );
991 }
992
993 /**
994 * Returns the next input sequence as a comment token. This will generate
995 * an <code>Alert</code> if the input sequence doesn't match the
996 * <code>CharData</code> production in the XML specification.
997 * <pre>
998 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
999 * </pre>
1000 * <p>
1001 * When this method is called the first four characters
1002 * '<code><!--</code>' will have already been processed and should NOT
1003 * be expected.
1004 * </p>
1005 *
1006 * @return a <code>CommentToken</code> for the comment
1007 */
1008 public CommentToken readComment() {
1009 this.currentTokenData.setLength( 0 );
1010 for ( char ch = this.getChar();; ch = this.getChar() ) {
1011 if ( ch == '-' ) {
1012 char ch2 = this.getChar();
1013 if ( ch2 == '-' ) {
1014 char ch3 = this.getChar();
1015 if ( ch3 == '>' ) {
1016
1017
1018 return new CommentTokenImpl(
1019 this.currentTokenData.toString()
1020 );
1021 } else {
1022
1023
1024 throw new XmlWellFormednessAlert(
1025 "The sequence '--' is not allowed in the body of a comment"
1026 ).mishap();
1027 }
1028 } else {
1029
1030
1031 this.currentTokenData.append( ch );
1032 this.currentTokenData.append( ch2 );
1033 }
1034 } else {
1035 this.currentTokenData.append( ch );
1036 }
1037 }
1038 }
1039
1040 /**
1041 * Returns the next input sequence as an element content model. This will
1042 * generate an <code>Alert</code> if the input sequence doesn't match the
1043 * <code>contentspec</code> production in the XML specification.
1044 * <pre>
1045 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
1046 * </pre>
1047 *
1048 * @return a <code>Contentspec</code> for the content model
1049 */
1050 Contentspec readContentspec() {
1051 if ( this.tryRead( '(' ) ) {
1052
1053
1054 this.dropS();
1055 if ( this.tryRead( '#' ) ) {
1056
1057 final String name = this.readNmtoken();
1058 if ( name.equals( "#PCDATA" ) ) {
1059
1060 return this.readMixed();
1061 } else {
1062
1063 throw new XmlWellFormednessAlert(
1064 "Expected '#PCDATA'"
1065 ).culpritToken( name ).mishap();
1066 }
1067 } else {
1068
1069 return this.readChildren();
1070 }
1071 } else {
1072
1073 final String name = this.readNmtoken();
1074 if ( name.equals( "EMPTY" ) ) {
1075 return new EmptyContentspecImpl();
1076 } else if ( name.equals( "ANY" ) ) {
1077 return new AnyContentspecImpl();
1078 } else {
1079 throw new XmlWellFormednessAlert(
1080 "Expected 'EMPTY', 'ANY', Mixed or children"
1081 ).culpritToken( name ).mishap();
1082 }
1083 }
1084 }
1085
1086 /**
1087 * Returns the next input sequence as a content particle in a content
1088 * model. This will generate an <code>Alert</code> if the input sequence
1089 * doesn't match the <code>cp</code> production in the XML specification.
1090 * <pre>
1091 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1092 * </pre>
1093 *
1094 * @return a <code>CpContentspec</code> for the content particle
1095 */
1096 CpContentspec readCp() {
1097 CpContentspec cs = null;
1098 if ( this.peekRead( '(' ) ) {
1099
1100 cs = this.readChildren();
1101 } else {
1102
1103 cs = new NameContentspecImpl( this.nameTokenizer.readTagName() );
1104 }
1105
1106 if ( this.tryRead( '?' ) ) {
1107 cs.setMatchMode( CpContentspec.ZERO_OR_ONE );
1108 } else if ( this.tryRead( '*' ) ) {
1109 cs.setMatchMode( CpContentspec.ZERO_OR_MORE );
1110 } else if ( this.tryRead( '+' ) ) {
1111 cs.setMatchMode( CpContentspec.ONE_OR_MORE );
1112 }
1113
1114 return cs;
1115 }
1116
1117 /**
1118 * Returns the next input sequence as a document type declaration token.
1119 * This will generate an <code>Alert</code> if the input sequence doesn't
1120 * match the <code>doctypedecl</code> production in the XML specification.
1121 * <pre>
1122 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' [VC: Root Element Type] [WFC: External Subset]
1123 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
1124 * </pre>
1125 * <p>
1126 * When this method is called the identifying sequence, i.e.
1127 * '<code><!DOCTYPE</code>', has already been processed and should NOT be expected.
1128 * </p>
1129 *
1130 * @return a <code>DTDToken</code> for the document type declaration
1131 */
1132 public DTDToken readDoctypeDecl() {
1133 this.dropS();
1134 final Name name = this.nameTokenizer.readTagName();
1135 String sysid = null;
1136 String pubid = null;
1137 this.dropS();
1138 if ( this.peekRead( 'S' ) || this.peekRead( 'P' ) ) {
1139 final String id = this.readNmtoken();
1140 this.dropS();
1141 if ( id.equals( "SYSTEM" ) ) {
1142 sysid = this.readSystemLiteral();
1143 } else if ( id.equals( "PUBLIC" ) ) {
1144 pubid = this.readPubidLiteral();
1145 this.mustReadS();
1146 sysid = this.readSystemLiteral();
1147 } else {
1148
1149
1150 throw new XmlWellFormednessAlert(
1151 "An ExternalID must start with 'SYSTEM' or 'PUBLIC'"
1152 ).culpritToken( id ).mishap();
1153 }
1154
1155 this.dropS();
1156 }
1157
1158
1159 if ( this.tryRead( '[' ) ) {
1160
1161 this.readIntSubset();
1162 }
1163
1164
1165 this.dropS();
1166 this.mustRead( '>' );
1167
1168 return new DTDTokenImpl( name, pubid, sysid );
1169 }
1170
1171 /**
1172 * Returns the next input sequence as an element declaration token. This
1173 * will generate an <code>Alert</code> if the input sequence doesn't match
1174 * the <code>elementdecl</code> production in the XML specification.
1175 * <pre>
1176 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' [VC: Unique Element Type Declaration]
1177 * </pre>
1178 * <p>
1179 * When this method is called the identifying sequence, i.e.
1180 * '<code><!ELEMENT</code>', has already been processed and should NOT be expected.
1181 * </p>
1182 *
1183 * @return an <code>ElementDeclToken</code> for the element declaration
1184 */
1185 public ElementDeclToken readElementDecl() {
1186
1187 this.mustReadS();
1188 final Name name = this.nameTokenizer.readTagName();
1189 this.mustReadS();
1190
1191 final Contentspec contentspec = this.readContentspec();
1192
1193 this.dropS();
1194 if ( this.tryRead( '>' ) ) {
1195
1196 return new ElementDeclTokenImpl( name, contentspec );
1197 } else {
1198 throw new XmlWellFormednessAlert(
1199 "Expected '>' to end element declaration"
1200 ).culpritChar( this.getChar() ).mishap();
1201 }
1202 }
1203
1204 /**
1205 * Returns the next input sequence as an encoding declaration. This will
1206 * generate an <code>Alert</code> if the input sequence doesn't match the
1207 * <code>encodingDecl</code> production in the XML specification.
1208 * <pre>
1209 * [80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
1210 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
1211 * </pre>
1212 *
1213 * @return a <code>String</code> holding the value of the encoding
1214 * declaration
1215 */
1216 public String readEncodingDecl() {
1217
1218 if ( this.tryReadS() ) {
1219
1220
1221
1222 if ( this.peekRead( 'e' ) ) {
1223 final String attributeName = this.readName();
1224
1225 if ( !attributeName.equals( "encoding" ) ) {
1226 throw new XmlWellFormednessAlert(
1227 "Expected 'encoding' attribute in text declaration"
1228 ).culprit( "attribute", attributeName ).mishap();
1229 }
1230
1231 this.mustReadEq();
1232
1233 this.currentTokenData.setLength( 0 );
1234
1235 final char quoteChar = this.getQuoteChar();
1236
1237 for ( char ch = this.getChar(); ch != quoteChar; ch = this.getChar() ) {
1238 if ( ( ch >= 'a' && ch <= 'z' ) || ( ch >= 'A' && ch <= 'Z' ) || ( ch >= '0' && ch <= '9' ) || ch == '-' || ch == '_' || ch == '.' ) {
1239 this.currentTokenData.append( ch );
1240 } else {
1241 throw new XmlWellFormednessAlert(
1242 "Illegal character in encoding name"
1243 ).culpritChar( ch ).mishap();
1244 }
1245 }
1246
1247
1248
1249 return this.currentTokenData.toString();
1250 } else {
1251
1252
1253 this.pushBack( " " );
1254 }
1255 }
1256
1257 return null;
1258 }
1259
1260 /**
1261 * Returns the next input sequence as an entity declaration token. This
1262 * will generate an <code>Alert</code> if the input sequence doesn't match
1263 * the <code>EntityDecl</code> production in the XML specification.
1264 * <pre>
1265 * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
1266 * | "'" ([^%&'] | PEReference | Reference)* "'"
1267 * [70] EntityDecl ::= GEDecl | PEDecl
1268 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
1269 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
1270 * [73] EntityDef ::= EntityValue| (ExternalID NDataDecl?)
1271 * [74] PEDef ::= EntityValue | ExternalID
1272 * </pre>
1273 * <p>
1274 * When this method is called the identifying sequence, i.e.
1275 * '<code><!ENTITY</code>', has already been processed and should NOT be expected.
1276 * </p>
1277 *
1278 * @return an <code>EntityDeclToken</code> for the entity declaration
1279 */
1280 public EntityDeclToken readEntityDecl() {
1281
1282 this.mustReadS();
1283
1284 final boolean pe = this.tryRead( '%' );
1285
1286 if ( pe ) {
1287 this.mustReadS();
1288 }
1289
1290 final String name = this.readName();
1291
1292 this.mustReadS();
1293
1294 if ( this.peekRead( '"' ) || this.peekRead( '\'' ) ) {
1295
1296 this.currentTokenData.setLength( 0 );
1297
1298 final char quoteChar = this.getQuoteChar();
1299
1300 for ( char ch = this.getChar(); ch != quoteChar; ch = this.getChar() ) {
1301 switch ( ch ) {
1302 case '<':
1303 throw new XmlWellFormednessAlert(
1304 "Attribute values cannot contain the '<' character"
1305 ).mishap();
1306 case '&':
1307