View Javadoc

1   ////////////////////////////////////////////////////////////////////////////////
2   // MillScript: an Open Spice interpreter and batch website creation tool
3   // Copyright (C) 2001-2005 Open World Ltd
4   // Copyright (C) 2005 Kevin Rogers
5   //
6   // This file is part of MillScript.
7   //
8   // MillScript is free software; you can redistribute it and/or modify it under
9   // the terms of the GNU General Public License as published by the Free
10  // Software Foundation; either version 2 of the License, or (at your option)
11  // any later version.
12  //
13  // MillScript is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15  // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
16  // more details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // MillScript; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA  02111-1307  USA
21  ////////////////////////////////////////////////////////////////////////////////
22  package org.millscript.millscript.render;
23  
24  import org.millscript.commons.util.MapIterator;
25  import org.millscript.commons.vfs.VFile;
26  import org.millscript.commons.xml.api.Name;
27  import org.millscript.millscript.alert.Alerts;
28  import org.millscript.millscript.conf.Configuration;
29  import org.millscript.millscript.datatypes.XmlElement;
30  
31  import java.io.IOException;
32  import java.nio.ByteBuffer;
33  import java.nio.CharBuffer;
34  import java.nio.charset.CharacterCodingException;
35  import java.nio.charset.Charset;
36  import java.nio.charset.CharsetEncoder;
37  import java.util.HashSet;
38  import java.util.Set;
39  
40  /**
41   * This class implements MillScripts HTML renderer. It renders to the HTML 4
42   * specification, but may not render valid HTML 4.
43   * <p>
44   * The renderers ability to render valid HTML depends on the input it's
45   * provided, e.g. supported tags. The main issue with the renderer is that
46   * database content can contain HTML tags, as a result we cannot automatically
47   * encode "&lt;" characters that occur in string content. When we can parse
48   * database content into strings, XmlElement, etc, our ability to render valid
49   * documents will greatly improve.
50   * </p>
51   */
52  public final class HTMLRenderer extends AbstractRenderer {
53  
54      /**
55       * The set of empty elements for HTML. These are the set of tags that must
56       * not have any children.
57       */
58      static final Set< String > EMPTY_TAGS = new HashSet< String >();
59  
60      // Initialiser for the empty HTML tags set.
61      static {
62          EMPTY_TAGS.add( "area" );
63          EMPTY_TAGS.add( "base" );
64          EMPTY_TAGS.add( "basefont" );
65          EMPTY_TAGS.add( "bgsound" );
66          EMPTY_TAGS.add( "br" );
67          EMPTY_TAGS.add( "col" );
68          EMPTY_TAGS.add( "frame" );
69          EMPTY_TAGS.add( "hr" );
70          EMPTY_TAGS.add( "img" );
71          EMPTY_TAGS.add( "input" );
72          EMPTY_TAGS.add( "link" );
73          EMPTY_TAGS.add( "meta" );
74          EMPTY_TAGS.add( "param" );
75          EMPTY_TAGS.add( "spacer" );
76          EMPTY_TAGS.add( "wbr" );
77      }
78  
79      /**
80       * The set of HTML elements which are defined as having CDATA content. In
81       * this context CDATA means that entities are not resolved, so only
82       * characters in the documents character set are allowed.
83       */
84      static final Set< String > SCRIPT_STYLE_TAGS = new HashSet< String >();
85  
86      // Initialiser for the set of HTML tags with no child entities.
87      static {
88          SCRIPT_STYLE_TAGS.add( "script" );
89          SCRIPT_STYLE_TAGS.add( "style" );
90      }
91  
92      /**
93       * The set of HTML attributes whose values are defined as URI's. These
94       * require specical encoding/rendering.
95       */
96      static final Set< String > URI_ATTRS = new HashSet< String >();
97  
98      // Initialiser for the URI HTML attributes.
99      static {
100         URI_ATTRS.add( "action" );
101         URI_ATTRS.add( "background" );
102         URI_ATTRS.add( "cite" );
103         URI_ATTRS.add( "classid" );
104         URI_ATTRS.add( "codebase" );
105         URI_ATTRS.add( "data" );
106         URI_ATTRS.add( "href" );
107         URI_ATTRS.add( "longdesc" );
108         URI_ATTRS.add( "profile" );
109         URI_ATTRS.add( "src" );
110         URI_ATTRS.add( "usemap" );
111     }
112 
113     /**
114      * ASCII charset encoder for URI's, so we can properly encode non-ASCII
115      * values in URI's.
116      */
117     private static final CharsetEncoder URI_ENCODER = Charset.forName( "US-ASCII" ).newEncoder();
118 
119     /**
120      * UTF8 charset encoder for non-ASCII URI values, so we can properly encode
121      * non-ASCII values in URI's.
122      */
123     private static final CharsetEncoder UTF8_ENCODER = Charset.forName( "UTF-8" ).newEncoder();
124 
125     /**
126      * This renderer is used to renders CDATA sections, i.e. the contents of a
127      * script or style tag.
128      */
129     private final HTMLCDATARenderer CDATA_RENDERER;
130 
131     /**
132      * Constructs a new HTML renderer, to render to the specified virtual file
133      * using the given confguration.
134      *
135      * @param conf  the configuration to get rendering parameters from
136      * @param file  the virtual output file
137      */
138     public HTMLRenderer( final Configuration conf, final VFile file ) {
139         super( conf.getHTMLCharacterEntity(), conf, conf.getOutputCharset(), file );
140         this.CDATA_RENDERER = new HTMLCDATARenderer( this );
141     }
142 
143     /**
144      * @see org.millscript.millscript.render.Renderer#append(char)
145      */
146     public void append( final char ch ) throws IOException {
147         if ( !this.canEncode( ch ) || ch == '&' || ch == '<' || ch == '>' ) {
148             // Ok, the character cannot be written in the current encoding OR
149             // it must be escaped, so write the escape
150             this.appendEscapeFor( ch );
151         } else {
152             // The character is ok, so simply write out the byte sequence
153             this.outputWriter.write( ch );
154         }
155     }
156 
157     /**
158      * @see org.millscript.millscript.render.Renderer#appendEscapeFor(char)
159      */
160     public void appendEscapeFor( final char ch ) throws IOException {
161         this.appendNoEscape( this.availableEntities.getEntityFor( ch ) );
162     }
163 
164     /**
165      * @see org.millscript.millscript.render.Renderer#appendNoEscape(char)
166      */
167     public void appendNoEscape( final char ch ) throws IOException {
168         if ( this.canEncode( ch ) ) {
169             this.outputWriter.write( ch );
170         } else {
171             throw(
172                 Alerts.eval(
173                     "HTML does not allow an entity at this point",
174                     "The character is not supported by this character set"
175                 ).
176                 culprit( "character", new Character( ch ) ).
177                 mishap()
178             );
179         }
180     }
181 
182     /**
183      * @see org.millscript.millscript.render.Renderer#renderDocumentFooter()
184      */
185     public void renderDocumentFooter() throws IOException {
186         // Put a newline at the end of the file to make it look a bit better
187         this.outputWriter.write( '\n' );
188     }
189 
190     /**
191      * @see org.millscript.millscript.render.Renderer#renderDocumentHeader()
192      */
193     public void renderDocumentHeader() throws IOException {
194         this.appendNoEscape( "<!DOCTYPE HTML PUBLIC " );
195         this.appendNoEscape( "\"-//W3C//DTD HTML 4.01 Transitional//EN\" " );
196         this.appendNoEscape( "\"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd\">\n" );
197     }
198 
199     /**
200      * @see org.millscript.millscript.render.Renderer#renderObject(java.lang.Object)
201      */
202     public void renderObject( final Object o ) throws IOException {
203         String s = o.toString();
204         int len = s.length();
205         for ( int i = 0; i < len; i++ ) {
206             char ch = s.charAt( i );
207             // Check we can encode this character
208             if ( !this.canEncode( ch ) || ch == '&' ) {
209                 this.appendEscapeFor( ch );
210             } else {
211                 this.outputWriter.write( ch );
212             }
213         }
214     }
215 
216     /**
217      * Renders an object as a URI. This means that ampersands are encoded as
218      * "&#38;" and any non ASCII character is UTF-8 encoded and then escaped
219      * using the % syntax.
220      *
221      * @param x the object to render
222      */
223     public void renderURI( final Object x ) throws IOException {
224         String s = x.toString();
225         int len = s.length();
226         for ( int i = 0; i < len; i++ ) {
227             char ch = s.charAt( i );
228             if ( ch == '&' ) {
229                 // We always write "&#38;" if we find an ampersand
230                 this.appendNoEscape( "&#38;" );
231             } else if ( ch == '"' ) {
232                 // We always write "&#34;" if we find a double quote.
233                 this.appendNoEscape( "&#34;" );
234             } else if ( URI_ENCODER.canEncode( ch ) ) {
235                 // The character can be encoded in the ASCII character set,
236                 // so we simply write the character.
237                 this.appendNoEscape( ch );
238             } else {
239                 // The character cannot be written in the ASCII character set,
240                 // so we convert to UTF-8, then percent encode each resulting
241                 // byte.
242                 // NOTE - Technically we could probably not percent encode
243                 //        any resulting ASCII characters...
244                 try {
245                     ByteBuffer bb = UTF8_ENCODER.encode( CharBuffer.wrap( new char[] { ch } ) );
246                     byte[] bbarray = bb.array();
247                     for ( int j = 0; j < bbarray.length; j++ ) {
248                         this.appendNoEscape( '%' );
249                         // We bitwise AND each byte will 255, to convert to an
250                         // unsigned integer, before write the hex value.
251                         this.appendNoEscape( Integer.toHexString( bbarray[j] & 255 ) );
252                     }
253                 } catch ( CharacterCodingException ex ) {
254                     throw(
255                         Alerts.eval(
256                             "URI contains invalid character",
257                             "A non ASCII character in the URI couldn't be encoded in UTF8"
258                         ).culprit( "url", x ).culprit( "character", new Character( ch ) ).mishap()
259                     );
260                 }
261             }
262         }
263     }
264 
265     /**
266      * @see org.millscript.millscript.render.Renderer#renderXMLElement(org.millscript.millscript.datatypes.XmlElement)
267      */
268     public void renderXMLElement( final XmlElement x ) throws IOException {
269         this.appendNoEscape( '<' );
270         this.appendNoEscape( x.tagName() );
271 
272         MapIterator< Name, String > it = x.getAttributes().iterator( true );
273         while ( it.hasNext() ) {
274             Name key = it.nextKey();
275             String val = it.currentValue();
276             if ( key != null ) {
277                 this.appendNoEscape( ' ' );
278                 this.appendNoEscape( key.getQName() );
279                 if ( val != null ) {
280                     this.appendNoEscape( "=\"" );
281                     if ( URI_ATTRS.contains( key.getLocalName() ) ) {
282                         this.renderURI( val );
283                     } else {
284                         String s = val.toString();
285                         int len = s.length();
286                         for ( int i = 0; i < len; i++ ) {
287                             char ch = s.charAt( i );
288                             // Entity encode characters not supported by this documents
289                             // character set, ampersands, double-quotes and ISO control codes
290                             // that are not whitespace.
291                             if ( !this.canEncode( ch ) || ch == '&' || ch == '"' ) {
292                                 this.appendEscapeFor( ch );
293                             } else {
294                                 // The character is ok, so simply write out the byte sequence
295                                 this.outputWriter.write( ch );
296                             }
297                         }
298                     }
299                     this.appendNoEscape( '"' );
300                 }
301             }
302         }
303 
304         final Object[] kids = x.getChildren();
305         final int nkids = kids.length;
306 
307         if ( nkids == 0 ) {
308             this.appendNoEscape( '>' );
309             if ( !EMPTY_TAGS.contains( x.tagName() ) ) {
310                 this.appendNoEscape( "</" );
311                 this.appendNoEscape( x.tagName() );
312                 this.appendNoEscape( '>' );
313             }
314         } else {
315             if ( EMPTY_TAGS.contains( x.tagName() ) ) {
316                 throw(
317                     Alerts.eval(
318                         "Cannot render children of this element",
319                         "This is a mandatory empty element"
320                     ).culprit( "element", x ).mishap()
321                 );
322             } else {
323                 this.appendNoEscape( '>' );
324                 if ( SCRIPT_STYLE_TAGS.contains( x.tagName() ) ) {
325                     for ( int i = 0; i < nkids; i++ ) {
326                         this.CDATA_RENDERER.render( kids[ i ] );
327                     }
328                 } else {
329                     for ( int i = 0; i < nkids; i++ ) {
330                         this.render( kids[ i ] );
331                     }
332                 }
333                 this.appendNoEscape( "</" );
334                 this.appendNoEscape( x.tagName() );
335                 this.appendNoEscape( '>' );
336             }
337         }
338     }
339 
340 }