1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package org.millscript.millscript.render;
23
24 import org.millscript.commons.util.MapIterator;
25 import org.millscript.commons.vfs.VFile;
26 import org.millscript.commons.xml.api.Name;
27 import org.millscript.millscript.alert.Alerts;
28 import org.millscript.millscript.conf.Configuration;
29 import org.millscript.millscript.datatypes.XmlElement;
30
31 import java.io.IOException;
32 import java.nio.ByteBuffer;
33 import java.nio.CharBuffer;
34 import java.nio.charset.CharacterCodingException;
35 import java.nio.charset.Charset;
36 import java.nio.charset.CharsetEncoder;
37 import java.util.HashSet;
38 import java.util.Set;
39
/**
 * This class implements MillScript's HTML renderer. It renders to the HTML 4
 * specification, but may not render valid HTML 4.
 * <p>
 * The renderer's ability to render valid HTML depends on the input it is
 * provided, e.g. supported tags. The main issue with the renderer is that
 * database content can contain HTML tags; as a result, we cannot automatically
 * encode "&lt;" characters that occur in string content. When we can parse
 * database content into strings, XmlElement, etc., our ability to render valid
 * documents will greatly improve.
 * </p>
 */
52 public final class HTMLRenderer extends AbstractRenderer {
53
/**
 * Names of the HTML elements that are always empty, i.e. the tags that
 * must not have any children. {@link #renderXMLElement} writes no closing
 * tag for these and refuses to render them when they have children.
 */
static final Set< String > EMPTY_TAGS = new HashSet< String >();

static {
    final String[] emptyTagNames = {
        "area", "base", "basefont", "bgsound", "br",
        "col", "frame", "hr", "img", "input",
        "link", "meta", "param", "spacer", "wbr"
    };
    for ( final String tagName : emptyTagNames ) {
        EMPTY_TAGS.add( tagName );
    }
}
78
/**
 * Names of the HTML elements whose content is defined as CDATA. In this
 * context CDATA means that entities are not resolved, so only characters
 * in the document's character set are allowed. Children of these elements
 * are handed to the CDATA renderer by {@link #renderXMLElement}.
 */
static final Set< String > SCRIPT_STYLE_TAGS = new HashSet< String >();

static {
    final String[] cdataTagNames = { "script", "style" };
    for ( final String tagName : cdataTagNames ) {
        SCRIPT_STYLE_TAGS.add( tagName );
    }
}
91
/**
 * Local names of the HTML attributes whose values are defined as URIs.
 * These require special encoding: {@link #renderXMLElement} renders their
 * values through {@link #renderURI} instead of the plain attribute
 * escaping.
 */
static final Set< String > URI_ATTRS = new HashSet< String >();

static {
    final String[] uriAttributeNames = {
        "action", "background", "cite", "classid", "codebase", "data",
        "href", "longdesc", "profile", "src", "usemap"
    };
    for ( final String attributeName : uriAttributeNames ) {
        URI_ATTRS.add( attributeName );
    }
}
112
113 /**
114 * ASCII charset encoder for URI's, so we can properly encode non-ASCII
115 * values in URI's.
116 */
117 private static final CharsetEncoder URI_ENCODER = Charset.forName( "US-ASCII" ).newEncoder();
118
119 /**
120 * UTF8 charset encoder for non-ASCII URI values, so we can properly encode
121 * non-ASCII values in URI's.
122 */
123 private static final CharsetEncoder UTF8_ENCODER = Charset.forName( "UTF-8" ).newEncoder();
124
125 /**
126 * This renderer is used to renders CDATA sections, i.e. the contents of a
127 * script or style tag.
128 */
129 private final HTMLCDATARenderer CDATA_RENDERER;
130
/**
 * Creates an HTML renderer that writes to the given virtual file using the
 * given configuration. The HTML character entities and the output charset
 * handed to the superclass are both taken from the configuration.
 *
 * @param conf the configuration to get rendering parameters from
 * @param file the virtual output file
 */
public HTMLRenderer( final Configuration conf, final VFile file ) {
    super(
        conf.getHTMLCharacterEntity(),
        conf,
        conf.getOutputCharset(),
        file
    );
    // Script and style content is delegated to a companion renderer that
    // wraps this instance.
    this.CDATA_RENDERER = new HTMLCDATARenderer( this );
}
142
143 /**
144 * @see org.millscript.millscript.render.Renderer#append(char)
145 */
146 public void append( final char ch ) throws IOException {
147 if ( !this.canEncode( ch ) || ch == '&' || ch == '<' || ch == '>' ) {
148
149
150 this.appendEscapeFor( ch );
151 } else {
152
153 this.outputWriter.write( ch );
154 }
155 }
156
157 /**
158 * @see org.millscript.millscript.render.Renderer#appendEscapeFor(char)
159 */
160 public void appendEscapeFor( final char ch ) throws IOException {
161 this.appendNoEscape( this.availableEntities.getEntityFor( ch ) );
162 }
163
164 /**
165 * @see org.millscript.millscript.render.Renderer#appendNoEscape(char)
166 */
167 public void appendNoEscape( final char ch ) throws IOException {
168 if ( this.canEncode( ch ) ) {
169 this.outputWriter.write( ch );
170 } else {
171 throw(
172 Alerts.eval(
173 "HTML does not allow an entity at this point",
174 "The character is not supported by this character set"
175 ).
176 culprit( "character", new Character( ch ) ).
177 mishap()
178 );
179 }
180 }
181
182 /**
183 * @see org.millscript.millscript.render.Renderer#renderDocumentFooter()
184 */
185 public void renderDocumentFooter() throws IOException {
186
187 this.outputWriter.write( '\n' );
188 }
189
190 /**
191 * @see org.millscript.millscript.render.Renderer#renderDocumentHeader()
192 */
193 public void renderDocumentHeader() throws IOException {
194 this.appendNoEscape( "<!DOCTYPE HTML PUBLIC " );
195 this.appendNoEscape( "\"-//W3C//DTD HTML 4.01 Transitional//EN\" " );
196 this.appendNoEscape( "\"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd\">\n" );
197 }
198
199 /**
200 * @see org.millscript.millscript.render.Renderer#renderObject(java.lang.Object)
201 */
202 public void renderObject( final Object o ) throws IOException {
203 String s = o.toString();
204 int len = s.length();
205 for ( int i = 0; i < len; i++ ) {
206 char ch = s.charAt( i );
207
208 if ( !this.canEncode( ch ) || ch == '&' ) {
209 this.appendEscapeFor( ch );
210 } else {
211 this.outputWriter.write( ch );
212 }
213 }
214 }
215
216 /**
217 * Renders an object as a URI. This means that ampersands are encoded as
218 * "&" and any non ASCII character is UTF-8 encoded and then escaped
219 * using the % syntax.
220 *
221 * @param x the object to render
222 */
223 public void renderURI( final Object x ) throws IOException {
224 String s = x.toString();
225 int len = s.length();
226 for ( int i = 0; i < len; i++ ) {
227 char ch = s.charAt( i );
228 if ( ch == '&' ) {
229
230 this.appendNoEscape( "&" );
231 } else if ( ch == '"' ) {
232
233 this.appendNoEscape( """ );
234 } else if ( URI_ENCODER.canEncode( ch ) ) {
235
236
237 this.appendNoEscape( ch );
238 } else {
239
240
241
242
243
244 try {
245 ByteBuffer bb = UTF8_ENCODER.encode( CharBuffer.wrap( new char[] { ch } ) );
246 byte[] bbarray = bb.array();
247 for ( int j = 0; j < bbarray.length; j++ ) {
248 this.appendNoEscape( '%' );
249
250
251 this.appendNoEscape( Integer.toHexString( bbarray[j] & 255 ) );
252 }
253 } catch ( CharacterCodingException ex ) {
254 throw(
255 Alerts.eval(
256 "URI contains invalid character",
257 "A non ASCII character in the URI couldn't be encoded in UTF8"
258 ).culprit( "url", x ).culprit( "character", new Character( ch ) ).mishap()
259 );
260 }
261 }
262 }
263 }
264
265 /**
266 * @see org.millscript.millscript.render.Renderer#renderXMLElement(org.millscript.millscript.datatypes.XmlElement)
267 */
268 public void renderXMLElement( final XmlElement x ) throws IOException {
269 this.appendNoEscape( '<' );
270 this.appendNoEscape( x.tagName() );
271
272 MapIterator< Name, String > it = x.getAttributes().iterator( true );
273 while ( it.hasNext() ) {
274 Name key = it.nextKey();
275 String val = it.currentValue();
276 if ( key != null ) {
277 this.appendNoEscape( ' ' );
278 this.appendNoEscape( key.getQName() );
279 if ( val != null ) {
280 this.appendNoEscape( "=\"" );
281 if ( URI_ATTRS.contains( key.getLocalName() ) ) {
282 this.renderURI( val );
283 } else {
284 String s = val.toString();
285 int len = s.length();
286 for ( int i = 0; i < len; i++ ) {
287 char ch = s.charAt( i );
288
289
290
291 if ( !this.canEncode( ch ) || ch == '&' || ch == '"' ) {
292 this.appendEscapeFor( ch );
293 } else {
294
295 this.outputWriter.write( ch );
296 }
297 }
298 }
299 this.appendNoEscape( '"' );
300 }
301 }
302 }
303
304 final Object[] kids = x.getChildren();
305 final int nkids = kids.length;
306
307 if ( nkids == 0 ) {
308 this.appendNoEscape( '>' );
309 if ( !EMPTY_TAGS.contains( x.tagName() ) ) {
310 this.appendNoEscape( "</" );
311 this.appendNoEscape( x.tagName() );
312 this.appendNoEscape( '>' );
313 }
314 } else {
315 if ( EMPTY_TAGS.contains( x.tagName() ) ) {
316 throw(
317 Alerts.eval(
318 "Cannot render children of this element",
319 "This is a mandatory empty element"
320 ).culprit( "element", x ).mishap()
321 );
322 } else {
323 this.appendNoEscape( '>' );
324 if ( SCRIPT_STYLE_TAGS.contains( x.tagName() ) ) {
325 for ( int i = 0; i < nkids; i++ ) {
326 this.CDATA_RENDERER.render( kids[ i ] );
327 }
328 } else {
329 for ( int i = 0; i < nkids; i++ ) {
330 this.render( kids[ i ] );
331 }
332 }
333 this.appendNoEscape( "</" );
334 this.appendNoEscape( x.tagName() );
335 this.appendNoEscape( '>' );
336 }
337 }
338 }
339
340 }