1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.myfaces.shared_orchestra.renderkit.html.util;
20
21 import java.io.ByteArrayOutputStream;
22 import java.io.IOException;
23 import java.io.OutputStreamWriter;
24 import java.io.Writer;
25
/**
 * Converts Strings so that they can be used within HTML-Code.
 * <p>
 * Two families of helpers are provided: the <code>encode</code> methods escape
 * plain text so it can be written into HTML markup, and
 * {@link #encodeURIAtributte(String, String)} percent-encodes a URI that is
 * going to be written into an HTML attribute.
 * </p>
 */
public abstract class HTMLEncoder
{
    /** Upper-case hexadecimal digits, indexed by nibble value. */
    private static final String HEX_CHARSET = "0123456789ABCDEF";

    /** Charset used to percent-encode everything that precedes the query/fragment. */
    private static final String UTF8 = "UTF-8";

    // UTF-16 surrogate ranges. They are spelled out here (instead of using the
    // java.lang.Character helpers) so the class keeps compiling on old JDKs.
    private static final int HIGH_SURROGATE_MIN = 0xD800;
    private static final int HIGH_SURROGATE_MAX = 0xDBFF;
    private static final int LOW_SURROGATE_MIN = 0xDC00;
    private static final int LOW_SURROGATE_MAX = 0xDFFF;
    private static final int SUPPLEMENTARY_MIN = 0x10000;

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where
     * encodeNewline is false and both encodeSubsequentBlanksToNbsp and
     * encodeNonLatin are true.
     *
     * @param string the string to convert, may be null
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string)
    {
        return encode(string, false, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where both
     * encodeSubsequentBlanksToNbsp and encodeNonLatin are true.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string, boolean encodeNewline)
    {
        return encode(string, encodeNewline, true);
    }

    /**
     * Variant of {@link #encode(String, boolean, boolean, boolean)} where
     * encodeNonLatin is true.
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param encodeSubsequentBlanksToNbsp if true a leading blank and every blank that
     *        follows another blank is converted to &amp;#160;
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
    {
        return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
    }

    /**
     * Encodes the given string, so that it can be used within a html page.
     * <p>
     * The characters <code>"</code>, <code>&amp;</code>, <code>&lt;</code> and
     * <code>&gt;</code> are always replaced by their character entity references.
     * If nothing has to be replaced the very same string instance is returned.
     * </p>
     *
     * @param string the string to convert, may be null
     * @param encodeNewline if true newline characters are converted to &lt;br/&gt;
     * @param encodeSubsequentBlanksToNbsp if true a leading blank and every blank that
     *        follows another blank is converted to &amp;#160; (non-breaking space)
     * @param encodeNonLatin if true characters outside of US-ASCII are written as
     *        named entities (a few well known ones) or numeric character references;
     *        a valid surrogate pair is written as ONE reference to its code point
     * @return the encoded string, or an empty string if <code>string</code> is null
     */
    public static String encode (String string,
                                 boolean encodeNewline,
                                 boolean encodeSubsequentBlanksToNbsp,
                                 boolean encodeNonLatin)
    {
        if (string == null)
        {
            return "";
        }

        StringBuffer sb = null;    //create later on demand
        final int length = string.length();
        for (int i = 0; i < length; ++i)
        {
            String app = null;
            boolean pairConsumed = false;
            final char c = string.charAt(i);
            switch (c)
            {
                case '"': app = "&quot;"; break;
                case '&': app = "&amp;"; break;
                case '<': app = "&lt;"; break;
                case '>': app = "&gt;"; break;
                case ' ':
                    if (encodeSubsequentBlanksToNbsp &&
                        (i == 0 || string.charAt(i - 1) == ' '))
                    {
                        //Space at beginning or after another space. The numeric
                        //reference is used because it is valid in HTML and XML alike.
                        app = "&#160;";
                    }
                    break;
                case '\n':
                    if (encodeNewline)
                    {
                        app = "<br/>";
                    }
                    break;
                default:
                    if (encodeNonLatin)
                    {
                        if (isSurrogatePair(string, i))
                        {
                            //A character reference must name the code point; references
                            //to the two surrogate halves would be invalid markup.
                            app = "&#" + toCodePoint(c, string.charAt(i + 1)) + ";";
                            pairConsumed = true;
                        }
                        else
                        {
                            app = encodeNonLatinChar(c);
                        }
                    }
                    break;
            }

            if (app != null)
            {
                if (sb == null)
                {
                    //first replacement: copy everything that was fine so far
                    sb = new StringBuffer(string.substring(0, i));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }

            if (pairConsumed)
            {
                //the low surrogate has been written as part of the reference
                ++i;
            }
        }

        return (sb == null) ? string : sb.toString();
    }

    /**
     * Returns the entity / numeric character reference for a single non US-ASCII
     * char, or null when the char is plain US-ASCII and needs no replacement.
     *
     * @param c the char to inspect
     * @return the replacement text or null
     */
    private static String encodeNonLatinChar(char c)
    {
        switch (c)
        {
            //german umlauts
            case '\u00E4' : return "&auml;";
            case '\u00C4' : return "&Auml;";
            case '\u00F6' : return "&ouml;";
            case '\u00D6' : return "&Ouml;";
            case '\u00FC' : return "&uuml;";
            case '\u00DC' : return "&Uuml;";
            case '\u00DF' : return "&szlig;";

            //misc
            //NOTE: some legacy code pages put the euro symbol at 0x80; that
            //position is not mapped to &euro; here, only U+20AC is.
            case '\u20AC': return "&euro;";
            case '\u00AB': return "&laquo;";
            case '\u00BB': return "&raquo;";
            case '\u00A0': return "&#160;";

            default :
                if (((int)c) >= 0x80)
                {
                    //encode all non basic latin characters
                    return "&#" + ((int)c) + ";";
                }
                return null;
        }
    }

    /**
     * Encode an URI, escaping or percent-encoding all required characters and
     * following the rules mentioned on RFC 3986.
     * <p>
     * Everything up to the first '?' or '#' is percent-encoded using UTF-8; the
     * remainder (query / fragment) is percent-encoded using
     * <code>characterEncoding</code> and every '&amp;' in it that is not already the
     * start of "&amp;amp;" is written as "&amp;amp;". Sequences that already look
     * like a percent-encoded octet ('%' followed by two hexadecimal digits, upper or
     * lower case) are left untouched so they are not encoded twice. If nothing has
     * to be replaced the very same string instance is returned.
     * </p>
     * <p>
     * If <code>characterEncoding</code> is not supported by the JVM the affected
     * characters of the query / fragment are left unencoded.
     * </p>
     *
     * @param string the URI to encode, must not be null
     * @param characterEncoding name of the document character encoding, used for the
     *        part of the URI that follows the first '?' or '#'
     * @return the encoded URI
     * @throws IOException never thrown by the current implementation; kept in the
     *         signature so existing callers keep compiling
     */
    public static String encodeURIAtributte(final String string, final String characterEncoding)
        throws IOException
    {
        // These are the guidelines taken into account by this algorithm:
        //
        // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
        //
        //   control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
        //   space   = <US-ASCII coded character 20 hexadecimal>
        //   delims  = "<" | ">" | "#" | "%" | <">
        //             %3C   %3E   %23   %25   %22
        //   unwise  = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
        //             %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
        //
        //   "... Data corresponding to excluded characters must be escaped in order
        //   to be properly represented within a URI ..."
        //
        // RFC 3986 Section 3. Syntax Components
        //
        //   URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
        //
        // RFC 3986 Section 2.2: reserved characters (should not be percent-encoded)
        //
        //   gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
        //   sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        //
        //   Note that "[", "]" and "#" were excluded by RFC 2396 but RFC 3986
        //   (appendix D, changes from RFC 2396) moved them to the reserved set,
        //   so they are not escaped here.
        //
        // RFC 3986 Section 2.3: percent-encoded octets for ALPHA, DIGIT, "-", ".",
        //   "_" and "~" should not be created by URI producers.
        //
        // RFC 3986 Section 3.2.2 Host: non-ASCII characters must first be encoded
        //   according to UTF-8 and then each octet must be percent-encoded.
        //
        // RFC 3986 Sections 3.4 / 3.5: query = fragment = *( pchar / "/" / "?" )
        //
        // RFC 3986 Section 2.5 Identifying Data (applies to the query section):
        //   textual data should be encoded as octets and only the octets that do
        //   not correspond to unreserved characters should be percent-encoded.
        //
        // Strategy applied by this method:
        //
        // On scheme ":" hier-part percent-encode (using UTF-8):
        //   - From %00 to %20
        //   - <"> %22, "<" %3C, ">" %3E
        //   - "\" %5C, "^" %5E, "`" %60
        //   - "{" %7B, "|" %7C, "}" %7D
        //   - From %7F onwards (such characters should not be used in this part
        //     of an URI, but it is preferred to encode them rather than omit them)
        //   - "%" %25, but only when it does not start an already encoded octet,
        //     otherwise the octet would be encoded twice
        //   The remaining characters must not be encoded.
        //
        // Characters after "?" or "#" follow the same list, but are encoded using
        // the document character encoding, because the values may be read back
        // through httpRequest.getParameter(), which decodes with that encoding.
        // ' ' could be encoded as "+", but %20 also works, so %20 is kept.
        // "&" is written as "&amp;" because the link is placed inside an html page
        // and a bare "&" is invalid in that context.

        StringBuffer sb = null;    //create later on demand
        final int length = string.length();
        for (int i = 0; i < length; ++i)
        {
            String app = null;
            boolean pairConsumed = false;
            boolean endLoop = false;
            final char c = string.charAt(i);

            if (isExcludedUriChar(c))
            {
                // The percent encoding on this part should be done using UTF-8
                // charset as RFC 3986 Section 3.2.2 says. See also
                // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
                // which recommends UTF-8 instead of the document character encoding.
                if (isSurrogatePair(string, i))
                {
                    app = percentEncodeNonUsAsciiCharacters(string.substring(i, i + 2), UTF8);
                    pairConsumed = (app != null);
                }
                else
                {
                    app = percentEncode(c, UTF8);
                }
            }
            else if (c == '%')
            {
                if (!isPercentEncodedOctet(string, i))
                {
                    app = percentEncode(c, UTF8);
                }
                // else: already encoded, we don't want to encode it twice
            }
            else if (c == '?' || c == '#')
            {
                if (i + 1 < length)
                {
                    // The remaining part of the URI is data that should be encoded
                    // using the document character encoding.
                    app = c + encodeURIQuery(string.substring(i + 1), characterEncoding);
                    endLoop = true;
                }
            }
            // else: no encoding, the char is appended as it is

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuffer(string.substring(0, i));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }

            if (endLoop)
            {
                break;
            }
            if (pairConsumed)
            {
                ++i;
            }
        }

        return (sb == null) ? string : sb.toString();
    }

    /**
     * Encode the query (or fragment) part using the document charset encoding provided.
     * <p>
     * The same characters as in {@link #encodeURIAtributte(String, String)} are
     * percent-encoded, but using <code>characterEncoding</code>. In addition every
     * '&amp;' that is not already the start of "&amp;amp;" is replaced by "&amp;amp;".
     * </p>
     *
     * @param string the text that follows the first '?' or '#' of the URI
     * @param characterEncoding name of the document character encoding
     * @return the encoded text; the same instance if nothing had to be replaced
     */
    private static String encodeURIQuery(final String string, final String characterEncoding)
    {
        StringBuffer sb = null;    //create later on demand
        final int length = string.length();
        for (int i = 0; i < length; ++i)
        {
            String app = null;
            boolean pairConsumed = false;
            final char c = string.charAt(i);

            if (isExcludedUriChar(c))
            {
                // Here the document character encoding is used, so the values can
                // be decoded again on the server side.
                if (isSurrogatePair(string, i))
                {
                    app = percentEncodeNonUsAsciiCharacters(string.substring(i, i + 2), characterEncoding);
                    // when the charset is unknown nothing was written, so the low
                    // surrogate must still be visited by the loop
                    pairConsumed = (app != null);
                }
                else
                {
                    app = percentEncode(c, characterEncoding);
                }
            }
            else if (c == '%')
            {
                if (!isPercentEncodedOctet(string, i))
                {
                    app = percentEncode(c, characterEncoding);
                }
                // else: do not percent encode, because it is already encoded
            }
            else if (c == '&')
            {
                if (!string.startsWith("amp;", i + 1))
                {
                    // a bare & is invalid inside a html attribute
                    app = "&amp;";
                }
                // else: already "&amp;", keep it
            }
            // else: no encoding, the char is appended as it is

            if (app != null)
            {
                if (sb == null)
                {
                    sb = new StringBuffer(string.substring(0, i));
                }
                sb.append(app);
            }
            else if (sb != null)
            {
                sb.append(c);
            }

            if (pairConsumed)
            {
                ++i;
            }
        }

        return (sb == null) ? string : sb.toString();
    }

    /**
     * Tells whether a char must always be percent-encoded inside an URI: control
     * characters and space (00-20), everything from 7F onwards and the
     * delimiters / unwise characters <code>" &lt; &gt; \ ^ ` { | }</code>.
     *
     * @param c the char to inspect
     * @return true if the char has to be percent-encoded
     */
    private static boolean isExcludedUriChar(char c)
    {
        return (c <= (char)0x20) || (c >= (char)0x7F) ||
               c == '"' || c == '<' ||
               c == '>' || c == '\\' || c == '^' || c == '`' ||
               c == '{' || c == '|' || c == '}';
    }

    /**
     * Tells whether the '%' found at <code>index</code> starts an already
     * percent-encoded octet, i.e. is followed by two hexadecimal digits
     * (RFC 3986 Section 2.1; upper and lower case digits are equivalent).
     *
     * @param string the string being encoded
     * @param index position of the '%' char
     * @return true if the two following chars are hexadecimal digits
     */
    private static boolean isPercentEncodedOctet(String string, int index)
    {
        return index + 2 < string.length() &&
               isHexDigit(string.charAt(index + 1)) &&
               isHexDigit(string.charAt(index + 2));
    }

    /**
     * @param c the char to inspect
     * @return true for 0-9, A-F and a-f
     */
    private static boolean isHexDigit(char c)
    {
        return (c >= '0' && c <= '9') ||
               (c >= 'A' && c <= 'F') ||
               (c >= 'a' && c <= 'f');
    }

    /**
     * Tells whether the chars at <code>index</code> and <code>index + 1</code>
     * form a valid UTF-16 surrogate pair (high surrogate followed by low surrogate).
     *
     * @param string the string being encoded
     * @param index position of the candidate high surrogate
     * @return true if a complete surrogate pair starts at <code>index</code>
     */
    private static boolean isSurrogatePair(String string, int index)
    {
        if (index + 1 >= string.length())
        {
            return false;
        }
        final char high = string.charAt(index);
        final char low = string.charAt(index + 1);
        return high >= HIGH_SURROGATE_MIN && high <= HIGH_SURROGATE_MAX &&
               low >= LOW_SURROGATE_MIN && low <= LOW_SURROGATE_MAX;
    }

    /**
     * Combines a surrogate pair into the supplementary code point it represents.
     *
     * @param high the high (leading) surrogate
     * @param low the low (trailing) surrogate
     * @return the code point, always &gt;= 0x10000
     */
    private static int toCodePoint(char high, char low)
    {
        return ((high - HIGH_SURROGATE_MIN) << 10) + (low - LOW_SURROGATE_MIN) + SUPPLEMENTARY_MIN;
    }

    /**
     * Encode a unicode char value in percentEncode, decoding its bytes using a specified
     * characterEncoding.
     *
     * @param c the char to encode
     * @param characterEncoding charset used to obtain the bytes of a non US-ASCII char
     * @return the percent-encoded form, or null if the char is not US-ASCII and
     *         <code>characterEncoding</code> is not supported
     */
    private static String percentEncode(char c, String characterEncoding)
    {
        if (c > (char)0x007F)
        {
            //percent encode in the proper encoding to be consistent
            return percentEncodeNonUsAsciiCharacters(String.valueOf(c), characterEncoding);
        }

        //percent encode US-ASCII char (0x00-0x7F range)
        return "%" + HEX_CHARSET.charAt((c >> 0x4) % 0x10) + HEX_CHARSET.charAt(c % 0x10);
    }

    /**
     * Percent-encodes every byte obtained by converting <code>chars</code> with the
     * given charset. <code>chars</code> is either a single char or a surrogate
     * pair; a pair has to be converted as a unit, otherwise the charset encoder
     * cannot produce the bytes of the supplementary character.
     *
     * @param chars the char, or surrogate pair, to encode
     * @param characterEncoding name of the charset to use
     * @return one "%XX" triplet per produced byte, or null if the conversion failed
     *         (for instance because the charset is not supported)
     */
    private static String percentEncodeNonUsAsciiCharacters(String chars, String characterEncoding)
    {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
        try
        {
            final OutputStreamWriter writer = new OutputStreamWriter(baos, characterEncoding);
            writer.write(chars);
            writer.flush();
        }
        catch (IOException e)
        {
            //best effort: the caller leaves the character unencoded
            return null;
        }

        final byte[] byteArray = baos.toByteArray();
        final StringBuffer builder = new StringBuffer(byteArray.length * 3);
        for (int i = 0; i < byteArray.length; i++)
        {
            final int octet = byteArray[i] & 0xFF;
            builder.append('%');
            builder.append(HEX_CHARSET.charAt(octet >> 0x4));
            builder.append(HEX_CHARSET.charAt(octet & 0x0F));
        }

        return builder.toString();
    }
}