001    /*
002     * JBoss, Home of Professional Open Source.
003     * Copyright 2008, Red Hat Middleware LLC, and individual contributors
004     * as indicated by the @author tags. See the copyright.txt file in the
005     * distribution for a full listing of individual contributors. 
006     *
007     * This is free software; you can redistribute it and/or modify it
008     * under the terms of the GNU Lesser General Public License as
009     * published by the Free Software Foundation; either version 2.1 of
010     * the License, or (at your option) any later version.
011     *
012     * This software is distributed in the hope that it will be useful,
013     * but WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * Lesser General Public License for more details.
016     *
017     * You should have received a copy of the GNU Lesser General Public
018     * License along with this software; if not, write to the Free
019     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
021     */
022    package org.jboss.dna.common.text;
023    
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CodingErrorAction;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.BitSet;
027    
028    /**
029     * An encoder useful for converting text to be used within a URL, as defined by Section 2.3 of <a
030     * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. Note that this class does not encode a complete URL ({@link java.net.URLEncoder}
031     * and {@link java.net.URLDecoder} should be used for such purposes).
032     * 
033     * @author Randall Hauch
034     */
035    public class UrlEncoder implements TextEncoder, TextDecoder {
036    
037        /**
038         * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved. These include upper and
039         * lower case letters, decimal digits, and a limited set of punctuation marks and symbols.
040         * 
041         * <pre>
042         * unreserved  = alphanum | mark
043         * mark        = &quot;-&quot; | &quot;_&quot; | &quot;.&quot; | &quot;!&quot; | &quot;&tilde;&quot; | &quot;*&quot; | &quot;'&quot; | &quot;(&quot; | &quot;)&quot;
044         * </pre>
045         * 
046         * Unreserved characters can be escaped without changing the semantics of the URI, but this should not be done unless the URI
047         * is being used in a context that does not allow the unescaped character to appear.
048         */
049        private static final BitSet RFC2396_UNRESERVED_CHARACTERS = new BitSet(256);
050        private static final BitSet RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS;
051    
052        public static final char ESCAPE_CHARACTER = '%';
053    
054        static {
055            RFC2396_UNRESERVED_CHARACTERS.set('a', 'z' + 1);
056            RFC2396_UNRESERVED_CHARACTERS.set('A', 'Z' + 1);
057            RFC2396_UNRESERVED_CHARACTERS.set('0', '9' + 1);
058            RFC2396_UNRESERVED_CHARACTERS.set('-');
059            RFC2396_UNRESERVED_CHARACTERS.set('_');
060            RFC2396_UNRESERVED_CHARACTERS.set('.');
061            RFC2396_UNRESERVED_CHARACTERS.set('!');
062            RFC2396_UNRESERVED_CHARACTERS.set('~');
063            RFC2396_UNRESERVED_CHARACTERS.set('*');
064            RFC2396_UNRESERVED_CHARACTERS.set('\'');
065            RFC2396_UNRESERVED_CHARACTERS.set('(');
066            RFC2396_UNRESERVED_CHARACTERS.set(')');
067    
068            RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS = (BitSet)RFC2396_UNRESERVED_CHARACTERS.clone();
069            RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS.set('/');
070        }
071    
072        private boolean slashEncoded = true;
073    
074        /**
075         * {@inheritDoc}
076         */
077        public String encode( String text ) {
078            if (text == null) return null;
079            if (text.length() == 0) return text;
080            final BitSet safeChars = isSlashEncoded() ? RFC2396_UNRESERVED_CHARACTERS : RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS;
081            final StringBuilder result = new StringBuilder();
082            final CharacterIterator iter = new StringCharacterIterator(text);
083            for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
084                if (safeChars.get(c)) {
085                    // Safe character, so just pass through ...
086                    result.append(c);
087                } else {
088                    // The character is not a safe character, and must be escaped ...
089                    result.append(ESCAPE_CHARACTER);
090                    result.append(Character.toLowerCase(Character.forDigit(c / 16, 16)));
091                    result.append(Character.toLowerCase(Character.forDigit(c % 16, 16)));
092                }
093            }
094            return result.toString();
095        }
096    
097        /**
098         * {@inheritDoc}
099         */
100        public String decode( String encodedText ) {
101            if (encodedText == null) return null;
102            if (encodedText.length() == 0) return encodedText;
103            final StringBuilder result = new StringBuilder();
104            final CharacterIterator iter = new StringCharacterIterator(encodedText);
105            for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
106                if (c == ESCAPE_CHARACTER) {
107                    boolean foundEscapedCharacter = false;
108                    // Found the first character in a potential escape sequence, so grab the next two characters ...
109                    char hexChar1 = iter.next();
110                    char hexChar2 = hexChar1 != CharacterIterator.DONE ? iter.next() : CharacterIterator.DONE;
111                    if (hexChar2 != CharacterIterator.DONE) {
112                        // We found two more characters, but ensure they form a valid hexadecimal number ...
113                        int hexNum1 = Character.digit(hexChar1, 16);
114                        int hexNum2 = Character.digit(hexChar2, 16);
115                        if (hexNum1 > -1 && hexNum2 > -1) {
116                            foundEscapedCharacter = true;
117                            result.append((char)(hexNum1 * 16 + hexNum2));
118                        }
119                    }
120                    if (!foundEscapedCharacter) {
121                        result.append(c);
122                        if (hexChar1 != CharacterIterator.DONE) result.append(hexChar1);
123                        if (hexChar2 != CharacterIterator.DONE) result.append(hexChar2);
124                    }
125                } else {
126                    result.append(c);
127                }
128            }
129            return result.toString();
130        }
131    
132        /**
133         * @return slashEncoded
134         */
135        public boolean isSlashEncoded() {
136            return this.slashEncoded;
137        }
138    
139        /**
140         * @param slashEncoded Sets slashEncoded to the specified value.
141         * @return this object, for method chaining
142         */
143        public UrlEncoder setSlashEncoded( boolean slashEncoded ) {
144            this.slashEncoded = slashEncoded;
145            return this;
146        }
147    
148    }