001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006     * See the AUTHORS.txt file in the distribution for a full listing of 
007     * individual contributors. 
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.common.text;
025    
026    import java.text.CharacterIterator;
027    import java.text.StringCharacterIterator;
028    import java.util.BitSet;
029    
030    /**
031     * An encoder useful for converting text to be used within a URL, as defined by Section 2.3 of <a
032     * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. Note that this class does not encode a complete URL ({@link java.net.URLEncoder}
033     * and {@link java.net.URLDecoder} should be used for such purposes).
034     * 
035     * @author Randall Hauch
036     */
037    public class UrlEncoder implements TextEncoder, TextDecoder {
038    
039        /**
040         * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved. These include upper and
041         * lower case letters, decimal digits, and a limited set of punctuation marks and symbols.
042         * 
043         * <pre>
044         * unreserved  = alphanum | mark
045         * mark        = &quot;-&quot; | &quot;_&quot; | &quot;.&quot; | &quot;!&quot; | &quot;&tilde;&quot; | &quot;*&quot; | &quot;'&quot; | &quot;(&quot; | &quot;)&quot;
046         * </pre>
047         * 
048         * Unreserved characters can be escaped without changing the semantics of the URI, but this should not be done unless the URI
049         * is being used in a context that does not allow the unescaped character to appear.
050         */
051        private static final BitSet RFC2396_UNRESERVED_CHARACTERS = new BitSet(256);
052        private static final BitSet RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS;
053    
054        public static final char ESCAPE_CHARACTER = '%';
055    
056        static {
057            RFC2396_UNRESERVED_CHARACTERS.set('a', 'z' + 1);
058            RFC2396_UNRESERVED_CHARACTERS.set('A', 'Z' + 1);
059            RFC2396_UNRESERVED_CHARACTERS.set('0', '9' + 1);
060            RFC2396_UNRESERVED_CHARACTERS.set('-');
061            RFC2396_UNRESERVED_CHARACTERS.set('_');
062            RFC2396_UNRESERVED_CHARACTERS.set('.');
063            RFC2396_UNRESERVED_CHARACTERS.set('!');
064            RFC2396_UNRESERVED_CHARACTERS.set('~');
065            RFC2396_UNRESERVED_CHARACTERS.set('*');
066            RFC2396_UNRESERVED_CHARACTERS.set('\'');
067            RFC2396_UNRESERVED_CHARACTERS.set('(');
068            RFC2396_UNRESERVED_CHARACTERS.set(')');
069    
070            RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS = (BitSet)RFC2396_UNRESERVED_CHARACTERS.clone();
071            RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS.set('/');
072        }
073    
074        private boolean slashEncoded = true;
075    
076        /**
077         * {@inheritDoc}
078         */
079        public String encode( String text ) {
080            if (text == null) return null;
081            if (text.length() == 0) return text;
082            final BitSet safeChars = isSlashEncoded() ? RFC2396_UNRESERVED_CHARACTERS : RFC2396_UNRESERVED_WITH_SLASH_CHARACTERS;
083            final StringBuilder result = new StringBuilder();
084            final CharacterIterator iter = new StringCharacterIterator(text);
085            for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
086                if (safeChars.get(c)) {
087                    // Safe character, so just pass through ...
088                    result.append(c);
089                } else {
090                    // The character is not a safe character, and must be escaped ...
091                    result.append(ESCAPE_CHARACTER);
092                    result.append(Character.toLowerCase(Character.forDigit(c / 16, 16)));
093                    result.append(Character.toLowerCase(Character.forDigit(c % 16, 16)));
094                }
095            }
096            return result.toString();
097        }
098    
099        /**
100         * {@inheritDoc}
101         */
102        public String decode( String encodedText ) {
103            if (encodedText == null) return null;
104            if (encodedText.length() == 0) return encodedText;
105            final StringBuilder result = new StringBuilder();
106            final CharacterIterator iter = new StringCharacterIterator(encodedText);
107            for (char c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
108                if (c == ESCAPE_CHARACTER) {
109                    boolean foundEscapedCharacter = false;
110                    // Found the first character in a potential escape sequence, so grab the next two characters ...
111                    char hexChar1 = iter.next();
112                    char hexChar2 = hexChar1 != CharacterIterator.DONE ? iter.next() : CharacterIterator.DONE;
113                    if (hexChar2 != CharacterIterator.DONE) {
114                        // We found two more characters, but ensure they form a valid hexadecimal number ...
115                        int hexNum1 = Character.digit(hexChar1, 16);
116                        int hexNum2 = Character.digit(hexChar2, 16);
117                        if (hexNum1 > -1 && hexNum2 > -1) {
118                            foundEscapedCharacter = true;
119                            result.append((char)(hexNum1 * 16 + hexNum2));
120                        }
121                    }
122                    if (!foundEscapedCharacter) {
123                        result.append(c);
124                        if (hexChar1 != CharacterIterator.DONE) result.append(hexChar1);
125                        if (hexChar2 != CharacterIterator.DONE) result.append(hexChar2);
126                    }
127                } else {
128                    result.append(c);
129                }
130            }
131            return result.toString();
132        }
133    
134        /**
135         * @return slashEncoded
136         */
137        public boolean isSlashEncoded() {
138            return this.slashEncoded;
139        }
140    
141        /**
142         * @param slashEncoded Sets slashEncoded to the specified value.
143         * @return this object, for method chaining
144         */
145        public UrlEncoder setSlashEncoded( boolean slashEncoded ) {
146            this.slashEncoded = slashEncoded;
147            return this;
148        }
149    
150    }