001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006     * See the AUTHORS.txt file in the distribution for a full listing of 
007     * individual contributors.
008     *
009     * Unless otherwise indicated, all code in JBoss DNA is licensed
010     * to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     * 
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.common.xml;
025    
026    import java.text.CharacterIterator;
027    import java.text.StringCharacterIterator;
028    
/**
 * A utility class for determining the validity of various XML names and characters, per the
 * <a href="http://www.w3.org/TR/REC-xml/">XML 1.0 Specification</a> and the
 * <a href="http://www.w3.org/TR/REC-xml-names/">Namespaces in XML</a> recommendation.
 * <p>
 * All single-character methods take a Unicode code point (not just a UTF-16 code unit) and return
 * <code>false</code> for any value that is outside the classification being tested, including negative values.
 * The class holds only immutable lookup data that is populated once during class initialization.
 * </p>
 */
public final class XmlCharacters {

    /** The number of code points covered by the lookup table: the Basic Multilingual Plane (65536 or 0x10000). */
    private static final int NUMBER_OF_CHARACTERS = 1 << 16;

    /** First code point of the supplementary range allowed in names: [#x10000-#xEFFFF]. */
    private static final int FIRST_SUPPLEMENTARY_NAME_CHARACTER = 0x10000;

    /** Last code point of the supplementary range allowed in names: [#x10000-#xEFFFF]. */
    private static final int LAST_SUPPLEMENTARY_NAME_CHARACTER = 0xEFFFF;

    /**
     * This implementation uses an array that captures for each character the XML classifications.
     * An array is used because it is a fast way of looking up each character. Each element is the
     * bitwise OR of the <code>*_CHARACTER</code> flags that apply to the code point used as the index.
     */
    private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS];

    private static final int VALID_CHARACTER = 1;
    private static final int CONTENT_CHARACTER = 1 << 1;
    private static final int SPACE_CHARACTER = 1 << 2;
    private static final int NAME_START_CHARACTER = 1 << 3;
    private static final int NAME_CHARACTER = 1 << 4;
    private static final int NCNAME_START_CHARACTER = 1 << 5;
    private static final int NCNAME_CHARACTER = 1 << 6;
    private static final int PUBID_CHARACTER = 1 << 7;

    static {

        // ----------------
        // Valid Characters
        // ----------------
        // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        // See http://www.w3.org/TR/REC-xml/#charsets
        final int validMask = VALID_CHARACTER | CONTENT_CHARACTER;
        MASKS[0x9] |= validMask;
        MASKS[0xA] |= validMask;
        MASKS[0xD] |= validMask;
        addMask(0x20, 0xD7FF, validMask);
        addMask(0xE000, 0xFFFD, validMask);
        // The last range [#x10000-#x10FFFF] is bigger than the table, so it is handled
        // directly in 'isValid(int)' and 'isValidContent(int)'.

        // Remove the other characters that are not allowed in XML content:
        // '<', '&', '\n', '\r', ']'
        MASKS['<'] &= ~CONTENT_CHARACTER;
        MASKS['&'] &= ~CONTENT_CHARACTER;
        MASKS['\n'] &= ~CONTENT_CHARACTER;
        MASKS['\r'] &= ~CONTENT_CHARACTER;
        MASKS[']'] &= ~CONTENT_CHARACTER;

        // ---------------------
        // Whitespace Characters
        // ---------------------
        // [3] S ::= (#x20 | #x9 | #xD | #xA)+
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        MASKS[0x20] |= SPACE_CHARACTER;
        MASKS[0x9] |= SPACE_CHARACTER;
        MASKS[0xA] |= SPACE_CHARACTER;
        MASKS[0xD] |= SPACE_CHARACTER;

        // ---------------------
        // Name Start Characters
        // ---------------------
        // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
        // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
        // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
        // [#x10000-#xEFFFF]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these start characters AND characters are valid for NAME and NCNAME
        final int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER;
        MASKS[':'] |= nameStartMask;
        MASKS['_'] |= nameStartMask;
        addMask('A', 'Z', nameStartMask);
        addMask('a', 'z', nameStartMask);
        addMask(0xC0, 0xD6, nameStartMask);
        addMask(0xD8, 0xF6, nameStartMask);
        addMask(0xF8, 0x2FF, nameStartMask);
        addMask(0x370, 0x37D, nameStartMask);
        addMask(0x37F, 0x1FFF, nameStartMask);
        addMask(0x200C, 0x200D, nameStartMask);
        addMask(0x2070, 0x218F, nameStartMask);
        addMask(0x2C00, 0x2FEF, nameStartMask);
        addMask(0x3001, 0xD7FF, nameStartMask);
        addMask(0xF900, 0xFDCF, nameStartMask);
        addMask(0xFDF0, 0xFFFD, nameStartMask);
        // The last range [#x10000-#xEFFFF] is bigger than the table, so it is handled
        // directly in 'hasNameMask(int,int)'.

        // ---------------
        // Name Characters
        // ---------------
        // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
        // See http://www.w3.org/TR/REC-xml/#sec-common-syn
        //
        // Note that all these characters are valid for NAME and NCNAME, but NOT as the first character
        final int nameMask = NAME_CHARACTER | NCNAME_CHARACTER;
        MASKS['-'] |= nameMask;
        MASKS['.'] |= nameMask;
        MASKS[0xB7] |= nameMask;
        addMask('0', '9', nameMask);
        addMask(0x0300, 0x036F, nameMask);
        addMask(0x203F, 0x2040, nameMask);

        // --------
        // NC Names
        // --------
        // [4] NCName ::= NCNameStartChar NCNameChar*
        // which is just an XML Name, minus the ":"
        // See http://www.w3.org/TR/REC-xml-names/#ns-decl
        // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ...
        MASKS[':'] &= ~(NCNAME_START_CHARACTER | NCNAME_CHARACTER);

        // --------------------
        // Public ID characters
        // --------------------
        // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
        MASKS[0x20] |= PUBID_CHARACTER;
        MASKS[0xA] |= PUBID_CHARACTER;
        MASKS[0xD] |= PUBID_CHARACTER;
        addMask('A', 'Z', PUBID_CHARACTER);
        addMask('a', 'z', PUBID_CHARACTER);
        addMask('0', '9', PUBID_CHARACTER);
        for (char punctuation : "-'()+,./:=?;!*#@$_%".toCharArray()) {
            MASKS[punctuation] |= PUBID_CHARACTER;
        }
    }

    private XmlCharacters() {
    }

    /**
     * Add the supplied classification flags to every table entry in the inclusive range.
     * 
     * @param first the first code point in the range; must be within the table
     * @param last the last code point in the range (inclusive); must be within the table
     * @param mask the classification flags to add
     */
    private static void addMask( int first,
                                 int last,
                                 int mask ) {
        for (int i = first; i <= last; ++i) {
            MASKS[i] |= mask;
        }
    }

    /**
     * Determine whether the table marks the supplied code point with any of the supplied flags. Code points
     * outside the table (negative, or beyond the Basic Multilingual Plane) are never marked.
     * 
     * @param c the code point
     * @param mask the classification flags to look for
     * @return true if the code point is within the table and has at least one of the flags set
     */
    private static boolean hasMask( int c,
                                    int mask ) {
        return c >= 0 && c < NUMBER_OF_CHARACTERS && (MASKS[c] & mask) != 0;
    }

    /**
     * Determine whether the supplied code point satisfies one of the name-related classifications. In addition
     * to the table lookup, this accepts the supplementary range [#x10000-#xEFFFF], which the XML specification
     * allows in every position of a Name (and therefore of an NCName) but which does not fit in the table.
     * 
     * @param c the code point
     * @param mask one of the name-related classification flags
     * @return true if the code point is allowed by the classification
     */
    private static boolean hasNameMask( int c,
                                        int mask ) {
        return hasMask(c, mask) || (FIRST_SUPPLEMENTARY_NAME_CHARACTER <= c && c <= LAST_SUPPLEMENTARY_NAME_CHARACTER);
    }

    /**
     * Determine whether every code point of the supplied string satisfies the name production described by the
     * two masks. The string is walked by code point so that surrogate pairs are evaluated as a single character,
     * and the walk is bounded by the string length rather than by a sentinel character.
     * 
     * @param name the string being checked; may be null
     * @param startMask the classification flag required of the first code point
     * @param remainingMask the classification flag required of every subsequent code point
     * @return true if the string is non-empty and all code points are allowed, or false otherwise
     */
    private static boolean matchesName( String name,
                                        int startMask,
                                        int remainingMask ) {
        if (name == null || name.length() == 0) {
            return false;
        }
        int codePoint = name.codePointAt(0);
        if (!hasNameMask(codePoint, startMask)) {
            return false;
        }
        int index = Character.charCount(codePoint);
        final int length = name.length();
        while (index < length) {
            codePoint = name.codePointAt(index);
            if (!hasNameMask(codePoint, remainingMask)) {
                return false;
            }
            index += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML Name.
     * The first character in an XML name is more restrictive than the {@link #isValidName(int) remaining characters}.
     * 
     * @param c the character (code point)
     * @return true if the character is valid for an XML Name's first character
     */
    public static boolean isValidNameStart( int c ) {
        return hasNameMask(c, NAME_START_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid first character in an XML NCName.
     * The first character in an XML NCName is more restrictive than the {@link #isValidNcName(int) remaining characters}.
     * 
     * @param c the character (code point)
     * @return true if the character is valid for an XML NCName's first character
     */
    public static boolean isValidNcNameStart( int c ) {
        return hasNameMask(c, NCNAME_START_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML Name.
     * The {@link #isValidNameStart(int) first character} in an XML name is more restrictive than the remaining characters.
     * 
     * @param c the character (code point)
     * @return true if the character is valid character in an XML Name
     */
    public static boolean isValidName( int c ) {
        return hasNameMask(c, NAME_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid non-first character in an XML NCName.
     * The {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters.
     * 
     * @param c the character (code point)
     * @return true if the character is valid character in an XML NCName
     */
    public static boolean isValidNcName( int c ) {
        return hasNameMask(c, NCNAME_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in an XML Pubid.
     * 
     * @param c the character (code point)
     * @return true if the character is valid character in an XML Pubid
     */
    public static boolean isValidPubid( int c ) {
        return hasMask(c, PUBID_CHARACTER);
    }

    /**
     * Determine whether the supplied character is a valid character in XML.
     * 
     * @param c the character (code point)
     * @return true if the character is valid character in XML
     */
    public static boolean isValid( int c ) {
        // [#x10000-#x10FFFF] is exactly the set of supplementary code points
        return hasMask(c, VALID_CHARACTER) || Character.isSupplementaryCodePoint(c);
    }

    /**
     * Determine whether the supplied character is a valid character in XML content. This is any
     * {@link #isValid(int) valid} character other than '&lt;', '&amp;', ']', line feed and carriage return.
     * 
     * @param c the character (code point)
     * @return true if the character is valid character in XML content
     */
    public static boolean isValidContent( int c ) {
        return hasMask(c, CONTENT_CHARACTER) || Character.isSupplementaryCodePoint(c);
    }

    /**
     * Determine whether the supplied character is a valid whitespace character in XML
     * 
     * @param c the character (code point)
     * @return true if the character is valid whitespace character in XML
     */
    public static boolean isValidSpace( int c ) {
        return hasMask(c, SPACE_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML Name.
     * 
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML Name, or false otherwise (including when the name is
     *         null or empty)
     */
    public static boolean isValidName( String name ) {
        return matchesName(name, NAME_START_CHARACTER, NAME_CHARACTER);
    }

    /**
     * Determine if the supplied name is a valid XML NCName.
     * 
     * @param name the string being checked; may be null
     * @return true if the supplied name is indeed a valid XML NCName, or false otherwise (including when the name is
     *         null or empty)
     */
    public static boolean isValidNcName( String name ) {
        return matchesName(name, NCNAME_START_CHARACTER, NCNAME_CHARACTER);
    }
}