001 /* 002 * JBoss DNA (http://www.jboss.org/dna) 003 * See the COPYRIGHT.txt file distributed with this work for information 004 * regarding copyright ownership. Some portions may be licensed 005 * to Red Hat, Inc. under one or more contributor license agreements. 006 * See the AUTHORS.txt file in the distribution for a full listing of 007 * individual contributors. 008 * 009 * Unless otherwise indicated, all code in JBoss DNA is licensed 010 * to you under the terms of the GNU Lesser General Public License as 011 * published by the Free Software Foundation; either version 2.1 of 012 * the License, or (at your option) any later version. 013 * 014 * JBoss DNA is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 017 * Lesser General Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser General Public 020 * License along with this software; if not, write to the Free 021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 023 */ 024 package org.jboss.dna.common.xml; 025 026 import java.text.CharacterIterator; 027 import java.text.StringCharacterIterator; 028 029 /** 030 * A utility class for determining the validity of various XML names, per the 031 * <a href="http://www.w3.org/TR/REC-xml/">XML 1.0 Specification</a>. 032 */ 033 public class XmlCharacters { 034 035 private static final int NUMBER_OF_CHARACTERS = 1 << 16; // 65536 or 0x10000 036 037 /** 038 * This implementation uses an array that captures for each character the XML classifications. 039 * An array is used because it is a fast way of looking up each character. 040 */ 041 private static final char[] MASKS = new char[NUMBER_OF_CHARACTERS]; 042 043 private static final int VALID_CHARACTER = 1; 044 private static final int CONTENT_CHARACTER = 1 <<1; 045 private static final int SPACE_CHARACTER = 1 <<2; 046 private static final int NAME_START_CHARACTER = 1<<3; 047 private static final int NAME_CHARACTER = 1<<4; 048 private static final int NCNAME_START_CHARACTER = 1<<5; 049 private static final int NCNAME_CHARACTER = 1<<6; 050 private static final int PUBID_CHARACTER = 1<<7; 051 052 static { 053 054 // ---------------- 055 // Valid Characters 056 // ---------------- 057 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 058 // See http://www.w3.org/TR/REC-xml/#charsets 059 MASKS[0x9] |= VALID_CHARACTER | CONTENT_CHARACTER; 060 MASKS[0xA] |= VALID_CHARACTER | CONTENT_CHARACTER; 061 MASKS[0xD] |= VALID_CHARACTER | CONTENT_CHARACTER; 062 for (int i = 0x20; i <= 0xD7FF; ++i) MASKS[i] |= VALID_CHARACTER | CONTENT_CHARACTER; 063 for (int i = 0xE000; i <= 0xFFFD; ++i) MASKS[i] |= VALID_CHARACTER | CONTENT_CHARACTER; 064 // Last range is bigger than our character array, so we'll handle in the 'isValid' method ... 065 // for ( int i=0x10000; i<=0x10FFFF; ++i ) MASKS[i] = VALID_CHARACTER_MASK | CONTENT_CHARACTER; 066 067 // Remove the other characters that are not allowed in XML content: 068 // '<', '&', '\n', '\r', ']' 069 MASKS['<'] &= ~(CONTENT_CHARACTER); 070 MASKS['&'] &= ~(CONTENT_CHARACTER); 071 MASKS['\n'] &= ~(CONTENT_CHARACTER); 072 MASKS['\r'] &= ~(CONTENT_CHARACTER); 073 MASKS[']'] &= ~(CONTENT_CHARACTER); 074 075 // --------------------- 076 // Whitespace Characters 077 // --------------------- 078 // [3] S ::= (#x20 | #x9 | #xD | #xA)+ 079 // See http://www.w3.org/TR/REC-xml/#sec-common-syn 080 MASKS[0x20] |= SPACE_CHARACTER; 081 MASKS[0x9] |= SPACE_CHARACTER; 082 MASKS[0xA] |= SPACE_CHARACTER; 083 MASKS[0xD] |= SPACE_CHARACTER; 084 085 // --------------------- 086 // Name Start Characters 087 // --------------------- 088 // [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | 089 // [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | 090 // [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | 091 // [#x10000-#xEFFFF] 092 // See http://www.w3.org/TR/REC-xml/#sec-common-syn 093 // 094 // Note that all these start characters AND characters are valid for NAME and NCNAME 095 int nameStartMask = NAME_START_CHARACTER | NCNAME_START_CHARACTER | NAME_CHARACTER | NCNAME_CHARACTER; 096 MASKS[':'] |= nameStartMask; 097 MASKS['_'] |= nameStartMask; 098 for (int i = 'A'; i <= 'Z'; ++i) MASKS[i] |= nameStartMask; 099 for (int i = 'a'; i <= 'z'; ++i) MASKS[i] |= nameStartMask; 100 for (int i = 0xC0; i <= 0xD6; ++i) MASKS[i] |= nameStartMask; 101 for (int i = 0xD8; i <= 0xF6; ++i) MASKS[i] |= nameStartMask; 102 for (int i = 0xF8; i <= 0x2FF; ++i) MASKS[i] |= nameStartMask; 103 for (int i = 0x370; i <= 0x37D; ++i) MASKS[i] |= nameStartMask; 104 for (int i = 0x37F; i <= 0x1FFF; ++i) MASKS[i] |= nameStartMask; 105 for (int i = 0x200C; i <= 0x200D; ++i) MASKS[i] |= nameStartMask; 106 for (int i = 0x2070; i <= 0x218F; ++i) MASKS[i] |= nameStartMask; 107 for (int i = 0x2C00; i <= 0x2FEF; ++i) MASKS[i] |= nameStartMask; 108 for (int i = 0x3001; i <= 0xD7FF; ++i) MASKS[i] |= nameStartMask; 109 for (int i = 0xF900; i <= 0xFDCF; ++i) MASKS[i] |= nameStartMask; 110 for (int i = 0xFDF0; i <= 0xFFFD; ++i) MASKS[i] |= nameStartMask; 111 // Last range is bigger than our character array ... 112 //for (int i = 0x10000; i <= 0xEFFFF; ++i) MASKS[i] |= nameStartMask; 113 114 // --------------- 115 // Name Characters 116 // --------------- 117 // [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] 118 // See http://www.w3.org/TR/REC-xml/#sec-common-syn 119 // 120 // Note that all these characters are valid for NAME and NCNAME 121 int nameMask = NAME_CHARACTER | NCNAME_CHARACTER; 122 MASKS['-'] |= nameMask; 123 MASKS['.'] |= nameMask; 124 MASKS[0xB7] |= nameMask; 125 for (int i = '0'; i <= '9'; ++i) MASKS[i] |= nameMask; 126 for (int i = 0x0300; i <= 0x036F; ++i) MASKS[i] |= nameStartMask; 127 for (int i = 0x203F; i <= 0x2040; ++i) MASKS[i] |= nameStartMask; 128 129 // -------- 130 // NC Names 131 // -------- 132 // [4] NCName ::= NCNameStartChar NCNameChar* 133 // which is just an XML Name, minus the ":" 134 // See http://www.w3.org/TR/REC-xml-names/#ns-decl 135 // So, remove the NCNAME_CHARACTER and NCNAME_START_CHARACTER masks from ':' ... 136 MASKS[':'] &= ~(NCNAME_START_CHARACTER | NCNAME_CHARACTER); 137 138 // -------------------- 139 // Public ID characters 140 // -------------------- 141 // [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%] 142 MASKS[0x20] |= PUBID_CHARACTER; 143 MASKS[0xA] |= PUBID_CHARACTER; 144 MASKS[0xD] |= PUBID_CHARACTER; 145 for (int i = 'A'; i <= 'Z'; ++i) MASKS[i] |= PUBID_CHARACTER; 146 for (int i = 'a'; i <= 'z'; ++i) MASKS[i] |= PUBID_CHARACTER; 147 for (int i = '0'; i <= '9'; ++i) MASKS[i] |= PUBID_CHARACTER; 148 MASKS['-'] |= PUBID_CHARACTER; 149 MASKS['\''] |= PUBID_CHARACTER; 150 MASKS['('] |= PUBID_CHARACTER; 151 MASKS[')'] |= PUBID_CHARACTER; 152 MASKS['+'] |= PUBID_CHARACTER; 153 MASKS[','] |= PUBID_CHARACTER; 154 MASKS['.'] |= PUBID_CHARACTER; 155 MASKS['/'] |= PUBID_CHARACTER; 156 MASKS[':'] |= PUBID_CHARACTER; 157 MASKS['='] |= PUBID_CHARACTER; 158 MASKS['?'] |= PUBID_CHARACTER; 159 MASKS[';'] |= PUBID_CHARACTER; 160 MASKS['!'] |= PUBID_CHARACTER; 161 MASKS['*'] |= PUBID_CHARACTER; 162 MASKS['#'] |= PUBID_CHARACTER; 163 MASKS['@'] |= PUBID_CHARACTER; 164 MASKS['$'] |= PUBID_CHARACTER; 165 MASKS['_'] |= PUBID_CHARACTER; 166 MASKS['%'] |= PUBID_CHARACTER; 167 168 } 169 170 private XmlCharacters() { 171 } 172 173 /** 174 * Determine whether the supplied character is a valid first character in an XML Name. 175 * The first character in an XML name is more restrictive than the {@link #isValidName(int) remaining characters}. 176 * 177 * @param c the character 178 * @return true if the character is valid for an XML Name's first character 179 */ 180 public static boolean isValidNameStart( int c ) { 181 return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NAME_START_CHARACTER ) != 0; 182 } 183 184 /** 185 * Determine whether the supplied character is a valid first character in an XML NCName. 186 * The first character in an XML NCName is more restrictive than the {@link #isValidName(int) remaining characters}. 187 * 188 * @param c the character 189 * @return true if the character is valid for an XML NCName's first character 190 */ 191 public static boolean isValidNcNameStart( int c ) { 192 return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NCNAME_START_CHARACTER ) != 0; 193 } 194 195 /** 196 * Determine whether the supplied character is a valid non-first character in an XML Name. 197 * The {@link #isValidNameStart(int) first character} in an XML name is more restrictive than the remaining characters. 198 * 199 * @param c the character 200 * @return true if the character is valid character in an XML Name 201 */ 202 public static boolean isValidName( int c ) { 203 return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NAME_CHARACTER ) != 0; 204 } 205 206 /** 207 * Determine whether the supplied character is a valid non-first character in an XML NCName. 208 * The {@link #isValidNcNameStart(int) first character} in an XML NCName is more restrictive than the remaining characters. 209 * 210 * @param c the character 211 * @return true if the character is valid character in an XML NCName 212 */ 213 public static boolean isValidNcName( int c ) { 214 return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & NCNAME_CHARACTER ) != 0; 215 } 216 217 /** 218 * Determine whether the supplied character is a valid character in an XML Pubid. 219 * 220 * @param c the character 221 * @return true if the character is valid character in an XML Pubid 222 */ 223 public static boolean isValidPubid( int c ) { 224 return c < NUMBER_OF_CHARACTERS && ( MASKS[c] & PUBID_CHARACTER ) != 0; 225 } 226 227 /** 228 * Determine whether the supplied character is a valid character in XML. 229 * 230 * @param c the character 231 * @return true if the character is valid character in XML 232 */ 233 public static boolean isValid( int c ) { 234 return (c < NUMBER_OF_CHARACTERS && ( MASKS[c] & VALID_CHARACTER ) != 0) || ( 0x10000 <= c && c <= 0x10FFFF); 235 } 236 237 /** 238 * Determine whether the supplied character is a valid character in XML content 239 * 240 * @param c the character 241 * @return true if the character is valid character in XML content 242 */ 243 public static boolean isValidContent( int c ) { 244 return (c < NUMBER_OF_CHARACTERS && ( MASKS[c] & CONTENT_CHARACTER ) != 0) || ( 0x10000 <= c && c <= 0x10FFFF); 245 } 246 247 /** 248 * Determine whether the supplied character is a valid whitespace character in XML 249 * 250 * @param c the character 251 * @return true if the character is valid whitespace character in XML 252 */ 253 public static boolean isValidSpace( int c ) { 254 return c <= 0x20 && ( MASKS[c] & SPACE_CHARACTER ) != 0; 255 } 256 257 /** 258 * Determine if the supplied name is a valid XML Name. 259 * 260 * @param name the string being checked 261 * @return true if the supplied name is indeed a valid XML Name, or false otherwise 262 */ 263 public static boolean isValidName( String name ) { 264 if ( name == null || name.length() == 0 ) return false; 265 CharacterIterator iter = new StringCharacterIterator(name); 266 char c = iter.first(); 267 if ( !isValidNameStart(c) ) return false; 268 while ( c != CharacterIterator.DONE ) { 269 if ( !isValidName(c) ) return false; 270 c = iter.next(); 271 } 272 return true; 273 } 274 275 /** 276 * Determine if the supplied name is a valid XML NCName. 277 * 278 * @param name the string being checked 279 * @return true if the supplied name is indeed a valid XML NCName, or false otherwise 280 */ 281 public static boolean isValidNcName( String name ) { 282 if ( name == null || name.length() == 0 ) return false; 283 CharacterIterator iter = new StringCharacterIterator(name); 284 char c = iter.first(); 285 if ( !isValidNcNameStart(c) ) return false; 286 while ( c != CharacterIterator.DONE ) { 287 if ( !isValidNcName(c) ) return false; 288 c = iter.next(); 289 } 290 return true; 291 } 292 }