/*
 * JBoss, Home of Professional Open Source.
 * Copyright 2008, Red Hat Middleware LLC, and individual contributors
 * as indicated by the @author tags. See the copyright.txt file in the
 * distribution for a full listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.jboss.dna.common.text;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jboss.dna.common.util.CheckArg;

/**
 * Transforms words to singular, plural, humanized (human readable), underscore, camel case, or ordinal form. This is inspired by
 * the <a href="http://api.rubyonrails.org/classes/Inflector.html">Inflector</a> class in <a
 * href="http://www.rubyonrails.org">Ruby on Rails</a>, which is distributed under the <a
 * href="http://wiki.rubyonrails.org/rails/pages/License">Rails license</a>.
 * <p>
 * All case conversions performed by this class use {@link Locale#ENGLISH}, so results do not depend on the default locale of
 * the JVM.
 * </p>
 *
 * @author Randall Hauch
 */
public class Inflector {

    protected static final Inflector INSTANCE = new Inflector();

    /**
     * Get the shared inflector instance, which is populated with the default rules.
     *
     * @return the shared instance; never null
     */
    public static final Inflector getInstance() {
        return INSTANCE;
    }

    /**
     * A single inflection rule: a case-insensitive regular expression plus the replacement string (which may contain group
     * references such as <code>$1</code>) that is substituted for every match.
     */
    protected class Rule {

        protected final String expression;
        protected final Pattern expressionPattern;
        protected final String replacement;
        /** Hash of the case-folded expression, so that it is consistent with the case-insensitive {@link #equals(Object)}. */
        private final int hash;

        protected Rule( String expression,
                        String replacement ) {
            this.expression = expression;
            this.replacement = replacement != null ? replacement : "";
            this.expressionPattern = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
            this.hash = caseInsensitiveHash(this.expression);
        }

        /**
         * Apply the rule against the input string, returning the modified string or null if the rule didn't apply (and no
         * modifications were made)
         *
         * @param input the input string
         * @return the modified string if this rule applied, or null if the input was not modified by this rule
         */
        protected String apply( String input ) {
            Matcher matcher = this.expressionPattern.matcher(input);
            if (!matcher.find()) return null;
            return matcher.replaceAll(this.replacement);
        }

        @Override
        public int hashCode() {
            return hash;
        }

        /**
         * Two rules are equal when they are of the same class and their expressions are equal ignoring case.
         */
        @Override
        public boolean equals( Object obj ) {
            if (obj == this) return true;
            if (obj != null && obj.getClass() == this.getClass()) {
                final Rule that = (Rule)obj;
                if (this.expression.equalsIgnoreCase(that.expression)) return true;
            }
            return false;
        }

        @Override
        public String toString() {
            return expression + ", " + replacement;
        }
    }

    /**
     * Compute a hash code that is equal for any two strings for which {@link String#equalsIgnoreCase(String)} returns true. Each
     * code point is folded the same way <code>equalsIgnoreCase</code> compares characters (uppercase, then lowercase).
     *
     * @param text the text to hash; may not be null
     * @return the case-insensitive hash code
     */
    private static int caseInsensitiveHash( String text ) {
        int hash = 0;
        for (int i = 0; i < text.length();) {
            int codePoint = text.codePointAt(i);
            hash = 31 * hash + Character.toLowerCase(Character.toUpperCase(codePoint));
            i += Character.charCount(codePoint);
        }
        return hash;
    }

    // Rules are always added at the front of these lists, so the most recently added rule is tried first.
    private final LinkedList<Rule> plurals = new LinkedList<Rule>();
    private final LinkedList<Rule> singulars = new LinkedList<Rule>();
    /**
     * The lowercase words that are to be excluded and not processed. This set can be modified by the users via
     * {@link #getUncountables()}.
     */
    private final Set<String> uncountables = new HashSet<String>();

    /**
     * Create an inflector populated with the default rules (see {@link #initialize()}).
     */
    public Inflector() {
        // NOTE: initialize() is overridable, so subclasses must not rely on their own fields being set when it runs.
        initialize();
    }

    /**
     * Create an inflector that starts with a copy of the rules and uncountable words of the supplied inflector.
     *
     * @param original the inflector to copy; may not be null
     */
    protected Inflector( Inflector original ) {
        this.plurals.addAll(original.plurals);
        this.singulars.addAll(original.singulars);
        this.uncountables.addAll(original.uncountables);
    }

    /**
     * Create an independent copy of this inflector. Changes to the rules of the copy do not affect this instance.
     *
     * @return the copy; never null
     */
    @Override
    public Inflector clone() {
        return new Inflector(this);
    }

    // ------------------------------------------------------------------------------------------------
    // Usage functions
    // ------------------------------------------------------------------------------------------------

    /**
     * Returns the plural form of the word in the string.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.pluralize(&quot;post&quot;)               #=&gt; &quot;posts&quot;
     *   inflector.pluralize(&quot;octopus&quot;)            #=&gt; &quot;octopi&quot;
     *   inflector.pluralize(&quot;sheep&quot;)              #=&gt; &quot;sheep&quot;
     *   inflector.pluralize(&quot;words&quot;)              #=&gt; &quot;words&quot;
     *   inflector.pluralize(&quot;the blue mailman&quot;)   #=&gt; &quot;the blue mailmen&quot;
     *   inflector.pluralize(&quot;CamelOctopus&quot;)       #=&gt; &quot;CamelOctopi&quot;
     * </pre>
     *
     * </p>
     * <p>
     * Note that {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
     * </p>
     *
     * @param word the word that is to be pluralized.
     * @return the pluralized form of the (trimmed) word, the word itself if it could not be pluralized, or null if the supplied
     *         word is null
     * @see #singularize(Object)
     */
    public String pluralize( Object word ) {
        if (word == null) return null;
        String wordStr = word.toString().trim();
        if (wordStr.length() == 0) return wordStr;
        if (isUncountable(wordStr)) return wordStr;
        // The first rule that matches wins ...
        for (Rule rule : this.plurals) {
            String result = rule.apply(wordStr);
            if (result != null) return result;
        }
        return wordStr;
    }

    /**
     * Returns the form of the word that is appropriate for the supplied count: the word as supplied if the count is 1 or -1, or
     * the {@link #pluralize(Object) plural form} otherwise.
     *
     * @param word the word that is to be pluralized.
     * @param count the number of things the word describes
     * @return the string form of the word if the count is 1 or -1, the pluralized form for any other count, or null if the
     *         supplied word is null
     * @see #pluralize(Object)
     */
    public String pluralize( Object word,
                             int count ) {
        if (word == null) return null;
        if (count == 1 || count == -1) {
            return word.toString();
        }
        return pluralize(word);
    }

    /**
     * Returns the singular form of the word in the string.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.singularize(&quot;posts&quot;)             #=&gt; &quot;post&quot;
     *   inflector.singularize(&quot;octopi&quot;)            #=&gt; &quot;octopus&quot;
     *   inflector.singularize(&quot;sheep&quot;)             #=&gt; &quot;sheep&quot;
     *   inflector.singularize(&quot;words&quot;)             #=&gt; &quot;word&quot;
     *   inflector.singularize(&quot;the blue mailmen&quot;)  #=&gt; &quot;the blue mailman&quot;
     *   inflector.singularize(&quot;CamelOctopi&quot;)       #=&gt; &quot;CamelOctopus&quot;
     * </pre>
     *
     * </p>
     * <p>
     * Note that {@link Object#toString()} is called on the supplied object, so this method works for non-strings, too.
     * </p>
     *
     * @param word the word that is to be singularized.
     * @return the singularized form of the (trimmed) word, the word itself if it could not be singularized, or null if the
     *         supplied word is null
     * @see #pluralize(Object)
     */
    public String singularize( Object word ) {
        if (word == null) return null;
        String wordStr = word.toString().trim();
        if (wordStr.length() == 0) return wordStr;
        if (isUncountable(wordStr)) return wordStr;
        // The first rule that matches wins ...
        for (Rule rule : this.singulars) {
            String result = rule.apply(wordStr);
            if (result != null) return result;
        }
        return wordStr;
    }

    /**
     * Converts strings to lowerCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.lowerCamelCase(&quot;active_record&quot;)       #=&gt; &quot;activeRecord&quot;
     *   inflector.lowerCamelCase(&quot;first_name&quot;)          #=&gt; &quot;firstName&quot;
     *   inflector.lowerCamelCase(&quot;name&quot;)                #=&gt; &quot;name&quot;
     *   inflector.lowerCamelCase(&quot;the-first_name&quot;,'-')  #=&gt; &quot;theFirstName&quot;
     * </pre>
     *
     * </p>
     *
     * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
     * @param delimiterChars optional characters that are used to delimit word boundaries
     * @return the lower camel case version of the word
     * @see #underscore(String, char[])
     * @see #camelCase(String, boolean, char[])
     * @see #upperCamelCase(String, char[])
     */
    public String lowerCamelCase( String lowerCaseAndUnderscoredWord,
                                  char... delimiterChars ) {
        return camelCase(lowerCaseAndUnderscoredWord, false, delimiterChars);
    }

    /**
     * Converts strings to UpperCamelCase. This method will also use any extra delimiter characters to identify word boundaries.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.upperCamelCase(&quot;active_record&quot;)       #=&gt; &quot;ActiveRecord&quot;
     *   inflector.upperCamelCase(&quot;first_name&quot;)          #=&gt; &quot;FirstName&quot;
     *   inflector.upperCamelCase(&quot;name&quot;)                #=&gt; &quot;Name&quot;
     *   inflector.upperCamelCase(&quot;the-first_name&quot;,'-')  #=&gt; &quot;TheFirstName&quot;
     * </pre>
     *
     * </p>
     *
     * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
     * @param delimiterChars optional characters that are used to delimit word boundaries
     * @return the upper camel case version of the word
     * @see #underscore(String, char[])
     * @see #camelCase(String, boolean, char[])
     * @see #lowerCamelCase(String, char[])
     */
    public String upperCamelCase( String lowerCaseAndUnderscoredWord,
                                  char... delimiterChars ) {
        return camelCase(lowerCaseAndUnderscoredWord, true, delimiterChars);
    }

    /**
     * Converts strings to UpperCamelCase or, if the <code>uppercaseFirstLetter</code> argument is false, to lowerCamelCase. This
     * method will also use any extra delimiter characters to identify word boundaries.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.camelCase(&quot;active_record&quot;,false)    #=&gt; &quot;activeRecord&quot;
     *   inflector.camelCase(&quot;active_record&quot;,true)     #=&gt; &quot;ActiveRecord&quot;
     *   inflector.camelCase(&quot;first_name&quot;,false)       #=&gt; &quot;firstName&quot;
     *   inflector.camelCase(&quot;first_name&quot;,true)        #=&gt; &quot;FirstName&quot;
     *   inflector.camelCase(&quot;name&quot;,false)             #=&gt; &quot;name&quot;
     *   inflector.camelCase(&quot;name&quot;,true)              #=&gt; &quot;Name&quot;
     * </pre>
     *
     * </p>
     *
     * @param lowerCaseAndUnderscoredWord the word that is to be converted to camel case
     * @param uppercaseFirstLetter true if the first character is to be uppercased, or false if the first character is to be
     *        lowercased
     * @param delimiterChars optional characters that are used to delimit word boundaries
     * @return the camel case version of the (trimmed) word, or null if the supplied word is null
     * @see #underscore(String, char[])
     * @see #upperCamelCase(String, char[])
     * @see #lowerCamelCase(String, char[])
     */
    public String camelCase( String lowerCaseAndUnderscoredWord,
                             boolean uppercaseFirstLetter,
                             char... delimiterChars ) {
        if (lowerCaseAndUnderscoredWord == null) return null;
        String word = lowerCaseAndUnderscoredWord.trim();
        if (word.length() == 0) return "";
        // Replace any extra delimiters with underscores (before the underscores are converted in the next step)...
        if (delimiterChars != null) {
            for (char delimiterChar : delimiterChars) {
                word = word.replace(delimiterChar, '_');
            }
        }
        // Change the case at the beginning and after each underscore (the underscores themselves are dropped) ...
        String upperCamel = replaceAllWithUppercase(word, "(^|_)(.)", 2);
        if (uppercaseFirstLetter || upperCamel.length() == 0) return upperCamel;
        // Derive the lower camel case form from the upper camel case form, so that a leading delimiter or a leading
        // uppercase character in the input cannot leak into the result ...
        return Character.toLowerCase(upperCamel.charAt(0)) + upperCamel.substring(1);
    }

    /**
     * Makes an underscored form from the expression in the string (the reverse of the
     * {@link #camelCase(String, boolean, char[]) camelCase} method). Hyphens and any characters that match the supplied
     * delimiters are also changed into underscores.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.underscore(&quot;activeRecord&quot;)         #=&gt; &quot;active_record&quot;
     *   inflector.underscore(&quot;ActiveRecord&quot;)         #=&gt; &quot;active_record&quot;
     *   inflector.underscore(&quot;firstName&quot;)            #=&gt; &quot;first_name&quot;
     *   inflector.underscore(&quot;FirstName&quot;)            #=&gt; &quot;first_name&quot;
     *   inflector.underscore(&quot;name&quot;)                 #=&gt; &quot;name&quot;
     *   inflector.underscore(&quot;The.firstName&quot;,'.')    #=&gt; &quot;the_first_name&quot;
     * </pre>
     *
     * </p>
     *
     * @param camelCaseWord the camel-cased word that is to be converted;
     * @param delimiterChars optional characters that are used to delimit word boundaries (beyond capitalization)
     * @return a lower-cased version of the input, with separate words delimited by the underscore character, or null if the
     *         supplied word is null
     */
    public String underscore( String camelCaseWord,
                              char... delimiterChars ) {
        if (camelCaseWord == null) return null;
        String result = camelCaseWord.trim();
        if (result.length() == 0) return "";
        // Split an acronym from a following capitalized word ("XMLParser" -> "XML_Parser") ...
        result = result.replaceAll("([A-Z]+)([A-Z][a-z])", "$1_$2");
        // Split at every lower-to-upper (or digit-to-upper) transition ("firstName" -> "first_Name") ...
        result = result.replaceAll("([a-z\\d])([A-Z])", "$1_$2");
        result = result.replace('-', '_');
        if (delimiterChars != null) {
            for (char delimiterChar : delimiterChars) {
                result = result.replace(delimiterChar, '_');
            }
        }
        return result.toLowerCase(Locale.ENGLISH);
    }

    /**
     * Returns a copy of the input with the first character converted to uppercase and the remainder to lowercase.
     *
     * @param words the word to be capitalized
     * @return the (trimmed) string with the first character capitalized and the remaining characters lowercased, or null if the
     *         supplied string is null
     */
    public String capitalize( String words ) {
        if (words == null) return null;
        String result = words.trim();
        if (result.length() == 0) return "";
        if (result.length() == 1) return result.toUpperCase(Locale.ENGLISH);
        return "" + Character.toUpperCase(result.charAt(0)) + result.substring(1).toLowerCase(Locale.ENGLISH);
    }

    /**
     * Capitalizes the first word and turns underscores into spaces and strips trailing "_id" and any supplied removable tokens.
     * Like {@link #titleCase(String, String[])}, this is meant for creating pretty output.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.humanize(&quot;employee_salary&quot;)       #=&gt; &quot;Employee salary&quot;
     *   inflector.humanize(&quot;author_id&quot;)             #=&gt; &quot;Author&quot;
     * </pre>
     *
     * </p>
     *
     * @param lowerCaseAndUnderscoredWords the input to be humanized
     * @param removableTokens optional array of tokens that are to be removed; each token is interpreted as a regular expression
     *        (see {@link String#replaceAll(String, String)}), and null tokens are ignored
     * @return the humanized string, or null if the supplied input is null
     * @see #titleCase(String, String[])
     */
    public String humanize( String lowerCaseAndUnderscoredWords,
                            String... removableTokens ) {
        if (lowerCaseAndUnderscoredWords == null) return null;
        String result = lowerCaseAndUnderscoredWords.trim();
        if (result.length() == 0) return "";
        // Remove a trailing "_id" token
        result = result.replaceAll("_id$", "");
        // Remove all of the tokens that should be removed
        if (removableTokens != null) {
            for (String removableToken : removableTokens) {
                if (removableToken == null) continue;
                result = result.replaceAll(removableToken, "");
            }
        }
        result = result.replaceAll("_+", " "); // replace all adjacent underscores with a single space
        return capitalize(result);
    }

    /**
     * Capitalizes all the words and replaces some characters in the string to create a nicer looking title. Underscores are
     * changed to spaces, a trailing "_id" is removed, and any of the supplied tokens are removed. Like
     * {@link #humanize(String, String[])}, this is meant for creating pretty output.
     * <p>
     * Examples:
     *
     * <pre>
     *   inflector.titleCase(&quot;man from the boondocks&quot;)       #=&gt; &quot;Man From The Boondocks&quot;
     *   inflector.titleCase(&quot;x-men: the last stand&quot;)        #=&gt; &quot;X-Men: The Last Stand&quot;
     * </pre>
     *
     * </p>
     *
     * @param words the input to be turned into title case
     * @param removableTokens optional array of tokens that are to be removed
     * @return the title-case version of the supplied words, or null if the supplied words are null
     */
    public String titleCase( String words,
                             String... removableTokens ) {
        String result = humanize(words, removableTokens);
        if (result == null) return null;
        result = replaceAllWithUppercase(result, "\\b([a-z])", 1); // change first char of each word to uppercase
        return result;
    }

    /**
     * Turns a non-negative number into an ordinal string used to denote the position in an ordered sequence, such as 1st, 2nd,
     * 3rd, 4th. Numbers whose last two digits are 11, 12 or 13 always get the "th" suffix (11th, 112th, 1013th).
     *
     * @param number the non-negative number; a negative number is given the suffix of its magnitude
     * @return the string with the number and ordinal suffix
     */
    public String ordinalize( int number ) {
        String numberStr = Integer.toString(number);
        // The remainder is taken before the absolute value so that Integer.MIN_VALUE cannot overflow ...
        int lastTwoDigits = Math.abs(number % 100);
        if (11 <= lastTwoDigits && lastTwoDigits <= 13) return numberStr + "th";
        int lastDigit = lastTwoDigits % 10;
        if (lastDigit == 1) return numberStr + "st";
        if (lastDigit == 2) return numberStr + "nd";
        if (lastDigit == 3) return numberStr + "rd";
        return numberStr + "th";
    }

    // ------------------------------------------------------------------------------------------------
    // Management methods
    // ------------------------------------------------------------------------------------------------

    /**
     * Determine whether the supplied word is considered uncountable by the {@link #pluralize(Object) pluralize} and
     * {@link #singularize(Object) singularize} methods.
     *
     * @param word the word
     * @return true if the plural and singular forms of the word are the same
     */
    public boolean isUncountable( String word ) {
        if (word == null) return false;
        String trimmedLower = word.trim().toLowerCase(Locale.ENGLISH);
        return this.uncountables.contains(trimmedLower);
    }

    /**
     * Get the set of words that are not processed by the Inflector. The resulting set is directly modifiable; words added to it
     * directly should be trimmed and in lowercase, since that is the form used by {@link #isUncountable(String)}.
     *
     * @return the set of uncountable words
     */
    public Set<String> getUncountables() {
        return uncountables;
    }

    /**
     * Add a pluralization rule. The rule is tried before all rules that were added earlier.
     *
     * @param rule the (case-insensitive) regular expression that identifies the words the rule applies to
     * @param replacement the replacement for each match, which may reference groups of the expression; null is treated as an
     *        empty string
     */
    public void addPluralize( String rule,
                              String replacement ) {
        final Rule pluralizeRule = new Rule(rule, replacement);
        this.plurals.addFirst(pluralizeRule);
    }

    /**
     * Add a singularization rule. The rule is tried before all rules that were added earlier.
     *
     * @param rule the (case-insensitive) regular expression that identifies the words the rule applies to
     * @param replacement the replacement for each match, which may reference groups of the expression; null is treated as an
     *        empty string
     */
    public void addSingularize( String rule,
                                String replacement ) {
        final Rule singularizeRule = new Rule(rule, replacement);
        this.singulars.addFirst(singularizeRule);
    }

    /**
     * Add a pair of rules for a word with an irregular plural. The first character of each form is captured and reused in the
     * replacement, so that the case of the first character of the input is preserved. Both forms are inserted into regular
     * expressions as-is, so they should not contain regular expression metacharacters.
     *
     * @param singular the singular form of the word
     * @param plural the plural form of the word
     */
    public void addIrregular( String singular,
                              String plural ) {
        CheckArg.isNotEmpty(singular, "singular rule");
        CheckArg.isNotEmpty(plural, "plural rule");
        String singularRemainder = singular.length() > 1 ? singular.substring(1) : "";
        String pluralRemainder = plural.length() > 1 ? plural.substring(1) : "";
        addPluralize("(" + singular.charAt(0) + ")" + singularRemainder + "$", "$1" + pluralRemainder);
        addSingularize("(" + plural.charAt(0) + ")" + pluralRemainder + "$", "$1" + singularRemainder);
    }

    /**
     * Add words that have the same singular and plural form. The words are trimmed and lowercased before being recorded; null
     * words are ignored.
     *
     * @param words the uncountable words
     */
    public void addUncountable( String... words ) {
        if (words == null || words.length == 0) return;
        for (String word : words) {
            if (word != null) uncountables.add(word.trim().toLowerCase(Locale.ENGLISH));
        }
    }

    /**
     * Utility method to replace every match of the regular expression with the uppercased form of the specified group of that
     * match, dropping whatever else the match covered.
     * <p>
     * The Java {@link Pattern regular expression processing} does not use the preprocessing directives <code>\l</code>,
     * <code>&#92;u</code>, <code>\L</code>, and <code>\U</code>. If so, such directives could be used in the replacement
     * string to uppercase or lowercase the backreferences. For example, <code>\L1</code> would lowercase the first
     * backreference, and <code>&#92;u3</code> would uppercase the 3rd backreference.
     * </p>
     *
     * @param input the string to be processed; may not be null
     * @param regex the regular expression that locates the text to be replaced
     * @param groupNumberToUppercase the number of the group within the regular expression whose uppercased text replaces each
     *        match
     * @return the input string with the appropriate characters converted to upper-case
     */
    protected static String replaceAllWithUppercase( String input,
                                                     String regex,
                                                     int groupNumberToUppercase ) {
        Pattern underscoreAndDotPattern = Pattern.compile(regex);
        Matcher matcher = underscoreAndDotPattern.matcher(input);
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            String uppercased = matcher.group(groupNumberToUppercase).toUpperCase(Locale.ENGLISH);
            // The matched text is literal, so any '$' or '\' in it must not be interpreted as a group reference or escape ...
            matcher.appendReplacement(sb, Matcher.quoteReplacement(uppercased));
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    /**
     * Completely remove all rules within this inflector.
     */
    public void clear() {
        this.uncountables.clear();
        this.plurals.clear();
        this.singulars.clear();
    }

    /**
     * Populate this inflector with the default English rules. Because each rule is added in front of the existing rules, the
     * rules listed last here are the first to be tried; the generic rules therefore come first and the special cases last.
     */
    protected void initialize() {
        Inflector inflect = this;
        inflect.addPluralize("$", "s");
        inflect.addPluralize("s$", "s");
        inflect.addPluralize("(ax|test)is$", "$1es");
        inflect.addPluralize("(octop|vir)us$", "$1i");
        inflect.addPluralize("(octop|vir)i$", "$1i"); // already plural
        inflect.addPluralize("(alias|status)$", "$1es");
        inflect.addPluralize("(bu)s$", "$1ses");
        inflect.addPluralize("(buffal|tomat)o$", "$1oes");
        inflect.addPluralize("([ti])um$", "$1a");
        inflect.addPluralize("([ti])a$", "$1a"); // already plural
        inflect.addPluralize("sis$", "ses");
        inflect.addPluralize("(?:([^f])fe|([lr])f)$", "$1$2ves");
        inflect.addPluralize("(hive)$", "$1s");
        inflect.addPluralize("([^aeiouy]|qu)y$", "$1ies");
        inflect.addPluralize("(x|ch|ss|sh)$", "$1es");
        // The alternation must be grouped, otherwise every word ending in "ex" (e.g. "complex") would match ...
        inflect.addPluralize("(matr|vert|ind)(?:ix|ex)$", "$1ices");
        inflect.addPluralize("([ml])ouse$", "$1ice");
        inflect.addPluralize("([ml])ice$", "$1ice");
        inflect.addPluralize("^(ox)$", "$1en");
        inflect.addPluralize("(quiz)$", "$1zes");
        // Need to check for the following words that are already pluralized:
        inflect.addPluralize("(people|men|children|sexes|moves|stadiums)$", "$1"); // irregulars
        inflect.addPluralize("(oxen|octopi|viri|aliases|quizzes)$", "$1"); // special rules

        inflect.addSingularize("s$", "");
        inflect.addSingularize("(s|si|u)s$", "$1s"); // '-us' and '-ss' are already singular
        inflect.addSingularize("(n)ews$", "$1ews");
        inflect.addSingularize("([ti])a$", "$1um");
        // Only the outer group is used; also emitting an inner group would duplicate its character ("analyasis") ...
        inflect.addSingularize("((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1sis");
        inflect.addSingularize("(^analy)ses$", "$1sis");
        inflect.addSingularize("(^analy)sis$", "$1sis"); // already singular, but ends in 's'
        inflect.addSingularize("([^f])ves$", "$1fe");
        inflect.addSingularize("(hive)s$", "$1");
        inflect.addSingularize("(tive)s$", "$1");
        inflect.addSingularize("([lr])ves$", "$1f");
        inflect.addSingularize("([^aeiouy]|qu)ies$", "$1y");
        inflect.addSingularize("(s)eries$", "$1eries");
        inflect.addSingularize("(m)ovies$", "$1ovie");
        inflect.addSingularize("(x|ch|ss|sh)es$", "$1");
        inflect.addSingularize("([ml])ice$", "$1ouse");
        inflect.addSingularize("(bus)es$", "$1");
        inflect.addSingularize("(o)es$", "$1");
        inflect.addSingularize("(shoe)s$", "$1");
        inflect.addSingularize("(cris|ax|test)is$", "$1is"); // already singular, but ends in 's'
        inflect.addSingularize("(cris|ax|test)es$", "$1is");
        inflect.addSingularize("(octop|vir)i$", "$1us");
        inflect.addSingularize("(octop|vir)us$", "$1us"); // already singular, but ends in 's'
        inflect.addSingularize("(alias|status)es$", "$1");
        inflect.addSingularize("(alias|status)$", "$1"); // already singular, but ends in 's'
        inflect.addSingularize("^(ox)en", "$1");
        inflect.addSingularize("(vert|ind)ices$", "$1ex");
        inflect.addSingularize("(matr)ices$", "$1ix");
        inflect.addSingularize("(quiz)zes$", "$1");

        inflect.addIrregular("person", "people");
        inflect.addIrregular("man", "men");
        inflect.addIrregular("child", "children");
        inflect.addIrregular("sex", "sexes");
        inflect.addIrregular("move", "moves");
        inflect.addIrregular("stadium", "stadiums");

        inflect.addUncountable("equipment", "information", "rice", "money", "species", "series", "fish", "sheep");
    }

}