001 /* 002 * JBoss, Home of Professional Open Source. 003 * Copyright 2008, Red Hat Middleware LLC, and individual contributors 004 * as indicated by the @author tags. See the copyright.txt file in the 005 * distribution for a full listing of individual contributors. 006 * 007 * This is free software; you can redistribute it and/or modify it 008 * under the terms of the GNU Lesser General Public License as 009 * published by the Free Software Foundation; either version 2.1 of 010 * the License, or (at your option) any later version. 011 * 012 * This software is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 * Lesser General Public License for more details. 016 * 017 * You should have received a copy of the GNU Lesser General Public 018 * License along with this software; if not, write to the Free 019 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 020 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 021 */ 022 package org.jboss.dna.graph.properties; 023 024 import java.io.Serializable; 025 import java.util.regex.Pattern; 026 import java.util.regex.PatternSyntaxException; 027 import net.jcip.annotations.Immutable; 028 import org.jboss.dna.common.util.CheckArg; 029 import org.jboss.dna.common.util.HashCode; 030 import org.jboss.dna.graph.GraphI18n; 031 032 /** 033 * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to 034 * represent node paths or properties. 035 * <p> 036 * Path expressions consist of two parts: a selection criteria (or an input path) and an output path: 037 * </p> 038 * 039 * <pre> 040 * inputPath => outputPath 041 * </pre> 042 * <p> 043 * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of ' 044 * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the 045 * same-name-sibling indexes) and '<code>@</code>' signifies a property name. 046 * </p> 047 * <p> 048 * Let's first look at some simple examples: 049 * </p> 050 * <table> 051 * <tr> 052 * <th>Input Path</th> 053 * <th>Description</th> 054 * </tr> 055 * <tr> 056 * <td>/a/b</td> 057 * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any 058 * same-name-sibilings.</td> 059 * </tr> 060 * <tr> 061 * <td>/a/*</td> 062 * <td>Match any child node of the top level node "<code>a</code>".</td> 063 * </tr> 064 * <tr> 065 * <td>/a/*.txt</td> 066 * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td> 067 * </tr> 068 * <tr> 069 * <td>/a/b@c</td> 070 * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td> 071 * </tr> 072 * <tr> 073 * <td>/a/b[2]</td> 074 * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td> 075 * </tr> 076 * <tr> 077 * <td>/a/b[2,3,4]</td> 078 * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td> 079 * </tr> 080 * <tr> 081 * <td>/a/b[*]</td> 082 * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td> 083 * </tr> 084 * <tr> 085 * <td>//a/b</td> 086 * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>" 087 * occurs. Again, neither node may have any same-name-sibilings.</td> 088 * </tr> 089 * </table> 090 * <p> 091 * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard 092 * character that matches any character or sequence of characters in a node's name (or index if appearing in between square 093 * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>"). 094 * </p> 095 * <p> 096 * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You 097 * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no 098 * same-name-sibilings, or any positive number to match the specific same-name-sibling. 099 * </p> 100 * <p> 101 * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or 102 * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential 103 * slash characters are treated as two. 104 * </p> 105 * <p> 106 * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more 107 * examples: 108 * </p> 109 * <table> 110 * <tr> 111 * <th>Input Path</th> 112 * <th>Description</th> 113 * </tr> 114 * <tr> 115 * <td>/a/(b|c|d)</td> 116 * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code> 117 * ". None of the nodes may have same-name-sibling indexes.</td> 118 * </tr> 119 * <tr> 120 * <td>/a/b[c/d]</td> 121 * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named " 122 * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code> 123 * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td> 124 * </tr> 125 * <tr> 126 * <td>/a(/(b|c|d|)/e)[f/g/@something]</td> 127 * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code> 128 * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>". 129 * None of the nodes may have same-name-sibling indexes.</td> 130 * </tr> 131 * </table> 132 * <p> 133 * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define 134 * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between 135 * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node " 136 * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the 137 * number used in the output path identifies the parentheses. 138 * </p> 139 * <p> 140 * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square 141 * brackets does not appear in the selected node. 142 * </p> 143 * 144 * @author Randall Hauch 145 */ 146 @Immutable 147 public class PathExpression implements Serializable { 148 149 /** 150 * Initial version 151 */ 152 private static final long serialVersionUID = 1L; 153 154 /** 155 * Compile the supplied expression and return the resulting path expression instance. 156 * 157 * @param expression the expression 158 * @return the path expression; never null 159 * @throws IllegalArgumentException if the expression is null 160 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression 161 */ 162 public static final PathExpression compile( String expression ) throws InvalidPathExpressionException { 163 return new PathExpression(expression); 164 } 165 166 private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\] 167 private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING); 168 169 /** 170 * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates 171 * in group 1. Note that some predicates may be valid at the end but not valid elsewhere. 172 * <p> 173 * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties 174 * are allowed only as the last predicate. Predicates with any operators are unused. 175 * </p> 176 * <p> 177 * Nested predicates are not currently allowed. 178 * </p> 179 */ 180 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\]) 181 private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])"; 182 private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING); 183 184 /** 185 * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the 186 * predicates to be removed in group 1. 187 */ 188 // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\]) 189 private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])"; 190 private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING); 191 192 private final String expression; 193 private final Pattern matchPattern; 194 private final Pattern selectPattern; 195 196 /** 197 * Create the supplied expression. 198 * 199 * @param expression the expression 200 * @throws IllegalArgumentException if the expression is null 201 * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression 202 */ 203 public PathExpression( String expression ) throws InvalidPathExpressionException { 204 CheckArg.isNotNull(expression, "path expression"); 205 this.expression = expression.trim(); 206 if (this.expression.length() == 0) { 207 throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text()); 208 } 209 // Build the match pattern, which determines whether a path matches the condition ... 210 String matchString = this.expression; 211 try { 212 matchString = removeUnusedPredicates(matchString); 213 matchString = replaceXPathPatterns(matchString); 214 this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE); 215 } catch (PatternSyntaxException e) { 216 String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression); 217 throw new InvalidPathExpressionException(msg, e); 218 } 219 // Build the select pattern, which determines the path that will be selected ... 220 String selectString = this.expression; 221 try { 222 selectString = removeAllPredicatesExceptIndexes(selectString); 223 selectString = replaceXPathPatterns(selectString); 224 selectString = "(" + selectString + ").*"; // group 1 will have selected path ... 225 this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE); 226 } catch (PatternSyntaxException e) { 227 String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression); 228 throw new InvalidPathExpressionException(msg, e); 229 } 230 } 231 232 /** 233 * @return expression 234 */ 235 public String getExpression() { 236 return expression; 237 } 238 239 /** 240 * Replace certain XPath patterns that are not used or understood. 241 * 242 * @param expression the input regular expressions string; may not be null 243 * @return the regular expression with all unused XPath patterns removed; never null 244 */ 245 protected String removeUnusedPredicates( String expression ) { 246 assert expression != null; 247 java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression); 248 StringBuffer sb = new StringBuffer(); 249 if (matcher.find()) { 250 do { 251 // Remove those predicates that show up in group 1 ... 252 String predicateStr = matcher.group(0); 253 String unusablePredicateStr = matcher.group(1); 254 if (unusablePredicateStr != null) { 255 predicateStr = ""; 256 } 257 matcher.appendReplacement(sb, predicateStr); 258 } while (matcher.find()); 259 matcher.appendTail(sb); 260 expression = sb.toString(); 261 } 262 return expression; 263 } 264 265 /** 266 * Remove all XPath predicates from the supplied regular expression string. 267 * 268 * @param expression the input regular expressions string; may not be null 269 * @return the regular expression with all XPath predicates removed; never null 270 */ 271 protected String removeAllPredicatesExceptIndexes( String expression ) { 272 assert expression != null; 273 java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression); 274 StringBuffer sb = new StringBuffer(); 275 if (matcher.find()) { 276 do { 277 // Remove those predicates that show up in group 1 ... 278 String predicateStr = matcher.group(0); 279 String unusablePredicateStr = matcher.group(1); 280 if (unusablePredicateStr != null) { 281 predicateStr = ""; 282 } 283 matcher.appendReplacement(sb, predicateStr); 284 } while (matcher.find()); 285 matcher.appendTail(sb); 286 expression = sb.toString(); 287 } 288 return expression; 289 } 290 291 /** 292 * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions. 293 * 294 * @param expression the input regular expressions string; may not be null 295 * @return the regular expression with XPath patterns replaced with regular expression fragments; never null 296 */ 297 protected String replaceXPathPatterns( String expression ) { 298 assert expression != null; 299 // replace 2 or more sequential '|' characters in an OR expression 300 expression = expression.replaceAll("[\\|]{2,}", "|"); 301 // if there is an empty expression in an OR expression, make the whole segment optional ... 302 // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d" 303 expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?"); 304 expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?"); 305 expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?"); 306 307 // // Allow any path (that doesn't contain an explicit counter) to contain a counter, 308 // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'... 309 // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1"); 310 311 // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)... 312 // '[*]/' => '(\[\d+\])?/' 313 expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional 314 // '[]/' => '(\[\d+\])?/' 315 expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional 316 // '[0]/' => '(\[0\])?/' 317 expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional 318 // '[n]/' => '\[n\]/' 319 expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required 320 321 // Change any other end predicates to not be wrapped by braces but to begin with a slash ... 322 // ...'[x]' => ...'/x' 323 expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1"); 324 325 // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]' 326 java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression); 327 StringBuffer sb = new StringBuffer(); 328 boolean result = matcher.find(); 329 if (result) { 330 do { 331 String sequenceStr = matcher.group(1); 332 boolean optional = false; 333 if (sequenceStr.startsWith("0,")) { 334 sequenceStr = sequenceStr.replaceFirst("^0,", ""); 335 optional = true; 336 } 337 if (sequenceStr.endsWith(",0")) { 338 sequenceStr = sequenceStr.replaceFirst(",0$", ""); 339 optional = true; 340 } 341 if (sequenceStr.contains(",0,")) { 342 sequenceStr = sequenceStr.replaceAll(",0,", ","); 343 optional = true; 344 } 345 sequenceStr = sequenceStr.replaceAll(",", "|"); 346 String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]"; 347 if (optional) { 348 replacement = "(?:" + replacement + ")?"; 349 } 350 matcher.appendReplacement(sb, replacement); 351 result = matcher.find(); 352 } while (result); 353 matcher.appendTail(sb); 354 expression = sb.toString(); 355 } 356 357 // Order is important here 358 expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '(' 359 expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*"); 360 expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//' 361 expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//' 362 return expression; 363 } 364 365 /** 366 * @return the expression 367 */ 368 public String getSelectExpression() { 369 return this.expression; 370 } 371 372 /** 373 * {@inheritDoc} 374 */ 375 @Override 376 public int hashCode() { 377 return this.expression.hashCode(); 378 } 379 380 /** 381 * {@inheritDoc} 382 */ 383 @Override 384 public boolean equals( Object obj ) { 385 if (obj == this) return true; 386 if (obj instanceof PathExpression) { 387 PathExpression that = (PathExpression)obj; 388 if (!this.expression.equalsIgnoreCase(that.expression)) return false; 389 return true; 390 } 391 return false; 392 } 393 394 /** 395 * {@inheritDoc} 396 */ 397 @Override 398 public String toString() { 399 return this.expression; 400 } 401 402 /** 403 * @param absolutePath 404 * @return the matcher 405 */ 406 public Matcher matcher( String absolutePath ) { 407 // Determine if the input path match the select expression ... 408 String originalAbsolutePath = absolutePath; 409 // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/"; 410 // Remove all trailing '/' ... 411 absolutePath = absolutePath.replaceAll("/+$", ""); 412 413 // See if the supplied absolute path matches the pattern ... 414 final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath); 415 if (!matcher.matches()) { 416 // No match, so return immediately ... 417 return new Matcher(matcher, originalAbsolutePath, null); 418 } 419 420 // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ... 421 final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath); 422 if (!selectMatcher.matches()) { 423 // Nothing can be selected, so return immediately ... 424 return new Matcher(matcher, null, null); 425 } 426 // Grab the selected path ... 427 String selectedPath = selectMatcher.group(1); 428 429 // Remove the trailing '/@property' ... 430 selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", ""); 431 432 return new Matcher(matcher, originalAbsolutePath, selectedPath); 433 } 434 435 @Immutable 436 public static class Matcher { 437 438 private final String inputPath; 439 private final String selectedPath; 440 private final java.util.regex.Matcher inputMatcher; 441 private final int hc; 442 443 protected Matcher( java.util.regex.Matcher inputMatcher, 444 String inputPath, 445 String selectedPath ) { 446 this.inputMatcher = inputMatcher; 447 this.inputPath = inputPath; 448 this.selectedPath = selectedPath; 449 this.hc = HashCode.compute(this.inputPath, this.selectedPath); 450 } 451 452 public boolean matches() { 453 return this.selectedPath != null; 454 } 455 456 /** 457 * @return inputPath 458 */ 459 public String getInputPath() { 460 return this.inputPath; 461 } 462 463 /** 464 * @return selectPattern 465 */ 466 public String getSelectedNodePath() { 467 return this.selectedPath; 468 } 469 470 public int groupCount() { 471 return this.inputMatcher.groupCount(); 472 } 473 474 public String group( int groupNumber ) { 475 return this.inputMatcher.group(groupNumber); 476 } 477 478 /** 479 * {@inheritDoc} 480 */ 481 @Override 482 public int hashCode() { 483 return this.hc; 484 } 485 486 /** 487 * {@inheritDoc} 488 */ 489 @Override 490 public boolean equals( Object obj ) { 491 if (obj == this) return true; 492 if (obj instanceof PathExpression.Matcher) { 493 PathExpression.Matcher that = (PathExpression.Matcher)obj; 494 if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false; 495 if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false; 496 return true; 497 } 498 return false; 499 } 500 501 /** 502 * {@inheritDoc} 503 */ 504 @Override 505 public String toString() { 506 return this.selectedPath; 507 } 508 } 509 510 /** 511 * Regular expression used to determine if the expression matches any single-level wildcard. 512 */ 513 // /*(?:[*.](?:\[\*?\])?/*)* 514 private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*"; 515 private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING); 516 517 /** 518 * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes (" 519 * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard (" 520 * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also 521 * considered to match anything. 522 * 523 * @return true if the expression matches anything, or false otherwise 524 */ 525 public boolean matchesAnything() { 526 return ANYTHING_PATTERN.matcher(expression).matches(); 527 } 528 529 public static PathExpression all() { 530 return ALL_PATHS_EXPRESSION; 531 } 532 533 private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//"); 534 535 }