001    /*
002     * JBoss, Home of Professional Open Source.
003     * Copyright 2008, Red Hat Middleware LLC, and individual contributors
004     * as indicated by the @author tags. See the copyright.txt file in the
005     * distribution for a full listing of individual contributors. 
006     *
007     * This is free software; you can redistribute it and/or modify it
008     * under the terms of the GNU Lesser General Public License as
009     * published by the Free Software Foundation; either version 2.1 of
010     * the License, or (at your option) any later version.
011     *
012     * This software is distributed in the hope that it will be useful,
013     * but WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * Lesser General Public License for more details.
016     *
017     * You should have received a copy of the GNU Lesser General Public
018     * License along with this software; if not, write to the Free
019     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
021     */
022    package org.jboss.dna.graph.properties;
023    
024    import java.io.Serializable;
025    import java.util.regex.Pattern;
026    import java.util.regex.PatternSyntaxException;
027    import net.jcip.annotations.Immutable;
028    import org.jboss.dna.common.util.CheckArg;
029    import org.jboss.dna.common.util.HashCode;
030    import org.jboss.dna.graph.GraphI18n;
031    
032    /**
033     * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to
034     * represent node paths or properties.
035     * <p>
036     * Path expressions consist of two parts: a selection criteria (or an input path) and an output path:
037     * </p>
038     * 
039     * <pre>
040     * inputPath =&gt; outputPath
041     * </pre>
042     * <p>
043     * The <i>inputPath</i> part defines an expression for the path of a node that is to be sequenced. Input paths consist of '
044     * <code>/</code>' separated segments, where each segment represents a pattern for a single node's name (including the
045     * same-name-sibling indexes) and '<code>@</code>' signifies a property name.
046     * </p>
047     * <p>
048     * Let's first look at some simple examples:
049     * </p>
050     * <table>
051     * <tr>
052     * <th>Input Path</th>
053     * <th>Description</th>
054     * </tr>
055     * <tr>
056     * <td>/a/b</td>
057     * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any
058     * same-name-sibilings.</td>
059     * </tr>
060     * <tr>
061     * <td>/a/*</td>
062     * <td>Match any child node of the top level node "<code>a</code>".</td>
063     * </tr>
064     * <tr>
065     * <td>/a/*.txt</td>
066     * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td>
067     * </tr>
068     * <tr>
069     * <td>/a/b@c</td>
070     * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td>
071     * </tr>
072     * <tr>
073     * <td>/a/b[2]</td>
074     * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td>
075     * </tr>
076     * <tr>
077     * <td>/a/b[2,3,4]</td>
078     * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td>
079     * </tr>
080     * <tr>
081     * <td>/a/b[*]</td>
082     * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td>
083     * </tr>
084     * <tr>
085     * <td>//a/b</td>
086     * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>"
087     * occurs. Again, neither node may have any same-name-sibilings.</td>
088     * </tr>
089     * </table>
090     * <p>
091     * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard
092     * character that matches any character or sequence of characters in a node's name (or index if appearing in between square
093     * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>").
094     * </p>
095     * <p>
096     * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You
097     * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no
098     * same-name-sibilings, or any positive number to match the specific same-name-sibling.
099     * </p>
100     * <p>
101     * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or
102     * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential
103     * slash characters are treated as two.
104     * </p>
105     * <p>
106     * Many input paths can be created using just these simple rules. However, input paths can be more complicated. Here are some more
107     * examples:
108     * </p>
109     * <table>
110     * <tr>
111     * <th>Input Path</th>
112     * <th>Description</th>
113     * </tr>
114     * <tr>
115     * <td>/a/(b|c|d)</td>
116     * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code>
117     * ". None of the nodes may have same-name-sibling indexes.</td>
118     * </tr>
119     * <tr>
120     * <td>/a/b[c/d]</td>
121     * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named "
122     * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code>
123     * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td>
124     * </tr>
125     * <tr>
126     * <td>/a(/(b|c|d|)/e)[f/g/@something]</td>
127     * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code>
128     * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>".
129     * None of the nodes may have same-name-sibling indexes.</td>
130     * </tr>
131     * </table>
132     * <p>
133     * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define
134     * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between
135     * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node "
136     * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the
137     * number used in the output path identifies the parentheses.
138     * </p>
139     * <p>
140     * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square
141     * brackets does not appear in the selected node.
142     * </p>
143     * 
144     * @author Randall Hauch
145     */
146    @Immutable
147    public class PathExpression implements Serializable {
148    
149        /**
150         * Initial version
151         */
152        private static final long serialVersionUID = 1L;
153    
154        /**
155         * Compile the supplied expression and return the resulting path expression instance.
156         * 
157         * @param expression the expression
158         * @return the path expression; never null
159         * @throws IllegalArgumentException if the expression is null
160         * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
161         */
162        public static final PathExpression compile( String expression ) throws InvalidPathExpressionException {
163            return new PathExpression(expression);
164        }
165    
166        private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\]
167        private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING);
168    
169        /**
170         * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates
171         * in group 1. Note that some predicates may be valid at the end but not valid elsewhere.
172         * <p>
173         * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties
174         * are allowed only as the last predicate. Predicates with any operators are unused.
175         * </p>
176         * <p>
177         * Nested predicates are not currently allowed.
178         * </p>
179         */
180        // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\])
181        private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])";
182        private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING);
183    
184        /**
185         * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the
186         * predicates to be removed in group 1.
187         */
188        // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\])
189        private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])";
190        private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING);
191    
192        private final String expression;
193        private final Pattern matchPattern;
194        private final Pattern selectPattern;
195    
196        /**
197         * Create the supplied expression.
198         * 
199         * @param expression the expression
200         * @throws IllegalArgumentException if the expression is null
201         * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression
202         */
203        public PathExpression( String expression ) throws InvalidPathExpressionException {
204            CheckArg.isNotNull(expression, "path expression");
205            this.expression = expression.trim();
206            if (this.expression.length() == 0) {
207                throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text());
208            }
209            // Build the match pattern, which determines whether a path matches the condition ...
210            String matchString = this.expression;
211            try {
212                matchString = removeUnusedPredicates(matchString);
213                matchString = replaceXPathPatterns(matchString);
214                this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE);
215            } catch (PatternSyntaxException e) {
216                String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression);
217                throw new InvalidPathExpressionException(msg, e);
218            }
219            // Build the select pattern, which determines the path that will be selected ...
220            String selectString = this.expression;
221            try {
222                selectString = removeAllPredicatesExceptIndexes(selectString);
223                selectString = replaceXPathPatterns(selectString);
224                selectString = "(" + selectString + ").*"; // group 1 will have selected path ...
225                this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE);
226            } catch (PatternSyntaxException e) {
227                String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression);
228                throw new InvalidPathExpressionException(msg, e);
229            }
230        }
231    
232        /**
233         * @return expression
234         */
235        public String getExpression() {
236            return expression;
237        }
238    
239        /**
240         * Replace certain XPath patterns that are not used or understood.
241         * 
242         * @param expression the input regular expressions string; may not be null
243         * @return the regular expression with all unused XPath patterns removed; never null
244         */
245        protected String removeUnusedPredicates( String expression ) {
246            assert expression != null;
247            java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression);
248            StringBuffer sb = new StringBuffer();
249            if (matcher.find()) {
250                do {
251                    // Remove those predicates that show up in group 1 ...
252                    String predicateStr = matcher.group(0);
253                    String unusablePredicateStr = matcher.group(1);
254                    if (unusablePredicateStr != null) {
255                        predicateStr = "";
256                    }
257                    matcher.appendReplacement(sb, predicateStr);
258                } while (matcher.find());
259                matcher.appendTail(sb);
260                expression = sb.toString();
261            }
262            return expression;
263        }
264    
265        /**
266         * Remove all XPath predicates from the supplied regular expression string.
267         * 
268         * @param expression the input regular expressions string; may not be null
269         * @return the regular expression with all XPath predicates removed; never null
270         */
271        protected String removeAllPredicatesExceptIndexes( String expression ) {
272            assert expression != null;
273            java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression);
274            StringBuffer sb = new StringBuffer();
275            if (matcher.find()) {
276                do {
277                    // Remove those predicates that show up in group 1 ...
278                    String predicateStr = matcher.group(0);
279                    String unusablePredicateStr = matcher.group(1);
280                    if (unusablePredicateStr != null) {
281                        predicateStr = "";
282                    }
283                    matcher.appendReplacement(sb, predicateStr);
284                } while (matcher.find());
285                matcher.appendTail(sb);
286                expression = sb.toString();
287            }
288            return expression;
289        }
290    
291        /**
292         * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions.
293         * 
294         * @param expression the input regular expressions string; may not be null
295         * @return the regular expression with XPath patterns replaced with regular expression fragments; never null
296         */
297        protected String replaceXPathPatterns( String expression ) {
298            assert expression != null;
299            // replace 2 or more sequential '|' characters in an OR expression
300            expression = expression.replaceAll("[\\|]{2,}", "|");
301            // if there is an empty expression in an OR expression, make the whole segment optional ...
302            // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d"
303            expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?");
304            expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?");
305            expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?");
306    
307            // // Allow any path (that doesn't contain an explicit counter) to contain a counter,
308            // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'...
309            // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1");
310    
311            // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)...
312            // '[*]/' => '(\[\d+\])?/'
313            expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
314            // '[]/' => '(\[\d+\])?/'
315            expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional
316            // '[0]/' => '(\[0\])?/'
317            expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional
318            // '[n]/' => '\[n\]/'
319            expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required
320    
321            // Change any other end predicates to not be wrapped by braces but to begin with a slash ...
322            // ...'[x]' => ...'/x'
323            expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1");
324    
325            // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]'
326            java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression);
327            StringBuffer sb = new StringBuffer();
328            boolean result = matcher.find();
329            if (result) {
330                do {
331                    String sequenceStr = matcher.group(1);
332                    boolean optional = false;
333                    if (sequenceStr.startsWith("0,")) {
334                        sequenceStr = sequenceStr.replaceFirst("^0,", "");
335                        optional = true;
336                    }
337                    if (sequenceStr.endsWith(",0")) {
338                        sequenceStr = sequenceStr.replaceFirst(",0$", "");
339                        optional = true;
340                    }
341                    if (sequenceStr.contains(",0,")) {
342                        sequenceStr = sequenceStr.replaceAll(",0,", ",");
343                        optional = true;
344                    }
345                    sequenceStr = sequenceStr.replaceAll(",", "|");
346                    String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]";
347                    if (optional) {
348                        replacement = "(?:" + replacement + ")?";
349                    }
350                    matcher.appendReplacement(sb, replacement);
351                    result = matcher.find();
352                } while (result);
353                matcher.appendTail(sb);
354                expression = sb.toString();
355            }
356    
357            // Order is important here
358            expression = expression.replaceAll("[*]([^/(\\\\])", "[^/$1]*$1"); // '*' not followed by '/', '\\', or '('
359            expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*");
360            expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//'
361            expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//'
362            return expression;
363        }
364    
365        /**
366         * @return the expression
367         */
368        public String getSelectExpression() {
369            return this.expression;
370        }
371    
372        /**
373         * {@inheritDoc}
374         */
375        @Override
376        public int hashCode() {
377            return this.expression.hashCode();
378        }
379    
380        /**
381         * {@inheritDoc}
382         */
383        @Override
384        public boolean equals( Object obj ) {
385            if (obj == this) return true;
386            if (obj instanceof PathExpression) {
387                PathExpression that = (PathExpression)obj;
388                if (!this.expression.equalsIgnoreCase(that.expression)) return false;
389                return true;
390            }
391            return false;
392        }
393    
394        /**
395         * {@inheritDoc}
396         */
397        @Override
398        public String toString() {
399            return this.expression;
400        }
401    
402        /**
403         * @param absolutePath
404         * @return the matcher
405         */
406        public Matcher matcher( String absolutePath ) {
407            // Determine if the input path match the select expression ...
408            String originalAbsolutePath = absolutePath;
409            // if (!absolutePath.endsWith("/")) absolutePath = absolutePath + "/";
410            // Remove all trailing '/' ...
411            absolutePath = absolutePath.replaceAll("/+$", "");
412    
413            // See if the supplied absolute path matches the pattern ...
414            final java.util.regex.Matcher matcher = this.matchPattern.matcher(absolutePath);
415            if (!matcher.matches()) {
416                // No match, so return immediately ...
417                return new Matcher(matcher, originalAbsolutePath, null);
418            }
419    
420            // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ...
421            final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(absolutePath);
422            if (!selectMatcher.matches()) {
423                // Nothing can be selected, so return immediately ...
424                return new Matcher(matcher, null, null);
425            }
426            // Grab the selected path ...
427            String selectedPath = selectMatcher.group(1);
428    
429            // Remove the trailing '/@property' ...
430            selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", "");
431    
432            return new Matcher(matcher, originalAbsolutePath, selectedPath);
433        }
434    
435        @Immutable
436        public static class Matcher {
437    
438            private final String inputPath;
439            private final String selectedPath;
440            private final java.util.regex.Matcher inputMatcher;
441            private final int hc;
442    
443            protected Matcher( java.util.regex.Matcher inputMatcher,
444                               String inputPath,
445                               String selectedPath ) {
446                this.inputMatcher = inputMatcher;
447                this.inputPath = inputPath;
448                this.selectedPath = selectedPath;
449                this.hc = HashCode.compute(this.inputPath, this.selectedPath);
450            }
451    
452            public boolean matches() {
453                return this.selectedPath != null;
454            }
455    
456            /**
457             * @return inputPath
458             */
459            public String getInputPath() {
460                return this.inputPath;
461            }
462    
463            /**
464             * @return selectPattern
465             */
466            public String getSelectedNodePath() {
467                return this.selectedPath;
468            }
469    
470            public int groupCount() {
471                return this.inputMatcher.groupCount();
472            }
473    
474            public String group( int groupNumber ) {
475                return this.inputMatcher.group(groupNumber);
476            }
477    
478            /**
479             * {@inheritDoc}
480             */
481            @Override
482            public int hashCode() {
483                return this.hc;
484            }
485    
486            /**
487             * {@inheritDoc}
488             */
489            @Override
490            public boolean equals( Object obj ) {
491                if (obj == this) return true;
492                if (obj instanceof PathExpression.Matcher) {
493                    PathExpression.Matcher that = (PathExpression.Matcher)obj;
494                    if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false;
495                    if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false;
496                    return true;
497                }
498                return false;
499            }
500    
501            /**
502             * {@inheritDoc}
503             */
504            @Override
505            public String toString() {
506                return this.selectedPath;
507            }
508        }
509    
510        /**
511         * Regular expression used to determine if the expression matches any single-level wildcard.
512         */
513        // /*(?:[*.](?:\[\*?\])?/*)*
514        private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*";
515        private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING);
516    
517        /**
518         * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes ("
519         * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard ("
520         * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also
521         * considered to match anything.
522         * 
523         * @return true if the expression matches anything, or false otherwise
524         */
525        public boolean matchesAnything() {
526            return ANYTHING_PATTERN.matcher(expression).matches();
527        }
528    
529        public static PathExpression all() {
530            return ALL_PATHS_EXPRESSION;
531        }
532    
533        private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//");
534    
535    }