001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006     * See the AUTHORS.txt file in the distribution for a full listing of 
007     * individual contributors. 
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    package org.jboss.dna.sequencer.xml;
025    
026    import java.util.ArrayList;
027    import java.util.HashMap;
028    import java.util.LinkedList;
029    import java.util.List;
030    import java.util.Map;
031    import org.jboss.dna.common.collection.Problems;
032    import org.jboss.dna.common.text.TextDecoder;
033    import org.jboss.dna.common.text.XmlNameEncoder;
034    import org.jboss.dna.common.util.CheckArg;
035    import org.jboss.dna.common.util.StringUtil;
036    import org.jboss.dna.graph.ExecutionContext;
037    import org.jboss.dna.graph.JcrLexicon;
038    import org.jboss.dna.graph.property.Name;
039    import org.jboss.dna.graph.property.NameFactory;
040    import org.jboss.dna.graph.property.NamespaceRegistry;
041    import org.jboss.dna.graph.property.Path;
042    import org.jboss.dna.graph.property.PathFactory;
043    import org.jboss.dna.graph.property.PropertyFactory;
044    import org.jboss.dna.graph.property.ValueFormatException;
045    import org.jboss.dna.graph.property.basic.LocalNamespaceRegistry;
046    import org.jboss.dna.graph.sequencer.StreamSequencerContext;
047    import org.jboss.dna.graph.sequencer.SequencerOutput;
048    import org.xml.sax.Attributes;
049    import org.xml.sax.SAXParseException;
050    import org.xml.sax.ext.DefaultHandler2;
051    
052    /**
053     * @author Randall Hauch
054     */
055    public class XmlSequencerHandler extends DefaultHandler2 {
056    
057        private final SequencerOutput output;
058        private final StreamSequencerContext context;
059    
060        /**
061         * Decoder for XML names, to turn '_xHHHH_' sequences in the XML element and attribute names into the corresponding UTF-16
062         * characters.
063         */
064        public static TextDecoder DEFAULT_DECODER = new XmlNameEncoder();
065    
066        /**
067         * The default {@link XmlSequencer.AttributeScoping}.
068         */
069        public static XmlSequencer.AttributeScoping DEFAULT_ATTRIBUTE_SCOPING = XmlSequencer.AttributeScoping.USE_DEFAULT_NAMESPACE;
070    
071        /**
072         * The name of the attribute that should be used for the node name.
073         */
074        protected final Name nameAttribute;
075    
076        /**
077         * The default primary type.
078         */
079        protected final Name defaultPrimaryType;
080    
081        /**
082         * The cached reference to the graph's path factory.
083         */
084        protected final PathFactory pathFactory;
085    
086        /**
087         * The cached reference to the graph's name factory.
088         */
089        protected final NameFactory nameFactory;
090    
091        /**
092         * The cached reference to the graph's property factory.
093         */
094        protected final PropertyFactory propertyFactory;
095    
096        /**
097         * The cached reference to the graph's namespace registry.
098         */
099        protected final NamespaceRegistry namespaceRegistry;
100    
101        /**
102         * The TextDecoder that is used to decode the names.
103         */
104        protected final TextDecoder decoder;
105    
106        /**
107         * The stack of prefixes for each namespace, which is used to keep the {@link #namespaceRegistry local namespace registry} in
108         * sync with the namespaces in the XML document.
109         */
110        private final Map<String, LinkedList<String>> prefixStackByUri = new HashMap<String, LinkedList<String>>();
111    
112        private final XmlSequencer.AttributeScoping attributeScoping;
113    
114        /**
115         * The path for the node representing the current element. This starts out as the path supplied by the constructor, and never
116         * is shorter than that initial path.
117         */
118        protected Path currentPath;
119    
120        // Recursive map used to track the number of occurrences of names for elements under a particular path
121        private Map<Name, List<IndexedName>> nameToIndexedNamesMap = new HashMap<Name, List<IndexedName>>();
122    
123        // The stack of recursive maps being processed, with the head entry being the map for the current path
124        private final LinkedList<Map<Name, List<IndexedName>>> nameToIndexedNamesMapStack = new LinkedList<Map<Name, List<IndexedName>>>();
125    
126        private String currentEntityName;
127        private StringBuilder cDataContent;
128        private StringBuilder contentBuilder;
129        private final Problems problems;
130        private final Map<String, String> entityValues = new HashMap<String, String>();
131    
132        /**
133         * @param output
134         * @param context
135         * @param nameAttribute
136         * @param defaultPrimaryType
137         * @param textDecoder
138         * @param scoping
139         */
140        XmlSequencerHandler( SequencerOutput output,
141                             StreamSequencerContext context,
142                             Name nameAttribute,
143                             Name defaultPrimaryType,
144                             TextDecoder textDecoder,
145                             XmlSequencer.AttributeScoping scoping ) {
146            CheckArg.isNotNull(output, "output");
147            CheckArg.isNotNull(context, "context");
148    
149            // Use the execution context ...
150            this.output = output;
151            this.context = context;
152            this.problems = context.getProblems();
153            assert this.problems != null;
154    
155            this.nameAttribute = nameAttribute;
156            this.defaultPrimaryType = defaultPrimaryType;
157            this.decoder = textDecoder != null ? textDecoder : DEFAULT_DECODER;
158            this.attributeScoping = scoping != null ? scoping : DEFAULT_ATTRIBUTE_SCOPING;
159    
160            // Set up a local namespace registry that is kept in sync with the namespaces found in this XML document ...
161            NamespaceRegistry namespaceRegistry = new LocalNamespaceRegistry(this.context.getNamespaceRegistry());
162            final ExecutionContext localContext = this.context.with(namespaceRegistry);
163    
164            // Set up references to frequently-used objects in the context ...
165            this.nameFactory = localContext.getValueFactories().getNameFactory();
166            this.pathFactory = localContext.getValueFactories().getPathFactory();
167            this.propertyFactory = localContext.getPropertyFactory();
168            this.namespaceRegistry = localContext.getNamespaceRegistry();
169            assert this.nameFactory != null;
170            assert this.pathFactory != null;
171            assert this.propertyFactory != null;
172            assert this.namespaceRegistry != null;
173    
174            // Set up the initial path ...
175            this.currentPath = this.pathFactory.createRelativePath();
176            assert this.currentPath != null;
177        }
178    
179        private void startNode( Name name ) {
180            // Check if content still needs to be output
181            if (contentBuilder != null) endContent();
182            // Add name to list of indexed names for this element to ensure we use the correct index (which is the size of the
183            // list)
184            List<IndexedName> indexedNames = nameToIndexedNamesMap.get(name);
185            if (indexedNames == null) {
186                indexedNames = new ArrayList<IndexedName>();
187                nameToIndexedNamesMap.put(name, indexedNames);
188            }
189            IndexedName indexedName = new IndexedName();
190            indexedNames.add(indexedName);
191            // Add element name and the appropriate index to the path.
192            // Per the JCR spec, the index must be relative to same-name sibling nodes
193            currentPath = pathFactory.create(currentPath, name, indexedNames.size()).getNormalizedPath();
194            // currentPath = currentPath.getNormalizedPath();
195            // Add the indexed name map to the stack and set the current map to the new element's map
196            nameToIndexedNamesMapStack.addFirst(nameToIndexedNamesMap);
197            nameToIndexedNamesMap = indexedName.nameToIndexedNamesMap;
198        }
199    
200        private void endNode() {
201            // Recover parent's path, namespace, and indexedName map, clearing the ended element's map to free memory
202            currentPath = currentPath.getParent();
203            currentPath = currentPath.getNormalizedPath();
204            nameToIndexedNamesMap.clear();
205            nameToIndexedNamesMap = nameToIndexedNamesMapStack.removeFirst();
206        }
207    
208        /**
209         * See if there is any element content that needs to be completed.
210         */
211        protected void endContent() {
212            // Process the content of the element ...
213            String content = StringUtil.normalize(contentBuilder.toString());
214            // Null-out builder to setup for subsequent content.
215            // Must be done before call to startElement below to prevent infinite loop.
216            contentBuilder = null;
217            // Skip if nothing in content but whitespace
218            if (content.length() > 0) {
219                // Create separate node for each content entry since entries can be interspersed amongst child elements
220                startNode(DnaXmlLexicon.ELEMENT_CONTENT);
221                output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.ELEMENT_CONTENT);
222                output.setProperty(currentPath, DnaXmlLexicon.ELEMENT_CONTENT, content);
223                endNode();
224            }
225        }
226    
227        /**
228         * <p>
229         * {@inheritDoc}
230         * </p>
231         * 
232         * @see org.xml.sax.helpers.DefaultHandler#startDocument()
233         */
234        @Override
235        public void startDocument() {
236            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.DOCUMENT);
237        }
238    
239        /**
240         * <p>
241         * {@inheritDoc}
242         * </p>
243         * 
244         * @see org.xml.sax.ext.DefaultHandler2#startDTD(java.lang.String, java.lang.String, java.lang.String)
245         */
246        @Override
247        public void startDTD( String name,
248                              String publicId,
249                              String systemId ) {
250            output.setProperty(currentPath, DnaDtdLexicon.NAME, name);
251            output.setProperty(currentPath, DnaDtdLexicon.PUBLIC_ID, publicId);
252            output.setProperty(currentPath, DnaDtdLexicon.SYSTEM_ID, systemId);
253        }
254    
255        /**
256         * <p>
257         * {@inheritDoc}
258         * </p>
259         * 
260         * @see org.xml.sax.ext.DefaultHandler2#externalEntityDecl(java.lang.String, java.lang.String, java.lang.String)
261         */
262        @Override
263        public void externalEntityDecl( String name,
264                                        String publicId,
265                                        String systemId ) {
266            // Add "synthetic" entity container to path to help prevent name collisions with XML elements
267            startNode(DnaDtdLexicon.ENTITY);
268            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaDtdLexicon.ENTITY);
269            output.setProperty(currentPath, DnaDtdLexicon.NAME, name);
270            if (publicId != null) output.setProperty(currentPath, DnaDtdLexicon.PUBLIC_ID, publicId);
271            if (systemId != null) output.setProperty(currentPath, DnaDtdLexicon.SYSTEM_ID, systemId);
272            endNode();
273        }
274    
275        /**
276         * {@inheritDoc}
277         * 
278         * @see org.xml.sax.ext.DefaultHandler2#internalEntityDecl(java.lang.String, java.lang.String)
279         */
280        @Override
281        public void internalEntityDecl( String name,
282                                        String value ) {
283            // Add "synthetic" entity container to path to help prevent name collisions with XML elements
284            startNode(DnaDtdLexicon.ENTITY);
285            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaDtdLexicon.ENTITY);
286            output.setProperty(currentPath, DnaDtdLexicon.NAME, name);
287            output.setProperty(currentPath, DnaDtdLexicon.VALUE, value);
288            // Record the name/value pair ...
289            entityValues.put(name, value);
290            endNode();
291        }
292    
293        /**
294         * <p>
295         * {@inheritDoc}
296         * </p>
297         * 
298         * @see org.xml.sax.helpers.DefaultHandler#processingInstruction(java.lang.String, java.lang.String)
299         */
300        @Override
301        public void processingInstruction( String target,
302                                           String data ) {
303            // Output separate nodes for each instruction since multiple are allowed
304            startNode(DnaXmlLexicon.PROCESSING_INSTRUCTION);
305            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.PROCESSING_INSTRUCTION);
306            output.setProperty(currentPath, DnaXmlLexicon.TARGET, target.trim());
307            if (data != null) {
308                output.setProperty(currentPath, DnaXmlLexicon.PROCESSING_INSTRUCTION_CONTENT, data.trim());
309            }
310            endNode();
311        }
312    
313        /**
314         * {@inheritDoc}
315         * <p>
316         * This method ensures that the namespace is registered with the {@link NamespaceRegistry registry}, using the supplied prefix
317         * to register the namespace if required. Note that because this class does not really use the namespace prefixes to create
318         * {@link Name} objects, no attempt is made to match the XML namespace prefixes.
319         * </p>
320         * 
321         * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
322         */
323        @Override
324        public void startPrefixMapping( String prefix,
325                                        String uri ) {
326            assert uri != null;
327            // Add the prefix to the stack ...
328            LinkedList<String> prefixStack = this.prefixStackByUri.get(uri);
329            if (prefixStack == null) {
330                prefixStack = new LinkedList<String>();
331                this.prefixStackByUri.put(uri, prefixStack);
332            }
333            prefixStack.addFirst(prefix);
334    
335            // If the namespace is already registered, then we'll have to register it in the context's registry, too.
336            if (!namespaceRegistry.isRegisteredNamespaceUri(uri)) {
337                // The namespace is not already registered (locally or in the context's registry), so we have to
338                // register it with the context's registry (which the local register then inherits).
339                NamespaceRegistry contextRegistry = context.getNamespaceRegistry();
340                if (contextRegistry.getNamespaceForPrefix(prefix) != null) {
341                    // The prefix is already bound, so register and generate a unique prefix
342                    context.getNamespaceRegistry().getPrefixForNamespaceUri(uri, true);
343                    // Now register locally with the supplied prefix ...
344                    namespaceRegistry.register(prefix, uri);
345                } else {
346                    context.getNamespaceRegistry().register(prefix, uri);
347                }
348            } else {
349                // It is already registered, but re-register it locally using the supplied prefix ...
350                namespaceRegistry.register(prefix, uri);
351            }
352        }
353    
354        /**
355         * {@inheritDoc}
356         * 
357         * @see org.xml.sax.helpers.DefaultHandler#endPrefixMapping(java.lang.String)
358         */
359        @Override
360        public void endPrefixMapping( String prefix ) {
361            assert prefix != null;
362            // Get the current URI for this prefix ...
363            String uri = namespaceRegistry.getNamespaceForPrefix(prefix);
364            assert uri != null;
365    
366            // Get the previous prefix from the stack ...
367            LinkedList<String> prefixStack = this.prefixStackByUri.get(uri);
368            assert prefixStack != null;
369            assert !prefixStack.isEmpty();
370            String existingPrefix = prefixStack.removeFirst();
371            assert prefix.equals(existingPrefix);
372    
373            // If there are no previous prefixes, then remove the mapping ...
374            if (prefixStack.isEmpty()) {
375                namespaceRegistry.unregister(uri);
376                prefixStackByUri.remove(uri);
377            } else {
378                String previous = prefixStack.getFirst();
379                namespaceRegistry.register(previous, uri);
380            }
381        }
382    
383        /**
384         * <p>
385         * {@inheritDoc}
386         * </p>
387         * 
388         * @see org.xml.sax.ext.DefaultHandler2#startEntity(java.lang.String)
389         */
390        @Override
391        public void startEntity( String name ) {
392            // Record that we've started an entity by capturing the name of the entity ...
393            currentEntityName = name;
394        }
395    
396        /**
397         * <p>
398         * {@inheritDoc}
399         * </p>
400         * 
401         * @see org.xml.sax.ext.DefaultHandler2#endEntity(java.lang.String)
402         */
403        @Override
404        public void endEntity( String name ) {
405            // currentEntityName is nulled in 'characters(...)', not here.
406            // See DNA-231 for an issue related to this
407        }
408    
409        /**
410         * <p>
411         * {@inheritDoc}
412         * </p>
413         * 
414         * @see org.xml.sax.ext.DefaultHandler2#startCDATA()
415         */
416        @Override
417        public void startCDATA() {
418            // CDATA sections can start in the middle of element content, so there may already be some
419            // element content already processed ...
420            if (contentBuilder != null) endContent();
421    
422            // Prepare builder for concatenating consecutive lines of CDATA
423            cDataContent = new StringBuilder();
424        }
425    
426        /**
427         * {@inheritDoc}
428         * 
429         * @see org.xml.sax.ext.DefaultHandler2#endCDATA()
430         */
431        @Override
432        public void endCDATA() {
433            // Output CDATA built in characters() method
434            startNode(DnaXmlLexicon.CDATA);
435            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, defaultPrimaryType);
436            output.setProperty(currentPath, DnaXmlLexicon.CDATA_CONTENT, cDataContent.toString());
437            endNode();
438            // Null-out builder to free memory
439            cDataContent = null;
440        }
441    
442        /**
443         * {@inheritDoc}
444         * 
445         * @see org.xml.sax.helpers.DefaultHandler#characters(char[], int, int)
446         */
447        @Override
448        public void characters( char[] ch,
449                                int start,
450                                int length ) {
451            String content = String.valueOf(ch, start, length);
452            if (cDataContent != null) {
453                // Processing the characters in the CDATA, so add to the builder
454                cDataContent.append(ch, start, length);
455                // Text within builder will be output at the end of CDATA
456            } else {
457                if (contentBuilder == null) {
458                    // This is the first line of content, so we have to create the StringBuilder ...
459                    contentBuilder = new StringBuilder();
460                }
461                if (currentEntityName != null) {
462                    // This is an entity reference, so rather than use the entity value characters (the content passed
463                    // into this method), we want to keep the entity reference ...
464                    contentBuilder.append('&').append(currentEntityName).append(';');
465    
466                    // Normally, 'characters' is called with just the entity replacement characters,
467                    // and is called between 'startEntity' and 'endEntity'. However, per DNA-231, some JVMs
468                    // use an incorrect ordering: 'startEntity', 'endEntity' and then 'characters', and the
469                    // content passed to the 'characters' call not only includes the entity replacement characters
470                    // followed by other content. Look for this condition ...
471                    String entityValue = entityValues.get(currentEntityName);
472                    if (!content.equals(entityValue) && entityValue != null && entityValue.length() < content.length()) {
473                        // Per DNA-231, there's extra content after the entity value. So replace the entity value in the
474                        // content with the entity reference (not the replacement characters), and add the extra content ...
475                        String extraContent = content.substring(entityValue.length());
476                        contentBuilder.append(extraContent);
477                    }
478                    // We're done reading the entity characters, so null it out
479                    currentEntityName = null;
480                } else {
481                    // Just append the content normally ...
482                    contentBuilder.append(content);
483                }
484                // Text within builder will be output when another element or CDATA is encountered
485            }
486        }
487    
488        /**
489         * {@inheritDoc}
490         * 
491         * @see org.xml.sax.ext.DefaultHandler2#comment(char[], int, int)
492         */
493        @Override
494        public void comment( char[] ch,
495                             int start,
496                             int length ) {
497            // Output separate nodes for each comment since multiple are allowed
498            startNode(DnaXmlLexicon.COMMENT);
499            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.COMMENT);
500            output.setProperty(currentPath, DnaXmlLexicon.COMMENT_CONTENT, String.valueOf(ch, start, length).trim());
501            endNode();
502        }
503    
504        /**
505         * {@inheritDoc}
506         * 
507         * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String,
508         *      org.xml.sax.Attributes)
509         */
510        @Override
511        public void startElement( String uri,
512                                  String localName,
513                                  String name,
514                                  Attributes attributes ) {
515            assert localName != null;
516    
517            // Create the node with the name built from the element's name ...
518            Name nodeName = null;
519            if (nameAttribute != null) {
520                try {
521                    String jcrNameValue = attributes.getValue(nameAttribute.getNamespaceUri(), nameAttribute.getLocalName());
522                    nodeName = nameFactory.create(jcrNameValue);
523                } catch (ValueFormatException e) {
524                }
525            }
526            if (nodeName == null) nodeName = nameFactory.create(uri, localName, decoder);
527            startNode(nodeName);
528    
529            // Set the type of the node ...
530            if (defaultPrimaryType != null) {
531                output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, defaultPrimaryType);
532            }
533    
534            // Now, set each attribute as a property ...
535            for (int i = 0, len = attributes.getLength(); i != len; ++i) {
536                String attributeLocalName = attributes.getLocalName(i);
537                String attributeUri = attributes.getURI(i);
538                Name attributeName = null;
539                if ((attributeUri == null || attributeUri.length() == 0) && attributes.getQName(i).indexOf(':') == -1) {
540                    switch (this.attributeScoping) {
541                        case INHERIT_ELEMENT_NAMESPACE:
542                            attributeName = nameFactory.create(uri, attributeLocalName, decoder);
543                            break;
544                        case USE_DEFAULT_NAMESPACE:
545                            attributeName = nameFactory.create(attributeLocalName, decoder);
546                            break;
547                    }
548                } else {
549                    attributeName = nameFactory.create(attributeUri, attributeLocalName, decoder);
550                }
551                assert attributeName != null;
552                if (JcrLexicon.NAME.equals(attributeName)) {
553                    // We don't want to record the "jcr:name" attribute since it won't match the node name ...
554                    continue;
555                }
556                Object value = attributes.getValue(i);
557                if (JcrLexicon.PRIMARY_TYPE.equals(attributeName)) {
558                    // Convert it to a name ...
559                    value = nameFactory.create(value);
560                }
561                output.setProperty(currentPath, attributeName, attributes.getValue(i));
562            }
563        }
564    
565        /**
566         * {@inheritDoc}
567         * 
568         * @see org.jboss.dna.graph.xml.XmlHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
569         */
570        @Override
571        public void endElement( String uri,
572                                String localName,
573                                String name ) {
574            // Check if content still needs to be output
575            if (contentBuilder != null) endContent();
576    
577            // End the current node ...
578            endNode();
579        }
580    
581        /**
582         * <p>
583         * {@inheritDoc}
584         * </p>
585         * 
586         * @see org.xml.sax.helpers.DefaultHandler#warning(org.xml.sax.SAXParseException)
587         */
588        @Override
589        public void warning( SAXParseException warning ) {
590            problems.addWarning(warning, XmlSequencerI18n.warningSequencingXmlDocument, warning);
591        }
592    
593        /**
594         * {@inheritDoc}
595         * 
596         * @see org.xml.sax.helpers.DefaultHandler#error(org.xml.sax.SAXParseException)
597         */
598        @Override
599        public void error( SAXParseException error ) {
600            problems.addError(error, XmlSequencerI18n.errorSequencingXmlDocument, error);
601        }
602    
603        /**
604         * {@inheritDoc}
605         * 
606         * @see org.xml.sax.helpers.DefaultHandler#fatalError(org.xml.sax.SAXParseException)
607         */
608        @Override
609        public void fatalError( SAXParseException error ) {
610            problems.addError(error, XmlSequencerI18n.errorSequencingXmlDocument, error);
611        }
612    
613        private class IndexedName {
614    
615            Map<Name, List<IndexedName>> nameToIndexedNamesMap = new HashMap<Name, List<IndexedName>>();
616    
617            IndexedName() {
618            }
619        }
620    }