001    /*
002     * JBoss, Home of Professional Open Source.
003     * Copyright 2008, Red Hat Middleware LLC, and individual contributors
004     * as indicated by the @author tags. See the copyright.txt file in the
005     * distribution for a full listing of individual contributors. 
006     *
007     * This is free software; you can redistribute it and/or modify it
008     * under the terms of the GNU Lesser General Public License as
009     * published by the Free Software Foundation; either version 2.1 of
010     * the License, or (at your option) any later version.
011     *
012     * This software is distributed in the hope that it will be useful,
013     * but WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * Lesser General Public License for more details.
016     *
017     * You should have received a copy of the GNU Lesser General Public
018     * License along with this software; if not, write to the Free
019     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
021     */
022    package org.jboss.dna.sequencer.xml;
023    
024    import java.util.ArrayList;
025    import java.util.HashMap;
026    import java.util.LinkedList;
027    import java.util.List;
028    import java.util.Map;
029    import org.jboss.dna.common.collection.Problems;
030    import org.jboss.dna.common.text.TextDecoder;
031    import org.jboss.dna.common.text.XmlNameEncoder;
032    import org.jboss.dna.common.util.CheckArg;
033    import org.jboss.dna.common.util.StringUtil;
034    import org.jboss.dna.graph.BasicExecutionContext;
035    import org.jboss.dna.graph.ExecutionContext;
036    import org.jboss.dna.graph.JcrLexicon;
037    import org.jboss.dna.graph.properties.Name;
038    import org.jboss.dna.graph.properties.NameFactory;
039    import org.jboss.dna.graph.properties.NamespaceRegistry;
040    import org.jboss.dna.graph.properties.Path;
041    import org.jboss.dna.graph.properties.PathFactory;
042    import org.jboss.dna.graph.properties.PropertyFactory;
043    import org.jboss.dna.graph.properties.ValueFormatException;
044    import org.jboss.dna.graph.properties.basic.LocalNamespaceRegistry;
045    import org.jboss.dna.graph.sequencers.SequencerContext;
046    import org.jboss.dna.graph.sequencers.SequencerOutput;
047    import org.xml.sax.Attributes;
048    import org.xml.sax.SAXParseException;
049    import org.xml.sax.ext.DefaultHandler2;
050    
051    /**
052     * @author Randall Hauch
053     */
054    public class XmlSequencerHandler extends DefaultHandler2 {
055    
056        private final SequencerOutput output;
057        private final SequencerContext context;
058    
059        /**
060         * Decoder for XML names, to turn '_xHHHH_' sequences in the XML element and attribute names into the corresponding UTF-16
061         * characters.
062         */
063        public static TextDecoder DEFAULT_DECODER = new XmlNameEncoder();
064    
065        /**
066         * The default {@link XmlSequencer.AttributeScoping}.
067         */
068        public static XmlSequencer.AttributeScoping DEFAULT_ATTRIBUTE_SCOPING = XmlSequencer.AttributeScoping.USE_DEFAULT_NAMESPACE;
069    
070        /**
071         * The name of the attribute that should be used for the node name.
072         */
073        protected final Name nameAttribute;
074    
075        /**
076         * The default primary type.
077         */
078        protected final Name defaultPrimaryType;
079    
080        /**
081         * The cached reference to the graph's path factory.
082         */
083        protected final PathFactory pathFactory;
084    
085        /**
086         * The cached reference to the graph's name factory.
087         */
088        protected final NameFactory nameFactory;
089    
090        /**
091         * The cached reference to the graph's property factory.
092         */
093        protected final PropertyFactory propertyFactory;
094    
095        /**
096         * The cached reference to the graph's namespace registry.
097         */
098        protected final NamespaceRegistry namespaceRegistry;
099    
100        /**
101         * The TextDecoder that is used to decode the names.
102         */
103        protected final TextDecoder decoder;
104    
105        /**
106         * The stack of prefixes for each namespace, which is used to keep the {@link #namespaceRegistry local namespace registry} in
107         * sync with the namespaces in the XML document.
108         */
109        private final Map<String, LinkedList<String>> prefixStackByUri = new HashMap<String, LinkedList<String>>();
110    
111        private final XmlSequencer.AttributeScoping attributeScoping;
112    
113        /**
114         * The path for the node representing the current element. This starts out as the path supplied by the constructor, and never
115         * is shorter than that initial path.
116         */
117        protected Path currentPath;
118    
119        // Recursive map used to track the number of occurrences of names for elements under a particular path
120        private Map<Name, List<IndexedName>> nameToIndexedNamesMap = new HashMap<Name, List<IndexedName>>();
121    
122        // The stack of recursive maps being processed, with the head entry being the map for the current path
123        private final LinkedList<Map<Name, List<IndexedName>>> nameToIndexedNamesMapStack = new LinkedList<Map<Name, List<IndexedName>>>();
124    
125        private String currentEntityName;
126        private StringBuilder cDataContent;
127        private StringBuilder contentBuilder;
128        private final Problems problems;
129        private final Map<String, String> entityValues = new HashMap<String, String>();
130    
131        /**
132         * @param output
133         * @param context
134         * @param nameAttribute
135         * @param defaultPrimaryType
136         * @param textDecoder
137         * @param scoping
138         */
139        XmlSequencerHandler( SequencerOutput output,
140                                          SequencerContext context,
141                                          Name nameAttribute,
142                                          Name defaultPrimaryType,
143                                          TextDecoder textDecoder,
144                                          XmlSequencer.AttributeScoping scoping ) {
145            CheckArg.isNotNull(output, "output");
146            CheckArg.isNotNull(context, "context");
147    
148            // Use the execution context ...
149            this.output = output;
150            this.context = context;
151            this.problems = context.getProblems();
152            assert this.problems != null;
153    
154            this.nameAttribute = nameAttribute;
155            this.defaultPrimaryType = defaultPrimaryType;
156            this.decoder = textDecoder != null ? textDecoder : DEFAULT_DECODER;
157            this.attributeScoping = scoping != null ? scoping : DEFAULT_ATTRIBUTE_SCOPING;
158    
159            // Set up a local namespace registry that is kept in sync with the namespaces found in this XML document ...
160            NamespaceRegistry namespaceRegistry = new LocalNamespaceRegistry(this.context.getNamespaceRegistry());
161            final ExecutionContext localContext = new BasicExecutionContext(this.context, namespaceRegistry);
162    
163            // Set up references to frequently-used objects in the context ...
164            this.nameFactory = localContext.getValueFactories().getNameFactory();
165            this.pathFactory = localContext.getValueFactories().getPathFactory();
166            this.propertyFactory = localContext.getPropertyFactory();
167            this.namespaceRegistry = localContext.getNamespaceRegistry();
168            assert this.nameFactory != null;
169            assert this.pathFactory != null;
170            assert this.propertyFactory != null;
171            assert this.namespaceRegistry != null;
172    
173            // Set up the initial path ...
174            this.currentPath = this.pathFactory.createRelativePath();
175            assert this.currentPath != null;
176        }
177    
178        private void startNode( Name name ) {
179            // Check if content still needs to be output
180            if (contentBuilder != null) endContent();
181            // Add name to list of indexed names for this element to ensure we use the correct index (which is the size of the
182            // list)
183            List<IndexedName> indexedNames = nameToIndexedNamesMap.get(name);
184            if (indexedNames == null) {
185                indexedNames = new ArrayList<IndexedName>();
186                nameToIndexedNamesMap.put(name, indexedNames);
187            }
188            IndexedName indexedName = new IndexedName();
189            indexedNames.add(indexedName);
190            // Add element name and the appropriate index to the path.
191            // Per the JCR spec, the index must be relative to same-name sibling nodes
192            currentPath = pathFactory.create(currentPath, name, indexedNames.size()).getNormalizedPath();
193            // currentPath = currentPath.getNormalizedPath();
194            // Add the indexed name map to the stack and set the current map to the new element's map
195            nameToIndexedNamesMapStack.addFirst(nameToIndexedNamesMap);
196            nameToIndexedNamesMap = indexedName.nameToIndexedNamesMap;
197        }
198    
199        private void endNode() {
200            // Recover parent's path, namespace, and indexedName map, clearing the ended element's map to free memory
201            currentPath = currentPath.getParent();
202            currentPath = currentPath.getNormalizedPath();
203            nameToIndexedNamesMap.clear();
204            nameToIndexedNamesMap = nameToIndexedNamesMapStack.removeFirst();
205        }
206    
207        /**
208         * See if there is any element content that needs to be completed.
209         */
210        protected void endContent() {
211            // Process the content of the element ...
212            String content = StringUtil.normalize(contentBuilder.toString());
213            // Null-out builder to setup for subsequent content.
214            // Must be done before call to startElement below to prevent infinite loop.
215            contentBuilder = null;
216            // Skip if nothing in content but whitespace
217            if (content.length() > 0) {
218                // Create separate node for each content entry since entries can be interspersed amongst child elements
219                startNode(DnaXmlLexicon.ELEMENT_CONTENT);
220                output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.ELEMENT_CONTENT);
221                output.setProperty(currentPath, DnaXmlLexicon.ELEMENT_CONTENT, content);
222                endNode();
223            }
224        }
225    
226        /**
227         * <p>
228         * {@inheritDoc}
229         * </p>
230         * 
231         * @see org.xml.sax.helpers.DefaultHandler#startDocument()
232         */
233        @Override
234        public void startDocument() {
235            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.DOCUMENT);
236        }
237    
238        /**
239         * <p>
240         * {@inheritDoc}
241         * </p>
242         * 
243         * @see org.xml.sax.ext.DefaultHandler2#startDTD(java.lang.String, java.lang.String, java.lang.String)
244         */
245        @Override
246        public void startDTD( String name,
247                              String publicId,
248                              String systemId ) {
249            output.setProperty(currentPath, DnaDtdLexicon.NAME, name);
250            output.setProperty(currentPath, DnaDtdLexicon.PUBLIC_ID, publicId);
251            output.setProperty(currentPath, DnaDtdLexicon.SYSTEM_ID, systemId);
252        }
253    
254        /**
255         * <p>
256         * {@inheritDoc}
257         * </p>
258         * 
259         * @see org.xml.sax.ext.DefaultHandler2#externalEntityDecl(java.lang.String, java.lang.String, java.lang.String)
260         */
261        @Override
262        public void externalEntityDecl( String name,
263                                        String publicId,
264                                        String systemId ) {
265            // Add "synthetic" entity container to path to help prevent name collisions with XML elements
266            startNode(DnaDtdLexicon.ENTITY);
267            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaDtdLexicon.ENTITY);
268            output.setProperty(currentPath, DnaDtdLexicon.NAME, name);
269            if (publicId != null) output.setProperty(currentPath, DnaDtdLexicon.PUBLIC_ID, publicId);
270            if (systemId != null) output.setProperty(currentPath, DnaDtdLexicon.SYSTEM_ID, systemId);
271            endNode();
272        }
273    
274        /**
275         * {@inheritDoc}
276         * 
277         * @see org.xml.sax.ext.DefaultHandler2#internalEntityDecl(java.lang.String, java.lang.String)
278         */
279        @Override
280        public void internalEntityDecl( String name,
281                                        String value ) {
282            // Add "synthetic" entity container to path to help prevent name collisions with XML elements
283            startNode(DnaDtdLexicon.ENTITY);
284            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaDtdLexicon.ENTITY);
285            output.setProperty(currentPath, DnaDtdLexicon.NAME, name);
286            output.setProperty(currentPath, DnaDtdLexicon.VALUE, value);
287            // Record the name/value pair ...
288            entityValues.put(name, value);
289            endNode();
290        }
291    
292        /**
293         * <p>
294         * {@inheritDoc}
295         * </p>
296         * 
297         * @see org.xml.sax.helpers.DefaultHandler#processingInstruction(java.lang.String, java.lang.String)
298         */
299        @Override
300        public void processingInstruction( String target,
301                                           String data ) {
302            // Output separate nodes for each instruction since multiple are allowed
303            startNode(DnaXmlLexicon.PROCESSING_INSTRUCTION);
304            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.PROCESSING_INSTRUCTION);
305            output.setProperty(currentPath, DnaXmlLexicon.TARGET, target.trim());
306            if (data != null) {
307                output.setProperty(currentPath, DnaXmlLexicon.PROCESSING_INSTRUCTION_CONTENT, data.trim());
308            }
309            endNode();
310        }
311    
312        /**
313         * {@inheritDoc}
314         * <p>
315         * This method ensures that the namespace is registered with the {@link NamespaceRegistry registry}, using the supplied prefix
316         * to register the namespace if required. Note that because this class does not really use the namespace prefixes to create
317         * {@link Name} objects, no attempt is made to match the XML namespace prefixes.
318         * </p>
319         * 
320         * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
321         */
322        @Override
323        public void startPrefixMapping( String prefix,
324                                        String uri ) {
325            assert uri != null;
326            // Add the prefix to the stack ...
327            LinkedList<String> prefixStack = this.prefixStackByUri.get(uri);
328            if (prefixStack == null) {
329                prefixStack = new LinkedList<String>();
330                this.prefixStackByUri.put(uri, prefixStack);
331            }
332            prefixStack.addFirst(prefix);
333    
334            // If the namespace is already registered, then we'll have to register it in the context's registry, too.
335            if (!namespaceRegistry.isRegisteredNamespaceUri(uri)) {
336                // The namespace is not already registered (locally or in the context's registry), so we have to
337                // register it with the context's registry (which the local register then inherits).
338                NamespaceRegistry contextRegistry = context.getNamespaceRegistry();
339                if (contextRegistry.getNamespaceForPrefix(prefix) != null) {
340                    // The prefix is already bound, so register and generate a unique prefix
341                    context.getNamespaceRegistry().getPrefixForNamespaceUri(uri, true);
342                    // Now register locally with the supplied prefix ...
343                    namespaceRegistry.register(prefix, uri);
344                } else {
345                    context.getNamespaceRegistry().register(prefix, uri);
346                }
347            } else {
348                // It is already registered, but re-register it locally using the supplied prefix ...
349                namespaceRegistry.register(prefix, uri);
350            }
351        }
352    
353        /**
354         * {@inheritDoc}
355         * 
356         * @see org.xml.sax.helpers.DefaultHandler#endPrefixMapping(java.lang.String)
357         */
358        @Override
359        public void endPrefixMapping( String prefix ) {
360            assert prefix != null;
361            // Get the current URI for this prefix ...
362            String uri = namespaceRegistry.getNamespaceForPrefix(prefix);
363            assert uri != null;
364    
365            // Get the previous prefix from the stack ...
366            LinkedList<String> prefixStack = this.prefixStackByUri.get(uri);
367            assert prefixStack != null;
368            assert !prefixStack.isEmpty();
369            String existingPrefix = prefixStack.removeFirst();
370            assert prefix.equals(existingPrefix);
371    
372            // If there are no previous prefixes, then remove the mapping ...
373            if (prefixStack.isEmpty()) {
374                namespaceRegistry.unregister(uri);
375                prefixStackByUri.remove(uri);
376            } else {
377                String previous = prefixStack.getFirst();
378                namespaceRegistry.register(previous, uri);
379            }
380        }
381    
382        /**
383         * <p>
384         * {@inheritDoc}
385         * </p>
386         * 
387         * @see org.xml.sax.ext.DefaultHandler2#startEntity(java.lang.String)
388         */
389        @Override
390        public void startEntity( String name ) {
391            // Record that we've started an entity by capturing the name of the entity ...
392            currentEntityName = name;
393        }
394    
395        /**
396         * <p>
397         * {@inheritDoc}
398         * </p>
399         * 
400         * @see org.xml.sax.ext.DefaultHandler2#endEntity(java.lang.String)
401         */
402        @Override
403        public void endEntity( String name ) {
404            // currentEntityName is nulled in 'characters(...)', not here.
405            // See DNA-231 for an issue related to this
406        }
407    
408        /**
409         * <p>
410         * {@inheritDoc}
411         * </p>
412         * 
413         * @see org.xml.sax.ext.DefaultHandler2#startCDATA()
414         */
415        @Override
416        public void startCDATA() {
417            // CDATA sections can start in the middle of element content, so there may already be some
418            // element content already processed ...
419            if (contentBuilder != null) endContent();
420    
421            // Prepare builder for concatenating consecutive lines of CDATA
422            cDataContent = new StringBuilder();
423        }
424    
425        /**
426         * {@inheritDoc}
427         * 
428         * @see org.xml.sax.ext.DefaultHandler2#endCDATA()
429         */
430        @Override
431        public void endCDATA() {
432            // Output CDATA built in characters() method
433            startNode(DnaXmlLexicon.CDATA);
434            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, defaultPrimaryType);
435            output.setProperty(currentPath, DnaXmlLexicon.CDATA_CONTENT, cDataContent.toString());
436            endNode();
437            // Null-out builder to free memory
438            cDataContent = null;
439        }
440    
441        /**
442         * {@inheritDoc}
443         * 
444         * @see org.xml.sax.helpers.DefaultHandler#characters(char[], int, int)
445         */
446        @Override
447        public void characters( char[] ch,
448                                int start,
449                                int length ) {
450            String content = String.valueOf(ch, start, length);
451            if (cDataContent != null) {
452                // Processing the characters in the CDATA, so add to the builder
453                cDataContent.append(ch, start, length);
454                // Text within builder will be output at the end of CDATA
455            } else {
456                if (contentBuilder == null) {
457                    // This is the first line of content, so we have to create the StringBuilder ...
458                    contentBuilder = new StringBuilder();
459                }
460                if (currentEntityName != null) {
461                    // This is an entity reference, so rather than use the entity value characters (the content passed
462                    // into this method), we want to keep the entity reference ...
463                    contentBuilder.append('&').append(currentEntityName).append(';');
464    
465                    // Normally, 'characters' is called with just the entity replacement characters,
466                    // and is called between 'startEntity' and 'endEntity'. However, per DNA-231, some JVMs
467                    // use an incorrect ordering: 'startEntity', 'endEntity' and then 'characters', and the
468                    // content passed to the 'characters' call not only includes the entity replacement characters
469                    // followed by other content. Look for this condition ...
470                    String entityValue = entityValues.get(currentEntityName);
471                    if (!content.equals(entityValue) && entityValue != null && entityValue.length() < content.length()) {
472                        // Per DNA-231, there's extra content after the entity value. So replace the entity value in the
473                        // content with the entity reference (not the replacement characters), and add the extra content ...
474                        String extraContent = content.substring(entityValue.length());
475                        contentBuilder.append(extraContent);
476                    }
477                    // We're done reading the entity characters, so null it out
478                    currentEntityName = null;
479                } else {
480                    // Just append the content normally ...
481                    contentBuilder.append(content);
482                }
483                // Text within builder will be output when another element or CDATA is encountered
484            }
485        }
486    
487        /**
488         * {@inheritDoc}
489         * 
490         * @see org.xml.sax.ext.DefaultHandler2#comment(char[], int, int)
491         */
492        @Override
493        public void comment( char[] ch,
494                             int start,
495                             int length ) {
496            // Output separate nodes for each comment since multiple are allowed
497            startNode(DnaXmlLexicon.COMMENT);
498            output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, DnaXmlLexicon.COMMENT);
499            output.setProperty(currentPath, DnaXmlLexicon.COMMENT_CONTENT, String.valueOf(ch, start, length).trim());
500            endNode();
501        }
502    
503        /**
504         * {@inheritDoc}
505         * 
506         * @see org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, java.lang.String, java.lang.String,
507         *      org.xml.sax.Attributes)
508         */
509        @Override
510        public void startElement( String uri,
511                                  String localName,
512                                  String name,
513                                  Attributes attributes ) {
514            assert localName != null;
515    
516            // Create the node with the name built from the element's name ...
517            Name nodeName = null;
518            if (nameAttribute != null) {
519                try {
520                    String jcrNameValue = attributes.getValue(nameAttribute.getNamespaceUri(), nameAttribute.getLocalName());
521                    nodeName = nameFactory.create(jcrNameValue);
522                } catch (ValueFormatException e) {
523                }
524            }
525            if (nodeName == null) nodeName = nameFactory.create(uri, localName, decoder);
526            startNode(nodeName);
527    
528            // Set the type of the node ...
529            if (defaultPrimaryType != null) {
530                output.setProperty(currentPath, JcrLexicon.PRIMARY_TYPE, defaultPrimaryType);
531            }
532    
533            // Now, set each attribute as a property ...
534            for (int i = 0, len = attributes.getLength(); i != len; ++i) {
535                String attributeLocalName = attributes.getLocalName(i);
536                String attributeUri = attributes.getURI(i);
537                Name attributeName = null;
538                if ((attributeUri == null || attributeUri.length() == 0) && attributes.getQName(i).indexOf(':') == -1) {
539                    switch (this.attributeScoping) {
540                        case INHERIT_ELEMENT_NAMESPACE:
541                            attributeName = nameFactory.create(uri, attributeLocalName, decoder);
542                            break;
543                        case USE_DEFAULT_NAMESPACE:
544                            attributeName = nameFactory.create(attributeLocalName, decoder);
545                            break;
546                    }
547                } else {
548                    attributeName = nameFactory.create(attributeUri, attributeLocalName, decoder);
549                }
550                assert attributeName != null;
551                if (JcrLexicon.NAME.equals(attributeName)) {
552                    // We don't want to record the "jcr:name" attribute since it won't match the node name ...
553                    continue;
554                }
555                Object value = attributes.getValue(i);
556                if (JcrLexicon.PRIMARY_TYPE.equals(attributeName)) {
557                    // Convert it to a name ...
558                    value = nameFactory.create(value);
559                }
560                output.setProperty(currentPath, attributeName, attributes.getValue(i));
561            }
562        }
563    
564        /**
565         * {@inheritDoc}
566         * 
567         * @see org.jboss.dna.graph.xml.XmlHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
568         */
569        @Override
570        public void endElement( String uri,
571                                String localName,
572                                String name ) {
573            // Check if content still needs to be output
574            if (contentBuilder != null) endContent();
575    
576            // End the current node ...
577            endNode();
578        }
579    
580        /**
581         * <p>
582         * {@inheritDoc}
583         * </p>
584         * 
585         * @see org.xml.sax.helpers.DefaultHandler#warning(org.xml.sax.SAXParseException)
586         */
587        @Override
588        public void warning( SAXParseException warning ) {
589            problems.addWarning(warning, XmlSequencerI18n.warningSequencingXmlDocument, warning);
590        }
591    
592        /**
593         * {@inheritDoc}
594         * 
595         * @see org.xml.sax.helpers.DefaultHandler#error(org.xml.sax.SAXParseException)
596         */
597        @Override
598        public void error( SAXParseException error ) {
599            problems.addError(error, XmlSequencerI18n.errorSequencingXmlDocument, error);
600        }
601    
602        /**
603         * {@inheritDoc}
604         * 
605         * @see org.xml.sax.helpers.DefaultHandler#fatalError(org.xml.sax.SAXParseException)
606         */
607        @Override
608        public void fatalError( SAXParseException error ) {
609            problems.addError(error, XmlSequencerI18n.errorSequencingXmlDocument, error);
610        }
611    
612        private class IndexedName {
613    
614            Map<Name, List<IndexedName>> nameToIndexedNamesMap = new HashMap<Name, List<IndexedName>>();
615    
616            IndexedName() {
617            }
618        }
619    }