001    /*
002     * JBoss DNA (http://www.jboss.org/dna)
003     * See the COPYRIGHT.txt file distributed with this work for information
004     * regarding copyright ownership.  Some portions may be licensed
005     * to Red Hat, Inc. under one or more contributor license agreements.
006    * See the AUTHORS.txt file in the distribution for a full listing of 
007    * individual contributors.
008     *
009     * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
010     * is licensed to you under the terms of the GNU Lesser General Public License as
011     * published by the Free Software Foundation; either version 2.1 of
012     * the License, or (at your option) any later version.
013     *
014     * JBoss DNA is distributed in the hope that it will be useful,
015     * but WITHOUT ANY WARRANTY; without even the implied warranty of
016     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
017     * Lesser General Public License for more details.
018     *
019     * You should have received a copy of the GNU Lesser General Public
020     * License along with this software; if not, write to the Free
021     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
023     */
024    
025    package org.jboss.dna.sequencer.msoffice.word;
026    
027    import java.io.IOException;
028    import java.io.InputStream;
029    import java.util.ArrayList;
030    import java.util.List;
031    
032    import org.apache.poi.hwpf.HWPFDocument;
033    import org.apache.poi.hwpf.model.StyleSheet;
034    import org.apache.poi.hwpf.usermodel.Paragraph;
035    import org.apache.poi.hwpf.usermodel.Range;
036    import org.jboss.dna.common.util.Logger;
037    
038    /**
039     * Infers table of contents from Word document by reading all paragraphs
040     * with style <code>Heading*</code>.  This is analogous to the default 
041     * behavior of Word when generating a table of contents.
042     * 
043     * @author Michael Trezzi
044     */
045    public class WordMetadataReader {
046    
047        private static final Logger log = Logger.getLogger(WordMetadataReader.class);
048        
049        /** Prefix for styles that will be extracted and treated as outline information for the document */
050        private static final String HEADER_PREFIX = "Heading";
051        
052        public static WordMetadata instance( InputStream stream ) throws IOException {
053            WordMetadata metadata = new WordMetadata();        
054            List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();
055    
056            HWPFDocument document = new HWPFDocument(stream);
057            Range range = document.getRange();
058    
059            StyleSheet stylesheet = document.getStyleSheet();
060            
061            for (int i = 0; i < range.numParagraphs(); i++) {
062                Paragraph paragraph = range.getParagraph(i);
063                
064                String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();
065                
066                if (styleName.startsWith(HEADER_PREFIX)) {
067                    String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim();
068                    int levelNum = 0;
069                    
070                    try {
071                        levelNum = Integer.parseInt(rawLevelNum);
072                    }
073                    catch (NumberFormatException nfe) {
074                        log.debug("Could not parse heading level from: " + styleName);
075                    }
076    
077                    String text = Paragraph.stripFields(paragraph.text());
078                    
079                    if ('\r' == text.charAt(text.length() - 1)) {
080                        text = text.substring(0, text.length() - 1);
081                    }
082                    
083                    headings.add(new WordMetadata.WordHeading(text, levelNum));
084                }
085            }
086            
087            metadata.setHeadings(headings);
088            return metadata;
089        }
090    }