001 /* 002 * JBoss DNA (http://www.jboss.org/dna) 003 * See the COPYRIGHT.txt file distributed with this work for information 004 * regarding copyright ownership. Some portions may be licensed 005 * to Red Hat, Inc. under one or more contributor license agreements. 006 * See the AUTHORS.txt file in the distribution for a full listing of 007 * individual contributors. 008 * 009 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA 010 * is licensed to you under the terms of the GNU Lesser General Public License as 011 * published by the Free Software Foundation; either version 2.1 of 012 * the License, or (at your option) any later version. 013 * 014 * JBoss DNA is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 017 * Lesser General Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser General Public 020 * License along with this software; if not, write to the Free 021 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 022 * 02110-1301 USA, or see the FSF site: http://www.fsf.org. 023 */ 024 025 package org.jboss.dna.sequencer.msoffice.word; 026 027 import java.io.IOException; 028 import java.io.InputStream; 029 import java.util.ArrayList; 030 import java.util.List; 031 032 import org.apache.poi.hwpf.HWPFDocument; 033 import org.apache.poi.hwpf.model.StyleSheet; 034 import org.apache.poi.hwpf.usermodel.Paragraph; 035 import org.apache.poi.hwpf.usermodel.Range; 036 import org.jboss.dna.common.util.Logger; 037 038 /** 039 * Infers table of contents from Word document by reading all paragraphs 040 * with style <code>Heading*</code>. This is analogous to the default 041 * behavior of Word when generating a table of contents. 042 * 043 * @author Michael Trezzi 044 */ 045 public class WordMetadataReader { 046 047 private static final Logger log = Logger.getLogger(WordMetadataReader.class); 048 049 /** Prefix for styles that will be extracted and treated as outline information for the document */ 050 private static final String HEADER_PREFIX = "Heading"; 051 052 public static WordMetadata instance( InputStream stream ) throws IOException { 053 WordMetadata metadata = new WordMetadata(); 054 List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>(); 055 056 HWPFDocument document = new HWPFDocument(stream); 057 Range range = document.getRange(); 058 059 StyleSheet stylesheet = document.getStyleSheet(); 060 061 for (int i = 0; i < range.numParagraphs(); i++) { 062 Paragraph paragraph = range.getParagraph(i); 063 064 String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName(); 065 066 if (styleName.startsWith(HEADER_PREFIX)) { 067 String rawLevelNum = styleName.substring(HEADER_PREFIX.length() + 1).trim(); 068 int levelNum = 0; 069 070 try { 071 levelNum = Integer.parseInt(rawLevelNum); 072 } 073 catch (NumberFormatException nfe) { 074 log.debug("Could not parse heading level from: " + styleName); 075 } 076 077 String text = Paragraph.stripFields(paragraph.text()); 078 079 if ('\r' == text.charAt(text.length() - 1)) { 080 text = text.substring(0, text.length() - 1); 081 } 082 083 headings.add(new WordMetadata.WordHeading(text, levelNum)); 084 } 085 } 086 087 metadata.setHeadings(headings); 088 return metadata; 089 } 090 }