/*
 * JBoss DNA (http://www.jboss.org/dna)
 * See the COPYRIGHT.txt file distributed with this work for information
 * regarding copyright ownership.  Some portions may be licensed
 * to Red Hat, Inc. under one or more contributor license agreements.
 * See the AUTHORS.txt file in the distribution for a full listing of
 * individual contributors.
 *
 * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
 * is licensed to you under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * JBoss DNA is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.jboss.dna.sequencer.msoffice;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import org.jboss.dna.graph.sequencer.SequencerContext;
import org.jboss.dna.graph.sequencer.SequencerOutput;
import org.jboss.dna.graph.sequencer.StreamSequencer;
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadata;
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.SlideMetadata;
import org.jboss.dna.sequencer.msoffice.word.WordMetadata;
import org.jboss.dna.sequencer.msoffice.word.WordMetadataReader;

/**
 * A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
 * metadata to the repository.
 * <p>
 * This sequencer produces data that corresponds to the following structure:
 * <ul>
 * <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
 * <ul>
 * <li><strong>msoffice:title</strong> optional string property for the title of the document</li>
 * <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
 * <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
 * <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
 * <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
 * <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
 * <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
 * <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
 * <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
 * <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
 * <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
 * <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
 * <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
 * <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
 * <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
 * <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
 * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of this document</li>
 * <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
 * <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
 * </ul>
 * </li>
 * <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
 * <ul>
 * <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
 * <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
 * <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
 * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
 * </ul>
 * </li>
 * <li><strong>msoffice:heading</strong> node written for the headings of a Word document
 * <ul>
 * <li><strong>msoffice:heading_name</strong> property for the text of a heading</li>
 * <li><strong>msoffice:heading_level</strong> property for the level of a heading</li>
 * </ul>
 * </li>
 * </ul>
 * </p>
 * <p>
 * The slide and heading nodes are written beneath the <code>msoffice:metadata</code> node.
 * </p>
 *
 * @author Michael Trezzi
 * @author John Verhaeg
 */
public class MSOfficeMetadataSequencer implements StreamSequencer {

    public static final String METADATA_NODE = "msoffice:metadata";
    public static final String MSOFFICE_PRIMARY_TYPE = "jcr:primaryType";
    public static final String MSOFFICE_TITLE = "msoffice:title";
    public static final String MSOFFICE_SUBJECT = "msoffice:subject";
    public static final String MSOFFICE_AUTHOR = "msoffice:author";
    public static final String MSOFFICE_KEYWORDS = "msoffice:keywords";
    public static final String MSOFFICE_COMMENT = "msoffice:comment";
    public static final String MSOFFICE_TEMPLATE = "msoffice:template";
    public static final String MSOFFICE_LAST_SAVED_BY = "msoffice:last_saved_by";
    public static final String MSOFFICE_REVISION = "msoffice:revision";
    public static final String MSOFFICE_TOTAL_EDITING_TIME = "msoffice:total_editing_time";
    public static final String MSOFFICE_LAST_PRINTED = "msoffice:last_printed";
    public static final String MSOFFICE_CREATED = "msoffice:created";
    public static final String MSOFFICE_SAVED = "msoffice:saved";
    public static final String MSOFFICE_PAGES = "msoffice:pages";
    public static final String MSOFFICE_WORDS = "msoffice:words";
    public static final String MSOFFICE_CHARACTERS = "msoffice:characters";
    public static final String MSOFFICE_CREATING_APPLICATION = "msoffice:creating_application";
    public static final String MSOFFICE_THUMBNAIL = "msoffice:thumbnail";

    // PowerPoint specific
    public static final String POWERPOINT_SLIDE_NODE = "msoffice:slide";
    public static final String SLIDE_TITLE = "msoffice:title";
    public static final String SLIDE_TEXT = "msoffice:text";
    public static final String SLIDE_NOTES = "msoffice:notes";
    public static final String SLIDE_THUMBNAIL = "msoffice:thumbnail";

    // Excel specific
    public static final String EXCEL_FULL_CONTENT = "msoffice:full_contents";
    public static final String EXCEL_SHEET_NAME = "msoffice:sheet_name";

    // Word specific
    public static final String WORD_HEADING_NODE = "msoffice:heading";
    public static final String WORD_HEADING_NAME = "msoffice:heading_name";
    public static final String WORD_HEADING_LEVEL = "msoffice:heading_level";

    // MIME types that trigger the format-specific extraction
    private static final String POWERPOINT_MIME_TYPE = "application/vnd.ms-powerpoint";
    private static final String WORD_MIME_TYPE = "application/vnd.ms-word";
    private static final String EXCEL_MIME_TYPE = "application/vnd.ms-excel";

    // Chunk size used when copying the supplied stream into memory
    private static final int BUFFER_SIZE = 8192;

    /**
     * Extracts the general MS Office metadata from the supplied stream and writes it to the <code>msoffice:metadata</code>
     * node, then extracts the PowerPoint, Word or Excel specific content when the context's MIME type names one of those
     * formats.
     * <p>
     * The stream is handed to more than one reader, and an {@link InputStream} can generally be consumed only once. The
     * content is therefore copied into memory first and every reader receives its own stream positioned at the start of the
     * document. The supplied stream is not closed by this method.
     * </p>
     * <p>
     * Nothing is written when the content cannot be read or when the general metadata reader returns <code>null</code>.
     * Failures of a format-specific reader are logged at debug level and do not affect the general metadata already
     * written.
     * </p>
     *
     * @param stream the content of the document being sequenced
     * @param output the output to which the extracted properties are written
     * @param context the context supplying the MIME type and the logger
     */
    public void sequence( InputStream stream,
                          SequencerOutput output,
                          SequencerContext context ) {

        byte[] content;
        try {
            content = readFully(stream);
        } catch (IOException e) {
            // Without the content nothing can be extracted, so log and give up ...
            context.getLogger(this.getClass()).debug(e, "Error while reading the MS Office document content");
            return;
        }

        MSOfficeMetadata metadata = MSOfficeMetadataReader.instance(newStream(content));
        if (metadata == null) {
            return;
        }

        output.setProperty(METADATA_NODE, MSOFFICE_PRIMARY_TYPE, "msoffice:metadata");
        output.setProperty(METADATA_NODE, MSOFFICE_TITLE, metadata.getTitle());
        output.setProperty(METADATA_NODE, MSOFFICE_SUBJECT, metadata.getSubject());
        output.setProperty(METADATA_NODE, MSOFFICE_AUTHOR, metadata.getAuthor());
        output.setProperty(METADATA_NODE, MSOFFICE_KEYWORDS, metadata.getKeywords());
        output.setProperty(METADATA_NODE, MSOFFICE_COMMENT, metadata.getComment());
        output.setProperty(METADATA_NODE, MSOFFICE_TEMPLATE, metadata.getTemplate());
        output.setProperty(METADATA_NODE, MSOFFICE_LAST_SAVED_BY, metadata.getLastSavedBy());
        output.setProperty(METADATA_NODE, MSOFFICE_REVISION, metadata.getRevision());
        output.setProperty(METADATA_NODE, MSOFFICE_TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
        output.setProperty(METADATA_NODE, MSOFFICE_LAST_PRINTED, metadata.getLastPrinted());
        output.setProperty(METADATA_NODE, MSOFFICE_CREATED, metadata.getCreated());
        output.setProperty(METADATA_NODE, MSOFFICE_SAVED, metadata.getSaved());
        output.setProperty(METADATA_NODE, MSOFFICE_PAGES, metadata.getPages());
        output.setProperty(METADATA_NODE, MSOFFICE_WORDS, metadata.getWords());
        output.setProperty(METADATA_NODE, MSOFFICE_CHARACTERS, metadata.getCharacters());
        output.setProperty(METADATA_NODE, MSOFFICE_CREATING_APPLICATION, metadata.getCreatingApplication());
        output.setProperty(METADATA_NODE, MSOFFICE_THUMBNAIL, metadata.getThumbnail());

        // The constant is the receiver of equals(...) so that a null MIME type simply matches nothing
        String mimeType = context.getMimeType();
        if (POWERPOINT_MIME_TYPE.equals(mimeType)) {
            sequencePowerPoint(content, output, context);
        }
        if (WORD_MIME_TYPE.equals(mimeType)) {
            sequenceWord(content, output, context);
        }
        if (EXCEL_MIME_TYPE.equals(mimeType)) {
            sequenceExcel(content, output, context);
        }
    }

    /**
     * Writes the title, text, notes and thumbnail of the slides of a PowerPoint document.
     *
     * @param content the buffered content of the document
     * @param output the output to which the extracted properties are written
     * @param context the context supplying the logger
     */
    private void sequencePowerPoint( byte[] content,
                                     SequencerOutput output,
                                     SequencerContext context ) {
        String slidePath = METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE;
        try {
            List<SlideMetadata> slides = PowerPointMetadataReader.instance(newStream(content));
            if (slides != null) {
                // NOTE(review): every slide is written to the same node path; whether later slides replace earlier
                // ones depends on the SequencerOutput implementation - confirm.
                for (SlideMetadata slide : slides) {
                    output.setProperty(slidePath, SLIDE_TITLE, slide.getTitle());
                    output.setProperty(slidePath, SLIDE_TEXT, slide.getText());
                    output.setProperty(slidePath, SLIDE_NOTES, slide.getNotes());
                    output.setProperty(slidePath, SLIDE_THUMBNAIL, slide.getThumbnail());
                }
            }
        } catch (IOException e) {
            // There was an error reading, so log and continue ...
            context.getLogger(this.getClass()).debug(e, "Error while extracting the PowerPoint metadata");
        }
    }

    /**
     * Writes the text and level of the headings of a Word document.
     *
     * @param content the buffered content of the document
     * @param output the output to which the extracted properties are written
     * @param context the context supplying the logger
     */
    private void sequenceWord( byte[] content,
                               SequencerOutput output,
                               SequencerContext context ) {
        String headingPath = METADATA_NODE + "/" + WORD_HEADING_NODE;
        try {
            WordMetadata wordMetadata = WordMetadataReader.instance(newStream(content));
            // Guard against a missing result, as is done for the PowerPoint and Excel readers
            if (wordMetadata != null && wordMetadata.getHeadings() != null) {
                // NOTE(review): every heading is written to the same node path; whether later headings replace
                // earlier ones depends on the SequencerOutput implementation - confirm.
                for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext();) {
                    WordMetadata.WordHeading heading = iter.next();
                    output.setProperty(headingPath, WORD_HEADING_NAME, heading.getText());
                    output.setProperty(headingPath, WORD_HEADING_LEVEL, heading.getHeaderLevel());
                }
            }
        } catch (IOException e) {
            // There was an error reading, so log and continue ...
            context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
        }
    }

    /**
     * Writes the text content and the sheet names of an Excel document.
     *
     * @param content the buffered content of the document
     * @param output the output to which the extracted properties are written
     * @param context the context supplying the logger
     */
    private void sequenceExcel( byte[] content,
                                SequencerOutput output,
                                SequencerContext context ) {
        try {
            ExcelMetadata excelMetadata = ExcelMetadataReader.instance(newStream(content));
            if (excelMetadata != null) {
                output.setProperty(METADATA_NODE, EXCEL_FULL_CONTENT, excelMetadata.getText());
                // NOTE(review): each sheet name is set on the same property in turn; whether the values accumulate
                // or only the last one is kept depends on the SequencerOutput implementation - confirm.
                for (String sheet : excelMetadata.getSheets()) {
                    output.setProperty(METADATA_NODE, EXCEL_SHEET_NAME, sheet);
                }
            }
        } catch (IOException e) {
            // There was an error reading, so log and continue ...
            context.getLogger(this.getClass()).debug(e, "Error while extracting the Excel metadata");
        }
    }

    /**
     * Creates a new stream over the buffered content, positioned at the first byte.
     *
     * @param content the buffered content of the document
     * @return a fresh stream over the content
     */
    private static InputStream newStream( byte[] content ) {
        return new ByteArrayInputStream(content);
    }

    /**
     * Copies all remaining bytes of the supplied stream into memory. The stream is left open.
     *
     * @param stream the stream to read
     * @return the bytes read from the stream; never null
     * @throws IOException if the stream cannot be read
     */
    private static byte[] readFully( InputStream stream ) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        byte[] buffer = new byte[BUFFER_SIZE];
        int numRead;
        while ((numRead = stream.read(buffer)) != -1) {
            bytes.write(buffer, 0, numRead);
        }
        return bytes.toByteArray();
    }
}