001    /*
002     * JBoss, Home of Professional Open Source.
003     * Copyright 2008, Red Hat Middleware LLC, and individual contributors
004     * as indicated by the @author tags. See the copyright.txt file in the
005     * distribution for a full listing of individual contributors.
006     *
007     * This is free software; you can redistribute it and/or modify it
008     * under the terms of the GNU Lesser General Public License as
009     * published by the Free Software Foundation; either version 2.1 of
010     * the License, or (at your option) any later version.
011     *
012     * This software is distributed in the hope that it will be useful,
013     * but WITHOUT ANY WARRANTY; without even the implied warranty of
014     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
015     * Lesser General Public License for more details.
016     *
017     * You should have received a copy of the GNU Lesser General Public
018     * License along with this software; if not, write to the Free
019     * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
020     * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
021     */
022    package org.jboss.dna.sequencer.msoffice;
023    
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import org.jboss.dna.graph.sequencers.SequencerContext;
import org.jboss.dna.graph.sequencers.SequencerOutput;
import org.jboss.dna.graph.sequencers.StreamSequencer;
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadata;
import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
import org.jboss.dna.sequencer.msoffice.powerpoint.SlideMetadata;
import org.jboss.dna.sequencer.msoffice.word.WordMetadataReader;
035    
036    /**
037     * A sequencer that processes the content of an MS Office document, extracts the metadata for the file, and then writes that
038     * metadata to the repository.
039     * <p>
040     * This sequencer produces data that corresponds to the following structure:
041     * <ul>
042     * <li><strong>msoffice:metadata</strong> node of type <code>msoffice:metadata</code>
043     * <ul>
044     * <li><strong>msoffice:title</strong> optional string property for the title of the document</li>
045     * <li><strong>msoffice:subject</strong> optional string property for the subject of the document</li>
046     * <li><strong>msoffice:author</strong> optional string property for the author of the document</li>
047     * <li><strong>msoffice:keywords</strong> optional string property for the document keywords</li>
048     * <li><strong>msoffice:comment</strong> optional string property for the document comment</li>
049     * <li><strong>msoffice:template</strong> optional string property for the template from which this document originates</li>
050     * <li><strong>msoffice:last_saved_by</strong> optional string property for the person that last saved this document</li>
051     * <li><strong>msoffice:revision</strong> optional string property for this document revision</li>
052     * <li><strong>msoffice:total_editing_time</strong> optional long property for the length this document has been edited</li>
053     * <li><strong>msoffice:last_printed</strong> optional date property for the date of last printing this document</li>
054     * <li><strong>msoffice:created</strong> date property for the date of creation of the document</li>
055     * <li><strong>msoffice:saved</strong> date property for the date of last save of this document</li>
056     * <li><strong>msoffice:pages</strong> long property for the number of pages of this document</li>
057     * <li><strong>msoffice:words</strong> long property for the number of words in this document</li>
058     * <li><strong>msoffice:characters</strong> long property for the number of characters in this document</li>
059     * <li><strong>msoffice:creating_application</strong> string property for the application used to create this document</li>
060     * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of this document</li>
061     * <li><strong>msoffice:full_contents</strong> optional String property holding the text contents of an excel file</li>
062     * <li><strong>msoffice:sheet_name</strong> optional String property for the name of a sheet in excel (multiple)</li>
063     * </ul>
064     * </li>
065     * <li><strong>msoffice:slide</strong> node of type <code>msoffice:pptslide</code>
066     * <ul>
067     * <li><strong>msoffice:title</strong> optional String property for the title of a slide</li>
068     * <li><strong>msoffice:notes</strong> optional String property for the notes of a slide</li>
069     * <li><strong>msoffice:text</strong> optional String property for the text of a slide</li>
070     * <li><strong>msoffice:thumbnail</strong> optional binary property for the thumbnail of a slide (PNG image)</li>
071     * </ul>
072     * </li>
073     * </ul>
074     * </p>
075     * 
076     * @author Michael Trezzi
077     * @author John Verhaeg
078     */
079    public class MSOfficeMetadataSequencer implements StreamSequencer {
080    
081        public static final String METADATA_NODE = "msoffice:metadata";
082        public static final String MSOFFICE_PRIMARY_TYPE = "jcr:primaryType";
083        public static final String MSOFFICE_TITLE = "msoffice:title";
084        public static final String MSOFFICE_SUBJECT = "msoffice:subject";
085        public static final String MSOFFICE_AUTHOR = "msoffice:author";
086        public static final String MSOFFICE_KEYWORDS = "msoffice:keywords";
087        public static final String MSOFFICE_COMMENT = "msoffice:comment";
088        public static final String MSOFFICE_TEMPLATE = "msoffice:template";
089        public static final String MSOFFICE_LAST_SAVED_BY = "msoffice:last_saved_by";
090        public static final String MSOFFICE_REVISION = "msoffice:revision";
091        public static final String MSOFFICE_TOTAL_EDITING_TIME = "msoffice:total_editing_time";
092        public static final String MSOFFICE_LAST_PRINTED = "msoffice:last_printed";
093        public static final String MSOFFICE_CREATED = "msoffice:created";
094        public static final String MSOFFICE_SAVED = "msoffice:saved";
095        public static final String MSOFFICE_PAGES = "msoffice:pages";
096        public static final String MSOFFICE_WORDS = "msoffice:words";
097        public static final String MSOFFICE_CHARACTERS = "msoffice:characters";
098        public static final String MSOFFICE_CREATING_APPLICATION = "msoffice:creating_application";
099        public static final String MSOFFICE_THUMBNAIL = "msoffice:thumbnail";
100    
101        // PowerPoint specific
102        public static final String POWERPOINT_SLIDE_NODE = "msoffice:slide";
103        public static final String SLIDE_TITLE = "msoffice:title";
104        public static final String SLIDE_TEXT = "msoffice:text";
105        public static final String SLIDE_NOTES = "msoffice:notes";
106        public static final String SLIDE_THUMBNAIL = "msoffice:thumbnail";
107    
108        // Excel specific
109        public static final String EXCEL_FULL_CONTENT = "msoffice:full_contents";
110        public static final String EXCEL_SHEET_NAME = "msoffice:sheet_name";
111    
112        /**
113         * {@inheritDoc}
114         */
115        public void sequence( InputStream stream,
116                              SequencerOutput output,
117                              SequencerContext context ) {
118    
119            MSOfficeMetadata metadata = MSOfficeMetadataReader.instance(stream);
120    
121            String mimeType = context.getMimeType();
122    
123            if (metadata != null) {
124                output.setProperty(METADATA_NODE, MSOFFICE_PRIMARY_TYPE, "msoffice:metadata");
125                output.setProperty(METADATA_NODE, MSOFFICE_TITLE, metadata.getTitle());
126                output.setProperty(METADATA_NODE, MSOFFICE_SUBJECT, metadata.getSubject());
127                output.setProperty(METADATA_NODE, MSOFFICE_AUTHOR, metadata.getAuthor());
128                output.setProperty(METADATA_NODE, MSOFFICE_KEYWORDS, metadata.getKeywords());
129                output.setProperty(METADATA_NODE, MSOFFICE_COMMENT, metadata.getComment());
130                output.setProperty(METADATA_NODE, MSOFFICE_TEMPLATE, metadata.getTemplate());
131                output.setProperty(METADATA_NODE, MSOFFICE_LAST_SAVED_BY, metadata.getLastSavedBy());
132                output.setProperty(METADATA_NODE, MSOFFICE_REVISION, metadata.getRevision());
133                output.setProperty(METADATA_NODE, MSOFFICE_TOTAL_EDITING_TIME, metadata.getTotalEditingTime());
134                output.setProperty(METADATA_NODE, MSOFFICE_LAST_PRINTED, metadata.getLastPrinted());
135                output.setProperty(METADATA_NODE, MSOFFICE_CREATED, metadata.getCreated());
136                output.setProperty(METADATA_NODE, MSOFFICE_SAVED, metadata.getSaved());
137                output.setProperty(METADATA_NODE, MSOFFICE_PAGES, metadata.getPages());
138                output.setProperty(METADATA_NODE, MSOFFICE_WORDS, metadata.getWords());
139                output.setProperty(METADATA_NODE, MSOFFICE_CHARACTERS, metadata.getCharacters());
140                output.setProperty(METADATA_NODE, MSOFFICE_CREATING_APPLICATION, metadata.getCreatingApplication());
141                output.setProperty(METADATA_NODE, MSOFFICE_THUMBNAIL, metadata.getThumbnail());
142    
143            } else {
144                return;
145            }
146    
147            // process PowerPoint specific metadata
148            if (mimeType.equals("application/vnd.ms-powerpoint")) { // replace true with check if it's ppt file being sequenced
149                try {
150                    List<SlideMetadata> ppt_metadata = PowerPointMetadataReader.instance(stream);
151                    if (ppt_metadata != null) {
152                        for (SlideMetadata sm : ppt_metadata) {
153                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_TITLE, sm.getTitle());
154                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_TEXT, sm.getText());
155                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_NOTES, sm.getNotes());
156                            output.setProperty(METADATA_NODE + "/" + POWERPOINT_SLIDE_NODE, SLIDE_THUMBNAIL, sm.getThumbnail());
157                        }
158                    }
159                } catch (IOException e) {
160                    // There was an error reading, so log and continue ...
161                    context.getLogger(this.getClass()).debug(e, "Error while extracting the PowerPoint metadata");
162                }
163            }
164    
165            if (mimeType.equals("application/vnd.ms-word")) {
166                // Sometime in the future this will sequence WORD Table of contents.
167                try {
168                    /*WordMetadata wordMetadata =*/WordMetadataReader.invoke(stream);
169                } catch (IOException e) {
170                    // There was an error reading, so log and continue ...
171                    context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
172                }
173    
174            }
175    
176            if (mimeType.equals("application/vnd.ms-excel")) {
177                try {
178                    ExcelMetadata excel_metadata = ExcelMetadataReader.instance(stream);
179                    if (excel_metadata != null) {
180                        output.setProperty(METADATA_NODE, EXCEL_FULL_CONTENT, excel_metadata.getText());
181                        for (String sheet : excel_metadata.getSheets()) {
182                            output.setProperty(METADATA_NODE, EXCEL_SHEET_NAME, sheet);
183                        }
184                    }
185                } catch (IOException e) {
186                    // There was an error reading, so log and continue ...
187                    context.getLogger(this.getClass()).debug(e, "Error while extracting the Excel metadata");
188                }
189            }
190        }
191    }