001/* 002 * Copyright 2014 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.plugins.explorer.resources.metadata.parsing; 017 018import java.io.ByteArrayInputStream; 019import java.io.ByteArrayOutputStream; 020import java.io.IOException; 021import java.io.InputStream; 022import java.io.InputStreamReader; 023import java.io.Reader; 024import java.util.Date; 025import java.util.Iterator; 026 027import org.apache.commons.lang.StringUtils; 028import org.apache.jempbox.xmp.XMPMetadata; 029import org.apache.jempbox.xmp.XMPSchemaDublinCore; 030import org.apache.tika.exception.TikaException; 031import org.apache.tika.metadata.DublinCore; 032import org.apache.tika.metadata.Metadata; 033import org.apache.tika.metadata.TikaCoreProperties; 034import org.apache.tika.parser.image.xmp.JempboxExtractor; 035import org.apache.tika.parser.image.xmp.XMPPacketScanner; 036import org.xml.sax.InputSource; 037 038/** 039 * Custom {@link JempboxExtractor} extracting all Dublin Core metadata. 040 */ 041public class AmetysJempboxExtractor 042{ 043 // The XMP spec says it must be unicode, but for most file formats it specifies 044 // "must be encoded in UTF-8" 045 private static final String DEFAULT_XMP_CHARSET = "UTF-8"; 046 047 private static XMPPacketScanner _SCANNER = new XMPPacketScanner(); 048 049 private Metadata _metadata; 050 051 /** 052 * Build a jempbox extra 053 * @param metadata The metadata concerned 054 */ 055 public AmetysJempboxExtractor(Metadata metadata) 056 { 057 this._metadata = metadata; 058 } 059 060 /** 061 * Parse the file and extract the dublin core metadata. 062 * @param stream the stream to parse. 063 * @throws IOException if an error occurred 064 * @throws TikaException if an error occurred 065 */ 066 public void parse(InputStream stream) throws IOException, TikaException 067 { 068 try (ByteArrayOutputStream xmpraw = new ByteArrayOutputStream()) 069 { 070 if (!_SCANNER.parse(stream, xmpraw)) 071 { 072 return; 073 } 074 075 Reader decoded = new InputStreamReader(new ByteArrayInputStream(xmpraw.toByteArray()), DEFAULT_XMP_CHARSET); 076 try 077 { 078 XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded)); 079 XMPSchemaDublinCore dc = xmp.getDublinCoreSchema(); 080 if (dc != null) 081 { 082 _setDCTitle(dc); 083 _setDCDescription(dc); 084 _setDCCreator(dc); 085 _setDCSubject(dc); 086 // Ametys-specific 087 _setDCContributor(dc); 088 _setDCCoverage(dc); 089 _setDcDates(dc); 090 _setDCFormat(dc); 091 _setDCLanguage(dc); 092 _setDCPublisher(dc); 093 _setDCRelation(dc); 094 _setDCRights(dc); 095 _setDCSource(dc); 096 _setDCType(dc); 097 } 098 } 099 catch (IOException e) 100 { 101 // Could not parse embedded XMP metadata. That's not a serious 102 // problem, so we'll just ignore the issue for now. 103 // TODO: Make error handling like this configurable. 104 } 105 } 106 } 107 108 /** 109 * Set DC Title 110 * @param dc The dublin core info 111 */ 112 protected void _setDCTitle(XMPSchemaDublinCore dc) 113 { 114 if (dc.getTitle() != null) 115 { 116 _metadata.set(TikaCoreProperties.TITLE, dc.getTitle()); 117 } 118 } 119 120 /** 121 * Set DC Description 122 * @param dc The dublin core info 123 */ 124 protected void _setDCDescription(XMPSchemaDublinCore dc) 125 { 126 if (dc.getDescription() != null) 127 { 128 _metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription()); 129 } 130 } 131 132 /** 133 * Set DC Creator 134 * @param dc The dublin core info 135 */ 136 protected void _setDCCreator(XMPSchemaDublinCore dc) 137 { 138 if (dc.getCreators() != null && dc.getCreators().size() > 0) 139 { 140 _metadata.set(TikaCoreProperties.CREATOR, StringUtils.join(dc.getCreators(), ", ")); 141 } 142 } 143 144 /** 145 * Set DC Subject 146 * @param dc The dublin core info 147 */ 148 protected void _setDCSubject(XMPSchemaDublinCore dc) 149 { 150 if (dc.getSubjects() != null && dc.getSubjects().size() > 0) 151 { 152 Iterator<String> keywords = dc.getSubjects().iterator(); 153 while (keywords.hasNext()) 154 { 155 _metadata.add(TikaCoreProperties.KEYWORDS, keywords.next()); 156 } 157 } 158 } 159 160 /** 161 * Set DC Contributor 162 * @param dc The dublin core info 163 */ 164 protected void _setDCContributor(XMPSchemaDublinCore dc) 165 { 166 167 if (dc.getContributors() != null && !dc.getContributors().isEmpty()) 168 { 169 _metadata.set(TikaCoreProperties.CONTRIBUTOR, StringUtils.join(dc.getContributors(), ", ")); 170 } 171 } 172 173 /** 174 * Set DC Coverage 175 * @param dc The dublin core info 176 */ 177 protected void _setDCCoverage(XMPSchemaDublinCore dc) 178 { 179 if (dc.getCoverage() != null) 180 { 181 _metadata.set(TikaCoreProperties.COVERAGE, dc.getCoverage()); 182 } 183 } 184 185 /** 186 * Set DC Dates 187 * @param dc The dublin core info 188 * @throws IOException If an error occurred 189 */ 190 protected void _setDcDates(XMPSchemaDublinCore dc) throws IOException 191 { 192 if (dc.getDates() != null && !dc.getDates().isEmpty()) 193 { 194 Date date = dc.getDates().get(0).getTime(); 195 _metadata.set(TikaCoreProperties.CREATED, date); 196 _metadata.set(DublinCore.DATE, date); 197 } 198 } 199 200 /** 201 * Set DC Format 202 * @param dc The dublin core info 203 */ 204 protected void _setDCFormat(XMPSchemaDublinCore dc) 205 { 206 if (dc.getFormat() != null) 207 { 208 _metadata.set(TikaCoreProperties.FORMAT, dc.getFormat()); 209 } 210 } 211 212 /** 213 * Set DC Language 214 * @param dc The dublin core info 215 */ 216 protected void _setDCLanguage(XMPSchemaDublinCore dc) 217 { 218 if (dc.getLanguages() != null && !dc.getLanguages().isEmpty()) 219 { 220 _metadata.set(TikaCoreProperties.LANGUAGE, StringUtils.join(dc.getLanguages(), ", ")); 221 } 222 } 223 224 /** 225 * Set DC Publisher 226 * @param dc The dublin core info 227 */ 228 protected void _setDCPublisher(XMPSchemaDublinCore dc) 229 { 230 if (dc.getPublishers() != null && !dc.getPublishers().isEmpty()) 231 { 232 _metadata.set(TikaCoreProperties.PUBLISHER, StringUtils.join(dc.getPublishers(), ", ")); 233 } 234 } 235 236 /** 237 * Set DC Relation 238 * @param dc The dublin core info 239 */ 240 protected void _setDCRelation(XMPSchemaDublinCore dc) 241 { 242 if (dc.getRelationships() != null && !dc.getRelationships().isEmpty()) 243 { 244 _metadata.set(TikaCoreProperties.RELATION, StringUtils.join(dc.getRelationships(), ", ")); 245 } 246 } 247 248 /** 249 * Set DC Rights 250 * @param dc The dublin core info 251 */ 252 protected void _setDCRights(XMPSchemaDublinCore dc) 253 { 254 if (dc.getRights() != null) 255 { 256 _metadata.set(TikaCoreProperties.RIGHTS, dc.getRights()); 257 } 258 } 259 260 /** 261 * Set DC Source 262 * @param dc The dublin core info 263 */ 264 protected void _setDCSource(XMPSchemaDublinCore dc) 265 { 266 if (dc.getSource() != null) 267 { 268 _metadata.set(TikaCoreProperties.SOURCE, dc.getSource()); 269 } 270 } 271 272 /** 273 * Set DC Type 274 * @param dc The dublin core info 275 */ 276 protected void _setDCType(XMPSchemaDublinCore dc) 277 { 278 if (dc.getTypes() != null && !dc.getTypes().isEmpty()) 279 { 280 _metadata.set(TikaCoreProperties.TYPE, StringUtils.join(dc.getTypes(), ", ")); 281 } 282 } 283}