001/*
002 *  Copyright 2014 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.plugins.explorer.resources.metadata.parsing;
017
018import java.io.ByteArrayInputStream;
019import java.io.ByteArrayOutputStream;
020import java.io.IOException;
021import java.io.InputStream;
022import java.io.InputStreamReader;
023import java.io.Reader;
024import java.util.Date;
025import java.util.Iterator;
026
027import org.apache.commons.lang.StringUtils;
028import org.apache.jempbox.xmp.XMPMetadata;
029import org.apache.jempbox.xmp.XMPSchemaDublinCore;
030import org.apache.tika.exception.TikaException;
031import org.apache.tika.metadata.DublinCore;
032import org.apache.tika.metadata.Metadata;
033import org.apache.tika.metadata.TikaCoreProperties;
034import org.apache.tika.parser.image.xmp.JempboxExtractor;
035import org.apache.tika.parser.image.xmp.XMPPacketScanner;
036import org.xml.sax.InputSource;
037
038/**
039 * Custom {@link JempboxExtractor} extracting all Dublin Core metadata.
040 */
041public class AmetysJempboxExtractor
042{
043    // The XMP spec says it must be unicode, but for most file formats it specifies
044    // "must be encoded in UTF-8"
045    private static final String DEFAULT_XMP_CHARSET = "UTF-8";
046    
047    private static XMPPacketScanner _SCANNER = new XMPPacketScanner();
048
049    private Metadata _metadata;
050
051    /**
052     * Build a jempbox extra
053     * @param metadata The metadata concerned
054     */
055    public AmetysJempboxExtractor(Metadata metadata)
056    {
057        this._metadata = metadata;
058    }
059    
060    /**
061     * Parse the file and extract the dublin core metadata.
062     * @param stream the stream to parse.
063     * @throws IOException if an error occurred
064     * @throws TikaException if an error occurred
065     */
066    public void parse(InputStream stream) throws IOException, TikaException
067    {
068        try (ByteArrayOutputStream xmpraw = new ByteArrayOutputStream())
069        {
070            if (!_SCANNER.parse(stream, xmpraw))
071            {
072                return;
073            }
074            
075            Reader decoded = new InputStreamReader(new ByteArrayInputStream(xmpraw.toByteArray()), DEFAULT_XMP_CHARSET);
076            try
077            {
078                XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
079                XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
080                if (dc != null)
081                {
082                    _setDCTitle(dc);
083                    _setDCDescription(dc);
084                    _setDCCreator(dc);
085                    _setDCSubject(dc);
086                    // Ametys-specific
087                    _setDCContributor(dc);
088                    _setDCCoverage(dc);
089                    _setDcDates(dc);
090                    _setDCFormat(dc);
091                    _setDCLanguage(dc);
092                    _setDCPublisher(dc);
093                    _setDCRelation(dc);
094                    _setDCRights(dc);
095                    _setDCSource(dc);
096                    _setDCType(dc);
097                }
098            }
099            catch (IOException e)
100            {
101                // Could not parse embedded XMP metadata. That's not a serious
102                // problem, so we'll just ignore the issue for now.
103                // TODO: Make error handling like this configurable.
104            }
105        }    
106    }
107
108    /**
109     * Set DC Title
110     * @param dc The dublin core info
111     */
112    protected void _setDCTitle(XMPSchemaDublinCore dc)
113    {
114        if (dc.getTitle() != null)
115        {
116            _metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
117        }
118    }
119
120    /**
121     * Set DC Description
122     * @param dc The dublin core info
123     */
124    protected void _setDCDescription(XMPSchemaDublinCore dc)
125    {
126        if (dc.getDescription() != null)
127        {
128            _metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
129        }
130    }
131
132    /**
133     * Set DC Creator
134     * @param dc The dublin core info
135     */
136    protected void _setDCCreator(XMPSchemaDublinCore dc)
137    {
138        if (dc.getCreators() != null && dc.getCreators().size() > 0)
139        {
140            _metadata.set(TikaCoreProperties.CREATOR, StringUtils.join(dc.getCreators(), ", "));
141        }
142    }
143
144    /**
145     * Set DC Subject
146     * @param dc The dublin core info
147     */
148    protected void _setDCSubject(XMPSchemaDublinCore dc)
149    {
150        if (dc.getSubjects() != null && dc.getSubjects().size() > 0)
151        {
152            Iterator<String> keywords = dc.getSubjects().iterator();
153            while (keywords.hasNext())
154            {
155                _metadata.add(TikaCoreProperties.KEYWORDS, keywords.next());
156            }
157        }
158    }
159
160    /**
161     * Set DC Contributor
162     * @param dc The dublin core info
163     */
164    protected void _setDCContributor(XMPSchemaDublinCore dc)
165    {
166        
167        if (dc.getContributors() != null && !dc.getContributors().isEmpty())
168        {
169            _metadata.set(TikaCoreProperties.CONTRIBUTOR, StringUtils.join(dc.getContributors(), ", "));
170        }
171    }
172
173    /**
174     * Set DC Coverage
175     * @param dc The dublin core info
176     */
177    protected void _setDCCoverage(XMPSchemaDublinCore dc)
178    {
179        if (dc.getCoverage() != null)
180        {
181            _metadata.set(TikaCoreProperties.COVERAGE, dc.getCoverage());
182        }
183    }
184
185    /**
186     * Set DC Dates
187     * @param dc The dublin core info
188     * @throws IOException If an error occurred
189     */
190    protected void _setDcDates(XMPSchemaDublinCore dc) throws IOException
191    {
192        if (dc.getDates() != null && !dc.getDates().isEmpty())
193        {
194            Date date = dc.getDates().get(0).getTime();
195            _metadata.set(TikaCoreProperties.CREATED, date);
196            _metadata.set(DublinCore.DATE, date);
197        }
198    }
199
200    /**
201     * Set DC Format
202     * @param dc The dublin core info
203     */
204    protected void _setDCFormat(XMPSchemaDublinCore dc)
205    {
206        if (dc.getFormat() != null)
207        {
208            _metadata.set(TikaCoreProperties.FORMAT, dc.getFormat());
209        }
210    }
211
212    /**
213     * Set DC Language
214     * @param dc The dublin core info
215     */
216    protected void _setDCLanguage(XMPSchemaDublinCore dc)
217    {
218        if (dc.getLanguages() != null && !dc.getLanguages().isEmpty())
219        {
220            _metadata.set(TikaCoreProperties.LANGUAGE, StringUtils.join(dc.getLanguages(), ", "));
221        }
222    }
223
224    /**
225     * Set DC Publisher
226     * @param dc The dublin core info
227     */
228    protected void _setDCPublisher(XMPSchemaDublinCore dc)
229    {
230        if (dc.getPublishers() != null && !dc.getPublishers().isEmpty())
231        {
232            _metadata.set(TikaCoreProperties.PUBLISHER, StringUtils.join(dc.getPublishers(), ", "));
233        }
234    }
235
236    /**
237     * Set DC Relation
238     * @param dc The dublin core info
239     */
240    protected void _setDCRelation(XMPSchemaDublinCore dc)
241    {
242        if (dc.getRelationships() != null && !dc.getRelationships().isEmpty())
243        {
244            _metadata.set(TikaCoreProperties.RELATION, StringUtils.join(dc.getRelationships(), ", "));
245        }
246    }
247
248    /**
249     * Set DC Rights
250     * @param dc The dublin core info
251     */
252    protected void _setDCRights(XMPSchemaDublinCore dc)
253    {
254        if (dc.getRights() != null)
255        {
256            _metadata.set(TikaCoreProperties.RIGHTS, dc.getRights());
257        }
258    }
259
260    /**
261     * Set DC Source
262     * @param dc The dublin core info
263     */
264    protected void _setDCSource(XMPSchemaDublinCore dc)
265    {
266        if (dc.getSource() != null)
267        {
268            _metadata.set(TikaCoreProperties.SOURCE, dc.getSource());
269        }
270    }
271
272    /**
273     * Set DC Type
274     * @param dc The dublin core info
275     */
276    protected void _setDCType(XMPSchemaDublinCore dc)
277    {
278        if (dc.getTypes() != null && !dc.getTypes().isEmpty())
279        {
280            _metadata.set(TikaCoreProperties.TYPE, StringUtils.join(dc.getTypes(), ", "));
281        }
282    }
283}