001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
018import java.io.InputStream;
019import java.util.Date;
020import java.util.Set;
021
022import org.apache.avalon.framework.component.Component;
023import org.apache.avalon.framework.service.ServiceException;
024import org.apache.avalon.framework.service.ServiceManager;
025import org.apache.avalon.framework.service.Serviceable;
026import org.apache.commons.lang3.StringUtils;
027import org.apache.solr.common.SolrInputDocument;
028import org.apache.tika.Tika;
029
030import org.ametys.cms.languages.LanguagesManager;
031import org.ametys.core.user.UserIdentity;
032import org.ametys.plugins.explorer.resources.Resource;
033import org.ametys.plugins.explorer.resources.ResourceCollection;
034import org.ametys.plugins.explorer.resources.ResourceHelper;
035import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
036import org.ametys.plugins.repository.AmetysObject;
037import org.ametys.plugins.repository.TraversableAmetysObject;
038import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
039import org.ametys.runtime.plugin.component.AbstractLogEnabled;
040
041/**
042 * Solr resource indexer.<p>
043 * Populate a Solr input document with the following fields:
044 * <dl>
045 *  <dt>id
046 *  <dd>resource id
047 *  <dt>type
048 *  <dd>with <code>"document"</code> value
049 *  <dt>full
050 *  <dd>resource content
051 * </dl>
052 */
053public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
054{
055    /** The avalon role. */
056    public static final String ROLE = SolrResourceIndexer.class.getName();
057    
058    /** The Tika instance. */
059    protected Tika _tika;
060    
061    /** The language manager. */
062    protected LanguagesManager _langManager;
063    
064    @Override
065    public void service(ServiceManager manager) throws ServiceException
066    {
067        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
068        _tika = tikaProvider.getTika();
069        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
070    }
071    
072    /**
073     * Index a resource.
074     * @param resource The resource to index.
075     * @param document The Solr document to index into.
076     * @param documentType The document type of the resource
077     * @throws Exception if an error occurs.
078     */
079    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
080    {
081        indexResource(resource, document, documentType, null, null);
082    }
083    
084    /**
085     * Index a resource.
086     * @param resource The resource to index.
087     * @param document The Solr document to index into.
088     * @param documentType The document type of the resource
089     * @param language The query language.
090     * @throws Exception if an error occurs.
091     */
092    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
093    {
094        indexResource(resource, document, documentType, language, null);
095    }
096    
097    /**
098     * Index a resource.
099     * @param resource The resource to index.
100     * @param document The Solr document to index into.
101     * @param documentType The document type of the resource
102     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
103     * @throws Exception if an error occurs.
104     */
105    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
106    {
107        indexResource(resource, document, documentType, null, resourceRoot);
108    }
109    
110    /**
111     * Index a resource.
112     * @param resource The resource to index.
113     * @param document The Solr document to index into.
114     * @param documentType The document type of the resource
115     * @param language The language, can be null.
116     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
117     * @throws Exception if an error occurs.
118     */
119    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
120    {
121        // Resource id - Store.YES, Index.NOT_ANALYZED
122        document.addField(ID, resource.getId());
123        // Type is resource - Store.YES, Index.NOT_ANALYZED
124        document.addField(DOCUMENT_TYPE, documentType);
125        // The resource path.
126        document.setField(PATH, resource.getResourcePath());
127        document.addField(FILENAME, resource.getName());
128        
129        // Title
130        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
131        document.addField(TITLE, title);
132        document.setField(TITLE_SORT, resource.getName());
133        document.addField(TITLE + "_s", title);
134        document.addField(TITLE + "_s_sort", title);
135        
136        // Replaces "all-not-analyzed".
137        indexFulltextValue(document, title, language);
138        
139        // Last modified
140        document.addField(LAST_MODIFIED + "_dt", SolrIndexer.dateFormat().format(resource.getLastModified()));
141
142        // Mime types - Store.YES, Index.ANALYZED
143        document.addField(MIME_TYPES, resource.getMimeType());
144        // Length - Store.YES, Index.NO
145        document.addField(LENGTH, resource.getLength());
146        
147        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
148        document.addField(RESOURCE_ROOT_ID, root.getId());
149        
150        // Solr facet specific : dates-facet
151        Date date = resource.getDCDate();
152        String formattedDate = SolrIndexer.dateFormat().format(date);
153        if (formattedDate != null)
154        {
155            document.setField(RESOURCE_DATE, formattedDate);
156            document.setField(DATE_FOR_SORTING, formattedDate);
157            document.setField(DATES_FACET, formattedDate);
158        }
159        
160        // Resource author
161        String author = UserIdentity.userIdentityToString(resource.getCreator());
162        if (StringUtils.isNotBlank(author))
163        {
164            document.setField(RESOURCE_CREATOR, author);
165        }
166        
167        // Hard-coded content type for facets.
168        // TODO Move to specific "embedded mode" method?
169        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
170        
171        indexResourceContent(resource, document, language);
172    }
173    
174    /**
175     * Index a collection of resources.
176     * @param resourceCollection the resource collection to index.
177     * @param document The document to index into.
178     * @param language The current language.
179     * @throws Exception if an error occurs while indexing.
180     */
181    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
182    {
183        if (resourceCollection == null)
184        {
185            return;
186        }
187        
188        for (AmetysObject object : resourceCollection.getChildren())
189        {
190            if (object instanceof ResourceCollection)
191            {
192                indexResourceCollection((ResourceCollection) object, document, language);
193            }
194            else if (object instanceof Resource)
195            {
196                indexResourceContent((Resource) object, document, language);
197            }
198        }
199    }
200    
201    /**
202     * Index a resource content (text in case of a document, and Dublin Core metadata).
203     * @param resource The resource to index.
204     * @param document The document to index into.
205     * @param language The current language, can be null.
206     */
207    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
208    {
209        try (InputStream is = resource.getInputStream())
210        {
211            String value = _tika.parseToString(is);
212            indexFulltextValue(document, value, language);
213            
214            if (StringUtils.isNotBlank(value))
215            {
216                int summaryEndIndex = value.lastIndexOf(' ', 200);
217                if (summaryEndIndex == -1)
218                {
219                    summaryEndIndex = value.length();
220                }
221                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
222            }
223            
224            for (String keyword : resource.getDCSubject())
225            {
226                indexFulltextValue(document, keyword, language);
227            }
228
229            String desc = resource.getDCDescription();
230            if (desc != null)
231            {
232                indexFulltextValue(document, desc, language);
233            }
234            
235            // DC meta
236            indexDublinCoreMetadata(resource, document);
237        }
238        catch (Throwable e)
239        {
240            getLogger().error("Unable to index resource at " + resource.getPath(), e);
241        }
242    }
243    
244    /**
245     * Index a full-text value.
246     * @param document The document to index into.
247     * @param text The text to index.
248     * @param language The content language, can be null.
249     */
250    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
251    {
252        if (StringUtils.isNotEmpty(language))
253        {
254            SolrContentIndexer.indexFulltextValue(document, text, language);
255        }
256        else
257        {
258            Set<String> languages = _langManager.getAvailableLanguages().keySet();
259            SolrContentIndexer.indexFulltextValue(document, text, languages);
260        }
261    }
262    
263    ///////////////////////////////////////////////////////////////////////////
264    
265    /**
266     * Index Dublin core metadata.
267     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
268     * @param document the solr input document to populate.
269     */
270    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
271    {
272        _indexNonNullValue(document, "DCTitle", object.getDCTitle());
273        _indexNonNullValue(document, "DCSubject", object.getDCSubject());
274        _indexNonNullValue(document, "DCDescription", object.getDCDescription());
275        _indexNonNullValue(document, "DCContributor", object.getDCContributor());
276        _indexNonNullValue(document, "DCCoverage", object.getDCCoverage());
277        _indexNonNullValue(document, "DCCreator", object.getDCCreator());
278        _indexNonNullValue(document, "DCFormat", object.getDCFormat());
279        _indexNonNullValue(document, "DCLanguage", object.getDCLanguage());
280        _indexNonNullValue(document, "DCPublisher", object.getDCPublisher());
281        _indexNonNullValue(document, "DCRights", object.getDCRights());
282        _indexNonNullValue(document, "DCDate", SolrIndexer.dateFormat().format(object.getDCDate()));
283    }
284    
285    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
286    {
287        if (value != null)
288        {
289            document.addField(fieldName, value);
290        }
291    }
292    
293    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
294    {
295        if (values != null)
296        {
297            for (String value : values)
298            {
299                document.addField(fieldName, value);
300            }
301        }
302    }
303    
304    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
305    {
306        if (value != null)
307        {
308            document.addField(fieldName, value);
309        }
310    }
311    
312}