001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
018import java.io.InputStream;
019import java.util.ArrayList;
020import java.util.Date;
021import java.util.List;
022import java.util.Set;
023
024import org.apache.avalon.framework.component.Component;
025import org.apache.avalon.framework.service.ServiceException;
026import org.apache.avalon.framework.service.ServiceManager;
027import org.apache.avalon.framework.service.Serviceable;
028import org.apache.commons.lang3.StringUtils;
029import org.apache.solr.common.SolrInputDocument;
030import org.apache.tika.Tika;
031
032import org.ametys.cms.languages.LanguagesManager;
033import org.ametys.core.user.UserIdentity;
034import org.ametys.plugins.explorer.resources.Resource;
035import org.ametys.plugins.explorer.resources.ResourceCollection;
036import org.ametys.plugins.explorer.resources.ResourceHelper;
037import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
038import org.ametys.plugins.repository.AmetysObject;
039import org.ametys.plugins.repository.TraversableAmetysObject;
040import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
041import org.ametys.runtime.plugin.component.AbstractLogEnabled;
042
043/**
044 * Solr resource indexer.<p>
045 * Populate a Solr input document with the following fields:
046 * <dl>
047 *  <dt>id
048 *  <dd>resource id
049 *  <dt>type
050 *  <dd>with <code>"document"</code> value
051 *  <dt>full
052 *  <dd>resource content
053 * </dl>
054 */
055public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
056{
057    /** The avalon role. */
058    public static final String ROLE = SolrResourceIndexer.class.getName();
059    
060    /** The Tika instance. */
061    protected Tika _tika;
062    
063    /** The language manager. */
064    protected LanguagesManager _langManager;
065    
066    @Override
067    public void service(ServiceManager manager) throws ServiceException
068    {
069        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
070        _tika = tikaProvider.getTika();
071        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
072    }
073    
074    /**
075     * Index a resource.
076     * @param resource The resource to index.
077     * @param document The Solr document to index into.
078     * @param documentType The document type of the resource
079     * @throws Exception if an error occurs.
080     */
081    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
082    {
083        indexResource(resource, document, documentType, null, null);
084    }
085    
086    /**
087     * Index a resource.
088     * @param resource The resource to index.
089     * @param document The Solr document to index into.
090     * @param documentType The document type of the resource
091     * @param language The query language.
092     * @throws Exception if an error occurs.
093     */
094    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
095    {
096        indexResource(resource, document, documentType, language, null);
097    }
098    
099    /**
100     * Index a resource.
101     * @param resource The resource to index.
102     * @param document The Solr document to index into.
103     * @param documentType The document type of the resource
104     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
105     * @throws Exception if an error occurs.
106     */
107    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
108    {
109        indexResource(resource, document, documentType, null, resourceRoot);
110    }
111    
112    /**
113     * Index a resource.
114     * @param resource The resource to index.
115     * @param document The Solr document to index into.
116     * @param documentType The document type of the resource
117     * @param language The language, can be null.
118     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
119     * @throws Exception if an error occurs.
120     */
121    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
122    {
123        // Resource id - Store.YES, Index.NOT_ANALYZED
124        document.addField(ID, resource.getId());
125        // Type is resource - Store.YES, Index.NOT_ANALYZED
126        document.addField(DOCUMENT_TYPE, documentType);
127        // The resource path.
128        document.setField(PATH, resource.getResourcePath());
129        document.addField(FILENAME, resource.getName());
130        
131        // Title
132        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
133        document.addField(TITLE, title);
134        document.setField(TITLE_SORT, resource.getName());
135        document.addField(TITLE + "_s", title);
136        document.addField(TITLE + "_s_sort", title);
137        
138        // Replaces "all-not-analyzed".
139        indexFulltextValue(document, title, language);
140        
141        // Last modified
142        document.addField(LAST_MODIFIED + "_dt", SolrIndexer.dateFormat().format(resource.getLastModified()));
143
144        // Mime types - Store.YES, Index.ANALYZED
145        document.addField(MIME_TYPES, resource.getMimeType());
146        // Length - Store.YES, Index.NO
147        document.addField(LENGTH, resource.getLength());
148        
149        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
150        document.addField(RESOURCE_ROOT_ID, root.getId());
151        
152        // Parents resource collections of the resource
153        _indexAncestorIds(resource, document);
154        
155        // Solr facet specific : dates-facet
156        Date date = resource.getDCDate();
157        String formattedDate = SolrIndexer.dateFormat().format(date);
158        if (formattedDate != null)
159        {
160            document.setField(RESOURCE_DATE, formattedDate);
161            document.setField(DATE_FOR_SORTING, formattedDate);
162            document.setField(DATES_FACET, formattedDate);
163        }
164        
165        // Resource author
166        String author = UserIdentity.userIdentityToString(resource.getCreator());
167        if (StringUtils.isNotBlank(author))
168        {
169            document.setField(RESOURCE_CREATOR, author);
170        }
171        
172        // Hard-coded content type for facets.
173        // TODO Move to specific "embedded mode" method?
174        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
175        
176        // Indexation of AmetysObject property
177        document.addField(SolrFieldNames.IS_AMETYS_OBJECT, true);
178        
179        indexResourceContent(resource, document, language);
180    }
181    
182    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
183    {
184        List<String> ancestorIds = new ArrayList<>();
185        AmetysObject parent = resource.getParent();
186        while (parent instanceof ResourceCollection)
187        {
188            ancestorIds.add(parent.getId());
189            parent = parent.getParent();
190        }
191        
192        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
193    }
194    
195    /**
196     * Index a collection of resources.
197     * @param resourceCollection the resource collection to index.
198     * @param document The document to index into.
199     * @param language The current language.
200     * @throws Exception if an error occurs while indexing.
201     */
202    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
203    {
204        if (resourceCollection == null)
205        {
206            return;
207        }
208        
209        for (AmetysObject object : resourceCollection.getChildren())
210        {
211            if (object instanceof ResourceCollection)
212            {
213                indexResourceCollection((ResourceCollection) object, document, language);
214            }
215            else if (object instanceof Resource)
216            {
217                indexResourceContent((Resource) object, document, language);
218            }
219        }
220    }
221    
222    /**
223     * Index a resource content (text in case of a document, and Dublin Core metadata).
224     * @param resource The resource to index.
225     * @param document The document to index into.
226     * @param language The current language, can be null.
227     */
228    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
229    {
230        try (InputStream is = resource.getInputStream())
231        {
232            String value = _tika.parseToString(is);
233            indexFulltextValue(document, value, language);
234            
235            if (StringUtils.isNotBlank(value))
236            {
237                int summaryEndIndex = value.lastIndexOf(' ', 200);
238                if (summaryEndIndex == -1)
239                {
240                    summaryEndIndex = value.length();
241                }
242                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
243            }
244            
245            for (String keyword : resource.getDCSubject())
246            {
247                indexFulltextValue(document, keyword, language);
248            }
249
250            String desc = resource.getDCDescription();
251            if (desc != null)
252            {
253                indexFulltextValue(document, desc, language);
254            }
255            
256            // DC meta
257            indexDublinCoreMetadata(resource, document);
258        }
259        catch (Throwable e)
260        {
261            getLogger().error("Unable to index resource at " + resource.getPath(), e);
262        }
263    }
264    
265    /**
266     * Index a full-text value.
267     * @param document The document to index into.
268     * @param text The text to index.
269     * @param language The content language, can be null.
270     */
271    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
272    {
273        if (StringUtils.isNotEmpty(language))
274        {
275            SolrContentIndexer.indexFulltextValue(document, text, language);
276        }
277        else
278        {
279            Set<String> languages = _langManager.getAvailableLanguages().keySet();
280            SolrContentIndexer.indexFulltextValue(document, text, languages);
281        }
282    }
283    
284    ///////////////////////////////////////////////////////////////////////////
285    
286    /**
287     * Index Dublin core metadata.
288     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
289     * @param document the solr input document to populate.
290     */
291    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
292    {
293        _indexNonNullValue(document, "DCTitle", object.getDCTitle());
294        _indexNonNullValue(document, "DCSubject", object.getDCSubject());
295        _indexNonNullValue(document, "DCDescription", object.getDCDescription());
296        _indexNonNullValue(document, "DCContributor", object.getDCContributor());
297        _indexNonNullValue(document, "DCCoverage", object.getDCCoverage());
298        _indexNonNullValue(document, "DCCreator", object.getDCCreator());
299        _indexNonNullValue(document, "DCFormat", object.getDCFormat());
300        _indexNonNullValue(document, "DCLanguage", object.getDCLanguage());
301        _indexNonNullValue(document, "DCPublisher", object.getDCPublisher());
302        _indexNonNullValue(document, "DCRights", object.getDCRights());
303        _indexNonNullValue(document, "DCDate", SolrIndexer.dateFormat().format(object.getDCDate()));
304    }
305    
306    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
307    {
308        if (value != null)
309        {
310            document.addField(fieldName, value);
311        }
312    }
313    
314    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
315    {
316        if (values != null)
317        {
318            for (String value : values)
319            {
320                document.addField(fieldName, value);
321            }
322        }
323    }
324    
325    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
326    {
327        if (value != null)
328        {
329            document.addField(fieldName, value);
330        }
331    }*/
332}