001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
018import java.io.InputStream;
019import java.util.ArrayList;
020import java.util.Date;
021import java.util.List;
022import java.util.Locale;
023import java.util.Optional;
024
025import org.apache.avalon.framework.component.Component;
026import org.apache.avalon.framework.service.ServiceException;
027import org.apache.avalon.framework.service.ServiceManager;
028import org.apache.avalon.framework.service.Serviceable;
029import org.apache.commons.lang3.StringUtils;
030import org.apache.solr.common.SolrInputDocument;
031import org.apache.tika.Tika;
032
033import org.ametys.cms.data.type.indexing.IndexableDataContext;
034import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper;
035import org.ametys.cms.languages.LanguagesManager;
036import org.ametys.cms.search.solr.field.FirstValidationSearchField;
037import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
038import org.ametys.cms.search.solr.field.LastModifiedSearchField;
039import org.ametys.cms.search.solr.field.LastValidationSearchField;
040import org.ametys.core.user.UserIdentity;
041import org.ametys.plugins.explorer.resources.Resource;
042import org.ametys.plugins.explorer.resources.ResourceCollection;
043import org.ametys.plugins.explorer.resources.ResourceHelper;
044import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
045import org.ametys.plugins.repository.AmetysObject;
046import org.ametys.plugins.repository.TraversableAmetysObject;
047import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
048import org.ametys.runtime.plugin.component.AbstractLogEnabled;
049
/**
 * Solr resource indexer.<p>
 * Populates a Solr input document with, among others, the following fields:
 * <dl>
 *  <dt>id
 *  <dd>the resource id
 *  <dt>type
 *  <dd>the document type given by the caller (typically <code>"document"</code>)
 *  <dt>full
 *  <dd>the resource content
 * </dl>
 */
062public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
063{
    /** The Avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** The Tika instance, used to extract the text of the resources. */
    protected Tika _tika;
    
    /** The languages manager. */
    protected LanguagesManager _langManager;

    /** The Solr indexer, used to index the initial ACL values of the resources. */
    protected SolrIndexer _solrIndexer;
075    
076    @Override
077    public void service(ServiceManager manager) throws ServiceException
078    {
079        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
080        _tika = tikaProvider.getTika();
081        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
082        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
083    }
084    
085    /**
086     * Index a resource.
087     * @param resource The resource to index.
088     * @param document The Solr document to index into.
089     * @param documentType The document type of the resource
090     * @throws Exception if an error occurs.
091     */
092    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
093    {
094        indexResource(resource, document, documentType, null, null);
095    }
096    
097    /**
098     * Index a resource.
099     * @param resource The resource to index.
100     * @param document The Solr document to index into.
101     * @param documentType The document type of the resource
102     * @param language The query language.
103     * @throws Exception if an error occurs.
104     */
105    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
106    {
107        indexResource(resource, document, documentType, language, null);
108    }
109    
110    /**
111     * Index a resource.
112     * @param resource The resource to index.
113     * @param document The Solr document to index into.
114     * @param documentType The document type of the resource
115     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
116     * @throws Exception if an error occurs.
117     */
118    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
119    {
120        indexResource(resource, document, documentType, null, resourceRoot);
121    }
122    
123    /**
124     * Index a resource.
125     * @param resource The resource to index.
126     * @param document The Solr document to index into.
127     * @param documentType The document type of the resource
128     * @param language The language, can be null.
129     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
130     * @throws Exception if an error occurs.
131     */
132    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
133    {
134        // Resource id
135        document.addField(ID, resource.getId());
136        // Type is resource
137        document.addField(DOCUMENT_TYPE, documentType);
138        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
139        // The resource path.
140        document.setField(PATH, resource.getResourcePath());
141        document.addField(FILENAME, resource.getName());
142        
143        // Title
144        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
145        // Index title like other string values (like content attributes)
146        IndexableDataContext context = IndexableDataContext.newInstance();
147        if (StringUtils.isNotEmpty(language))
148        {
149            context.withLocale(new Locale(language));
150        }
151        IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger());
152        // Add sort indexation
153        document.setField(TITLE_SORT, resource.getName());
154        document.addField(TITLE + "_s_sort", title);
155        // Add title to "full" (already added to "systemFull")
156        IndexableDataContext fullContext = context.cloneContext()
157                .withIndexForFullTextField(true)
158                .withFullTextFieldName(SolrFieldNames.FULL);
159        IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext);
160        
161        _populateDatesOfPage(resource, document);
162
163        // Mime types
164        document.addField(MIME_TYPES, resource.getMimeType());
165        // Length
166        document.addField(LENGTH, resource.getLength());
167        
168        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
169        document.addField(RESOURCE_ROOT_ID, root.getId());
170        
171        // Parents resource collections of the resource
172        _indexAncestorIds(resource, document);
173        
174        // Resource author
175        String author = UserIdentity.userIdentityToString(resource.getCreator());
176        if (StringUtils.isNotBlank(author))
177        {
178            document.setField(RESOURCE_CREATOR, author);
179        }
180        
181        // Hard-coded content type for facets.
182        // TODO Move to specific "embedded mode" method?
183        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
184        
185        // Indexation of ACL initial values
186        _solrIndexer.indexAclInitValues(resource, document);
187        
188        indexResourceContent(resource, document, language);
189    }
190    
191    /**
192     * Populate the solr input document with dates from the resource
193     * @param resource The resource
194     * @param document The Solr document
195     */
196    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
197    {
198        // Last modified
199        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
200        // For 'new' search service
201        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
202        // For 'old' search service
203        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
204        
205        // For 'new' search service => last validation, last major validation
206        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
207        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
208        
209        // For 'new' search service => first validation
210        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
211        document.addField(FirstValidationSearchField.NAME, creationDateStr);
212        
213        // Solr facet specific : dates-facet
214        Date date = resource.getDCDate();
215        String formattedDate = SolrIndexer.dateFormat().format(date);
216        if (formattedDate != null)
217        {
218            document.setField(RESOURCE_DATE, formattedDate);
219            document.setField(DATE_FOR_SORTING, formattedDate);
220            document.setField(DATES_FACET, formattedDate);
221        }
222    }
223    
224    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
225    {
226        // Ancestors
227        List<String> ancestorIds = new ArrayList<>();
228        AmetysObject parent = resource.getParent();
229        while (parent instanceof ResourceCollection)
230        {
231            ancestorIds.add(parent.getId());
232            parent = parent.getParent();
233        }
234        
235        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
236        
237        // Ancestors and self
238        List<String> ancestorAndSelfIds = new ArrayList<>();
239        ancestorAndSelfIds.add(resource.getId());
240        ancestorAndSelfIds.addAll(ancestorIds);
241        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
242    }
243    
244    /**
245     * Index a collection of resources.
246     * @param resourceCollection the resource collection to index.
247     * @param document The document to index into.
248     * @param language The current language.
249     * @throws Exception if an error occurs while indexing.
250     */
251    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
252    {
253        if (resourceCollection == null)
254        {
255            return;
256        }
257        
258        for (AmetysObject object : resourceCollection.getChildren())
259        {
260            if (object instanceof ResourceCollection)
261            {
262                indexResourceCollection((ResourceCollection) object, document, language);
263            }
264            else if (object instanceof Resource)
265            {
266                indexResourceContent((Resource) object, document, language);
267            }
268        }
269    }
270    
271    /**
272     * Index a resource content (text in case of a document, and Dublin Core metadata).
273     * @param resource The resource to index.
274     * @param document The document to index into.
275     * @param language The current language, can be null.
276     */
277    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
278    {
279        try (InputStream is = resource.getInputStream())
280        {
281            String value = _tika.parseToString(is);
282            indexFulltextValue(document, value, language);
283            
284            if (StringUtils.isNotBlank(value))
285            {
286                int summaryEndIndex = value.lastIndexOf(' ', 200);
287                if (summaryEndIndex == -1)
288                {
289                    summaryEndIndex = value.length();
290                }
291                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
292            }
293            
294            for (String keyword : resource.getDCSubject())
295            {
296                indexFulltextValue(document, keyword, language);
297            }
298
299            String desc = resource.getDCDescription();
300            if (desc != null)
301            {
302                indexFulltextValue(document, desc, language);
303            }
304            
305            // DC meta
306            indexDublinCoreMetadata(resource, document);
307        }
308        catch (Throwable e)
309        {
310            getLogger().error("Unable to index resource at " + resource.getPath(), e);
311        }
312    }
313    
314    /**
315     * Index a full-text value.
316     * @param document The document to index into.
317     * @param text The text to index.
318     * @param language The content language, can be null.
319     */
320    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
321    {
322        IndexableDataContext context = IndexableDataContext.newInstance()
323                .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future
324        if (StringUtils.isNotEmpty(language))
325        {
326            context.withLocale(new Locale(language));
327        }
328        
329        // Index the document in systemFull
330        IndexableElementTypeHelper.indexFulltextValue(document, text, context);
331        
332        // Then in full
333        IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL));
334    }
335    
336    ///////////////////////////////////////////////////////////////////////////
337    
338    /**
339     * Index Dublin core metadata.
340     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
341     * @param document the solr input document to populate.
342     */
343    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
344    {
345        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
346        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
347        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
348        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
349        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
350        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
351        String mimeType = _getDcFormatToIndex(object);
352        _indexNonNullValue(document, DC_FORMAT, mimeType);
353        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
354        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
355        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
356        _indexNonNullValue(document, DC_DATE, SolrIndexer.dateFormat().format(object.getDCDate()));
357        
358        SolrResourceGroupedMimeTypes.getGroup(mimeType)
359                .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
360    }
361    
362    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
363    {
364        return Optional.of(object)
365                .map(DublinCoreAwareAmetysObject::getDCFormat)
366                // According to https://en.wikipedia.org/wiki/Media_type#Naming
367                // input format is:
368                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
369                // just output the part without optional parameters
370                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
371                // According to https://en.wikipedia.org/wiki/Media_type#Naming
372                // Types, subtypes, and parameter names are case-insensitive
373                .map(String::toLowerCase)
374                .orElse(null);
375    }
376    
377    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
378    {
379        if (value != null)
380        {
381            document.addField(fieldName, value);
382        }
383    }
384    
385    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
386    {
387        if (values != null)
388        {
389            for (String value : values)
390            {
391                document.addField(fieldName, value);
392            }
393        }
394    }
395    
396    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
397    {
398        if (value != null)
399        {
400            document.addField(fieldName, value);
401        }
402    }*/
403}