001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
018import java.io.InputStream;
019import java.util.ArrayList;
020import java.util.Date;
021import java.util.List;
022import java.util.Optional;
023import java.util.Set;
024
025import org.apache.avalon.framework.component.Component;
026import org.apache.avalon.framework.service.ServiceException;
027import org.apache.avalon.framework.service.ServiceManager;
028import org.apache.avalon.framework.service.Serviceable;
029import org.apache.commons.lang3.StringUtils;
030import org.apache.solr.common.SolrInputDocument;
031import org.apache.tika.Tika;
032
033import org.ametys.cms.languages.LanguagesManager;
034import org.ametys.cms.search.solr.field.FirstValidationSearchField;
035import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
036import org.ametys.cms.search.solr.field.LastModifiedSearchField;
037import org.ametys.cms.search.solr.field.LastValidationSearchField;
038import org.ametys.core.user.UserIdentity;
039import org.ametys.plugins.explorer.resources.Resource;
040import org.ametys.plugins.explorer.resources.ResourceCollection;
041import org.ametys.plugins.explorer.resources.ResourceHelper;
042import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
043import org.ametys.plugins.repository.AmetysObject;
044import org.ametys.plugins.repository.TraversableAmetysObject;
045import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
046import org.ametys.runtime.plugin.component.AbstractLogEnabled;
047
048/**
049 * Solr resource indexer.<p>
050 * Populate a Solr input document with the following fields:
051 * <dl>
052 *  <dt>id
053 *  <dd>resource id
054 *  <dt>type
055 *  <dd>with <code>"document"</code> value
056 *  <dt>full
057 *  <dd>resource content
058 * </dl>
059 */
060public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
061{
062    /** The avalon role. */
063    public static final String ROLE = SolrResourceIndexer.class.getName();
064    
065    /** The Tika instance. */
066    protected Tika _tika;
067    
068    /** The language manager. */
069    protected LanguagesManager _langManager;
070
071    /** The solr indexer */
072    protected SolrIndexer _solrIndexer;
073    
074    @Override
075    public void service(ServiceManager manager) throws ServiceException
076    {
077        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
078        _tika = tikaProvider.getTika();
079        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
080        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
081    }
082    
083    /**
084     * Index a resource.
085     * @param resource The resource to index.
086     * @param document The Solr document to index into.
087     * @param documentType The document type of the resource
088     * @throws Exception if an error occurs.
089     */
090    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
091    {
092        indexResource(resource, document, documentType, null, null);
093    }
094    
095    /**
096     * Index a resource.
097     * @param resource The resource to index.
098     * @param document The Solr document to index into.
099     * @param documentType The document type of the resource
100     * @param language The query language.
101     * @throws Exception if an error occurs.
102     */
103    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
104    {
105        indexResource(resource, document, documentType, language, null);
106    }
107    
108    /**
109     * Index a resource.
110     * @param resource The resource to index.
111     * @param document The Solr document to index into.
112     * @param documentType The document type of the resource
113     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
114     * @throws Exception if an error occurs.
115     */
116    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
117    {
118        indexResource(resource, document, documentType, null, resourceRoot);
119    }
120    
121    /**
122     * Index a resource.
123     * @param resource The resource to index.
124     * @param document The Solr document to index into.
125     * @param documentType The document type of the resource
126     * @param language The language, can be null.
127     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
128     * @throws Exception if an error occurs.
129     */
130    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
131    {
132        // Resource id - Store.YES, Index.NOT_ANALYZED
133        document.addField(ID, resource.getId());
134        // Type is resource - Store.YES, Index.NOT_ANALYZED
135        document.addField(DOCUMENT_TYPE, documentType);
136        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
137        // The resource path.
138        document.setField(PATH, resource.getResourcePath());
139        document.addField(FILENAME, resource.getName());
140        
141        // Title
142        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
143        document.addField(TITLE, title);
144        document.setField(TITLE_SORT, resource.getName());
145        document.addField(TITLE + "_s", title);
146        document.addField(TITLE + "_s_sort", title);
147        
148        // Replaces "all-not-analyzed".
149        indexFulltextValue(document, title, language);
150        
151        _populateDatesOfPage(resource, document);
152
153        // Mime types - Store.YES, Index.ANALYZED
154        document.addField(MIME_TYPES, resource.getMimeType());
155        // Length - Store.YES, Index.NO
156        document.addField(LENGTH, resource.getLength());
157        
158        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
159        document.addField(RESOURCE_ROOT_ID, root.getId());
160        
161        // Parents resource collections of the resource
162        _indexAncestorIds(resource, document);
163        
164        // Resource author
165        String author = UserIdentity.userIdentityToString(resource.getCreator());
166        if (StringUtils.isNotBlank(author))
167        {
168            document.setField(RESOURCE_CREATOR, author);
169        }
170        
171        // Hard-coded content type for facets.
172        // TODO Move to specific "embedded mode" method?
173        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
174        
175        // Indexation of ACL initial values
176        _solrIndexer.indexAclInitValues(resource, document);
177        
178        indexResourceContent(resource, document, language);
179    }
180    
181    /**
182     * Populate the solr input document with dates from the resource
183     * @param resource The resource
184     * @param document The Solr document
185     */
186    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
187    {
188        // Last modified
189        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
190        // For 'new' search service
191        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
192        // For 'old' search service
193        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
194        
195        // For 'new' search service => last validation, last major validation
196        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
197        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
198        
199        // For 'new' search service => first validation
200        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
201        document.addField(FirstValidationSearchField.NAME, creationDateStr);
202        
203        // Solr facet specific : dates-facet
204        Date date = resource.getDCDate();
205        String formattedDate = SolrIndexer.dateFormat().format(date);
206        if (formattedDate != null)
207        {
208            document.setField(RESOURCE_DATE, formattedDate);
209            document.setField(DATE_FOR_SORTING, formattedDate);
210            document.setField(DATES_FACET, formattedDate);
211        }
212    }
213    
214    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
215    {
216        // Ancestors
217        List<String> ancestorIds = new ArrayList<>();
218        AmetysObject parent = resource.getParent();
219        while (parent instanceof ResourceCollection)
220        {
221            ancestorIds.add(parent.getId());
222            parent = parent.getParent();
223        }
224        
225        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
226        
227        // Ancestors and self
228        List<String> ancestorAndSelfIds = new ArrayList<>();
229        ancestorAndSelfIds.add(resource.getId());
230        ancestorAndSelfIds.addAll(ancestorIds);
231        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
232    }
233    
234    /**
235     * Index a collection of resources.
236     * @param resourceCollection the resource collection to index.
237     * @param document The document to index into.
238     * @param language The current language.
239     * @throws Exception if an error occurs while indexing.
240     */
241    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
242    {
243        if (resourceCollection == null)
244        {
245            return;
246        }
247        
248        for (AmetysObject object : resourceCollection.getChildren())
249        {
250            if (object instanceof ResourceCollection)
251            {
252                indexResourceCollection((ResourceCollection) object, document, language);
253            }
254            else if (object instanceof Resource)
255            {
256                indexResourceContent((Resource) object, document, language);
257            }
258        }
259    }
260    
261    /**
262     * Index a resource content (text in case of a document, and Dublin Core metadata).
263     * @param resource The resource to index.
264     * @param document The document to index into.
265     * @param language The current language, can be null.
266     */
267    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
268    {
269        try (InputStream is = resource.getInputStream())
270        {
271            String value = _tika.parseToString(is);
272            indexFulltextValue(document, value, language);
273            
274            if (StringUtils.isNotBlank(value))
275            {
276                int summaryEndIndex = value.lastIndexOf(' ', 200);
277                if (summaryEndIndex == -1)
278                {
279                    summaryEndIndex = value.length();
280                }
281                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
282            }
283            
284            for (String keyword : resource.getDCSubject())
285            {
286                indexFulltextValue(document, keyword, language);
287            }
288
289            String desc = resource.getDCDescription();
290            if (desc != null)
291            {
292                indexFulltextValue(document, desc, language);
293            }
294            
295            // DC meta
296            indexDublinCoreMetadata(resource, document);
297        }
298        catch (Throwable e)
299        {
300            getLogger().error("Unable to index resource at " + resource.getPath(), e);
301        }
302    }
303    
304    /**
305     * Index a full-text value.
306     * @param document The document to index into.
307     * @param text The text to index.
308     * @param language The content language, can be null.
309     */
310    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
311    {
312        if (StringUtils.isNotEmpty(language))
313        {
314            SolrContentIndexer.indexFulltextValue(document, text, language);
315        }
316        else
317        {
318            Set<String> languages = _langManager.getAvailableLanguages().keySet();
319            SolrContentIndexer.indexFulltextValue(document, text, languages);
320        }
321    }
322    
323    ///////////////////////////////////////////////////////////////////////////
324    
325    /**
326     * Index Dublin core metadata.
327     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
328     * @param document the solr input document to populate.
329     */
330    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
331    {
332        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
333        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
334        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
335        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
336        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
337        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
338        String mimeType = _getDcFormatToIndex(object);
339        _indexNonNullValue(document, DC_FORMAT, mimeType);
340        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
341        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
342        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
343        _indexNonNullValue(document, DC_DATE, SolrIndexer.dateFormat().format(object.getDCDate()));
344        
345        SolrResourceGroupedMimeTypes.getGroup(mimeType)
346                .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
347    }
348    
349    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
350    {
351        return Optional.of(object)
352                .map(DublinCoreAwareAmetysObject::getDCFormat)
353                // According to https://en.wikipedia.org/wiki/Media_type#Naming
354                // input format is:
355                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
356                // just output the part without optional parameters
357                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
358                // According to https://en.wikipedia.org/wiki/Media_type#Naming
359                // Types, subtypes, and parameter names are case-insensitive
360                .map(String::toLowerCase)
361                .orElse(null);
362    }
363    
364    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
365    {
366        if (value != null)
367        {
368            document.addField(fieldName, value);
369        }
370    }
371    
372    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
373    {
374        if (values != null)
375        {
376            for (String value : values)
377            {
378                document.addField(fieldName, value);
379            }
380        }
381    }
382    
383    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
384    {
385        if (value != null)
386        {
387            document.addField(fieldName, value);
388        }
389    }*/
390}