001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.LocaleUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;
import org.apache.tika.exception.ZeroByteFileException;

import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper;
import org.ametys.cms.languages.LanguagesManager;
import org.ametys.cms.model.CMSDataContext;
import org.ametys.cms.search.systemprop.ContentTypeSystemProperty;
import org.ametys.cms.search.systemprop.FirstValidationSystemProperty;
import org.ametys.cms.search.systemprop.LastMajorValidationSystemProperty;
import org.ametys.cms.search.systemprop.LastModifiedSystemProperty;
import org.ametys.cms.search.systemprop.LastValidationSystemProperty;
import org.ametys.core.file.TikaProvider;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;
051
052/**
053 * Solr resource indexer.<p>
054 * Populate a Solr input document with the following fields:
055 * <dl>
056 *  <dt>id
057 *  <dd>resource id
058 *  <dt>type
059 *  <dd>with <code>"document"</code> value
060 *  <dt>full
061 *  <dd>resource content
062 * </dl>
063 */
064public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
065{
066    /** The avalon role. */
067    public static final String ROLE = SolrResourceIndexer.class.getName();
068    
069    /** The Tika instance. */
070    protected Tika _tika;
071    
072    /** The language manager. */
073    protected LanguagesManager _langManager;
074
075    /** The solr indexer */
076    protected SolrIndexer _solrIndexer;
077    
078    @Override
079    public void service(ServiceManager manager) throws ServiceException
080    {
081        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
082        _tika = tikaProvider.getTika();
083        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
084        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
085    }
086    
087    /**
088     * Index a resource.
089     * @param resource The resource to index.
090     * @param document The Solr document to index into.
091     * @param documentType The document type of the resource
092     * @throws Exception if an error occurs.
093     */
094    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
095    {
096        indexResource(resource, document, documentType, null, null);
097    }
098    
099    /**
100     * Index a resource.
101     * @param resource The resource to index.
102     * @param document The Solr document to index into.
103     * @param documentType The document type of the resource
104     * @param language The query language.
105     * @throws Exception if an error occurs.
106     */
107    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
108    {
109        indexResource(resource, document, documentType, language, null);
110    }
111    
112    /**
113     * Index a resource.
114     * @param resource The resource to index.
115     * @param document The Solr document to index into.
116     * @param documentType The document type of the resource
117     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
118     * @throws Exception if an error occurs.
119     */
120    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
121    {
122        indexResource(resource, document, documentType, null, resourceRoot);
123    }
124    
125    /**
126     * Index a resource.
127     * @param resource The resource to index.
128     * @param document The Solr document to index into.
129     * @param documentType The document type of the resource
130     * @param language The language, can be null.
131     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
132     * @throws Exception if an error occurs.
133     */
134    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
135    {
136        // Resource id
137        document.addField(ID, resource.getId());
138        // Type is resource
139        document.addField(DOCUMENT_TYPE, documentType);
140        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
141        // The resource path.
142        document.setField(PATH, resource.getResourcePath());
143        document.addField(FILENAME, resource.getName());
144        
145        // Title
146        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
147        // Index title like other string values (like content attributes)
148        CMSDataContext context = CMSDataContext.newInstance();
149        if (StringUtils.isNotEmpty(language))
150        {
151            context.withLocale(LocaleUtils.toLocale(language));
152        }
153        IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger());
154        // Add sort indexation
155        document.setField(TITLE_SORT, resource.getName());
156        document.addField(TITLE + "_s_sort", title);
157        // Add title to "full" (already added to "systemFull")
158        CMSDataContext fullContext = context.cloneContext()
159                                            .withIndexForFullTextField(true)
160                                            .withFullTextFieldName(SolrFieldNames.FULL);
161        IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext);
162        
163        _populateDatesOfPage(resource, document);
164
165        // Mime types
166        document.addField(MIME_TYPES, resource.getMimeType());
167        // Length
168        document.addField(LENGTH, resource.getLength());
169        
170        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
171        document.addField(RESOURCE_ROOT_ID, root.getId());
172        
173        // Parents resource collections of the resource
174        _indexAncestorIds(resource, document);
175        
176        // Resource author
177        String author = UserIdentity.userIdentityToString(resource.getCreator());
178        if (StringUtils.isNotBlank(author))
179        {
180            document.setField(RESOURCE_CREATOR, author);
181        }
182        
183        // Hard-coded content type for facets.
184        // TODO Move to specific "embedded mode" method?
185        document.addField(ContentTypeSystemProperty.CONTENT_TYPES_SOLR_FIELD_NAME, CONTENT_TYPE_RESOURCE);
186        
187        // Indexation of ACL initial values
188        _solrIndexer.indexAclInitValues(resource, document);
189        
190        indexResourceContent(resource, document, language);
191    }
192    
193    /**
194     * Populate the solr input document with dates from the resource
195     * @param resource The resource
196     * @param document The Solr document
197     */
198    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
199    {
200        // Last modified
201        _getFormattedDate(resource.getLastModified()).ifPresent(
202            lastModified ->
203            {
204                // For 'new' search service
205                document.addField(LastModifiedSystemProperty.SOLR_FIELD_NAME, lastModified);
206                // For 'old' search service
207                document.addField(LastModifiedSystemProperty.SOLR_FIELD_NAME + "_dt", lastModified);
208                
209                // For 'new' search service => last validation, last major validation
210                document.addField(LastValidationSystemProperty.SOLR_FIELD_NAME, lastModified);
211                document.addField(LastMajorValidationSystemProperty.SOLR_FIELD_NAME, lastModified);
212            }
213        );
214        
215        // For 'new' search service => first validation
216        _getFormattedDate(resource.getCreationDate()).ifPresent(
217            creationDate ->
218            {
219                // For 'new' search service
220                document.addField(FirstValidationSystemProperty.SOLR_FIELD_NAME, creationDate);
221            }
222        );
223        
224        // Solr facet specific : dates-facet
225        _getFormattedDate(resource.getDCDate()).ifPresent(
226            formattedDate ->
227            {
228                document.setField(RESOURCE_DATE, formattedDate);
229                document.setField(DATE_FOR_SORTING, formattedDate);
230                document.setField(DATES_FACET, formattedDate);
231            }
232        );
233    }
234    
235    private Optional<String> _getFormattedDate(Date date)
236    {
237        return Optional.ofNullable(date)
238                .map(SolrIndexer.dateFormat()::format);
239    }
240    
241    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
242    {
243        // Ancestors
244        List<String> ancestorIds = new ArrayList<>();
245        AmetysObject parent = resource.getParent();
246        while (parent instanceof ResourceCollection)
247        {
248            ancestorIds.add(parent.getId());
249            parent = parent.getParent();
250        }
251        
252        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
253        
254        // Ancestors and self
255        List<String> ancestorAndSelfIds = new ArrayList<>();
256        ancestorAndSelfIds.add(resource.getId());
257        ancestorAndSelfIds.addAll(ancestorIds);
258        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
259    }
260    
261    /**
262     * Index a collection of resources.
263     * @param resourceCollection the resource collection to index.
264     * @param document The document to index into.
265     * @param language The current language.
266     * @throws Exception if an error occurs while indexing.
267     */
268    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
269    {
270        if (resourceCollection == null)
271        {
272            return;
273        }
274        
275        for (AmetysObject object : resourceCollection.getChildren())
276        {
277            if (object instanceof ResourceCollection)
278            {
279                indexResourceCollection((ResourceCollection) object, document, language);
280            }
281            else if (object instanceof Resource)
282            {
283                indexResourceContent((Resource) object, document, language);
284            }
285        }
286    }
287    
288    /**
289     * Index a resource content (text in case of a document, and Dublin Core metadata).
290     * @param resource The resource to index.
291     * @param document The document to index into.
292     * @param language The current language, can be null.
293     */
294    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
295    {
296        try (InputStream is = resource.getInputStream())
297        {
298            String value = _getResourceContent(resource);
299            
300            indexFulltextValue(document, value, language);
301            
302            if (StringUtils.isNotBlank(value))
303            {
304                int summaryEndIndex = value.lastIndexOf(' ', 200);
305                if (summaryEndIndex == -1)
306                {
307                    summaryEndIndex = value.length();
308                }
309                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
310            }
311
312            String[] dcSubject = resource.getDCSubject();
313            if (dcSubject != null)
314            {
315                for (String keyword : dcSubject)
316                {
317                    indexFulltextValue(document, keyword, language);
318                }
319            }
320
321            String desc = resource.getDCDescription();
322            if (desc != null)
323            {
324                indexFulltextValue(document, desc, language);
325            }
326            
327            // DC meta
328            indexDublinCoreMetadata(resource, document);
329        }
330        catch (Throwable e)
331        {
332            getLogger().error("Unable to index resource at " + resource.getPath(), e);
333        }
334    }
335    
336    private String _getResourceContent(Resource resource) throws Throwable
337    {
338        try (InputStream is = resource.getInputStream())
339        {
340            return _tika.parseToString(is);
341        }
342        catch (ZeroByteFileException e)
343        {
344            // Ignore it, the file is empty, nothing to do
345            return StringUtils.EMPTY;
346        }
347        catch (Throwable e)
348        {
349            throw e;
350        }
351    }
352    
353    /**
354     * Index a full-text value.
355     * @param document The document to index into.
356     * @param text The text to index.
357     * @param language The content language, can be null.
358     */
359    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
360    {
361        CMSDataContext context = CMSDataContext.newInstance()
362                .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future
363        if (StringUtils.isNotEmpty(language))
364        {
365            context.withLocale(LocaleUtils.toLocale(language));
366        }
367        
368        // Index the document in systemFull
369        IndexableElementTypeHelper.indexFulltextValue(document, text, context);
370        
371        // Then in full
372        IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL));
373    }
374    
375    ///////////////////////////////////////////////////////////////////////////
376    
377    /**
378     * Index Dublin core metadata.
379     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
380     * @param document the solr input document to populate.
381     */
382    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
383    {
384        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
385        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
386        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
387        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
388        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
389        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
390        String mimeType = _getDcFormatToIndex(object);
391        _indexNonNullValue(document, DC_FORMAT, mimeType);
392        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
393        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
394        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
395        _indexNonNullValue(document, DC_DATE, _getFormattedDate(object.getDCDate()).orElse(null));
396        
397        SolrResourceGroupedMimeTypes.getGroup(mimeType)
398                .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
399    }
400    
401    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
402    {
403        return Optional.of(object)
404                .map(DublinCoreAwareAmetysObject::getDCFormat)
405                // According to https://en.wikipedia.org/wiki/Media_type#Naming
406                // input format is:
407                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
408                // just output the part without optional parameters
409                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
410                // According to https://en.wikipedia.org/wiki/Media_type#Naming
411                // Types, subtypes, and parameter names are case-insensitive
412                .map(String::toLowerCase)
413                .orElse(null);
414    }
415    
416    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
417    {
418        if (value != null)
419        {
420            document.addField(fieldName, value);
421        }
422    }
423    
424    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
425    {
426        if (values != null)
427        {
428            for (String value : values)
429            {
430                document.addField(fieldName, value);
431            }
432        }
433    }
434    
435    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
436    {
437        if (value != null)
438        {
439            document.addField(fieldName, value);
440        }
441    }*/
442}