Source code

001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
018import java.io.InputStream;
019import java.util.ArrayList;
020import java.util.Date;
021import java.util.List;
022import java.util.Locale;
023import java.util.Optional;
024
025import org.apache.avalon.framework.component.Component;
026import org.apache.avalon.framework.service.ServiceException;
027import org.apache.avalon.framework.service.ServiceManager;
028import org.apache.avalon.framework.service.Serviceable;
029import org.apache.commons.lang3.StringUtils;
030import org.apache.solr.common.SolrInputDocument;
031import org.apache.tika.Tika;
032import org.apache.tika.exception.ZeroByteFileException;
033
034import org.ametys.cms.data.type.indexing.IndexableDataContext;
035import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper;
036import org.ametys.cms.languages.LanguagesManager;
037import org.ametys.cms.search.solr.field.FirstValidationSearchField;
038import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
039import org.ametys.cms.search.solr.field.LastModifiedSearchField;
040import org.ametys.cms.search.solr.field.LastValidationSearchField;
041import org.ametys.core.file.TikaProvider;
042import org.ametys.core.user.UserIdentity;
043import org.ametys.plugins.explorer.resources.Resource;
044import org.ametys.plugins.explorer.resources.ResourceCollection;
045import org.ametys.plugins.explorer.resources.ResourceHelper;
046import org.ametys.plugins.repository.AmetysObject;
047import org.ametys.plugins.repository.TraversableAmetysObject;
048import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
049import org.ametys.runtime.plugin.component.AbstractLogEnabled;
050
051/**
052 * Solr resource indexer.<p>
053 * Populate a Solr input document with the following fields:
054 * <dl>
055 *  <dt>id
056 *  <dd>resource id
057 *  <dt>type
058 *  <dd>with <code>"document"</code> value
059 *  <dt>full
060 *  <dd>resource content
061 * </dl>
062 */
063public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
064{
    /** The avalon role: the fully qualified name of this class. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** The Tika instance, obtained from the {@link TikaProvider} in {@link #service(ServiceManager)} and used to extract the text of resources. */
    protected Tika _tika;
    
    /** The language manager, looked up in {@link #service(ServiceManager)}. */
    protected LanguagesManager _langManager;

    /** The solr indexer, used here to index the initial ACL values of a resource. */
    protected SolrIndexer _solrIndexer;
076    
077    @Override
078    public void service(ServiceManager manager) throws ServiceException
079    {
080        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
081        _tika = tikaProvider.getTika();
082        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
083        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
084    }
085    
086    /**
087     * Index a resource.
088     * @param resource The resource to index.
089     * @param document The Solr document to index into.
090     * @param documentType The document type of the resource
091     * @throws Exception if an error occurs.
092     */
093    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
094    {
095        indexResource(resource, document, documentType, null, null);
096    }
097    
098    /**
099     * Index a resource.
100     * @param resource The resource to index.
101     * @param document The Solr document to index into.
102     * @param documentType The document type of the resource
103     * @param language The query language.
104     * @throws Exception if an error occurs.
105     */
106    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
107    {
108        indexResource(resource, document, documentType, language, null);
109    }
110    
111    /**
112     * Index a resource.
113     * @param resource The resource to index.
114     * @param document The Solr document to index into.
115     * @param documentType The document type of the resource
116     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
117     * @throws Exception if an error occurs.
118     */
119    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
120    {
121        indexResource(resource, document, documentType, null, resourceRoot);
122    }
123    
124    /**
125     * Index a resource.
126     * @param resource The resource to index.
127     * @param document The Solr document to index into.
128     * @param documentType The document type of the resource
129     * @param language The language, can be null.
130     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
131     * @throws Exception if an error occurs.
132     */
133    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
134    {
135        // Resource id
136        document.addField(ID, resource.getId());
137        // Type is resource
138        document.addField(DOCUMENT_TYPE, documentType);
139        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
140        // The resource path.
141        document.setField(PATH, resource.getResourcePath());
142        document.addField(FILENAME, resource.getName());
143        
144        // Title
145        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
146        // Index title like other string values (like content attributes)
147        IndexableDataContext context = IndexableDataContext.newInstance();
148        if (StringUtils.isNotEmpty(language))
149        {
150            context.withLocale(new Locale(language));
151        }
152        IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger());
153        // Add sort indexation
154        document.setField(TITLE_SORT, resource.getName());
155        document.addField(TITLE + "_s_sort", title);
156        // Add title to "full" (already added to "systemFull")
157        IndexableDataContext fullContext = context.cloneContext()
158                .withIndexForFullTextField(true)
159                .withFullTextFieldName(SolrFieldNames.FULL);
160        IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext);
161        
162        _populateDatesOfPage(resource, document);
163
164        // Mime types
165        document.addField(MIME_TYPES, resource.getMimeType());
166        // Length
167        document.addField(LENGTH, resource.getLength());
168        
169        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
170        document.addField(RESOURCE_ROOT_ID, root.getId());
171        
172        // Parents resource collections of the resource
173        _indexAncestorIds(resource, document);
174        
175        // Resource author
176        String author = UserIdentity.userIdentityToString(resource.getCreator());
177        if (StringUtils.isNotBlank(author))
178        {
179            document.setField(RESOURCE_CREATOR, author);
180        }
181        
182        // Hard-coded content type for facets.
183        // TODO Move to specific "embedded mode" method?
184        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
185        
186        // Indexation of ACL initial values
187        _solrIndexer.indexAclInitValues(resource, document);
188        
189        indexResourceContent(resource, document, language);
190    }
191    
192    /**
193     * Populate the solr input document with dates from the resource
194     * @param resource The resource
195     * @param document The Solr document
196     */
197    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
198    {
199        // Last modified
200        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
201        // For 'new' search service
202        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
203        // For 'old' search service
204        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
205        
206        // For 'new' search service => last validation, last major validation
207        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
208        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
209        
210        // For 'new' search service => first validation
211        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
212        document.addField(FirstValidationSearchField.NAME, creationDateStr);
213        
214        // Solr facet specific : dates-facet
215        Date date = resource.getDCDate();
216        String formattedDate = SolrIndexer.dateFormat().format(date);
217        if (formattedDate != null)
218        {
219            document.setField(RESOURCE_DATE, formattedDate);
220            document.setField(DATE_FOR_SORTING, formattedDate);
221            document.setField(DATES_FACET, formattedDate);
222        }
223    }
224    
225    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
226    {
227        // Ancestors
228        List<String> ancestorIds = new ArrayList<>();
229        AmetysObject parent = resource.getParent();
230        while (parent instanceof ResourceCollection)
231        {
232            ancestorIds.add(parent.getId());
233            parent = parent.getParent();
234        }
235        
236        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
237        
238        // Ancestors and self
239        List<String> ancestorAndSelfIds = new ArrayList<>();
240        ancestorAndSelfIds.add(resource.getId());
241        ancestorAndSelfIds.addAll(ancestorIds);
242        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
243    }
244    
245    /**
246     * Index a collection of resources.
247     * @param resourceCollection the resource collection to index.
248     * @param document The document to index into.
249     * @param language The current language.
250     * @throws Exception if an error occurs while indexing.
251     */
252    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
253    {
254        if (resourceCollection == null)
255        {
256            return;
257        }
258        
259        for (AmetysObject object : resourceCollection.getChildren())
260        {
261            if (object instanceof ResourceCollection)
262            {
263                indexResourceCollection((ResourceCollection) object, document, language);
264            }
265            else if (object instanceof Resource)
266            {
267                indexResourceContent((Resource) object, document, language);
268            }
269        }
270    }
271    
272    /**
273     * Index a resource content (text in case of a document, and Dublin Core metadata).
274     * @param resource The resource to index.
275     * @param document The document to index into.
276     * @param language The current language, can be null.
277     */
278    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
279    {
280        try (InputStream is = resource.getInputStream())
281        {
282            String value = _getResourceContent(resource);
283            
284            indexFulltextValue(document, value, language);
285            
286            if (StringUtils.isNotBlank(value))
287            {
288                int summaryEndIndex = value.lastIndexOf(' ', 200);
289                if (summaryEndIndex == -1)
290                {
291                    summaryEndIndex = value.length();
292                }
293                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
294            }
295            
296            for (String keyword : resource.getDCSubject())
297            {
298                indexFulltextValue(document, keyword, language);
299            }
300
301            String desc = resource.getDCDescription();
302            if (desc != null)
303            {
304                indexFulltextValue(document, desc, language);
305            }
306            
307            // DC meta
308            indexDublinCoreMetadata(resource, document);
309        }
310        catch (Throwable e)
311        {
312            getLogger().error("Unable to index resource at " + resource.getPath(), e);
313        }
314    }
315    
316    private String _getResourceContent(Resource resource) throws Throwable
317    {
318        try (InputStream is = resource.getInputStream())
319        {
320            return _tika.parseToString(is);
321        }
322        catch (ZeroByteFileException e)
323        {
324            // Ignore it, the file is empty, nothing to do
325            return StringUtils.EMPTY;
326        }
327        catch (Throwable e)
328        {
329            throw e;
330        }
331    }
332    
333    /**
334     * Index a full-text value.
335     * @param document The document to index into.
336     * @param text The text to index.
337     * @param language The content language, can be null.
338     */
339    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
340    {
341        IndexableDataContext context = IndexableDataContext.newInstance()
342                .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future
343        if (StringUtils.isNotEmpty(language))
344        {
345            context.withLocale(new Locale(language));
346        }
347        
348        // Index the document in systemFull
349        IndexableElementTypeHelper.indexFulltextValue(document, text, context);
350        
351        // Then in full
352        IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL));
353    }
354    
355    ///////////////////////////////////////////////////////////////////////////
356    
357    /**
358     * Index Dublin core metadata.
359     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
360     * @param document the solr input document to populate.
361     */
362    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
363    {
364        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
365        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
366        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
367        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
368        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
369        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
370        String mimeType = _getDcFormatToIndex(object);
371        _indexNonNullValue(document, DC_FORMAT, mimeType);
372        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
373        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
374        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
375        _indexNonNullValue(document, DC_DATE, SolrIndexer.dateFormat().format(object.getDCDate()));
376        
377        SolrResourceGroupedMimeTypes.getGroup(mimeType)
378                .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
379    }
380    
381    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
382    {
383        return Optional.of(object)
384                .map(DublinCoreAwareAmetysObject::getDCFormat)
385                // According to https://en.wikipedia.org/wiki/Media_type#Naming
386                // input format is:
387                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
388                // just output the part without optional parameters
389                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
390                // According to https://en.wikipedia.org/wiki/Media_type#Naming
391                // Types, subtypes, and parameter names are case-insensitive
392                .map(String::toLowerCase)
393                .orElse(null);
394    }
395    
396    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
397    {
398        if (value != null)
399        {
400            document.addField(fieldName, value);
401        }
402    }
403    
404    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
405    {
406        if (values != null)
407        {
408            for (String value : values)
409            {
410                document.addField(fieldName, value);
411            }
412        }
413    }
414    
415    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
416    {
417        if (value != null)
418        {
419            document.addField(fieldName, value);
420        }
421    }*/
422}