001/*
002 *  Copyright 2015 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.content.indexing.solr;
017
018import java.io.InputStream;
019import java.util.ArrayList;
020import java.util.Date;
021import java.util.List;
022import java.util.Set;
023
024import org.apache.avalon.framework.component.Component;
025import org.apache.avalon.framework.service.ServiceException;
026import org.apache.avalon.framework.service.ServiceManager;
027import org.apache.avalon.framework.service.Serviceable;
028import org.apache.commons.lang3.StringUtils;
029import org.apache.solr.common.SolrInputDocument;
030import org.apache.tika.Tika;
031
032import org.ametys.cms.languages.LanguagesManager;
033import org.ametys.cms.search.solr.field.FirstValidationSearchField;
034import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
035import org.ametys.cms.search.solr.field.LastModifiedSearchField;
036import org.ametys.cms.search.solr.field.LastValidationSearchField;
037import org.ametys.core.user.UserIdentity;
038import org.ametys.plugins.explorer.resources.Resource;
039import org.ametys.plugins.explorer.resources.ResourceCollection;
040import org.ametys.plugins.explorer.resources.ResourceHelper;
041import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
042import org.ametys.plugins.repository.AmetysObject;
043import org.ametys.plugins.repository.TraversableAmetysObject;
044import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
045import org.ametys.runtime.plugin.component.AbstractLogEnabled;
046
047/**
048 * Solr resource indexer.<p>
049 * Populate a Solr input document with the following fields:
050 * <dl>
051 *  <dt>id
052 *  <dd>resource id
053 *  <dt>type
054 *  <dd>with <code>"document"</code> value
055 *  <dt>full
056 *  <dd>resource content
057 * </dl>
058 */
059public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
060{
061    /** The avalon role. */
062    public static final String ROLE = SolrResourceIndexer.class.getName();
063    
064    /** The Tika instance. */
065    protected Tika _tika;
066    
067    /** The language manager. */
068    protected LanguagesManager _langManager;
069
070    /** The solr indexer */
071    protected SolrIndexer _solrIndexer;
072    
073    @Override
074    public void service(ServiceManager manager) throws ServiceException
075    {
076        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
077        _tika = tikaProvider.getTika();
078        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
079        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
080    }
081    
082    /**
083     * Index a resource.
084     * @param resource The resource to index.
085     * @param document The Solr document to index into.
086     * @param documentType The document type of the resource
087     * @throws Exception if an error occurs.
088     */
089    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
090    {
091        indexResource(resource, document, documentType, null, null);
092    }
093    
094    /**
095     * Index a resource.
096     * @param resource The resource to index.
097     * @param document The Solr document to index into.
098     * @param documentType The document type of the resource
099     * @param language The query language.
100     * @throws Exception if an error occurs.
101     */
102    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
103    {
104        indexResource(resource, document, documentType, language, null);
105    }
106    
107    /**
108     * Index a resource.
109     * @param resource The resource to index.
110     * @param document The Solr document to index into.
111     * @param documentType The document type of the resource
112     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
113     * @throws Exception if an error occurs.
114     */
115    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
116    {
117        indexResource(resource, document, documentType, null, resourceRoot);
118    }
119    
120    /**
121     * Index a resource.
122     * @param resource The resource to index.
123     * @param document The Solr document to index into.
124     * @param documentType The document type of the resource
125     * @param language The language, can be null.
126     * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 
127     * @throws Exception if an error occurs.
128     */
129    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
130    {
131        // Resource id - Store.YES, Index.NOT_ANALYZED
132        document.addField(ID, resource.getId());
133        // Type is resource - Store.YES, Index.NOT_ANALYZED
134        document.addField(DOCUMENT_TYPE, documentType);
135        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
136        // The resource path.
137        document.setField(PATH, resource.getResourcePath());
138        document.addField(FILENAME, resource.getName());
139        
140        // Title
141        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
142        document.addField(TITLE, title);
143        document.setField(TITLE_SORT, resource.getName());
144        document.addField(TITLE + "_s", title);
145        document.addField(TITLE + "_s_sort", title);
146        
147        // Replaces "all-not-analyzed".
148        indexFulltextValue(document, title, language);
149        
150        _populateDatesOfPage(resource, document);
151
152        // Mime types - Store.YES, Index.ANALYZED
153        document.addField(MIME_TYPES, resource.getMimeType());
154        // Length - Store.YES, Index.NO
155        document.addField(LENGTH, resource.getLength());
156        
157        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
158        document.addField(RESOURCE_ROOT_ID, root.getId());
159        
160        // Parents resource collections of the resource
161        _indexAncestorIds(resource, document);
162        
163        // Resource author
164        String author = UserIdentity.userIdentityToString(resource.getCreator());
165        if (StringUtils.isNotBlank(author))
166        {
167            document.setField(RESOURCE_CREATOR, author);
168        }
169        
170        // Hard-coded content type for facets.
171        // TODO Move to specific "embedded mode" method?
172        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
173        
174        // Indexation of ACL initial values
175        _solrIndexer.indexAclInitValues(resource, document);
176        
177        indexResourceContent(resource, document, language);
178    }
179    
180    /**
181     * Populate the solr input document with dates from the resource
182     * @param resource The resource
183     * @param document The Solr document
184     */
185    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
186    {
187        // Last modified
188        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
189        // For 'new' search service
190        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
191        // For 'old' search service
192        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
193        
194        // For 'new' search service => last validation, last major validation
195        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
196        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
197        
198        // For 'new' search service => first validation
199        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
200        document.addField(FirstValidationSearchField.NAME, creationDateStr);
201        
202        // Solr facet specific : dates-facet
203        Date date = resource.getDCDate();
204        String formattedDate = SolrIndexer.dateFormat().format(date);
205        if (formattedDate != null)
206        {
207            document.setField(RESOURCE_DATE, formattedDate);
208            document.setField(DATE_FOR_SORTING, formattedDate);
209            document.setField(DATES_FACET, formattedDate);
210        }
211    }
212    
213    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
214    {
215        // Ancestors
216        List<String> ancestorIds = new ArrayList<>();
217        AmetysObject parent = resource.getParent();
218        while (parent instanceof ResourceCollection)
219        {
220            ancestorIds.add(parent.getId());
221            parent = parent.getParent();
222        }
223        
224        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
225        
226        // Ancestors and self
227        List<String> ancestorAndSelfIds = new ArrayList<>();
228        ancestorAndSelfIds.add(resource.getId());
229        ancestorAndSelfIds.addAll(ancestorIds);
230        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
231    }
232    
233    /**
234     * Index a collection of resources.
235     * @param resourceCollection the resource collection to index.
236     * @param document The document to index into.
237     * @param language The current language.
238     * @throws Exception if an error occurs while indexing.
239     */
240    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
241    {
242        if (resourceCollection == null)
243        {
244            return;
245        }
246        
247        for (AmetysObject object : resourceCollection.getChildren())
248        {
249            if (object instanceof ResourceCollection)
250            {
251                indexResourceCollection((ResourceCollection) object, document, language);
252            }
253            else if (object instanceof Resource)
254            {
255                indexResourceContent((Resource) object, document, language);
256            }
257        }
258    }
259    
260    /**
261     * Index a resource content (text in case of a document, and Dublin Core metadata).
262     * @param resource The resource to index.
263     * @param document The document to index into.
264     * @param language The current language, can be null.
265     */
266    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
267    {
268        try (InputStream is = resource.getInputStream())
269        {
270            String value = _tika.parseToString(is);
271            indexFulltextValue(document, value, language);
272            
273            if (StringUtils.isNotBlank(value))
274            {
275                int summaryEndIndex = value.lastIndexOf(' ', 200);
276                if (summaryEndIndex == -1)
277                {
278                    summaryEndIndex = value.length();
279                }
280                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
281            }
282            
283            for (String keyword : resource.getDCSubject())
284            {
285                indexFulltextValue(document, keyword, language);
286            }
287
288            String desc = resource.getDCDescription();
289            if (desc != null)
290            {
291                indexFulltextValue(document, desc, language);
292            }
293            
294            // DC meta
295            indexDublinCoreMetadata(resource, document);
296        }
297        catch (Throwable e)
298        {
299            getLogger().error("Unable to index resource at " + resource.getPath(), e);
300        }
301    }
302    
303    /**
304     * Index a full-text value.
305     * @param document The document to index into.
306     * @param text The text to index.
307     * @param language The content language, can be null.
308     */
309    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
310    {
311        if (StringUtils.isNotEmpty(language))
312        {
313            SolrContentIndexer.indexFulltextValue(document, text, language);
314        }
315        else
316        {
317            Set<String> languages = _langManager.getAvailableLanguages().keySet();
318            SolrContentIndexer.indexFulltextValue(document, text, languages);
319        }
320    }
321    
322    ///////////////////////////////////////////////////////////////////////////
323    
324    /**
325     * Index Dublin core metadata.
326     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
327     * @param document the solr input document to populate.
328     */
329    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
330    {
331        _indexNonNullValue(document, "DCTitle", object.getDCTitle());
332        _indexNonNullValue(document, "DCSubject", object.getDCSubject());
333        _indexNonNullValue(document, "DCDescription", object.getDCDescription());
334        _indexNonNullValue(document, "DCContributor", object.getDCContributor());
335        _indexNonNullValue(document, "DCCoverage", object.getDCCoverage());
336        _indexNonNullValue(document, "DCCreator", object.getDCCreator());
337        _indexNonNullValue(document, "DCFormat", object.getDCFormat());
338        _indexNonNullValue(document, "DCLanguage", object.getDCLanguage());
339        _indexNonNullValue(document, "DCPublisher", object.getDCPublisher());
340        _indexNonNullValue(document, "DCRights", object.getDCRights());
341        _indexNonNullValue(document, "DCDate", SolrIndexer.dateFormat().format(object.getDCDate()));
342    }
343    
344    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
345    {
346        if (value != null)
347        {
348            document.addField(fieldName, value);
349        }
350    }
351    
352    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
353    {
354        if (values != null)
355        {
356            for (String value : values)
357            {
358                document.addField(fieldName, value);
359            }
360        }
361    }
362    
363    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
364    {
365        if (value != null)
366        {
367            document.addField(fieldName, value);
368        }
369    }*/
370}