/*
 *  Copyright 2015 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.cms.content.indexing.solr;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;

import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper;
import org.ametys.cms.languages.LanguagesManager;
import org.ametys.cms.search.solr.field.FirstValidationSearchField;
import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
import org.ametys.cms.search.solr.field.LastModifiedSearchField;
import org.ametys.cms.search.solr.field.LastValidationSearchField;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.model.type.DataContext;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

/**
 * Solr resource indexer.<p>
 * Populate a Solr input document with the following fields:
 * <dl>
 *  <dt>id
 *  <dd>resource id
 *  <dt>type
 *  <dd>the document type given by the caller
 *  <dt>full
 *  <dd>resource content
 * </dl>
 * Further fields (path, title, dates, mime type, length, ancestors, creator
 * and Dublin Core metadata) are added as well, see
 * {@link #indexResource(Resource, SolrInputDocument, String, String, TraversableAmetysObject)}.
 */
public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
{
    /** The avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** The Tika instance. */
    protected Tika _tika;
    
    /** The language manager. */
    protected LanguagesManager _langManager;
    
    /** The solr indexer */
    protected SolrIndexer _solrIndexer;
    
    /**
     * Look up the components this indexer relies on.
     * @param manager The service manager.
     * @throws ServiceException if a component cannot be looked up.
     */
    @Override
    public void service(ServiceManager manager) throws ServiceException
    {
        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
        _tika = tikaProvider.getTika();
        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
    }
    
    /**
     * Index a resource, without language and without a known resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
    {
        indexResource(resource, document, documentType, null, null);
    }
    
    /**
     * Index a resource, without a known resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
    {
        indexResource(resource, document, documentType, language, null);
    }
    
    /**
     * Index a resource, without language.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
    {
        indexResource(resource, document, documentType, null, resourceRoot);
    }
    
    /**
     * Index a resource: identification, title, dates, mime type, length, root and ancestors,
     * creator, ACL initial values and finally the resource content.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
    {
        // Resource id - Store.YES, Index.NOT_ANALYZED
        document.addField(ID, resource.getId());
        // Type is resource - Store.YES, Index.NOT_ANALYZED
        document.addField(DOCUMENT_TYPE, documentType);
        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
        // The resource path.
        document.setField(PATH, resource.getResourcePath());
        document.addField(FILENAME, resource.getName());
        
        // Title: the file name without its last extension
        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
        document.addField(TITLE, title);
        // Note: the sort field uses the full file name, extension included
        document.setField(TITLE_SORT, resource.getName());
        document.addField(TITLE + "_s", title);
        document.addField(TITLE + "_s_sort", title);
        document.addField(TITLE + "_s_lower", title.toLowerCase());
        
        // Replaces "all-not-analyzed".
        indexFulltextValue(document, title, language);
        
        _populateDatesOfPage(resource, document);
        
        // Mime types - Store.YES, Index.ANALYZED
        document.addField(MIME_TYPES, resource.getMimeType());
        // Length - Store.YES, Index.NO
        document.addField(LENGTH, resource.getLength());
        
        // Only compute the root when the caller did not provide it
        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
        document.addField(RESOURCE_ROOT_ID, root.getId());
        
        // Parents resource collections of the resource
        _indexAncestorIds(resource, document);
        
        // Resource author
        String author = UserIdentity.userIdentityToString(resource.getCreator());
        if (StringUtils.isNotBlank(author))
        {
            document.setField(RESOURCE_CREATOR, author);
        }
        
        // Hard-coded content type for facets.
        // TODO Move to specific "embedded mode" method?
        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
        
        // Indexation of ACL initial values
        _solrIndexer.indexAclInitValues(resource, document);
        
        indexResourceContent(resource, document, language);
    }
    
    /**
     * Populate the solr input document with dates from the resource.
     * The last modification date is also used as last validation and last major validation date,
     * and the creation date as first validation date. The Dublin Core date, when the resource
     * has one, feeds the resource date, sorting date and dates facet fields.
     * @param resource The resource
     * @param document The Solr document
     */
    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
    {
        // Last modified
        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
        // For 'new' search service
        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
        // For 'old' search service
        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
        
        // For 'new' search service => last validation, last major validation
        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
        
        // For 'new' search service => first validation
        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
        document.addField(FirstValidationSearchField.NAME, creationDateStr);
        
        // Solr facet specific : dates-facet
        // The DC date is checked BEFORE formatting: formatting a null date would throw
        // instead of simply skipping these optional fields.
        String formattedDate = _formatDate(resource.getDCDate());
        if (formattedDate != null)
        {
            document.setField(RESOURCE_DATE, formattedDate);
            document.setField(DATE_FOR_SORTING, formattedDate);
            document.setField(DATES_FACET, formattedDate);
        }
    }
    
    /**
     * Index the ids of the resource collections containing the resource (closest parent first),
     * then the same list preceded by the id of the resource itself.
     * @param resource The resource
     * @param document The Solr document
     */
    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
    {
        // Ancestors: walk up while the parent is still a resource collection
        List<String> ancestorIds = new ArrayList<>();
        AmetysObject parent = resource.getParent();
        while (parent instanceof ResourceCollection)
        {
            ancestorIds.add(parent.getId());
            parent = parent.getParent();
        }
        
        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
        
        // Ancestors and self
        List<String> ancestorAndSelfIds = new ArrayList<>();
        ancestorAndSelfIds.add(resource.getId());
        ancestorAndSelfIds.addAll(ancestorIds);
        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
    }
    
    /**
     * Index a collection of resources: the content of every resource found in the collection
     * and, recursively, in its sub-collections is indexed into the same document.
     * @param resourceCollection the resource collection to index. When null, nothing is done.
     * @param document The document to index into.
     * @param language The current language.
     * @throws Exception if an error occurs while indexing.
     */
    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
    {
        if (resourceCollection == null)
        {
            return;
        }
        
        for (AmetysObject object : resourceCollection.getChildren())
        {
            if (object instanceof ResourceCollection)
            {
                indexResourceCollection((ResourceCollection) object, document, language);
            }
            else if (object instanceof Resource)
            {
                indexResourceContent((Resource) object, document, language);
            }
        }
    }
    
    /**
     * Index a resource content (text in case of a document, and Dublin Core metadata).
     * This is a best-effort operation: any failure is logged and not propagated.
     * @param resource The resource to index.
     * @param document The document to index into.
     * @param language The current language, can be null.
     */
    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
    {
        try (InputStream is = resource.getInputStream())
        {
            String value = _tika.parseToString(is);
            indexFulltextValue(document, value, language);
            
            if (StringUtils.isNotBlank(value))
            {
                // Cut the excerpt at the last space found at or before index 200.
                // When there is no such space, the whole text is used as excerpt.
                int summaryEndIndex = value.lastIndexOf(' ', 200);
                if (summaryEndIndex == -1)
                {
                    summaryEndIndex = value.length();
                }
                // Append an ellipsis only when the text was actually truncated
                document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : ""));
            }
            
            // The subject array is null-checked, consistently with the Dublin Core indexing below:
            // a missing subject must not prevent the description and DC metadata from being indexed.
            String[] keywords = resource.getDCSubject();
            if (keywords != null)
            {
                for (String keyword : keywords)
                {
                    indexFulltextValue(document, keyword, language);
                }
            }
            
            String desc = resource.getDCDescription();
            if (desc != null)
            {
                indexFulltextValue(document, desc, language);
            }
            
            // DC meta
            indexDublinCoreMetadata(resource, document);
        }
        catch (Throwable e)
        {
            // Deliberately broad: a resource that cannot be read or parsed is logged and skipped
            getLogger().error("Unable to index resource at " + resource.getPath(), e);
        }
    }
    
    /**
     * Index a full-text value.
     * @param document The document to index into.
     * @param text The text to index.
     * @param language The content language, can be null.
     */
    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
    {
        DataContext context = DataContext.newInstance();
        // The locale is only set when a language is provided
        if (StringUtils.isNotEmpty(language))
        {
            context.withLocale(new Locale(language));
        }
        
        IndexableElementTypeHelper.indexFulltextValue(document, text, context);
    }
    
    ///////////////////////////////////////////////////////////////////////////
    
    /**
     * Index Dublin core metadata. Only the metadata having a value are indexed.
     * The format is indexed without its optional parameters and in lower case, and is
     * also used to index the group of mime types it belongs to, if any.
     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
     * @param document the solr input document to populate.
     */
    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
    {
        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
        String mimeType = _getDcFormatToIndex(object);
        _indexNonNullValue(document, DC_FORMAT, mimeType);
        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
        // A missing DC date is skipped like any other missing metadata, instead of
        // failing while formatting and aborting the mime type group indexing below
        _indexNonNullValue(document, DC_DATE, _formatDate(object.getDCDate()));
        
        SolrResourceGroupedMimeTypes.getGroup(mimeType)
            .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
    }
    
    /**
     * Compute the value to index for the Dublin Core format.
     * @param object the object holding Dublin Core metadata.
     * @return the format without parameters and in lower case, or null if the object has no format.
     */
    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
    {
        return Optional.ofNullable(object.getDCFormat())
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // input format is:
                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
                // just output the part without optional parameters
                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // Types, subtypes, and parameter names are case-insensitive.
                // Locale.ROOT keeps the result independent from the JVM default locale
                // (with a Turkish locale, "I" would otherwise become a dotless "ı").
                .map(mimeType -> mimeType.toLowerCase(Locale.ROOT))
                .orElse(null);
    }
    
    /**
     * Format a date with the Solr date format, tolerating a missing date.
     * @param date The date to format, can be null.
     * @return the formatted date, or null if there is no date.
     */
    private static String _formatDate(Date date)
    {
        return date != null ? SolrIndexer.dateFormat().format(date) : null;
    }
    
    /**
     * Add a value to a field of the document, unless the value is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param value The value, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }
    
    /**
     * Add each value of an array to a field of the document, unless the array is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param values The values, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
    {
        if (values != null)
        {
            for (String value : values)
            {
                document.addField(fieldName, value);
            }
        }
    }
}