/*
 *  Copyright 2015 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.cms.content.indexing.solr;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.LocaleUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;
import org.apache.tika.exception.ZeroByteFileException;

import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper;
import org.ametys.cms.languages.LanguagesManager;
import org.ametys.cms.model.CMSDataContext;
import org.ametys.cms.search.systemprop.ContentTypeSystemProperty;
import org.ametys.cms.search.systemprop.FirstValidationSystemProperty;
import org.ametys.cms.search.systemprop.LastMajorValidationSystemProperty;
import org.ametys.cms.search.systemprop.LastModifiedSystemProperty;
import org.ametys.cms.search.systemprop.LastValidationSystemProperty;
import org.ametys.core.file.TikaProvider;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

/**
 * Solr resource indexer.<p>
 * Populate a Solr input document with the following fields:
 * <dl>
 *  <dt>id
 *  <dd>resource id
 *  <dt>type
 *  <dd>with <code>"document"</code> value
 *  <dt>full
 *  <dd>resource content
 * </dl>
 */
public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
{
    /** The avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** Maximum number of characters of the extracted text kept in the excerpt field (ellipsis excluded). */
    private static final int __EXCERPT_MAX_LENGTH = 200;
    
    /** The Tika instance. */
    protected Tika _tika;
    
    /** The language manager. */
    protected LanguagesManager _langManager;
    
    /** The solr indexer */
    protected SolrIndexer _solrIndexer;
    
    @Override
    public void service(ServiceManager manager) throws ServiceException
    {
        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
        _tika = tikaProvider.getTika();
        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
    }
    
    /**
     * Index a resource, without language and without a pre-computed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
    {
        indexResource(resource, document, documentType, null, null);
    }
    
    /**
     * Index a resource, without a pre-computed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The query language.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
    {
        indexResource(resource, document, documentType, language, null);
    }
    
    /**
     * Index a resource, without language.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
    {
        indexResource(resource, document, documentType, null, resourceRoot);
    }
    
    /**
     * Index a resource: identifiers, path, title, dates, mime type, length, resource root,
     * ancestors, creator, ACL initial values and finally the resource content.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
    {
        // Resource id
        document.addField(ID, resource.getId());
        // Type is resource
        document.addField(DOCUMENT_TYPE, documentType);
        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
        // The resource path.
        document.setField(PATH, resource.getResourcePath());
        document.addField(FILENAME, resource.getName());
        
        // Title: the file name without its last extension
        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
        // Index title like other string values (like content attributes)
        CMSDataContext context = CMSDataContext.newInstance();
        if (StringUtils.isNotEmpty(language))
        {
            context.withLocale(LocaleUtils.toLocale(language));
        }
        IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger());
        // Add sort indexation
        document.setField(TITLE_SORT, resource.getName());
        document.addField(TITLE + "_s_sort", title);
        // Add title to "full" (already added to "systemFull")
        CMSDataContext fullContext = context.cloneContext()
                                            .withIndexForFullTextField(true)
                                            .withFullTextFieldName(SolrFieldNames.FULL);
        IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext);
        
        _populateDatesOfPage(resource, document);
        
        // Mime types
        document.addField(MIME_TYPES, resource.getMimeType());
        // Length
        document.addField(LENGTH, resource.getLength());
        
        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
        document.addField(RESOURCE_ROOT_ID, root.getId());
        
        // Parents resource collections of the resource
        _indexAncestorIds(resource, document);
        
        // Resource author
        String author = UserIdentity.userIdentityToString(resource.getCreator());
        if (StringUtils.isNotBlank(author))
        {
            document.setField(RESOURCE_CREATOR, author);
        }
        
        // Hard-coded content type for facets.
        // TODO Move to specific "embedded mode" method?
        document.addField(ContentTypeSystemProperty.CONTENT_TYPES_SOLR_FIELD_NAME, CONTENT_TYPE_RESOURCE);
        
        // Indexation of ACL initial values
        _solrIndexer.indexAclInitValues(resource, document);
        
        indexResourceContent(resource, document, language);
    }
    
    /**
     * Populate the solr input document with dates from the resource.
     * Dates that are <code>null</code> on the resource are simply not indexed.
     * @param resource The resource
     * @param document The Solr document
     */
    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
    {
        // Last modified
        _getFormattedDate(resource.getLastModified()).ifPresent(
            lastModified -> 
            {
                // For 'new' search service
                document.addField(LastModifiedSystemProperty.SOLR_FIELD_NAME, lastModified);
                // For 'old' search service
                document.addField(LastModifiedSystemProperty.SOLR_FIELD_NAME + "_dt", lastModified);
                
                // For 'new' search service => last validation, last major validation
                document.addField(LastValidationSystemProperty.SOLR_FIELD_NAME, lastModified);
                document.addField(LastMajorValidationSystemProperty.SOLR_FIELD_NAME, lastModified);
            }
        );
        
        // For 'new' search service => first validation
        _getFormattedDate(resource.getCreationDate()).ifPresent(
            creationDate -> 
            {
                // For 'new' search service
                document.addField(FirstValidationSystemProperty.SOLR_FIELD_NAME, creationDate);
            }
        );
        
        // Solr facet specific : dates-facet
        _getFormattedDate(resource.getDCDate()).ifPresent(
            formattedDate -> 
            {
                document.setField(RESOURCE_DATE, formattedDate);
                document.setField(DATE_FOR_SORTING, formattedDate);
                document.setField(DATES_FACET, formattedDate);
            }
        );
    }
    
    /**
     * Format a date with the format provided by {@link SolrIndexer#dateFormat()}.
     * @param date The date to format, can be null.
     * @return The formatted date, or an empty optional if the date is null.
     */
    private Optional<String> _getFormattedDate(Date date)
    {
        return Optional.ofNullable(date)
                       .map(SolrIndexer.dateFormat()::format);
    }
    
    /**
     * Index the ids of the resource collections containing the resource
     * (closest parent first), with and without the id of the resource itself.
     * @param resource The resource.
     * @param document The Solr document to index into.
     */
    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
    {
        // Ancestors: walk up while the parent is still a resource collection
        List<String> ancestorIds = new ArrayList<>();
        AmetysObject parent = resource.getParent();
        while (parent instanceof ResourceCollection)
        {
            ancestorIds.add(parent.getId());
            parent = parent.getParent();
        }
        
        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
        
        // Ancestors and self
        List<String> ancestorAndSelfIds = new ArrayList<>();
        ancestorAndSelfIds.add(resource.getId());
        ancestorAndSelfIds.addAll(ancestorIds);
        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
    }
    
    /**
     * Index a collection of resources: the content of every resource found
     * (recursively) under the collection is indexed into the same document.
     * @param resourceCollection the resource collection to index. When null, nothing is done.
     * @param document The document to index into.
     * @param language The current language.
     * @throws Exception if an error occurs while indexing.
     */
    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
    {
        if (resourceCollection == null)
        {
            return;
        }
        
        for (AmetysObject object : resourceCollection.getChildren())
        {
            if (object instanceof ResourceCollection)
            {
                indexResourceCollection((ResourceCollection) object, document, language);
            }
            else if (object instanceof Resource)
            {
                indexResourceContent((Resource) object, document, language);
            }
        }
    }
    
    /**
     * Index a resource content (text in case of a document, and Dublin Core metadata).
     * Any failure is logged and does not propagate to the caller.
     * @param resource The resource to index.
     * @param document The document to index into.
     * @param language The current language, can be null.
     */
    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
    {
        try
        {
            // The stream is opened (and closed) by _getResourceContent: no need to open one here
            String value = _getResourceContent(resource);
            
            indexFulltextValue(document, value, language);
            
            if (StringUtils.isNotBlank(value))
            {
                document.addField(EXCERPT, _computeExcerpt(value));
            }
            
            String[] dcSubject = resource.getDCSubject();
            if (dcSubject != null)
            {
                for (String keyword : dcSubject)
                {
                    indexFulltextValue(document, keyword, language);
                }
            }
            
            String desc = resource.getDCDescription();
            if (desc != null)
            {
                indexFulltextValue(document, desc, language);
            }
            
            // DC meta
            indexDublinCoreMetadata(resource, document);
        }
        catch (Throwable e)
        {
            // Best effort: a resource that cannot be parsed must not break the whole indexation
            getLogger().error("Unable to index resource at " + resource.getPath(), e);
        }
    }
    
    /**
     * Compute the excerpt of an extracted text.
     * A text of at most {@value #__EXCERPT_MAX_LENGTH} characters is returned unchanged.
     * A longer text is cut at the last space found at or before that limit (or at the limit
     * itself when there is no such space) and an ellipsis is appended.
     * @param text The extracted text. Cannot be null.
     * @return The excerpt.
     */
    private static String _computeExcerpt(String text)
    {
        if (text.length() <= __EXCERPT_MAX_LENGTH)
        {
            // Short text: keep it whole, there is nothing to summarize
            return text;
        }
        
        int endIndex = text.lastIndexOf(' ', __EXCERPT_MAX_LENGTH);
        if (endIndex == -1)
        {
            // No word boundary in range: hard cut, without splitting a surrogate pair
            endIndex = __EXCERPT_MAX_LENGTH;
            if (Character.isHighSurrogate(text.charAt(endIndex - 1)))
            {
                endIndex--;
            }
        }
        
        return text.substring(0, endIndex) + "…";
    }
    
    /**
     * Extract the text of a resource with Tika.
     * @param resource The resource to parse.
     * @return The extracted text, or an empty string if the file is empty.
     * @throws Exception if the resource cannot be read or parsed.
     */
    private String _getResourceContent(Resource resource) throws Exception
    {
        try (InputStream is = resource.getInputStream())
        {
            return _tika.parseToString(is);
        }
        catch (ZeroByteFileException e)
        {
            // Ignore it, the file is empty, nothing to do
            return StringUtils.EMPTY;
        }
    }
    
    /**
     * Index a full-text value, both in the default full-text field and in {@link SolrFieldNames#FULL}.
     * @param document The document to index into.
     * @param text The text to index.
     * @param language The content language, can be null.
     */
    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
    {
        CMSDataContext context = CMSDataContext.newInstance()
                                               .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future
        if (StringUtils.isNotEmpty(language))
        {
            context.withLocale(LocaleUtils.toLocale(language));
        }
        
        // Index the document in systemFull
        IndexableElementTypeHelper.indexFulltextValue(document, text, context);
        
        // Then in full
        IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL));
    }
    
    ///////////////////////////////////////////////////////////////////////////
    
    /**
     * Index Dublin core metadata. Only non-null values are indexed.
     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
     * @param document the solr input document to populate.
     */
    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
    {
        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
        String mimeType = _getDcFormatToIndex(object);
        _indexNonNullValue(document, DC_FORMAT, mimeType);
        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
        _indexNonNullValue(document, DC_DATE, _getFormattedDate(object.getDCDate()).orElse(null));
        
        SolrResourceGroupedMimeTypes.getGroup(mimeType)
            .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
    }
    
    /**
     * Get the normalized Dublin Core format to index: parameters are removed and the value is lower-cased.
     * @param object The object holding the Dublin Core format.
     * @return The normalized format, or null if the object has no Dublin Core format.
     */
    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
    {
        return Optional.of(object)
                .map(DublinCoreAwareAmetysObject::getDCFormat)
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // input format is:
                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
                // just output the part without optional parameters
                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // Types, subtypes, and parameter names are case-insensitive
                // Locale.ROOT: the result must not depend on the default locale of the JVM (e.g. Turkish dotless i)
                .map(mimeType -> mimeType.toLowerCase(Locale.ROOT))
                .orElse(null);
    }
    
    /**
     * Add a single value to a field of the document, unless the value is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param value The value, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }
    
    /**
     * Add each value of an array to a field of the document, unless the array is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param values The values, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
    {
        if (values != null)
        {
            for (String value : values)
            {
                document.addField(fieldName, value);
            }
        }
    }
}