/*
 *  Copyright 2015 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.cms.content.indexing.solr;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.LocaleUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;
import org.apache.tika.exception.ZeroByteFileException;

import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper;
import org.ametys.cms.languages.LanguagesManager;
import org.ametys.cms.model.CMSDataContext;
import org.ametys.cms.search.systemprop.ContentTypeSystemProperty;
import org.ametys.cms.search.systemprop.FirstValidationSystemProperty;
import org.ametys.cms.search.systemprop.LastMajorValidationSystemProperty;
import org.ametys.cms.search.systemprop.LastModifiedSystemProperty;
import org.ametys.cms.search.systemprop.LastValidationSystemProperty;
import org.ametys.core.file.TikaProvider;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

/**
 * Solr resource indexer.<p>
 * Populate a Solr input document with the following fields:
 * <dl>
 *  <dt>id
 *  <dd>resource id
 *  <dt>type
 *  <dd>with <code>"document"</code> value
 *  <dt>full
 *  <dd>resource content
 * </dl>
 */
public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
{
    /** The avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** Maximum number of characters of the resource text kept in the excerpt field (ellipsis not included). */
    private static final int __EXCERPT_MAX_LENGTH = 200;
    
    /** The Tika instance. */
    protected Tika _tika;
    
    /** The language manager. */
    protected LanguagesManager _langManager;
    
    /** The solr indexer */
    protected SolrIndexer _solrIndexer;
    
    @Override
    public void service(ServiceManager manager) throws ServiceException
    {
        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
        _tika = tikaProvider.getTika();
        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
    }
    
    /**
     * Index a resource, with neither a language nor a pre-computed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
    {
        indexResource(resource, document, documentType, null, null);
    }
    
    /**
     * Index a resource for a given language, without a pre-computed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The query language.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
    {
        indexResource(resource, document, documentType, language, null);
    }
    
    /**
     * Index a resource with a pre-computed resource root and no language.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
    {
        indexResource(resource, document, documentType, null, resourceRoot);
    }
    
    /**
     * Index a resource: identification fields, title, dates, mime type, length, resource root,
     * ancestors, creator, content type, initial ACL values and finally the resource content.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
    {
        // Resource id
        document.addField(ID, resource.getId());
        // Type is resource
        document.addField(DOCUMENT_TYPE, documentType);
        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
        // The resource path.
        document.setField(PATH, resource.getResourcePath());
        document.addField(FILENAME, resource.getName());
        
        // Title: the file name without its last extension
        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
        // Index title like other string values (like content attributes)
        CMSDataContext context = CMSDataContext.newInstance();
        if (StringUtils.isNotEmpty(language))
        {
            context.withLocale(LocaleUtils.toLocale(language));
        }
        IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger());
        // Add sort indexation
        document.setField(TITLE_SORT, resource.getName());
        document.addField(TITLE + "_s_sort", title);
        // Add title to "full" (already added to "systemFull")
        CMSDataContext fullContext = context.cloneContext()
                                            .withIndexForFullTextField(true)
                                            .withFullTextFieldName(SolrFieldNames.FULL);
        IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext);
        
        _populateDatesOfPage(resource, document);
        
        // Mime types
        document.addField(MIME_TYPES, resource.getMimeType());
        // Length
        document.addField(LENGTH, resource.getLength());
        
        // Only compute the root when the caller did not provide it
        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
        document.addField(RESOURCE_ROOT_ID, root.getId());
        
        // Parents resource collections of the resource
        _indexAncestorIds(resource, document);
        
        // Resource author
        String author = UserIdentity.userIdentityToString(resource.getCreator());
        if (StringUtils.isNotBlank(author))
        {
            document.setField(RESOURCE_CREATOR, author);
        }
        
        // Hard-coded content type for facets.
        // TODO Move to specific "embedded mode" method?
        document.addField(ContentTypeSystemProperty.CONTENT_TYPES_SOLR_FIELD_NAME, CONTENT_TYPE_RESOURCE);
        
        // Indexation of ACL initial values
        _solrIndexer.indexAclInitValues(resource, document);
        
        indexResourceContent(resource, document, language);
    }
    
    /**
     * Populate the solr input document with dates from the resource.
     * The last modification date is also used as last validation and last major validation date,
     * and the creation date as first validation date.
     * The Dublin Core date fields are only set when the resource has a Dublin Core date.
     * @param resource The resource
     * @param document The Solr document
     */
    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
    {
        // Last modified
        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
        // For 'new' search service
        document.addField(LastModifiedSystemProperty.SOLR_FIELD_NAME, lastModifiedStr);
        // For 'old' search service
        document.addField(LastModifiedSystemProperty.SOLR_FIELD_NAME + "_dt", lastModifiedStr);
        
        // For 'new' search service => last validation, last major validation
        document.addField(LastValidationSystemProperty.SOLR_FIELD_NAME, lastModifiedStr);
        document.addField(LastMajorValidationSystemProperty.SOLR_FIELD_NAME, lastModifiedStr);
        
        // For 'new' search service => first validation
        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
        document.addField(FirstValidationSystemProperty.SOLR_FIELD_NAME, creationDateStr);
        
        // Solr facet specific : dates-facet
        // The null check has to be done on the date itself, before it is handed to the formatter
        Date date = resource.getDCDate();
        String formattedDate = date != null ? SolrIndexer.dateFormat().format(date) : null;
        if (formattedDate != null)
        {
            document.setField(RESOURCE_DATE, formattedDate);
            document.setField(DATE_FOR_SORTING, formattedDate);
            document.setField(DATES_FACET, formattedDate);
        }
    }
    
    /**
     * Index the ids of the resource collections containing the resource (closest parent first),
     * and the same list prefixed with the id of the resource itself.
     * @param resource The resource
     * @param document The Solr document
     */
    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
    {
        // Ancestors: walk up the tree as long as the parent is a resource collection
        List<String> ancestorIds = new ArrayList<>();
        AmetysObject parent = resource.getParent();
        while (parent instanceof ResourceCollection)
        {
            ancestorIds.add(parent.getId());
            parent = parent.getParent();
        }
        
        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
        
        // Ancestors and self
        List<String> ancestorAndSelfIds = new ArrayList<>(ancestorIds.size() + 1);
        ancestorAndSelfIds.add(resource.getId());
        ancestorAndSelfIds.addAll(ancestorIds);
        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
    }
    
    /**
     * Index a collection of resources: the content of every resource found in the collection
     * and, recursively, in its sub-collections, is indexed into the same document.
     * @param resourceCollection the resource collection to index. When null, nothing is done.
     * @param document The document to index into.
     * @param language The current language.
     * @throws Exception if an error occurs while indexing.
     */
    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
    {
        if (resourceCollection == null)
        {
            return;
        }
        
        for (AmetysObject object : resourceCollection.getChildren())
        {
            if (object instanceof ResourceCollection)
            {
                indexResourceCollection((ResourceCollection) object, document, language);
            }
            else if (object instanceof Resource)
            {
                indexResourceContent((Resource) object, document, language);
            }
        }
    }
    
    /**
     * Index a resource content (text in case of a document, and Dublin Core metadata).
     * This is a best-effort operation: any failure is logged and does not propagate to the caller.
     * @param resource The resource to index.
     * @param document The document to index into.
     * @param language The current language, can be null.
     */
    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
    {
        try
        {
            // The stream is opened (and closed) by _getResourceContent, there is no need to open one here
            String value = _getResourceContent(resource);
            
            indexFulltextValue(document, value, language);
            
            if (StringUtils.isNotBlank(value))
            {
                document.addField(EXCERPT, _computeExcerpt(value));
            }
            
            // The subjects are handled as nullable, like in indexDublinCoreMetadata
            String[] keywords = resource.getDCSubject();
            if (keywords != null)
            {
                for (String keyword : keywords)
                {
                    indexFulltextValue(document, keyword, language);
                }
            }
            
            String desc = resource.getDCDescription();
            if (desc != null)
            {
                indexFulltextValue(document, desc, language);
            }
            
            // DC meta
            indexDublinCoreMetadata(resource, document);
        }
        catch (Throwable e)
        {
            // Deliberately broad: a resource which cannot be parsed must not break the whole indexation
            getLogger().error("Unable to index resource at " + resource.getPath(), e);
        }
    }
    
    /**
     * Compute the excerpt of a text: the text itself when it is short enough,
     * otherwise its beginning, cut at the last space found within the limit, followed by an ellipsis.
     * @param text The non-null text to summarize
     * @return The excerpt
     */
    private static String _computeExcerpt(String text)
    {
        if (text.length() <= __EXCERPT_MAX_LENGTH)
        {
            // Short texts are kept as is, there is nothing to truncate
            return text;
        }
        
        int endIndex = text.lastIndexOf(' ', __EXCERPT_MAX_LENGTH);
        if (endIndex <= 0)
        {
            // No usable space to cut at: hard cut at the limit, without splitting a surrogate pair
            endIndex = __EXCERPT_MAX_LENGTH;
            if (Character.isHighSurrogate(text.charAt(endIndex - 1)))
            {
                endIndex--;
            }
        }
        
        return text.substring(0, endIndex) + "…";
    }
    
    /**
     * Extract the text of a resource with Tika.
     * @param resource The resource to parse
     * @return The extracted text, or an empty string when the resource is an empty file
     * @throws Exception if the resource cannot be read or parsed
     */
    private String _getResourceContent(Resource resource) throws Exception
    {
        try (InputStream is = resource.getInputStream())
        {
            return _tika.parseToString(is);
        }
        catch (ZeroByteFileException e)
        {
            // Ignore it, the file is empty, nothing to do
            return StringUtils.EMPTY;
        }
    }
    
    /**
     * Index a full-text value, both with the default full-text field name of the context
     * and with the {@link SolrFieldNames#FULL} field name.
     * @param document The document to index into.
     * @param text The text to index.
     * @param language The content language, can be null.
     */
    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
    {
        CMSDataContext context = CMSDataContext.newInstance()
                                               .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future
        if (StringUtils.isNotEmpty(language))
        {
            context.withLocale(LocaleUtils.toLocale(language));
        }
        
        // Index the document in systemFull
        IndexableElementTypeHelper.indexFulltextValue(document, text, context);
        
        // Then in full
        IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL));
    }
    
    ///////////////////////////////////////////////////////////////////////////
    
    /**
     * Index Dublin core metadata. Null values are not indexed.
     * The mime type group computed from the Dublin Core format is indexed too, when there is one.
     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
     * @param document the solr input document to populate.
     */
    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
    {
        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
        String mimeType = _getDcFormatToIndex(object);
        _indexNonNullValue(document, DC_FORMAT, mimeType);
        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
        
        // Like the other Dublin Core values the date is optional: only format it when there is one
        Date dcDate = object.getDCDate();
        if (dcDate != null)
        {
            _indexNonNullValue(document, DC_DATE, SolrIndexer.dateFormat().format(dcDate));
        }
        
        SolrResourceGroupedMimeTypes.getGroup(mimeType)
            .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
    }
    
    /**
     * Get the Dublin Core format of the object in the form to index: without parameters and in lower case.
     * @param object The object holding the Dublin Core format
     * @return The normalized format, or null when the object has no Dublin Core format
     */
    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
    {
        return Optional.of(object)
                .map(DublinCoreAwareAmetysObject::getDCFormat)
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // input format is:
                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
                // just output the part without optional parameters
                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // Types, subtypes, and parameter names are case-insensitive
                // Locale.ROOT makes the result independent from the default locale of the JVM
                .map(mimeType -> mimeType.toLowerCase(Locale.ROOT))
                .orElse(null);
    }
    
    /**
     * Add a value to a field of the document, unless the value is null.
     * @param document The document to populate
     * @param fieldName The name of the field
     * @param value The value, can be null
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }
    
    /**
     * Add each value of an array to a field of the document, unless the array is null.
     * @param document The document to populate
     * @param fieldName The name of the field
     * @param values The values, can be null
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
    {
        if (values != null)
        {
            for (String value : values)
            {
                document.addField(fieldName, value);
            }
        }
    }
    
    /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }*/
}