001/* 002 * Copyright 2015 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.cms.content.indexing.solr; 017 018import java.io.InputStream; 019import java.util.ArrayList; 020import java.util.Date; 021import java.util.List; 022import java.util.Locale; 023import java.util.Optional; 024 025import org.apache.avalon.framework.component.Component; 026import org.apache.avalon.framework.service.ServiceException; 027import org.apache.avalon.framework.service.ServiceManager; 028import org.apache.avalon.framework.service.Serviceable; 029import org.apache.commons.lang3.StringUtils; 030import org.apache.solr.common.SolrInputDocument; 031import org.apache.tika.Tika; 032import org.apache.tika.exception.ZeroByteFileException; 033 034import org.ametys.cms.data.type.indexing.IndexableDataContext; 035import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper; 036import org.ametys.cms.languages.LanguagesManager; 037import org.ametys.cms.search.solr.field.FirstValidationSearchField; 038import org.ametys.cms.search.solr.field.LastMajorValidationSearchField; 039import org.ametys.cms.search.solr.field.LastModifiedSearchField; 040import org.ametys.cms.search.solr.field.LastValidationSearchField; 041import org.ametys.core.file.TikaProvider; 042import org.ametys.core.user.UserIdentity; 043import org.ametys.plugins.explorer.resources.Resource; 044import org.ametys.plugins.explorer.resources.ResourceCollection; 045import org.ametys.plugins.explorer.resources.ResourceHelper; 046import org.ametys.plugins.repository.AmetysObject; 047import org.ametys.plugins.repository.TraversableAmetysObject; 048import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject; 049import org.ametys.runtime.plugin.component.AbstractLogEnabled; 050 051/** 052 * Solr resource indexer.<p> 053 * Populate a Solr input document with the following fields: 054 * <dl> 055 * <dt>id 056 * <dd>resource id 057 * <dt>type 058 * <dd>with <code>"document"</code> value 059 * <dt>full 060 * <dd>resource content 061 * </dl> 062 */ 063public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames 064{ 065 /** The avalon role. */ 066 public static final String ROLE = SolrResourceIndexer.class.getName(); 067 068 /** The Tika instance. */ 069 protected Tika _tika; 070 071 /** The language manager. */ 072 protected LanguagesManager _langManager; 073 074 /** The solr indexer */ 075 protected SolrIndexer _solrIndexer; 076 077 @Override 078 public void service(ServiceManager manager) throws ServiceException 079 { 080 TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE); 081 _tika = tikaProvider.getTika(); 082 _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE); 083 _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE); 084 } 085 086 /** 087 * Index a resource. 088 * @param resource The resource to index. 089 * @param document The Solr document to index into. 090 * @param documentType The document type of the resource 091 * @throws Exception if an error occurs. 092 */ 093 public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception 094 { 095 indexResource(resource, document, documentType, null, null); 096 } 097 098 /** 099 * Index a resource. 100 * @param resource The resource to index. 101 * @param document The Solr document to index into. 102 * @param documentType The document type of the resource 103 * @param language The query language. 104 * @throws Exception if an error occurs. 105 */ 106 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception 107 { 108 indexResource(resource, document, documentType, language, null); 109 } 110 111 /** 112 * Index a resource. 113 * @param resource The resource to index. 114 * @param document The Solr document to index into. 115 * @param documentType The document type of the resource 116 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 117 * @throws Exception if an error occurs. 118 */ 119 public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception 120 { 121 indexResource(resource, document, documentType, null, resourceRoot); 122 } 123 124 /** 125 * Index a resource. 126 * @param resource The resource to index. 127 * @param document The Solr document to index into. 128 * @param documentType The document type of the resource 129 * @param language The language, can be null. 130 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 131 * @throws Exception if an error occurs. 132 */ 133 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception 134 { 135 // Resource id 136 document.addField(ID, resource.getId()); 137 // Type is resource 138 document.addField(DOCUMENT_TYPE, documentType); 139 document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE); 140 // The resource path. 141 document.setField(PATH, resource.getResourcePath()); 142 document.addField(FILENAME, resource.getName()); 143 144 // Title 145 String title = StringUtils.substringBeforeLast(resource.getName(), "."); 146 // Index title like other string values (like content attributes) 147 IndexableDataContext context = IndexableDataContext.newInstance(); 148 if (StringUtils.isNotEmpty(language)) 149 { 150 context.withLocale(new Locale(language)); 151 } 152 IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger()); 153 // Add sort indexation 154 document.setField(TITLE_SORT, resource.getName()); 155 document.addField(TITLE + "_s_sort", title); 156 // Add title to "full" (already added to "systemFull") 157 IndexableDataContext fullContext = context.cloneContext() 158 .withIndexForFullTextField(true) 159 .withFullTextFieldName(SolrFieldNames.FULL); 160 IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext); 161 162 _populateDatesOfPage(resource, document); 163 164 // Mime types 165 document.addField(MIME_TYPES, resource.getMimeType()); 166 // Length 167 document.addField(LENGTH, resource.getLength()); 168 169 AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot; 170 document.addField(RESOURCE_ROOT_ID, root.getId()); 171 172 // Parents resource collections of the resource 173 _indexAncestorIds(resource, document); 174 175 // Resource author 176 String author = UserIdentity.userIdentityToString(resource.getCreator()); 177 if (StringUtils.isNotBlank(author)) 178 { 179 document.setField(RESOURCE_CREATOR, author); 180 } 181 182 // Hard-coded content type for facets. 183 // TODO Move to specific "embedded mode" method? 184 document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE); 185 186 // Indexation of ACL initial values 187 _solrIndexer.indexAclInitValues(resource, document); 188 189 indexResourceContent(resource, document, language); 190 } 191 192 /** 193 * Populate the solr input document with dates from the resource 194 * @param resource The resource 195 * @param document The Solr document 196 */ 197 protected void _populateDatesOfPage(Resource resource, SolrInputDocument document) 198 { 199 // Last modified 200 String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified()); 201 // For 'new' search service 202 document.addField(LastModifiedSearchField.NAME, lastModifiedStr); 203 // For 'old' search service 204 document.addField(LAST_MODIFIED + "_dt", lastModifiedStr); 205 206 // For 'new' search service => last validation, last major validation 207 document.addField(LastValidationSearchField.NAME, lastModifiedStr); 208 document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr); 209 210 // For 'new' search service => first validation 211 String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate()); 212 document.addField(FirstValidationSearchField.NAME, creationDateStr); 213 214 // Solr facet specific : dates-facet 215 Date date = resource.getDCDate(); 216 String formattedDate = SolrIndexer.dateFormat().format(date); 217 if (formattedDate != null) 218 { 219 document.setField(RESOURCE_DATE, formattedDate); 220 document.setField(DATE_FOR_SORTING, formattedDate); 221 document.setField(DATES_FACET, formattedDate); 222 } 223 } 224 225 private void _indexAncestorIds(Resource resource, SolrInputDocument document) 226 { 227 // Ancestors 228 List<String> ancestorIds = new ArrayList<>(); 229 AmetysObject parent = resource.getParent(); 230 while (parent instanceof ResourceCollection) 231 { 232 ancestorIds.add(parent.getId()); 233 parent = parent.getParent(); 234 } 235 236 document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds); 237 238 // Ancestors and self 239 List<String> ancestorAndSelfIds = new ArrayList<>(); 240 ancestorAndSelfIds.add(resource.getId()); 241 ancestorAndSelfIds.addAll(ancestorIds); 242 document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds); 243 } 244 245 /** 246 * Index a collection of resources. 247 * @param resourceCollection the resource collection to index. 248 * @param document The document to index into. 249 * @param language The current language. 250 * @throws Exception if an error occurs while indexing. 251 */ 252 public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception 253 { 254 if (resourceCollection == null) 255 { 256 return; 257 } 258 259 for (AmetysObject object : resourceCollection.getChildren()) 260 { 261 if (object instanceof ResourceCollection) 262 { 263 indexResourceCollection((ResourceCollection) object, document, language); 264 } 265 else if (object instanceof Resource) 266 { 267 indexResourceContent((Resource) object, document, language); 268 } 269 } 270 } 271 272 /** 273 * Index a resource content (text in case of a document, and Dublin Core metadata). 274 * @param resource The resource to index. 275 * @param document The document to index into. 276 * @param language The current language, can be null. 277 */ 278 public void indexResourceContent(Resource resource, SolrInputDocument document, String language) 279 { 280 try (InputStream is = resource.getInputStream()) 281 { 282 String value = _getResourceContent(resource); 283 284 indexFulltextValue(document, value, language); 285 286 if (StringUtils.isNotBlank(value)) 287 { 288 int summaryEndIndex = value.lastIndexOf(' ', 200); 289 if (summaryEndIndex == -1) 290 { 291 summaryEndIndex = value.length(); 292 } 293 document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : "")); 294 } 295 296 for (String keyword : resource.getDCSubject()) 297 { 298 indexFulltextValue(document, keyword, language); 299 } 300 301 String desc = resource.getDCDescription(); 302 if (desc != null) 303 { 304 indexFulltextValue(document, desc, language); 305 } 306 307 // DC meta 308 indexDublinCoreMetadata(resource, document); 309 } 310 catch (Throwable e) 311 { 312 getLogger().error("Unable to index resource at " + resource.getPath(), e); 313 } 314 } 315 316 private String _getResourceContent(Resource resource) throws Throwable 317 { 318 try (InputStream is = resource.getInputStream()) 319 { 320 return _tika.parseToString(is); 321 } 322 catch (ZeroByteFileException e) 323 { 324 // Ignore it, the file is empty, nothing to do 325 return StringUtils.EMPTY; 326 } 327 catch (Throwable e) 328 { 329 throw e; 330 } 331 } 332 333 /** 334 * Index a full-text value. 335 * @param document The document to index into. 336 * @param text The text to index. 337 * @param language The content language, can be null. 338 */ 339 protected void indexFulltextValue(SolrInputDocument document, String text, String language) 340 { 341 IndexableDataContext context = IndexableDataContext.newInstance() 342 .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future 343 if (StringUtils.isNotEmpty(language)) 344 { 345 context.withLocale(new Locale(language)); 346 } 347 348 // Index the document in systemFull 349 IndexableElementTypeHelper.indexFulltextValue(document, text, context); 350 351 // Then in full 352 IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL)); 353 } 354 355 /////////////////////////////////////////////////////////////////////////// 356 357 /** 358 * Index Dublin core metadata. 359 * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata. 360 * @param document the solr input document to populate. 361 */ 362 public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document) 363 { 364 _indexNonNullValue(document, DC_TITLE, object.getDCTitle()); 365 _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject()); 366 _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription()); 367 _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor()); 368 _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage()); 369 _indexNonNullValue(document, DC_CREATOR, object.getDCCreator()); 370 String mimeType = _getDcFormatToIndex(object); 371 _indexNonNullValue(document, DC_FORMAT, mimeType); 372 _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage()); 373 _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher()); 374 _indexNonNullValue(document, DC_RIGHTS, object.getDCRights()); 375 _indexNonNullValue(document, DC_DATE, SolrIndexer.dateFormat().format(object.getDCDate())); 376 377 SolrResourceGroupedMimeTypes.getGroup(mimeType) 378 .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType)); 379 } 380 381 private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object) 382 { 383 return Optional.of(object) 384 .map(DublinCoreAwareAmetysObject::getDCFormat) 385 // According to https://en.wikipedia.org/wiki/Media_type#Naming 386 // input format is: 387 // type "/" [tree "."] subtype ["+" suffix] *[";" parameter] 388 // just output the part without optional parameters 389 .map(mimeType -> StringUtils.substringBefore(mimeType, ";")) 390 // According to https://en.wikipedia.org/wiki/Media_type#Naming 391 // Types, subtypes, and parameter names are case-insensitive 392 .map(String::toLowerCase) 393 .orElse(null); 394 } 395 396 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value) 397 { 398 if (value != null) 399 { 400 document.addField(fieldName, value); 401 } 402 } 403 404 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values) 405 { 406 if (values != null) 407 { 408 for (String value : values) 409 { 410 document.addField(fieldName, value); 411 } 412 } 413 } 414 415 /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value) 416 { 417 if (value != null) 418 { 419 document.addField(fieldName, value); 420 } 421 }*/ 422}