001/* 002 * Copyright 2015 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.cms.content.indexing.solr; 017 018import java.io.InputStream; 019import java.util.ArrayList; 020import java.util.Date; 021import java.util.List; 022import java.util.Locale; 023import java.util.Optional; 024 025import org.apache.avalon.framework.component.Component; 026import org.apache.avalon.framework.service.ServiceException; 027import org.apache.avalon.framework.service.ServiceManager; 028import org.apache.avalon.framework.service.Serviceable; 029import org.apache.commons.lang3.StringUtils; 030import org.apache.solr.common.SolrInputDocument; 031import org.apache.tika.Tika; 032 033import org.ametys.cms.data.type.indexing.IndexableDataContext; 034import org.ametys.cms.data.type.indexing.IndexableElementTypeHelper; 035import org.ametys.cms.languages.LanguagesManager; 036import org.ametys.cms.search.solr.field.FirstValidationSearchField; 037import org.ametys.cms.search.solr.field.LastMajorValidationSearchField; 038import org.ametys.cms.search.solr.field.LastModifiedSearchField; 039import org.ametys.cms.search.solr.field.LastValidationSearchField; 040import org.ametys.core.user.UserIdentity; 041import org.ametys.plugins.explorer.resources.Resource; 042import org.ametys.plugins.explorer.resources.ResourceCollection; 043import org.ametys.plugins.explorer.resources.ResourceHelper; 044import org.ametys.plugins.explorer.resources.metadata.TikaProvider; 045import org.ametys.plugins.repository.AmetysObject; 046import org.ametys.plugins.repository.TraversableAmetysObject; 047import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject; 048import org.ametys.runtime.plugin.component.AbstractLogEnabled; 049 050/** 051 * Solr resource indexer.<p> 052 * Populate a Solr input document with the following fields: 053 * <dl> 054 * <dt>id 055 * <dd>resource id 056 * <dt>type 057 * <dd>with <code>"document"</code> value 058 * <dt>full 059 * <dd>resource content 060 * </dl> 061 */ 062public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames 063{ 064 /** The avalon role. */ 065 public static final String ROLE = SolrResourceIndexer.class.getName(); 066 067 /** The Tika instance. */ 068 protected Tika _tika; 069 070 /** The language manager. */ 071 protected LanguagesManager _langManager; 072 073 /** The solr indexer */ 074 protected SolrIndexer _solrIndexer; 075 076 @Override 077 public void service(ServiceManager manager) throws ServiceException 078 { 079 TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE); 080 _tika = tikaProvider.getTika(); 081 _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE); 082 _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE); 083 } 084 085 /** 086 * Index a resource. 087 * @param resource The resource to index. 088 * @param document The Solr document to index into. 089 * @param documentType The document type of the resource 090 * @throws Exception if an error occurs. 091 */ 092 public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception 093 { 094 indexResource(resource, document, documentType, null, null); 095 } 096 097 /** 098 * Index a resource. 099 * @param resource The resource to index. 100 * @param document The Solr document to index into. 101 * @param documentType The document type of the resource 102 * @param language The query language. 103 * @throws Exception if an error occurs. 104 */ 105 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception 106 { 107 indexResource(resource, document, documentType, language, null); 108 } 109 110 /** 111 * Index a resource. 112 * @param resource The resource to index. 113 * @param document The Solr document to index into. 114 * @param documentType The document type of the resource 115 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 116 * @throws Exception if an error occurs. 117 */ 118 public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception 119 { 120 indexResource(resource, document, documentType, null, resourceRoot); 121 } 122 123 /** 124 * Index a resource. 125 * @param resource The resource to index. 126 * @param document The Solr document to index into. 127 * @param documentType The document type of the resource 128 * @param language The language, can be null. 129 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 130 * @throws Exception if an error occurs. 131 */ 132 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception 133 { 134 // Resource id 135 document.addField(ID, resource.getId()); 136 // Type is resource 137 document.addField(DOCUMENT_TYPE, documentType); 138 document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE); 139 // The resource path. 140 document.setField(PATH, resource.getResourcePath()); 141 document.addField(FILENAME, resource.getName()); 142 143 // Title 144 String title = StringUtils.substringBeforeLast(resource.getName(), "."); 145 // Index title like other string values (like content attributes) 146 IndexableDataContext context = IndexableDataContext.newInstance(); 147 if (StringUtils.isNotEmpty(language)) 148 { 149 context.withLocale(new Locale(language)); 150 } 151 IndexableElementTypeHelper.indexStringValue(document, document, TITLE, title, context, getLogger()); 152 // Add sort indexation 153 document.setField(TITLE_SORT, resource.getName()); 154 document.addField(TITLE + "_s_sort", title); 155 // Add title to "full" (already added to "systemFull") 156 IndexableDataContext fullContext = context.cloneContext() 157 .withIndexForFullTextField(true) 158 .withFullTextFieldName(SolrFieldNames.FULL); 159 IndexableElementTypeHelper.indexFulltextValue(document, title, fullContext); 160 161 _populateDatesOfPage(resource, document); 162 163 // Mime types 164 document.addField(MIME_TYPES, resource.getMimeType()); 165 // Length 166 document.addField(LENGTH, resource.getLength()); 167 168 AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot; 169 document.addField(RESOURCE_ROOT_ID, root.getId()); 170 171 // Parents resource collections of the resource 172 _indexAncestorIds(resource, document); 173 174 // Resource author 175 String author = UserIdentity.userIdentityToString(resource.getCreator()); 176 if (StringUtils.isNotBlank(author)) 177 { 178 document.setField(RESOURCE_CREATOR, author); 179 } 180 181 // Hard-coded content type for facets. 182 // TODO Move to specific "embedded mode" method? 183 document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE); 184 185 // Indexation of ACL initial values 186 _solrIndexer.indexAclInitValues(resource, document); 187 188 indexResourceContent(resource, document, language); 189 } 190 191 /** 192 * Populate the solr input document with dates from the resource 193 * @param resource The resource 194 * @param document The Solr document 195 */ 196 protected void _populateDatesOfPage(Resource resource, SolrInputDocument document) 197 { 198 // Last modified 199 String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified()); 200 // For 'new' search service 201 document.addField(LastModifiedSearchField.NAME, lastModifiedStr); 202 // For 'old' search service 203 document.addField(LAST_MODIFIED + "_dt", lastModifiedStr); 204 205 // For 'new' search service => last validation, last major validation 206 document.addField(LastValidationSearchField.NAME, lastModifiedStr); 207 document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr); 208 209 // For 'new' search service => first validation 210 String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate()); 211 document.addField(FirstValidationSearchField.NAME, creationDateStr); 212 213 // Solr facet specific : dates-facet 214 Date date = resource.getDCDate(); 215 String formattedDate = SolrIndexer.dateFormat().format(date); 216 if (formattedDate != null) 217 { 218 document.setField(RESOURCE_DATE, formattedDate); 219 document.setField(DATE_FOR_SORTING, formattedDate); 220 document.setField(DATES_FACET, formattedDate); 221 } 222 } 223 224 private void _indexAncestorIds(Resource resource, SolrInputDocument document) 225 { 226 // Ancestors 227 List<String> ancestorIds = new ArrayList<>(); 228 AmetysObject parent = resource.getParent(); 229 while (parent instanceof ResourceCollection) 230 { 231 ancestorIds.add(parent.getId()); 232 parent = parent.getParent(); 233 } 234 235 document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds); 236 237 // Ancestors and self 238 List<String> ancestorAndSelfIds = new ArrayList<>(); 239 ancestorAndSelfIds.add(resource.getId()); 240 ancestorAndSelfIds.addAll(ancestorIds); 241 document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds); 242 } 243 244 /** 245 * Index a collection of resources. 246 * @param resourceCollection the resource collection to index. 247 * @param document The document to index into. 248 * @param language The current language. 249 * @throws Exception if an error occurs while indexing. 250 */ 251 public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception 252 { 253 if (resourceCollection == null) 254 { 255 return; 256 } 257 258 for (AmetysObject object : resourceCollection.getChildren()) 259 { 260 if (object instanceof ResourceCollection) 261 { 262 indexResourceCollection((ResourceCollection) object, document, language); 263 } 264 else if (object instanceof Resource) 265 { 266 indexResourceContent((Resource) object, document, language); 267 } 268 } 269 } 270 271 /** 272 * Index a resource content (text in case of a document, and Dublin Core metadata). 273 * @param resource The resource to index. 274 * @param document The document to index into. 275 * @param language The current language, can be null. 276 */ 277 public void indexResourceContent(Resource resource, SolrInputDocument document, String language) 278 { 279 try (InputStream is = resource.getInputStream()) 280 { 281 String value = _tika.parseToString(is); 282 indexFulltextValue(document, value, language); 283 284 if (StringUtils.isNotBlank(value)) 285 { 286 int summaryEndIndex = value.lastIndexOf(' ', 200); 287 if (summaryEndIndex == -1) 288 { 289 summaryEndIndex = value.length(); 290 } 291 document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : "")); 292 } 293 294 for (String keyword : resource.getDCSubject()) 295 { 296 indexFulltextValue(document, keyword, language); 297 } 298 299 String desc = resource.getDCDescription(); 300 if (desc != null) 301 { 302 indexFulltextValue(document, desc, language); 303 } 304 305 // DC meta 306 indexDublinCoreMetadata(resource, document); 307 } 308 catch (Throwable e) 309 { 310 getLogger().error("Unable to index resource at " + resource.getPath(), e); 311 } 312 } 313 314 /** 315 * Index a full-text value. 316 * @param document The document to index into. 317 * @param text The text to index. 318 * @param language The content language, can be null. 319 */ 320 protected void indexFulltextValue(SolrInputDocument document, String text, String language) 321 { 322 IndexableDataContext context = IndexableDataContext.newInstance() 323 .withIndexForFullTextField(true); // Facultative here because not asked by the following methods, but a protection for the future 324 if (StringUtils.isNotEmpty(language)) 325 { 326 context.withLocale(new Locale(language)); 327 } 328 329 // Index the document in systemFull 330 IndexableElementTypeHelper.indexFulltextValue(document, text, context); 331 332 // Then in full 333 IndexableElementTypeHelper.indexFulltextValue(document, text, context.withFullTextFieldName(SolrFieldNames.FULL)); 334 } 335 336 /////////////////////////////////////////////////////////////////////////// 337 338 /** 339 * Index Dublin core metadata. 340 * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata. 341 * @param document the solr input document to populate. 342 */ 343 public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document) 344 { 345 _indexNonNullValue(document, DC_TITLE, object.getDCTitle()); 346 _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject()); 347 _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription()); 348 _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor()); 349 _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage()); 350 _indexNonNullValue(document, DC_CREATOR, object.getDCCreator()); 351 String mimeType = _getDcFormatToIndex(object); 352 _indexNonNullValue(document, DC_FORMAT, mimeType); 353 _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage()); 354 _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher()); 355 _indexNonNullValue(document, DC_RIGHTS, object.getDCRights()); 356 _indexNonNullValue(document, DC_DATE, SolrIndexer.dateFormat().format(object.getDCDate())); 357 358 SolrResourceGroupedMimeTypes.getGroup(mimeType) 359 .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType)); 360 } 361 362 private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object) 363 { 364 return Optional.of(object) 365 .map(DublinCoreAwareAmetysObject::getDCFormat) 366 // According to https://en.wikipedia.org/wiki/Media_type#Naming 367 // input format is: 368 // type "/" [tree "."] subtype ["+" suffix] *[";" parameter] 369 // just output the part without optional parameters 370 .map(mimeType -> StringUtils.substringBefore(mimeType, ";")) 371 // According to https://en.wikipedia.org/wiki/Media_type#Naming 372 // Types, subtypes, and parameter names are case-insensitive 373 .map(String::toLowerCase) 374 .orElse(null); 375 } 376 377 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value) 378 { 379 if (value != null) 380 { 381 document.addField(fieldName, value); 382 } 383 } 384 385 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values) 386 { 387 if (values != null) 388 { 389 for (String value : values) 390 { 391 document.addField(fieldName, value); 392 } 393 } 394 } 395 396 /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value) 397 { 398 if (value != null) 399 { 400 document.addField(fieldName, value); 401 } 402 }*/ 403}