/*
 *  Copyright 2015 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.cms.content.indexing.solr;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;

import org.ametys.cms.languages.LanguagesManager;
import org.ametys.cms.search.solr.field.FirstValidationSearchField;
import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
import org.ametys.cms.search.solr.field.LastModifiedSearchField;
import org.ametys.cms.search.solr.field.LastValidationSearchField;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

/**
 * Solr resource indexer.<p>
 * Populates a Solr input document with, among others, the following fields:
 * <dl>
 *   <dt>id</dt>
 *   <dd>the resource id</dd>
 *   <dt>type</dt>
 *   <dd>the document type given by the caller</dd>
 *   <dt>full</dt>
 *   <dd>the resource content, as extracted by Tika</dd>
 * </dl>
 * Identification, title, dates, mime type, length, ancestors, creator, ACL initial values,
 * an excerpt and the Dublin Core metadata of the resource are indexed as well.
 */
public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
{
    /** The avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** Upper bound, in characters, of the text kept to build the excerpt (the trailing ellipsis is not counted). */
    private static final int __EXCERPT_MAX_LENGTH = 200;
    
    /** The Tika instance. */
    protected Tika _tika;
    
    /** The language manager. */
    protected LanguagesManager _langManager;
    
    /** The solr indexer */
    protected SolrIndexer _solrIndexer;
    
    @Override
    public void service(ServiceManager manager) throws ServiceException
    {
        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
        _tika = tikaProvider.getTika();
        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
    }
    
    /**
     * Index a resource, without specifying a language nor a resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
    {
        indexResource(resource, document, documentType, null, null);
    }
    
    /**
     * Index a resource for a given language, without specifying a resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
    {
        indexResource(resource, document, documentType, language, null);
    }
    
    /**
     * Index a resource, without specifying a language.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
    {
        indexResource(resource, document, documentType, null, resourceRoot);
    }
    
    /**
     * Index a resource: identification, title, dates, mime type, length, resource root,
     * ancestors, creator, ACL initial values and finally the resource content.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
    {
        // Resource id - Store.YES, Index.NOT_ANALYZED
        document.addField(ID, resource.getId());
        // Type is resource - Store.YES, Index.NOT_ANALYZED
        document.addField(DOCUMENT_TYPE, documentType);
        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
        // The resource path.
        document.setField(PATH, resource.getResourcePath());
        document.addField(FILENAME, resource.getName());
        
        // Title: the file name without its last extension
        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
        document.addField(TITLE, title);
        document.setField(TITLE_SORT, resource.getName());
        document.addField(TITLE + "_s", title);
        document.addField(TITLE + "_s_sort", title);
        
        // Replaces "all-not-analyzed".
        indexFulltextValue(document, title, language);
        
        _populateDatesOfPage(resource, document);
        
        // Mime types - Store.YES, Index.ANALYZED
        document.addField(MIME_TYPES, resource.getMimeType());
        // Length - Store.YES, Index.NO
        document.addField(LENGTH, resource.getLength());
        
        // Only compute the root when the caller did not provide it
        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
        document.addField(RESOURCE_ROOT_ID, root.getId());
        
        // Parents resource collections of the resource
        _indexAncestorIds(resource, document);
        
        // Resource author
        String author = UserIdentity.userIdentityToString(resource.getCreator());
        if (StringUtils.isNotBlank(author))
        {
            document.setField(RESOURCE_CREATOR, author);
        }
        
        // Hard-coded content type for facets.
        // TODO Move to specific "embedded mode" method?
        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
        
        // Indexation of ACL initial values
        _solrIndexer.indexAclInitValues(resource, document);
        
        indexResourceContent(resource, document, language);
    }
    
    /**
     * Populate the solr input document with dates from the resource.<br>
     * The last modification date is also used as last validation and last major validation dates,
     * and the creation date as first validation date.<br>
     * The Dublin Core date fields are only written when the resource has a Dublin Core date.
     * @param resource The resource
     * @param document The Solr document
     */
    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
    {
        // Last modified
        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
        // For 'new' search service
        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
        // For 'old' search service
        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
        
        // For 'new' search service => last validation, last major validation
        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
        
        // For 'new' search service => first validation
        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
        document.addField(FirstValidationSearchField.NAME, creationDateStr);
        
        // Solr facet specific : dates-facet
        // The date itself must be tested: formatting a null date fails before any check on the formatted value could run.
        Date date = resource.getDCDate();
        if (date != null)
        {
            String formattedDate = SolrIndexer.dateFormat().format(date);
            document.setField(RESOURCE_DATE, formattedDate);
            document.setField(DATE_FOR_SORTING, formattedDate);
            document.setField(DATES_FACET, formattedDate);
        }
    }
    
    /**
     * Index the ids of the {@link ResourceCollection}s containing the resource (closest first),
     * then the same list preceded by the id of the resource itself.
     * @param resource The resource
     * @param document The Solr document
     */
    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
    {
        // Ancestors: climb until the parent is no longer a resource collection
        List<String> ancestorIds = new ArrayList<>();
        AmetysObject parent = resource.getParent();
        while (parent instanceof ResourceCollection)
        {
            ancestorIds.add(parent.getId());
            parent = parent.getParent();
        }
        
        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
        
        // Ancestors and self
        List<String> ancestorAndSelfIds = new ArrayList<>(ancestorIds.size() + 1);
        ancestorAndSelfIds.add(resource.getId());
        ancestorAndSelfIds.addAll(ancestorIds);
        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
    }
    
    /**
     * Index a collection of resources: the content of every resource found in the collection
     * and, recursively, in its sub-collections, is indexed into the same document.
     * @param resourceCollection the resource collection to index. When null, nothing is done.
     * @param document The document to index into.
     * @param language The current language.
     * @throws Exception if an error occurs while indexing.
     */
    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
    {
        if (resourceCollection == null)
        {
            return;
        }
        
        for (AmetysObject object : resourceCollection.getChildren())
        {
            if (object instanceof ResourceCollection)
            {
                indexResourceCollection((ResourceCollection) object, document, language);
            }
            else if (object instanceof Resource)
            {
                indexResourceContent((Resource) object, document, language);
            }
        }
    }
    
    /**
     * Index a resource content (text in case of a document, and Dublin Core metadata).<br>
     * Any failure is logged and not propagated to the caller.
     * @param resource The resource to index.
     * @param document The document to index into.
     * @param language The current language, can be null.
     */
    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
    {
        try (InputStream is = resource.getInputStream())
        {
            String value = _tika.parseToString(is);
            indexFulltextValue(document, value, language);
            
            if (StringUtils.isNotBlank(value))
            {
                document.addField(EXCERPT, _buildExcerpt(value));
            }
            
            // The subjects are handled as nullable elsewhere in this class, do the same here
            String[] keywords = resource.getDCSubject();
            if (keywords != null)
            {
                for (String keyword : keywords)
                {
                    indexFulltextValue(document, keyword, language);
                }
            }
            
            String desc = resource.getDCDescription();
            if (desc != null)
            {
                indexFulltextValue(document, desc, language);
            }
            
            // DC meta
            indexDublinCoreMetadata(resource, document);
        }
        catch (Throwable e)
        {
            // Broad catch kept as is: whatever fails while extracting the content is logged and swallowed.
            // NOTE(review): this also traps Errors - confirm this is intended.
            getLogger().error("Unable to index resource at " + resource.getPath(), e);
        }
    }
    
    /**
     * Build the excerpt of an extracted text.<br>
     * A text that fits in the limit is returned as is. A longer text is cut at the last space
     * found at or before the limit (or right at the limit when there is no such space) and an ellipsis is appended.
     * @param text The extracted text. Cannot be null.
     * @return The excerpt
     */
    private static String _buildExcerpt(String text)
    {
        if (text.length() <= __EXCERPT_MAX_LENGTH)
        {
            // Nothing to cut: cutting a short text at its last space would drop its last word for no reason
            return text;
        }
        
        int endIndex = text.lastIndexOf(' ', __EXCERPT_MAX_LENGTH);
        if (endIndex == -1)
        {
            // No word boundary available: hard cut, otherwise the whole text would end up in the excerpt
            endIndex = __EXCERPT_MAX_LENGTH;
            if (Character.isHighSurrogate(text.charAt(endIndex - 1)))
            {
                // Do not split a surrogate pair
                endIndex--;
            }
        }
        
        return text.substring(0, endIndex) + "…";
    }
    
    /**
     * Index a full-text value, for the given language or, when no language is given,
     * for all the available languages.
     * @param document The document to index into.
     * @param text The text to index.
     * @param language The content language, can be null.
     */
    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
    {
        if (StringUtils.isNotEmpty(language))
        {
            SolrContentIndexer.indexFulltextValue(document, text, language);
        }
        else
        {
            Set<String> languages = _langManager.getAvailableLanguages().keySet();
            SolrContentIndexer.indexFulltextValue(document, text, languages);
        }
    }
    
    ///////////////////////////////////////////////////////////////////////////
    
    /**
     * Index Dublin core metadata. Only non-null values are indexed.<br>
     * The mime type group matching the Dublin Core format, if any, is indexed as well.
     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
     * @param document the solr input document to populate.
     */
    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
    {
        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
        String mimeType = _getDcFormatToIndex(object);
        _indexNonNullValue(document, DC_FORMAT, mimeType);
        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
        
        // A missing date must be skipped like any other missing value, instead of failing while being formatted
        Date dcDate = object.getDCDate();
        if (dcDate != null)
        {
            document.addField(DC_DATE, SolrIndexer.dateFormat().format(dcDate));
        }
        
        SolrResourceGroupedMimeTypes.getGroup(mimeType)
            .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
    }
    
    /**
     * Get the Dublin Core format to index: the format without its optional parameters, in lower case.
     * @param object the object holding the Dublin Core format
     * @return the normalized format, or null if the object has no Dublin Core format
     */
    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
    {
        return Optional.ofNullable(object.getDCFormat())
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // input format is:
                //     type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
                // just output the part without optional parameters
                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // Types, subtypes, and parameter names are case-insensitive
                // Locale.ROOT keeps the result independent from the default locale of the JVM
                .map(mimeType -> mimeType.toLowerCase(Locale.ROOT))
                .orElse(null);
    }
    
    /**
     * Add a value to a field of the document, unless the value is null.
     * @param document the solr input document to populate.
     * @param fieldName the name of the field
     * @param value the value, can be null
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }
    
    /**
     * Add each value of an array to a field of the document, unless the array is null.
     * @param document the solr input document to populate.
     * @param fieldName the name of the field
     * @param values the values, can be null
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
    {
        if (values != null)
        {
            for (String value : values)
            {
                document.addField(fieldName, value);
            }
        }
    }
}