/*
 * Copyright 2015 Anyware Services
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.ametys.cms.content.indexing.solr;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;

import org.ametys.cms.languages.LanguagesManager;
import org.ametys.cms.search.solr.field.FirstValidationSearchField;
import org.ametys.cms.search.solr.field.LastMajorValidationSearchField;
import org.ametys.cms.search.solr.field.LastModifiedSearchField;
import org.ametys.cms.search.solr.field.LastValidationSearchField;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

/**
 * Solr resource indexer.<p>
 * Populate a Solr input document with the following fields:
 * <dl>
 *  <dt>id
 *  <dd>resource id
 *  <dt>type
 *  <dd>with <code>"document"</code> value
 *  <dt>full
 *  <dd>resource content
 * </dl>
 */
public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
{
    /** The avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** Maximum number of characters of extracted text kept in the excerpt (ellipsis not included). */
    private static final int __EXCERPT_MAX_LENGTH = 200;
    
    /** The Tika instance. */
    protected Tika _tika;
    
    /** The language manager. */
    protected LanguagesManager _langManager;
    
    /** The solr indexer */
    protected SolrIndexer _solrIndexer;
    
    @Override
    public void service(ServiceManager manager) throws ServiceException
    {
        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
        _tika = tikaProvider.getTika();
        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
        _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE);
    }
    
    /**
     * Index a resource, without a specific language nor a precomputed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
    {
        indexResource(resource, document, documentType, null, null);
    }
    
    /**
     * Index a resource for a given language, without a precomputed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
    {
        indexResource(resource, document, documentType, language, null);
    }
    
    /**
     * Index a resource, without a specific language.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
    {
        indexResource(resource, document, documentType, null, resourceRoot);
    }
    
    /**
     * Index a resource: identification, title, dates, mime type, length, resource root,
     * ancestors, creator, ACL initial values and finally the resource content.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null.
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
    {
        // Resource id - Store.YES, Index.NOT_ANALYZED
        document.addField(ID, resource.getId());
        // Type is resource - Store.YES, Index.NOT_ANALYZED
        document.addField(DOCUMENT_TYPE, documentType);
        document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE);
        // The resource path.
        document.setField(PATH, resource.getResourcePath());
        document.addField(FILENAME, resource.getName());
        
        // Title: the file name without its last extension
        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
        document.addField(TITLE, title);
        document.setField(TITLE_SORT, resource.getName());
        document.addField(TITLE + "_s", title);
        document.addField(TITLE + "_s_sort", title);
        document.addField(TITLE + "_s_lower", title.toLowerCase());
        
        // Replaces "all-not-analyzed".
        indexFulltextValue(document, title, language);
        
        _populateDatesOfPage(resource, document);
        
        // Mime types - Store.YES, Index.ANALYZED
        document.addField(MIME_TYPES, resource.getMimeType());
        // Length - Store.YES, Index.NO
        document.addField(LENGTH, resource.getLength());
        
        // Only compute the root when the caller did not provide it
        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
        document.addField(RESOURCE_ROOT_ID, root.getId());
        
        // Parents resource collections of the resource
        _indexAncestorIds(resource, document);
        
        // Resource author
        String author = UserIdentity.userIdentityToString(resource.getCreator());
        if (StringUtils.isNotBlank(author))
        {
            document.setField(RESOURCE_CREATOR, author);
        }
        
        // Hard-coded content type for facets.
        // TODO Move to specific "embedded mode" method?
        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
        
        // Indexation of ACL initial values
        _solrIndexer.indexAclInitValues(resource, document);
        
        indexResourceContent(resource, document, language);
    }
    
    /**
     * Populate the solr input document with dates from the resource:
     * last modification, creation and (when present) Dublin Core date.
     * @param resource The resource
     * @param document The Solr document
     */
    protected void _populateDatesOfPage(Resource resource, SolrInputDocument document)
    {
        // Last modified
        String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified());
        // For 'new' search service
        document.addField(LastModifiedSearchField.NAME, lastModifiedStr);
        // For 'old' search service
        document.addField(LAST_MODIFIED + "_dt", lastModifiedStr);
        
        // For 'new' search service => last validation, last major validation
        document.addField(LastValidationSearchField.NAME, lastModifiedStr);
        document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr);
        
        // For 'new' search service => first validation
        String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate());
        document.addField(FirstValidationSearchField.NAME, creationDateStr);
        
        // Solr facet specific : dates-facet
        // The Dublin Core date may be absent (assumed, as other Dublin Core values are null-checked here):
        // skip it rather than passing null to the date formatter.
        Date date = resource.getDCDate();
        String formattedDate = date != null ? SolrIndexer.dateFormat().format(date) : null;
        if (formattedDate != null)
        {
            document.setField(RESOURCE_DATE, formattedDate);
            document.setField(DATE_FOR_SORTING, formattedDate);
            document.setField(DATES_FACET, formattedDate);
        }
    }
    
    /**
     * Index the ids of the resource collections containing the resource (closest parent first),
     * then the same list preceded by the id of the resource itself.
     * @param resource The resource
     * @param document The Solr document
     */
    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
    {
        // Ancestors: walk up while the parent is still a resource collection
        List<String> ancestorIds = new ArrayList<>();
        AmetysObject parent = resource.getParent();
        while (parent instanceof ResourceCollection)
        {
            ancestorIds.add(parent.getId());
            parent = parent.getParent();
        }
        
        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
        
        // Ancestors and self
        List<String> ancestorAndSelfIds = new ArrayList<>();
        ancestorAndSelfIds.add(resource.getId());
        ancestorAndSelfIds.addAll(ancestorIds);
        document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds);
    }
    
    /**
     * Index a collection of resources: the content of every resource found in the collection
     * and, recursively, in its sub-collections, is indexed into the same document.
     * @param resourceCollection the resource collection to index. When null, nothing is indexed.
     * @param document The document to index into.
     * @param language The current language.
     * @throws Exception if an error occurs while indexing.
     */
    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
    {
        if (resourceCollection == null)
        {
            return;
        }
        
        for (AmetysObject object : resourceCollection.getChildren())
        {
            if (object instanceof ResourceCollection)
            {
                indexResourceCollection((ResourceCollection) object, document, language);
            }
            else if (object instanceof Resource)
            {
                indexResourceContent((Resource) object, document, language);
            }
        }
    }
    
    /**
     * Index a resource content (text in case of a document, and Dublin Core metadata).<br>
     * Any error is logged and not propagated, so that a resource which cannot be parsed
     * does not interrupt the indexation.
     * @param resource The resource to index.
     * @param document The document to index into.
     * @param language The current language, can be null.
     */
    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
    {
        try (InputStream is = resource.getInputStream())
        {
            String value = _tika.parseToString(is);
            indexFulltextValue(document, value, language);
            
            if (StringUtils.isNotBlank(value))
            {
                document.addField(EXCERPT, _buildExcerpt(value));
            }
            
            // The subject array is null-checked, as done when indexing the Dublin Core metadata:
            // a failure here would skip the description and all the Dublin Core fields.
            String[] keywords = resource.getDCSubject();
            if (keywords != null)
            {
                for (String keyword : keywords)
                {
                    indexFulltextValue(document, keyword, language);
                }
            }
            
            String desc = resource.getDCDescription();
            if (desc != null)
            {
                indexFulltextValue(document, desc, language);
            }
            
            // DC meta
            indexDublinCoreMetadata(resource, document);
        }
        catch (Throwable e)
        {
            getLogger().error("Unable to index resource at " + resource.getPath(), e);
        }
    }
    
    /**
     * Build the excerpt of an extracted text.<br>
     * A text of at most {@value #__EXCERPT_MAX_LENGTH} characters is returned as is. A longer text is cut
     * at the last space found within the limit (or at the limit itself when there is no usable space)
     * and an ellipsis is appended.
     * @param text The extracted text, must not be null.
     * @return The excerpt.
     */
    private static String _buildExcerpt(String text)
    {
        if (text.length() <= __EXCERPT_MAX_LENGTH)
        {
            // Short enough: keep the whole text, no ellipsis
            return text;
        }
        
        int end = text.lastIndexOf(' ', __EXCERPT_MAX_LENGTH);
        if (end <= 0)
        {
            // No space to cut on (or only a leading one, which would give an empty excerpt): hard cut
            end = __EXCERPT_MAX_LENGTH;
            if (Character.isHighSurrogate(text.charAt(end - 1)))
            {
                // Do not split a surrogate pair
                end--;
            }
        }
        
        return text.substring(0, end) + "…";
    }
    
    /**
     * Index a full-text value, for the given language or, when no language is given,
     * for all the available languages.
     * @param document The document to index into.
     * @param text The text to index.
     * @param language The content language, can be null.
     */
    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
    {
        if (StringUtils.isNotEmpty(language))
        {
            SolrContentIndexer.indexFulltextValue(document, text, language);
        }
        else
        {
            Set<String> languages = _langManager.getAvailableLanguages().keySet();
            SolrContentIndexer.indexFulltextValue(document, text, languages);
        }
    }
    
    ///////////////////////////////////////////////////////////////////////////
    
    /**
     * Index Dublin core metadata. Null values are not indexed.
     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
     * @param document the solr input document to populate.
     */
    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
    {
        _indexNonNullValue(document, DC_TITLE, object.getDCTitle());
        _indexNonNullValue(document, DC_SUBJECT, object.getDCSubject());
        _indexNonNullValue(document, DC_DESCRIPTION, object.getDCDescription());
        _indexNonNullValue(document, DC_CONTRIBUTOR, object.getDCContributor());
        _indexNonNullValue(document, DC_COVERAGE, object.getDCCoverage());
        _indexNonNullValue(document, DC_CREATOR, object.getDCCreator());
        String mimeType = _getDcFormatToIndex(object);
        _indexNonNullValue(document, DC_FORMAT, mimeType);
        _indexNonNullValue(document, DC_LANGUAGE, object.getDCLanguage());
        _indexNonNullValue(document, DC_PUBLISHER, object.getDCPublisher());
        _indexNonNullValue(document, DC_RIGHTS, object.getDCRights());
        
        // Only format the date when there is one, so that a missing date
        // does not prevent the mime type group below from being indexed.
        Date dcDate = object.getDCDate();
        _indexNonNullValue(document, DC_DATE, dcDate != null ? SolrIndexer.dateFormat().format(dcDate) : null);
        
        SolrResourceGroupedMimeTypes.getGroup(mimeType)
                                    .ifPresent(groupMimeType -> document.addField(RESOURCE_MIME_TYPE_GROUP, groupMimeType));
    }
    
    /**
     * Get the Dublin Core format to index: the media type without its parameters, in lower case.
     * @param object the object holding the Dublin Core format.
     * @return the normalized format, or null if the object has no format.
     */
    private static String _getDcFormatToIndex(DublinCoreAwareAmetysObject object)
    {
        return Optional.of(object)
                .map(DublinCoreAwareAmetysObject::getDCFormat)
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // input format is:
                // type "/" [tree "."] subtype ["+" suffix] *[";" parameter]
                // just output the part without optional parameters
                .map(mimeType -> StringUtils.substringBefore(mimeType, ";"))
                // According to https://en.wikipedia.org/wiki/Media_type#Naming
                // Types, subtypes, and parameter names are case-insensitive
                // Locale.ROOT keeps the result independent from the default locale of the JVM
                .map(mimeType -> mimeType.toLowerCase(Locale.ROOT))
                .orElse(null);
    }
    
    /**
     * Add a value to a field of the document, unless the value is null.
     * @param document the solr input document to populate.
     * @param fieldName the name of the field.
     * @param value the value, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }
    
    /**
     * Add each value of an array to a field of the document, unless the array is null.
     * @param document the solr input document to populate.
     * @param fieldName the name of the field.
     * @param values the values, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
    {
        if (values != null)
        {
            for (String value : values)
            {
                document.addField(fieldName, value);
            }
        }
    }
}