001/* 002 * Copyright 2015 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.cms.content.indexing.solr; 017 018import java.io.InputStream; 019import java.util.ArrayList; 020import java.util.Date; 021import java.util.List; 022import java.util.Set; 023 024import org.apache.avalon.framework.component.Component; 025import org.apache.avalon.framework.service.ServiceException; 026import org.apache.avalon.framework.service.ServiceManager; 027import org.apache.avalon.framework.service.Serviceable; 028import org.apache.commons.lang3.StringUtils; 029import org.apache.solr.common.SolrInputDocument; 030import org.apache.tika.Tika; 031 032import org.ametys.cms.languages.LanguagesManager; 033import org.ametys.cms.search.solr.field.FirstValidationSearchField; 034import org.ametys.cms.search.solr.field.LastMajorValidationSearchField; 035import org.ametys.cms.search.solr.field.LastModifiedSearchField; 036import org.ametys.cms.search.solr.field.LastValidationSearchField; 037import org.ametys.core.user.UserIdentity; 038import org.ametys.plugins.explorer.resources.Resource; 039import org.ametys.plugins.explorer.resources.ResourceCollection; 040import org.ametys.plugins.explorer.resources.ResourceHelper; 041import org.ametys.plugins.explorer.resources.metadata.TikaProvider; 042import org.ametys.plugins.repository.AmetysObject; 043import org.ametys.plugins.repository.TraversableAmetysObject; 044import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject; 045import org.ametys.runtime.plugin.component.AbstractLogEnabled; 046 047/** 048 * Solr resource indexer.<p> 049 * Populate a Solr input document with the following fields: 050 * <dl> 051 * <dt>id 052 * <dd>resource id 053 * <dt>type 054 * <dd>with <code>"document"</code> value 055 * <dt>full 056 * <dd>resource content 057 * </dl> 058 */ 059public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames 060{ 061 /** The avalon role. */ 062 public static final String ROLE = SolrResourceIndexer.class.getName(); 063 064 /** The Tika instance. */ 065 protected Tika _tika; 066 067 /** The language manager. */ 068 protected LanguagesManager _langManager; 069 070 /** The solr indexer */ 071 protected SolrIndexer _solrIndexer; 072 073 @Override 074 public void service(ServiceManager manager) throws ServiceException 075 { 076 TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE); 077 _tika = tikaProvider.getTika(); 078 _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE); 079 _solrIndexer = (SolrIndexer) manager.lookup(SolrIndexer.ROLE); 080 } 081 082 /** 083 * Index a resource. 084 * @param resource The resource to index. 085 * @param document The Solr document to index into. 086 * @param documentType The document type of the resource 087 * @throws Exception if an error occurs. 088 */ 089 public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception 090 { 091 indexResource(resource, document, documentType, null, null); 092 } 093 094 /** 095 * Index a resource. 096 * @param resource The resource to index. 097 * @param document The Solr document to index into. 098 * @param documentType The document type of the resource 099 * @param language The query language. 100 * @throws Exception if an error occurs. 101 */ 102 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception 103 { 104 indexResource(resource, document, documentType, language, null); 105 } 106 107 /** 108 * Index a resource. 109 * @param resource The resource to index. 110 * @param document The Solr document to index into. 111 * @param documentType The document type of the resource 112 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 113 * @throws Exception if an error occurs. 114 */ 115 public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception 116 { 117 indexResource(resource, document, documentType, null, resourceRoot); 118 } 119 120 /** 121 * Index a resource. 122 * @param resource The resource to index. 123 * @param document The Solr document to index into. 124 * @param documentType The document type of the resource 125 * @param language The language, can be null. 126 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 127 * @throws Exception if an error occurs. 128 */ 129 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception 130 { 131 // Resource id - Store.YES, Index.NOT_ANALYZED 132 document.addField(ID, resource.getId()); 133 // Type is resource - Store.YES, Index.NOT_ANALYZED 134 document.addField(DOCUMENT_TYPE, documentType); 135 document.addField(PSEUDO_CONTENT_TYPES, PSEUDO_CONTENT_TYPE_VALUE_RESOURCE); 136 // The resource path. 137 document.setField(PATH, resource.getResourcePath()); 138 document.addField(FILENAME, resource.getName()); 139 140 // Title 141 String title = StringUtils.substringBeforeLast(resource.getName(), "."); 142 document.addField(TITLE, title); 143 document.setField(TITLE_SORT, resource.getName()); 144 document.addField(TITLE + "_s", title); 145 document.addField(TITLE + "_s_sort", title); 146 147 // Replaces "all-not-analyzed". 148 indexFulltextValue(document, title, language); 149 150 _populateDatesOfPage(resource, document); 151 152 // Mime types - Store.YES, Index.ANALYZED 153 document.addField(MIME_TYPES, resource.getMimeType()); 154 // Length - Store.YES, Index.NO 155 document.addField(LENGTH, resource.getLength()); 156 157 AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot; 158 document.addField(RESOURCE_ROOT_ID, root.getId()); 159 160 // Parents resource collections of the resource 161 _indexAncestorIds(resource, document); 162 163 // Resource author 164 String author = UserIdentity.userIdentityToString(resource.getCreator()); 165 if (StringUtils.isNotBlank(author)) 166 { 167 document.setField(RESOURCE_CREATOR, author); 168 } 169 170 // Hard-coded content type for facets. 171 // TODO Move to specific "embedded mode" method? 172 document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE); 173 174 // Indexation of ACL initial values 175 _solrIndexer.indexAclInitValues(resource, document); 176 177 indexResourceContent(resource, document, language); 178 } 179 180 /** 181 * Populate the solr input document with dates from the resource 182 * @param resource The resource 183 * @param document The Solr document 184 */ 185 protected void _populateDatesOfPage(Resource resource, SolrInputDocument document) 186 { 187 // Last modified 188 String lastModifiedStr = SolrIndexer.dateFormat().format(resource.getLastModified()); 189 // For 'new' search service 190 document.addField(LastModifiedSearchField.NAME, lastModifiedStr); 191 // For 'old' search service 192 document.addField(LAST_MODIFIED + "_dt", lastModifiedStr); 193 194 // For 'new' search service => last validation, last major validation 195 document.addField(LastValidationSearchField.NAME, lastModifiedStr); 196 document.addField(LastMajorValidationSearchField.NAME, lastModifiedStr); 197 198 // For 'new' search service => first validation 199 String creationDateStr = SolrIndexer.dateFormat().format(resource.getCreationDate()); 200 document.addField(FirstValidationSearchField.NAME, creationDateStr); 201 202 // Solr facet specific : dates-facet 203 Date date = resource.getDCDate(); 204 String formattedDate = SolrIndexer.dateFormat().format(date); 205 if (formattedDate != null) 206 { 207 document.setField(RESOURCE_DATE, formattedDate); 208 document.setField(DATE_FOR_SORTING, formattedDate); 209 document.setField(DATES_FACET, formattedDate); 210 } 211 } 212 213 private void _indexAncestorIds(Resource resource, SolrInputDocument document) 214 { 215 // Ancestors 216 List<String> ancestorIds = new ArrayList<>(); 217 AmetysObject parent = resource.getParent(); 218 while (parent instanceof ResourceCollection) 219 { 220 ancestorIds.add(parent.getId()); 221 parent = parent.getParent(); 222 } 223 224 document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds); 225 226 // Ancestors and self 227 List<String> ancestorAndSelfIds = new ArrayList<>(); 228 ancestorAndSelfIds.add(resource.getId()); 229 ancestorAndSelfIds.addAll(ancestorIds); 230 document.addField(RESOURCE_ANCESTOR_AND_SELF_IDS, ancestorAndSelfIds); 231 } 232 233 /** 234 * Index a collection of resources. 235 * @param resourceCollection the resource collection to index. 236 * @param document The document to index into. 237 * @param language The current language. 238 * @throws Exception if an error occurs while indexing. 239 */ 240 public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception 241 { 242 if (resourceCollection == null) 243 { 244 return; 245 } 246 247 for (AmetysObject object : resourceCollection.getChildren()) 248 { 249 if (object instanceof ResourceCollection) 250 { 251 indexResourceCollection((ResourceCollection) object, document, language); 252 } 253 else if (object instanceof Resource) 254 { 255 indexResourceContent((Resource) object, document, language); 256 } 257 } 258 } 259 260 /** 261 * Index a resource content (text in case of a document, and Dublin Core metadata). 262 * @param resource The resource to index. 263 * @param document The document to index into. 264 * @param language The current language, can be null. 265 */ 266 public void indexResourceContent(Resource resource, SolrInputDocument document, String language) 267 { 268 try (InputStream is = resource.getInputStream()) 269 { 270 String value = _tika.parseToString(is); 271 indexFulltextValue(document, value, language); 272 273 if (StringUtils.isNotBlank(value)) 274 { 275 int summaryEndIndex = value.lastIndexOf(' ', 200); 276 if (summaryEndIndex == -1) 277 { 278 summaryEndIndex = value.length(); 279 } 280 document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : "")); 281 } 282 283 for (String keyword : resource.getDCSubject()) 284 { 285 indexFulltextValue(document, keyword, language); 286 } 287 288 String desc = resource.getDCDescription(); 289 if (desc != null) 290 { 291 indexFulltextValue(document, desc, language); 292 } 293 294 // DC meta 295 indexDublinCoreMetadata(resource, document); 296 } 297 catch (Throwable e) 298 { 299 getLogger().error("Unable to index resource at " + resource.getPath(), e); 300 } 301 } 302 303 /** 304 * Index a full-text value. 305 * @param document The document to index into. 306 * @param text The text to index. 307 * @param language The content language, can be null. 308 */ 309 protected void indexFulltextValue(SolrInputDocument document, String text, String language) 310 { 311 if (StringUtils.isNotEmpty(language)) 312 { 313 SolrContentIndexer.indexFulltextValue(document, text, language); 314 } 315 else 316 { 317 Set<String> languages = _langManager.getAvailableLanguages().keySet(); 318 SolrContentIndexer.indexFulltextValue(document, text, languages); 319 } 320 } 321 322 /////////////////////////////////////////////////////////////////////////// 323 324 /** 325 * Index Dublin core metadata. 326 * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata. 327 * @param document the solr input document to populate. 328 */ 329 public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document) 330 { 331 _indexNonNullValue(document, "DCTitle", object.getDCTitle()); 332 _indexNonNullValue(document, "DCSubject", object.getDCSubject()); 333 _indexNonNullValue(document, "DCDescription", object.getDCDescription()); 334 _indexNonNullValue(document, "DCContributor", object.getDCContributor()); 335 _indexNonNullValue(document, "DCCoverage", object.getDCCoverage()); 336 _indexNonNullValue(document, "DCCreator", object.getDCCreator()); 337 _indexNonNullValue(document, "DCFormat", object.getDCFormat()); 338 _indexNonNullValue(document, "DCLanguage", object.getDCLanguage()); 339 _indexNonNullValue(document, "DCPublisher", object.getDCPublisher()); 340 _indexNonNullValue(document, "DCRights", object.getDCRights()); 341 _indexNonNullValue(document, "DCDate", SolrIndexer.dateFormat().format(object.getDCDate())); 342 } 343 344 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value) 345 { 346 if (value != null) 347 { 348 document.addField(fieldName, value); 349 } 350 } 351 352 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values) 353 { 354 if (values != null) 355 { 356 for (String value : values) 357 { 358 document.addField(fieldName, value); 359 } 360 } 361 } 362 363 /*private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value) 364 { 365 if (value != null) 366 { 367 document.addField(fieldName, value); 368 } 369 }*/ 370}