/*
 *  Copyright 2015 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.cms.content.indexing.solr;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Set;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.service.ServiceException;
import org.apache.avalon.framework.service.ServiceManager;
import org.apache.avalon.framework.service.Serviceable;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.Tika;

import org.ametys.cms.languages.LanguagesManager;
import org.ametys.core.user.UserIdentity;
import org.ametys.plugins.explorer.resources.Resource;
import org.ametys.plugins.explorer.resources.ResourceCollection;
import org.ametys.plugins.explorer.resources.ResourceHelper;
import org.ametys.plugins.explorer.resources.metadata.TikaProvider;
import org.ametys.plugins.repository.AmetysObject;
import org.ametys.plugins.repository.TraversableAmetysObject;
import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

/**
 * Solr resource indexer.<p>
 * Populate a Solr input document with, among others, the following fields:
 * <dl>
 *  <dt>id
 *  <dd>resource id
 *  <dt>type
 *  <dd>the document type given by the caller
 *  <dt>full
 *  <dd>resource content (text extracted with Tika)
 * </dl>
 * Path, file name, title, MIME type, length, ancestors, dates, creator and
 * Dublin Core metadata are indexed as well.
 */
public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames
{
    /** The avalon role. */
    public static final String ROLE = SolrResourceIndexer.class.getName();
    
    /** Maximum number of characters of extracted text kept in the excerpt (ellipsis excluded). */
    private static final int __EXCERPT_MAX_LENGTH = 200;
    
    /** The Tika instance. */
    protected Tika _tika;
    
    /** The language manager. */
    protected LanguagesManager _langManager;
    
    @Override
    public void service(ServiceManager manager) throws ServiceException
    {
        TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE);
        _tika = tikaProvider.getTika();
        _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE);
    }
    
    /**
     * Index a resource, with no specific language and no pre-computed resource root.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception
    {
        indexResource(resource, document, documentType, null, null);
    }
    
    /**
     * Index a resource for a given language. The resource root will be computed.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language used for the full-text fields, can be null or empty.
     *                 In that case the text is indexed for all the available languages.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception
    {
        indexResource(resource, document, documentType, language, null);
    }
    
    /**
     * Index a resource with an already known resource root and no specific language.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception
    {
        indexResource(resource, document, documentType, null, resourceRoot);
    }
    
    /**
     * Index a resource: identification, title, technical properties, ancestors,
     * dates, creator and finally the resource content itself.
     * @param resource The resource to index.
     * @param document The Solr document to index into.
     * @param documentType The document type of the resource
     * @param language The language, can be null. When null or empty, full-text values
     *                 are indexed for all the available languages.
     * @param resourceRoot The resource root, can be null. When null, it will have to be computed.
     * @throws Exception if an error occurs.
     */
    public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception
    {
        // Resource id - Store.YES, Index.NOT_ANALYZED
        document.addField(ID, resource.getId());
        // Type is resource - Store.YES, Index.NOT_ANALYZED
        document.addField(DOCUMENT_TYPE, documentType);
        // The resource path.
        document.setField(PATH, resource.getResourcePath());
        document.addField(FILENAME, resource.getName());
        
        // Title: the file name without its last extension
        String title = StringUtils.substringBeforeLast(resource.getName(), ".");
        document.addField(TITLE, title);
        document.setField(TITLE_SORT, resource.getName());
        document.addField(TITLE + "_s", title);
        document.addField(TITLE + "_s_sort", title);
        
        // Replaces "all-not-analyzed".
        indexFulltextValue(document, title, language);
        
        // Last modified
        document.addField(LAST_MODIFIED + "_dt", SolrIndexer.dateFormat().format(resource.getLastModified()));
        
        // Mime types - Store.YES, Index.ANALYZED
        document.addField(MIME_TYPES, resource.getMimeType());
        // Length - Store.YES, Index.NO
        document.addField(LENGTH, resource.getLength());
        
        AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot;
        document.addField(RESOURCE_ROOT_ID, root.getId());
        
        // Parents resource collections of the resource
        _indexAncestorIds(resource, document);
        
        // Solr facet specific : dates-facet
        // The date itself must be tested before formatting: formatting a missing
        // date fails instead of yielding null, so testing the result is useless.
        Date date = resource.getDCDate();
        if (date != null)
        {
            String formattedDate = SolrIndexer.dateFormat().format(date);
            document.setField(RESOURCE_DATE, formattedDate);
            document.setField(DATE_FOR_SORTING, formattedDate);
            document.setField(DATES_FACET, formattedDate);
        }
        
        // Resource author
        String author = UserIdentity.userIdentityToString(resource.getCreator());
        if (StringUtils.isNotBlank(author))
        {
            document.setField(RESOURCE_CREATOR, author);
        }
        
        // Hard-coded content type for facets.
        // TODO Move to specific "embedded mode" method?
        document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE);
        
        // Indexation of AmetysObject property
        document.addField(SolrFieldNames.IS_AMETYS_OBJECT, true);
        
        indexResourceContent(resource, document, language);
    }
    
    /**
     * Index the ids of all the {@link ResourceCollection}s found while walking up
     * from the resource's parent, nearest first. The walk stops at the first
     * ancestor which is not a resource collection.
     * @param resource The resource.
     * @param document The Solr document to index into.
     */
    private void _indexAncestorIds(Resource resource, SolrInputDocument document)
    {
        List<String> ancestorIds = new ArrayList<>();
        AmetysObject parent = resource.getParent();
        while (parent instanceof ResourceCollection)
        {
            ancestorIds.add(parent.getId());
            parent = parent.getParent();
        }
        
        document.addField(RESOURCE_ANCESTOR_IDS, ancestorIds);
    }
    
    /**
     * Index a collection of resources: the content of every resource found in the
     * collection and, recursively, in its sub-collections is indexed into the same document.
     * @param resourceCollection the resource collection to index. Nothing is done when null.
     * @param document The document to index into.
     * @param language The current language.
     * @throws Exception if an error occurs while indexing.
     */
    public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception
    {
        if (resourceCollection == null)
        {
            return;
        }
        
        for (AmetysObject object : resourceCollection.getChildren())
        {
            if (object instanceof ResourceCollection)
            {
                indexResourceCollection((ResourceCollection) object, document, language);
            }
            else if (object instanceof Resource)
            {
                indexResourceContent((Resource) object, document, language);
            }
        }
    }
    
    /**
     * Index a resource content (text in case of a document, and Dublin Core metadata).<br>
     * This is a best-effort operation: any failure is logged and not propagated.
     * @param resource The resource to index.
     * @param document The document to index into.
     * @param language The current language, can be null.
     */
    public void indexResourceContent(Resource resource, SolrInputDocument document, String language)
    {
        try (InputStream is = resource.getInputStream())
        {
            String value = _tika.parseToString(is);
            indexFulltextValue(document, value, language);
            
            if (StringUtils.isNotBlank(value))
            {
                document.addField(EXCERPT, _buildExcerpt(value));
            }
            
            // The subjects may be missing: do not let that abort the rest of the indexation
            String[] keywords = resource.getDCSubject();
            if (keywords != null)
            {
                for (String keyword : keywords)
                {
                    indexFulltextValue(document, keyword, language);
                }
            }
            
            String desc = resource.getDCDescription();
            if (desc != null)
            {
                indexFulltextValue(document, desc, language);
            }
            
            // DC meta
            indexDublinCoreMetadata(resource, document);
        }
        catch (Throwable e)
        {
            // Throwable is caught on purpose: a resource that cannot be parsed
            // must not prevent the indexation of the rest of the document.
            getLogger().error("Unable to index resource at " + resource.getPath(), e);
        }
    }
    
    /**
     * Build the excerpt of an extracted text.<br>
     * A text which fits in the maximum length is kept as is. A longer text is cut at the
     * last space found at or before the maximum length (or at the maximum length itself
     * when there is no such space) and an ellipsis is appended.
     * @param text The extracted text, must not be null.
     * @return The excerpt.
     */
    private static String _buildExcerpt(String text)
    {
        if (text.length() <= __EXCERPT_MAX_LENGTH)
        {
            // Nothing to cut: a short text must not be truncated at its last space
            return text;
        }
        
        int endIndex = text.lastIndexOf(' ', __EXCERPT_MAX_LENGTH);
        if (endIndex == -1)
        {
            // No word boundary available: hard cut, without splitting a surrogate pair
            endIndex = __EXCERPT_MAX_LENGTH;
            if (Character.isHighSurrogate(text.charAt(endIndex - 1)))
            {
                endIndex--;
            }
        }
        
        return text.substring(0, endIndex) + "…";
    }
    
    /**
     * Index a full-text value, either for the given language or, when no language
     * is given, for all the languages known by the language manager.
     * @param document The document to index into.
     * @param text The text to index.
     * @param language The content language, can be null.
     */
    protected void indexFulltextValue(SolrInputDocument document, String text, String language)
    {
        if (StringUtils.isNotEmpty(language))
        {
            SolrContentIndexer.indexFulltextValue(document, text, language);
        }
        else
        {
            Set<String> languages = _langManager.getAvailableLanguages().keySet();
            SolrContentIndexer.indexFulltextValue(document, text, languages);
        }
    }
    
    ///////////////////////////////////////////////////////////////////////////
    
    /**
     * Index Dublin core metadata. Only the metadata which are actually set
     * (non-null) are added to the document.
     * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata.
     * @param document the solr input document to populate.
     */
    public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document)
    {
        _indexNonNullValue(document, "DCTitle", object.getDCTitle());
        _indexNonNullValue(document, "DCSubject", object.getDCSubject());
        _indexNonNullValue(document, "DCDescription", object.getDCDescription());
        _indexNonNullValue(document, "DCContributor", object.getDCContributor());
        _indexNonNullValue(document, "DCCoverage", object.getDCCoverage());
        _indexNonNullValue(document, "DCCreator", object.getDCCreator());
        _indexNonNullValue(document, "DCFormat", object.getDCFormat());
        _indexNonNullValue(document, "DCLanguage", object.getDCLanguage());
        _indexNonNullValue(document, "DCPublisher", object.getDCPublisher());
        _indexNonNullValue(document, "DCRights", object.getDCRights());
        // The date is formatted by the Date variant, which skips a missing date
        _indexNonNullValue(document, "DCDate", object.getDCDate());
    }
    
    /**
     * Add a string value to a field, unless the value is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param value The value, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value)
    {
        if (value != null)
        {
            document.addField(fieldName, value);
        }
    }
    
    /**
     * Add each string of an array to a field, unless the array is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param values The values, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values)
    {
        if (values != null)
        {
            for (String value : values)
            {
                document.addField(fieldName, value);
            }
        }
    }
    
    /**
     * Add a date to a field, formatted with {@link SolrIndexer#dateFormat()}, unless the date is null.
     * @param document The document to index into.
     * @param fieldName The name of the field.
     * @param value The date, can be null.
     */
    private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value)
    {
        if (value != null)
        {
            document.addField(fieldName, SolrIndexer.dateFormat().format(value));
        }
    }
}