001/* 002 * Copyright 2015 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.cms.content.indexing.solr; 017 018import java.io.InputStream; 019import java.util.Date; 020import java.util.Set; 021 022import org.apache.avalon.framework.component.Component; 023import org.apache.avalon.framework.service.ServiceException; 024import org.apache.avalon.framework.service.ServiceManager; 025import org.apache.avalon.framework.service.Serviceable; 026import org.apache.commons.lang3.StringUtils; 027import org.apache.solr.common.SolrInputDocument; 028import org.apache.tika.Tika; 029 030import org.ametys.cms.languages.LanguagesManager; 031import org.ametys.core.user.UserIdentity; 032import org.ametys.plugins.explorer.resources.Resource; 033import org.ametys.plugins.explorer.resources.ResourceCollection; 034import org.ametys.plugins.explorer.resources.ResourceHelper; 035import org.ametys.plugins.explorer.resources.metadata.TikaProvider; 036import org.ametys.plugins.repository.AmetysObject; 037import org.ametys.plugins.repository.TraversableAmetysObject; 038import org.ametys.plugins.repository.dublincore.DublinCoreAwareAmetysObject; 039import org.ametys.runtime.plugin.component.AbstractLogEnabled; 040 041/** 042 * Solr resource indexer.<p> 043 * Populate a Solr input document with the following fields: 044 * <dl> 045 * <dt>id 046 * <dd>resource id 047 * <dt>type 048 * <dd>with <code>"document"</code> value 049 * <dt>full 050 * <dd>resource content 051 * </dl> 052 */ 053public class SolrResourceIndexer extends AbstractLogEnabled implements Component, Serviceable, SolrFieldNames 054{ 055 /** The avalon role. */ 056 public static final String ROLE = SolrResourceIndexer.class.getName(); 057 058 /** The Tika instance. */ 059 protected Tika _tika; 060 061 /** The language manager. */ 062 protected LanguagesManager _langManager; 063 064 @Override 065 public void service(ServiceManager manager) throws ServiceException 066 { 067 TikaProvider tikaProvider = (TikaProvider) manager.lookup(TikaProvider.ROLE); 068 _tika = tikaProvider.getTika(); 069 _langManager = (LanguagesManager) manager.lookup(LanguagesManager.ROLE); 070 } 071 072 /** 073 * Index a resource. 074 * @param resource The resource to index. 075 * @param document The Solr document to index into. 076 * @param documentType The document type of the resource 077 * @throws Exception if an error occurs. 078 */ 079 public void indexResource(Resource resource, SolrInputDocument document, String documentType) throws Exception 080 { 081 indexResource(resource, document, documentType, null, null); 082 } 083 084 /** 085 * Index a resource. 086 * @param resource The resource to index. 087 * @param document The Solr document to index into. 088 * @param documentType The document type of the resource 089 * @param language The query language. 090 * @throws Exception if an error occurs. 091 */ 092 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language) throws Exception 093 { 094 indexResource(resource, document, documentType, language, null); 095 } 096 097 /** 098 * Index a resource. 099 * @param resource The resource to index. 100 * @param document The Solr document to index into. 101 * @param documentType The document type of the resource 102 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 103 * @throws Exception if an error occurs. 104 */ 105 public void indexResource(Resource resource, SolrInputDocument document, String documentType, TraversableAmetysObject resourceRoot) throws Exception 106 { 107 indexResource(resource, document, documentType, null, resourceRoot); 108 } 109 110 /** 111 * Index a resource. 112 * @param resource The resource to index. 113 * @param document The Solr document to index into. 114 * @param documentType The document type of the resource 115 * @param language The language, can be null. 116 * @param resourceRoot The resource root, can be null. When null, it will have to be computed. 117 * @throws Exception if an error occurs. 118 */ 119 public void indexResource(Resource resource, SolrInputDocument document, String documentType, String language, TraversableAmetysObject resourceRoot) throws Exception 120 { 121 // Resource id - Store.YES, Index.NOT_ANALYZED 122 document.addField(ID, resource.getId()); 123 // Type is resource - Store.YES, Index.NOT_ANALYZED 124 document.addField(DOCUMENT_TYPE, documentType); 125 // The resource path. 126 document.setField(PATH, resource.getResourcePath()); 127 document.addField(FILENAME, resource.getName()); 128 129 // Title 130 String title = StringUtils.substringBeforeLast(resource.getName(), "."); 131 document.addField(TITLE, title); 132 document.setField(TITLE_SORT, resource.getName()); 133 document.addField(TITLE + "_s", title); 134 document.addField(TITLE + "_s_sort", title); 135 136 // Replaces "all-not-analyzed". 137 indexFulltextValue(document, title, language); 138 139 // Last modified 140 document.addField(LAST_MODIFIED + "_dt", SolrIndexer.dateFormat().format(resource.getLastModified())); 141 142 // Mime types - Store.YES, Index.ANALYZED 143 document.addField(MIME_TYPES, resource.getMimeType()); 144 // Length - Store.YES, Index.NO 145 document.addField(LENGTH, resource.getLength()); 146 147 AmetysObject root = resourceRoot == null ? ResourceHelper.getResourceRoot(resource) : resourceRoot; 148 document.addField(RESOURCE_ROOT_ID, root.getId()); 149 150 // Solr facet specific : dates-facet 151 Date date = resource.getDCDate(); 152 String formattedDate = SolrIndexer.dateFormat().format(date); 153 if (formattedDate != null) 154 { 155 document.setField(RESOURCE_DATE, formattedDate); 156 document.setField(DATE_FOR_SORTING, formattedDate); 157 document.setField(DATES_FACET, formattedDate); 158 } 159 160 // Resource author 161 String author = UserIdentity.userIdentityToString(resource.getCreator()); 162 if (StringUtils.isNotBlank(author)) 163 { 164 document.setField(RESOURCE_CREATOR, author); 165 } 166 167 // Hard-coded content type for facets. 168 // TODO Move to specific "embedded mode" method? 169 document.addField(CONTENT_TYPES, CONTENT_TYPE_RESOURCE); 170 171 indexResourceContent(resource, document, language); 172 } 173 174 /** 175 * Index a collection of resources. 176 * @param resourceCollection the resource collection to index. 177 * @param document The document to index into. 178 * @param language The current language. 179 * @throws Exception if an error occurs while indexing. 180 */ 181 public void indexResourceCollection(ResourceCollection resourceCollection, SolrInputDocument document, String language) throws Exception 182 { 183 if (resourceCollection == null) 184 { 185 return; 186 } 187 188 for (AmetysObject object : resourceCollection.getChildren()) 189 { 190 if (object instanceof ResourceCollection) 191 { 192 indexResourceCollection((ResourceCollection) object, document, language); 193 } 194 else if (object instanceof Resource) 195 { 196 indexResourceContent((Resource) object, document, language); 197 } 198 } 199 } 200 201 /** 202 * Index a resource content (text in case of a document, and Dublin Core metadata). 203 * @param resource The resource to index. 204 * @param document The document to index into. 205 * @param language The current language, can be null. 206 */ 207 public void indexResourceContent(Resource resource, SolrInputDocument document, String language) 208 { 209 try (InputStream is = resource.getInputStream()) 210 { 211 String value = _tika.parseToString(is); 212 indexFulltextValue(document, value, language); 213 214 if (StringUtils.isNotBlank(value)) 215 { 216 int summaryEndIndex = value.lastIndexOf(' ', 200); 217 if (summaryEndIndex == -1) 218 { 219 summaryEndIndex = value.length(); 220 } 221 document.addField(EXCERPT, value.substring(0, summaryEndIndex) + (summaryEndIndex != value.length() ? "…" : "")); 222 } 223 224 for (String keyword : resource.getDCSubject()) 225 { 226 indexFulltextValue(document, keyword, language); 227 } 228 229 String desc = resource.getDCDescription(); 230 if (desc != null) 231 { 232 indexFulltextValue(document, desc, language); 233 } 234 235 // DC meta 236 indexDublinCoreMetadata(resource, document); 237 } 238 catch (Throwable e) 239 { 240 getLogger().error("Unable to index resource at " + resource.getPath(), e); 241 } 242 } 243 244 /** 245 * Index a full-text value. 246 * @param document The document to index into. 247 * @param text The text to index. 248 * @param language The content language, can be null. 249 */ 250 protected void indexFulltextValue(SolrInputDocument document, String text, String language) 251 { 252 if (StringUtils.isNotEmpty(language)) 253 { 254 SolrContentIndexer.indexFulltextValue(document, text, language); 255 } 256 else 257 { 258 Set<String> languages = _langManager.getAvailableLanguages().keySet(); 259 SolrContentIndexer.indexFulltextValue(document, text, languages); 260 } 261 } 262 263 /////////////////////////////////////////////////////////////////////////// 264 265 /** 266 * Index Dublin core metadata. 267 * @param object the {@link DublinCoreAwareAmetysObject} holding Dublin Core metadata. 268 * @param document the solr input document to populate. 269 */ 270 public void indexDublinCoreMetadata(DublinCoreAwareAmetysObject object, SolrInputDocument document) 271 { 272 _indexNonNullValue(document, "DCTitle", object.getDCTitle()); 273 _indexNonNullValue(document, "DCSubject", object.getDCSubject()); 274 _indexNonNullValue(document, "DCDescription", object.getDCDescription()); 275 _indexNonNullValue(document, "DCContributor", object.getDCContributor()); 276 _indexNonNullValue(document, "DCCoverage", object.getDCCoverage()); 277 _indexNonNullValue(document, "DCCreator", object.getDCCreator()); 278 _indexNonNullValue(document, "DCFormat", object.getDCFormat()); 279 _indexNonNullValue(document, "DCLanguage", object.getDCLanguage()); 280 _indexNonNullValue(document, "DCPublisher", object.getDCPublisher()); 281 _indexNonNullValue(document, "DCRights", object.getDCRights()); 282 _indexNonNullValue(document, "DCDate", SolrIndexer.dateFormat().format(object.getDCDate())); 283 } 284 285 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String value) 286 { 287 if (value != null) 288 { 289 document.addField(fieldName, value); 290 } 291 } 292 293 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, String[] values) 294 { 295 if (values != null) 296 { 297 for (String value : values) 298 { 299 document.addField(fieldName, value); 300 } 301 } 302 } 303 304 private static void _indexNonNullValue(SolrInputDocument document, String fieldName, Date value) 305 { 306 if (value != null) 307 { 308 document.addField(fieldName, value); 309 } 310 } 311 312}