001/* 002 * Copyright 2021 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.cms.duplicate.contents; 017 018import java.util.ArrayList; 019import java.util.Collection; 020import java.util.Comparator; 021import java.util.HashMap; 022import java.util.HashSet; 023import java.util.LinkedList; 024import java.util.List; 025import java.util.Map; 026import java.util.Set; 027import java.util.function.Predicate; 028import java.util.stream.Collectors; 029 030import org.apache.avalon.framework.component.Component; 031import org.apache.avalon.framework.configuration.Configurable; 032import org.apache.avalon.framework.configuration.Configuration; 033import org.apache.avalon.framework.configuration.ConfigurationException; 034import org.apache.avalon.framework.service.ServiceException; 035import org.apache.avalon.framework.service.ServiceManager; 036import org.apache.avalon.framework.service.Serviceable; 037import org.apache.commons.collections.ListUtils; 038import org.apache.commons.lang3.StringUtils; 039import org.apache.commons.lang3.tuple.Pair; 040import org.apache.solr.common.SolrException; 041import org.slf4j.Logger; 042 043import org.ametys.cms.content.ContentHelper; 044import org.ametys.cms.contenttype.ContentType; 045import org.ametys.cms.contenttype.ContentTypeExtensionPoint; 046import org.ametys.cms.duplicate.contents.attr.DuplicateAttributeConfiguration; 047import org.ametys.cms.repository.Content; 048import org.ametys.cms.search.content.ContentSearcherFactory; 049import org.ametys.cms.search.query.AndQuery; 050import org.ametys.cms.search.query.ContentTypeQuery; 051import org.ametys.cms.search.query.MatchAllQuery; 052import org.ametys.cms.search.query.Query; 053import org.ametys.plugins.repository.AmetysObjectIterable; 054import org.ametys.plugins.repository.EmptyIterable; 055import org.ametys.runtime.plugin.component.AbstractLogEnabled; 056 057/** 058 * Component able to detect duplicates (and near duplicates) for a given content. 059 */ 060public class DuplicateContentsManager extends AbstractLogEnabled implements Component, Serviceable, Configurable 061{ 062 063 /** The component role. */ 064 public static final String ROLE = DuplicateContentsManager.class.getName(); 065 066 /** key for duplicate contents list */ 067 public static final String DUPLICATE_CONTENTS_KEY = "duplicates"; 068 /** key for near duplicate contents list */ 069 public static final String NEAR_DUPLICATE_CONTENTS_KEY = "nearDuplicates"; 070 /** key for boolean to know whether there are some content types or not */ 071 public static final String NO_DUPLICATE_CONTENTS_CONTENT_TYPE_KEY = "noDuplicatesContentType"; 072 /** key for boolean to know the query status */ 073 public static final String STATUS_KEY = "status"; 074 075 /** 076 * The status of the query 077 */ 078 public enum Status 079 { 080 /** Successful query */ 081 SUCCESSFUL, 082 /** too complex query */ 083 TOO_COMPLEX, 084 /** empty query */ 085 EMPTY 086 } 087 088 /** The content searcher factory. */ 089 protected ContentSearcherFactory _contentSearcherFactory; 090 091 /** The content helper */ 092 protected ContentHelper _contentHelper; 093 094 /** The duplicate content description */ 095 protected DuplicateContentConfiguration _duplicateContentConfiguration; 096 097 /** The content type extension point */ 098 ContentTypeExtensionPoint _cTypeEP; 099 100 @Override 101 public void service(ServiceManager manager) throws ServiceException 102 { 103 _cTypeEP = (ContentTypeExtensionPoint) manager.lookup(ContentTypeExtensionPoint.ROLE); 104 _contentSearcherFactory = (ContentSearcherFactory) manager.lookup(ContentSearcherFactory.ROLE); 105 _contentHelper = (ContentHelper) manager.lookup(ContentHelper.ROLE); 106 } 107 108 @Override 109 public void configure(Configuration configuration) throws ConfigurationException 110 { 111 _duplicateContentConfiguration = new DuplicateContentConfiguration(configuration, this); 112 logConfigurationErrors(getLogger()); 113 } 114 115 /** 116 * Get the data about duplicates and near duplicated for a given content 117 * @param content The content 118 * @return A map of data. key "duplicates" contains a list of the duplicates 119 * (id and label for each entry), and key "nearDuplicates" contains 120 * the near duplicates if requested (duplicates excluded). 121 */ 122 @SuppressWarnings("unchecked") 123 public Map<String, Object> searchDuplicates(Content content) 124 { 125 Map<String, Object> results = searchDuplicates(List.of(content)); 126 127 results.put(STATUS_KEY, ((Map<Content, Status>) results.get(STATUS_KEY)).get(content)); 128 results.put(DUPLICATE_CONTENTS_KEY, ((Map<Content, List<Content>>) results.get(DUPLICATE_CONTENTS_KEY)).getOrDefault(content, new ArrayList<>())); 129 results.put(NEAR_DUPLICATE_CONTENTS_KEY, ((Map<Content, List<Content>>) results.get(NEAR_DUPLICATE_CONTENTS_KEY)).getOrDefault(content, new ArrayList<>())); 130 return results; 131 } 132 133 /** 134 * Get the data about duplicates and near duplicates for all contents that match the content types included in the configuration 135 * @return the data about duplicates and near duplicates 136 */ 137 public Map<String, Object> searchDuplicates() 138 { 139 Map<String, Object> results = new HashMap<>(); 140 // Get content given duplicate content types. 141 Set<String> duplicatesContentTypes = _duplicateContentConfiguration.getDuplicatesContentTypes(); 142 143 if (duplicatesContentTypes.isEmpty()) 144 { 145 results.put(NO_DUPLICATE_CONTENTS_CONTENT_TYPE_KEY, true); 146 return results; 147 } 148 149 results.put(NO_DUPLICATE_CONTENTS_CONTENT_TYPE_KEY, false); 150 AmetysObjectIterable<Content> contents = _getContents(duplicatesContentTypes); 151 results.putAll(searchDuplicates(contents)); 152 return results; 153 } 154 155 /** 156 * Get the data about duplicates and near duplicates for a list of contents 157 * @param contents the contents to check 158 * @return the data about duplicates and near duplicates 159 */ 160 public Map<String, Object> searchDuplicates(Iterable<Content> contents) 161 { 162 Map<String, Object> results = new HashMap<>(); 163 164 Map<Content, List<Content>> duplicatesMap = new HashMap<>(); 165 Map<Content, List<Content>> nearDuplicatesMap = new HashMap<>(); 166 Map<Content, Status> statusMap = new HashMap<>(); 167 168 List<String> duplicatesFound = new LinkedList<>(); 169 170 for (Content content : contents) 171 { 172 if (!duplicatesFound.contains(content.getId())) 173 { 174 175 // Find the content types that will act as references to determine the duplicates attributes. 176 String[] contentTypes = content.getTypes(); 177 Set<String> duplicateContentTypes = _addDuplicatesContentTypes(contentTypes); 178 179 String[] mixinTypes = content.getMixinTypes(); 180 duplicateContentTypes.addAll(_addDuplicatesContentTypes(mixinTypes)); 181 182 183 // Duplicates attributes 184 Set<DuplicateContentTypeConfiguration> duplicateContentTypeConfigurations = duplicateContentTypes.stream() 185 .map(duplicateCtype -> _duplicateContentConfiguration.get(duplicateCtype)) 186 .collect(Collectors.toSet()); 187 188 // Search only if configuration is defined for the given content types 189 if (!duplicateContentTypeConfigurations.isEmpty()) 190 { 191 try 192 { 193 // Search for duplicate contents 194 List<Content> duplicates = _getDuplicates(content, duplicateContentTypeConfigurations, false, contentTypes); 195 if (!duplicates.isEmpty()) 196 { 197 duplicatesMap.put(content, duplicates); 198 } 199 duplicatesFound.addAll(duplicates.stream().map(Content::getId).collect(Collectors.toList())); 200 201 // Search for near duplicate contents if needed (different query for near duplicates) 202 boolean checkNearDuplicates = duplicateContentTypeConfigurations.stream() 203 .anyMatch(DuplicateContentTypeConfiguration::hasAnyNearDuplicateAttributes); 204 if (checkNearDuplicates) 205 { 206 List<Content> nearDuplicates = _getDuplicates(content, duplicateContentTypeConfigurations, true, contentTypes); 207 statusMap.put(content, Status.SUCCESSFUL); 208 nearDuplicates = ListUtils.removeAll(nearDuplicates, duplicates); 209 if (!nearDuplicates.isEmpty()) 210 { 211 nearDuplicatesMap.put(content, nearDuplicates); 212 } 213 } 214 } 215 catch (Exception e) 216 { 217 if (e instanceof SolrException && StringUtils.equals(((SolrException) e).getRootThrowable(), "org.apache.lucene.util.automaton.TooComplexToDeterminizeException")) 218 { 219 getLogger().warn("Fuzzy query too complex", e); 220 statusMap.put(content, Status.TOO_COMPLEX); 221 } 222 else 223 { 224 getLogger().error("Unable to query to the Solr server", e); 225 } 226 } 227 } 228 } 229 } 230 results.put(DUPLICATE_CONTENTS_KEY, duplicatesMap); 231 results.put(NEAR_DUPLICATE_CONTENTS_KEY, nearDuplicatesMap); 232 results.put(STATUS_KEY, statusMap); 233 return results; 234 } 235 236 /** 237 * Get the list of duplicates 238 * @param content The content 239 * @param duplicateContentTypeConfigurations the attribute list 240 * @param nearDuplicates true to check for near duplicates 241 * @param contentTypes the content types 242 * @return list of duplicates 243 * @throws Exception if a problem occurs while searching for duplicates 244 */ 245 protected List<Content> _getDuplicates(Content content, Set<DuplicateContentTypeConfiguration> duplicateContentTypeConfigurations, boolean nearDuplicates, String[] contentTypes) throws Exception 246 { 247 // Query 248 List<Query> queries = _getDuplicatesQueries(content, duplicateContentTypeConfigurations, nearDuplicates, contentTypes); 249 250 // Query building 251 Query query = new AndQuery(queries); 252 253 AmetysObjectIterable<Content> results; 254 results = _contentSearcherFactory.create() 255 .search(query); 256 257 return results.stream() 258 .filter(Predicate.not(content::equals)) 259 .sorted(Comparator.comparing(Content::getTitle, String.CASE_INSENSITIVE_ORDER)) 260 .collect(Collectors.toList()); 261 } 262 263 /** 264 * Get the list of queries to search for duplicates 265 * @param content The content 266 * @param duplicateContentTypeConfigurations the attribute list 267 * @param nearDuplicates true to check for near duplicates 268 * @param contentTypes the content types 269 * @return list of duplicates 270 */ 271 protected List<Query> _getDuplicatesQueries(Content content, Set<DuplicateContentTypeConfiguration> duplicateContentTypeConfigurations, boolean nearDuplicates, String[] contentTypes) 272 { 273 // Query 274 List<Query> queries = new LinkedList<>(); 275 276 // Content types (mixins are not added) 277 for (String contentType : contentTypes) 278 { 279 queries.add(new ContentTypeQuery(contentType)); 280 } 281 282 for (DuplicateContentTypeConfiguration duplicateContentTypeConfiguration : duplicateContentTypeConfigurations) 283 { 284 for (DuplicateAttributeConfiguration duplicateAttributeConfiguration : duplicateContentTypeConfiguration.getAttributeList()) 285 { 286 String path = duplicateAttributeConfiguration.getPath(); 287 288 queries.add(duplicateAttributeConfiguration.getQuery(content.getValue(path), nearDuplicates)); 289 } 290 } 291 return queries; 292 } 293 294 /** 295 * Get the configured duplicates content types 296 * @param contentTypeIds The content type identifiers for which duplicates content types should be found 297 * @return the duplicate content types 298 */ 299 protected Set<String> _addDuplicatesContentTypes(String[] contentTypeIds) 300 { 301 Set<String> duplicateContentTypes = new HashSet<>(); 302 for (String contentTypeId : contentTypeIds) 303 { 304 if (_duplicateContentConfiguration.getDuplicatesContentTypes().contains(contentTypeId)) 305 { 306 duplicateContentTypes.add(contentTypeId); 307 } 308 else 309 { 310 ContentType contentType = _cTypeEP.getExtension(contentTypeId); 311 312 duplicateContentTypes.addAll(_addDuplicatesContentTypes(contentType.getSupertypeIds())); 313 } 314 } 315 return duplicateContentTypes; 316 } 317 318 /** 319 * Retrieves indexed contents that have at least one of the listed content types 320 * @param contentTypes The desired content types 321 * @return solr query results 322 */ 323 protected AmetysObjectIterable<Content> _getContents(Collection<String> contentTypes) 324 { 325 try 326 { 327 return _contentSearcherFactory.create(contentTypes).search(new MatchAllQuery()); 328 } 329 catch (Exception e) 330 { 331 getLogger().error("Unable to query to the Solr server", e); 332 } 333 return new EmptyIterable<>(); 334 } 335 336 /** 337 * Get the list of configuration errors 338 * @return the configuration error list 339 */ 340 protected List<Pair<String, List<Object>>> _getConfigurationErrors() 341 { 342 return _duplicateContentConfiguration.getErrors(); 343 } 344 345 /** 346 * Get the list of configuration warns 347 * @return the configuration warn list 348 */ 349 protected List<Pair<String, List<Object>>> _getConfigurationWarns() 350 { 351 return _duplicateContentConfiguration.getWarns(); 352 } 353 354 /** 355 * Get the content types set 356 * @return the content types set 357 */ 358 public Set<String> getContentTypeIds() 359 { 360 return _duplicateContentConfiguration.getContentTypes() 361 .keySet() 362 .stream() 363 .collect(Collectors.toSet()); 364 } 365 366 /** 367 * Log all errors of the configuration 368 * @param logger the logger 369 */ 370 public void logConfigurationErrors(Logger logger) 371 { 372 Pair<String, Object[]> configurationErrorsPair = _getConfigurationErrorsPair(); 373 if (!_getConfigurationErrors().isEmpty()) 374 { 375 getLogger().error(configurationErrorsPair.getKey(), configurationErrorsPair.getValue()); 376 } 377 else if (!_getConfigurationWarns().isEmpty()) 378 { 379 getLogger().warn(configurationErrorsPair.getKey(), configurationErrorsPair.getValue()); 380 } 381 } 382 383 private Pair<String, Object[]> _getConfigurationErrorsPair() 384 { 385 List<Pair<String, List<Object>>> errorsMap = new ArrayList<>(_getConfigurationErrors()); 386 errorsMap.addAll(_getConfigurationWarns()); 387 StringBuilder errors = new StringBuilder(); 388 List<Object> parameters = new ArrayList<>(); 389 for (Pair<String, List<Object>> error : errorsMap) 390 { 391 errors.append(error.getKey()).append("\n"); 392 parameters.addAll(error.getValue()); 393 } 394 return Pair.of(errors.toString(), parameters.toArray(Object[]::new)); 395 } 396 397 /** 398 * Returns <code>true</code> if there is at least one configuration error 399 * @return <code>true</code> if there is at least one configuration error 400 */ 401 public boolean hasConfigurationErrors() 402 { 403 return !_getConfigurationErrors().isEmpty() || !_getConfigurationWarns().isEmpty(); 404 } 405}