001/*
002 *  Copyright 2021 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.cms.duplicate.contents;
017
018import java.util.ArrayList;
019import java.util.Collection;
020import java.util.Comparator;
021import java.util.HashMap;
022import java.util.HashSet;
023import java.util.LinkedList;
024import java.util.List;
025import java.util.Map;
026import java.util.Set;
027import java.util.function.Predicate;
028import java.util.stream.Collectors;
029
030import org.apache.avalon.framework.component.Component;
031import org.apache.avalon.framework.configuration.Configurable;
032import org.apache.avalon.framework.configuration.Configuration;
033import org.apache.avalon.framework.configuration.ConfigurationException;
034import org.apache.avalon.framework.service.ServiceException;
035import org.apache.avalon.framework.service.ServiceManager;
036import org.apache.avalon.framework.service.Serviceable;
037import org.apache.commons.collections.ListUtils;
038import org.apache.commons.lang3.StringUtils;
039import org.apache.commons.lang3.tuple.Pair;
040import org.apache.solr.common.SolrException;
041import org.slf4j.Logger;
042
043import org.ametys.cms.content.ContentHelper;
044import org.ametys.cms.contenttype.ContentType;
045import org.ametys.cms.contenttype.ContentTypeExtensionPoint;
046import org.ametys.cms.duplicate.contents.attr.DuplicateAttributeConfiguration;
047import org.ametys.cms.repository.Content;
048import org.ametys.cms.search.content.ContentSearcherFactory;
049import org.ametys.cms.search.query.AndQuery;
050import org.ametys.cms.search.query.ContentTypeQuery;
051import org.ametys.cms.search.query.MatchAllQuery;
052import org.ametys.cms.search.query.Query;
053import org.ametys.plugins.repository.AmetysObjectIterable;
054import org.ametys.plugins.repository.EmptyIterable;
055import org.ametys.runtime.plugin.component.AbstractLogEnabled;
056
057/**
058 * Component able to detect duplicates (and near duplicates) for a given content.
059 */
060public class DuplicateContentsManager extends AbstractLogEnabled implements Component, Serviceable, Configurable
061{
062    
063    /** The component role. */
064    public static final String ROLE = DuplicateContentsManager.class.getName();
065
066    /** key for duplicate contents list */
067    public static final String DUPLICATE_CONTENTS_KEY = "duplicates";
068    /** key for near duplicate contents list */
069    public static final String NEAR_DUPLICATE_CONTENTS_KEY = "nearDuplicates";
070    /** key for boolean to know whether there are some content types or not  */
071    public static final String NO_DUPLICATE_CONTENTS_CONTENT_TYPE_KEY = "noDuplicatesContentType";
072    /** key for boolean to know the query status  */
073    public static final String STATUS_KEY = "status";
074    
075    /**
076     * The status of the query
077     */
078    public enum Status
079    {
080        /** Successful query */
081        SUCCESSFUL, 
082        /** too complex query */
083        TOO_COMPLEX, 
084        /** empty query */
085        EMPTY
086    }
087
088    /** The content searcher factory. */
089    protected ContentSearcherFactory _contentSearcherFactory;
090
091    /** The content helper */
092    protected ContentHelper _contentHelper;
093    
094    /** The duplicate content description */
095    protected DuplicateContentConfiguration _duplicateContentConfiguration;
096    
097    /** The content type extension point */
098    ContentTypeExtensionPoint _cTypeEP;
099    
100    @Override
101    public void service(ServiceManager manager) throws ServiceException
102    {
103        _cTypeEP = (ContentTypeExtensionPoint) manager.lookup(ContentTypeExtensionPoint.ROLE);
104        _contentSearcherFactory = (ContentSearcherFactory) manager.lookup(ContentSearcherFactory.ROLE);
105        _contentHelper = (ContentHelper) manager.lookup(ContentHelper.ROLE);
106    }
107        
108    @Override
109    public void configure(Configuration configuration) throws ConfigurationException
110    {
111        _duplicateContentConfiguration = new DuplicateContentConfiguration(configuration, this);
112        logConfigurationErrors(getLogger());
113    }
114
115    /**
116     * Get the data about duplicates and near duplicated for a given content
117     * @param content The content
118     * @return A map of data. key "duplicates" contains a list of the duplicates
119     *         (id and label for each entry), and key "nearDuplicates" contains
120     *         the near duplicates if requested (duplicates excluded).
121     */
122    @SuppressWarnings("unchecked")
123    public Map<String, Object> searchDuplicates(Content content)
124    {
125        Map<String, Object> results = searchDuplicates(List.of(content));
126
127        results.put(STATUS_KEY, ((Map<Content, Status>) results.get(STATUS_KEY)).get(content));
128        results.put(DUPLICATE_CONTENTS_KEY, ((Map<Content, List<Content>>) results.get(DUPLICATE_CONTENTS_KEY)).getOrDefault(content, new ArrayList<>()));
129        results.put(NEAR_DUPLICATE_CONTENTS_KEY, ((Map<Content, List<Content>>) results.get(NEAR_DUPLICATE_CONTENTS_KEY)).getOrDefault(content, new ArrayList<>()));
130        return results;
131    }
132
133    /**
134     * Get the data about duplicates and near duplicates for all contents that match the content types included in the configuration
135     * @return the data about duplicates and near duplicates
136     */
137    public Map<String, Object> searchDuplicates()
138    {
139        Map<String, Object> results = new HashMap<>();
140        // Get content given duplicate content types.
141        Set<String> duplicatesContentTypes = _duplicateContentConfiguration.getDuplicatesContentTypes();
142        
143        if (duplicatesContentTypes.isEmpty())
144        {
145            results.put(NO_DUPLICATE_CONTENTS_CONTENT_TYPE_KEY, true);
146            return results;
147        }
148
149        results.put(NO_DUPLICATE_CONTENTS_CONTENT_TYPE_KEY, false);
150        AmetysObjectIterable<Content> contents = _getContents(duplicatesContentTypes);
151        results.putAll(searchDuplicates(contents));
152        return results;
153    }
154    
155    /**
156     * Get the data about duplicates and near duplicates for a list of contents
157     * @param contents the contents to check
158     * @return the data about duplicates and near duplicates
159     */
160    public Map<String, Object> searchDuplicates(Iterable<Content> contents)
161    {
162        Map<String, Object> results = new HashMap<>();
163
164        Map<Content, List<Content>> duplicatesMap = new HashMap<>();
165        Map<Content, List<Content>> nearDuplicatesMap = new HashMap<>();
166        Map<Content, Status> statusMap = new HashMap<>();
167        
168        List<String> duplicatesFound = new LinkedList<>();
169        
170        for (Content content : contents)
171        {
172            if (!duplicatesFound.contains(content.getId()))
173            {
174                
175                // Find the content types that will act as references to determine the duplicates attributes.
176                String[] contentTypes = content.getTypes();
177                Set<String> duplicateContentTypes = _addDuplicatesContentTypes(contentTypes);
178                
179                String[] mixinTypes = content.getMixinTypes();
180                duplicateContentTypes.addAll(_addDuplicatesContentTypes(mixinTypes));
181                
182
183                // Duplicates attributes
184                Set<DuplicateContentTypeConfiguration> duplicateContentTypeConfigurations = duplicateContentTypes.stream()
185                    .map(duplicateCtype -> _duplicateContentConfiguration.get(duplicateCtype))
186                    .collect(Collectors.toSet());
187                
188                // Search only if configuration is defined for the given content types
189                if (!duplicateContentTypeConfigurations.isEmpty())
190                {
191                    try
192                    {
193                        // Search for duplicate contents
194                        List<Content> duplicates = _getDuplicates(content, duplicateContentTypeConfigurations, false, contentTypes);
195                        if (!duplicates.isEmpty())
196                        {
197                            duplicatesMap.put(content, duplicates);
198                        }
199                        duplicatesFound.addAll(duplicates.stream().map(Content::getId).collect(Collectors.toList()));
200                        
201                        // Search for near duplicate contents if needed (different query for near duplicates)
202                        boolean checkNearDuplicates = duplicateContentTypeConfigurations.stream()
203                                .anyMatch(DuplicateContentTypeConfiguration::hasAnyNearDuplicateAttributes);
204                        if (checkNearDuplicates)
205                        {
206                            List<Content> nearDuplicates = _getDuplicates(content, duplicateContentTypeConfigurations, true, contentTypes);
207                            statusMap.put(content, Status.SUCCESSFUL);
208                            nearDuplicates = ListUtils.removeAll(nearDuplicates, duplicates);
209                            if (!nearDuplicates.isEmpty())
210                            {
211                                nearDuplicatesMap.put(content, nearDuplicates);
212                            }
213                        }
214                    }
215                    catch (Exception e)
216                    {
217                        if (e instanceof SolrException && StringUtils.equals(((SolrException) e).getRootThrowable(), "org.apache.lucene.util.automaton.TooComplexToDeterminizeException"))
218                        {
219                            getLogger().warn("Fuzzy query too complex", e);
220                            statusMap.put(content, Status.TOO_COMPLEX);
221                        }
222                        else
223                        {
224                            getLogger().error("Unable to query to the Solr server", e);
225                        }
226                    }
227                }
228            }
229        }
230        results.put(DUPLICATE_CONTENTS_KEY, duplicatesMap);
231        results.put(NEAR_DUPLICATE_CONTENTS_KEY, nearDuplicatesMap);
232        results.put(STATUS_KEY, statusMap);
233        return results;
234    }
235    
236    /**
237     * Get the list of duplicates
238     * @param content The content
239     * @param duplicateContentTypeConfigurations the attribute list
240     * @param nearDuplicates true to check for near duplicates
241     * @param contentTypes the content types
242     * @return list of duplicates 
243     * @throws Exception if a problem occurs while searching for duplicates
244     */
245    protected List<Content> _getDuplicates(Content content, Set<DuplicateContentTypeConfiguration> duplicateContentTypeConfigurations, boolean nearDuplicates, String[] contentTypes) throws Exception
246    {
247        // Query
248        List<Query> queries = _getDuplicatesQueries(content, duplicateContentTypeConfigurations, nearDuplicates, contentTypes);
249        
250        // Query building
251        Query query = new AndQuery(queries);
252
253        AmetysObjectIterable<Content> results;
254        results = _contentSearcherFactory.create()
255                .search(query);
256        
257        return results.stream()
258                .filter(Predicate.not(content::equals))
259                .sorted(Comparator.comparing(Content::getTitle, String.CASE_INSENSITIVE_ORDER))
260                .collect(Collectors.toList());
261    }
262
263    /**
264     * Get the list of queries to search for duplicates
265     * @param content The content
266     * @param duplicateContentTypeConfigurations the attribute list
267     * @param nearDuplicates true to check for near duplicates
268     * @param contentTypes the content types
269     * @return list of duplicates 
270     */
271    protected List<Query> _getDuplicatesQueries(Content content, Set<DuplicateContentTypeConfiguration> duplicateContentTypeConfigurations, boolean nearDuplicates, String[] contentTypes)
272    {
273        // Query
274        List<Query> queries = new LinkedList<>();
275
276        // Content types (mixins are not added)
277        for (String contentType : contentTypes)
278        {
279            queries.add(new ContentTypeQuery(contentType));
280        }
281
282        for (DuplicateContentTypeConfiguration duplicateContentTypeConfiguration : duplicateContentTypeConfigurations)
283        {
284            for (DuplicateAttributeConfiguration duplicateAttributeConfiguration : duplicateContentTypeConfiguration.getAttributeList())
285            {
286                String path = duplicateAttributeConfiguration.getPath();
287                
288                queries.add(duplicateAttributeConfiguration.getQuery(content.getValue(path), nearDuplicates));
289            }
290        }
291        return queries;
292    }
293    
294    /**
295     * Get the configured duplicates content types
296     * @param contentTypeIds The content type identifiers for which duplicates content types should be found
297     * @return the duplicate content types
298     */
299    protected Set<String> _addDuplicatesContentTypes(String[] contentTypeIds)
300    {
301        Set<String> duplicateContentTypes = new HashSet<>();
302        for (String contentTypeId : contentTypeIds)
303        {
304            if (_duplicateContentConfiguration.getDuplicatesContentTypes().contains(contentTypeId))
305            {
306                duplicateContentTypes.add(contentTypeId);
307            }
308            else
309            {
310                ContentType contentType = _cTypeEP.getExtension(contentTypeId);
311                
312                duplicateContentTypes.addAll(_addDuplicatesContentTypes(contentType.getSupertypeIds()));
313            }
314        }
315        return duplicateContentTypes;
316    }
317            
318    /**
319     * Retrieves indexed contents that have at least one of the listed content types
320     * @param contentTypes The desired content types
321     * @return solr query results
322     */
323    protected AmetysObjectIterable<Content> _getContents(Collection<String> contentTypes)
324    {
325        try
326        {
327            return _contentSearcherFactory.create(contentTypes).search(new MatchAllQuery());
328        }
329        catch (Exception e) 
330        {
331            getLogger().error("Unable to query to the Solr server", e);
332        }
333        return new EmptyIterable<>();
334    }
335    
336    /**
337     * Get the list of configuration errors
338     * @return the configuration error list
339     */
340    protected List<Pair<String, List<Object>>> _getConfigurationErrors()
341    {
342        return _duplicateContentConfiguration.getErrors();
343    }
344
345    /**
346     * Get the list of configuration warns
347     * @return the configuration warn list
348     */
349    protected List<Pair<String, List<Object>>> _getConfigurationWarns()
350    {
351        return _duplicateContentConfiguration.getWarns();
352    }
353    
354    /**
355     * Get the content types set
356     * @return the content types set
357     */
358    public Set<String> getContentTypeIds()
359    {
360        return _duplicateContentConfiguration.getContentTypes()
361                .keySet()
362                .stream()
363                .collect(Collectors.toSet());
364    }
365    
366    /**
367     * Log all errors of the configuration
368     * @param logger the logger
369     */
370    public void logConfigurationErrors(Logger logger)
371    {
372        Pair<String, Object[]> configurationErrorsPair = _getConfigurationErrorsPair();
373        if (!_getConfigurationErrors().isEmpty())
374        {
375            getLogger().error(configurationErrorsPair.getKey(), configurationErrorsPair.getValue());
376        }
377        else if (!_getConfigurationWarns().isEmpty())
378        {
379            getLogger().warn(configurationErrorsPair.getKey(), configurationErrorsPair.getValue());
380        }
381    }
382    
383    private Pair<String, Object[]> _getConfigurationErrorsPair()
384    {
385        List<Pair<String, List<Object>>> errorsMap = new ArrayList<>(_getConfigurationErrors());
386        errorsMap.addAll(_getConfigurationWarns());
387        StringBuilder errors = new StringBuilder();
388        List<Object> parameters = new ArrayList<>();
389        for (Pair<String, List<Object>> error : errorsMap)
390        {
391            errors.append(error.getKey()).append("\n");
392            parameters.addAll(error.getValue());
393        }
394        return Pair.of(errors.toString(), parameters.toArray(Object[]::new));
395    }
396    
397    /**
398     * Returns <code>true</code> if there is at least one configuration error
399     * @return <code>true</code> if there is at least one configuration error
400     */
401    public boolean hasConfigurationErrors()
402    {
403        return !_getConfigurationErrors().isEmpty() || !_getConfigurationWarns().isEmpty();
404    }
405}