001/*
002 *  Copyright 2014 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.plugins.contentio.in.csv;
017
018import java.io.BufferedInputStream;
019import java.io.BufferedReader;
020import java.io.ByteArrayInputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.InputStreamReader;
024import java.io.Reader;
025import java.io.StringReader;
026import java.nio.charset.Charset;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Collection;
030import java.util.HashMap;
031import java.util.HashSet;
032import java.util.List;
033import java.util.Map;
034import java.util.Set;
035import java.util.regex.Pattern;
036
037import org.apache.avalon.framework.configuration.Configuration;
038import org.apache.avalon.framework.configuration.ConfigurationException;
039import org.apache.avalon.framework.service.ServiceException;
040import org.apache.avalon.framework.service.ServiceManager;
041import org.apache.commons.lang3.StringUtils;
042import org.apache.commons.text.StringEscapeUtils;
043import org.apache.tika.parser.txt.CharsetDetector;
044import org.jsoup.Jsoup;
045import org.jsoup.safety.Whitelist;
046import org.supercsv.io.CsvMapReader;
047import org.supercsv.io.ICsvMapReader;
048import org.supercsv.io.Tokenizer;
049import org.supercsv.prefs.CsvPreference;
050
051import org.ametys.cms.contenttype.ContentConstants;
052import org.ametys.cms.contenttype.ContentTypesHelper;
053import org.ametys.cms.contenttype.MetadataDefinition;
054import org.ametys.cms.contenttype.MetadataType;
055import org.ametys.cms.repository.Content;
056import org.ametys.cms.repository.ModifiableContent;
057import org.ametys.plugins.contentio.AbstractContentImporter;
058import org.ametys.plugins.contentio.ContentImporter;
059import org.ametys.plugins.contentio.ContentImporterHelper;
060import org.ametys.plugins.repository.metadata.ModifiableCompositeMetadata;
061import org.ametys.plugins.repository.version.VersionableAmetysObject;
062
063/**
064 * {@link ContentImporter} importing contents from a CSV file.
065 * Each CSV record (line) contains content properties.
066 * Configuration options:
067 * <ul>
068 *   <li>The CSV file charset (default: auto-detect)</li>
069 *   <li>The CSV delimiter character (default: auto-detect from the header)</li>
070 *   <li>The CSV quote character (default to the double-quote: <code>"</code>)</li>
071 *   <li>Structure of the header line: fixed pattern or column list.</li>
072 *   <li>Mapping from CSV columns to content metadatas.</li>
073 * </ul>
074 */
075public class CsvContentImporter extends AbstractContentImporter
076{
077    
078    /** The content type helper. */
079    protected ContentTypesHelper _cTypeHelper;
080    
081    /** The file charset. */
082    protected Charset _charset;
083    
084    /** The CSV delimiter character. */
085    protected Character _delimiterChar;
086    
087    /** The CSV quote character. */
088    protected Character _quoteChar;
089    
090    /** True if the supported CSV files have a header formed of the columns (this is not always the case). */
091    protected boolean _columnHeaderLine;
092    
093    /** Determine if the file is supported by matching the header line against this pattern. */
094    protected Pattern _matchPattern;
095    
096    /** Determine if the file is supported by detecting the following columns in the header. */
097    protected Set<String> _matchColumns;
098    
099    /** Contains mapping from CSV column to content metadata path. */
100    protected Map<String, String> _columnToMetadata;
101    
102    @Override
103    public void service(ServiceManager serviceManager) throws ServiceException
104    {
105        super.service(serviceManager);
106        _cTypeHelper = (ContentTypesHelper) serviceManager.lookup(ContentTypesHelper.ROLE);
107    }
108    
109    @Override
110    public void configure(Configuration configuration) throws ConfigurationException
111    {
112        // Configure priority, allowed extensions, content creation parameters.
113        super.configure(configuration);
114        
115        // Configure CSV parsing and mapping properties.
116        configureCsvProperties(configuration.getChild("csv"));
117    }
118    
119    /**
120     * Configure CSV parsing and mapping properties.
121     * @param configuration the CSV configuration.
122     * @throws ConfigurationException if an error occurs.
123     */
124    protected void configureCsvProperties(Configuration configuration) throws ConfigurationException
125    {
126        String charsetName = configuration.getAttribute("charset", null);
127        if (StringUtils.isNotEmpty(charsetName))
128        {
129            try
130            {
131                _charset = Charset.forName(charsetName);
132            }
133            catch (Exception e)
134            {
135                throw new ConfigurationException("Invalid charset: " + charsetName, e);
136            }
137        }
138        
139        String delimiter = configuration.getAttribute("delimiter", null);
140        if (StringUtils.isNotEmpty(delimiter))
141        {
142            _delimiterChar = delimiter.charAt(0);
143        }
144        
145        String quote = configuration.getAttribute("quote", null);
146        if (StringUtils.isNotEmpty(quote))
147        {
148            _quoteChar = quote.charAt(0);
149        }
150        
151        _columnHeaderLine = configuration.getAttributeAsBoolean("columnHeader", true);
152        
153        // Match pattern or column list.
154        String matchPattern = configuration.getChild("match").getAttribute("pattern", null);
155        String matchColumns = configuration.getChild("match").getAttribute("columns", null);
156        
157        if (matchPattern != null && matchColumns == null)
158        {
159            _matchPattern = Pattern.compile(matchPattern);
160        }
161        else if (matchPattern == null && matchColumns != null)
162        {
163            _matchColumns = new HashSet<>();
164            for (String column : StringUtils.split(matchColumns, ", "))
165            {
166                _matchColumns.add(column.trim());
167            }
168        }
169        else
170        {
171            throw new ConfigurationException("A CSV content importer must match a pattern or a column list, but not both.", configuration);
172        }
173        
174        // Configure mappings.
175        _columnToMetadata = new HashMap<>();
176        
177        for (Configuration mappingConf : configuration.getChild("mappings").getChildren("mapping"))
178        {
179            String column = mappingConf.getAttribute("column");
180            String metadata = mappingConf.getAttribute("metadata");
181            
182            _columnToMetadata.put(column, metadata);
183        }
184    }
185    
186    @Override
187    protected Collection<String> getDefaultExtensions()
188    {
189        return Arrays.asList("csv", "tsv");
190    }
191    
192    @Override
193    public boolean supports(InputStream is, String name) throws IOException
194    {
195        if (name == null || isExtensionValid(name))
196        {
197            if (_matchPattern != null)
198            {
199                return matchHeaderPattern(is);
200            }
201            else // if (_matchColumns != null)
202            {
203                return matchColumns(is);
204            }
205        }
206        return false;
207    }
208    
209    /**
210     * Test if the importer supports the given file by matching its first line against the configured pattern.
211     * @param is an input stream on the data to test.
212     * @return true if the data's first line matches the pattern, false otherwise.
213     * @throws IOException if a read error occurs.
214     */
215    protected boolean matchHeaderPattern(InputStream is) throws IOException
216    {
217        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
218        String header = reader.readLine();
219        
220        return _matchPattern.matcher(header).matches();
221    }
222    
223    /**
224     * Test if the importer supports the given file by testing if it contains the configured columns in its header.
225     * @param is an input stream on the data to test.
226     * @return true if the CSV columns contain all the configured columns, false otherwise.
227     * @throws IOException if a read error occurs.
228     */
229    protected boolean matchColumns(InputStream is) throws IOException
230    {
231        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
232        String header = reader.readLine();
233        
234        Map<String, Object> params = new HashMap<>();
235        
236        CsvPreference preference = getCsvPreference(header, params);
237        
238        try (Tokenizer tok = new Tokenizer(new StringReader(header + "\n"), preference);)
239        {
240            List<String> columns = new ArrayList<>();
241            tok.readColumns(columns);
242            
243            return columns.containsAll(_matchColumns);
244        }
245    }
246    
247    private ICsvMapReader _getMapReader(BufferedReader reader, CsvPreference preference) throws IOException
248    {
249        if (_columnHeaderLine)
250        {
251            // Reset the reader (go back to the beginning of the file).
252            reader.reset();
253            return new CsvMapReader(reader, preference);
254        }
255        else
256        {
257            // No named columns: use a custom CSV map reader which uses the column number as the map index.
258            return new CsvColNumberMapReader(reader, preference);
259        }
260    }
261    
262    @Override
263    public Set<String> importContents(InputStream is, Map<String, Object> params) throws IOException
264    {
265        Set<String> contentIds = new HashSet<>();
266        
267        // Get a reader using the right charset and wrap it in a buffered reader.
268        BufferedReader reader = new BufferedReader(getReader(is), 8192);
269        
270        // Mark the start of file, to be able to reset it.
271        reader.mark(8192);
272        String headerLine = reader.readLine();
273        
274        if (headerLine != null)
275        {
276            CsvPreference preference = getCsvPreference(headerLine, params);
277            
278            try (ICsvMapReader mapReader = _getMapReader(reader, preference))
279            {
280                // Get the columns from the header line, if applicable.
281                String[] columns = _columnHeaderLine ? mapReader.getHeader(true) : new String[0];
282                
283                Map<String, String> properties;
284                while ((properties = mapReader.read(columns)) != null)
285                {
286                    String contentId = importContent(properties, params, mapReader.getLineNumber());
287                    
288                    if (contentId != null)
289                    {
290                        contentIds.add(contentId);
291                    }
292                }
293            }
294        }
295        
296        return contentIds;
297    }
298    
299    /**
300     * Get a reader on the data stream, optionally detecting the charset.
301     * @param in the data stream.
302     * @return the reader with the correct character set.
303     */
304    protected Reader getReader(InputStream in)
305    {
306        if (_charset != null)
307        {
308            // Return an InputStreamReader with the configured charset.
309            return new InputStreamReader(in, _charset);
310        }
311        else
312        {
313            // Use Tika/ICU to detect the file charset.
314            BufferedInputStream buffIs = new BufferedInputStream(in);
315            
316            CharsetDetector detector = new CharsetDetector();
317            return detector.getReader(buffIs, Charset.defaultCharset().name());
318        }
319    }
320    
321    /**
322     * Get the CSV preference.
323     * @param header the CSV first line.
324     * @param params the import parameters.
325     * @return a {@link CsvPreference} object.
326     */
327    protected CsvPreference getCsvPreference(String header, Map<String, Object> params)
328    {
329        char delimiter = getDelimiter(header, params);
330        char quoteChar = getQuoteChar(params);
331        
332        return new CsvPreference.Builder(quoteChar, delimiter, "\r\n").build();
333    }
334    
335    /**
336     * Get the CSV character delimiter.
337     * @param header the CSV first line.
338     * @param params the import parameters.
339     * @return the CSV character delimiter.
340     */
341    protected char getDelimiter(String header, Map<String, Object> params)
342    {
343        char delimiter = ',';
344        
345        if (_delimiterChar != null) // The delimiter char is specified
346        {
347            delimiter = _delimiterChar;
348        }
349        else if (header.contains("\t")) // Else, try to auto-detect.
350        {
351            delimiter = '\t';
352        }
353        else if (header.contains(";"))
354        {
355            delimiter = ';';
356        }
357        else if (header.contains(","))
358        {
359            delimiter = ',';
360        }
361        
362        return delimiter;
363    }
364    
365    /**
366     * Get the CSV quote character.
367     * @param params the import parameters.
368     * @return the CSV quote character.
369     */
370    protected char getQuoteChar(Map<String, Object> params)
371    {
372        char quote = '"';
373        
374        if (_quoteChar != null)
375        {
376            quote = _quoteChar;
377        }
378        
379        return quote;
380    }
381    
382    /**
383     * Import a content from a CSV record.
384     * @param properties the CSV record as a Map of values, indexed by column name or number.
385     * @param params the import parameters.
386     * @param lineNumber the line number of the record being imported, for logging purposes.
387     * @return the content ID or null if the content was not created.
388     */
389    protected String importContent(Map<String, String> properties, Map<String, Object> params, int lineNumber)
390    {
391        try
392        {
393            // Map properties to metadata.
394            Map<String, String> metadata = getMetadataFromProperties(properties);
395            
396            String title = metadata.get("title");
397            
398            Content content = createContent(title, params);
399            
400            if (content instanceof ModifiableContent)
401            {
402                setMetadatas((ModifiableContent) content, metadata, params);
403            }
404            else
405            {
406                getLogger().error("Import from CSV file: the content on line {} was imported as a read-only content, it could not be modified.", lineNumber);
407            }
408            
409            return content.getId();
410        }
411        catch (Exception e)
412        {
413            getLogger().error("Import from CSV file: error importing the content on line {}", lineNumber, e);
414        }
415        
416        return null;
417    }
418    
419    /**
420     * Get the content metadata from a CSV record.
421     * @param properties the CSV record as a Map of values, indexed by column name or number.
422     * @return a Map of metadata values, indexed by metadata path.
423     */
424    protected Map<String, String> getMetadataFromProperties(Map<String, String> properties)
425    {
426        Map<String, String> metadata = new HashMap<>();
427        
428        for (String propName : properties.keySet())
429        {
430            String value = properties.get(propName);
431            String metaName = null;
432            if (_columnToMetadata.containsKey(propName))
433            {
434                metaName = _columnToMetadata.get(propName);
435            }
436            else if (_columnHeaderLine)
437            {
438                metaName = propName;
439            }
440            
441            if (metaName != null)
442            {
443                metadata.put(metaName, value);
444            }
445        }
446        
447        return metadata;
448    }
449    
450    /**
451     * Set the content metadatas from the CSV values.
452     * @param content the content to populate.
453     * @param metaValues the metadata values, extracted from the CSV record.
454     * @param params the import parameters.
455     */
456    protected void setMetadatas(ModifiableContent content, Map<String, String> metaValues, Map<String, Object> params)
457    {
458        for (String path : metaValues.keySet())
459        {
460            String value = metaValues.get(path);
461            
462            if (value != null)
463            {
464                setMetadata(content, path, value, params);
465            }
466        }
467        
468        // Save changes and create a version.
469        content.saveChanges();
470        if (content instanceof VersionableAmetysObject)
471        {
472            ((VersionableAmetysObject) content).checkpoint();
473        }
474    }
475
476    /**
477     * Set a metadata from its string value.
478     * @param content the content to populate.
479     * @param path the metadata path.
480     * @param value the metadata string value.
481     * @param params the import parameters.
482     */
483    protected void setMetadata(ModifiableContent content, String path, String value, Map<String, Object> params)
484    {
485        ModifiableCompositeMetadata metaHolder = content.getMetadataHolder();
486        MetadataDefinition metaDef = null;
487        
488        // Iterate over path parts while they are composites.
489        String[] pathElements = StringUtils.split(path, ContentConstants.METADATA_PATH_SEPARATOR);
490        for (int i = 0; i < (pathElements.length - 1); i++)
491        {
492            String compositeName = pathElements[i];
493            
494            // Get metadata definition and metadata holder for this level.
495            metaDef = getMetadataDefinition(content, metaDef, compositeName);
496            if (metaDef != null && metaDef.getType() == MetadataType.COMPOSITE)
497            {
498                metaHolder = metaHolder.getCompositeMetadata(compositeName, true);
499            }
500        }
501        
502        // Last path element: get metadata name and definition.
503        String metaName = pathElements[pathElements.length - 1];
504        metaDef = getMetadataDefinition(content, metaDef, metaName);
505        
506        if (metaDef != null)
507        {
508            try
509            {
510//                if (metaDef.isMultiple())
511//                {
512//                    setMultipleMetadata(metaHolder, metaDef, metaName, value, params);
513//                }
514//                else
515//                {
516                setMetadata(metaHolder, metaDef, metaName, value, params);
517//                }
518            }
519            catch (Exception e)
520            {
521                String message = "The value for metadata '" + metaName + "' is invalid and will be ignored: " + value;
522                getLogger().warn(message, e);
523            }
524        }
525    }
526
527    /**
528     * Get a metadata definition, either from the parent metadata definition or from the content itself.
529     * @param content the imported content.
530     * @param parentMetaDef the parent metadata definition.
531     * @param name the metadata name.
532     * @return the metadata definition.
533     */
534    protected MetadataDefinition getMetadataDefinition(Content content, MetadataDefinition parentMetaDef, String name)
535    {
536        MetadataDefinition metaDef = null;
537        
538        if (parentMetaDef == null)
539        {
540            metaDef = _cTypeHelper.getMetadataDefinition(name, content.getTypes(), content.getMixinTypes());
541        }
542        else
543        {
544            metaDef = parentMetaDef.getMetadataDefinition(name);
545        }
546        
547        return metaDef;
548    }
549    
550    /**
551     * Set a single metadata.
552     * @param meta the metadata holder.
553     * @param metaDef the metadata definition.
554     * @param name the metadata name.
555     * @param value the metadata value as a String.
556     * @param params the import parameters.
557     * @throws IOException if an error occurs.
558     */
559    protected void setMetadata(ModifiableCompositeMetadata meta, MetadataDefinition metaDef, String name, String value, Map<String, Object> params) throws IOException
560    {
561        switch (metaDef.getType())
562        {
563            case STRING:
564                setStringMetadata(meta, name, metaDef, new String [] {value});
565                break;
566            case BOOLEAN:
567                setBooleanMetadata(meta, name, metaDef, new String [] {value});
568                break;
569            case LONG:
570                setLongMetadata(meta, name, metaDef, new String [] {value});
571                break;
572            case DOUBLE:
573                setDoubleMetadata(meta, name, metaDef, new String [] {value});
574                break;
575            case DATE:
576            case DATETIME:
577                setDateMetadata(meta, name, metaDef, new String [] {value});
578                break;
579            case GEOCODE:
580                break;
581            case RICH_TEXT:
582                setRichText(meta, name, value);
583                break;
584            case BINARY:
585            case FILE:
586                setBinaryMetadata(meta, name, metaDef, value);
587                break;
588            case COMPOSITE:
589            case USER:
590            case REFERENCE:
591            case CONTENT:
592            case SUB_CONTENT:
593            case MULTILINGUAL_STRING:
594                // Not supported
595            default:
596                break;
597        }
598    }
599    
600    /**
601     * Set a RichText metadata from a String value.
602     * @param meta the metadata holder.
603     * @param name the metadata name.
604     * @param value the String value.
605     * @throws IOException if something goes wrong when manipulating files
606     */
607    protected void setRichText(ModifiableCompositeMetadata meta, String name, String value) throws IOException
608    {
609        String cleanValue = Jsoup.clean(value, Whitelist.none());
610        
611        cleanValue = StringEscapeUtils.escapeXml10(cleanValue);
612        
613        String[] lines = StringUtils.split(cleanValue, "\r\n");
614        
615        String docbook = ContentImporterHelper.textToDocbook(lines);
616        setRichText(meta, name, new ByteArrayInputStream(docbook.getBytes("UTF-8")));
617    }
618    
619}