Source code

001/*
002 *  Copyright 2014 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.plugins.contentio.in.csv;
017
018import java.io.BufferedInputStream;
019import java.io.BufferedReader;
020import java.io.ByteArrayInputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.InputStreamReader;
024import java.io.Reader;
025import java.io.StringReader;
026import java.nio.charset.Charset;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Collection;
030import java.util.HashMap;
031import java.util.HashSet;
032import java.util.List;
033import java.util.Map;
034import java.util.Set;
035import java.util.regex.Pattern;
036
037import org.apache.avalon.framework.configuration.Configuration;
038import org.apache.avalon.framework.configuration.ConfigurationException;
039import org.apache.avalon.framework.service.ServiceException;
040import org.apache.avalon.framework.service.ServiceManager;
041import org.apache.commons.lang3.StringEscapeUtils;
042import org.apache.commons.lang3.StringUtils;
043import org.apache.tika.parser.txt.CharsetDetector;
044import org.jsoup.Jsoup;
045import org.jsoup.safety.Whitelist;
046import org.supercsv.io.CsvMapReader;
047import org.supercsv.io.ICsvMapReader;
048import org.supercsv.io.Tokenizer;
049import org.supercsv.prefs.CsvPreference;
050
051import org.ametys.cms.contenttype.ContentConstants;
052import org.ametys.cms.contenttype.ContentTypesHelper;
053import org.ametys.cms.contenttype.MetadataDefinition;
054import org.ametys.cms.contenttype.MetadataType;
055import org.ametys.cms.repository.Content;
056import org.ametys.cms.repository.ModifiableContent;
057import org.ametys.plugins.contentio.AbstractContentImporter;
058import org.ametys.plugins.contentio.ContentImporter;
059import org.ametys.plugins.repository.metadata.ModifiableCompositeMetadata;
060import org.ametys.plugins.repository.version.VersionableAmetysObject;
061
062/**
063 * {@link ContentImporter} importing contents from a CSV file.
064 * Each CSV record (line) contains content properties.
065 * Configuration options:
066 * <ul>
067 *   <li>The CSV file charset (default: auto-detect)</li>
068 *   <li>The CSV delimiter character (default: auto-detect from the header)</li>
069 *   <li>The CSV quote character (default to the double-quote: <code>"</code>)</li>
070 *   <li>Structure of the header line: fixed pattern or column list.</li>
071 *   <li>Mapping from CSV columns to content metadatas.</li>
072 * </ul>
073 */
074public class CsvContentImporter extends AbstractContentImporter
075{
076    
077    /** The content type helper. */
078    protected ContentTypesHelper _cTypeHelper;
079    
080    /** The file charset. */
081    protected Charset _charset;
082    
083    /** The CSV delimiter character. */
084    protected Character _delimiterChar;
085    
086    /** The CSV quote character. */
087    protected Character _quoteChar;
088    
089    /** True if the supported CSV files have a header formed of the columns (this is not always the case). */
090    protected boolean _columnHeaderLine;
091    
092    /** Determine if the file is supported by matching the header line against this pattern. */
093    protected Pattern _matchPattern;
094    
095    /** Determine if the file is supported by detecting the following columns in the header. */
096    protected Set<String> _matchColumns;
097    
098    /** Contains mapping from CSV column to content metadata path. */
099    protected Map<String, String> _columnToMetadata;
100    
101    @Override
102    public void service(ServiceManager serviceManager) throws ServiceException
103    {
104        super.service(serviceManager);
105        _cTypeHelper = (ContentTypesHelper) serviceManager.lookup(ContentTypesHelper.ROLE);
106    }
107    
108    @Override
109    public void configure(Configuration configuration) throws ConfigurationException
110    {
111        // Configure priority, allowed extensions, content creation parameters.
112        super.configure(configuration);
113        
114        // Configure CSV parsing and mapping properties.
115        configureCsvProperties(configuration.getChild("csv"));
116    }
117    
118    /**
119     * Configure CSV parsing and mapping properties.
120     * @param configuration the CSV configuration.
121     * @throws ConfigurationException if an error occurs.
122     */
123    protected void configureCsvProperties(Configuration configuration) throws ConfigurationException
124    {
125        String charsetName = configuration.getAttribute("charset", null);
126        if (StringUtils.isNotEmpty(charsetName))
127        {
128            try
129            {
130                _charset = Charset.forName(charsetName);
131            }
132            catch (Exception e)
133            {
134                throw new ConfigurationException("Invalid charset: " + charsetName, e);
135            }
136        }
137        
138        String delimiter = configuration.getAttribute("delimiter", null);
139        if (StringUtils.isNotEmpty(delimiter))
140        {
141            _delimiterChar = delimiter.charAt(0);
142        }
143        
144        String quote = configuration.getAttribute("quote", null);
145        if (StringUtils.isNotEmpty(quote))
146        {
147            _quoteChar = quote.charAt(0);
148        }
149        
150        _columnHeaderLine = configuration.getAttributeAsBoolean("columnHeader", true);
151        
152        // Match pattern or column list.
153        String matchPattern = configuration.getChild("match").getAttribute("pattern", null);
154        String matchColumns = configuration.getChild("match").getAttribute("columns", null);
155        
156        if (matchPattern != null && matchColumns == null)
157        {
158            _matchPattern = Pattern.compile(matchPattern);
159        }
160        else if (matchPattern == null && matchColumns != null)
161        {
162            _matchColumns = new HashSet<>();
163            for (String column : StringUtils.split(matchColumns, ", "))
164            {
165                _matchColumns.add(column.trim());
166            }
167        }
168        else
169        {
170            throw new ConfigurationException("A CSV content importer must match a pattern or a column list, but not both.", configuration);
171        }
172        
173        // Configure mappings.
174        _columnToMetadata = new HashMap<>();
175        
176        for (Configuration mappingConf : configuration.getChild("mappings").getChildren("mapping"))
177        {
178            String column = mappingConf.getAttribute("column");
179            String metadata = mappingConf.getAttribute("metadata");
180            
181            _columnToMetadata.put(column, metadata);
182        }
183    }
184    
185    @Override
186    protected Collection<String> getDefaultExtensions()
187    {
188        return Arrays.asList("csv", "tsv");
189    }
190    
191    @Override
192    public boolean supports(InputStream is, String name) throws IOException
193    {
194        if (name == null || isExtensionValid(name))
195        {
196            if (_matchPattern != null)
197            {
198                return matchHeaderPattern(is);
199            }
200            else // if (_matchColumns != null)
201            {
202                return matchColumns(is);
203            }
204        }
205        return false;
206    }
207    
208    /**
209     * Test if the importer supports the given file by matching its first line against the configured pattern.
210     * @param is an input stream on the data to test.
211     * @return true if the data's first line matches the pattern, false otherwise.
212     * @throws IOException if a read error occurs.
213     */
214    protected boolean matchHeaderPattern(InputStream is) throws IOException
215    {
216        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
217        String header = reader.readLine();
218        
219        return _matchPattern.matcher(header).matches();
220    }
221    
222    /**
223     * Test if the importer supports the given file by testing if it contains the configured columns in its header.
224     * @param is an input stream on the data to test.
225     * @return true if the CSV columns contain all the configured columns, false otherwise.
226     * @throws IOException if a read error occurs.
227     */
228    protected boolean matchColumns(InputStream is) throws IOException
229    {
230        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
231        String header = reader.readLine();
232        
233        Map<String, Object> params = new HashMap<>();
234        
235        CsvPreference preference = getCsvPreference(header, params);
236        
237        try (Tokenizer tok = new Tokenizer(new StringReader(header + "\n"), preference);)
238        {
239            List<String> columns = new ArrayList<>();
240            tok.readColumns(columns);
241            
242            return columns.containsAll(_matchColumns);
243        }
244    }
245    
246    private ICsvMapReader _getMapReader(BufferedReader reader, CsvPreference preference) throws IOException
247    {
248        if (_columnHeaderLine)
249        {
250            // Reset the reader (go back to the beginning of the file).
251            reader.reset();
252            return new CsvMapReader(reader, preference);
253        }
254        else
255        {
256            // No named columns: use a custom CSV map reader which uses the column number as the map index.
257            return new CsvColNumberMapReader(reader, preference);
258        }
259    }
260    
261    @Override
262    public Set<String> importContents(InputStream is, Map<String, Object> params) throws IOException
263    {
264        Set<String> contentIds = new HashSet<>();
265        
266        // Get a reader using the right charset and wrap it in a buffered reader.
267        BufferedReader reader = new BufferedReader(getReader(is), 8192);
268        
269        // Mark the start of file, to be able to reset it.
270        reader.mark(8192);
271        String headerLine = reader.readLine();
272        
273        if (headerLine != null)
274        {
275            CsvPreference preference = getCsvPreference(headerLine, params);
276            
277            try (ICsvMapReader mapReader = _getMapReader(reader, preference))
278            {
279                // Get the columns from the header line, if applicable.
280                String[] columns = _columnHeaderLine ? mapReader.getHeader(true) : new String[0];
281                
282                Map<String, String> properties;
283                while ((properties = mapReader.read(columns)) != null)
284                {
285                    String contentId = importContent(properties, params, mapReader.getLineNumber());
286                    
287                    if (contentId != null)
288                    {
289                        contentIds.add(contentId);
290                    }
291                }
292            }
293        }
294        
295        return contentIds;
296    }
297    
298    /**
299     * Get a reader on the data stream, optionally detecting the charset.
300     * @param in the data stream.
301     * @return the reader with the correct character set.
302     */
303    protected Reader getReader(InputStream in)
304    {
305        if (_charset != null)
306        {
307            // Return an InputStreamReader with the configured charset.
308            return new InputStreamReader(in, _charset);
309        }
310        else
311        {
312            // Use Tika/ICU to detect the file charset.
313            BufferedInputStream buffIs = new BufferedInputStream(in);
314            
315            CharsetDetector detector = new CharsetDetector();
316            return detector.getReader(buffIs, Charset.defaultCharset().name());
317        }
318    }
319    
320    /**
321     * Get the CSV preference.
322     * @param header the CSV first line.
323     * @param params the import parameters.
324     * @return a {@link CsvPreference} object.
325     */
326    protected CsvPreference getCsvPreference(String header, Map<String, Object> params)
327    {
328        char delimiter = getDelimiter(header, params);
329        char quoteChar = getQuoteChar(params);
330        
331        return new CsvPreference.Builder(quoteChar, delimiter, "\r\n").build();
332    }
333    
334    /**
335     * Get the CSV character delimiter.
336     * @param header the CSV first line.
337     * @param params the import parameters.
338     * @return the CSV character delimiter.
339     */
340    protected char getDelimiter(String header, Map<String, Object> params)
341    {
342        char delimiter = ',';
343        
344        if (_delimiterChar != null) // The delimiter char is specified
345        {
346            delimiter = _delimiterChar;
347        }
348        else if (header.contains("\t")) // Else, try to auto-detect.
349        {
350            delimiter = '\t';
351        }
352        else if (header.contains(";"))
353        {
354            delimiter = ';';
355        }
356        else if (header.contains(","))
357        {
358            delimiter = ',';
359        }
360        
361        return delimiter;
362    }
363    
364    /**
365     * Get the CSV quote character.
366     * @param params the import parameters.
367     * @return the CSV quote character.
368     */
369    protected char getQuoteChar(Map<String, Object> params)
370    {
371        char quote = '"';
372        
373        if (_quoteChar != null)
374        {
375            quote = _quoteChar;
376        }
377        
378        return quote;
379    }
380    
381    /**
382     * Import a content from a CSV record.
383     * @param properties the CSV record as a Map of values, indexed by column name or number.
384     * @param params the import parameters.
385     * @param lineNumber the line number of the record being imported, for logging purposes.
386     * @return the content ID or null if the content was not created.
387     */
388    protected String importContent(Map<String, String> properties, Map<String, Object> params, int lineNumber)
389    {
390        try
391        {
392            // Map properties to metadata.
393            Map<String, String> metadata = getMetadataFromProperties(properties);
394            
395            String title = metadata.get("title");
396            
397            Content content = createContent(title, params);
398            
399            if (content instanceof ModifiableContent)
400            {
401                setMetadatas((ModifiableContent) content, metadata, params);
402            }
403            else
404            {
405                getLogger().error("Import from CSV file: the content on line " + lineNumber + " was imported as a read-only content, it could not be modified.");
406            }
407            
408            return content.getId();
409        }
410        catch (Exception e)
411        {
412            getLogger().error("Import from CSV file: error importing the content on line " + lineNumber, e);
413        }
414        
415        return null;
416    }
417    
418    /**
419     * Get the content metadata from a CSV record.
420     * @param properties the CSV record as a Map of values, indexed by column name or number.
421     * @return a Map of metadata values, indexed by metadata path.
422     */
423    protected Map<String, String> getMetadataFromProperties(Map<String, String> properties)
424    {
425        Map<String, String> metadata = new HashMap<>();
426        
427        for (String propName : properties.keySet())
428        {
429            String value = properties.get(propName);
430            String metaName = null;
431            if (_columnToMetadata.containsKey(propName))
432            {
433                metaName = _columnToMetadata.get(propName);
434            }
435            else if (_columnHeaderLine)
436            {
437                metaName = propName;
438            }
439            
440            if (metaName != null)
441            {
442                metadata.put(metaName, value);
443            }
444        }
445        
446        return metadata;
447    }
448    
449    /**
450     * Set the content metadatas from the CSV values.
451     * @param content the content to populate.
452     * @param metaValues the metadata values, extracted from the CSV record.
453     * @param params the import parameters.
454     */
455    protected void setMetadatas(ModifiableContent content, Map<String, String> metaValues, Map<String, Object> params)
456    {
457        for (String path : metaValues.keySet())
458        {
459            String value = metaValues.get(path);
460            
461            if (value != null)
462            {
463                setMetadata(content, path, value, params);
464            }
465        }
466        
467        // Save changes and create a version.
468        content.saveChanges();
469        if (content instanceof VersionableAmetysObject)
470        {
471            ((VersionableAmetysObject) content).checkpoint();
472        }
473    }
474
475    /**
476     * Set a metadata from its string value.
477     * @param content the content to populate.
478     * @param path the metadata path.
479     * @param value the metadata string value.
480     * @param params the import parameters.
481     */
482    protected void setMetadata(ModifiableContent content, String path, String value, Map<String, Object> params)
483    {
484        ModifiableCompositeMetadata metaHolder = content.getMetadataHolder();
485        MetadataDefinition metaDef = null;
486        
487        // Iterate over path parts while they are composites.
488        String[] pathElements = StringUtils.split(path, ContentConstants.METADATA_PATH_SEPARATOR);
489        for (int i = 0; i < (pathElements.length - 1); i++)
490        {
491            String compositeName = pathElements[i];
492            
493            // Get metadata definition and metadata holder for this level.
494            metaDef = getMetadataDefinition(content, metaDef, compositeName);
495            if (metaDef != null && metaDef.getType() == MetadataType.COMPOSITE)
496            {
497                metaHolder = metaHolder.getCompositeMetadata(compositeName, true);
498            }
499        }
500        
501        // Last path element: get metadata name and definition.
502        String metaName = pathElements[pathElements.length - 1];
503        metaDef = getMetadataDefinition(content, metaDef, metaName);
504        
505        if (metaDef != null)
506        {
507            try
508            {
509//                if (metaDef.isMultiple())
510//                {
511//                    setMultipleMetadata(metaHolder, metaDef, metaName, value, params);
512//                }
513//                else
514//                {
515                setMetadata(metaHolder, metaDef, metaName, value, params);
516//                }
517            }
518            catch (Exception e)
519            {
520                String message = "The value for metadata '" + metaName + "' is invalid and will be ignored: " + value;
521                getLogger().warn(message, e);
522            }
523        }
524    }
525
526    /**
527     * Get a metadata definition, either from the parent metadata definition or from the content itself.
528     * @param content the imported content.
529     * @param parentMetaDef the parent metadata definition.
530     * @param name the metadata name.
531     * @return the metadata definition.
532     */
533    protected MetadataDefinition getMetadataDefinition(Content content, MetadataDefinition parentMetaDef, String name)
534    {
535        MetadataDefinition metaDef = null;
536        
537        if (parentMetaDef == null)
538        {
539            metaDef = _cTypeHelper.getMetadataDefinition(name, content.getTypes(), content.getMixinTypes());
540        }
541        else
542        {
543            metaDef = parentMetaDef.getMetadataDefinition(name);
544        }
545        
546        return metaDef;
547    }
548    
549    /**
550     * Set a single metadata.
551     * @param meta the metadata holder.
552     * @param metaDef the metadata definition.
553     * @param name the metadata name.
554     * @param value the metadata value as a String.
555     * @param params the import parameters.
556     * @throws IOException if an error occurs.
557     */
558    protected void setMetadata(ModifiableCompositeMetadata meta, MetadataDefinition metaDef, String name, String value, Map<String, Object> params) throws IOException
559    {
560        switch (metaDef.getType())
561        {
562            case STRING:
563                setStringMetadata(meta, name, metaDef, new String [] {value});
564                break;
565            case BOOLEAN:
566                setBooleanMetadata(meta, name, metaDef, new String [] {value});
567                break;
568            case LONG:
569                setLongMetadata(meta, name, metaDef, new String [] {value});
570                break;
571            case DOUBLE:
572                setDoubleMetadata(meta, name, metaDef, new String [] {value});
573                break;
574            case DATE:
575            case DATETIME:
576                setDateMetadata(meta, name, metaDef, new String [] {value});
577                break;
578            case GEOCODE:
579                break;
580            case RICH_TEXT:
581                setRichText(meta, name, value);
582                break;
583            case BINARY:
584            case FILE:
585                setBinaryMetadata(meta, name, metaDef, value);
586                break;
587            case COMPOSITE:
588            case USER:
589            case REFERENCE:
590            case CONTENT:
591            case SUB_CONTENT:
592            default:
593                break;
594        }
595    }
596    
597    /**
598     * Set a RichText metadata from a String value.
599     * @param meta the metadata holder.
600     * @param name the metadata name.
601     * @param value the String value.
602     * @throws IOException if something goes wrong when manipulating files
603     */
604    protected void setRichText(ModifiableCompositeMetadata meta, String name, String value) throws IOException
605    {
606        StringBuilder buff = new StringBuilder();
607        
608        String cleanValue = Jsoup.clean(value, Whitelist.none());
609        
610        cleanValue = StringEscapeUtils.escapeXml10(cleanValue);
611        
612        String[] lines = StringUtils.split(cleanValue, "\r\n");
613        
614        buff.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
615            .append("<docbook:article version=\"5.0\" xmlns:docbook=\"http://docbook.org/ns/docbook\">");
616        
617        for (String line : lines)
618        {
619            buff.append("<docbook:para>").append(line).append("</docbook:para>");
620        }
621        
622        buff.append("</docbook:article>");
623        
624        setRichText(meta, name, new ByteArrayInputStream(buff.toString().getBytes("UTF-8")));
625    }
626    
627}