001/* 002 * Copyright 2014 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.plugins.contentio.in.csv; 017 018import java.io.BufferedInputStream; 019import java.io.BufferedReader; 020import java.io.ByteArrayInputStream; 021import java.io.IOException; 022import java.io.InputStream; 023import java.io.InputStreamReader; 024import java.io.Reader; 025import java.io.StringReader; 026import java.nio.charset.Charset; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.Collection; 030import java.util.HashMap; 031import java.util.HashSet; 032import java.util.List; 033import java.util.Map; 034import java.util.Set; 035import java.util.regex.Pattern; 036 037import org.apache.avalon.framework.configuration.Configuration; 038import org.apache.avalon.framework.configuration.ConfigurationException; 039import org.apache.avalon.framework.service.ServiceException; 040import org.apache.avalon.framework.service.ServiceManager; 041import org.apache.commons.lang3.StringUtils; 042import org.apache.commons.text.StringEscapeUtils; 043import org.apache.tika.parser.txt.CharsetDetector; 044import org.jsoup.Jsoup; 045import org.jsoup.safety.Whitelist; 046import org.supercsv.io.CsvMapReader; 047import org.supercsv.io.ICsvMapReader; 048import org.supercsv.io.Tokenizer; 049import org.supercsv.prefs.CsvPreference; 050 051import org.ametys.cms.contenttype.ContentConstants; 052import org.ametys.cms.contenttype.ContentTypesHelper; 053import org.ametys.cms.contenttype.MetadataDefinition; 054import org.ametys.cms.contenttype.MetadataType; 055import org.ametys.cms.repository.Content; 056import org.ametys.cms.repository.ModifiableContent; 057import org.ametys.plugins.contentio.AbstractContentImporter; 058import org.ametys.plugins.contentio.ContentImporter; 059import org.ametys.plugins.contentio.ContentImporterHelper; 060import org.ametys.plugins.repository.metadata.ModifiableCompositeMetadata; 061import org.ametys.plugins.repository.version.VersionableAmetysObject; 062 063/** 064 * {@link ContentImporter} importing contents from a CSV file. 065 * Each CSV record (line) contains content properties. 066 * Configuration options: 067 * <ul> 068 * <li>The CSV file charset (default: auto-detect)</li> 069 * <li>The CSV delimiter character (default: auto-detect from the header)</li> 070 * <li>The CSV quote character (default to the double-quote: <code>"</code>)</li> 071 * <li>Structure of the header line: fixed pattern or column list.</li> 072 * <li>Mapping from CSV columns to content metadatas.</li> 073 * </ul> 074 */ 075public class CsvContentImporter extends AbstractContentImporter 076{ 077 078 /** The content type helper. */ 079 protected ContentTypesHelper _cTypeHelper; 080 081 /** The file charset. */ 082 protected Charset _charset; 083 084 /** The CSV delimiter character. */ 085 protected Character _delimiterChar; 086 087 /** The CSV quote character. */ 088 protected Character _quoteChar; 089 090 /** True if the supported CSV files have a header formed of the columns (this is not always the case). */ 091 protected boolean _columnHeaderLine; 092 093 /** Determine if the file is supported by matching the header line against this pattern. */ 094 protected Pattern _matchPattern; 095 096 /** Determine if the file is supported by detecting the following columns in the header. */ 097 protected Set<String> _matchColumns; 098 099 /** Contains mapping from CSV column to content metadata path. */ 100 protected Map<String, String> _columnToMetadata; 101 102 @Override 103 public void service(ServiceManager serviceManager) throws ServiceException 104 { 105 super.service(serviceManager); 106 _cTypeHelper = (ContentTypesHelper) serviceManager.lookup(ContentTypesHelper.ROLE); 107 } 108 109 @Override 110 public void configure(Configuration configuration) throws ConfigurationException 111 { 112 // Configure priority, allowed extensions, content creation parameters. 113 super.configure(configuration); 114 115 // Configure CSV parsing and mapping properties. 116 configureCsvProperties(configuration.getChild("csv")); 117 } 118 119 /** 120 * Configure CSV parsing and mapping properties. 121 * @param configuration the CSV configuration. 122 * @throws ConfigurationException if an error occurs. 123 */ 124 protected void configureCsvProperties(Configuration configuration) throws ConfigurationException 125 { 126 String charsetName = configuration.getAttribute("charset", null); 127 if (StringUtils.isNotEmpty(charsetName)) 128 { 129 try 130 { 131 _charset = Charset.forName(charsetName); 132 } 133 catch (Exception e) 134 { 135 throw new ConfigurationException("Invalid charset: " + charsetName, e); 136 } 137 } 138 139 String delimiter = configuration.getAttribute("delimiter", null); 140 if (StringUtils.isNotEmpty(delimiter)) 141 { 142 _delimiterChar = delimiter.charAt(0); 143 } 144 145 String quote = configuration.getAttribute("quote", null); 146 if (StringUtils.isNotEmpty(quote)) 147 { 148 _quoteChar = quote.charAt(0); 149 } 150 151 _columnHeaderLine = configuration.getAttributeAsBoolean("columnHeader", true); 152 153 // Match pattern or column list. 154 String matchPattern = configuration.getChild("match").getAttribute("pattern", null); 155 String matchColumns = configuration.getChild("match").getAttribute("columns", null); 156 157 if (matchPattern != null && matchColumns == null) 158 { 159 _matchPattern = Pattern.compile(matchPattern); 160 } 161 else if (matchPattern == null && matchColumns != null) 162 { 163 _matchColumns = new HashSet<>(); 164 for (String column : StringUtils.split(matchColumns, ", ")) 165 { 166 _matchColumns.add(column.trim()); 167 } 168 } 169 else 170 { 171 throw new ConfigurationException("A CSV content importer must match a pattern or a column list, but not both.", configuration); 172 } 173 174 // Configure mappings. 175 _columnToMetadata = new HashMap<>(); 176 177 for (Configuration mappingConf : configuration.getChild("mappings").getChildren("mapping")) 178 { 179 String column = mappingConf.getAttribute("column"); 180 String metadata = mappingConf.getAttribute("metadata"); 181 182 _columnToMetadata.put(column, metadata); 183 } 184 } 185 186 @Override 187 protected Collection<String> getDefaultExtensions() 188 { 189 return Arrays.asList("csv", "tsv"); 190 } 191 192 @Override 193 public boolean supports(InputStream is, String name) throws IOException 194 { 195 if (name == null || isExtensionValid(name)) 196 { 197 if (_matchPattern != null) 198 { 199 return matchHeaderPattern(is); 200 } 201 else // if (_matchColumns != null) 202 { 203 return matchColumns(is); 204 } 205 } 206 return false; 207 } 208 209 /** 210 * Test if the importer supports the given file by matching its first line against the configured pattern. 211 * @param is an input stream on the data to test. 212 * @return true if the data's first line matches the pattern, false otherwise. 213 * @throws IOException if a read error occurs. 214 */ 215 protected boolean matchHeaderPattern(InputStream is) throws IOException 216 { 217 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 218 String header = reader.readLine(); 219 220 return _matchPattern.matcher(header).matches(); 221 } 222 223 /** 224 * Test if the importer supports the given file by testing if it contains the configured columns in its header. 225 * @param is an input stream on the data to test. 226 * @return true if the CSV columns contain all the configured columns, false otherwise. 227 * @throws IOException if a read error occurs. 228 */ 229 protected boolean matchColumns(InputStream is) throws IOException 230 { 231 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 232 String header = reader.readLine(); 233 234 Map<String, Object> params = new HashMap<>(); 235 236 CsvPreference preference = getCsvPreference(header, params); 237 238 try (Tokenizer tok = new Tokenizer(new StringReader(header + "\n"), preference);) 239 { 240 List<String> columns = new ArrayList<>(); 241 tok.readColumns(columns); 242 243 return columns.containsAll(_matchColumns); 244 } 245 } 246 247 private ICsvMapReader _getMapReader(BufferedReader reader, CsvPreference preference) throws IOException 248 { 249 if (_columnHeaderLine) 250 { 251 // Reset the reader (go back to the beginning of the file). 252 reader.reset(); 253 return new CsvMapReader(reader, preference); 254 } 255 else 256 { 257 // No named columns: use a custom CSV map reader which uses the column number as the map index. 258 return new CsvColNumberMapReader(reader, preference); 259 } 260 } 261 262 @Override 263 public Set<String> importContents(InputStream is, Map<String, Object> params) throws IOException 264 { 265 Set<String> contentIds = new HashSet<>(); 266 267 // Get a reader using the right charset and wrap it in a buffered reader. 268 BufferedReader reader = new BufferedReader(getReader(is), 8192); 269 270 // Mark the start of file, to be able to reset it. 271 reader.mark(8192); 272 String headerLine = reader.readLine(); 273 274 if (headerLine != null) 275 { 276 CsvPreference preference = getCsvPreference(headerLine, params); 277 278 try (ICsvMapReader mapReader = _getMapReader(reader, preference)) 279 { 280 // Get the columns from the header line, if applicable. 281 String[] columns = _columnHeaderLine ? mapReader.getHeader(true) : new String[0]; 282 283 Map<String, String> properties; 284 while ((properties = mapReader.read(columns)) != null) 285 { 286 String contentId = importContent(properties, params, mapReader.getLineNumber()); 287 288 if (contentId != null) 289 { 290 contentIds.add(contentId); 291 } 292 } 293 } 294 } 295 296 return contentIds; 297 } 298 299 /** 300 * Get a reader on the data stream, optionally detecting the charset. 301 * @param in the data stream. 302 * @return the reader with the correct character set. 303 */ 304 protected Reader getReader(InputStream in) 305 { 306 if (_charset != null) 307 { 308 // Return an InputStreamReader with the configured charset. 309 return new InputStreamReader(in, _charset); 310 } 311 else 312 { 313 // Use Tika/ICU to detect the file charset. 314 BufferedInputStream buffIs = new BufferedInputStream(in); 315 316 CharsetDetector detector = new CharsetDetector(); 317 return detector.getReader(buffIs, Charset.defaultCharset().name()); 318 } 319 } 320 321 /** 322 * Get the CSV preference. 323 * @param header the CSV first line. 324 * @param params the import parameters. 325 * @return a {@link CsvPreference} object. 326 */ 327 protected CsvPreference getCsvPreference(String header, Map<String, Object> params) 328 { 329 char delimiter = getDelimiter(header, params); 330 char quoteChar = getQuoteChar(params); 331 332 return new CsvPreference.Builder(quoteChar, delimiter, "\r\n").build(); 333 } 334 335 /** 336 * Get the CSV character delimiter. 337 * @param header the CSV first line. 338 * @param params the import parameters. 339 * @return the CSV character delimiter. 340 */ 341 protected char getDelimiter(String header, Map<String, Object> params) 342 { 343 char delimiter = ','; 344 345 if (_delimiterChar != null) // The delimiter char is specified 346 { 347 delimiter = _delimiterChar; 348 } 349 else if (header.contains("\t")) // Else, try to auto-detect. 350 { 351 delimiter = '\t'; 352 } 353 else if (header.contains(";")) 354 { 355 delimiter = ';'; 356 } 357 else if (header.contains(",")) 358 { 359 delimiter = ','; 360 } 361 362 return delimiter; 363 } 364 365 /** 366 * Get the CSV quote character. 367 * @param params the import parameters. 368 * @return the CSV quote character. 369 */ 370 protected char getQuoteChar(Map<String, Object> params) 371 { 372 char quote = '"'; 373 374 if (_quoteChar != null) 375 { 376 quote = _quoteChar; 377 } 378 379 return quote; 380 } 381 382 /** 383 * Import a content from a CSV record. 384 * @param properties the CSV record as a Map of values, indexed by column name or number. 385 * @param params the import parameters. 386 * @param lineNumber the line number of the record being imported, for logging purposes. 387 * @return the content ID or null if the content was not created. 388 */ 389 protected String importContent(Map<String, String> properties, Map<String, Object> params, int lineNumber) 390 { 391 try 392 { 393 // Map properties to metadata. 394 Map<String, String> metadata = getMetadataFromProperties(properties); 395 396 String title = metadata.get("title"); 397 398 Content content = createContent(title, params); 399 400 if (content instanceof ModifiableContent) 401 { 402 setMetadatas((ModifiableContent) content, metadata, params); 403 } 404 else 405 { 406 getLogger().error("Import from CSV file: the content on line {} was imported as a read-only content, it could not be modified.", lineNumber); 407 } 408 409 return content.getId(); 410 } 411 catch (Exception e) 412 { 413 getLogger().error("Import from CSV file: error importing the content on line {}", lineNumber, e); 414 } 415 416 return null; 417 } 418 419 /** 420 * Get the content metadata from a CSV record. 421 * @param properties the CSV record as a Map of values, indexed by column name or number. 422 * @return a Map of metadata values, indexed by metadata path. 423 */ 424 protected Map<String, String> getMetadataFromProperties(Map<String, String> properties) 425 { 426 Map<String, String> metadata = new HashMap<>(); 427 428 for (String propName : properties.keySet()) 429 { 430 String value = properties.get(propName); 431 String metaName = null; 432 if (_columnToMetadata.containsKey(propName)) 433 { 434 metaName = _columnToMetadata.get(propName); 435 } 436 else if (_columnHeaderLine) 437 { 438 metaName = propName; 439 } 440 441 if (metaName != null) 442 { 443 metadata.put(metaName, value); 444 } 445 } 446 447 return metadata; 448 } 449 450 /** 451 * Set the content metadatas from the CSV values. 452 * @param content the content to populate. 453 * @param metaValues the metadata values, extracted from the CSV record. 454 * @param params the import parameters. 455 */ 456 protected void setMetadatas(ModifiableContent content, Map<String, String> metaValues, Map<String, Object> params) 457 { 458 for (String path : metaValues.keySet()) 459 { 460 String value = metaValues.get(path); 461 462 if (value != null) 463 { 464 setMetadata(content, path, value, params); 465 } 466 } 467 468 // Save changes and create a version. 469 content.saveChanges(); 470 if (content instanceof VersionableAmetysObject) 471 { 472 ((VersionableAmetysObject) content).checkpoint(); 473 } 474 } 475 476 /** 477 * Set a metadata from its string value. 478 * @param content the content to populate. 479 * @param path the metadata path. 480 * @param value the metadata string value. 481 * @param params the import parameters. 482 */ 483 protected void setMetadata(ModifiableContent content, String path, String value, Map<String, Object> params) 484 { 485 ModifiableCompositeMetadata metaHolder = content.getMetadataHolder(); 486 MetadataDefinition metaDef = null; 487 488 // Iterate over path parts while they are composites. 489 String[] pathElements = StringUtils.split(path, ContentConstants.METADATA_PATH_SEPARATOR); 490 for (int i = 0; i < (pathElements.length - 1); i++) 491 { 492 String compositeName = pathElements[i]; 493 494 // Get metadata definition and metadata holder for this level. 495 metaDef = getMetadataDefinition(content, metaDef, compositeName); 496 if (metaDef != null && metaDef.getType() == MetadataType.COMPOSITE) 497 { 498 metaHolder = metaHolder.getCompositeMetadata(compositeName, true); 499 } 500 } 501 502 // Last path element: get metadata name and definition. 503 String metaName = pathElements[pathElements.length - 1]; 504 metaDef = getMetadataDefinition(content, metaDef, metaName); 505 506 if (metaDef != null) 507 { 508 try 509 { 510// if (metaDef.isMultiple()) 511// { 512// setMultipleMetadata(metaHolder, metaDef, metaName, value, params); 513// } 514// else 515// { 516 setMetadata(metaHolder, metaDef, metaName, value, params); 517// } 518 } 519 catch (Exception e) 520 { 521 String message = "The value for metadata '" + metaName + "' is invalid and will be ignored: " + value; 522 getLogger().warn(message, e); 523 } 524 } 525 } 526 527 /** 528 * Get a metadata definition, either from the parent metadata definition or from the content itself. 529 * @param content the imported content. 530 * @param parentMetaDef the parent metadata definition. 531 * @param name the metadata name. 532 * @return the metadata definition. 533 */ 534 protected MetadataDefinition getMetadataDefinition(Content content, MetadataDefinition parentMetaDef, String name) 535 { 536 MetadataDefinition metaDef = null; 537 538 if (parentMetaDef == null) 539 { 540 metaDef = _cTypeHelper.getMetadataDefinition(name, content.getTypes(), content.getMixinTypes()); 541 } 542 else 543 { 544 metaDef = parentMetaDef.getMetadataDefinition(name); 545 } 546 547 return metaDef; 548 } 549 550 /** 551 * Set a single metadata. 552 * @param meta the metadata holder. 553 * @param metaDef the metadata definition. 554 * @param name the metadata name. 555 * @param value the metadata value as a String. 556 * @param params the import parameters. 557 * @throws IOException if an error occurs. 558 */ 559 protected void setMetadata(ModifiableCompositeMetadata meta, MetadataDefinition metaDef, String name, String value, Map<String, Object> params) throws IOException 560 { 561 switch (metaDef.getType()) 562 { 563 case STRING: 564 setStringMetadata(meta, name, metaDef, new String [] {value}); 565 break; 566 case BOOLEAN: 567 setBooleanMetadata(meta, name, metaDef, new String [] {value}); 568 break; 569 case LONG: 570 setLongMetadata(meta, name, metaDef, new String [] {value}); 571 break; 572 case DOUBLE: 573 setDoubleMetadata(meta, name, metaDef, new String [] {value}); 574 break; 575 case DATE: 576 case DATETIME: 577 setDateMetadata(meta, name, metaDef, new String [] {value}); 578 break; 579 case GEOCODE: 580 break; 581 case RICH_TEXT: 582 setRichText(meta, name, value); 583 break; 584 case BINARY: 585 case FILE: 586 setBinaryMetadata(meta, name, metaDef, value); 587 break; 588 case COMPOSITE: 589 case USER: 590 case REFERENCE: 591 case CONTENT: 592 case SUB_CONTENT: 593 case MULTILINGUAL_STRING: 594 // Not supported 595 default: 596 break; 597 } 598 } 599 600 /** 601 * Set a RichText metadata from a String value. 602 * @param meta the metadata holder. 603 * @param name the metadata name. 604 * @param value the String value. 605 * @throws IOException if something goes wrong when manipulating files 606 */ 607 protected void setRichText(ModifiableCompositeMetadata meta, String name, String value) throws IOException 608 { 609 String cleanValue = Jsoup.clean(value, Whitelist.none()); 610 611 cleanValue = StringEscapeUtils.escapeXml10(cleanValue); 612 613 String[] lines = StringUtils.split(cleanValue, "\r\n"); 614 615 String docbook = ContentImporterHelper.textToDocbook(lines); 616 setRichText(meta, name, new ByteArrayInputStream(docbook.getBytes("UTF-8"))); 617 } 618 619}