001/* 002 * Copyright 2014 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.plugins.contentio.in.csv; 017 018import java.io.BufferedInputStream; 019import java.io.BufferedReader; 020import java.io.ByteArrayInputStream; 021import java.io.IOException; 022import java.io.InputStream; 023import java.io.InputStreamReader; 024import java.io.Reader; 025import java.io.StringReader; 026import java.nio.charset.Charset; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.Collection; 030import java.util.HashMap; 031import java.util.HashSet; 032import java.util.List; 033import java.util.Map; 034import java.util.Set; 035import java.util.regex.Pattern; 036 037import org.apache.avalon.framework.configuration.Configuration; 038import org.apache.avalon.framework.configuration.ConfigurationException; 039import org.apache.avalon.framework.service.ServiceException; 040import org.apache.avalon.framework.service.ServiceManager; 041import org.apache.commons.lang3.StringEscapeUtils; 042import org.apache.commons.lang3.StringUtils; 043import org.apache.tika.parser.txt.CharsetDetector; 044import org.jsoup.Jsoup; 045import org.jsoup.safety.Whitelist; 046import org.supercsv.io.CsvMapReader; 047import org.supercsv.io.ICsvMapReader; 048import org.supercsv.io.Tokenizer; 049import org.supercsv.prefs.CsvPreference; 050 051import org.ametys.cms.contenttype.ContentConstants; 052import org.ametys.cms.contenttype.ContentTypesHelper; 053import org.ametys.cms.contenttype.MetadataDefinition; 054import org.ametys.cms.contenttype.MetadataType; 055import org.ametys.cms.repository.Content; 056import org.ametys.cms.repository.ModifiableContent; 057import org.ametys.plugins.contentio.AbstractContentImporter; 058import org.ametys.plugins.contentio.ContentImporter; 059import org.ametys.plugins.repository.metadata.ModifiableCompositeMetadata; 060import org.ametys.plugins.repository.version.VersionableAmetysObject; 061 062/** 063 * {@link ContentImporter} importing contents from a CSV file. 064 * Each CSV record (line) contains content properties. 065 * Configuration options: 066 * <ul> 067 * <li>The CSV file charset (default: auto-detect)</li> 068 * <li>The CSV delimiter character (default: auto-detect from the header)</li> 069 * <li>The CSV quote character (default to the double-quote: <code>"</code>)</li> 070 * <li>Structure of the header line: fixed pattern or column list.</li> 071 * <li>Mapping from CSV columns to content metadatas.</li> 072 * </ul> 073 */ 074public class CsvContentImporter extends AbstractContentImporter 075{ 076 077 /** The content type helper. */ 078 protected ContentTypesHelper _cTypeHelper; 079 080 /** The file charset. */ 081 protected Charset _charset; 082 083 /** The CSV delimiter character. */ 084 protected Character _delimiterChar; 085 086 /** The CSV quote character. */ 087 protected Character _quoteChar; 088 089 /** True if the supported CSV files have a header formed of the columns (this is not always the case). */ 090 protected boolean _columnHeaderLine; 091 092 /** Determine if the file is supported by matching the header line against this pattern. */ 093 protected Pattern _matchPattern; 094 095 /** Determine if the file is supported by detecting the following columns in the header. */ 096 protected Set<String> _matchColumns; 097 098 /** Contains mapping from CSV column to content metadata path. */ 099 protected Map<String, String> _columnToMetadata; 100 101 @Override 102 public void service(ServiceManager serviceManager) throws ServiceException 103 { 104 super.service(serviceManager); 105 _cTypeHelper = (ContentTypesHelper) serviceManager.lookup(ContentTypesHelper.ROLE); 106 } 107 108 @Override 109 public void configure(Configuration configuration) throws ConfigurationException 110 { 111 // Configure priority, allowed extensions, content creation parameters. 112 super.configure(configuration); 113 114 // Configure CSV parsing and mapping properties. 115 configureCsvProperties(configuration.getChild("csv")); 116 } 117 118 /** 119 * Configure CSV parsing and mapping properties. 120 * @param configuration the CSV configuration. 121 * @throws ConfigurationException if an error occurs. 122 */ 123 protected void configureCsvProperties(Configuration configuration) throws ConfigurationException 124 { 125 String charsetName = configuration.getAttribute("charset", null); 126 if (StringUtils.isNotEmpty(charsetName)) 127 { 128 try 129 { 130 _charset = Charset.forName(charsetName); 131 } 132 catch (Exception e) 133 { 134 throw new ConfigurationException("Invalid charset: " + charsetName, e); 135 } 136 } 137 138 String delimiter = configuration.getAttribute("delimiter", null); 139 if (StringUtils.isNotEmpty(delimiter)) 140 { 141 _delimiterChar = delimiter.charAt(0); 142 } 143 144 String quote = configuration.getAttribute("quote", null); 145 if (StringUtils.isNotEmpty(quote)) 146 { 147 _quoteChar = quote.charAt(0); 148 } 149 150 _columnHeaderLine = configuration.getAttributeAsBoolean("columnHeader", true); 151 152 // Match pattern or column list. 153 String matchPattern = configuration.getChild("match").getAttribute("pattern", null); 154 String matchColumns = configuration.getChild("match").getAttribute("columns", null); 155 156 if (matchPattern != null && matchColumns == null) 157 { 158 _matchPattern = Pattern.compile(matchPattern); 159 } 160 else if (matchPattern == null && matchColumns != null) 161 { 162 _matchColumns = new HashSet<>(); 163 for (String column : StringUtils.split(matchColumns, ", ")) 164 { 165 _matchColumns.add(column.trim()); 166 } 167 } 168 else 169 { 170 throw new ConfigurationException("A CSV content importer must match a pattern or a column list, but not both.", configuration); 171 } 172 173 // Configure mappings. 174 _columnToMetadata = new HashMap<>(); 175 176 for (Configuration mappingConf : configuration.getChild("mappings").getChildren("mapping")) 177 { 178 String column = mappingConf.getAttribute("column"); 179 String metadata = mappingConf.getAttribute("metadata"); 180 181 _columnToMetadata.put(column, metadata); 182 } 183 } 184 185 @Override 186 protected Collection<String> getDefaultExtensions() 187 { 188 return Arrays.asList("csv", "tsv"); 189 } 190 191 @Override 192 public boolean supports(InputStream is, String name) throws IOException 193 { 194 if (name == null || isExtensionValid(name)) 195 { 196 if (_matchPattern != null) 197 { 198 return matchHeaderPattern(is); 199 } 200 else // if (_matchColumns != null) 201 { 202 return matchColumns(is); 203 } 204 } 205 return false; 206 } 207 208 /** 209 * Test if the importer supports the given file by matching its first line against the configured pattern. 210 * @param is an input stream on the data to test. 211 * @return true if the data's first line matches the pattern, false otherwise. 212 * @throws IOException if a read error occurs. 213 */ 214 protected boolean matchHeaderPattern(InputStream is) throws IOException 215 { 216 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 217 String header = reader.readLine(); 218 219 return _matchPattern.matcher(header).matches(); 220 } 221 222 /** 223 * Test if the importer supports the given file by testing if it contains the configured columns in its header. 224 * @param is an input stream on the data to test. 225 * @return true if the CSV columns contain all the configured columns, false otherwise. 226 * @throws IOException if a read error occurs. 227 */ 228 protected boolean matchColumns(InputStream is) throws IOException 229 { 230 BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 231 String header = reader.readLine(); 232 233 Map<String, Object> params = new HashMap<>(); 234 235 CsvPreference preference = getCsvPreference(header, params); 236 237 try (Tokenizer tok = new Tokenizer(new StringReader(header + "\n"), preference);) 238 { 239 List<String> columns = new ArrayList<>(); 240 tok.readColumns(columns); 241 242 return columns.containsAll(_matchColumns); 243 } 244 } 245 246 private ICsvMapReader _getMapReader(BufferedReader reader, CsvPreference preference) throws IOException 247 { 248 if (_columnHeaderLine) 249 { 250 // Reset the reader (go back to the beginning of the file). 251 reader.reset(); 252 return new CsvMapReader(reader, preference); 253 } 254 else 255 { 256 // No named columns: use a custom CSV map reader which uses the column number as the map index. 257 return new CsvColNumberMapReader(reader, preference); 258 } 259 } 260 261 @Override 262 public Set<String> importContents(InputStream is, Map<String, Object> params) throws IOException 263 { 264 Set<String> contentIds = new HashSet<>(); 265 266 // Get a reader using the right charset and wrap it in a buffered reader. 267 BufferedReader reader = new BufferedReader(getReader(is), 8192); 268 269 // Mark the start of file, to be able to reset it. 270 reader.mark(8192); 271 String headerLine = reader.readLine(); 272 273 if (headerLine != null) 274 { 275 CsvPreference preference = getCsvPreference(headerLine, params); 276 277 try (ICsvMapReader mapReader = _getMapReader(reader, preference)) 278 { 279 // Get the columns from the header line, if applicable. 280 String[] columns = _columnHeaderLine ? mapReader.getHeader(true) : new String[0]; 281 282 Map<String, String> properties; 283 while ((properties = mapReader.read(columns)) != null) 284 { 285 String contentId = importContent(properties, params, mapReader.getLineNumber()); 286 287 if (contentId != null) 288 { 289 contentIds.add(contentId); 290 } 291 } 292 } 293 } 294 295 return contentIds; 296 } 297 298 /** 299 * Get a reader on the data stream, optionally detecting the charset. 300 * @param in the data stream. 301 * @return the reader with the correct character set. 302 */ 303 protected Reader getReader(InputStream in) 304 { 305 if (_charset != null) 306 { 307 // Return an InputStreamReader with the configured charset. 308 return new InputStreamReader(in, _charset); 309 } 310 else 311 { 312 // Use Tika/ICU to detect the file charset. 313 BufferedInputStream buffIs = new BufferedInputStream(in); 314 315 CharsetDetector detector = new CharsetDetector(); 316 return detector.getReader(buffIs, Charset.defaultCharset().name()); 317 } 318 } 319 320 /** 321 * Get the CSV preference. 322 * @param header the CSV first line. 323 * @param params the import parameters. 324 * @return a {@link CsvPreference} object. 325 */ 326 protected CsvPreference getCsvPreference(String header, Map<String, Object> params) 327 { 328 char delimiter = getDelimiter(header, params); 329 char quoteChar = getQuoteChar(params); 330 331 return new CsvPreference.Builder(quoteChar, delimiter, "\r\n").build(); 332 } 333 334 /** 335 * Get the CSV character delimiter. 336 * @param header the CSV first line. 337 * @param params the import parameters. 338 * @return the CSV character delimiter. 339 */ 340 protected char getDelimiter(String header, Map<String, Object> params) 341 { 342 char delimiter = ','; 343 344 if (_delimiterChar != null) // The delimiter char is specified 345 { 346 delimiter = _delimiterChar; 347 } 348 else if (header.contains("\t")) // Else, try to auto-detect. 349 { 350 delimiter = '\t'; 351 } 352 else if (header.contains(";")) 353 { 354 delimiter = ';'; 355 } 356 else if (header.contains(",")) 357 { 358 delimiter = ','; 359 } 360 361 return delimiter; 362 } 363 364 /** 365 * Get the CSV quote character. 366 * @param params the import parameters. 367 * @return the CSV quote character. 368 */ 369 protected char getQuoteChar(Map<String, Object> params) 370 { 371 char quote = '"'; 372 373 if (_quoteChar != null) 374 { 375 quote = _quoteChar; 376 } 377 378 return quote; 379 } 380 381 /** 382 * Import a content from a CSV record. 383 * @param properties the CSV record as a Map of values, indexed by column name or number. 384 * @param params the import parameters. 385 * @param lineNumber the line number of the record being imported, for logging purposes. 386 * @return the content ID or null if the content was not created. 387 */ 388 protected String importContent(Map<String, String> properties, Map<String, Object> params, int lineNumber) 389 { 390 try 391 { 392 // Map properties to metadata. 393 Map<String, String> metadata = getMetadataFromProperties(properties); 394 395 String title = metadata.get("title"); 396 397 Content content = createContent(title, params); 398 399 if (content instanceof ModifiableContent) 400 { 401 setMetadatas((ModifiableContent) content, metadata, params); 402 } 403 else 404 { 405 getLogger().error("Import from CSV file: the content on line " + lineNumber + " was imported as a read-only content, it could not be modified."); 406 } 407 408 return content.getId(); 409 } 410 catch (Exception e) 411 { 412 getLogger().error("Import from CSV file: error importing the content on line " + lineNumber, e); 413 } 414 415 return null; 416 } 417 418 /** 419 * Get the content metadata from a CSV record. 420 * @param properties the CSV record as a Map of values, indexed by column name or number. 421 * @return a Map of metadata values, indexed by metadata path. 422 */ 423 protected Map<String, String> getMetadataFromProperties(Map<String, String> properties) 424 { 425 Map<String, String> metadata = new HashMap<>(); 426 427 for (String propName : properties.keySet()) 428 { 429 String value = properties.get(propName); 430 String metaName = null; 431 if (_columnToMetadata.containsKey(propName)) 432 { 433 metaName = _columnToMetadata.get(propName); 434 } 435 else if (_columnHeaderLine) 436 { 437 metaName = propName; 438 } 439 440 if (metaName != null) 441 { 442 metadata.put(metaName, value); 443 } 444 } 445 446 return metadata; 447 } 448 449 /** 450 * Set the content metadatas from the CSV values. 451 * @param content the content to populate. 452 * @param metaValues the metadata values, extracted from the CSV record. 453 * @param params the import parameters. 454 */ 455 protected void setMetadatas(ModifiableContent content, Map<String, String> metaValues, Map<String, Object> params) 456 { 457 for (String path : metaValues.keySet()) 458 { 459 String value = metaValues.get(path); 460 461 if (value != null) 462 { 463 setMetadata(content, path, value, params); 464 } 465 } 466 467 // Save changes and create a version. 468 content.saveChanges(); 469 if (content instanceof VersionableAmetysObject) 470 { 471 ((VersionableAmetysObject) content).checkpoint(); 472 } 473 } 474 475 /** 476 * Set a metadata from its string value. 477 * @param content the content to populate. 478 * @param path the metadata path. 479 * @param value the metadata string value. 480 * @param params the import parameters. 481 */ 482 protected void setMetadata(ModifiableContent content, String path, String value, Map<String, Object> params) 483 { 484 ModifiableCompositeMetadata metaHolder = content.getMetadataHolder(); 485 MetadataDefinition metaDef = null; 486 487 // Iterate over path parts while they are composites. 488 String[] pathElements = StringUtils.split(path, ContentConstants.METADATA_PATH_SEPARATOR); 489 for (int i = 0; i < (pathElements.length - 1); i++) 490 { 491 String compositeName = pathElements[i]; 492 493 // Get metadata definition and metadata holder for this level. 494 metaDef = getMetadataDefinition(content, metaDef, compositeName); 495 if (metaDef != null && metaDef.getType() == MetadataType.COMPOSITE) 496 { 497 metaHolder = metaHolder.getCompositeMetadata(compositeName, true); 498 } 499 } 500 501 // Last path element: get metadata name and definition. 502 String metaName = pathElements[pathElements.length - 1]; 503 metaDef = getMetadataDefinition(content, metaDef, metaName); 504 505 if (metaDef != null) 506 { 507 try 508 { 509// if (metaDef.isMultiple()) 510// { 511// setMultipleMetadata(metaHolder, metaDef, metaName, value, params); 512// } 513// else 514// { 515 setMetadata(metaHolder, metaDef, metaName, value, params); 516// } 517 } 518 catch (Exception e) 519 { 520 String message = "The value for metadata '" + metaName + "' is invalid and will be ignored: " + value; 521 getLogger().warn(message, e); 522 } 523 } 524 } 525 526 /** 527 * Get a metadata definition, either from the parent metadata definition or from the content itself. 528 * @param content the imported content. 529 * @param parentMetaDef the parent metadata definition. 530 * @param name the metadata name. 531 * @return the metadata definition. 532 */ 533 protected MetadataDefinition getMetadataDefinition(Content content, MetadataDefinition parentMetaDef, String name) 534 { 535 MetadataDefinition metaDef = null; 536 537 if (parentMetaDef == null) 538 { 539 metaDef = _cTypeHelper.getMetadataDefinition(name, content.getTypes(), content.getMixinTypes()); 540 } 541 else 542 { 543 metaDef = parentMetaDef.getMetadataDefinition(name); 544 } 545 546 return metaDef; 547 } 548 549 /** 550 * Set a single metadata. 551 * @param meta the metadata holder. 552 * @param metaDef the metadata definition. 553 * @param name the metadata name. 554 * @param value the metadata value as a String. 555 * @param params the import parameters. 556 * @throws IOException if an error occurs. 557 */ 558 protected void setMetadata(ModifiableCompositeMetadata meta, MetadataDefinition metaDef, String name, String value, Map<String, Object> params) throws IOException 559 { 560 switch (metaDef.getType()) 561 { 562 case STRING: 563 setStringMetadata(meta, name, metaDef, new String [] {value}); 564 break; 565 case BOOLEAN: 566 setBooleanMetadata(meta, name, metaDef, new String [] {value}); 567 break; 568 case LONG: 569 setLongMetadata(meta, name, metaDef, new String [] {value}); 570 break; 571 case DOUBLE: 572 setDoubleMetadata(meta, name, metaDef, new String [] {value}); 573 break; 574 case DATE: 575 case DATETIME: 576 setDateMetadata(meta, name, metaDef, new String [] {value}); 577 break; 578 case GEOCODE: 579 break; 580 case RICH_TEXT: 581 setRichText(meta, name, value); 582 break; 583 case BINARY: 584 case FILE: 585 setBinaryMetadata(meta, name, metaDef, value); 586 break; 587 case COMPOSITE: 588 case USER: 589 case REFERENCE: 590 case CONTENT: 591 case SUB_CONTENT: 592 default: 593 break; 594 } 595 } 596 597 /** 598 * Set a RichText metadata from a String value. 599 * @param meta the metadata holder. 600 * @param name the metadata name. 601 * @param value the String value. 602 * @throws IOException if something goes wrong when manipulating files 603 */ 604 protected void setRichText(ModifiableCompositeMetadata meta, String name, String value) throws IOException 605 { 606 StringBuilder buff = new StringBuilder(); 607 608 String cleanValue = Jsoup.clean(value, Whitelist.none()); 609 610 cleanValue = StringEscapeUtils.escapeXml10(cleanValue); 611 612 String[] lines = StringUtils.split(cleanValue, "\r\n"); 613 614 buff.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") 615 .append("<docbook:article version=\"5.0\" xmlns:docbook=\"http://docbook.org/ns/docbook\">"); 616 617 for (String line : lines) 618 { 619 buff.append("<docbook:para>").append(line).append("</docbook:para>"); 620 } 621 622 buff.append("</docbook:article>"); 623 624 setRichText(meta, name, new ByteArrayInputStream(buff.toString().getBytes("UTF-8"))); 625 } 626 627}