001/*
002 *  Copyright 2020 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016
017package org.ametys.cms.data;
018
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.cocoon.Constants;
import org.apache.cocoon.environment.Context;
import org.apache.cocoon.xml.AttributesImpl;
import org.apache.commons.lang3.StringUtils;
import org.apache.excalibur.xml.sax.ContentHandlerProxy;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.slf4j.Logger;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.ametys.core.util.HttpUrlUtils;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;
049
050/**
051 * Factory for the transformer that imports a rich text from docbook.
052 */
053public class RichTextImportHandlerFactory extends AbstractLogEnabled implements Component, Contextualizable
054{
055    /** Avalon role. */
056    public static final String ROLE = RichTextImportHandlerFactory.class.getName();
057    private Context _cocoonContext;
058    
059    public void contextualize(org.apache.avalon.framework.context.Context context) throws ContextException
060    {
061        _cocoonContext = (Context) context.get(Constants.CONTEXT_ENVIRONMENT_CONTEXT);
062    }
063
064    /**
065     * Creates a handler proxy to import the rich text
066     * @param contentHandler the contentHandler to pass SAX events to
067     * @param richText the rich text
068     * @param files the attachments of this rich text
069     * @return the created handler
070     */
071    public RichTextImportHandler createHandlerProxy(ContentHandler contentHandler, RichText richText, Map<String, InputStream> files)
072    {
073        return new RichTextImportHandler(contentHandler, richText, files, getLogger());
074    }
075    
076    /**
077     * This transformer imports the rich text from docbook.
078     */
079    public class RichTextImportHandler extends ContentHandlerProxy
080    {
081        private static final String __ATTACHMENT_IMAGE_TAG_NAME = "imagedata";
082        private static final String __ATTACHMENT_VIDEO_TAG_NAME = "videodata";
083        private static final String __ATTACHMENT_AUDIO_TAG_NAME = "audiodata";
084        private static final String __ATTACHMENT_TYPE_ATTRIBUTE_NAME = "type";
085        private static final String __ATTACHMENT_TYPE_ATTRIBUTE_LOCAL_VALUE = "local";
086
087        // Local attachment URI is of the form ownerId@dataName;fileName
088        private static final Pattern __LOCAL_ATTACHMENT_URI_VALIDATOR = Pattern.compile("^(?:[^@;]+)@(?:[^@;]+);([^@;]+)$");
089
090        private static final String __ANNOTATION_TAG_NAME = "phrase";
091        private static final String __ANNOTATION_NAME_ATTRIBUTE_NAME = "role";
092        private static final String __ANNOTATION_CLASS_ATTRIBUTE_NAME = "class";
093        private static final String __ANNOTATION_CLASS_ATTRIBUTE_VALUE = "semantic";
094
095        private RichText _richText;
096        private Map<String, InputStream> _files;
097        private Logger _logger;
098
099        private boolean _isCurrentlyInAnnotation;
100        private String _currentAnnotationName;
101        private StringBuilder _currentAnnotationValue;
102        private int _cptrElementsInsideCurrentAnnotation;
103        
104        /**
105         * Creates a handler proxy to import a rich text
106         * @param contentHandler the contentHandler to pass SAX events to
107         * @param richText the rich text
108         * @param files the attachments of this rich text
109         * @param logger the logger
110         */
111        public RichTextImportHandler(ContentHandler contentHandler, RichText richText, Map<String, InputStream> files, Logger logger)
112        {
113            super(contentHandler);
114            _richText = richText;
115            _files = files;
116            _logger = logger;
117        } 
118
119        @Override
120        public void startDocument() throws SAXException
121        {
122            // Remove all existing attachments from the rich text.
123            _richText.removeAttachments();
124
125            // Remove all existing annotations from the rich text.
126            _richText.removeAllAnnotations();
127
128            super.startDocument();
129        }
130
131        @Override
132        public void startElement(String uri, String loc, String raw, Attributes attrs) throws SAXException
133        {
134            // A new attachment starts being saxed
135            boolean isAttachment = _isAttachment(loc);
136            String type = attrs.getValue(__ATTACHMENT_TYPE_ATTRIBUTE_NAME);
137            Attributes newAttrs = attrs;
138            if (isAttachment && __ATTACHMENT_TYPE_ATTRIBUTE_LOCAL_VALUE.equals(type))
139            {
140                newAttrs = _processAttachment(attrs);
141            }
142
143            // A new semantic annotation starts being saxed
144            String clazz = attrs.getValue(__ANNOTATION_CLASS_ATTRIBUTE_NAME);
145            String annotationName = attrs.getValue(__ANNOTATION_NAME_ATTRIBUTE_NAME);
146            if (__ANNOTATION_TAG_NAME.equals(loc) && __ANNOTATION_CLASS_ATTRIBUTE_VALUE.equals(clazz) && annotationName != null) 
147            {
148                _processAnnotation(attrs);
149            }
150            else if (_isCurrentlyInAnnotation)
151            {
152                // A new element is being SAXed inside the current annotation
153                _cptrElementsInsideCurrentAnnotation++;
154            }
155
156            super.startElement(uri, loc, raw, newAttrs);
157        }
158        
159        private boolean _isAttachment(String loc)
160        {
161            return __ATTACHMENT_IMAGE_TAG_NAME.equals(loc) || __ATTACHMENT_VIDEO_TAG_NAME.equals(loc) || __ATTACHMENT_AUDIO_TAG_NAME.equals(loc);
162        }
163
164        private Attributes _processAttachment(Attributes attrs) throws SAXException
165        {
166            String fileRefAttribute = attrs.getValue("fileref");
167            String filename = fileRefAttribute;
168            
169            if (HttpUrlUtils.HTTP_URL_VALIDATOR.matcher(fileRefAttribute).matches())
170            {
171                try
172                {
173                    NamedResource attachment = new NamedResource();
174    
175                    RequestConfig requestConfig = RequestConfig.custom()
176                            .setConnectTimeout(2000)
177                            .setSocketTimeout(2000)
178                            .build();
179                    
180                    URL url = new URL(fileRefAttribute);
181                    String path = url.getPath();
182                    filename = path.substring(path.lastIndexOf("/") + 1);
183                    
184                    try (CloseableHttpClient httpclient = HttpClientBuilder.create()
185                                                                           .setDefaultRequestConfig(requestConfig)
186                                                                           .useSystemProperties()
187                                                                           .build())
188                    {
189                        HttpGet httpGet = new HttpGet(fileRefAttribute);
190                        try (CloseableHttpResponse httpResponse = httpclient.execute(httpGet))
191                        {
192                            int statusCode = httpResponse.getStatusLine().getStatusCode();
193                            if (statusCode != 200)
194                            {
195                                _logger.warn("Can't import file with url '" + fileRefAttribute + "' in the imported rich text. Status code is: " + statusCode);
196                            }
197                            else
198                            {
199                                HttpEntity entity = httpResponse.getEntity();
200                                try (InputStream is = entity.getContent())
201                                {
202                                    if (is == null)
203                                    {
204                                        _logger.warn("The attachment named '" + filename + "' of the imported rich text is empty");
205                                    }
206                                    else
207                                    {
208                                        attachment.setInputStream(is);
209                                    }
210                                    
211                                    String mimeType = Optional.ofNullable(ContentType.get(entity))
212                                            .map(ContentType::getMimeType)
213                                            .filter(StringUtils::isNotEmpty)
214                                            .orElse(_cocoonContext.getMimeType(filename.toLowerCase()));
215
216                                    attachment.setMimeType(mimeType);
217                                    attachment.setFilename(filename);
218                                    
219                                    _richText.addAttachment(attachment);
220                                }
221                            }
222                        }
223                    }
224                }
225                catch (IOException e)
226                {
227                    throw new SAXException("Unable to process the attachment '" + fileRefAttribute + "'. An error occured while setting its content", e);
228                }
229            }
230            else
231            {
232                Matcher uriMatcher = __LOCAL_ATTACHMENT_URI_VALIDATOR.matcher(fileRefAttribute);
233                if (uriMatcher.matches())
234                {
235                    filename = uriMatcher.group(1);
236                    if (_files.containsKey(filename))
237                    {
238                        try
239                        {
240                            NamedResource attachment = new NamedResource();
241                            String mimeType = _cocoonContext.getMimeType(filename.toLowerCase());
242                            attachment.setMimeType(mimeType);
243                            attachment.setFilename(filename);
244                            attachment.setInputStream(_files.get(filename));
245                            _richText.addAttachment(attachment);
246                        }
247                        catch (IOException e)
248                        {
249                            throw new SAXException("Unable to process the attachment '" + filename + "'. An error occured while setting its content", e);
250                        }
251                    }
252                    else
253                    {
254                        _logger.warn("The file named '" + filename + "' is not an attachment of the imported rich text");
255                    }
256                }
257                else
258                {
259                    // No URL format matches
260                    _logger.warn("Can't import file with url '" + fileRefAttribute + "' in the imported rich text. URL format is not valid.");
261                }
262            }
263            
264            AttributesImpl newAttrs = new AttributesImpl();
265            _copyAttributes(attrs, newAttrs);
266            newAttrs.addCDATAAttribute("fileref", filename);
267            return newAttrs;
268        }
269        
270        /**
271         * Copy the attributes except the fileref attribute
272         * @param attrs the attributes to copy.
273         * @param newAttrs the attributes to copy to.
274         */
275        private void _copyAttributes(Attributes attrs, AttributesImpl newAttrs)
276        {
277            for (int i = 0; i < attrs.getLength(); i++)
278            {
279                String name = attrs.getQName(i);
280
281                if (!"fileref".equals(name))
282                {
283                    newAttrs.addAttribute(attrs.getURI(i), attrs.getLocalName(i), name, attrs.getType(i), attrs.getValue(i));
284                }
285            }
286        }
287
288        private void _processAnnotation(Attributes attrs)
289        {
290            _isCurrentlyInAnnotation = true;
291            _currentAnnotationName = attrs.getValue(__ANNOTATION_NAME_ATTRIBUTE_NAME);
292            _currentAnnotationValue = new StringBuilder();
293            _cptrElementsInsideCurrentAnnotation = 0;
294        }
295
296        @Override
297        public void characters(char[] ch, int start, int length) throws SAXException
298        {
299            if (_isCurrentlyInAnnotation)
300            {
301                _currentAnnotationValue.append(ch, start, length);
302            }
303            
304            super.characters(ch, start, length);
305        }
306
307        @Override
308        public void endElement(String uri, String loc, String raw) throws SAXException
309        {    
310            if (_isCurrentlyInAnnotation)
311            {
312                if (_cptrElementsInsideCurrentAnnotation == 0)
313                {                
314                    // When the semantic annotation is fully saxed, add it to the rich text
315                    _richText.addAnnotations(_currentAnnotationName, _currentAnnotationValue.toString());
316                    _isCurrentlyInAnnotation = false;
317                }
318                else 
319                {
320                    _cptrElementsInsideCurrentAnnotation--;                
321                }
322            }
323            
324            super.endElement(uri, loc, raw);
325        }
326    }
327}