001/*
002 *  Copyright 2018 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.web.url;
017
018import java.awt.image.BufferedImage;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.net.URL;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027import java.util.regex.Pattern;
028
029import javax.imageio.ImageIO;
030
031import org.apache.avalon.framework.component.Component;
032import org.apache.avalon.framework.context.Context;
033import org.apache.avalon.framework.context.ContextException;
034import org.apache.avalon.framework.context.Contextualizable;
035import org.apache.cocoon.components.ContextHelper;
036import org.apache.cocoon.environment.Request;
037import org.apache.commons.lang3.StringUtils;
038import org.jsoup.Connection;
039import org.jsoup.Connection.Response;
040import org.jsoup.HttpStatusException;
041import org.jsoup.Jsoup;
042import org.jsoup.nodes.Document;
043import org.jsoup.nodes.Element;
044import org.jsoup.select.Elements;
045
046import org.ametys.runtime.plugin.component.AbstractLogEnabled;
047
048import net.sf.image4j.codec.ico.ICODecoder;
049
050/**
051 * Component to parse a HTML page to get its meta for preview
052 *
053 */
054public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable
055{
056    /** The avalon role */
057    public static final String ROLE = UrlPreviewComponent.class.getName();
058    
059    /** Pattern to detect Ametys authentication redirection URLs */
060    public static final Pattern AUTHENTICATE_PATTERN = Pattern.compile(".*/_authenticate\\?requestedURL=.*");
061
062    private Context _context;
063    
064    public void contextualize(Context context) throws ContextException
065    {
066        _context = context;
067    }
068    
069    /**
070     * Fetch and parse the HTML page at given url to get the {@link UrlPreview}
071     * @param url the url to parse
072     * @param lang the language
073     * @return the {@link UrlPreview}
074     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
075     * @throws IOException if an error occured while parsing HTML page
076     */
077    public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException
078    {
079        Connection con = _getConnection(url, lang, false);
080        
081        Response response = con.execute();
082        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
083        {
084            String finalUrl = response.url().toString(); // follow redirects
085            
086            // Consider URL that match Ametys authentication redirection URL as 401 Unauthorized url (no preview available)
087            if (finalUrl.matches(AUTHENTICATE_PATTERN.pattern()))
088            {
089                throw new HttpStatusException("URL leads to an authentication page", 401, url);
090            }
091
092            Document doc = Jsoup.parse(is, "UTF-8", url);
093            
094            UrlPreview urlPreview = new UrlPreview();
095            urlPreview.setTitle(_getTitle(doc));
096            urlPreview.setDescription(_getDescription(doc));
097            urlPreview.setFavicon(_getFavicon(doc, response.url()));
098            urlPreview.setUrl(finalUrl);
099            urlPreview.setImageUrl(_getImageUrl(doc));
100            
101            return urlPreview;
102        }
103    }
104    
105    /**
106     * Fetch and parse the HTML page at given url to get the favicon url
107     * @param url the url to parse
108     * @return the favicon url or null if not found
109     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
110     * @throws IOException if an error occured while parsing HTML page
111     */
112    public String getFavicon(String url) throws IOException
113    {
114        Connection con = _getConnection(url, "en", true);
115        
116        Response response = con.execute();
117        
118        Document doc = null;
119        
120        if (response.statusCode() == 200)
121        {
122            try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
123            {
124                doc = Jsoup.parse(is, "UTF-8", url);
125            }
126        }
127        
128        return _getFavicon(doc, response.url());
129    }
130    
131    /**
132     * Convert an file.ico into a file.png
133     * @param is the input stream of the file.ico
134     * @return the input stream of the file.png
135     * @throws IOException if failed to convert .ico
136     */
137    public InputStream convertIcoToPng(InputStream is) throws IOException
138    {
139        ByteArrayOutputStream out = new ByteArrayOutputStream();
140        List<BufferedImage> images = ICODecoder.read(is);
141        
142        // Take ico with the bigger width
143        int width = 0;
144        BufferedImage biggerImage = null;
145        for (BufferedImage image : images)
146        {
147            if (image.getWidth() > width)
148            {
149                width = image.getWidth();
150                biggerImage = image;
151            }
152        }
153        
154        ImageIO.write(biggerImage, "png", out);
155        
156        return new ByteArrayInputStream(out.toByteArray());
157    }
158    
159    /**
160     * Fetch and parse the HTML page at given url for preview
161     * @param url the url to parse
162     * @param lang the language
163     * @return the page information for preview
164     */
165    public Map<String, String> parseUrl(String url, String lang)
166    {
167        Map<String, String> preview = new HashMap<>();
168        
169        try
170        {
171            UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en"));
172            preview = urlPreview.toJSON();
173        }
174        catch (HttpStatusException e)
175        {
176            getLogger().error("Failed to parse url '{}'", url, e);
177            preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl());
178        }
179        catch (IOException e)
180        {
181            getLogger().error("Failed to parse url '{}'", url, e);
182            preview.put("error", "Failed to parse URL " + url);
183        }
184        
185        return preview;
186
187    }
188    
189    private Connection _getConnection(String url, String lang, boolean ignoreHttpErrors)
190    {
191        Connection con = Jsoup.connect(url)
192                .timeout(5000) // limit to 5s
193                .maxBodySize(50 * 1024) // limit to 50ko
194                .followRedirects(true)
195                .ignoreHttpErrors(ignoreHttpErrors)
196                .header("Accept-Language", lang)
197                .method(Connection.Method.GET);
198        
199        String userAgent = _getUserAgent();
200        if (StringUtils.isNotBlank(userAgent))
201        {
202            con.userAgent(userAgent);
203        }
204        
205        return con;
206    }
207    
208    private String _getTitle(Document doc)
209    {
210        Elements metaOgTitle = doc.select("meta[property=og:title]");
211        if (metaOgTitle != null) 
212        {
213            String ogTitle = metaOgTitle.attr("content");
214            if (StringUtils.isNotBlank(ogTitle))
215            {
216                return ogTitle;
217            }
218        }
219        
220        return doc.title();
221    }
222    
223    private String _getDescription(Document doc)
224    {
225        Elements metaOgDesc = doc.select("meta[property=og:description]");
226        if (metaOgDesc != null) 
227        {
228            String ogDesc = metaOgDesc.attr("content");
229            if (StringUtils.isNotBlank(ogDesc))
230            {
231                return ogDesc;
232            }
233        }
234        
235        Elements metaDesc = doc.select("meta[name=description]");
236        if (metaDesc != null) 
237        {
238            String desc = metaDesc.attr("content");
239            if (StringUtils.isNotBlank(desc))
240            {
241                return desc;
242            }
243        }
244        
245        return StringUtils.EMPTY;
246    }
247    
248    private String _getImageUrl(Document doc)
249    {
250        Elements metaOgImage = doc.select("meta[property=og:image]");
251        if (metaOgImage != null) 
252        {
253            String ogImg = metaOgImage.attr("content");
254            if (StringUtils.isNotBlank(ogImg))
255            {
256                return ogImg;
257            }
258        }
259        
260        return StringUtils.EMPTY;
261    }
262    
263    private String _getFavicon(Document doc, URL url)
264    {
265        if (doc != null)
266        {
267            Element head = doc.head();
268            
269            Element element = head.select("link[rel=icon]").first();
270            if (element != null)
271            {
272                return element.absUrl("href");
273            }
274            
275            element = head.select("link[rel='shortcut icon']").first();
276            if (element != null)
277            {
278                return element.absUrl("href");
279            }
280            
281            element = head.select("link[href~=.*\\.(ico|png|gif)]").first();
282            if (element != null)
283            {
284                return element.absUrl("href");
285            }
286            
287            element = head.select("meta[itemprop=image]").first();
288            if (element != null)
289            {
290                return element.absUrl("content");
291            }
292        }
293        
294        try
295        {
296            // Finally, try to get favico from [base_url]/favico.ico url
297            String favicoUrl = url.getProtocol() + "://" + url.getHost() + "/favicon.ico";
298            
299            Connection con = Jsoup.connect(favicoUrl)
300                    .ignoreContentType(true)
301                    .timeout(2000);
302            
303            String userAgent = _getUserAgent();
304            if (StringUtils.isNotBlank(userAgent))
305            {
306                con.userAgent(userAgent);
307            }
308            
309            Response response = con.execute();
310            if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length")))
311            {
312                return favicoUrl;
313            }
314        }
315        catch (IOException e)
316        {
317            // Ignore
318        }
319        
320        return null;
321    }
322    
323    /**
324     * Get the user agent from current request
325     * @return the user agent
326     */
327    private String _getUserAgent()
328    {
329        Request request = null;
330        try
331        {
332            request = ContextHelper.getRequest(_context);
333            return request.getHeader("User-Agent");
334        }
335        catch (Exception e)
336        {
337            // ignore, there's simply no current request
338        }
339        
340        return null;
341    }
342    
343}