001/*
002 *  Copyright 2018 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.web.url;
017
018import java.awt.image.BufferedImage;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.net.URL;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import javax.imageio.ImageIO;
029
030import org.apache.avalon.framework.component.Component;
031import org.apache.avalon.framework.context.Context;
032import org.apache.avalon.framework.context.ContextException;
033import org.apache.avalon.framework.context.Contextualizable;
034import org.apache.cocoon.components.ContextHelper;
035import org.apache.cocoon.environment.Request;
036import org.apache.commons.lang3.StringUtils;
037import org.jsoup.Connection;
038import org.jsoup.Connection.Response;
039import org.jsoup.HttpStatusException;
040import org.jsoup.Jsoup;
041import org.jsoup.nodes.Document;
042import org.jsoup.nodes.Element;
043import org.jsoup.select.Elements;
044
045import org.ametys.runtime.plugin.component.AbstractLogEnabled;
046
047import net.sf.image4j.codec.ico.ICODecoder;
048
049/**
050 * Component to parse a HTML page to get its meta for preview
051 *
052 */
053public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable
054{
055    /** The avalon role */
056    public static final String ROLE = UrlPreviewComponent.class.getName();
057
058    private Context _context;
059    
060    public void contextualize(Context context) throws ContextException
061    {
062        _context = context;
063    }
064    
065    /**
066     * Fetch and parse the HTML page at given url to get the {@link UrlPreview}
067     * @param url the url to parse
068     * @param lang the language
069     * @return the {@link UrlPreview}
070     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
071     * @throws IOException if an error occured while parsing HTML page
072     */
073    public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException
074    {
075        Connection con = _getConnection(url, lang, false);
076        
077        Response response = con.execute();
078        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
079        {
080            Document doc = Jsoup.parse(is, "UTF-8", url);
081            
082            UrlPreview urlPreview = new UrlPreview();
083            urlPreview.setTitle(_getTitle(doc));
084            urlPreview.setDescription(_getDescription(doc));
085            urlPreview.setFavicon(_getFavicon(doc, response.url()));
086            urlPreview.setUrl(response.url().toString());
087            urlPreview.setImageUrl(_getImageUrl(doc));
088            
089            return urlPreview;
090        }
091    }
092    
093    /**
094     * Fetch and parse the HTML page at given url to get the favicon url
095     * @param url the url to parse
096     * @return the favicon url or null if not found
097     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
098     * @throws IOException if an error occured while parsing HTML page
099     */
100    public String getFavicon(String url) throws IOException
101    {
102        Connection con = _getConnection(url, "en", true);
103        
104        Response response = con.execute();
105        
106        Document doc = null;
107        
108        if (response.statusCode() == 200)
109        {
110            try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
111            {
112                doc = Jsoup.parse(is, "UTF-8", url);
113            }
114        }
115        
116        return _getFavicon(doc, response.url());
117    }
118    
119    /**
120     * Convert an file.ico into a file.png
121     * @param is the input stream of the file.ico
122     * @return the input stream of the file.png
123     * @throws IOException if failed to convert .ico
124     */
125    public InputStream convertIcoToPng(InputStream is) throws IOException
126    {
127        ByteArrayOutputStream out = new ByteArrayOutputStream();
128        List<BufferedImage> images = ICODecoder.read(is);
129        
130        // Take ico with the bigger width
131        int width = 0;
132        BufferedImage biggerImage = null;
133        for (BufferedImage image : images)
134        {
135            if (image.getWidth() > width)
136            {
137                width = image.getWidth();
138                biggerImage = image;
139            }
140        }
141        
142        ImageIO.write(biggerImage, "png", out);
143        
144        return new ByteArrayInputStream(out.toByteArray());
145    }
146    
147    /**
148     * Fetch and parse the HTML page at given url for preview
149     * @param url the url to parse
150     * @param lang the language
151     * @return the page information for preview
152     */
153    public Map<String, String> parseUrl(String url, String lang)
154    {
155        Map<String, String> preview = new HashMap<>();
156        
157        try
158        {
159            UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en"));
160            preview = urlPreview.toJSON();
161        }
162        catch (HttpStatusException e)
163        {
164            getLogger().error("Failed to parse url '{}'", url, e);
165            preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl());
166        }
167        catch (IOException e)
168        {
169            getLogger().error("Failed to parse url '{}'", url, e);
170            preview.put("error", "Failed to parse URL " + url);
171        }
172        
173        return preview;
174
175    }
176    
177    private Connection _getConnection(String url, String lang, boolean ignoreHttpErrors)
178    {
179        Connection con = Jsoup.connect(url)
180                .timeout(5000) // limit to 5s
181                .maxBodySize(50 * 1024) // limit to 50ko
182                .followRedirects(true)
183                .ignoreHttpErrors(ignoreHttpErrors)
184                .header("Accept-Language", lang)
185                .method(Connection.Method.GET);
186        
187        String userAgent = _getUserAgent();
188        if (StringUtils.isNotBlank(userAgent))
189        {
190            con.userAgent(userAgent);
191        }
192        
193        return con;
194    }
195    
196    private String _getTitle(Document doc)
197    {
198        Elements metaOgTitle = doc.select("meta[property=og:title]");
199        if (metaOgTitle != null) 
200        {
201            String ogTitle = metaOgTitle.attr("content");
202            if (StringUtils.isNotBlank(ogTitle))
203            {
204                return ogTitle;
205            }
206        }
207        
208        return doc.title();
209    }
210    
211    private String _getDescription(Document doc)
212    {
213        Elements metaOgDesc = doc.select("meta[property=og:description]");
214        if (metaOgDesc != null) 
215        {
216            String ogDesc = metaOgDesc.attr("content");
217            if (StringUtils.isNotBlank(ogDesc))
218            {
219                return ogDesc;
220            }
221        }
222        
223        Elements metaDesc = doc.select("meta[name=description]");
224        if (metaDesc != null) 
225        {
226            String desc = metaDesc.attr("content");
227            if (StringUtils.isNotBlank(desc))
228            {
229                return desc;
230            }
231        }
232        
233        return StringUtils.EMPTY;
234    }
235    
236    private String _getImageUrl(Document doc)
237    {
238        Elements metaOgImage = doc.select("meta[property=og:image]");
239        if (metaOgImage != null) 
240        {
241            String ogImg = metaOgImage.attr("content");
242            if (StringUtils.isNotBlank(ogImg))
243            {
244                return ogImg;
245            }
246        }
247        
248        return StringUtils.EMPTY;
249    }
250    
251    private String _getFavicon(Document doc, URL url)
252    {
253        if (doc != null)
254        {
255            Element head = doc.head();
256            
257            Element element = head.select("link[rel=icon]").first();
258            if (element != null)
259            {
260                return element.absUrl("href");
261            }
262            
263            element = head.select("link[rel='shortcut icon']").first();
264            if (element != null)
265            {
266                return element.absUrl("href");
267            }
268            
269            element = head.select("link[href~=.*\\.(ico|png|gif)]").first();
270            if (element != null)
271            {
272                return element.absUrl("href");
273            }
274            
275            element = head.select("meta[itemprop=image]").first();
276            if (element != null)
277            {
278                return element.absUrl("content");
279            }
280        }
281        
282        try
283        {
284            // Finally, try to get favico from [base_url]/favico.ico url
285            String favicoUrl = url.getProtocol() + "://" + url.getHost() + "/favicon.ico";
286            
287            Connection con = Jsoup.connect(favicoUrl)
288                    .ignoreContentType(true)
289                    .timeout(2000);
290            
291            String userAgent = _getUserAgent();
292            if (StringUtils.isNotBlank(userAgent))
293            {
294                con.userAgent(userAgent);
295            }
296            
297            Response response = con.execute();
298            if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length")))
299            {
300                return favicoUrl;
301            }
302        }
303        catch (IOException e)
304        {
305            // Ignore
306        }
307        
308        return null;
309    }
310    
311    /**
312     * Get the user agent from current request
313     * @return the user agent
314     */
315    private String _getUserAgent()
316    {
317        Request request = null;
318        try
319        {
320            request = ContextHelper.getRequest(_context);
321            return request.getHeader("User-Agent");
322        }
323        catch (Exception e)
324        {
325            // ignore, there's simply no current request
326        }
327        
328        return null;
329    }
330    
331}