001/*
002 *  Copyright 2018 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.web.url;
017
018import java.awt.image.BufferedImage;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.net.URL;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import javax.imageio.ImageIO;
029
030import org.apache.avalon.framework.component.Component;
031import org.apache.avalon.framework.context.Context;
032import org.apache.avalon.framework.context.ContextException;
033import org.apache.avalon.framework.context.Contextualizable;
034import org.apache.cocoon.components.ContextHelper;
035import org.apache.cocoon.environment.Request;
036import org.apache.commons.lang3.StringUtils;
037import org.jsoup.Connection;
038import org.jsoup.Connection.Response;
039import org.jsoup.HttpStatusException;
040import org.jsoup.Jsoup;
041import org.jsoup.nodes.Document;
042import org.jsoup.nodes.Element;
043import org.jsoup.select.Elements;
044
045import org.ametys.core.ui.Callable;
046import org.ametys.runtime.plugin.component.AbstractLogEnabled;
047
048import net.sf.image4j.codec.ico.ICODecoder;
049
050/**
051 * Component to parse a HTML page to get its meta for preview
052 *
053 */
054public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable
055{
056    /** The avalon role */
057    public static final String ROLE = UrlPreviewComponent.class.getName();
058
059    private Context _context;
060    
061    public void contextualize(Context context) throws ContextException
062    {
063        _context = context;
064    }
065    
066    /**
067     * Fetch and parse the HTML page at given url to get the {@link UrlPreview}
068     * @param url the url to parse
069     * @param lang the language
070     * @return the {@link UrlPreview}
071     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
072     * @throws IOException if an error occured while parsing HTML page
073     */
074    public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException
075    {
076        Connection con = _getConnection(url, lang);
077        
078        Response response = con.execute();
079        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
080        {
081            Document doc = Jsoup.parse(is, "UTF-8", url);
082            
083            UrlPreview urlPreview = new UrlPreview();
084            urlPreview.setTitle(_getTitle(doc));
085            urlPreview.setDescription(_getDescription(doc));
086            urlPreview.setFavicon(_getFavicon(doc, response.url()));
087            urlPreview.setUrl(response.url().toString());
088            urlPreview.setImageUrl(_getImageUrl(doc));
089            
090            return urlPreview;
091        }
092    }
093    
094    /**
095     * Fetch and parse the HTML page at given url to get the favicon url
096     * @param url the url to parse
097     * @return the favicon url or null if not found
098     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
099     * @throws IOException if an error occured while parsing HTML page
100     */
101    public String getFavicon(String url) throws IOException
102    {
103        Connection con = _getConnection(url, "en");
104        
105        Response response = con.execute();
106        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
107        {
108            Document doc = Jsoup.parse(is, "UTF-8", url);
109            
110            return _getFavicon(doc, response.url());
111        }
112    }
113    
114    /**
115     * Convert an file.ico into a file.png
116     * @param is the input stream of the file.ico
117     * @return the input stream of the file.png
118     * @throws IOException if failed to convert .ico
119     */
120    public InputStream convertIcoToPng(InputStream is) throws IOException
121    {
122        ByteArrayOutputStream out = new ByteArrayOutputStream();
123        List<BufferedImage> images = ICODecoder.read(is);
124        
125        // Take ico with the bigger width
126        int width = 0;
127        BufferedImage biggerImage = null;
128        for (BufferedImage image : images)
129        {
130            if (image.getWidth() > width)
131            {
132                width = image.getWidth();
133                biggerImage = image;
134            }
135        }
136        
137        ImageIO.write(biggerImage, "png", out);
138        
139        return new ByteArrayInputStream(out.toByteArray());
140    }
141    
142    /**
143     * Fetch and parse the HTML page at given url for preview
144     * @param url the url to parse
145     * @param lang the language
146     * @return the page information for preview
147     */
148    @Callable
149    public Map<String, String> parseUrl(String url, String lang)
150    {
151        Map<String, String> preview = new HashMap<>();
152        
153        try
154        {
155            UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en"));
156            preview = urlPreview.toJSON();
157        }
158        catch (HttpStatusException e)
159        {
160            getLogger().error("Failed to parse url '{}'", url, e);
161            preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl());
162        }
163        catch (IOException e)
164        {
165            getLogger().error("Failed to parse url '{}'", url, e);
166            preview.put("error", "Failed to parse URL " + url);
167        }
168        
169        return preview;
170
171    }
172    
173    private Connection _getConnection(String url, String lang)
174    {
175        Connection con = Jsoup.connect(url)
176                .timeout(5000) // limit to 5s
177                .maxBodySize(50 * 1024) // limit to 50ko
178                .followRedirects(true)
179                .header("Accept-Language", lang)
180                .method(Connection.Method.GET);
181        
182        String userAgent = _getUserAgent();
183        if (StringUtils.isNotBlank(userAgent))
184        {
185            con.userAgent(userAgent);
186        }
187        
188        return con;
189    }
190    
191    private String _getTitle(Document doc)
192    {
193        Elements metaOgTitle = doc.select("meta[property=og:title]");
194        if (metaOgTitle != null) 
195        {
196            String ogTitle = metaOgTitle.attr("content");
197            if (StringUtils.isNotBlank(ogTitle))
198            {
199                return ogTitle;
200            }
201        }
202        
203        return doc.title();
204    }
205    
206    private String _getDescription(Document doc)
207    {
208        Elements metaOgDesc = doc.select("meta[property=og:description]");
209        if (metaOgDesc != null) 
210        {
211            String ogDesc = metaOgDesc.attr("content");
212            if (StringUtils.isNotBlank(ogDesc))
213            {
214                return ogDesc;
215            }
216        }
217        
218        Elements metaDesc = doc.select("meta[name=description]");
219        if (metaDesc != null) 
220        {
221            String desc = metaDesc.attr("content");
222            if (StringUtils.isNotBlank(desc))
223            {
224                return desc;
225            }
226        }
227        
228        return StringUtils.EMPTY;
229    }
230    
231    private String _getImageUrl(Document doc)
232    {
233        Elements metaOgImage = doc.select("meta[property=og:image]");
234        if (metaOgImage != null) 
235        {
236            String ogImg = metaOgImage.attr("content");
237            if (StringUtils.isNotBlank(ogImg))
238            {
239                return ogImg;
240            }
241        }
242        
243        return StringUtils.EMPTY;
244    }
245    
246    private String _getFavicon(Document doc, URL url)
247    {
248        Element head = doc.head();
249        
250        Element element = head.select("link[rel=icon]").first();
251        if (element != null)
252        {
253            return element.absUrl("href");
254        }
255        
256        element = head.select("link[href~=.*\\.(ico|png|gif)]").first();
257        if (element != null)
258        {
259            return element.absUrl("href");
260        }
261        
262        element = head.select("meta[itemprop=image]").first();
263        if (element != null)
264        {
265            return element.absUrl("content");
266        }
267        
268        try
269        {
270            // Finally, try to get favico from [base_url]/favico.ico url
271            String favicoUrl = url.getProtocol() + "://" + url.getHost() + "/favicon.ico";
272            
273            Connection con = Jsoup.connect(favicoUrl)
274                    .ignoreContentType(true)
275                    .timeout(2000);
276            
277            String userAgent = _getUserAgent();
278            if (StringUtils.isNotBlank(userAgent))
279            {
280                con.userAgent(userAgent);
281            }
282            
283            Response response = con.execute();
284            if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length")))
285            {
286                return favicoUrl;
287            }
288        }
289        catch (IOException e)
290        {
291            // Ignore
292        }
293        
294        return null;
295    }
296    
297    /**
298     * Get the user agent from current request
299     * @return the user agent
300     */
301    private String _getUserAgent()
302    {
303        Request request = null;
304        try
305        {
306            request = ContextHelper.getRequest(_context);
307            return request.getHeader("User-Agent");
308        }
309        catch (Exception e)
310        {
311            // ignore, there's simply no current request
312        }
313        
314        return null;
315    }
316    
317}