001/*
002 *  Copyright 2018 Anyware Services
003 *
004 *  Licensed under the Apache License, Version 2.0 (the "License");
005 *  you may not use this file except in compliance with the License.
006 *  You may obtain a copy of the License at
007 *
008 *      http://www.apache.org/licenses/LICENSE-2.0
009 *
010 *  Unless required by applicable law or agreed to in writing, software
011 *  distributed under the License is distributed on an "AS IS" BASIS,
012 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 *  See the License for the specific language governing permissions and
014 *  limitations under the License.
015 */
016package org.ametys.web.url;
017
018import java.awt.image.BufferedImage;
019import java.io.ByteArrayInputStream;
020import java.io.ByteArrayOutputStream;
021import java.io.IOException;
022import java.io.InputStream;
023import java.net.URL;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import javax.imageio.ImageIO;
029
030import org.apache.avalon.framework.component.Component;
031import org.apache.avalon.framework.context.Context;
032import org.apache.avalon.framework.context.ContextException;
033import org.apache.avalon.framework.context.Contextualizable;
034import org.apache.cocoon.components.ContextHelper;
035import org.apache.cocoon.environment.Request;
036import org.apache.commons.lang3.StringUtils;
037import org.jsoup.Connection;
038import org.jsoup.Connection.Response;
039import org.jsoup.HttpStatusException;
040import org.jsoup.Jsoup;
041import org.jsoup.nodes.Document;
042import org.jsoup.nodes.Element;
043import org.jsoup.select.Elements;
044
045import org.ametys.core.ui.Callable;
046import org.ametys.runtime.plugin.component.AbstractLogEnabled;
047
048import net.sf.image4j.codec.ico.ICODecoder;
049
/**
 * Component that fetches and parses an HTML page to extract the metadata used for its preview
 * (title, description, image and favicon)
 */
054public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable
055{
056    /** The avalon role */
057    public static final String ROLE = UrlPreviewComponent.class.getName();
058
059    private Context _context;
060    
061    public void contextualize(Context context) throws ContextException
062    {
063        _context = context;
064    }
065    
066    /**
067     * Fetch and parse the HTML page at given url to get the {@link UrlPreview}
068     * @param url the url to parse
069     * @param lang the language
070     * @return the {@link UrlPreview}
071     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
072     * @throws IOException if an error occured while parsing HTML page
073     */
074    public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException
075    {
076        Connection con = _getConnection(url, lang, false);
077        
078        Response response = con.execute();
079        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
080        {
081            Document doc = Jsoup.parse(is, "UTF-8", url);
082            
083            UrlPreview urlPreview = new UrlPreview();
084            urlPreview.setTitle(_getTitle(doc));
085            urlPreview.setDescription(_getDescription(doc));
086            urlPreview.setFavicon(_getFavicon(doc, response.url()));
087            urlPreview.setUrl(response.url().toString());
088            urlPreview.setImageUrl(_getImageUrl(doc));
089            
090            return urlPreview;
091        }
092    }
093    
094    /**
095     * Fetch and parse the HTML page at given url to get the favicon url
096     * @param url the url to parse
097     * @return the favicon url or null if not found
098     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
099     * @throws IOException if an error occured while parsing HTML page
100     */
101    public String getFavicon(String url) throws IOException
102    {
103        Connection con = _getConnection(url, "en", true);
104        
105        Response response = con.execute();
106        
107        Document doc = null;
108        
109        if (response.statusCode() == 200)
110        {
111            try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
112            {
113                doc = Jsoup.parse(is, "UTF-8", url);
114            }
115        }
116        
117        return _getFavicon(doc, response.url());
118    }
119    
120    /**
121     * Convert an file.ico into a file.png
122     * @param is the input stream of the file.ico
123     * @return the input stream of the file.png
124     * @throws IOException if failed to convert .ico
125     */
126    public InputStream convertIcoToPng(InputStream is) throws IOException
127    {
128        ByteArrayOutputStream out = new ByteArrayOutputStream();
129        List<BufferedImage> images = ICODecoder.read(is);
130        
131        // Take ico with the bigger width
132        int width = 0;
133        BufferedImage biggerImage = null;
134        for (BufferedImage image : images)
135        {
136            if (image.getWidth() > width)
137            {
138                width = image.getWidth();
139                biggerImage = image;
140            }
141        }
142        
143        ImageIO.write(biggerImage, "png", out);
144        
145        return new ByteArrayInputStream(out.toByteArray());
146    }
147    
148    /**
149     * Fetch and parse the HTML page at given url for preview
150     * @param url the url to parse
151     * @param lang the language
152     * @return the page information for preview
153     */
154    @Callable
155    public Map<String, String> parseUrl(String url, String lang)
156    {
157        Map<String, String> preview = new HashMap<>();
158        
159        try
160        {
161            UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en"));
162            preview = urlPreview.toJSON();
163        }
164        catch (HttpStatusException e)
165        {
166            getLogger().error("Failed to parse url '{}'", url, e);
167            preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl());
168        }
169        catch (IOException e)
170        {
171            getLogger().error("Failed to parse url '{}'", url, e);
172            preview.put("error", "Failed to parse URL " + url);
173        }
174        
175        return preview;
176
177    }
178    
179    private Connection _getConnection(String url, String lang, boolean ignoreHttpErrors)
180    {
181        Connection con = Jsoup.connect(url)
182                .timeout(5000) // limit to 5s
183                .maxBodySize(50 * 1024) // limit to 50ko
184                .followRedirects(true)
185                .ignoreHttpErrors(ignoreHttpErrors)
186                .header("Accept-Language", lang)
187                .method(Connection.Method.GET);
188        
189        String userAgent = _getUserAgent();
190        if (StringUtils.isNotBlank(userAgent))
191        {
192            con.userAgent(userAgent);
193        }
194        
195        return con;
196    }
197    
198    private String _getTitle(Document doc)
199    {
200        Elements metaOgTitle = doc.select("meta[property=og:title]");
201        if (metaOgTitle != null) 
202        {
203            String ogTitle = metaOgTitle.attr("content");
204            if (StringUtils.isNotBlank(ogTitle))
205            {
206                return ogTitle;
207            }
208        }
209        
210        return doc.title();
211    }
212    
213    private String _getDescription(Document doc)
214    {
215        Elements metaOgDesc = doc.select("meta[property=og:description]");
216        if (metaOgDesc != null) 
217        {
218            String ogDesc = metaOgDesc.attr("content");
219            if (StringUtils.isNotBlank(ogDesc))
220            {
221                return ogDesc;
222            }
223        }
224        
225        Elements metaDesc = doc.select("meta[name=description]");
226        if (metaDesc != null) 
227        {
228            String desc = metaDesc.attr("content");
229            if (StringUtils.isNotBlank(desc))
230            {
231                return desc;
232            }
233        }
234        
235        return StringUtils.EMPTY;
236    }
237    
238    private String _getImageUrl(Document doc)
239    {
240        Elements metaOgImage = doc.select("meta[property=og:image]");
241        if (metaOgImage != null) 
242        {
243            String ogImg = metaOgImage.attr("content");
244            if (StringUtils.isNotBlank(ogImg))
245            {
246                return ogImg;
247            }
248        }
249        
250        return StringUtils.EMPTY;
251    }
252    
253    private String _getFavicon(Document doc, URL url)
254    {
255        if (doc != null)
256        {
257            Element head = doc.head();
258            
259            Element element = head.select("link[rel=icon]").first();
260            if (element != null)
261            {
262                return element.absUrl("href");
263            }
264            
265            element = head.select("link[rel='shortcut icon']").first();
266            if (element != null)
267            {
268                return element.absUrl("href");
269            }
270            
271            element = head.select("link[href~=.*\\.(ico|png|gif)]").first();
272            if (element != null)
273            {
274                return element.absUrl("href");
275            }
276            
277            element = head.select("meta[itemprop=image]").first();
278            if (element != null)
279            {
280                return element.absUrl("content");
281            }
282        }
283        
284        try
285        {
286            // Finally, try to get favico from [base_url]/favico.ico url
287            String favicoUrl = url.getProtocol() + "://" + url.getHost() + "/favicon.ico";
288            
289            Connection con = Jsoup.connect(favicoUrl)
290                    .ignoreContentType(true)
291                    .timeout(2000);
292            
293            String userAgent = _getUserAgent();
294            if (StringUtils.isNotBlank(userAgent))
295            {
296                con.userAgent(userAgent);
297            }
298            
299            Response response = con.execute();
300            if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length")))
301            {
302                return favicoUrl;
303            }
304        }
305        catch (IOException e)
306        {
307            // Ignore
308        }
309        
310        return null;
311    }
312    
313    /**
314     * Get the user agent from current request
315     * @return the user agent
316     */
317    private String _getUserAgent()
318    {
319        Request request = null;
320        try
321        {
322            request = ContextHelper.getRequest(_context);
323            return request.getHeader("User-Agent");
324        }
325        catch (Exception e)
326        {
327            // ignore, there's simply no current request
328        }
329        
330        return null;
331    }
332    
333}