/*
 *  Copyright 2018 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.web.url;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.imageio.ImageIO;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.cocoon.components.ContextHelper;
import org.apache.cocoon.environment.Request;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.ametys.core.ui.Callable;
import org.ametys.runtime.plugin.component.AbstractLogEnabled;

import net.sf.image4j.codec.ico.ICODecoder;

/**
 * Component to parse a HTML page to get its meta for preview
 * (title, description, favicon, image).
 */
public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable
{
    /** The avalon role */
    public static final String ROLE = UrlPreviewComponent.class.getName();

    /** The avalon context, used to reach the current request (if any) */
    private Context _context;

    @Override
    public void contextualize(Context context) throws ContextException
    {
        _context = context;
    }

    /**
     * Fetch and parse the HTML page at given url to get the {@link UrlPreview}
     * @param url the url to parse
     * @param lang the language, sent as the <code>Accept-Language</code> header
     * @return the {@link UrlPreview}
     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
     * @throws IOException if an error occurred while fetching or parsing the HTML page
     */
    public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException
    {
        Response response = _getConnection(url, lang).execute();
        Document doc = _parseDocument(response);

        UrlPreview urlPreview = new UrlPreview();
        urlPreview.setTitle(_getTitle(doc));
        urlPreview.setDescription(_getDescription(doc));
        urlPreview.setFavicon(_getFavicon(doc, response.url()));
        urlPreview.setUrl(response.url().toString());
        urlPreview.setImageUrl(_getImageUrl(doc));

        return urlPreview;
    }

    /**
     * Fetch and parse the HTML page at given url to get the favicon url
     * @param url the url to parse
     * @return the favicon url or null if not found
     * @throws IOException if the HTTP request resulted in a not OK HTTP response
     *         (see {@link HttpStatusException}) or if an error occurred while parsing the HTML page
     */
    public String getFavicon(String url) throws IOException
    {
        Response response = _getConnection(url, "en").execute();
        Document doc = _parseDocument(response);

        return _getFavicon(doc, response.url());
    }

    /**
     * Convert a file.ico into a file.png.
     * When the .ico holds several images, the widest one is converted.
     * @param is the input stream of the file.ico
     * @return the input stream of the file.png
     * @throws IOException if failed to convert .ico, in particular when it contains no image
     */
    public InputStream convertIcoToPng(InputStream is) throws IOException
    {
        List<BufferedImage> images = ICODecoder.read(is);

        // Take ico with the bigger width
        int width = 0;
        BufferedImage biggerImage = null;
        for (BufferedImage image : images)
        {
            if (image.getWidth() > width)
            {
                width = image.getWidth();
                biggerImage = image;
            }
        }

        if (biggerImage == null)
        {
            // ImageIO.write would reject a null image with an unchecked IllegalArgumentException
            throw new IOException("Unable to convert .ico to .png: no image found in the .ico stream");
        }

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        if (!ImageIO.write(biggerImage, "png", out))
        {
            // ImageIO.write returns false when no appropriate writer is found: nothing was written
            throw new IOException("Unable to convert .ico to .png: no PNG writer available for the decoded image");
        }

        return new ByteArrayInputStream(out.toByteArray());
    }

    /**
     * Fetch and parse the HTML page at given url for preview.
     * Failures are logged and reported to the caller through an <code>error</code> entry
     * in the returned map.
     * @param url the url to parse
     * @param lang the language. Defaults to "en" when blank.
     * @return the page information for preview, or a map holding an <code>error</code> message
     */
    @Callable
    public Map<String, String> parseUrl(String url, String lang)
    {
        // NOTE(review): the url is supplied by the caller and fetched from the server side;
        // confirm upstream that access to internal hosts is restricted if this is a concern.
        Map<String, String> preview = new HashMap<>();

        try
        {
            UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en"));
            preview = urlPreview.toJSON();
        }
        catch (HttpStatusException e)
        {
            getLogger().error("Failed to parse url '{}'", url, e);
            preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl());
        }
        catch (IOException | IllegalArgumentException e)
        {
            // IllegalArgumentException is raised by jsoup for an empty or malformed url
            getLogger().error("Failed to parse url '{}'", url, e);
            preview.put("error", "Failed to parse URL " + url);
        }

        return preview;
    }

    /**
     * Build the (not yet executed) GET connection used to fetch a page.
     * @param url the url to fetch
     * @param lang the value of the <code>Accept-Language</code> header
     * @return the configured connection
     */
    private Connection _getConnection(String url, String lang)
    {
        Connection con = Jsoup.connect(url)
                .timeout(5000) // limit to 5s
                .maxBodySize(50 * 1024) // limit to 50ko
                .followRedirects(true)
                .header("Accept-Language", lang)
                .method(Connection.Method.GET);

        // Forward the user agent of the current request, when there is one
        String userAgent = _getUserAgent();
        if (StringUtils.isNotBlank(userAgent))
        {
            con.userAgent(userAgent);
        }

        return con;
    }

    /**
     * Parse the body of a response as a HTML document.
     * The base URI is the url of the response, i.e. the final url once redirects have been
     * followed, so that relative links of the page are resolved against the right location.
     * @param response the executed response
     * @return the parsed document
     * @throws IOException if an error occurred while parsing the HTML page
     */
    private Document _parseDocument(Response response) throws IOException
    {
        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
        {
            return Jsoup.parse(is, "UTF-8", response.url().toString());
        }
    }

    /**
     * Get the value of the <code>content</code> attribute of the elements matching a selector
     * @param doc the document
     * @param selector the CSS selector of the meta element(s)
     * @return the content as returned by {@link Elements#attr(String)}
     */
    private String _getMetaContent(Document doc, String selector)
    {
        Elements metas = doc.select(selector);
        return metas.attr("content");
    }

    /**
     * Get the title of the page: the Open Graph title if any, the document title otherwise
     * @param doc the document
     * @return the title
     */
    private String _getTitle(Document doc)
    {
        String ogTitle = _getMetaContent(doc, "meta[property=og:title]");
        return StringUtils.isNotBlank(ogTitle) ? ogTitle : doc.title();
    }

    /**
     * Get the description of the page: the Open Graph description if any,
     * the description meta otherwise
     * @param doc the document
     * @return the description or an empty string if not found
     */
    private String _getDescription(Document doc)
    {
        String ogDesc = _getMetaContent(doc, "meta[property=og:description]");
        if (StringUtils.isNotBlank(ogDesc))
        {
            return ogDesc;
        }

        String desc = _getMetaContent(doc, "meta[name=description]");
        return StringUtils.isNotBlank(desc) ? desc : StringUtils.EMPTY;
    }

    /**
     * Get the url of the Open Graph image of the page
     * @param doc the document
     * @return the image url or an empty string if not found
     */
    private String _getImageUrl(Document doc)
    {
        String ogImg = _getMetaContent(doc, "meta[property=og:image]");
        return StringUtils.isNotBlank(ogImg) ? ogImg : StringUtils.EMPTY;
    }

    /**
     * Get the absolute url held by an attribute of the first element matching a selector
     * @param root the element to search in
     * @param selector the CSS selector
     * @param attribute the name of the attribute holding the url
     * @return the absolute url, or null if no element matches or if no absolute url could be built
     */
    private String _getAbsoluteUrl(Element root, String selector, String attribute)
    {
        Element element = root.select(selector).first();
        if (element == null)
        {
            return null;
        }

        // absUrl gives a blank value when the attribute cannot be resolved: treat it as "not found"
        String absoluteUrl = element.absUrl(attribute);
        return StringUtils.isNotBlank(absoluteUrl) ? absoluteUrl : null;
    }

    /**
     * Get the favicon of a page. The candidates are, in this order: the icon link, any link
     * to a .ico/.png/.gif file, the image meta and finally [base_url]/favicon.ico
     * @param doc the document
     * @param url the url of the page
     * @return the favicon url or null if not found
     */
    private String _getFavicon(Document doc, URL url)
    {
        Element head = doc.head();

        String favicon = _getAbsoluteUrl(head, "link[rel=icon]", "href");
        if (favicon != null)
        {
            return favicon;
        }

        favicon = _getAbsoluteUrl(head, "link[href~=.*\\.(ico|png|gif)]", "href");
        if (favicon != null)
        {
            return favicon;
        }

        favicon = _getAbsoluteUrl(head, "meta[itemprop=image]", "content");
        if (favicon != null)
        {
            return favicon;
        }

        try
        {
            // Finally, try to get favicon from [base_url]/favicon.ico url
            // The port (if any) is part of the base url: getPort() is -1 when the url has none
            int port = url.getPort();
            String favicoUrl = url.getProtocol() + "://" + url.getHost() + (port != -1 ? ":" + port : "") + "/favicon.ico";

            Connection con = Jsoup.connect(favicoUrl)
                    .ignoreContentType(true)
                    .timeout(2000);

            String userAgent = _getUserAgent();
            if (StringUtils.isNotBlank(userAgent))
            {
                con.userAgent(userAgent);
            }

            Response response = con.execute();
            if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length")))
            {
                return favicoUrl;
            }
        }
        catch (IOException e)
        {
            // Best effort only: the page simply has no favicon at the default location
            getLogger().debug("No favicon found at the default location for url '{}'", url, e);
        }

        return null;
    }

    /**
     * Get the user agent from current request
     * @return the user agent or null if there is no current request
     */
    private String _getUserAgent()
    {
        try
        {
            Request request = ContextHelper.getRequest(_context);
            return request.getHeader("User-Agent");
        }
        catch (Exception e)
        {
            // ignore, there's simply no current request
            return null;
        }
    }
}