001/* 002 * Copyright 2018 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.web.url; 017 018import java.awt.image.BufferedImage; 019import java.io.ByteArrayInputStream; 020import java.io.ByteArrayOutputStream; 021import java.io.IOException; 022import java.io.InputStream; 023import java.net.URL; 024import java.util.HashMap; 025import java.util.List; 026import java.util.Map; 027import java.util.regex.Pattern; 028 029import javax.imageio.ImageIO; 030 031import org.apache.avalon.framework.component.Component; 032import org.apache.avalon.framework.context.Context; 033import org.apache.avalon.framework.context.ContextException; 034import org.apache.avalon.framework.context.Contextualizable; 035import org.apache.cocoon.components.ContextHelper; 036import org.apache.cocoon.environment.Request; 037import org.apache.commons.lang3.StringUtils; 038import org.jsoup.Connection; 039import org.jsoup.Connection.Response; 040import org.jsoup.HttpStatusException; 041import org.jsoup.Jsoup; 042import org.jsoup.nodes.Document; 043import org.jsoup.nodes.Element; 044import org.jsoup.select.Elements; 045 046import org.ametys.runtime.plugin.component.AbstractLogEnabled; 047 048import net.sf.image4j.codec.ico.ICODecoder; 049 050/** 051 * Component to parse a HTML page to get its meta for preview 052 * 053 */ 054public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable 055{ 056 /** The avalon role */ 057 public static final String ROLE = UrlPreviewComponent.class.getName(); 058 059 /** Pattern to detect Ametys authentication redirection URLs */ 060 public static final Pattern AUTHENTICATE_PATTERN = Pattern.compile(".*/_authenticate\\?requestedURL=.*"); 061 062 private Context _context; 063 064 public void contextualize(Context context) throws ContextException 065 { 066 _context = context; 067 } 068 069 /** 070 * Fetch and parse the HTML page at given url to get the {@link UrlPreview} 071 * @param url the url to parse 072 * @param lang the language 073 * @return the {@link UrlPreview} 074 * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response. 075 * @throws IOException if an error occured while parsing HTML page 076 */ 077 public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException 078 { 079 Connection con = _getConnection(url, lang, false); 080 081 Response response = con.execute(); 082 try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes())) 083 { 084 String finalUrl = response.url().toString(); // follow redirects 085 086 // Consider URL that match Ametys authentication redirection URL as 401 Unauthorized url (no preview available) 087 if (finalUrl.matches(AUTHENTICATE_PATTERN.pattern())) 088 { 089 throw new HttpStatusException("URL leads to an authentication page", 401, url); 090 } 091 092 Document doc = Jsoup.parse(is, "UTF-8", url); 093 094 UrlPreview urlPreview = new UrlPreview(); 095 urlPreview.setTitle(_getTitle(doc)); 096 urlPreview.setDescription(_getDescription(doc)); 097 urlPreview.setFavicon(_getFavicon(doc, response.url())); 098 urlPreview.setUrl(finalUrl); 099 urlPreview.setImageUrl(_getImageUrl(doc)); 100 101 return urlPreview; 102 } 103 } 104 105 /** 106 * Fetch and parse the HTML page at given url to get the favicon url 107 * @param url the url to parse 108 * @return the favicon url or null if not found 109 * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response. 110 * @throws IOException if an error occured while parsing HTML page 111 */ 112 public String getFavicon(String url) throws IOException 113 { 114 Connection con = _getConnection(url, "en", true); 115 116 Response response = con.execute(); 117 118 Document doc = null; 119 120 if (response.statusCode() == 200) 121 { 122 try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes())) 123 { 124 doc = Jsoup.parse(is, "UTF-8", url); 125 } 126 } 127 128 return _getFavicon(doc, response.url()); 129 } 130 131 /** 132 * Convert an file.ico into a file.png 133 * @param is the input stream of the file.ico 134 * @return the input stream of the file.png 135 * @throws IOException if failed to convert .ico 136 */ 137 public InputStream convertIcoToPng(InputStream is) throws IOException 138 { 139 ByteArrayOutputStream out = new ByteArrayOutputStream(); 140 List<BufferedImage> images = ICODecoder.read(is); 141 142 // Take ico with the bigger width 143 int width = 0; 144 BufferedImage biggerImage = null; 145 for (BufferedImage image : images) 146 { 147 if (image.getWidth() > width) 148 { 149 width = image.getWidth(); 150 biggerImage = image; 151 } 152 } 153 154 ImageIO.write(biggerImage, "png", out); 155 156 return new ByteArrayInputStream(out.toByteArray()); 157 } 158 159 /** 160 * Fetch and parse the HTML page at given url for preview 161 * @param url the url to parse 162 * @param lang the language 163 * @return the page information for preview 164 */ 165 public Map<String, String> parseUrl(String url, String lang) 166 { 167 Map<String, String> preview = new HashMap<>(); 168 169 try 170 { 171 UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en")); 172 preview = urlPreview.toJSON(); 173 } 174 catch (HttpStatusException e) 175 { 176 getLogger().error("Failed to parse url '{}'", url, e); 177 preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl()); 178 } 179 catch (IOException e) 180 { 181 getLogger().error("Failed to parse url '{}'", url, e); 182 preview.put("error", "Failed to parse URL " + url); 183 } 184 185 return preview; 186 187 } 188 189 private Connection _getConnection(String url, String lang, boolean ignoreHttpErrors) 190 { 191 Connection con = Jsoup.connect(url) 192 .timeout(5000) // limit to 5s 193 .maxBodySize(50 * 1024) // limit to 50ko 194 .followRedirects(true) 195 .ignoreHttpErrors(ignoreHttpErrors) 196 .header("Accept-Language", lang) 197 .method(Connection.Method.GET); 198 199 String userAgent = _getUserAgent(); 200 if (StringUtils.isNotBlank(userAgent)) 201 { 202 con.userAgent(userAgent); 203 } 204 205 return con; 206 } 207 208 private String _getTitle(Document doc) 209 { 210 Elements metaOgTitle = doc.select("meta[property=og:title]"); 211 if (metaOgTitle != null) 212 { 213 String ogTitle = metaOgTitle.attr("content"); 214 if (StringUtils.isNotBlank(ogTitle)) 215 { 216 return ogTitle; 217 } 218 } 219 220 return doc.title(); 221 } 222 223 private String _getDescription(Document doc) 224 { 225 Elements metaOgDesc = doc.select("meta[property=og:description]"); 226 if (metaOgDesc != null) 227 { 228 String ogDesc = metaOgDesc.attr("content"); 229 if (StringUtils.isNotBlank(ogDesc)) 230 { 231 return ogDesc; 232 } 233 } 234 235 Elements metaDesc = doc.select("meta[name=description]"); 236 if (metaDesc != null) 237 { 238 String desc = metaDesc.attr("content"); 239 if (StringUtils.isNotBlank(desc)) 240 { 241 return desc; 242 } 243 } 244 245 return StringUtils.EMPTY; 246 } 247 248 private String _getImageUrl(Document doc) 249 { 250 Elements metaOgImage = doc.select("meta[property=og:image]"); 251 if (metaOgImage != null) 252 { 253 String ogImg = metaOgImage.attr("content"); 254 if (StringUtils.isNotBlank(ogImg)) 255 { 256 return ogImg; 257 } 258 } 259 260 return StringUtils.EMPTY; 261 } 262 263 private String _getFavicon(Document doc, URL url) 264 { 265 if (doc != null) 266 { 267 Element head = doc.head(); 268 269 Element element = head.select("link[rel=icon]").first(); 270 if (element != null) 271 { 272 return element.absUrl("href"); 273 } 274 275 element = head.select("link[rel='shortcut icon']").first(); 276 if (element != null) 277 { 278 return element.absUrl("href"); 279 } 280 281 element = head.select("link[href~=.*\\.(ico|png|gif)]").first(); 282 if (element != null) 283 { 284 return element.absUrl("href"); 285 } 286 287 element = head.select("meta[itemprop=image]").first(); 288 if (element != null) 289 { 290 return element.absUrl("content"); 291 } 292 } 293 294 try 295 { 296 // Finally, try to get favico from [base_url]/favico.ico url 297 String favicoUrl = url.getProtocol() + "://" + url.getHost() + "/favicon.ico"; 298 299 Connection con = Jsoup.connect(favicoUrl) 300 .ignoreContentType(true) 301 .timeout(2000); 302 303 String userAgent = _getUserAgent(); 304 if (StringUtils.isNotBlank(userAgent)) 305 { 306 con.userAgent(userAgent); 307 } 308 309 Response response = con.execute(); 310 if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length"))) 311 { 312 return favicoUrl; 313 } 314 } 315 catch (IOException e) 316 { 317 // Ignore 318 } 319 320 return null; 321 } 322 323 /** 324 * Get the user agent from current request 325 * @return the user agent 326 */ 327 private String _getUserAgent() 328 { 329 Request request = null; 330 try 331 { 332 request = ContextHelper.getRequest(_context); 333 return request.getHeader("User-Agent"); 334 } 335 catch (Exception e) 336 { 337 // ignore, there's simply no current request 338 } 339 340 return null; 341 } 342 343}