001/* 002 * Copyright 2018 Anyware Services 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package org.ametys.web.url; 017 018import java.awt.image.BufferedImage; 019import java.io.ByteArrayInputStream; 020import java.io.ByteArrayOutputStream; 021import java.io.IOException; 022import java.io.InputStream; 023import java.net.URL; 024import java.util.HashMap; 025import java.util.List; 026import java.util.Map; 027 028import javax.imageio.ImageIO; 029 030import org.apache.avalon.framework.component.Component; 031import org.apache.avalon.framework.context.Context; 032import org.apache.avalon.framework.context.ContextException; 033import org.apache.avalon.framework.context.Contextualizable; 034import org.apache.cocoon.components.ContextHelper; 035import org.apache.cocoon.environment.Request; 036import org.apache.commons.lang3.StringUtils; 037import org.jsoup.Connection; 038import org.jsoup.Connection.Response; 039import org.jsoup.HttpStatusException; 040import org.jsoup.Jsoup; 041import org.jsoup.nodes.Document; 042import org.jsoup.nodes.Element; 043import org.jsoup.select.Elements; 044 045import org.ametys.core.ui.Callable; 046import org.ametys.runtime.plugin.component.AbstractLogEnabled; 047 048import net.sf.image4j.codec.ico.ICODecoder; 049 050/** 051 * Component to parse a HTML page to get its meta for preview 052 * 053 */ 054public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable 055{ 056 /** The avalon role */ 057 public static final String ROLE = UrlPreviewComponent.class.getName(); 058 059 private Context _context; 060 061 public void contextualize(Context context) throws ContextException 062 { 063 _context = context; 064 } 065 066 /** 067 * Fetch and parse the HTML page at given url to get the {@link UrlPreview} 068 * @param url the url to parse 069 * @param lang the language 070 * @return the {@link UrlPreview} 071 * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response. 072 * @throws IOException if an error occured while parsing HTML page 073 */ 074 public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException 075 { 076 Connection con = _getConnection(url, lang, false); 077 078 Response response = con.execute(); 079 try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes())) 080 { 081 Document doc = Jsoup.parse(is, "UTF-8", url); 082 083 UrlPreview urlPreview = new UrlPreview(); 084 urlPreview.setTitle(_getTitle(doc)); 085 urlPreview.setDescription(_getDescription(doc)); 086 urlPreview.setFavicon(_getFavicon(doc, response.url())); 087 urlPreview.setUrl(response.url().toString()); 088 urlPreview.setImageUrl(_getImageUrl(doc)); 089 090 return urlPreview; 091 } 092 } 093 094 /** 095 * Fetch and parse the HTML page at given url to get the favicon url 096 * @param url the url to parse 097 * @return the favicon url or null if not found 098 * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response. 099 * @throws IOException if an error occured while parsing HTML page 100 */ 101 public String getFavicon(String url) throws IOException 102 { 103 Connection con = _getConnection(url, "en", true); 104 105 Response response = con.execute(); 106 107 Document doc = null; 108 109 if (response.statusCode() == 200) 110 { 111 try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes())) 112 { 113 doc = Jsoup.parse(is, "UTF-8", url); 114 } 115 } 116 117 return _getFavicon(doc, response.url()); 118 } 119 120 /** 121 * Convert an file.ico into a file.png 122 * @param is the input stream of the file.ico 123 * @return the input stream of the file.png 124 * @throws IOException if failed to convert .ico 125 */ 126 public InputStream convertIcoToPng(InputStream is) throws IOException 127 { 128 ByteArrayOutputStream out = new ByteArrayOutputStream(); 129 List<BufferedImage> images = ICODecoder.read(is); 130 131 // Take ico with the bigger width 132 int width = 0; 133 BufferedImage biggerImage = null; 134 for (BufferedImage image : images) 135 { 136 if (image.getWidth() > width) 137 { 138 width = image.getWidth(); 139 biggerImage = image; 140 } 141 } 142 143 ImageIO.write(biggerImage, "png", out); 144 145 return new ByteArrayInputStream(out.toByteArray()); 146 } 147 148 /** 149 * Fetch and parse the HTML page at given url for preview 150 * @param url the url to parse 151 * @param lang the language 152 * @return the page information for preview 153 */ 154 @Callable 155 public Map<String, String> parseUrl(String url, String lang) 156 { 157 Map<String, String> preview = new HashMap<>(); 158 159 try 160 { 161 UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en")); 162 preview = urlPreview.toJSON(); 163 } 164 catch (HttpStatusException e) 165 { 166 getLogger().error("Failed to parse url '{}'", url, e); 167 preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl()); 168 } 169 catch (IOException e) 170 { 171 getLogger().error("Failed to parse url '{}'", url, e); 172 preview.put("error", "Failed to parse URL " + url); 173 } 174 175 return preview; 176 177 } 178 179 private Connection _getConnection(String url, String lang, boolean ignoreHttpErrors) 180 { 181 Connection con = Jsoup.connect(url) 182 .timeout(5000) // limit to 5s 183 .maxBodySize(50 * 1024) // limit to 50ko 184 .followRedirects(true) 185 .ignoreHttpErrors(ignoreHttpErrors) 186 .header("Accept-Language", lang) 187 .method(Connection.Method.GET); 188 189 String userAgent = _getUserAgent(); 190 if (StringUtils.isNotBlank(userAgent)) 191 { 192 con.userAgent(userAgent); 193 } 194 195 return con; 196 } 197 198 private String _getTitle(Document doc) 199 { 200 Elements metaOgTitle = doc.select("meta[property=og:title]"); 201 if (metaOgTitle != null) 202 { 203 String ogTitle = metaOgTitle.attr("content"); 204 if (StringUtils.isNotBlank(ogTitle)) 205 { 206 return ogTitle; 207 } 208 } 209 210 return doc.title(); 211 } 212 213 private String _getDescription(Document doc) 214 { 215 Elements metaOgDesc = doc.select("meta[property=og:description]"); 216 if (metaOgDesc != null) 217 { 218 String ogDesc = metaOgDesc.attr("content"); 219 if (StringUtils.isNotBlank(ogDesc)) 220 { 221 return ogDesc; 222 } 223 } 224 225 Elements metaDesc = doc.select("meta[name=description]"); 226 if (metaDesc != null) 227 { 228 String desc = metaDesc.attr("content"); 229 if (StringUtils.isNotBlank(desc)) 230 { 231 return desc; 232 } 233 } 234 235 return StringUtils.EMPTY; 236 } 237 238 private String _getImageUrl(Document doc) 239 { 240 Elements metaOgImage = doc.select("meta[property=og:image]"); 241 if (metaOgImage != null) 242 { 243 String ogImg = metaOgImage.attr("content"); 244 if (StringUtils.isNotBlank(ogImg)) 245 { 246 return ogImg; 247 } 248 } 249 250 return StringUtils.EMPTY; 251 } 252 253 private String _getFavicon(Document doc, URL url) 254 { 255 if (doc != null) 256 { 257 Element head = doc.head(); 258 259 Element element = head.select("link[rel=icon]").first(); 260 if (element != null) 261 { 262 return element.absUrl("href"); 263 } 264 265 element = head.select("link[rel='shortcut icon']").first(); 266 if (element != null) 267 { 268 return element.absUrl("href"); 269 } 270 271 element = head.select("link[href~=.*\\.(ico|png|gif)]").first(); 272 if (element != null) 273 { 274 return element.absUrl("href"); 275 } 276 277 element = head.select("meta[itemprop=image]").first(); 278 if (element != null) 279 { 280 return element.absUrl("content"); 281 } 282 } 283 284 try 285 { 286 // Finally, try to get favico from [base_url]/favico.ico url 287 String favicoUrl = url.getProtocol() + "://" + url.getHost() + "/favicon.ico"; 288 289 Connection con = Jsoup.connect(favicoUrl) 290 .ignoreContentType(true) 291 .timeout(2000); 292 293 String userAgent = _getUserAgent(); 294 if (StringUtils.isNotBlank(userAgent)) 295 { 296 con.userAgent(userAgent); 297 } 298 299 Response response = con.execute(); 300 if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length"))) 301 { 302 return favicoUrl; 303 } 304 } 305 catch (IOException e) 306 { 307 // Ignore 308 } 309 310 return null; 311 } 312 313 /** 314 * Get the user agent from current request 315 * @return the user agent 316 */ 317 private String _getUserAgent() 318 { 319 Request request = null; 320 try 321 { 322 request = ContextHelper.getRequest(_context); 323 return request.getHeader("User-Agent"); 324 } 325 catch (Exception e) 326 { 327 // ignore, there's simply no current request 328 } 329 330 return null; 331 } 332 333}