/*
 *  Copyright 2018 Anyware Services
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.ametys.web.url;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.imageio.ImageIO;

import org.apache.avalon.framework.component.Component;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.cocoon.components.ContextHelper;
import org.apache.cocoon.environment.Request;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.ametys.runtime.plugin.component.AbstractLogEnabled;

import net.sf.image4j.codec.ico.ICODecoder;

/**
 * Component to parse a HTML page to get its meta for preview
 */
public class UrlPreviewComponent extends AbstractLogEnabled implements Component, Contextualizable
{
    /** The avalon role */
    public static final String ROLE = UrlPreviewComponent.class.getName();
    
    /** The avalon context, used to reach the current request when there is one */
    private Context _context;
    
    @Override
    public void contextualize(Context context) throws ContextException
    {
        _context = context;
    }
    
    /**
     * Fetch and parse the HTML page at given url to get the {@link UrlPreview}
     * @param url the url to parse
     * @param lang the language, sent as the Accept-Language header
     * @return the {@link UrlPreview}
     * @throws HttpStatusException if the HTTP request resulted in a not OK HTTP response.
     * @throws IOException if an error occurred while fetching or parsing the HTML page
     */
    public UrlPreview getUrlPreview(String url, String lang) throws HttpStatusException, IOException
    {
        Connection con = _getConnection(url, lang, false);
        
        Response response = con.execute();
        Document doc = _parseDocument(response);
        
        UrlPreview urlPreview = new UrlPreview();
        urlPreview.setTitle(_getTitle(doc));
        urlPreview.setDescription(_getDescription(doc));
        urlPreview.setFavicon(_getFavicon(doc, response.url()));
        urlPreview.setUrl(response.url().toString());
        urlPreview.setImageUrl(_getImageUrl(doc));
        
        return urlPreview;
    }
    
    /**
     * Fetch and parse the HTML page at given url to get the favicon url.
     * HTTP error statuses are ignored on this request: when the response status is not 200
     * the page is not parsed and only the default <code>/favicon.ico</code> location is tried.
     * @param url the url to parse
     * @return the favicon url or null if not found
     * @throws IOException if an error occurred while fetching or parsing the HTML page
     */
    public String getFavicon(String url) throws IOException
    {
        Connection con = _getConnection(url, "en", true);
        
        Response response = con.execute();
        
        Document doc = null;
        if (response.statusCode() == 200)
        {
            doc = _parseDocument(response);
        }
        
        return _getFavicon(doc, response.url());
    }
    
    /**
     * Convert a file.ico into a file.png. When the .ico holds several images, the widest one is kept.
     * @param is the input stream of the file.ico
     * @return the input stream of the file.png
     * @throws IOException if failed to convert .ico, including when it holds no image
     */
    public InputStream convertIcoToPng(InputStream is) throws IOException
    {
        List<BufferedImage> images = ICODecoder.read(is);
        
        // Take ico with the bigger width (the first one wins on equal widths)
        BufferedImage biggerImage = null;
        for (BufferedImage image : images)
        {
            if (biggerImage == null || image.getWidth() > biggerImage.getWidth())
            {
                biggerImage = image;
            }
        }
        
        if (biggerImage == null)
        {
            // ImageIO.write would otherwise reject the null image with an IllegalArgumentException
            throw new IOException("No image could be decoded from the .ico stream");
        }
        
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        if (!ImageIO.write(biggerImage, "png", out))
        {
            throw new IOException("No image writer is available to encode the icon as PNG");
        }
        
        return new ByteArrayInputStream(out.toByteArray());
    }
    
    /**
     * Fetch and parse the HTML page at given url for preview.
     * Failures are logged and reported through an "error" entry of the returned map instead of being thrown.
     * @param url the url to parse
     * @param lang the language. "en" is used when blank.
     * @return the page information for preview
     */
    public Map<String, String> parseUrl(String url, String lang)
    {
        Map<String, String> preview = new HashMap<>();
        
        try
        {
            UrlPreview urlPreview = getUrlPreview(url, StringUtils.defaultIfBlank(lang, "en"));
            preview = urlPreview.toJSON();
        }
        catch (HttpStatusException e)
        {
            getLogger().error("Failed to parse url '{}'", url, e);
            preview.put("error", "Invalid response status code " + e.getStatusCode() + " for URL " + e.getUrl());
        }
        catch (IOException e)
        {
            getLogger().error("Failed to parse url '{}'", url, e);
            preview.put("error", "Failed to parse URL " + url);
        }
        
        return preview;
    }
    
    /**
     * Build the GET connection used to fetch a page
     * @param url the url to fetch
     * @param lang the value of the Accept-Language header
     * @param ignoreHttpErrors true to get the response even on a HTTP error status
     * @return the configured connection, not yet executed
     */
    private Connection _getConnection(String url, String lang, boolean ignoreHttpErrors)
    {
        Connection con = Jsoup.connect(url)
                .timeout(5000) // limit to 5s
                .maxBodySize(50 * 1024) // limit to 50ko
                .followRedirects(true)
                .ignoreHttpErrors(ignoreHttpErrors)
                .header("Accept-Language", lang)
                .method(Connection.Method.GET);
        
        // Forward the user agent of the current request, if any
        String userAgent = _getUserAgent();
        if (StringUtils.isNotBlank(userAgent))
        {
            con.userAgent(userAgent);
        }
        
        return con;
    }
    
    /**
     * Parse the body of a response as a HTML document
     * @param response the executed response
     * @return the parsed document
     * @throws IOException if an error occurred while parsing HTML page
     */
    private Document _parseDocument(Response response) throws IOException
    {
        try (InputStream is = new ByteArrayInputStream(response.bodyAsBytes()))
        {
            // Decode with the charset announced by the response (null lets jsoup detect it) and
            // resolve relative links against the final url, which differs from the requested one after a redirect
            return Jsoup.parse(is, response.charset(), response.url().toString());
        }
    }
    
    /**
     * Get the value of the "content" attribute of the elements matching a CSS query
     * @param doc the document
     * @param cssQuery the CSS query selecting the meta elements
     * @return the content, or an empty string if there is none
     */
    private String _getMetaContent(Document doc, String cssQuery)
    {
        // select() never returns null, and attr() yields an empty string when nothing matches
        Elements metas = doc.select(cssQuery);
        return metas.attr("content");
    }
    
    /**
     * Get the title of the page: the Open Graph title if any, the document title otherwise
     * @param doc the document
     * @return the title
     */
    private String _getTitle(Document doc)
    {
        String ogTitle = _getMetaContent(doc, "meta[property=og:title]");
        return StringUtils.isNotBlank(ogTitle) ? ogTitle : doc.title();
    }
    
    /**
     * Get the description of the page: the Open Graph description if any, the meta description otherwise
     * @param doc the document
     * @return the description, or an empty string if not found
     */
    private String _getDescription(Document doc)
    {
        String ogDesc = _getMetaContent(doc, "meta[property=og:description]");
        if (StringUtils.isNotBlank(ogDesc))
        {
            return ogDesc;
        }
        
        String desc = _getMetaContent(doc, "meta[name=description]");
        return StringUtils.isNotBlank(desc) ? desc : StringUtils.EMPTY;
    }
    
    /**
     * Get the Open Graph image of the page
     * @param doc the document
     * @return the image url as written in the page, or an empty string if not found
     */
    private String _getImageUrl(Document doc)
    {
        String ogImg = _getMetaContent(doc, "meta[property=og:image]");
        return StringUtils.isNotBlank(ogImg) ? ogImg : StringUtils.EMPTY;
    }
    
    /**
     * Get the absolute url held by the first element matching a CSS query which has a resolvable one
     * @param root the element to search in
     * @param cssQuery the CSS query
     * @param attribute the name of the attribute holding the url
     * @return the absolute url, or null if no matching element holds a resolvable url
     */
    private String _firstAbsoluteUrl(Element root, String cssQuery, String attribute)
    {
        for (Element element : root.select(cssQuery))
        {
            // absUrl() is empty when the attribute is missing or cannot be made absolute
            String absUrl = element.absUrl(attribute);
            if (StringUtils.isNotBlank(absUrl))
            {
                return absUrl;
            }
        }
        
        return null;
    }
    
    /**
     * Get the favicon url, from the head of the document if possible, from the default location of the site otherwise
     * @param doc the parsed document. Can be null to only try the default location.
     * @param url the url of the page
     * @return the favicon url or null if not found
     */
    private String _getFavicon(Document doc, URL url)
    {
        if (doc != null)
        {
            Element head = doc.head();
            
            // Candidates by decreasing relevance
            String favicon = _firstAbsoluteUrl(head, "link[rel=icon]", "href");
            if (favicon == null)
            {
                favicon = _firstAbsoluteUrl(head, "link[rel='shortcut icon']", "href");
            }
            if (favicon == null)
            {
                favicon = _firstAbsoluteUrl(head, "link[href~=.*\\.(ico|png|gif)]", "href");
            }
            if (favicon == null)
            {
                favicon = _firstAbsoluteUrl(head, "meta[itemprop=image]", "content");
            }
            
            if (favicon != null)
            {
                return favicon;
            }
        }
        
        try
        {
            // Finally, try to get favico from [base_url]/favicon.ico url, keeping an explicit port if any
            int port = url.getPort();
            String favicoUrl = url.getProtocol() + "://" + url.getHost() + (port == -1 ? "" : ":" + port) + "/favicon.ico";
            
            Connection con = Jsoup.connect(favicoUrl)
                    .ignoreContentType(true)
                    .timeout(2000);
            
            String userAgent = _getUserAgent();
            if (StringUtils.isNotBlank(userAgent))
            {
                con.userAgent(userAgent);
            }
            
            Response response = con.execute();
            if (response.statusCode() == 200 && !"0".equals(response.header("Content-Length")))
            {
                return favicoUrl;
            }
        }
        catch (IOException e)
        {
            // Best effort only: the site simply has no usable default favicon
            getLogger().debug("Unable to get the default favicon for url '{}'", url, e);
        }
        
        return null;
    }
    
    /**
     * Get the user agent from current request
     * @return the user agent, or null if it cannot be retrieved
     */
    private String _getUserAgent()
    {
        try
        {
            Request request = ContextHelper.getRequest(_context);
            return request.getHeader("User-Agent");
        }
        catch (Exception e)
        {
            // ignore, there's simply no current request
            return null;
        }
    }
}