package app import ( "fmt" "image" "io" "log" "net/http" "net/url" "path/filepath" "strconv" "strings" "golang.org/x/net/html" ) // Insights parsed from a web URL. type Insights struct { Title string Icons []Icon } // Icon found for the web URL. type Icon struct { Size string URL string Data image.Image // raw image data, if available } // Width returns the icon's width. func (i Icon) Width() int { parts := strings.Split(i.Size, "x") if len(parts) > 1 { w, _ := strconv.Atoi(parts[0]) return w } return 0 } // Parse a web page and return page insights. func Parse(url string) (Insights, error) { log.Printf("### Parse HTML on %s for Title and Icon URLs", url) var ( result = Insights{ Icons: []Icon{}, } inTag = "" ) resp, err := http.Get(url) if err != nil { return result, fmt.Errorf("HTTP error: %s", err) } defer resp.Body.Close() z := html.NewTokenizer(resp.Body) parsing := true for parsing { tt := z.Next() token := z.Token() switch tt { case html.ErrorToken: fmt.Printf("error: %s\n", z.Err()) if z.Err() == io.EOF { // successful error condition parsing = false break } return result, fmt.Errorf("HTML parsing error: %s", z.Err()) case html.TextToken: if inTag == "title" && result.Title == "" { result.Title += token.Data } case html.StartTagToken: inTag = token.Data // Looking for if token.Data == "link" { attr := AttrDict(token) rel, _ := attr["rel"] sizes, _ := attr["sizes"] href, _ := attr["href"] // Ensure "//" URIs start with "https://" href = AddScheme(href) if rel == "shortcut icon" { log.Printf(`Found URL: %s`, href) if sizes == "" { sizes = "16x16" } // If an ico file, extract the PNGs. if filepath.Ext(href) == ".ico" { log.Printf("The favicon is a .ico file, extracting PNGs") if resp, err := http.Get(href); err == nil { if pngs, err := IcoToPNG(resp.Body); err == nil { for _, png := range pngs { size := png.Bounds().Size() result.Icons = append(result.Icons, Icon{ Size: fmt.Sprintf("%dx%d", size.X, size.Y), URL: href, Data: png, }) } resp.Body.Close() continue } else { log.Printf("Error extracting PNG from %s: %s", href, err) } resp.Body.Close() } else { log.Printf("HTTP error downloading %s: %s", href, err) } } result.Icons = append(result.Icons, Icon{ Size: sizes, URL: href, }) } } case html.EndTagToken, html.SelfClosingTagToken: inTag = "" } } return result, nil } // DetectIcons checks well-known icon URLs on a base domain. func DetectIcons(weburl string) ([]Icon, error) { var result = []Icon{} uri, err := url.Parse(weburl) if err != nil { return result, err } baseURL := uri.Scheme + "://" + uri.Host log.Printf("### Auto-detecting Icon URLs from site %s", baseURL) tryURI := []string{ "/apple-touch-icon.png", "/apple-touch-icon-180x180.png", "/apple-touch-icon-152x152.png", "/apple-touch-icon-144x144.png", "/apple-touch-icon-120x120.png", "/apple-touch-icon-114x114.png", "/apple-touch-icon-76x76.png", "/apple-touch-icon-72x72.png", "/apple-touch-icon-57x57.png", "/apple-touch-icon-60x60.png", "/favicon.ico", } for _, uri := range tryURI { resp, err := http.Get(baseURL + uri) if err != nil { continue } defer resp.Body.Close() // PNG images? if filepath.Ext(uri) == ".png" { png, err := ParsePNG(resp.Body) if err != nil { continue } size := png.Bounds().Size() result = append(result, Icon{ Size: fmt.Sprintf("%dx%d", size.X, size.Y), URL: baseURL + uri, }) } else if filepath.Ext(uri) == ".ico" { // Extract the PNG images from the icon. pngs, err := IcoToPNG(resp.Body) if err != nil { continue } for _, png := range pngs { size := png.Bounds().Size() result = append(result, Icon{ Size: fmt.Sprintf("%dx%d", size.X, size.Y), URL: baseURL + uri, Data: png, }) } } log.Printf("Found icon: %s", uri) _ = resp } return result, nil } // AttrDict converts an HTML Token attributes list into a hash map. func AttrDict(token html.Token) map[string]string { var result = map[string]string{} for _, attr := range token.Attr { result[attr.Key] = attr.Val } return result } // AddScheme ensures an HTTP URL has a valid scheme, converting "//" into // "https://" func AddScheme(uri string) string { if strings.HasPrefix(uri, "//") { return "https:" + uri } return uri }