2022-11-19 18:43:56 +08:00
|
|
|
package getter
|
2022-11-19 16:58:55 +08:00
|
|
|
|
|
|
|
import (
|
2023-09-17 22:55:13 +08:00
|
|
|
"errors"
|
2022-11-19 16:58:55 +08:00
|
|
|
"io"
|
|
|
|
"net/http"
|
2022-11-19 18:43:56 +08:00
|
|
|
"net/url"
|
2022-11-19 16:58:55 +08:00
|
|
|
|
|
|
|
"golang.org/x/net/html"
|
|
|
|
"golang.org/x/net/html/atom"
|
|
|
|
)
|
|
|
|
|
|
|
|
// HTMLMeta holds the link-preview metadata collected from an HTML page.
type HTMLMeta struct {
	// Title is the page title.
	Title string `json:"title"`
	// Description is a short summary of the page.
	Description string `json:"description"`
	// Image is the preview image reference taken from the page metadata.
	Image string `json:"image"`
}
|
|
|
|
|
2022-11-19 18:43:56 +08:00
|
|
|
func GetHTMLMeta(urlStr string) (*HTMLMeta, error) {
|
|
|
|
if _, err := url.Parse(urlStr); err != nil {
|
2022-11-19 16:58:55 +08:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2022-11-19 18:43:56 +08:00
|
|
|
response, err := http.Get(urlStr)
|
2022-11-19 16:58:55 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
defer response.Body.Close()
|
|
|
|
|
2022-11-19 18:43:56 +08:00
|
|
|
mediatype, err := getMediatype(response)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if mediatype != "text/html" {
|
2024-04-29 08:00:37 +08:00
|
|
|
return nil, errors.New("not a HTML page")
|
2022-11-19 18:43:56 +08:00
|
|
|
}
|
2022-11-19 16:58:55 +08:00
|
|
|
|
2022-11-19 18:43:56 +08:00
|
|
|
htmlMeta := extractHTMLMeta(response.Body)
|
2022-11-19 16:58:55 +08:00
|
|
|
return htmlMeta, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func extractHTMLMeta(resp io.Reader) *HTMLMeta {
|
|
|
|
tokenizer := html.NewTokenizer(resp)
|
|
|
|
htmlMeta := new(HTMLMeta)
|
|
|
|
|
|
|
|
for {
|
|
|
|
tokenType := tokenizer.Next()
|
|
|
|
if tokenType == html.ErrorToken {
|
|
|
|
break
|
|
|
|
} else if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
|
|
|
|
token := tokenizer.Token()
|
|
|
|
if token.DataAtom == atom.Body {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
if token.DataAtom == atom.Title {
|
|
|
|
tokenizer.Next()
|
|
|
|
token := tokenizer.Token()
|
|
|
|
htmlMeta.Title = token.Data
|
|
|
|
} else if token.DataAtom == atom.Meta {
|
|
|
|
description, ok := extractMetaProperty(token, "description")
|
|
|
|
if ok {
|
|
|
|
htmlMeta.Description = description
|
|
|
|
}
|
|
|
|
|
|
|
|
ogTitle, ok := extractMetaProperty(token, "og:title")
|
|
|
|
if ok {
|
|
|
|
htmlMeta.Title = ogTitle
|
|
|
|
}
|
|
|
|
|
|
|
|
ogDescription, ok := extractMetaProperty(token, "og:description")
|
|
|
|
if ok {
|
|
|
|
htmlMeta.Description = ogDescription
|
|
|
|
}
|
|
|
|
|
|
|
|
ogImage, ok := extractMetaProperty(token, "og:image")
|
|
|
|
if ok {
|
|
|
|
htmlMeta.Image = ogImage
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return htmlMeta
|
|
|
|
}
|
|
|
|
|
|
|
|
func extractMetaProperty(token html.Token, prop string) (content string, ok bool) {
|
|
|
|
content, ok = "", false
|
|
|
|
for _, attr := range token.Attr {
|
|
|
|
if attr.Key == "property" && attr.Val == prop {
|
|
|
|
ok = true
|
|
|
|
}
|
|
|
|
if attr.Key == "content" {
|
|
|
|
content = attr.Val
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return content, ok
|
|
|
|
}
|