diff --git a/readability/read.go b/readability/read.go index 9b7440e..28dab62 100644 --- a/readability/read.go +++ b/readability/read.go @@ -4,6 +4,7 @@ import ( "bytes" "fmt" ghtml "html" + "io" "math" "net/http" nurl "net/url" @@ -64,19 +65,6 @@ type Article struct { RawContent string } -func fetchURL(url *nurl.URL, timeout time.Duration) (*goquery.Document, error) { - // Fetch page from URL - client := &http.Client{Timeout: timeout} - resp, err := client.Get(url.String()) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - // Create goquery document - return goquery.NewDocumentFromReader(resp.Body) -} - // removeScripts removes script tags from the document. func removeScripts(doc *goquery.Document) { doc.Find("script").Remove() @@ -1130,10 +1118,34 @@ func estimateReadTime(articleContent *goquery.Selection) (int, int) { return int(minReadTime), int(maxReadTime) } -// Parse an URL to readability format -func Parse(url *nurl.URL, timeout time.Duration) (Article, error) { - // Fetch page - doc, err := fetchURL(url, timeout) +// FromURL get readable content from the specified URL +func FromURL(url *nurl.URL, timeout time.Duration) (Article, error) { + // Fetch page from URL + client := &http.Client{Timeout: timeout} + resp, err := client.Get(url.String()) + if err != nil { + return Article{}, err + } + defer resp.Body.Close() + + // If response is not HTML, stop process + mimeType, err := getMimeType(resp.Body) + if err != nil { + return Article{}, err + } + + if !strings.HasPrefix(mimeType, "text/html") { + return Article{}, fmt.Errorf("URL must be a text/html, found %s", mimeType) + } + + // Parse response body + return FromReader(resp.Body, url) +} + +// FromReader get readable content from the specified io.Reader +func FromReader(reader io.Reader, url *nurl.URL) (Article, error) { + // Create goquery document + doc, err := goquery.NewDocumentFromReader(reader) if err != nil { return Article{}, err } diff --git a/readability/utils.go 
// getMimeType detects the MIME type of the data available from resp by
// inspecting its leading bytes with http.DetectContentType.
//
// Up to 512 bytes are read from resp and are consumed by this call; a caller
// that still needs the full stream must buffer or replay them itself.
// A stream shorter than 512 bytes (including an empty one) is not an error:
// detection simply runs on the bytes that were available. Any other read
// error is returned unchanged with an empty MIME type.
func getMimeType(resp io.Reader) (string, error) {
	// http.DetectContentType considers at most the first 512 bytes.
	buffer := make([]byte, 512)

	// A single Read may return fewer bytes than are available, so keep
	// reading until the buffer is full or the stream ends.
	n, err := io.ReadFull(resp, buffer)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}

	// Only pass the bytes actually read; the zero padding of a partially
	// filled buffer would otherwise be sniffed as binary data.
	return http.DetectContentType(buffer[:n]), nil
}

// normalizeText collapses every run of whitespace in str into a single space
// and drops leading and trailing whitespace.
func normalizeText(str string) string {
	return strings.Join(strings.Fields(str), " ")
}