Readability only parses URLs that target text/html content

This commit is contained in:
Radhi Fadlillah 2018-05-20 16:33:06 +07:00
parent 9fe82fc623
commit 49a9a12d6f
2 changed files with 42 additions and 17 deletions

View file

@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
ghtml "html"
"io"
"math"
"net/http"
nurl "net/url"
@ -64,19 +65,6 @@ type Article struct {
RawContent string
}
// fetchURL downloads the page at url and builds a goquery document from the
// response body. timeout is used as the Timeout of the HTTP client that
// performs the request. The response body is closed before returning.
func fetchURL(url *nurl.URL, timeout time.Duration) (*goquery.Document, error) {
	httpClient := http.Client{Timeout: timeout}

	// Request the page.
	response, fetchErr := httpClient.Get(url.String())
	if fetchErr != nil {
		return nil, fetchErr
	}
	defer response.Body.Close()

	// Hand the body to goquery and pass its result straight through.
	document, parseErr := goquery.NewDocumentFromReader(response.Body)
	return document, parseErr
}
// removeScripts removes script tags from the document.
func removeScripts(doc *goquery.Document) {
doc.Find("script").Remove()
@ -1130,10 +1118,34 @@ func estimateReadTime(articleContent *goquery.Selection) (int, int) {
return int(minReadTime), int(maxReadTime)
}
// Parse a URL to readability format
func Parse(url *nurl.URL, timeout time.Duration) (Article, error) {
// Fetch page
doc, err := fetchURL(url, timeout)
// FromURL fetches the page at url and extracts its readable content.
//
// The request is made with an HTTP client whose Timeout is timeout. The
// beginning of the response body is sniffed with getMimeType; when the
// detected type does not start with "text/html" an error is returned and the
// body is not parsed. Otherwise the complete body is handed to FromReader.
func FromURL(url *nurl.URL, timeout time.Duration) (Article, error) {
	// Fetch page from URL
	client := &http.Client{Timeout: timeout}
	resp, err := client.Get(url.String())
	if err != nil {
		return Article{}, err
	}
	defer resp.Body.Close()

	// Sniffing consumes the first bytes of the body. Mirror everything that
	// getMimeType reads into a buffer so it can be replayed to the parser;
	// otherwise the document would be parsed with its beginning missing.
	var sniffed bytes.Buffer
	mimeType, err := getMimeType(io.TeeReader(resp.Body, &sniffed))
	if err != nil {
		return Article{}, err
	}

	// If response is not HTML, stop process
	if !strings.HasPrefix(mimeType, "text/html") {
		return Article{}, fmt.Errorf("URL must be a text/html, found %s", mimeType)
	}

	// Parse the full body: the sniffed prefix followed by the unread remainder
	return FromReader(io.MultiReader(&sniffed, resp.Body), url)
}
// FromReader gets readable content from the specified io.Reader
func FromReader(reader io.Reader, url *nurl.URL) (Article, error) {
// Create goquery document
doc, err := goquery.NewDocumentFromReader(reader)
if err != nil {
return Article{}, err
}

View file

@ -3,6 +3,8 @@ package readability
import (
"crypto/md5"
"fmt"
"io"
"net/http"
"os"
"strings"
"unicode/utf8"
@ -68,6 +70,17 @@ func removeSeparator(str string, separators ...string) string {
return strings.Join(finalWords, " ")
}
// getMimeType sniffs the content type of the data at the start of resp.
//
// It reads up to 512 bytes from resp and passes exactly the bytes that were
// read to http.DetectContentType. Those bytes are consumed from resp. An
// error is returned when no data could be read at all (io.EOF for an empty
// reader) or when reading fails for any reason other than the input being
// shorter than 512 bytes.
func getMimeType(resp io.Reader) (string, error) {
	buffer := make([]byte, 512)

	// A single Read may return fewer bytes than are available, so keep
	// reading until the buffer is full or the input ends.
	n, err := io.ReadFull(resp, buffer)
	if err != nil && err != io.ErrUnexpectedEOF {
		return "", err
	}

	// Only hand over the bytes actually read: the zero padding left in the
	// rest of the buffer would make short textual input look like binary.
	return http.DetectContentType(buffer[:n]), nil
}
// normalizeText splits str on whitespace and joins the resulting words with
// single spaces, which also drops leading and trailing whitespace.
func normalizeText(str string) string {
	words := strings.Fields(str)
	return strings.Join(words, " ")
}