mirror of
https://github.com/go-shiori/shiori.git
synced 2024-11-16 14:16:29 +08:00
Readability only parse URL that target text/html content
This commit is contained in:
parent
9fe82fc623
commit
49a9a12d6f
2 changed files with 42 additions and 17 deletions
|
@ -4,6 +4,7 @@ import (
|
|||
"bytes"
|
||||
"fmt"
|
||||
ghtml "html"
|
||||
"io"
|
||||
"math"
|
||||
"net/http"
|
||||
nurl "net/url"
|
||||
|
@ -64,19 +65,6 @@ type Article struct {
|
|||
RawContent string
|
||||
}
|
||||
|
||||
func fetchURL(url *nurl.URL, timeout time.Duration) (*goquery.Document, error) {
|
||||
// Fetch page from URL
|
||||
client := &http.Client{Timeout: timeout}
|
||||
resp, err := client.Get(url.String())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Create goquery document
|
||||
return goquery.NewDocumentFromReader(resp.Body)
|
||||
}
|
||||
|
||||
// removeScripts removes script tags from the document.
|
||||
func removeScripts(doc *goquery.Document) {
|
||||
doc.Find("script").Remove()
|
||||
|
@ -1130,10 +1118,34 @@ func estimateReadTime(articleContent *goquery.Selection) (int, int) {
|
|||
return int(minReadTime), int(maxReadTime)
|
||||
}
|
||||
|
||||
// Parse an URL to readability format
|
||||
func Parse(url *nurl.URL, timeout time.Duration) (Article, error) {
|
||||
// Fetch page
|
||||
doc, err := fetchURL(url, timeout)
|
||||
// FromURL get readable content from the specified URL
|
||||
func FromURL(url *nurl.URL, timeout time.Duration) (Article, error) {
|
||||
// Fetch page from URL
|
||||
client := &http.Client{Timeout: timeout}
|
||||
resp, err := client.Get(url.String())
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// If response is not HTML, stop process
|
||||
mimeType, err := getMimeType(resp.Body)
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
||||
if !strings.HasPrefix(mimeType, "text/html") {
|
||||
return Article{}, fmt.Errorf("URL must be a text/html, found %s", mimeType)
|
||||
}
|
||||
|
||||
// Parse response body
|
||||
return FromReader(resp.Body, url)
|
||||
}
|
||||
|
||||
// FromReader gets readable content from the specified io.Reader
|
||||
func FromReader(reader io.Reader, url *nurl.URL) (Article, error) {
|
||||
// Create goquery document
|
||||
doc, err := goquery.NewDocumentFromReader(reader)
|
||||
if err != nil {
|
||||
return Article{}, err
|
||||
}
|
||||
|
|
|
@ -3,6 +3,8 @@ package readability
|
|||
import (
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
@ -68,6 +70,17 @@ func removeSeparator(str string, separators ...string) string {
|
|||
return strings.Join(finalWords, " ")
|
||||
}
|
||||
|
||||
// getMimeType sniffs the MIME type of the data at the start of resp.
//
// It reads up to 512 bytes from resp (http.DetectContentType considers at
// most that many) and returns the detected type. The bytes read are consumed
// from resp; callers that need the full stream afterwards must keep their
// own copy of them.
//
// An empty reader yields io.EOF. Input shorter than 512 bytes is not an
// error. Any other read error is returned unchanged.
func getMimeType(resp io.Reader) (string, error) {
	buffer := make([]byte, 512)

	// A single Read may return fewer bytes than are available, so keep
	// reading until the buffer is full or the stream ends.
	n, err := io.ReadFull(resp, buffer)
	if err != nil && err != io.ErrUnexpectedEOF {
		return "", err
	}

	// Only sniff the bytes actually read: the zero padding in the rest of
	// the buffer would make short text look like binary data.
	return http.DetectContentType(buffer[:n]), nil
}
|
||||
|
||||
// normalizeText splits str on runs of whitespace and rejoins the pieces with
// a single space, which also drops leading and trailing whitespace.
func normalizeText(str string) string {
	words := strings.Fields(str)
	return strings.Join(words, " ")
}
|
||||
|
|
Loading…
Reference in a new issue