Mirror of https://github.com/go-shiori/shiori.git

Now only fetch text content if page is readable

commit af1a32ac4f (parent 43040a9bc4)
3 changed files with 28 additions and 4 deletions
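The change follows the same pattern in all three handlers: an HTTP response body can only be read once, so it is fanned out with io.MultiWriter into a new third bytes.Buffer (readabilityCheckInput) that feeds readability.IsReadable, and book.Content is cleared when the page is not readable. Below is a minimal, self-contained sketch of that buffering pattern using only the standard library; the go-readability calls are replaced by a hypothetical isReadable helper, since their exact signatures depend on the library version, and the URL and output are placeholders.

// Sketch of the buffering pattern this commit relies on. The real code lives
// in shiori's add/update handlers; the readability check below is a stand-in,
// not the go-readability implementation.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"strings"
)

// isReadable is a placeholder for readability.IsReadable: the real function
// inspects the parsed HTML, this stub only looks for an <article> element.
func isReadable(r io.Reader) bool {
	raw, err := io.ReadAll(r)
	if err != nil {
		return false
	}
	return strings.Contains(strings.ToLower(string(raw)), "<article")
}

func main() {
	resp, err := http.Get("https://example.com")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	// Split the response body so it can be processed several times:
	// one copy for archival, one for the readability parser, and a new
	// third copy for the readability check.
	archivalInput := bytes.NewBuffer(nil)
	readabilityInput := bytes.NewBuffer(nil)
	readabilityCheckInput := bytes.NewBuffer(nil)
	multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)

	if _, err := io.Copy(multiWriter, resp.Body); err != nil {
		fmt.Println("copy failed:", err)
		return
	}

	contentType := resp.Header.Get("Content-Type")
	if strings.Contains(contentType, "text/html") {
		readable := isReadable(readabilityCheckInput)

		// In shiori, readabilityInput would now go to readability.FromReader
		// and archivalInput to the archiver; here we just keep the raw text.
		content := readabilityInput.String()
		if !readable {
			// Mirror the commit: store no text content for unreadable pages.
			content = ""
		}

		fmt.Printf("archived %d bytes, kept %d bytes of text content\n",
			archivalInput.Len(), len(content))
	}
}

The point of the three buffers is that the readability check does not consume the bytes still needed by the parser or the archiver. The changed hunks follow.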
@@ -101,7 +101,8 @@ func addHandler(cmd *cobra.Command, args []string) {
 	// Split response body so it can be processed twice
 	archivalInput := bytes.NewBuffer(nil)
 	readabilityInput := bytes.NewBuffer(nil)
-	multiWriter := io.MultiWriter(archivalInput, readabilityInput)
+	readabilityCheckInput := bytes.NewBuffer(nil)
+	multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
 
 	_, err = io.Copy(multiWriter, resp.Body)
 	if err != nil {
@@ -112,6 +113,8 @@ func addHandler(cmd *cobra.Command, args []string) {
 	// If this is HTML, parse for readable content
 	contentType := resp.Header.Get("Content-Type")
 	if strings.Contains(contentType, "text/html") {
+		isReadable := readability.IsReadable(readabilityCheckInput)
+
 		article, err := readability.FromReader(readabilityInput, url)
 		if err != nil {
 			cError.Printf("Failed to parse article: %v\n", err)
@@ -131,6 +134,10 @@ func addHandler(cmd *cobra.Command, args []string) {
 			book.Excerpt = article.Excerpt
 		}
 
+		if !isReadable {
+			book.Content = ""
+		}
+
 		// Get image URL
 		if article.Image != "" {
 			imageURLs = append(imageURLs, article.Image)
@@ -185,7 +185,8 @@ func updateHandler(cmd *cobra.Command, args []string) {
 	// Split response body so it can be processed twice
 	archivalInput := bytes.NewBuffer(nil)
 	readabilityInput := bytes.NewBuffer(nil)
-	multiWriter := io.MultiWriter(archivalInput, readabilityInput)
+	readabilityCheckInput := bytes.NewBuffer(nil)
+	multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
 
 	_, err = io.Copy(multiWriter, resp.Body)
 	if err != nil {
@@ -197,6 +198,8 @@ func updateHandler(cmd *cobra.Command, args []string) {
 	// If this is HTML, parse for readable content
 	contentType := resp.Header.Get("Content-Type")
 	if strings.Contains(contentType, "text/html") {
+		isReadable := readability.IsReadable(readabilityCheckInput)
+
 		article, err := readability.FromReader(readabilityInput, book.URL)
 		if err != nil {
 			chProblem <- book.ID
@@ -208,6 +211,10 @@ func updateHandler(cmd *cobra.Command, args []string) {
 		book.Content = article.TextContent
 		book.HTML = article.Content
 
+		if !isReadable {
+			book.Content = ""
+		}
+
 		if !dontOverwrite {
 			book.Title = article.Title
 			book.Excerpt = article.Excerpt
@@ -237,7 +237,8 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
 	// Split response body so it can be processed twice
 	archivalInput := bytes.NewBuffer(nil)
 	readabilityInput := bytes.NewBuffer(nil)
-	multiWriter := io.MultiWriter(archivalInput, readabilityInput)
+	readabilityCheckInput := bytes.NewBuffer(nil)
+	multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
 
 	_, err = io.Copy(multiWriter, resp.Body)
 	if err != nil {
@@ -247,6 +248,8 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
 	// If this is HTML, parse for readable content
 	contentType := resp.Header.Get("Content-Type")
 	if strings.Contains(contentType, "text/html") {
+		isReadable := readability.IsReadable(readabilityCheckInput)
+
 		article, err := readability.FromReader(readabilityInput, book.URL)
 		if err != nil {
 			return
@@ -273,6 +276,10 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
 		if article.Favicon != "" {
 			imageURLs = append(imageURLs, article.Favicon)
 		}
+
+		if !isReadable {
+			book.Content = ""
+		}
 	}
 
 	// If needed, create offline archive as well
@@ -513,7 +520,6 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http
 		book.Author = article.Byline
 		book.Content = article.TextContent
 		book.HTML = article.Content
-		book.HasContent = book.Content != "" && isReadable
 
 		if article.Title != "" {
 			book.Title = article.Title
@@ -523,6 +529,10 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http
 			book.Excerpt = article.Excerpt
 		}
 
+		if !isReadable {
+			book.Content = ""
+		}
+
 		// Get image for thumbnail and save it to local disk
 		var imageURLs []string
 		if article.Image != "" {