Now only keep text content if the page is readable

This commit is contained in:
Radhi Fadlillah 2019-08-05 20:22:27 +07:00
parent 43040a9bc4
commit af1a32ac4f
3 changed files with 28 additions and 4 deletions

View file

@ -101,7 +101,8 @@ func addHandler(cmd *cobra.Command, args []string) {
// Split response body so it can be processed twice
archivalInput := bytes.NewBuffer(nil)
readabilityInput := bytes.NewBuffer(nil)
multiWriter := io.MultiWriter(archivalInput, readabilityInput)
readabilityCheckInput := bytes.NewBuffer(nil)
multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
_, err = io.Copy(multiWriter, resp.Body)
if err != nil {
@ -112,6 +113,8 @@ func addHandler(cmd *cobra.Command, args []string) {
// If this is HTML, parse for readable content
contentType := resp.Header.Get("Content-Type")
if strings.Contains(contentType, "text/html") {
isReadable := readability.IsReadable(readabilityCheckInput)
article, err := readability.FromReader(readabilityInput, url)
if err != nil {
cError.Printf("Failed to parse article: %v\n", err)
@ -131,6 +134,10 @@ func addHandler(cmd *cobra.Command, args []string) {
book.Excerpt = article.Excerpt
}
if !isReadable {
book.Content = ""
}
// Get image URL
if article.Image != "" {
imageURLs = append(imageURLs, article.Image)

View file

@ -185,7 +185,8 @@ func updateHandler(cmd *cobra.Command, args []string) {
// Split response body so it can be processed twice
archivalInput := bytes.NewBuffer(nil)
readabilityInput := bytes.NewBuffer(nil)
multiWriter := io.MultiWriter(archivalInput, readabilityInput)
readabilityCheckInput := bytes.NewBuffer(nil)
multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
_, err = io.Copy(multiWriter, resp.Body)
if err != nil {
@ -197,6 +198,8 @@ func updateHandler(cmd *cobra.Command, args []string) {
// If this is HTML, parse for readable content
contentType := resp.Header.Get("Content-Type")
if strings.Contains(contentType, "text/html") {
isReadable := readability.IsReadable(readabilityCheckInput)
article, err := readability.FromReader(readabilityInput, book.URL)
if err != nil {
chProblem <- book.ID
@ -208,6 +211,10 @@ func updateHandler(cmd *cobra.Command, args []string) {
book.Content = article.TextContent
book.HTML = article.Content
if !isReadable {
book.Content = ""
}
if !dontOverwrite {
book.Title = article.Title
book.Excerpt = article.Excerpt

View file

@ -237,7 +237,8 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
// Split response body so it can be processed twice
archivalInput := bytes.NewBuffer(nil)
readabilityInput := bytes.NewBuffer(nil)
multiWriter := io.MultiWriter(archivalInput, readabilityInput)
readabilityCheckInput := bytes.NewBuffer(nil)
multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
_, err = io.Copy(multiWriter, resp.Body)
if err != nil {
@ -247,6 +248,8 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
// If this is HTML, parse for readable content
contentType := resp.Header.Get("Content-Type")
if strings.Contains(contentType, "text/html") {
isReadable := readability.IsReadable(readabilityCheckInput)
article, err := readability.FromReader(readabilityInput, book.URL)
if err != nil {
return
@ -273,6 +276,10 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
if article.Favicon != "" {
imageURLs = append(imageURLs, article.Favicon)
}
if !isReadable {
book.Content = ""
}
}
// If needed, create offline archive as well
@ -513,7 +520,6 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http
book.Author = article.Byline
book.Content = article.TextContent
book.HTML = article.Content
book.HasContent = book.Content != "" && isReadable
if article.Title != "" {
book.Title = article.Title
@ -523,6 +529,10 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http
book.Excerpt = article.Excerpt
}
if !isReadable {
book.Content = ""
}
// Get image for thumbnail and save it to local disk
var imageURLs []string
if article.Image != "" {