Only keep the extracted text content if the page is readable

2025-02-21 22:43:22 +08:00 · 2019-08-05 20:22:27 +07:00 · 2019-08-05 20:22:27 +07:00 · af1a32ac4f
commit af1a32ac4f
parent 43040a9bc4
3 changed files with 28 additions and 4 deletions
--- a/internal/cmd/add.go
+++ b/internal/cmd/add.go
@ -101,7 +101,8 @@ func addHandler(cmd *cobra.Command, args []string) {
 			// Split response body so it can be processed twice
 			archivalInput := bytes.NewBuffer(nil)
 			readabilityInput := bytes.NewBuffer(nil)
-			multiWriter := io.MultiWriter(archivalInput, readabilityInput)
+			readabilityCheckInput := bytes.NewBuffer(nil)
+			multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)

 			_, err = io.Copy(multiWriter, resp.Body)
 			if err != nil {
@ -112,6 +113,8 @@ func addHandler(cmd *cobra.Command, args []string) {
 			// If this is HTML, parse for readable content
 			contentType := resp.Header.Get("Content-Type")
 			if strings.Contains(contentType, "text/html") {
+				isReadable := readability.IsReadable(readabilityCheckInput)
+
 				article, err := readability.FromReader(readabilityInput, url)
 				if err != nil {
 					cError.Printf("Failed to parse article: %v\n", err)
@ -131,6 +134,10 @@ func addHandler(cmd *cobra.Command, args []string) {
 					book.Excerpt = article.Excerpt
 				}

+				if !isReadable {
+					book.Content = ""
+				}
+
 				// Get image URL
 				if article.Image != "" {
 					imageURLs = append(imageURLs, article.Image)
--- a/internal/cmd/update.go
+++ b/internal/cmd/update.go
@ -185,7 +185,8 @@ func updateHandler(cmd *cobra.Command, args []string) {
 				// Split response body so it can be processed twice
 				archivalInput := bytes.NewBuffer(nil)
 				readabilityInput := bytes.NewBuffer(nil)
-				multiWriter := io.MultiWriter(archivalInput, readabilityInput)
+				readabilityCheckInput := bytes.NewBuffer(nil)
+				multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)

 				_, err = io.Copy(multiWriter, resp.Body)
 				if err != nil {
@ -197,6 +198,8 @@ func updateHandler(cmd *cobra.Command, args []string) {
 				// If this is HTML, parse for readable content
 				contentType := resp.Header.Get("Content-Type")
 				if strings.Contains(contentType, "text/html") {
+					isReadable := readability.IsReadable(readabilityCheckInput)
+
 					article, err := readability.FromReader(readabilityInput, book.URL)
 					if err != nil {
 						chProblem <- book.ID
@ -208,6 +211,10 @@ func updateHandler(cmd *cobra.Command, args []string) {
 					book.Content = article.TextContent
 					book.HTML = article.Content

+					if !isReadable {
+						book.Content = ""
+					}
+
 					if !dontOverwrite {
 						book.Title = article.Title
 						book.Excerpt = article.Excerpt
--- a/internal/webserver/handler-api.go
+++ b/internal/webserver/handler-api.go
@ -237,7 +237,8 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
 		// Split response body so it can be processed twice
 		archivalInput := bytes.NewBuffer(nil)
 		readabilityInput := bytes.NewBuffer(nil)
-		multiWriter := io.MultiWriter(archivalInput, readabilityInput)
+		readabilityCheckInput := bytes.NewBuffer(nil)
+		multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)

 		_, err = io.Copy(multiWriter, resp.Body)
 		if err != nil {
@ -247,6 +248,8 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
 		// If this is HTML, parse for readable content
 		contentType := resp.Header.Get("Content-Type")
 		if strings.Contains(contentType, "text/html") {
+			isReadable := readability.IsReadable(readabilityCheckInput)
+
 			article, err := readability.FromReader(readabilityInput, book.URL)
 			if err != nil {
 				return
@ -273,6 +276,10 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h
 			if article.Favicon != "" {
 				imageURLs = append(imageURLs, article.Favicon)
 			}
+
+			if !isReadable {
+				book.Content = ""
+			}
 		}

 		// If needed, create offline archive as well
@ -513,7 +520,6 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http
 				book.Author = article.Byline
 				book.Content = article.TextContent
 				book.HTML = article.Content
-				book.HasContent = book.Content != "" && isReadable

 				if article.Title != "" {
 					book.Title = article.Title
@ -523,6 +529,10 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http
 					book.Excerpt = article.Excerpt
 				}

+				if !isReadable {
+					book.Content = ""
+				}
+
 				// Get image for thumbnail and save it to local disk
 				var imageURLs []string
 				if article.Image != "" {