Remove repeated code for archiving bookmarks

Radhi Fadlillah 2019-09-20 16:48:57 +07:00
parent 2da0c7e297
commit 64c62d6b12
11 changed files with 425 additions and 766 deletions

View file

@@ -1,18 +1,10 @@
 package cmd
 
 import (
-    "bytes"
     "fmt"
-    "io"
-    "net/http"
-    nurl "net/url"
-    fp "path/filepath"
     "strings"
-    "time"
 
-    "github.com/go-shiori/go-readability"
+    "github.com/go-shiori/shiori/internal/core"
     "github.com/go-shiori/shiori/internal/model"
-    "github.com/go-shiori/shiori/pkg/warc"
     "github.com/spf13/cobra"
 )
@@ -45,28 +37,16 @@ func addHandler(cmd *cobra.Command, args []string) {
     noArchival, _ := cmd.Flags().GetBool("no-archival")
     logArchival, _ := cmd.Flags().GetBool("log-archival")
 
-    // Clean up URL by removing its fragment and UTM parameters
-    tmp, err := nurl.Parse(url)
-    if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
-        cError.Println("URL is not valid")
-        return
-    }
-
-    tmp.Fragment = ""
-    clearUTMParams(tmp)
-
     // Create bookmark item
     book := model.Bookmark{
-        URL:     tmp.String(),
+        URL:     url,
         Title:   normalizeSpace(title),
         Excerpt: normalizeSpace(excerpt),
+        CreateArchive: !noArchival,
     }
 
-    // Create bookmark ID
-    book.ID, err = db.CreateNewID("bookmark")
-    if err != nil {
-        cError.Printf("Failed to create ID: %v\n", err)
-        return
+    if book.Title == "" {
+        book.Title = book.URL
     }
 
     // Set bookmark tags
@@ -75,101 +55,51 @@ func addHandler(cmd *cobra.Command, args []string) {
         book.Tags[i].Name = strings.TrimSpace(tag)
     }
 
-    // If it's not offline mode, fetch data from internet
-    var imageURLs []string
-
-    if !offline {
-        func() {
-            cInfo.Println("Downloading article...")
-
-            // Prepare download request
-            req, err := http.NewRequest("GET", url, nil)
-            if err != nil {
-                cError.Printf("Failed to download article: %v\n", err)
-                return
-            }
-
-            // Send download request
-            req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
-            resp, err := httpClient.Do(req)
-            if err != nil {
-                cError.Printf("Failed to download article: %v\n", err)
-                return
-            }
-            defer resp.Body.Close()
-
-            // Split response body so it can be processed twice
-            archivalInput := bytes.NewBuffer(nil)
-            readabilityInput := bytes.NewBuffer(nil)
-            readabilityCheckInput := bytes.NewBuffer(nil)
-            multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
-
-            _, err = io.Copy(multiWriter, resp.Body)
-            if err != nil {
-                cError.Printf("Failed to process article: %v\n", err)
-                return
-            }
-
-            // If this is HTML, parse for readable content
-            contentType := resp.Header.Get("Content-Type")
-            if strings.Contains(contentType, "text/html") {
-                isReadable := readability.IsReadable(readabilityCheckInput)
-
-                article, err := readability.FromReader(readabilityInput, url)
-                if err != nil {
-                    cError.Printf("Failed to parse article: %v\n", err)
-                    return
-                }
-
-                book.Author = article.Byline
-                book.Content = article.TextContent
-                book.HTML = article.Content
-
-                // If title and excerpt doesnt have submitted value, use from article
-                if book.Title == "" {
-                    book.Title = article.Title
-                }
-
-                if book.Excerpt == "" {
-                    book.Excerpt = article.Excerpt
-                }
-
-                if !isReadable {
-                    book.Content = ""
-                }
-
-                // Get image URL
-                if article.Image != "" {
-                    imageURLs = append(imageURLs, article.Image)
-                }
-
-                if article.Favicon != "" {
-                    imageURLs = append(imageURLs, article.Favicon)
-                }
-            }
-
-            // If needed, create offline archive as well
-            if !noArchival {
-                archivePath := fp.Join(dataDir, "archive", fmt.Sprintf("%d", book.ID))
-                archivalRequest := warc.ArchivalRequest{
-                    URL:         url,
-                    Reader:      archivalInput,
-                    ContentType: contentType,
-                    LogEnabled:  logArchival,
-                }
-
-                err = warc.NewArchive(archivalRequest, archivePath)
-                if err != nil {
-                    cError.Printf("Failed to create archive: %v\n", err)
-                    return
-                }
-            }
-        }()
+    // Create bookmark ID
+    var err error
+    book.ID, err = db.CreateNewID("bookmark")
+    if err != nil {
+        cError.Printf("Failed to create ID: %v\n", err)
+        return
     }
 
-    // Make sure title is not empty
-    if book.Title == "" {
-        book.Title = book.URL
+    // Clean up bookmark URL
+    book.URL, err = core.RemoveUTMParams(book.URL)
+    if err != nil {
+        cError.Printf("Failed to clean URL: %v\n", err)
+        return
+    }
+
+    // If it's not offline mode, fetch data from internet.
+    if !offline {
+        cInfo.Println("Downloading article...")
+
+        var isFatalErr bool
+        content, contentType, err := core.DownloadBookmark(book.URL)
+        if err != nil {
+            cError.Printf("Failed to download: %v\n", err)
+        }
+
+        if err == nil && content != nil {
+            request := core.ProcessRequest{
+                DataDir:     dataDir,
+                Bookmark:    book,
+                Content:     content,
+                ContentType: contentType,
+                LogArchival: logArchival,
+            }
+
+            book, isFatalErr, err = core.ProcessBookmark(request)
+            content.Close()
+
+            if err != nil {
+                cError.Printf("Failed: %v\n", err)
+            }
+
+            if isFatalErr {
+                return
+            }
+        }
     }
 
     // Save bookmark to database
@@ -179,18 +109,6 @@ func addHandler(cmd *cobra.Command, args []string) {
         return
     }
 
-    // Save article image to local disk
-    imgPath := fp.Join(dataDir, "thumb", fmt.Sprintf("%d", book.ID))
-    for _, imageURL := range imageURLs {
-        err = downloadBookImage(imageURL, imgPath, time.Minute)
-        if err == nil {
-            break
-        } else {
-            cError.Printf("Failed to download image: %v\n", err)
-            continue
-        }
-    }
-
     // Print added bookmark
     fmt.Println()
     printBookmarks(book)

View file

@@ -2,11 +2,11 @@ package cmd
 
 import (
     "fmt"
-    nurl "net/url"
     "os"
     "strings"
 
     "github.com/PuerkitoBio/goquery"
+    "github.com/go-shiori/shiori/internal/core"
     "github.com/go-shiori/shiori/internal/model"
     "github.com/spf13/cobra"
 )
@@ -73,17 +73,14 @@ func importHandler(cmd *cobra.Command, args []string) {
         url, _ := a.Attr("href")
         strTags, _ := a.Attr("tags")
 
-        // Clean up URL by removing its fragment and UTM parameters
-        tmp, err := nurl.Parse(url)
-        if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
+        // Clean up URL
+        var err error
+        url, err = core.RemoveUTMParams(url)
+        if err != nil {
             cError.Printf("Skip %s: URL is not valid\n", url)
             return
         }
 
-        tmp.Fragment = ""
-        clearUTMParams(tmp)
-        url = tmp.String()
-
         // Make sure title is valid Utf-8
         title = toValidUtf8(title, url)
View file

@@ -2,13 +2,13 @@ package cmd
 
 import (
     "fmt"
-    nurl "net/url"
     "os"
     "strconv"
     "strings"
     "time"
 
     "github.com/PuerkitoBio/goquery"
+    "github.com/go-shiori/shiori/internal/core"
    "github.com/go-shiori/shiori/internal/model"
     "github.com/spf13/cobra"
 )
@@ -59,17 +59,14 @@ func pocketHandler(cmd *cobra.Command, args []string) {
         intModified, _ := strconv.ParseInt(strModified, 10, 64)
         modified := time.Unix(intModified, 0)
 
-        // Clean up URL by removing its fragment and UTM parameters
-        tmp, err := nurl.Parse(url)
-        if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
+        // Clean up URL
+        var err error
+        url, err = core.RemoveUTMParams(url)
+        if err != nil {
             cError.Printf("Skip %s: URL is not valid\n", url)
             return
         }
 
-        tmp.Fragment = ""
-        clearUTMParams(tmp)
-        url = tmp.String()
-
         // Make sure title is valid Utf-8
         title = toValidUtf8(title, url)

View file

@@ -1,22 +1,14 @@
 package cmd
 
 import (
-    "bytes"
     "fmt"
-    "io"
-    "net/http"
-    nurl "net/url"
-    "os"
-    fp "path/filepath"
     "sort"
     "strings"
     "sync"
-    "time"
 
-    "github.com/go-shiori/go-readability"
+    "github.com/go-shiori/shiori/internal/core"
     "github.com/go-shiori/shiori/internal/database"
     "github.com/go-shiori/shiori/internal/model"
-    "github.com/go-shiori/shiori/pkg/warc"
     "github.com/spf13/cobra"
 )
@@ -83,17 +75,12 @@ func updateHandler(cmd *cobra.Command, args []string) {
     excerpt = normalizeSpace(excerpt)
 
     if cmd.Flags().Changed("url") {
-        // Clean up URL by removing its fragment and UTM parameters
-        tmp, err := nurl.Parse(url)
-        if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
-            cError.Println("URL is not valid")
-            return
+        // Clean up bookmark URL
+        url, err = core.RemoveUTMParams(url)
+        if err != nil {
+            panic(fmt.Errorf("failed to clean URL: %v", err))
         }
 
-        tmp.Fragment = ""
-        clearUTMParams(tmp)
-        url = tmp.String()
-
         // Since user uses custom URL, make sure there is only one ID to update
         if len(ids) != 1 {
             cError.Println("Update only accepts one index while using --url flag")
@@ -149,6 +136,9 @@ func updateHandler(cmd *cobra.Command, args []string) {
     for i, book := range bookmarks {
         wg.Add(1)
 
+        // Mark whether book will be archived
+        book.CreateArchive = !noArchival
+
         // If used, use submitted URL
         if url != "" {
             book.URL = url
@@ -164,102 +154,32 @@ func updateHandler(cmd *cobra.Command, args []string) {
                 <-semaphore
             }()
 
-            // Prepare download request
-            req, err := http.NewRequest("GET", book.URL, nil)
+            // Download data from internet
+            content, contentType, err := core.DownloadBookmark(book.URL)
             if err != nil {
                 chProblem <- book.ID
                 chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
                 return
             }
 
-            // Send download request
-            req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
-            resp, err := httpClient.Do(req)
-            if err != nil {
-                chProblem <- book.ID
-                chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
-                return
+            request := core.ProcessRequest{
+                DataDir:      dataDir,
+                Bookmark:     book,
+                Content:      content,
+                ContentType:  contentType,
+                KeepMetadata: keepMetadata,
+                LogArchival:  logArchival,
             }
-            defer resp.Body.Close()
 
-            // Split response body so it can be processed twice
-            archivalInput := bytes.NewBuffer(nil)
-            readabilityInput := bytes.NewBuffer(nil)
-            readabilityCheckInput := bytes.NewBuffer(nil)
-            multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
-
-            _, err = io.Copy(multiWriter, resp.Body)
+            book, _, err = core.ProcessBookmark(request)
+            content.Close()
+
             if err != nil {
                 chProblem <- book.ID
                 chMessage <- fmt.Errorf("Failed to process %s: %v", book.URL, err)
                 return
             }
 
-            // If this is HTML, parse for readable content
-            contentType := resp.Header.Get("Content-Type")
-            if strings.Contains(contentType, "text/html") {
-                isReadable := readability.IsReadable(readabilityCheckInput)
-
-                article, err := readability.FromReader(readabilityInput, book.URL)
-                if err != nil {
-                    chProblem <- book.ID
-                    chMessage <- fmt.Errorf("Failed to parse %s: %v", book.URL, err)
-                    return
-                }
-
-                book.Author = article.Byline
-                book.Content = article.TextContent
-                book.HTML = article.Content
-
-                if !isReadable {
-                    book.Content = ""
-                }
-
-                if !keepMetadata {
-                    book.Title = article.Title
-                    book.Excerpt = article.Excerpt
-                }
-
-                // Get image for thumbnail and save it to local disk
-                var imageURLs []string
-                if article.Image != "" {
-                    imageURLs = append(imageURLs, article.Image)
-                }
-
-                if article.Favicon != "" {
-                    imageURLs = append(imageURLs, article.Favicon)
-                }
-
-                imgPath := fp.Join(dataDir, "thumb", fmt.Sprintf("%d", book.ID))
-                for _, imageURL := range imageURLs {
-                    err = downloadBookImage(imageURL, imgPath, time.Minute)
-                    if err == nil {
-                        break
-                    }
-                }
-            }
-
-            // If needed, update offline archive as well.
-            // Make sure to delete the old one first.
-            if !noArchival {
-                archivePath := fp.Join(dataDir, "archive", fmt.Sprintf("%d", book.ID))
-                os.Remove(archivePath)
-
-                archivalRequest := warc.ArchivalRequest{
-                    URL:         book.URL,
-                    Reader:      archivalInput,
-                    ContentType: contentType,
-                    LogEnabled:  logArchival,
-                }
-
-                err = warc.NewArchive(archivalRequest, archivePath)
-                if err != nil {
-                    chProblem <- book.ID
-                    chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err)
-                    return
-                }
-            }
-
             // Send success message
             chMessage <- fmt.Sprintf("Downloaded %s", book.URL)

View file

@@ -3,29 +3,17 @@ package cmd
 import (
     "errors"
     "fmt"
-    "image"
-    clr "image/color"
-    "image/draw"
-    "image/jpeg"
-    "math"
-    "net/http"
     nurl "net/url"
     "os"
     "os/exec"
-    fp "path/filepath"
     "runtime"
     "strconv"
     "strings"
-    "time"
     "unicode/utf8"
 
-    "github.com/disintegration/imaging"
     "github.com/fatih/color"
     "github.com/go-shiori/shiori/internal/model"
     "golang.org/x/crypto/ssh/terminal"
-
-    // Add supports for PNG image
-    _ "image/png"
 )
 
 var (
@@ -54,95 +42,6 @@ func isURLValid(s string) bool {
     return err == nil && tmp.Scheme != "" && tmp.Hostname() != ""
 }
 
-func clearUTMParams(url *nurl.URL) {
-    queries := url.Query()
-    for key := range queries {
-        if strings.HasPrefix(key, "utm_") {
-            queries.Del(key)
-        }
-    }
-
-    url.RawQuery = queries.Encode()
-}
-
-func downloadBookImage(url, dstPath string, timeout time.Duration) error {
-    // Fetch data from URL
-    client := &http.Client{Timeout: timeout}
-    resp, err := client.Get(url)
-    if err != nil {
-        return err
-    }
-    defer resp.Body.Close()
-
-    // Make sure it's JPG or PNG image
-    cp := resp.Header.Get("Content-Type")
-    if !strings.Contains(cp, "image/jpeg") && !strings.Contains(cp, "image/png") {
-        return fmt.Errorf("%s is not a supported image", url)
-    }
-
-    // At this point, the download has finished successfully.
-    // Prepare destination file.
-    err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
-    if err != nil {
-        return fmt.Errorf("failed to create image dir: %v", err)
-    }
-
-    dstFile, err := os.Create(dstPath)
-    if err != nil {
-        return fmt.Errorf("failed to create image file: %v", err)
-    }
-    defer dstFile.Close()
-
-    // Parse image and process it.
-    // If image is smaller than 600x400 or its ratio is less than 4:3, resize.
-    // Else, save it as it is.
-    img, _, err := image.Decode(resp.Body)
-    if err != nil {
-        return fmt.Errorf("failed to parse image %s: %v", url, err)
-    }
-
-    imgRect := img.Bounds()
-    imgWidth := imgRect.Dx()
-    imgHeight := imgRect.Dy()
-    imgRatio := float64(imgWidth) / float64(imgHeight)
-
-    if imgWidth >= 600 && imgHeight >= 400 && imgRatio > 1.3 {
-        err = jpeg.Encode(dstFile, img, nil)
-    } else {
-        // Create background
-        bg := image.NewNRGBA(imgRect)
-        draw.Draw(bg, imgRect, image.NewUniform(clr.White), image.Point{}, draw.Src)
-        draw.Draw(bg, imgRect, img, image.Point{}, draw.Over)
-
-        bg = imaging.Fill(bg, 600, 400, imaging.Center, imaging.Lanczos)
-        bg = imaging.Blur(bg, 150)
-        bg = imaging.AdjustBrightness(bg, 30)
-
-        // Create foreground
-        fg := imaging.Fit(img, 600, 400, imaging.Lanczos)
-
-        // Merge foreground and background
-        bgRect := bg.Bounds()
-        fgRect := fg.Bounds()
-        fgPosition := image.Point{
-            X: bgRect.Min.X - int(math.Round(float64(bgRect.Dx()-fgRect.Dx())/2)),
-            Y: bgRect.Min.Y - int(math.Round(float64(bgRect.Dy()-fgRect.Dy())/2)),
-        }
-
-        draw.Draw(bg, bgRect, fg, fgPosition, draw.Over)
-
-        // Save to file
-        err = jpeg.Encode(dstFile, bg, nil)
-    }
-
-    if err != nil {
-        return fmt.Errorf("failed to save image %s: %v", url, err)
-    }
-
-    return nil
-}
-
 func printBookmarks(bookmarks ...model.Bookmark) {
     for _, bookmark := range bookmarks {
         // Create bookmark index

internal/core/download.go (new file, 31 additions)
View file

@@ -0,0 +1,31 @@
package core

import (
    "io"
    "net/http"
    "time"
)

var httpClient = &http.Client{Timeout: time.Minute}

// DownloadBookmark downloads bookmarked page from specified URL.
// It returns the response body; make sure to close it later.
func DownloadBookmark(url string) (io.ReadCloser, string, error) {
    // Prepare download request
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        return nil, "", err
    }

    // Send download request
    req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
    resp, err := httpClient.Do(req)
    if err != nil {
        return nil, "", err
    }

    // Get content type
    contentType := resp.Header.Get("Content-Type")
    return resp.Body, contentType, nil
}
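For reference, a minimal caller of the new helper would look like the sketch below. This is not part of the commit: the URL is invented, and because internal/core is an internal package, such a caller has to live inside the shiori module.

package main

import (
    "fmt"
    "log"

    "github.com/go-shiori/shiori/internal/core"
)

func main() {
    // Hypothetical URL; DownloadBookmark hands back the raw response body,
    // so the caller owns it and must close it once processing is done.
    content, contentType, err := core.DownloadBookmark("https://example.com/article")
    if err != nil {
        log.Fatalf("download failed: %v", err)
    }
    defer content.Close()

    fmt.Println("content type:", contentType)
}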

internal/core/processing.go (new file, 218 additions)
View file

@@ -0,0 +1,218 @@
package core

import (
    "bytes"
    "fmt"
    "image"
    "image/color"
    "image/draw"
    "image/jpeg"
    "io"
    "math"
    "os"
    "path"
    fp "path/filepath"
    "strconv"
    "strings"

    "github.com/disintegration/imaging"
    "github.com/go-shiori/go-readability"
    "github.com/go-shiori/shiori/internal/model"
    "github.com/go-shiori/shiori/pkg/warc"

    // Add support for png
    _ "image/png"
)

// ProcessRequest is the request for processing a bookmark.
type ProcessRequest struct {
    DataDir      string
    Bookmark     model.Bookmark
    Content      io.Reader
    ContentType  string
    KeepMetadata bool
    LogArchival  bool
}

// ProcessBookmark processes the bookmark and archives it if needed.
// It returns three values: the bookmark itself, whether the error is fatal, and the error value.
func ProcessBookmark(req ProcessRequest) (model.Bookmark, bool, error) {
    book := req.Bookmark
    contentType := req.ContentType

    // Make sure bookmark ID is defined
    if book.ID == 0 {
        return book, true, fmt.Errorf("bookmark ID is not valid")
    }

    // Split bookmark content so it can be processed several times
    archivalInput := bytes.NewBuffer(nil)
    readabilityInput := bytes.NewBuffer(nil)
    readabilityCheckInput := bytes.NewBuffer(nil)

    var multiWriter io.Writer
    if !strings.Contains(contentType, "text/html") {
        multiWriter = io.MultiWriter(archivalInput)
    } else {
        multiWriter = io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
    }

    _, err := io.Copy(multiWriter, req.Content)
    if err != nil {
        return book, false, fmt.Errorf("failed to process article: %v", err)
    }

    // If this is HTML, parse for readable content
    var imageURLs []string
    if strings.Contains(contentType, "text/html") {
        isReadable := readability.IsReadable(readabilityCheckInput)

        article, err := readability.FromReader(readabilityInput, book.URL)
        if err != nil {
            return book, false, fmt.Errorf("failed to parse article: %v", err)
        }

        book.Author = article.Byline
        book.Content = article.TextContent
        book.HTML = article.Content

        // If title and excerpt don't have submitted values, use the ones from article
        if !req.KeepMetadata || book.Title == "" {
            book.Title = article.Title
        }

        if !req.KeepMetadata || book.Excerpt == "" {
            book.Excerpt = article.Excerpt
        }

        // Sometimes article doesn't have any title, so make sure it is not empty
        if book.Title == "" {
            book.Title = book.URL
        }

        // Get image URL
        if article.Image != "" {
            imageURLs = append(imageURLs, article.Image)
        }

        if article.Favicon != "" {
            imageURLs = append(imageURLs, article.Favicon)
        }

        if !isReadable {
            book.Content = ""
        }

        book.HasContent = book.Content != ""
    }

    // Save article image to local disk
    strID := strconv.Itoa(book.ID)
    imgPath := fp.Join(req.DataDir, "thumb", strID)
    for _, imageURL := range imageURLs {
        err = downloadBookImage(imageURL, imgPath)
        if err == nil {
            book.ImageURL = path.Join("/", "bookmark", strID, "thumb")
            break
        }
    }

    // If needed, create offline archive as well
    if book.CreateArchive {
        archivePath := fp.Join(req.DataDir, "archive", fmt.Sprintf("%d", book.ID))
        os.Remove(archivePath)

        archivalRequest := warc.ArchivalRequest{
            URL:         book.URL,
            Reader:      archivalInput,
            ContentType: contentType,
            LogEnabled:  req.LogArchival,
        }

        err = warc.NewArchive(archivalRequest, archivePath)
        if err != nil {
            return book, false, fmt.Errorf("failed to create archive: %v", err)
        }

        book.HasArchive = true
    }

    return book, false, nil
}

func downloadBookImage(url, dstPath string) error {
    // Fetch data from URL
    resp, err := httpClient.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()

    // Make sure it's JPG or PNG image
    cp := resp.Header.Get("Content-Type")
    if !strings.Contains(cp, "image/jpeg") && !strings.Contains(cp, "image/png") {
        return fmt.Errorf("%s is not a supported image", url)
    }

    // At this point, the download has finished successfully.
    // Prepare destination file.
    err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
    if err != nil {
        return fmt.Errorf("failed to create image dir: %v", err)
    }

    dstFile, err := os.Create(dstPath)
    if err != nil {
        return fmt.Errorf("failed to create image file: %v", err)
    }
    defer dstFile.Close()

    // Parse image and process it.
    // If image is smaller than 600x400 or its ratio is less than 4:3, resize.
    // Else, save it as it is.
    img, _, err := image.Decode(resp.Body)
    if err != nil {
        return fmt.Errorf("failed to parse image %s: %v", url, err)
    }

    imgRect := img.Bounds()
    imgWidth := imgRect.Dx()
    imgHeight := imgRect.Dy()
    imgRatio := float64(imgWidth) / float64(imgHeight)

    if imgWidth >= 600 && imgHeight >= 400 && imgRatio > 1.3 {
        err = jpeg.Encode(dstFile, img, nil)
    } else {
        // Create background
        bg := image.NewNRGBA(imgRect)
        draw.Draw(bg, imgRect, image.NewUniform(color.White), image.Point{}, draw.Src)
        draw.Draw(bg, imgRect, img, image.Point{}, draw.Over)

        bg = imaging.Fill(bg, 600, 400, imaging.Center, imaging.Lanczos)
        bg = imaging.Blur(bg, 150)
        bg = imaging.AdjustBrightness(bg, 30)

        // Create foreground
        fg := imaging.Fit(img, 600, 400, imaging.Lanczos)

        // Merge foreground and background
        bgRect := bg.Bounds()
        fgRect := fg.Bounds()
        fgPosition := image.Point{
            X: bgRect.Min.X - int(math.Round(float64(bgRect.Dx()-fgRect.Dx())/2)),
            Y: bgRect.Min.Y - int(math.Round(float64(bgRect.Dy()-fgRect.Dy())/2)),
        }

        draw.Draw(bg, bgRect, fg, fgPosition, draw.Over)

        // Save to file
        err = jpeg.Encode(dstFile, bg, nil)
    }

    if err != nil {
        return fmt.Errorf("failed to save image %s: %v", url, err)
    }

    return nil
}
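For orientation, a minimal sketch of how the two new entry points compose, mirroring the call sites updated in this commit (not part of the diff itself; the data directory is invented, and the bookmark needs a non-zero ID or ProcessBookmark returns a fatal error):

package main

import (
    "log"

    "github.com/go-shiori/shiori/internal/core"
    "github.com/go-shiori/shiori/internal/model"
)

func main() {
    // A bookmark with a valid ID; CreateArchive asks ProcessBookmark
    // to also write a WARC archive under <DataDir>/archive/<ID>.
    book := model.Bookmark{
        ID:            1,
        URL:           "https://example.com/article",
        CreateArchive: true,
    }

    content, contentType, err := core.DownloadBookmark(book.URL)
    if err != nil {
        log.Fatalf("download failed: %v", err)
    }

    request := core.ProcessRequest{
        DataDir:     "/tmp/shiori-data", // hypothetical data directory
        Bookmark:    book,
        Content:     content,
        ContentType: contentType,
    }

    book, isFatalErr, err := core.ProcessBookmark(request)
    content.Close()

    if err != nil {
        log.Printf("processing failed: %v (fatal: %v)", err, isFatalErr)
    }
}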

internal/core/url.go (new file, 28 additions)
View file

@@ -0,0 +1,28 @@
package core

import (
    "fmt"
    nurl "net/url"
    "strings"
)

// RemoveUTMParams removes the fragment and UTM parameters from the URL.
func RemoveUTMParams(url string) (string, error) {
    // Parse string URL
    tmp, err := nurl.Parse(url)
    if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
        return url, fmt.Errorf("URL is not valid")
    }

    // Remove UTM queries
    queries := tmp.Query()
    for key := range queries {
        if strings.HasPrefix(key, "utm_") {
            queries.Del(key)
        }
    }

    tmp.Fragment = ""
    tmp.RawQuery = queries.Encode()
    return tmp.String(), nil
}
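To make the cleanup behavior concrete, a small usage sketch (the sample URL is invented): utm_* parameters and the fragment are stripped, while other query parameters survive.

package main

import (
    "fmt"
    "log"

    "github.com/go-shiori/shiori/internal/core"
)

func main() {
    cleaned, err := core.RemoveUTMParams(
        "https://example.com/post?id=42&utm_source=feed&utm_medium=rss#comments")
    if err != nil {
        log.Fatal(err)
    }

    // Prints: https://example.com/post?id=42
    fmt.Println(cleaned)
}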

View file

@@ -6,17 +6,12 @@ import (
     "fmt"
     "io"
     "net/http"
-    nurl "net/url"
     "os"
-    "path"
     fp "path/filepath"
     "strconv"
-    "strings"
-    "time"
 
-    "github.com/go-shiori/go-readability"
+    "github.com/go-shiori/shiori/internal/core"
     "github.com/go-shiori/shiori/internal/model"
-    "github.com/go-shiori/shiori/pkg/warc"
     "github.com/julienschmidt/httprouter"
 )
@@ -31,18 +26,15 @@ func (h *handler) apiInsertViaExtension(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
     err = json.NewDecoder(r.Body).Decode(&request)
     checkError(err)
 
-    // Clean up URL by removing its fragment and UTM parameters
-    tmp, err := nurl.Parse(request.URL)
-    if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
-        panic(fmt.Errorf("URL is not valid"))
+    // Clean up bookmark URL
+    request.URL, err = core.RemoveUTMParams(request.URL)
+    if err != nil {
+        panic(fmt.Errorf("failed to clean URL: %v", err))
     }
 
-    tmp.Fragment = ""
-    clearUTMParams(tmp)
-    request.URL = tmp.String()
-
     // Check if bookmark already exists.
     book, exist := h.DB.GetBookmark(0, request.URL)
+    book.CreateArchive = true
 
     // If it already exists, we need to set ID and tags.
     if exist {
@@ -69,119 +61,37 @@ func (h *handler) apiInsertViaExtension(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
     // Since we are using extension, the extension might send the HTML content
     // so no need to download it again here. However, if it's empty, it might be not HTML file
     // so we download it here.
-    contentType := "text/html; charset=UTF-8"
-    contentBuffer := bytes.NewBufferString(book.HTML)
+    var contentType string
+    var contentBuffer io.Reader
+
     if book.HTML == "" {
-        func() {
-            // Prepare download request
-            req, err := http.NewRequest("GET", book.URL, nil)
-            if err != nil {
-                return
-            }
-
-            // Send download request
-            req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
-            resp, err := httpClient.Do(req)
-            if err != nil {
-                return
-            }
-            defer resp.Body.Close()
-
-            // Save response for later use
-            contentType = resp.Header.Get("Content-Type")
-
-            contentBuffer.Reset()
-            _, err = io.Copy(contentBuffer, resp.Body)
-            if err != nil {
-                return
-            }
-        }()
+        contentBuffer, contentType, _ = core.DownloadBookmark(book.URL)
+    } else {
+        contentType = "text/html; charset=UTF-8"
+        contentBuffer = bytes.NewBufferString(book.HTML)
     }
 
     // At this point the web page already downloaded.
     // Time to process it.
-    func() {
-        // Split response so it can be processed several times
-        archivalInput := bytes.NewBuffer(nil)
-        readabilityInput := bytes.NewBuffer(nil)
-        readabilityCheckInput := bytes.NewBuffer(nil)
-        multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
-
-        _, err = io.Copy(multiWriter, contentBuffer)
-        if err != nil {
-            return
-        }
-
-        // If it's HTML, parse the readable content.
-        if strings.Contains(contentType, "text/html") {
-            isReadable := readability.IsReadable(readabilityCheckInput)
-
-            article, err := readability.FromReader(readabilityInput, book.URL)
-            if err != nil {
-                return
-            }
-
-            book.Author = article.Byline
-            book.Content = article.TextContent
-            book.HTML = article.Content
-
-            if book.Title == "" {
-                if article.Title == "" {
-                    book.Title = book.URL
-                } else {
-                    book.Title = article.Title
-                }
-            }
-
-            if book.Excerpt == "" {
-                book.Excerpt = article.Excerpt
-            }
-
-            if !isReadable {
-                book.Content = ""
-            }
-
-            book.HasContent = book.Content != ""
-
-            // Get image for thumbnail and save it to local disk
-            var imageURLs []string
-            if article.Image != "" {
-                imageURLs = append(imageURLs, article.Image)
-            }
-
-            if article.Favicon != "" {
-                imageURLs = append(imageURLs, article.Favicon)
-            }
-
-            // Save article image to local disk
-            strID := strconv.Itoa(book.ID)
-            imgPath := fp.Join(h.DataDir, "thumb", strID)
-            for _, imageURL := range imageURLs {
-                err = downloadBookImage(imageURL, imgPath, time.Minute)
-                if err == nil {
-                    book.ImageURL = path.Join("/", "bookmark", strID, "thumb")
-                    break
-                }
-            }
-        }
-
-        // Create offline archive as well
-        archivePath := fp.Join(h.DataDir, "archive", fmt.Sprintf("%d", book.ID))
-        os.Remove(archivePath)
-
-        archivalRequest := warc.ArchivalRequest{
-            URL:         book.URL,
-            Reader:      archivalInput,
-            ContentType: contentType,
-        }
-
-        err = warc.NewArchive(archivalRequest, archivePath)
-        if err != nil {
-            return
-        }
-
-        book.HasArchive = true
-    }()
+    if contentBuffer != nil {
+        request := core.ProcessRequest{
+            DataDir:     h.DataDir,
+            Bookmark:    book,
+            Content:     contentBuffer,
+            ContentType: contentType,
+        }
+
+        var isFatalErr bool
+        book, isFatalErr, err = core.ProcessBookmark(request)
+
+        if tmp, ok := contentBuffer.(io.ReadCloser); ok {
+            tmp.Close()
+        }
+
+        if err != nil && isFatalErr {
+            panic(fmt.Errorf("failed to process bookmark: %v", err))
+        }
+    }
 
     // Save bookmark to database
     results, err := h.DB.SaveBookmarks(book)
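One detail worth noting in the handler above: contentBuffer is an io.Reader that is only sometimes an io.ReadCloser (a downloaded body is, a bytes.Buffer built from extension-supplied HTML is not), hence the type assertion before closing. A standalone sketch of that pattern, separate from the commit:

package main

import (
    "bytes"
    "fmt"
    "io"
    "io/ioutil"
    "strings"
)

func closeIfCloser(r io.Reader) {
    // Close only when the reader actually carries a Close method.
    if c, ok := r.(io.ReadCloser); ok {
        c.Close()
        fmt.Println("closed")
        return
    }
    fmt.Println("nothing to close")
}

func main() {
    closeIfCloser(bytes.NewBufferString("<html></html>"))      // nothing to close
    closeIfCloser(ioutil.NopCloser(strings.NewReader("body"))) // closed
}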

View file

@@ -1,13 +1,10 @@
 package webserver
 
 import (
-    "bytes"
     "encoding/json"
     "fmt"
-    "io"
     "math"
     "net/http"
-    nurl "net/url"
     "os"
     "path"
     fp "path/filepath"
@@ -16,10 +13,9 @@ import (
     "sync"
     "time"
 
-    "github.com/go-shiori/go-readability"
+    "github.com/go-shiori/shiori/internal/core"
     "github.com/go-shiori/shiori/internal/database"
     "github.com/go-shiori/shiori/internal/model"
-    "github.com/go-shiori/shiori/pkg/warc"
     "github.com/gofrs/uuid"
     "github.com/julienschmidt/httprouter"
     "golang.org/x/crypto/bcrypt"
@@ -251,112 +247,35 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
     err = json.NewDecoder(r.Body).Decode(&book)
     checkError(err)
 
-    // Clean up URL by removing its fragment and UTM parameters
-    tmp, err := nurl.Parse(book.URL)
-    if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
-        panic(fmt.Errorf("URL is not valid"))
-    }
-
-    tmp.Fragment = ""
-    clearUTMParams(tmp)
-    book.URL = tmp.String()
-
     // Create bookmark ID
     book.ID, err = h.DB.CreateNewID("bookmark")
     if err != nil {
         panic(fmt.Errorf("failed to create ID: %v", err))
     }
 
+    // Clean up bookmark URL
+    book.URL, err = core.RemoveUTMParams(book.URL)
+    if err != nil {
+        panic(fmt.Errorf("failed to clean URL: %v", err))
+    }
+
     // Fetch data from internet
-    var imageURLs []string
-    func() {
-        // Prepare download request
-        req, err := http.NewRequest("GET", book.URL, nil)
-        if err != nil {
-            return
-        }
-
-        // Send download request
-        req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
-        resp, err := httpClient.Do(req)
-        if err != nil {
-            return
-        }
-        defer resp.Body.Close()
-
-        // Split response body so it can be processed twice
-        archivalInput := bytes.NewBuffer(nil)
-        readabilityInput := bytes.NewBuffer(nil)
-        readabilityCheckInput := bytes.NewBuffer(nil)
-        multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
-
-        _, err = io.Copy(multiWriter, resp.Body)
-        if err != nil {
-            return
-        }
-
-        // If this is HTML, parse for readable content
-        contentType := resp.Header.Get("Content-Type")
-        if strings.Contains(contentType, "text/html") {
-            isReadable := readability.IsReadable(readabilityCheckInput)
-
-            article, err := readability.FromReader(readabilityInput, book.URL)
-            if err != nil {
-                return
-            }
-
-            book.Author = article.Byline
-            book.Content = article.TextContent
-            book.HTML = article.Content
-
-            // If title and excerpt doesnt have submitted value, use from article
-            if book.Title == "" {
-                book.Title = article.Title
-            }
-
-            if book.Excerpt == "" {
-                book.Excerpt = article.Excerpt
-            }
-
-            // Get image URL
-            if article.Image != "" {
-                imageURLs = append(imageURLs, article.Image)
-            }
-
-            if article.Favicon != "" {
-                imageURLs = append(imageURLs, article.Favicon)
-            }
-
-            if !isReadable {
-                book.Content = ""
-            }
-
-            book.HasContent = book.Content != ""
-        }
-
-        // If needed, create offline archive as well
-        if book.CreateArchive {
-            archivePath := fp.Join(h.DataDir, "archive", fmt.Sprintf("%d", book.ID))
-            os.Remove(archivePath)
-
-            archivalRequest := warc.ArchivalRequest{
-                URL:         book.URL,
-                Reader:      archivalInput,
-                ContentType: contentType,
-            }
-
-            err = warc.NewArchive(archivalRequest, archivePath)
-            if err != nil {
-                return
-            }
-
-            book.HasArchive = true
-        }
-    }()
-
-    // Make sure title is not empty
-    if book.Title == "" {
-        book.Title = book.URL
+    var isFatalErr bool
+    content, contentType, err := core.DownloadBookmark(book.URL)
+
+    if err == nil && content != nil {
+        request := core.ProcessRequest{
+            DataDir:     h.DataDir,
+            Bookmark:    book,
+            Content:     content,
+            ContentType: contentType,
+        }
+
+        book, isFatalErr, err = core.ProcessBookmark(request)
+        content.Close()
+
+        if err != nil && isFatalErr {
+            panic(fmt.Errorf("failed to process bookmark: %v", err))
+        }
     }
 
     // Save bookmark to database
@@ -366,17 +285,6 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
     }
     book = results[0]
 
-    // Save article image to local disk
-    strID := strconv.Itoa(book.ID)
-    imgPath := fp.Join(h.DataDir, "thumb", strID)
-    for _, imageURL := range imageURLs {
-        err = downloadBookImage(imageURL, imgPath, time.Minute)
-        if err == nil {
-            book.ImageURL = path.Join("/", "bookmark", strID, "thumb")
-            break
-        }
-    }
-
     // Return the new bookmark
     w.Header().Set("Content-Type", "application/json")
     err = json.NewEncoder(w).Encode(&book)
@@ -446,6 +354,12 @@ func (h *handler) apiUpdateBookmark(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
     book.Excerpt = request.Excerpt
     book.Public = request.Public
 
+    // Clean up bookmark URL
+    book.URL, err = core.RemoveUTMParams(book.URL)
+    if err != nil {
+        panic(fmt.Errorf("failed to clean URL: %v", err))
+    }
+
     // Set new tags
     for i := range book.Tags {
         book.Tags[i].Deleted = true
@@ -525,6 +439,9 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
     for i, book := range bookmarks {
         wg.Add(1)
 
+        // Mark whether book will be archived
+        book.CreateArchive = request.CreateArchive
+
         go func(i int, book model.Bookmark, keepMetadata bool) {
             // Make sure to finish the WG
             defer wg.Done()
@@ -535,107 +452,28 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
                 <-semaphore
             }()
 
-            // Prepare download request
-            req, err := http.NewRequest("GET", book.URL, nil)
+            // Download data from internet
+            content, contentType, err := core.DownloadBookmark(book.URL)
             if err != nil {
                 chProblem <- book.ID
                 return
             }
 
-            // Send download request
-            req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
-            resp, err := httpClient.Do(req)
+            request := core.ProcessRequest{
+                DataDir:      h.DataDir,
+                Bookmark:     book,
+                Content:      content,
+                ContentType:  contentType,
+                KeepMetadata: keepMetadata,
+            }
+
+            book, _, err = core.ProcessBookmark(request)
+            content.Close()
+
             if err != nil {
                 chProblem <- book.ID
                 return
             }
-            defer resp.Body.Close()
 
-            // Split response body so it can be processed twice
-            archivalInput := bytes.NewBuffer(nil)
-            readabilityInput := bytes.NewBuffer(nil)
-            readabilityCheckInput := bytes.NewBuffer(nil)
-            multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
-
-            _, err = io.Copy(multiWriter, resp.Body)
-            if err != nil {
-                chProblem <- book.ID
-                return
-            }
-
-            // If this is HTML, parse for readable content
-            strID := strconv.Itoa(book.ID)
-            contentType := resp.Header.Get("Content-Type")
-            if strings.Contains(contentType, "text/html") {
-                isReadable := readability.IsReadable(readabilityCheckInput)
-
-                article, err := readability.FromReader(readabilityInput, book.URL)
-                if err != nil {
-                    chProblem <- book.ID
-                    return
-                }
-
-                book.Author = article.Byline
-                book.Content = article.TextContent
-                book.HTML = article.Content
-
-                if !isReadable {
-                    book.Content = ""
-                }
-
-                if !keepMetadata {
-                    book.Title = article.Title
-                    book.Excerpt = article.Excerpt
-                }
-
-                if book.Title == "" {
-                    book.Title = book.URL
-                }
-
-                book.HasContent = book.Content != ""
-
-                // Get image for thumbnail and save it to local disk
-                var imageURLs []string
-                if article.Image != "" {
-                    imageURLs = append(imageURLs, article.Image)
-                }
-
-                if article.Favicon != "" {
-                    imageURLs = append(imageURLs, article.Favicon)
-                }
-
-                // Save article image to local disk
-                imgPath := fp.Join(h.DataDir, "thumb", strID)
-                for _, imageURL := range imageURLs {
-                    err = downloadBookImage(imageURL, imgPath, time.Minute)
-                    if err == nil {
-                        book.ImageURL = path.Join("/", "bookmark", strID, "thumb")
-                        break
-                    }
-                }
-            }
-
-            // If needed, update offline archive as well.
-            // Make sure to delete the old one first.
-            if request.CreateArchive {
-                archivePath := fp.Join(h.DataDir, "archive", strID)
-                os.Remove(archivePath)
-
-                archivalRequest := warc.ArchivalRequest{
-                    URL:         book.URL,
-                    Reader:      archivalInput,
-                    ContentType: contentType,
-                }
-
-                err = warc.NewArchive(archivalRequest, archivePath)
-                if err != nil {
-                    chProblem <- book.ID
-                    return
-                }
-
-                book.HasArchive = true
-            }
-
             // Update list of bookmarks
             mx.Lock()

View file

@@ -3,13 +3,8 @@ package webserver
 import (
     "fmt"
     "html/template"
-    "image"
-    "image/color"
-    "image/draw"
-    "image/jpeg"
     "io"
     "io/ioutil"
-    "math"
     "mime"
     "net"
     "net/http"
@@ -19,9 +14,6 @@ import (
     "regexp"
     "strings"
     "syscall"
-    "time"
-
-    "github.com/disintegration/imaging"
 )
 
 var rxRepeatedStrip = regexp.MustCompile(`(?i)-+`)
@@ -89,95 +81,6 @@ func fileExists(filePath string) bool {
     return !os.IsNotExist(err) && !info.IsDir()
 }
 
-func clearUTMParams(url *nurl.URL) {
-    queries := url.Query()
-    for key := range queries {
-        if strings.HasPrefix(key, "utm_") {
-            queries.Del(key)
-        }
-    }
-
-    url.RawQuery = queries.Encode()
-}
-
-func downloadBookImage(url, dstPath string, timeout time.Duration) error {
-    // Fetch data from URL
-    client := &http.Client{Timeout: timeout}
-    resp, err := client.Get(url)
-    if err != nil {
-        return err
-    }
-    defer resp.Body.Close()
-
-    // Make sure it's JPG or PNG image
-    cp := resp.Header.Get("Content-Type")
-    if !strings.Contains(cp, "image/jpeg") && !strings.Contains(cp, "image/png") {
-        return fmt.Errorf("%s is not a supported image", url)
-    }
-
-    // At this point, the download has finished successfully.
-    // Prepare destination file.
-    err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
-    if err != nil {
-        return fmt.Errorf("failed to create image dir: %v", err)
-    }
-
-    dstFile, err := os.Create(dstPath)
-    if err != nil {
-        return fmt.Errorf("failed to create image file: %v", err)
-    }
-    defer dstFile.Close()
-
-    // Parse image and process it.
-    // If image is smaller than 600x400 or its ratio is less than 4:3, resize.
-    // Else, save it as it is.
-    img, _, err := image.Decode(resp.Body)
-    if err != nil {
-        return fmt.Errorf("failed to parse image %s: %v", url, err)
-    }
-
-    imgRect := img.Bounds()
-    imgWidth := imgRect.Dx()
-    imgHeight := imgRect.Dy()
-    imgRatio := float64(imgWidth) / float64(imgHeight)
-
-    if imgWidth >= 600 && imgHeight >= 400 && imgRatio > 1.3 {
-        err = jpeg.Encode(dstFile, img, nil)
-    } else {
-        // Create background
-        bg := image.NewNRGBA(imgRect)
-        draw.Draw(bg, imgRect, image.NewUniform(color.White), image.Point{}, draw.Src)
-        draw.Draw(bg, imgRect, img, image.Point{}, draw.Over)
-
-        bg = imaging.Fill(bg, 600, 400, imaging.Center, imaging.Lanczos)
-        bg = imaging.Blur(bg, 150)
-        bg = imaging.AdjustBrightness(bg, 30)
-
-        // Create foreground
-        fg := imaging.Fit(img, 600, 400, imaging.Lanczos)
-
-        // Merge foreground and background
-        bgRect := bg.Bounds()
-        fgRect := fg.Bounds()
-        fgPosition := image.Point{
-            X: bgRect.Min.X - int(math.Round(float64(bgRect.Dx()-fgRect.Dx())/2)),
-            Y: bgRect.Min.Y - int(math.Round(float64(bgRect.Dy()-fgRect.Dy())/2)),
-        }
-
-        draw.Draw(bg, bgRect, fg, fgPosition, draw.Over)
-
-        // Save to file
-        err = jpeg.Encode(dstFile, bg, nil)
-    }
-
-    if err != nil {
-        return fmt.Errorf("failed to save image %s: %v", url, err)
-    }
-
-    return nil
-}
-
 func createTemplate(filename string, funcMap template.FuncMap) (*template.Template, error) {
     // Open file
     src, err := assets.Open(filename)