From 64c62d6b1233f4152f05f61e4d1473cb62b7a47b Mon Sep 17 00:00:00 2001 From: Radhi Fadlillah Date: Fri, 20 Sep 2019 16:48:57 +0700 Subject: [PATCH] Remove repeated code for archiving bookmarks --- internal/cmd/add.go | 182 ++++++------------- internal/cmd/import.go | 13 +- internal/cmd/pocket.go | 13 +- internal/cmd/update.go | 118 ++---------- internal/cmd/utils.go | 101 ----------- internal/core/download.go | 31 ++++ internal/core/processing.go | 218 +++++++++++++++++++++++ internal/core/url.go | 28 +++ internal/webserver/handler-api-ext.go | 144 +++------------ internal/webserver/handler-api.go | 246 +++++--------------------- internal/webserver/utils.go | 97 ---------- 11 files changed, 425 insertions(+), 766 deletions(-) create mode 100644 internal/core/download.go create mode 100644 internal/core/processing.go create mode 100644 internal/core/url.go diff --git a/internal/cmd/add.go b/internal/cmd/add.go index 8f592c9f..e7f59183 100644 --- a/internal/cmd/add.go +++ b/internal/cmd/add.go @@ -1,18 +1,10 @@ package cmd import ( - "bytes" "fmt" - "io" - "net/http" - nurl "net/url" - fp "path/filepath" "strings" - "time" - "github.com/go-shiori/shiori/pkg/warc" - - "github.com/go-shiori/go-readability" + "github.com/go-shiori/shiori/internal/core" "github.com/go-shiori/shiori/internal/model" "github.com/spf13/cobra" ) @@ -45,28 +37,16 @@ func addHandler(cmd *cobra.Command, args []string) { noArchival, _ := cmd.Flags().GetBool("no-archival") logArchival, _ := cmd.Flags().GetBool("log-archival") - // Clean up URL by removing its fragment and UTM parameters - tmp, err := nurl.Parse(url) - if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" { - cError.Println("URL is not valid") - return - } - - tmp.Fragment = "" - clearUTMParams(tmp) - // Create bookmark item book := model.Bookmark{ - URL: tmp.String(), - Title: normalizeSpace(title), - Excerpt: normalizeSpace(excerpt), + URL: url, + Title: normalizeSpace(title), + Excerpt: normalizeSpace(excerpt), + CreateArchive: !noArchival, } - // Create bookmark ID - book.ID, err = db.CreateNewID("bookmark") - if err != nil { - cError.Printf("Failed to create ID: %v\n", err) - return + if book.Title == "" { + book.Title = book.URL } // Set bookmark tags @@ -75,101 +55,51 @@ func addHandler(cmd *cobra.Command, args []string) { book.Tags[i].Name = strings.TrimSpace(tag) } - // If it's not offline mode, fetch data from internet - var imageURLs []string - - if !offline { - func() { - cInfo.Println("Downloading article...") - - // Prepare download request - req, err := http.NewRequest("GET", url, nil) - if err != nil { - cError.Printf("Failed to download article: %v\n", err) - return - } - - // Send download request - req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)") - resp, err := httpClient.Do(req) - if err != nil { - cError.Printf("Failed to download article: %v\n", err) - return - } - defer resp.Body.Close() - - // Split response body so it can be processed twice - archivalInput := bytes.NewBuffer(nil) - readabilityInput := bytes.NewBuffer(nil) - readabilityCheckInput := bytes.NewBuffer(nil) - multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput) - - _, err = io.Copy(multiWriter, resp.Body) - if err != nil { - cError.Printf("Failed to process article: %v\n", err) - return - } - - // If this is HTML, parse for readable content - contentType := resp.Header.Get("Content-Type") - if strings.Contains(contentType, "text/html") { - isReadable := readability.IsReadable(readabilityCheckInput) 
-
-				article, err := readability.FromReader(readabilityInput, url)
-				if err != nil {
-					cError.Printf("Failed to parse article: %v\n", err)
-					return
-				}
-
-				book.Author = article.Byline
-				book.Content = article.TextContent
-				book.HTML = article.Content
-
-				// If title and excerpt doesnt have submitted value, use from article
-				if book.Title == "" {
-					book.Title = article.Title
-				}
-
-				if book.Excerpt == "" {
-					book.Excerpt = article.Excerpt
-				}
-
-				if !isReadable {
-					book.Content = ""
-				}
-
-				// Get image URL
-				if article.Image != "" {
-					imageURLs = append(imageURLs, article.Image)
-				}
-
-				if article.Favicon != "" {
-					imageURLs = append(imageURLs, article.Favicon)
-				}
-			}
-
-			// If needed, create offline archive as well
-			if !noArchival {
-				archivePath := fp.Join(dataDir, "archive", fmt.Sprintf("%d", book.ID))
-				archivalRequest := warc.ArchivalRequest{
-					URL:         url,
-					Reader:      archivalInput,
-					ContentType: contentType,
-					LogEnabled:  logArchival,
-				}
-
-				err = warc.NewArchive(archivalRequest, archivePath)
-				if err != nil {
-					cError.Printf("Failed to create archive: %v\n", err)
-					return
-				}
-			}
-		}()
+	// Create bookmark ID
+	var err error
+	book.ID, err = db.CreateNewID("bookmark")
+	if err != nil {
+		cError.Printf("Failed to create ID: %v\n", err)
+		return
 	}
 
-	// Make sure title is not empty
-	if book.Title == "" {
-		book.Title = book.URL
+	// Clean up bookmark URL
+	book.URL, err = core.RemoveUTMParams(book.URL)
+	if err != nil {
+		cError.Printf("Failed to clean URL: %v\n", err)
+		return
+	}
+
+	// If not in offline mode, fetch data from the internet.
+	if !offline {
+		cInfo.Println("Downloading article...")
+
+		var isFatalErr bool
+		content, contentType, err := core.DownloadBookmark(book.URL)
+		if err != nil {
+			cError.Printf("Failed to download: %v\n", err)
+		}
+
+		if err == nil && content != nil {
+			request := core.ProcessRequest{
+				DataDir:     dataDir,
+				Bookmark:    book,
+				Content:     content,
+				ContentType: contentType,
+				LogArchival: logArchival,
+			}
+
+			book, isFatalErr, err = core.ProcessBookmark(request)
+			content.Close()
+
+			if err != nil {
+				cError.Printf("Failed to process: %v\n", err)
+			}
+
+			if isFatalErr {
+				return
+			}
+		}
 	}
 
 	// Save bookmark to database
@@ -179,18 +109,6 @@ func addHandler(cmd *cobra.Command, args []string) {
 		return
 	}
 
-	// Save article image to local disk
-	imgPath := fp.Join(dataDir, "thumb", fmt.Sprintf("%d", book.ID))
-	for _, imageURL := range imageURLs {
-		err = downloadBookImage(imageURL, imgPath, time.Minute)
-		if err == nil {
-			break
-		} else {
-			cError.Printf("Failed to download image: %v\n", err)
-			continue
-		}
-	}
-
 	// Print added bookmark
 	fmt.Println()
 	printBookmarks(book)
diff --git a/internal/cmd/import.go b/internal/cmd/import.go
index 8553d153..08f37b40 100644
--- a/internal/cmd/import.go
+++ b/internal/cmd/import.go
@@ -2,11 +2,11 @@ package cmd
 
 import (
 	"fmt"
-	nurl "net/url"
 	"os"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
+	"github.com/go-shiori/shiori/internal/core"
 	"github.com/go-shiori/shiori/internal/model"
 	"github.com/spf13/cobra"
 )
@@ -73,17 +73,14 @@ func importHandler(cmd *cobra.Command, args []string) {
 		url, _ := a.Attr("href")
 		strTags, _ := a.Attr("tags")
 
-		// Clean up URL by removing its fragment and UTM parameters
-		tmp, err := nurl.Parse(url)
-		if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
+		// Clean up URL
+		var err error
+		url, err = core.RemoveUTMParams(url)
+		if err != nil {
 			cError.Printf("Skip %s: URL is not valid\n", url)
 			return
 		}
 
-		tmp.Fragment = ""
-		clearUTMParams(tmp)
-		url = tmp.String()
-
 		// Make sure title is valid Utf-8
title = toValidUtf8(title, url) diff --git a/internal/cmd/pocket.go b/internal/cmd/pocket.go index 4b368c45..6f558628 100644 --- a/internal/cmd/pocket.go +++ b/internal/cmd/pocket.go @@ -2,13 +2,13 @@ package cmd import ( "fmt" - nurl "net/url" "os" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" + "github.com/go-shiori/shiori/internal/core" "github.com/go-shiori/shiori/internal/model" "github.com/spf13/cobra" ) @@ -59,17 +59,14 @@ func pocketHandler(cmd *cobra.Command, args []string) { intModified, _ := strconv.ParseInt(strModified, 10, 64) modified := time.Unix(intModified, 0) - // Clean up URL by removing its fragment and UTM parameters - tmp, err := nurl.Parse(url) - if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" { + // Clean up URL + var err error + url, err = core.RemoveUTMParams(url) + if err != nil { cError.Printf("Skip %s: URL is not valid\n", url) return } - tmp.Fragment = "" - clearUTMParams(tmp) - url = tmp.String() - // Make sure title is valid Utf-8 title = toValidUtf8(title, url) diff --git a/internal/cmd/update.go b/internal/cmd/update.go index 79acc823..51457d32 100644 --- a/internal/cmd/update.go +++ b/internal/cmd/update.go @@ -1,22 +1,14 @@ package cmd import ( - "bytes" "fmt" - "io" - "net/http" - nurl "net/url" - "os" - fp "path/filepath" "sort" "strings" "sync" - "time" - "github.com/go-shiori/go-readability" + "github.com/go-shiori/shiori/internal/core" "github.com/go-shiori/shiori/internal/database" "github.com/go-shiori/shiori/internal/model" - "github.com/go-shiori/shiori/pkg/warc" "github.com/spf13/cobra" ) @@ -83,17 +75,12 @@ func updateHandler(cmd *cobra.Command, args []string) { excerpt = normalizeSpace(excerpt) if cmd.Flags().Changed("url") { - // Clean up URL by removing its fragment and UTM parameters - tmp, err := nurl.Parse(url) - if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" { - cError.Println("URL is not valid") - return + // Clean up bookmark URL + url, err = core.RemoveUTMParams(url) + if err != nil { + panic(fmt.Errorf("failed to clean URL: %v", err)) } - tmp.Fragment = "" - clearUTMParams(tmp) - url = tmp.String() - // Since user uses custom URL, make sure there is only one ID to update if len(ids) != 1 { cError.Println("Update only accepts one index while using --url flag") @@ -149,6 +136,9 @@ func updateHandler(cmd *cobra.Command, args []string) { for i, book := range bookmarks { wg.Add(1) + // Mark whether book will be archived + book.CreateArchive = !noArchival + // If used, use submitted URL if url != "" { book.URL = url @@ -164,102 +154,32 @@ func updateHandler(cmd *cobra.Command, args []string) { <-semaphore }() - // Prepare download request - req, err := http.NewRequest("GET", book.URL, nil) + // Download data from internet + content, contentType, err := core.DownloadBookmark(book.URL) if err != nil { chProblem <- book.ID chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err) return } - // Send download request - req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)") - resp, err := httpClient.Do(req) - if err != nil { - chProblem <- book.ID - chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err) - return + request := core.ProcessRequest{ + DataDir: dataDir, + Bookmark: book, + Content: content, + ContentType: contentType, + KeepMetadata: keepMetadata, + LogArchival: logArchival, } - defer resp.Body.Close() - // Split response body so it can be processed twice - archivalInput := bytes.NewBuffer(nil) - readabilityInput := bytes.NewBuffer(nil) - 
readabilityCheckInput := bytes.NewBuffer(nil) - multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput) + book, _, err = core.ProcessBookmark(request) + content.Close() - _, err = io.Copy(multiWriter, resp.Body) if err != nil { chProblem <- book.ID chMessage <- fmt.Errorf("Failed to process %s: %v", book.URL, err) return } - // If this is HTML, parse for readable content - contentType := resp.Header.Get("Content-Type") - if strings.Contains(contentType, "text/html") { - isReadable := readability.IsReadable(readabilityCheckInput) - - article, err := readability.FromReader(readabilityInput, book.URL) - if err != nil { - chProblem <- book.ID - chMessage <- fmt.Errorf("Failed to parse %s: %v", book.URL, err) - return - } - - book.Author = article.Byline - book.Content = article.TextContent - book.HTML = article.Content - - if !isReadable { - book.Content = "" - } - - if !keepMetadata { - book.Title = article.Title - book.Excerpt = article.Excerpt - } - - // Get image for thumbnail and save it to local disk - var imageURLs []string - if article.Image != "" { - imageURLs = append(imageURLs, article.Image) - } - - if article.Favicon != "" { - imageURLs = append(imageURLs, article.Favicon) - } - - imgPath := fp.Join(dataDir, "thumb", fmt.Sprintf("%d", book.ID)) - for _, imageURL := range imageURLs { - err = downloadBookImage(imageURL, imgPath, time.Minute) - if err == nil { - break - } - } - } - - // If needed, update offline archive as well. - // Make sure to delete the old one first. - if !noArchival { - archivePath := fp.Join(dataDir, "archive", fmt.Sprintf("%d", book.ID)) - os.Remove(archivePath) - - archivalRequest := warc.ArchivalRequest{ - URL: book.URL, - Reader: archivalInput, - ContentType: contentType, - LogEnabled: logArchival, - } - - err = warc.NewArchive(archivalRequest, archivePath) - if err != nil { - chProblem <- book.ID - chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err) - return - } - } - // Send success message chMessage <- fmt.Sprintf("Downloaded %s", book.URL) diff --git a/internal/cmd/utils.go b/internal/cmd/utils.go index 2f1dcd9f..43488726 100644 --- a/internal/cmd/utils.go +++ b/internal/cmd/utils.go @@ -3,29 +3,17 @@ package cmd import ( "errors" "fmt" - "image" - clr "image/color" - "image/draw" - "image/jpeg" - "math" - "net/http" nurl "net/url" "os" "os/exec" - fp "path/filepath" "runtime" "strconv" "strings" - "time" "unicode/utf8" - "github.com/disintegration/imaging" "github.com/fatih/color" "github.com/go-shiori/shiori/internal/model" "golang.org/x/crypto/ssh/terminal" - - // Add supports for PNG image - _ "image/png" ) var ( @@ -54,95 +42,6 @@ func isURLValid(s string) bool { return err == nil && tmp.Scheme != "" && tmp.Hostname() != "" } -func clearUTMParams(url *nurl.URL) { - queries := url.Query() - - for key := range queries { - if strings.HasPrefix(key, "utm_") { - queries.Del(key) - } - } - - url.RawQuery = queries.Encode() -} - -func downloadBookImage(url, dstPath string, timeout time.Duration) error { - // Fetch data from URL - client := &http.Client{Timeout: timeout} - resp, err := client.Get(url) - if err != nil { - return err - } - defer resp.Body.Close() - - // Make sure it's JPG or PNG image - cp := resp.Header.Get("Content-Type") - if !strings.Contains(cp, "image/jpeg") && !strings.Contains(cp, "image/png") { - return fmt.Errorf("%s is not a supported image", url) - } - - // At this point, the download has finished successfully. - // Prepare destination file. 
-	err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
-	if err != nil {
-		return fmt.Errorf("failed to create image dir: %v", err)
-	}
-
-	dstFile, err := os.Create(dstPath)
-	if err != nil {
-		return fmt.Errorf("failed to create image file: %v", err)
-	}
-	defer dstFile.Close()
-
-	// Parse image and process it.
-	// If image is smaller than 600x400 or its ratio is less than 4:3, resize.
-	// Else, save it as it is.
-	img, _, err := image.Decode(resp.Body)
-	if err != nil {
-		return fmt.Errorf("failed to parse image %s: %v", url, err)
-	}
-
-	imgRect := img.Bounds()
-	imgWidth := imgRect.Dx()
-	imgHeight := imgRect.Dy()
-	imgRatio := float64(imgWidth) / float64(imgHeight)
-
-	if imgWidth >= 600 && imgHeight >= 400 && imgRatio > 1.3 {
-		err = jpeg.Encode(dstFile, img, nil)
-	} else {
-		// Create background
-		bg := image.NewNRGBA(imgRect)
-		draw.Draw(bg, imgRect, image.NewUniform(clr.White), image.Point{}, draw.Src)
-		draw.Draw(bg, imgRect, img, image.Point{}, draw.Over)
-
-		bg = imaging.Fill(bg, 600, 400, imaging.Center, imaging.Lanczos)
-		bg = imaging.Blur(bg, 150)
-		bg = imaging.AdjustBrightness(bg, 30)
-
-		// Create foreground
-		fg := imaging.Fit(img, 600, 400, imaging.Lanczos)
-
-		// Merge foreground and background
-		bgRect := bg.Bounds()
-		fgRect := fg.Bounds()
-		fgPosition := image.Point{
-			X: bgRect.Min.X - int(math.Round(float64(bgRect.Dx()-fgRect.Dx())/2)),
-			Y: bgRect.Min.Y - int(math.Round(float64(bgRect.Dy()-fgRect.Dy())/2)),
-		}
-
-		draw.Draw(bg, bgRect, fg, fgPosition, draw.Over)
-
-		// Save to file
-		err = jpeg.Encode(dstFile, bg, nil)
-	}
-
-	if err != nil {
-		return fmt.Errorf("failed to save image %s: %v", url, err)
-	}
-
-	return nil
-}
-
 func printBookmarks(bookmarks ...model.Bookmark) {
 	for _, bookmark := range bookmarks {
 		// Create bookmark index
diff --git a/internal/core/download.go b/internal/core/download.go
new file mode 100644
index 00000000..17816b16
--- /dev/null
+++ b/internal/core/download.go
@@ -0,0 +1,31 @@
+package core
+
+import (
+	"io"
+	"net/http"
+	"time"
+)
+
+var httpClient = &http.Client{Timeout: time.Minute}
+
+// DownloadBookmark downloads the bookmarked page from the specified URL.
+// It returns the response body; the caller is responsible for closing it.
+func DownloadBookmark(url string) (io.ReadCloser, string, error) {
+	// Prepare download request
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, "", err
+	}
+
+	// Send download request
+	req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		return nil, "", err
+	}
+
+	// Get content type
+	contentType := resp.Header.Get("Content-Type")
+
+	return resp.Body, contentType, nil
+}
diff --git a/internal/core/processing.go b/internal/core/processing.go
new file mode 100644
index 00000000..dcdf8fbb
--- /dev/null
+++ b/internal/core/processing.go
@@ -0,0 +1,218 @@
+package core
+
+import (
+	"bytes"
+	"fmt"
+	"image"
+	"image/color"
+	"image/draw"
+	"image/jpeg"
+	"io"
+	"math"
+	"os"
+	"path"
+	fp "path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/disintegration/imaging"
+	"github.com/go-shiori/go-readability"
+	"github.com/go-shiori/shiori/internal/model"
+	"github.com/go-shiori/shiori/pkg/warc"
+
+	// Add support for png
+	_ "image/png"
+)
+
+// ProcessRequest is the request for processing a bookmark.
+type ProcessRequest struct {
+	DataDir      string
+	Bookmark     model.Bookmark
+	Content      io.Reader
+	ContentType  string
+	KeepMetadata bool
+	LogArchival  bool
+}
+
+// ProcessBookmark processes the bookmark and archives it if needed.
+// It returns three values: the updated bookmark, whether the error is fatal, and the error value.
+func ProcessBookmark(req ProcessRequest) (model.Bookmark, bool, error) {
+	book := req.Bookmark
+	contentType := req.ContentType
+
+	// Make sure bookmark ID is defined
+	if book.ID == 0 {
+		return book, true, fmt.Errorf("bookmark ID is not valid")
+	}
+
+	// Split bookmark content so it can be processed several times
+	archivalInput := bytes.NewBuffer(nil)
+	readabilityInput := bytes.NewBuffer(nil)
+	readabilityCheckInput := bytes.NewBuffer(nil)
+
+	var multiWriter io.Writer
+	if !strings.Contains(contentType, "text/html") {
+		multiWriter = io.MultiWriter(archivalInput)
+	} else {
+		multiWriter = io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput)
+	}
+
+	_, err := io.Copy(multiWriter, req.Content)
+	if err != nil {
+		return book, false, fmt.Errorf("failed to process article: %v", err)
+	}
+
+	// If this is HTML, parse for readable content
+	var imageURLs []string
+	if strings.Contains(contentType, "text/html") {
+		isReadable := readability.IsReadable(readabilityCheckInput)
+
+		article, err := readability.FromReader(readabilityInput, book.URL)
+		if err != nil {
+			return book, false, fmt.Errorf("failed to parse article: %v", err)
+		}
+
+		book.Author = article.Byline
+		book.Content = article.TextContent
+		book.HTML = article.Content
+
+		// Use the article's title and excerpt unless submitted metadata is kept
+		if !req.KeepMetadata || book.Title == "" {
+			book.Title = article.Title
+		}
+
+		if !req.KeepMetadata || book.Excerpt == "" {
+			book.Excerpt = article.Excerpt
+		}
+
+		// Sometimes an article doesn't have a title, so make sure it's not empty
+		if book.Title == "" {
+			book.Title = book.URL
+		}
+
+		// Get image URL
+		if article.Image != "" {
+			imageURLs = append(imageURLs, article.Image)
+		}
+
+		if article.Favicon != "" {
+			imageURLs = append(imageURLs, article.Favicon)
+		}
+
+		if !isReadable {
+			book.Content = ""
+		}
+
+		book.HasContent = book.Content != ""
+	}
+
+	// Save article image to local disk
+	strID := strconv.Itoa(book.ID)
+	imgPath := fp.Join(req.DataDir, "thumb", strID)
+
+	for _, imageURL := range imageURLs {
+		err = downloadBookImage(imageURL, imgPath)
+		if err == nil {
+			book.ImageURL = path.Join("/", "bookmark", strID, "thumb")
+			break
+		}
+	}
+
+	// If needed, create offline archive as well
+	if book.CreateArchive {
+		archivePath := fp.Join(req.DataDir, "archive", fmt.Sprintf("%d", book.ID))
+		os.Remove(archivePath)
+
+		archivalRequest := warc.ArchivalRequest{
+			URL:         book.URL,
+			Reader:      archivalInput,
+			ContentType: contentType,
+			LogEnabled:  req.LogArchival,
+		}
+
+		err = warc.NewArchive(archivalRequest, archivePath)
+		if err != nil {
+			return book, false, fmt.Errorf("failed to create archive: %v", err)
+		}
+
+		book.HasArchive = true
+	}
+
+	return book, false, nil
+}
+
+func downloadBookImage(url, dstPath string) error {
+	// Fetch data from URL
+	resp, err := httpClient.Get(url)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	// Make sure it's JPG or PNG image
+	cp := resp.Header.Get("Content-Type")
+	if !strings.Contains(cp, "image/jpeg") && !strings.Contains(cp, "image/png") {
+		return fmt.Errorf("%s is not a supported image", url)
+	}
+
+	// At this point, the download has finished successfully.
+	// Prepare destination file.
+	err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
+	if err != nil {
+		return fmt.Errorf("failed to create image dir: %v", err)
+	}
+
+	dstFile, err := os.Create(dstPath)
+	if err != nil {
+		return fmt.Errorf("failed to create image file: %v", err)
+	}
+	defer dstFile.Close()
+
+	// Parse image and process it.
+	// If image is smaller than 600x400 or its ratio is less than 4:3, resize.
+	// Else, save it as it is.
+	img, _, err := image.Decode(resp.Body)
+	if err != nil {
+		return fmt.Errorf("failed to parse image %s: %v", url, err)
+	}
+
+	imgRect := img.Bounds()
+	imgWidth := imgRect.Dx()
+	imgHeight := imgRect.Dy()
+	imgRatio := float64(imgWidth) / float64(imgHeight)
+
+	if imgWidth >= 600 && imgHeight >= 400 && imgRatio > 1.3 {
+		err = jpeg.Encode(dstFile, img, nil)
+	} else {
+		// Create background
+		bg := image.NewNRGBA(imgRect)
+		draw.Draw(bg, imgRect, image.NewUniform(color.White), image.Point{}, draw.Src)
+		draw.Draw(bg, imgRect, img, image.Point{}, draw.Over)
+
+		bg = imaging.Fill(bg, 600, 400, imaging.Center, imaging.Lanczos)
+		bg = imaging.Blur(bg, 150)
+		bg = imaging.AdjustBrightness(bg, 30)
+
+		// Create foreground
+		fg := imaging.Fit(img, 600, 400, imaging.Lanczos)
+
+		// Merge foreground and background
+		bgRect := bg.Bounds()
+		fgRect := fg.Bounds()
+		fgPosition := image.Point{
+			X: bgRect.Min.X - int(math.Round(float64(bgRect.Dx()-fgRect.Dx())/2)),
+			Y: bgRect.Min.Y - int(math.Round(float64(bgRect.Dy()-fgRect.Dy())/2)),
+		}
+
+		draw.Draw(bg, bgRect, fg, fgPosition, draw.Over)
+
+		// Save to file
+		err = jpeg.Encode(dstFile, bg, nil)
+	}
+
+	if err != nil {
+		return fmt.Errorf("failed to save image %s: %v", url, err)
+	}
+
+	return nil
+}
diff --git a/internal/core/url.go b/internal/core/url.go
new file mode 100644
index 00000000..bc00dd16
--- /dev/null
+++ b/internal/core/url.go
@@ -0,0 +1,28 @@
+package core
+
+import (
+	"fmt"
+	nurl "net/url"
+	"strings"
+)
+
+// RemoveUTMParams removes the UTM parameters and the fragment from the URL.
+func RemoveUTMParams(url string) (string, error) { + // Parse string URL + tmp, err := nurl.Parse(url) + if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" { + return url, fmt.Errorf("URL is not valid") + } + + // Remove UTM queries + queries := tmp.Query() + for key := range queries { + if strings.HasPrefix(key, "utm_") { + queries.Del(key) + } + } + + tmp.Fragment = "" + tmp.RawQuery = queries.Encode() + return tmp.String(), nil +} diff --git a/internal/webserver/handler-api-ext.go b/internal/webserver/handler-api-ext.go index 4834f435..35d133a4 100644 --- a/internal/webserver/handler-api-ext.go +++ b/internal/webserver/handler-api-ext.go @@ -6,17 +6,12 @@ import ( "fmt" "io" "net/http" - nurl "net/url" "os" - "path" fp "path/filepath" "strconv" - "strings" - "time" - "github.com/go-shiori/go-readability" + "github.com/go-shiori/shiori/internal/core" "github.com/go-shiori/shiori/internal/model" - "github.com/go-shiori/shiori/pkg/warc" "github.com/julienschmidt/httprouter" ) @@ -31,18 +26,15 @@ func (h *handler) apiInsertViaExtension(w http.ResponseWriter, r *http.Request, err = json.NewDecoder(r.Body).Decode(&request) checkError(err) - // Clean up URL by removing its fragment and UTM parameters - tmp, err := nurl.Parse(request.URL) - if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" { - panic(fmt.Errorf("URL is not valid")) + // Clean up bookmark URL + request.URL, err = core.RemoveUTMParams(request.URL) + if err != nil { + panic(fmt.Errorf("failed to clean URL: %v", err)) } - tmp.Fragment = "" - clearUTMParams(tmp) - request.URL = tmp.String() - // Check if bookmark already exists. book, exist := h.DB.GetBookmark(0, request.URL) + book.CreateArchive = true // If it already exists, we need to set ID and tags. if exist { @@ -69,119 +61,37 @@ func (h *handler) apiInsertViaExtension(w http.ResponseWriter, r *http.Request, // Since we are using extension, the extension might send the HTML content // so no need to download it again here. However, if it's empty, it might be not HTML file // so we download it here. - contentType := "text/html; charset=UTF-8" - contentBuffer := bytes.NewBufferString(book.HTML) + var contentType string + var contentBuffer io.Reader + if book.HTML == "" { - func() { - // Prepare download request - req, err := http.NewRequest("GET", book.URL, nil) - if err != nil { - return - } - - // Send download request - req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)") - resp, err := httpClient.Do(req) - if err != nil { - return - } - defer resp.Body.Close() - - // Save response for later use - contentType = resp.Header.Get("Content-Type") - - contentBuffer.Reset() - _, err = io.Copy(contentBuffer, resp.Body) - if err != nil { - return - } - }() + contentBuffer, contentType, _ = core.DownloadBookmark(book.URL) + } else { + contentType = "text/html; charset=UTF-8" + contentBuffer = bytes.NewBufferString(book.HTML) } // At this point the web page already downloaded. // Time to process it. - func() { - // Split response so it can be processed several times - archivalInput := bytes.NewBuffer(nil) - readabilityInput := bytes.NewBuffer(nil) - readabilityCheckInput := bytes.NewBuffer(nil) - multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput) - - _, err = io.Copy(multiWriter, contentBuffer) - if err != nil { - return - } - - // If it's HTML, parse the readable content. 
- if strings.Contains(contentType, "text/html") { - isReadable := readability.IsReadable(readabilityCheckInput) - - article, err := readability.FromReader(readabilityInput, book.URL) - if err != nil { - return - } - - book.Author = article.Byline - book.Content = article.TextContent - book.HTML = article.Content - - if book.Title == "" { - if article.Title == "" { - book.Title = book.URL - } else { - book.Title = article.Title - } - } - - if book.Excerpt == "" { - book.Excerpt = article.Excerpt - } - - if !isReadable { - book.Content = "" - } - - book.HasContent = book.Content != "" - - // Get image for thumbnail and save it to local disk - var imageURLs []string - if article.Image != "" { - imageURLs = append(imageURLs, article.Image) - } - - if article.Favicon != "" { - imageURLs = append(imageURLs, article.Favicon) - } - - // Save article image to local disk - strID := strconv.Itoa(book.ID) - imgPath := fp.Join(h.DataDir, "thumb", strID) - for _, imageURL := range imageURLs { - err = downloadBookImage(imageURL, imgPath, time.Minute) - if err == nil { - book.ImageURL = path.Join("/", "bookmark", strID, "thumb") - break - } - } - } - - // Create offline archive as well - archivePath := fp.Join(h.DataDir, "archive", fmt.Sprintf("%d", book.ID)) - os.Remove(archivePath) - - archivalRequest := warc.ArchivalRequest{ - URL: book.URL, - Reader: archivalInput, + if contentBuffer != nil { + request := core.ProcessRequest{ + DataDir: h.DataDir, + Bookmark: book, + Content: contentBuffer, ContentType: contentType, } - err = warc.NewArchive(archivalRequest, archivePath) - if err != nil { - return + var isFatalErr bool + book, isFatalErr, err = core.ProcessBookmark(request) + + if tmp, ok := contentBuffer.(io.ReadCloser); ok { + tmp.Close() } - book.HasArchive = true - }() + if err != nil && isFatalErr { + panic(fmt.Errorf("failed to process bookmark: %v", err)) + } + } // Save bookmark to database results, err := h.DB.SaveBookmarks(book) diff --git a/internal/webserver/handler-api.go b/internal/webserver/handler-api.go index b2f7f990..17bdb5c6 100644 --- a/internal/webserver/handler-api.go +++ b/internal/webserver/handler-api.go @@ -1,13 +1,10 @@ package webserver import ( - "bytes" "encoding/json" "fmt" - "io" "math" "net/http" - nurl "net/url" "os" "path" fp "path/filepath" @@ -16,10 +13,9 @@ import ( "sync" "time" - "github.com/go-shiori/go-readability" + "github.com/go-shiori/shiori/internal/core" "github.com/go-shiori/shiori/internal/database" "github.com/go-shiori/shiori/internal/model" - "github.com/go-shiori/shiori/pkg/warc" "github.com/gofrs/uuid" "github.com/julienschmidt/httprouter" "golang.org/x/crypto/bcrypt" @@ -251,112 +247,35 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h err = json.NewDecoder(r.Body).Decode(&book) checkError(err) - // Clean up URL by removing its fragment and UTM parameters - tmp, err := nurl.Parse(book.URL) - if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" { - panic(fmt.Errorf("URL is not valid")) - } - - tmp.Fragment = "" - clearUTMParams(tmp) - book.URL = tmp.String() - // Create bookmark ID book.ID, err = h.DB.CreateNewID("bookmark") if err != nil { panic(fmt.Errorf("failed to create ID: %v", err)) } + // Clean up bookmark URL + book.URL, err = core.RemoveUTMParams(book.URL) + if err != nil { + panic(fmt.Errorf("failed to clean URL: %v", err)) + } + // Fetch data from internet - var imageURLs []string - func() { - // Prepare download request - req, err := http.NewRequest("GET", book.URL, nil) - if err != nil { - 
return + var isFatalErr bool + content, contentType, err := core.DownloadBookmark(book.URL) + if err == nil && content != nil { + request := core.ProcessRequest{ + DataDir: h.DataDir, + Bookmark: book, + Content: content, + ContentType: contentType, } - // Send download request - req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)") - resp, err := httpClient.Do(req) - if err != nil { - return + book, isFatalErr, err = core.ProcessBookmark(request) + content.Close() + + if err != nil && isFatalErr { + panic(fmt.Errorf("failed to process bookmark: %v", err)) } - defer resp.Body.Close() - - // Split response body so it can be processed twice - archivalInput := bytes.NewBuffer(nil) - readabilityInput := bytes.NewBuffer(nil) - readabilityCheckInput := bytes.NewBuffer(nil) - multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput) - - _, err = io.Copy(multiWriter, resp.Body) - if err != nil { - return - } - - // If this is HTML, parse for readable content - contentType := resp.Header.Get("Content-Type") - if strings.Contains(contentType, "text/html") { - isReadable := readability.IsReadable(readabilityCheckInput) - - article, err := readability.FromReader(readabilityInput, book.URL) - if err != nil { - return - } - - book.Author = article.Byline - book.Content = article.TextContent - book.HTML = article.Content - - // If title and excerpt doesnt have submitted value, use from article - if book.Title == "" { - book.Title = article.Title - } - - if book.Excerpt == "" { - book.Excerpt = article.Excerpt - } - - // Get image URL - if article.Image != "" { - imageURLs = append(imageURLs, article.Image) - } - - if article.Favicon != "" { - imageURLs = append(imageURLs, article.Favicon) - } - - if !isReadable { - book.Content = "" - } - - book.HasContent = book.Content != "" - } - - // If needed, create offline archive as well - if book.CreateArchive { - archivePath := fp.Join(h.DataDir, "archive", fmt.Sprintf("%d", book.ID)) - os.Remove(archivePath) - - archivalRequest := warc.ArchivalRequest{ - URL: book.URL, - Reader: archivalInput, - ContentType: contentType, - } - - err = warc.NewArchive(archivalRequest, archivePath) - if err != nil { - return - } - - book.HasArchive = true - } - }() - - // Make sure title is not empty - if book.Title == "" { - book.Title = book.URL } // Save bookmark to database @@ -366,17 +285,6 @@ func (h *handler) apiInsertBookmark(w http.ResponseWriter, r *http.Request, ps h } book = results[0] - // Save article image to local disk - strID := strconv.Itoa(book.ID) - imgPath := fp.Join(h.DataDir, "thumb", strID) - for _, imageURL := range imageURLs { - err = downloadBookImage(imageURL, imgPath, time.Minute) - if err == nil { - book.ImageURL = path.Join("/", "bookmark", strID, "thumb") - break - } - } - // Return the new bookmark w.Header().Set("Content-Type", "application/json") err = json.NewEncoder(w).Encode(&book) @@ -446,6 +354,12 @@ func (h *handler) apiUpdateBookmark(w http.ResponseWriter, r *http.Request, ps h book.Excerpt = request.Excerpt book.Public = request.Public + // Clean up bookmark URL + book.URL, err = core.RemoveUTMParams(book.URL) + if err != nil { + panic(fmt.Errorf("failed to clean URL: %v", err)) + } + // Set new tags for i := range book.Tags { book.Tags[i].Deleted = true @@ -525,6 +439,9 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http for i, book := range bookmarks { wg.Add(1) + // Mark whether book will be archived + book.CreateArchive = request.CreateArchive 
+ go func(i int, book model.Bookmark, keepMetadata bool) { // Make sure to finish the WG defer wg.Done() @@ -535,107 +452,28 @@ func (h *handler) apiUpdateCache(w http.ResponseWriter, r *http.Request, ps http <-semaphore }() - // Prepare download request - req, err := http.NewRequest("GET", book.URL, nil) + // Download data from internet + content, contentType, err := core.DownloadBookmark(book.URL) if err != nil { chProblem <- book.ID return } - // Send download request - req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)") - resp, err := httpClient.Do(req) + request := core.ProcessRequest{ + DataDir: h.DataDir, + Bookmark: book, + Content: content, + ContentType: contentType, + KeepMetadata: keepMetadata, + } + + book, _, err = core.ProcessBookmark(request) + content.Close() + if err != nil { chProblem <- book.ID return } - defer resp.Body.Close() - - // Split response body so it can be processed twice - archivalInput := bytes.NewBuffer(nil) - readabilityInput := bytes.NewBuffer(nil) - readabilityCheckInput := bytes.NewBuffer(nil) - multiWriter := io.MultiWriter(archivalInput, readabilityInput, readabilityCheckInput) - - _, err = io.Copy(multiWriter, resp.Body) - if err != nil { - chProblem <- book.ID - return - } - - // If this is HTML, parse for readable content - strID := strconv.Itoa(book.ID) - contentType := resp.Header.Get("Content-Type") - - if strings.Contains(contentType, "text/html") { - isReadable := readability.IsReadable(readabilityCheckInput) - - article, err := readability.FromReader(readabilityInput, book.URL) - if err != nil { - chProblem <- book.ID - return - } - - book.Author = article.Byline - book.Content = article.TextContent - book.HTML = article.Content - - if !isReadable { - book.Content = "" - } - - if !keepMetadata { - book.Title = article.Title - book.Excerpt = article.Excerpt - } - - if book.Title == "" { - book.Title = book.URL - } - - book.HasContent = book.Content != "" - - // Get image for thumbnail and save it to local disk - var imageURLs []string - if article.Image != "" { - imageURLs = append(imageURLs, article.Image) - } - - if article.Favicon != "" { - imageURLs = append(imageURLs, article.Favicon) - } - - // Save article image to local disk - imgPath := fp.Join(h.DataDir, "thumb", strID) - for _, imageURL := range imageURLs { - err = downloadBookImage(imageURL, imgPath, time.Minute) - if err == nil { - book.ImageURL = path.Join("/", "bookmark", strID, "thumb") - break - } - } - } - - // If needed, update offline archive as well. - // Make sure to delete the old one first. 
- if request.CreateArchive { - archivePath := fp.Join(h.DataDir, "archive", strID) - os.Remove(archivePath) - - archivalRequest := warc.ArchivalRequest{ - URL: book.URL, - Reader: archivalInput, - ContentType: contentType, - } - - err = warc.NewArchive(archivalRequest, archivePath) - if err != nil { - chProblem <- book.ID - return - } - - book.HasArchive = true - } // Update list of bookmarks mx.Lock() diff --git a/internal/webserver/utils.go b/internal/webserver/utils.go index e951abba..dd5510be 100644 --- a/internal/webserver/utils.go +++ b/internal/webserver/utils.go @@ -3,13 +3,8 @@ package webserver import ( "fmt" "html/template" - "image" - "image/color" - "image/draw" - "image/jpeg" "io" "io/ioutil" - "math" "mime" "net" "net/http" @@ -19,9 +14,6 @@ import ( "regexp" "strings" "syscall" - "time" - - "github.com/disintegration/imaging" ) var rxRepeatedStrip = regexp.MustCompile(`(?i)-+`) @@ -89,95 +81,6 @@ func fileExists(filePath string) bool { return !os.IsNotExist(err) && !info.IsDir() } -func clearUTMParams(url *nurl.URL) { - queries := url.Query() - - for key := range queries { - if strings.HasPrefix(key, "utm_") { - queries.Del(key) - } - } - - url.RawQuery = queries.Encode() -} - -func downloadBookImage(url, dstPath string, timeout time.Duration) error { - // Fetch data from URL - client := &http.Client{Timeout: timeout} - resp, err := client.Get(url) - if err != nil { - return err - } - defer resp.Body.Close() - - // Make sure it's JPG or PNG image - cp := resp.Header.Get("Content-Type") - if !strings.Contains(cp, "image/jpeg") && !strings.Contains(cp, "image/png") { - return fmt.Errorf("%s is not a supported image", url) - } - - // At this point, the download has finished successfully. - // Prepare destination file. - err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm) - if err != nil { - return fmt.Errorf("failed to create image dir: %v", err) - } - - dstFile, err := os.Create(dstPath) - if err != nil { - return fmt.Errorf("failed to create image file: %v", err) - } - defer dstFile.Close() - - // Parse image and process it. - // If image is smaller than 600x400 or its ratio is less than 4:3, resize. - // Else, save it as it is. 
- img, _, err := image.Decode(resp.Body) - if err != nil { - return fmt.Errorf("failed to parse image %s: %v", url, err) - } - - imgRect := img.Bounds() - imgWidth := imgRect.Dx() - imgHeight := imgRect.Dy() - imgRatio := float64(imgWidth) / float64(imgHeight) - - if imgWidth >= 600 && imgHeight >= 400 && imgRatio > 1.3 { - err = jpeg.Encode(dstFile, img, nil) - } else { - // Create background - bg := image.NewNRGBA(imgRect) - draw.Draw(bg, imgRect, image.NewUniform(color.White), image.Point{}, draw.Src) - draw.Draw(bg, imgRect, img, image.Point{}, draw.Over) - - bg = imaging.Fill(bg, 600, 400, imaging.Center, imaging.Lanczos) - bg = imaging.Blur(bg, 150) - bg = imaging.AdjustBrightness(bg, 30) - - // Create foreground - fg := imaging.Fit(img, 600, 400, imaging.Lanczos) - - // Merge foreground and background - bgRect := bg.Bounds() - fgRect := fg.Bounds() - fgPosition := image.Point{ - X: bgRect.Min.X - int(math.Round(float64(bgRect.Dx()-fgRect.Dx())/2)), - Y: bgRect.Min.Y - int(math.Round(float64(bgRect.Dy()-fgRect.Dy())/2)), - } - - draw.Draw(bg, bgRect, fg, fgPosition, draw.Over) - - // Save to file - err = jpeg.Encode(dstFile, bg, nil) - } - - if err != nil { - return fmt.Errorf("failed to save image %s: %v", url, err) - } - - return nil -} - func createTemplate(filename string, funcMap template.FuncMap) (*template.Template, error) { // Open file src, err := assets.Open(filename)
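
Usage sketch (illustrative, not part of the diff): after this patch every
caller composes the extracted helpers in the same three steps — clean the
URL, download the page, then process it. The example package, the
fetchAndProcess wrapper, and its dataDir parameter below are assumptions
for illustration; the core functions and their signatures are exactly the
ones added in internal/core above.

package example

import (
	"fmt"

	"github.com/go-shiori/shiori/internal/core"
	"github.com/go-shiori/shiori/internal/model"
)

// fetchAndProcess is a hypothetical caller, not part of this patch.
// It assumes book.ID is already set (ProcessBookmark treats ID == 0 as a
// fatal error) and that dataDir points to Shiori's data directory.
func fetchAndProcess(dataDir string, book model.Bookmark) (model.Bookmark, error) {
	var err error

	// Strip UTM parameters and the fragment; this also validates the URL.
	book.URL, err = core.RemoveUTMParams(book.URL)
	if err != nil {
		return book, fmt.Errorf("failed to clean URL: %v", err)
	}

	// Download the page; the returned body must be closed by the caller.
	content, contentType, err := core.DownloadBookmark(book.URL)
	if err != nil {
		return book, fmt.Errorf("failed to download: %v", err)
	}
	defer content.Close()

	// Extract readable content, fetch a thumbnail and, when
	// book.CreateArchive is set, write a WARC archive under dataDir.
	book, isFatal, err := core.ProcessBookmark(core.ProcessRequest{
		DataDir:     dataDir,
		Bookmark:    book,
		Content:     content,
		ContentType: contentType,
	})
	if err != nil && isFatal {
		return book, err
	}

	// Non-fatal errors are tolerated: the callers in this patch log them
	// and still save the partially processed bookmark.
	return book, nil
}

This is the shape each call site in internal/cmd and internal/webserver
now collapses into; only the error reporting and the KeepMetadata and
LogArchival flags differ between them.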