shiori/internal/cmd/pocket.go
Marc Brugger 3091d844c0
fix: parse pocket new CSV format (#1112)
* fix pocket parsing error

Signed-off-by: bakito <github@bakito.ch>

* add tests forpocket csv

Signed-off-by: bakito <github@bakito.ch>

* Use file name from test case

* fix lint ant test issues

Signed-off-by: bakito <github@bakito.ch>

---------

Signed-off-by: bakito <github@bakito.ch>
Co-authored-by: Felipe Martin <812088+fmartingr@users.noreply.github.com>
2025-06-27 13:35:15 +02:00

222 lines
5.4 KiB
Go

package cmd
import (
"context"
"encoding/csv"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/go-shiori/shiori/internal/core"
"github.com/go-shiori/shiori/internal/model"
"github.com/spf13/cobra"
)
func pocketCmd() *cobra.Command {
cmd := &cobra.Command{
Use: "pocket source-file",
Short: "Import bookmarks from Pocket's data export file",
Args: cobra.ExactArgs(1),
Run: pocketHandler,
}
return cmd
}
func pocketHandler(cmd *cobra.Command, args []string) {
ctx := cmd.Context()
_, deps := initShiori(ctx, cmd)
// Open pocket's file
filePath := args[0]
srcFile, err := os.Open(filePath)
if err != nil {
cError.Println(err)
os.Exit(1)
}
defer srcFile.Close()
var bookmarks []model.BookmarkDTO
switch filepath.Ext(filePath) {
case ".html":
bookmarks = parseHtmlExport(ctx, deps.Database(), srcFile)
case ".csv":
bookmarks = parseCsvExport(ctx, deps.Database(), srcFile)
default:
cError.Println("Invalid file format. Only HTML and CSV are supported.")
os.Exit(1)
}
// Save bookmark to database
bookmarks, err = deps.Database().SaveBookmarks(ctx, true, bookmarks...)
if err != nil {
cError.Printf("Failed to save bookmarks: %v\n", err)
os.Exit(1)
}
// Print imported bookmarks
fmt.Println()
printBookmarks(bookmarks...)
}
// Parse bookmarks from HTML file
func parseHtmlExport(ctx context.Context, db model.DB, srcFile *os.File) []model.BookmarkDTO {
bookmarks := []model.BookmarkDTO{}
mapURL := make(map[string]struct{})
doc, err := goquery.NewDocumentFromReader(srcFile)
if err != nil {
cError.Println(err)
os.Exit(1)
}
doc.Find("a").Each(func(_ int, a *goquery.Selection) {
// Get metadata
title := a.Text()
url, _ := a.Attr("href")
tagsStr, _ := a.Attr("tags")
timeAddedStr, _ := a.Attr("time_added")
title, url, timeAdded, tags, err := verifyMetadata(title, url, timeAddedStr, tagsStr)
if err != nil {
cError.Printf("Skip %s: %v\n", url, err)
return
}
if err = handleDuplicates(ctx, db, mapURL, url); err != nil {
cError.Printf("Skip %s: %v\n", url, err)
return
}
// Add item to list
bookmark := model.BookmarkDTO{
URL: url,
Title: title,
ModifiedAt: timeAdded.Format(model.DatabaseDateFormat),
CreatedAt: timeAdded.Format(model.DatabaseDateFormat),
Tags: tags,
}
mapURL[url] = struct{}{}
bookmarks = append(bookmarks, bookmark)
})
return bookmarks
}
// Parse bookmarks from CSV file
func parseCsvExport(ctx context.Context, db model.DB, srcFile *os.File) []model.BookmarkDTO {
bookmarks := []model.BookmarkDTO{}
mapURL := make(map[string]struct{})
reader := csv.NewReader(srcFile)
records, err := reader.ReadAll()
if err != nil {
cError.Println(err)
os.Exit(1)
}
var titleIdx, urlIdx, timeAddedIdx, tagsIdx int
for i, cols := range records {
// Check and skip header
if i == 0 {
titleIdx = slices.Index(cols, "title")
urlIdx = slices.Index(cols, "url")
timeAddedIdx = slices.Index(cols, "time_added")
tagsIdx = slices.Index(cols, "tags")
if titleIdx == -1 || urlIdx == -1 || timeAddedIdx == -1 || tagsIdx == -1 {
cError.Printf("Invalid CSV format. Header must contain: title, url, time_added, tags\n")
os.Exit(1)
}
continue
}
// Get metadata
title, url, timeAdded, tags, err := verifyMetadata(cols[titleIdx], cols[urlIdx], cols[timeAddedIdx], cols[tagsIdx])
if err != nil {
cError.Printf("Skip %s: %v\n", url, err)
continue
}
if err = handleDuplicates(ctx, db, mapURL, url); err != nil {
cError.Printf("Skip %s: %v\n", url, err)
continue
}
// Add item to list
bookmark := model.BookmarkDTO{
URL: url,
Title: title,
ModifiedAt: timeAdded.Format(model.DatabaseDateFormat),
CreatedAt: timeAdded.Format(model.DatabaseDateFormat),
Tags: tags,
}
mapURL[url] = struct{}{}
bookmarks = append(bookmarks, bookmark)
}
return bookmarks
}
// Parse metadata and verify it's validity
func verifyMetadata(title, url, timeAddedStr, tags string) (string, string, time.Time, []model.TagDTO, error) {
// Clean up URL
var err error
url, err = core.RemoveUTMParams(url)
if err != nil {
err = fmt.Errorf("URL is not valid, %w", err)
return "", "", time.Time{}, nil, err
}
// Make sure title is valid Utf-8
title = validateTitle(title, url)
// Parse time added
timeAddedInt, err := strconv.ParseInt(timeAddedStr, 10, 64)
if err != nil {
err = fmt.Errorf("Invalid time added, %w", err)
return "", "", time.Time{}, nil, err
}
timeAdded := time.Unix(timeAddedInt, 0)
// Get bookmark tags
tagsList := []model.TagDTO{}
// We need to split tags by both comma or pipe,
// because Pocket's CSV export use pipe as separator,
// while HTML export use comma.
for _, tag := range regexp.MustCompile(`[,|]`).Split(tags, -1) {
if tag != "" {
tagsList = append(tagsList, model.TagDTO{
Tag: model.Tag{Name: tag},
})
}
}
return title, url, timeAdded, tagsList, nil
}
// Checks if the URL already exist, both in bookmark
// file or in database
func handleDuplicates(ctx context.Context, db model.DB, mapURL map[string]struct{}, url string) error {
if _, exists := mapURL[url]; exists {
return errors.New("URL already exists")
}
_, exists, err := db.GetBookmark(ctx, 0, url)
if err != nil {
return fmt.Errorf("Failed getting bookmark, %w", err)
}
if exists {
return errors.New("URL already exists")
}
return nil
}