Mirror of https://github.com/go-shiori/shiori.git (synced 2025-02-22 06:53:22 +08:00)
Add initial archiver
This commit is contained in:
parent 95c8717855
commit 4e38387170
16 changed files with 1524 additions and 38 deletions
go.mod | 4

@@ -20,9 +20,11 @@ require (
 	github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd
 	github.com/sirupsen/logrus v1.4.2
 	github.com/spf13/cobra v0.0.4
+	github.com/tdewolff/parse/v2 v2.3.7
+	go.etcd.io/bbolt v1.3.2
 	golang.org/x/crypto v0.0.0-20190513172903-22d7a77e9e5f
 	golang.org/x/image v0.0.0-20190523035834-f03afa92d3ff // indirect
-	golang.org/x/net v0.0.0-20190522155817-f3200d17e092 // indirect
+	golang.org/x/net v0.0.0-20190522155817-f3200d17e092
 	golang.org/x/sys v0.0.0-20190526052359-791d8a0f4d09 // indirect
 	golang.org/x/tools v0.0.0-20190525145741-7be61e1b0e51 // indirect
 	google.golang.org/appengine v1.6.0 // indirect
go.sum | 6

@@ -80,8 +80,14 @@ github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/tdewolff/parse/v2 v2.3.7 h1:DXoTUgrUE2Eap0m7zg1ljCO5C78vhEi7HTc4YnJWrRk=
+github.com/tdewolff/parse/v2 v2.3.7/go.mod h1:HansaqmN4I/U7L6/tUp0NcwT2tFO0F4EAWYGSDzkYNk=
+github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU=
+github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4=
 github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
 github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
+go.etcd.io/bbolt v1.3.2 h1:Z/90sZLPOeCy2PwprqkFa25PdkusRzaj9P8zm/KNyvk=
+go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
 golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -1,12 +1,17 @@
 package cmd
 
 import (
+	"bytes"
 	"fmt"
+	"io"
+	"net/http"
 	nurl "net/url"
 	fp "path/filepath"
 	"strings"
 	"time"
 
+	"github.com/go-shiori/shiori/pkg/warc"
+
 	"github.com/go-shiori/go-readability"
 	"github.com/go-shiori/shiori/internal/model"
 	"github.com/spf13/cobra"

@@ -73,14 +78,36 @@ func addHandler(cmd *cobra.Command, args []string) {
 		func() {
 			cInfo.Println("Downloading article...")
 
-			resp, err := httpClient.Get(url)
+			// Prepare request
+			req, err := http.NewRequest("GET", url, nil)
 			if err != nil {
 				cError.Printf("Failed to download article: %v\n", err)
 				return
 			}
+
+			// Send request
+			req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
+			resp, err := httpClient.Do(req)
+			if err != nil {
+				cError.Printf("Failed to download article: %v\n", err)
+				return
+			}
 			defer resp.Body.Close()
 
-			article, err := readability.FromReader(resp.Body, url)
+			// Save as archive
+			buffer := bytes.NewBuffer(nil)
+			tee := io.TeeReader(resp.Body, buffer)
+
+			contentType := resp.Header.Get("Content-Type")
+			archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
+			err = warc.FromReader(tee, url, contentType, archivePath)
+			if err != nil {
+				cError.Printf("Failed to create archive: %v\n", err)
+				return
+			}
+
+			// Parse article
+			article, err := readability.FromReader(buffer, url)
 			if err != nil {
 				cError.Printf("Failed to parse article: %v\n", err)
 				return
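The key trick in this hunk is io.TeeReader: the HTTP response body can only be consumed once, but both the archiver and the readability parser need the full payload. A minimal, self-contained sketch of the same pattern (the names and the fake payload here are illustrative, not from the commit):

package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"
)

func main() {
	// Pretend this is an HTTP response body that can only be read once.
	body := strings.NewReader("<html>...</html>")

	// TeeReader copies everything read from body into buffer,
	// so the payload can be consumed a second time afterwards.
	buffer := bytes.NewBuffer(nil)
	tee := io.TeeReader(body, buffer)

	archived, _ := io.ReadAll(tee)  // first consumer (the archiver in the diff above)
	parsed, _ := io.ReadAll(buffer) // second consumer (the readability parser)

	fmt.Println(len(archived) == len(parsed)) // true
}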
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	fp "path/filepath"
+	"strconv"
 	"strings"
 
 	"github.com/spf13/cobra"

@@ -57,18 +58,20 @@ func deleteHandler(cmd *cobra.Command, args []string) {
 		return
 	}
 
-	// Delete thumbnail image from local disk
+	// Delete thumbnail image and archives from local disk
 	if len(ids) == 0 {
 		thumbDir := fp.Join(DataDir, "thumb")
+		archiveDir := fp.Join(DataDir, "archive")
 		os.RemoveAll(thumbDir)
+		os.RemoveAll(archiveDir)
 	} else {
 		for _, id := range ids {
-			imgPath := fp.Join(DataDir, "thumb", fmt.Sprintf("%d.*", id))
-			matchedFiles, _ := fp.Glob(imgPath)
+			strID := strconv.Itoa(id)
+			imgPath := fp.Join(DataDir, "thumb", strID)
+			archivePath := fp.Join(DataDir, "archive", strID)
 
-			for _, f := range matchedFiles {
-				os.Remove(f)
-			}
+			os.Remove(imgPath)
+			os.Remove(archivePath)
 		}
 	}
@@ -2,9 +2,15 @@ package cmd
 
 import (
 	"fmt"
+	"net"
+	"net/http"
+	fp "path/filepath"
+	"strconv"
 	"strings"
 
 	"github.com/go-shiori/shiori/internal/database"
+	"github.com/go-shiori/shiori/pkg/warc"
+	"github.com/julienschmidt/httprouter"
 	"github.com/spf13/cobra"
 )

@@ -20,6 +26,7 @@ func openCmd() *cobra.Command {
 	}
 
 	cmd.Flags().BoolP("yes", "y", false, "Skip confirmation prompt and open ALL bookmarks")
+	cmd.Flags().BoolP("archive", "a", false, "Open the bookmark's archived content")
 	cmd.Flags().BoolP("text-cache", "t", false, "Open the bookmark's text cache in terminal")
 
 	return cmd

@@ -28,8 +35,22 @@
 func openHandler(cmd *cobra.Command, args []string) {
 	// Parse flags
 	skipConfirm, _ := cmd.Flags().GetBool("yes")
+	archiveMode, _ := cmd.Flags().GetBool("archive")
 	textCacheMode, _ := cmd.Flags().GetBool("text-cache")
 
+	// Convert args to ids
+	ids, err := parseStrIndices(args)
+	if err != nil {
+		cError.Println(err)
+		return
+	}
+
+	// If in archive mode, only one bookmark allowed
+	if len(ids) > 1 && archiveMode {
+		cError.Println("In archive mode, only one bookmark allowed")
+		return
+	}
+
 	// If no arguments (i.e all bookmarks will be opened),
 	// confirm to user
 	if len(args) == 0 && !skipConfirm {

@@ -42,13 +63,6 @@ func openHandler(cmd *cobra.Command, args []string) {
 		}
 	}
 
-	// Convert args to ids
-	ids, err := parseStrIndices(args)
-	if err != nil {
-		cError.Println(err)
-		return
-	}
-
 	// Read bookmarks from database
 	getOptions := database.GetBookmarksOptions{
 		IDs: ids,

@@ -62,17 +76,16 @@
 	}
 
 	if len(bookmarks) == 0 {
-		switch {
-		case len(ids) > 0:
+		if len(ids) > 0 {
 			cError.Println("No matching index found")
-		default:
+		} else {
 			cError.Println("No bookmarks saved yet")
 		}
 		return
 	}
 
-	// If not text cache mode, open bookmarks in browser
-	if !textCacheMode {
+	// If not text cache mode nor archive mode, open bookmarks in browser
+	if !textCacheMode && !archiveMode {
 		for _, book := range bookmarks {
 			err = openBrowser(book.URL)
 			if err != nil {

@@ -83,22 +96,74 @@
 		}
 	}
 
 	// Show bookmarks content in terminal
-	termWidth := getTerminalWidth()
-
-	for _, book := range bookmarks {
-		cIndex.Printf("%d. ", book.ID)
-		cTitle.Println(book.Title)
-		fmt.Println()
+	if textCacheMode {
+		termWidth := getTerminalWidth()
 
-		if book.Content == "" {
-			cError.Println("This bookmark doesn't have any cached content")
-		} else {
-			book.Content = strings.Join(strings.Fields(book.Content), " ")
-			fmt.Println(book.Content)
-		}
+		for _, book := range bookmarks {
+			cIndex.Printf("%d. ", book.ID)
+			cTitle.Println(book.Title)
+			fmt.Println()
 
-		fmt.Println()
-		cSymbol.Println(strings.Repeat("=", termWidth))
-		fmt.Println()
+			if book.Content == "" {
+				cError.Println("This bookmark doesn't have any cached content")
+			} else {
+				book.Content = strings.Join(strings.Fields(book.Content), " ")
+				fmt.Println(book.Content)
+			}
+
+			fmt.Println()
+			cSymbol.Println(strings.Repeat("=", termWidth))
+			fmt.Println()
+		}
+	}
+
+	// Open archive
+	id := strconv.Itoa(bookmarks[0].ID)
+	archivePath := fp.Join(DataDir, "archive", id)
+
+	archive, err := warc.Open(archivePath)
+	if err != nil {
+		cError.Printf("Failed to open archive: %v\n", err)
+		return
+	}
+	defer archive.Close()
+
+	// Create simple server
+	router := httprouter.New()
+	router.GET("/*filename", func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
+		filename := ps.ByName("filename")
+		resourceName := fp.Base(filename)
+		if resourceName == "/" {
+			resourceName = ""
+		}
+
+		content, contentType, err := archive.Read(resourceName)
+		if err != nil {
+			panic(err)
+		}
+
+		w.Header().Set("Content-Type", contentType)
+		if _, err = w.Write(content); err != nil {
+			panic(err)
+		}
+	})
+
+	router.PanicHandler = func(w http.ResponseWriter, r *http.Request, arg interface{}) {
+		http.Error(w, fmt.Sprint(arg), 500)
+	}
+
+	// Choose random port
+	listener, err := net.Listen("tcp", ":0")
+	if err != nil {
+		cError.Printf("Failed to serve archive: %v\n", err)
+		return
+	}
+
+	portNumber := listener.Addr().(*net.TCPAddr).Port
+	cInfo.Printf("Archive served in http://localhost:%d\n", portNumber)
+
+	err = http.Serve(listener, router)
+	if err != nil {
+		cError.Printf("Failed to serve archive: %v\n", err)
+	}
+}
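Because every archived resource is stored flat under its archival name, the server only needs the base of the request path to find the right bucket. A small runnable sketch of that mapping, mirroring the handler above (illustrative only; the sample resource name is hypothetical):

package main

import (
	"fmt"
	fp "path/filepath"
)

func resourceName(capturedPath string) string {
	name := fp.Base(capturedPath)
	if name == "/" {
		// warc.Archive.Read treats an empty name as "archive-root",
		// i.e. the originally archived page.
		return ""
	}
	return name
}

func main() {
	fmt.Println(resourceName("/"))                                // "" -> archive-root
	fmt.Println(resourceName("/https-example.com-css-style.css")) // flattened resource name
}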
@@ -1,7 +1,9 @@
 package cmd
 
 import (
+	"crypto/tls"
 	"net/http"
+	"net/http/cookiejar"
 	"time"
 
 	"github.com/go-shiori/shiori/internal/database"

@@ -15,9 +17,22 @@ var (
 	// DataDir is directory for downloaded data
 	DataDir string
 
-	httpClient = &http.Client{Timeout: time.Minute}
+	httpClient *http.Client
 )
 
+func init() {
+	jar, _ := cookiejar.New(nil)
+	httpClient = &http.Client{
+		Timeout: time.Minute,
+		Transport: &http.Transport{
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: true,
+			},
+		},
+		Jar: jar,
+	}
+}
+
 // ShioriCmd returns the root command for shiori
 func ShioriCmd() *cobra.Command {
 	rootCmd := &cobra.Command{
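One thing worth flagging in this hunk: InsecureSkipVerify: true lets archiving proceed on sites with self-signed or broken certificates, but it silently disables TLS authentication for every request the CLI makes. A sketch of a more conservative variant — the newHTTPClient helper and the opt-in flag are hypothetical suggestions, not part of the commit:

package cmd

import (
	"crypto/tls"
	"net/http"
	"net/http/cookiejar"
	"time"
)

// newHTTPClient keeps certificate verification on by default and makes
// the insecure behaviour an explicit opt-in (hypothetical helper, e.g.
// wired to a --skip-tls-verify flag).
func newHTTPClient(skipTLSVerify bool) *http.Client {
	jar, _ := cookiejar.New(nil)
	return &http.Client{
		Timeout: time.Minute,
		Jar:     jar,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: skipTLSVerify},
		},
	}
}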
@@ -1,7 +1,10 @@
 package cmd
 
 import (
+	"bytes"
 	"fmt"
+	"io"
+	"net/http"
 	nurl "net/url"
 	fp "path/filepath"
 	"sort"

@@ -12,6 +15,7 @@ import (
 	"github.com/go-shiori/go-readability"
 	"github.com/go-shiori/shiori/internal/database"
 	"github.com/go-shiori/shiori/internal/model"
+	"github.com/go-shiori/shiori/pkg/warc"
 	"github.com/spf13/cobra"
 )

@@ -139,8 +143,17 @@ func updateHandler(cmd *cobra.Command, args []string) {
 			<-semaphore
 		}()
 
-		// Download article
-		resp, err := httpClient.Get(book.URL)
+		// Prepare request
+		req, err := http.NewRequest("GET", book.URL, nil)
+		if err != nil {
+			chProblem <- book.ID
+			chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
+			return
+		}
+
+		// Send request
+		req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
+		resp, err := httpClient.Do(req)
 		if err != nil {
 			chProblem <- book.ID
 			chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)

@@ -148,7 +161,21 @@ func updateHandler(cmd *cobra.Command, args []string) {
 		}
 		defer resp.Body.Close()
 
-		article, err := readability.FromReader(resp.Body, book.URL)
+		// Save as archive
+		buffer := bytes.NewBuffer(nil)
+		tee := io.TeeReader(resp.Body, buffer)
+
+		contentType := resp.Header.Get("Content-Type")
+		archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
+		err = warc.FromReader(tee, book.URL, contentType, archivePath)
+		if err != nil {
+			chProblem <- book.ID
+			chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err)
+			return
+		}
+
+		// Parse article
+		article, err := readability.FromReader(buffer, book.URL)
 		if err != nil {
 			chProblem <- book.ID
 			chMessage <- fmt.Errorf("Failed to parse %s: %v", book.URL, err)
pkg/warc/internal/archiver/archiver.go | 173 (new file)

@@ -0,0 +1,173 @@
package archiver

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"go.etcd.io/bbolt"
)

// Archiver is a struct for archiving a URL and its resources.
type Archiver struct {
	sync.RWMutex
	sync.WaitGroup

	DB          *bbolt.DB
	ChDone      chan struct{}
	ChErrors    chan error
	ChWarnings  chan error
	ChRequest   chan ResourceURL
	ResourceMap map[string]struct{}
	LogEnabled  bool
}

// Close closes the channels that are used by the Archiver.
func (arc *Archiver) Close() {
	close(arc.ChErrors)
	close(arc.ChWarnings)
	close(arc.ChRequest)
}

// StartArchiver starts the archival process.
func (arc *Archiver) StartArchiver() []error {
	go func() {
		time.Sleep(time.Second)
		arc.Wait()
		close(arc.ChDone)
	}()

	// Download the URLs concurrently. After a download finishes, parse the
	// response to extract the resource URLs inside it, then send those to
	// the channel to be downloaded as well.
	errors := make([]error, 0)
	warnings := make([]error, 0)

	func() {
		for {
			select {
			case <-arc.ChDone:
				return
			case err := <-arc.ChErrors:
				errors = append(errors, err)
			case err := <-arc.ChWarnings:
				warnings = append(warnings, err)
			case res := <-arc.ChRequest:
				arc.RLock()
				_, exist := arc.ResourceMap[res.DownloadURL]
				arc.RUnlock()

				if !exist {
					arc.Add(1)
					go arc.archive(res)
				}
			}
		}
	}()

	// Print log message if required
	if arc.LogEnabled {
		nErrors := len(errors)
		nWarnings := len(warnings)
		arc.Logf(infoLog, "Download finished with %d warnings and %d errors\n", nWarnings, nErrors)

		if nWarnings > 0 {
			fmt.Println()
			for _, warning := range warnings {
				arc.Log(warningLog, warning)
			}
		}

		if nErrors > 0 {
			for _, err := range errors {
				arc.Log(errorLog, err)
			}
		}
	}

	return nil
}

// archive downloads a subresource and saves it to storage.
func (arc *Archiver) archive(res ResourceURL) {
	// Make sure to decrease wait group once finished
	defer arc.Done()

	// Download resource
	resp, err := DownloadData(res.DownloadURL)
	if err != nil {
		arc.ChErrors <- fmt.Errorf("failed to download %s: %v", res.DownloadURL, err)
		return
	}
	defer resp.Body.Close()

	// Process the resource depending on its type.
	// Since this `archive` method is only used for processing subresources,
	// only CSS subresources get further processing; any other file is
	// simply downloaded as it is.
	var result ProcessResult
	var subResources []ResourceURL
	cType := resp.Header.Get("Content-Type")

	switch {
	case strings.Contains(cType, "text/css"):
		result, subResources, err = arc.ProcessCSSFile(res, resp.Body)
	default:
		result, err = arc.ProcessOtherFile(res, resp.Body)
	}

	if err != nil {
		arc.ChErrors <- fmt.Errorf("failed to process %s: %v", res.DownloadURL, err)
		return
	}

	// Add this url to resource map
	arc.Lock()
	arc.ResourceMap[res.DownloadURL] = struct{}{}
	arc.Unlock()

	// Save content to storage
	arc.Logf(infoLog, "Downloaded %s, parent %s", res.DownloadURL, res.Parent)

	result.ContentType = cType
	err = arc.SaveToStorage(result)
	if err != nil {
		arc.ChErrors <- fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
		return
	}

	// Send subresources to the request channel
	for _, subRes := range subResources {
		arc.ChRequest <- subRes
	}
}

// SaveToStorage saves a processing result to storage.
func (arc *Archiver) SaveToStorage(result ProcessResult) error {
	err := arc.DB.Batch(func(tx *bbolt.Tx) error {
		bucket := tx.Bucket([]byte(result.Name))
		if bucket != nil {
			return nil
		}

		bucket, err := tx.CreateBucketIfNotExists([]byte(result.Name))
		if err != nil {
			return err
		}

		err = bucket.Put([]byte("content"), result.Content)
		if err != nil {
			return err
		}

		err = bucket.Put([]byte("type"), []byte(result.ContentType))
		if err != nil {
			return err
		}

		return nil
	})

	return err
}
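StartArchiver's event loop combines a WaitGroup with channels: a worker is added for each unseen resource, and a helper goroutine closes ChDone once the WaitGroup drains (the initial time.Sleep gives the seed requests a chance to arrive first). A stripped-down, runnable sketch of the same pattern — names are illustrative, and unlike the archiver it marks URLs as seen on dequeue to stay short:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var wg sync.WaitGroup
	var mu sync.Mutex
	requests := make(chan string, 10)
	done := make(chan struct{})
	seen := map[string]struct{}{}

	// Close `done` once every in-flight worker has finished.
	go func() {
		time.Sleep(100 * time.Millisecond) // let the seed requests arrive
		wg.Wait()
		close(done)
	}()

	requests <- "a"
	requests <- "b"
	requests <- "a" // duplicate, will be skipped

	for {
		select {
		case <-done:
			fmt.Println("all downloads finished")
			return
		case url := <-requests:
			mu.Lock()
			_, exist := seen[url]
			seen[url] = struct{}{}
			mu.Unlock()

			if !exist {
				wg.Add(1)
				go func(u string) {
					defer wg.Done()
					fmt.Println("download", u)
				}(url)
			}
		}
	}
}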
pkg/warc/internal/archiver/http-client.go | 38 (new file)

@@ -0,0 +1,38 @@
package archiver

import (
	"crypto/tls"
	"net/http"
	"net/http/cookiejar"
	"time"
)

var (
	defaultClient *http.Client
)

func init() {
	jar, _ := cookiejar.New(nil)
	defaultClient = &http.Client{
		Timeout: time.Minute,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				InsecureSkipVerify: true,
			},
		},
		Jar: jar,
	}
}

// DownloadData downloads data from the specified URL.
func DownloadData(url string) (*http.Response, error) {
	// Prepare request
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}

	// Send request
	req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
	return defaultClient.Do(req)
}
pkg/warc/internal/archiver/log.go | 43 (new file)

@@ -0,0 +1,43 @@
package archiver

import "github.com/sirupsen/logrus"

type logType int

const (
	infoLog logType = iota
	errorLog
	warningLog
)

// Log prints the log, ended with a newline.
func (arc *Archiver) Log(tp logType, msgs ...interface{}) {
	if !arc.LogEnabled {
		return
	}

	switch tp {
	case errorLog:
		logrus.Errorln(msgs...)
	case warningLog:
		logrus.Warnln(msgs...)
	default:
		logrus.Infoln(msgs...)
	}
}

// Logf prints the log with the specified format.
func (arc *Archiver) Logf(tp logType, format string, msgs ...interface{}) {
	if !arc.LogEnabled {
		return
	}

	switch tp {
	case errorLog:
		logrus.Errorf(format, msgs...)
	case warningLog:
		logrus.Warnf(format, msgs...)
	default:
		logrus.Infof(format, msgs...)
	}
}
pkg/warc/internal/archiver/processor.go | 468 (new file)

@@ -0,0 +1,468 @@
package archiver

import (
	"bytes"
	"fmt"
	"io"
	nurl "net/url"
	"regexp"
	"strings"

	"github.com/tdewolff/parse/v2/css"
	"github.com/tdewolff/parse/v2/js"
	"golang.org/x/net/html"
)

// ProcessResult is the result from content processing.
type ProcessResult struct {
	Name        string
	ContentType string
	Content     []byte
}

var (
	rxImageMeta       = regexp.MustCompile(`(?i)image|thumbnail`)
	rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
	rxLazyImageSrc    = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
)

// ProcessHTMLFile processes an HTML file submitted through the io.Reader.
func (arc *Archiver) ProcessHTMLFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
	// Parse HTML document
	doc, err := html.Parse(input)
	if err != nil {
		return ProcessResult{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", res.DownloadURL, err)
	}

	// Parse URL
	parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
		return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
	}

	// Convert lazy loaded images to normal ones
	fixLazyImages(doc)

	// Convert hyperlinks with relative URLs
	fixRelativeURIs(doc, parsedURL)

	// Extract resources from each node
	for _, node := range getElementsByTagName(doc, "*") {
		// First extract resources from inline style
		cssResources := extractInlineCSS(node, parsedURL)
		resources = append(resources, cssResources...)

		// Next extract resources from the tag's specific attribute
		nodeResources := []ResourceURL{}
		switch tagName(node) {
		case "style":
			nodeResources = extractStyleTag(node, parsedURL)
		case "script":
			nodeResources = extractScriptTag(node, parsedURL)
		case "meta":
			nodeResources = extractMetaTag(node, parsedURL)
		case "img", "picture", "figure", "video", "audio", "source":
			nodeResources = extractMediaTag(node, parsedURL)
		case "link":
			nodeResources = extractGenericTag(node, "href", parsedURL)
		case "iframe":
			nodeResources = extractGenericTag(node, "src", parsedURL)
		case "object":
			nodeResources = extractGenericTag(node, "data", parsedURL)
		default:
			continue
		}
		resources = append(resources, nodeResources...)
	}

	// Get outer HTML of the doc
	result = ProcessResult{
		Name:    res.ArchivalURL,
		Content: outerHTML(doc),
	}

	return result, resources, nil
}

// ProcessCSSFile processes a CSS file submitted through the io.Reader.
func (arc *Archiver) ProcessCSSFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
	// Parse URL
	parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
		return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
	}

	// Extract CSS rules
	rules, resources := processCSS(input, parsedURL)

	result = ProcessResult{
		Name:    res.ArchivalURL,
		Content: []byte(rules),
	}

	return result, resources, nil
}

// ProcessOtherFile processes files that are not HTML, JS or CSS, submitted through the io.Reader.
func (arc *Archiver) ProcessOtherFile(res ResourceURL, input io.Reader) (result ProcessResult, err error) {
	// Copy data to buffer
	buffer := bytes.NewBuffer(nil)

	_, err = io.Copy(buffer, input)
	if err != nil {
		return ProcessResult{}, fmt.Errorf("failed to copy data: %v", err)
	}

	// Create result
	result = ProcessResult{
		Name:    res.ArchivalURL,
		Content: buffer.Bytes(),
	}

	return result, nil
}

// fixRelativeURIs converts each <a> in the given element
// to an absolute URI, ignoring #ref URIs.
func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) {
	links := getAllNodesWithTag(doc, "a")
	forEachNode(links, func(link *html.Node, _ int) {
		href := getAttribute(link, "href")
		if href == "" {
			return
		}

		// Replace links with javascript: URIs with text content,
		// since they won't work after scripts have been removed
		// from the page.
		if strings.HasPrefix(href, "javascript:") {
			text := createTextNode(textContent(link))
			replaceNode(link, text)
		} else {
			newHref := toAbsoluteURI(href, pageURL)
			if newHref == "" {
				removeAttribute(link, "href")
			} else {
				setAttribute(link, "href", newHref)
			}
		}
	})
}

// fixLazyImages converts images and figures that have properties like data-src into
// images that can be loaded without JS.
func fixLazyImages(root *html.Node) {
	imageNodes := getAllNodesWithTag(root, "img", "picture", "figure")
	forEachNode(imageNodes, func(elem *html.Node, _ int) {
		src := getAttribute(elem, "src")
		srcset := getAttribute(elem, "srcset")
		nodeTag := tagName(elem)
		nodeClass := className(elem)

		if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") {
			for i := 0; i < len(elem.Attr); i++ {
				attr := elem.Attr[i]
				if attr.Key == "src" || attr.Key == "srcset" {
					continue
				}

				copyTo := ""
				if rxLazyImageSrcset.MatchString(attr.Val) {
					copyTo = "srcset"
				} else if rxLazyImageSrc.MatchString(attr.Val) {
					copyTo = "src"
				}

				if copyTo == "" {
					continue
				}

				if nodeTag == "img" || nodeTag == "picture" {
					// if this is an img or picture, set the attribute directly
					setAttribute(elem, copyTo, attr.Val)
				} else if nodeTag == "figure" && len(getAllNodesWithTag(elem, "img", "picture")) == 0 {
					// if the item is a <figure> that does not contain an image or picture,
					// create one and place it inside the figure; see the nytimes-3
					// testcase for an example
					img := createElement("img")
					setAttribute(img, copyTo, attr.Val)
					appendChild(elem, img)
				}
			}
		}
	})
}

// extractInlineCSS extracts the archive's resources from the CSS rules inside
// a style attribute. Once finished, all CSS URLs in the style attribute
// will be updated to use the archival URL.
func extractInlineCSS(node *html.Node, pageURL *nurl.URL) []ResourceURL {
	// Make sure this node has inline style
	styleAttr := getAttribute(node, "style")
	if styleAttr == "" {
		return nil
	}

	// Extract resource URLs from the inline style
	// and update the CSS rules accordingly.
	reader := strings.NewReader(styleAttr)
	newStyleAttr, resources := processCSS(reader, pageURL)
	setAttribute(node, "style", newStyleAttr)

	return resources
}

// extractStyleTag extracts the archive's resources from inside a <style> tag.
// Once finished, all CSS URLs will be updated to use the archival URL.
func extractStyleTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
	// Extract CSS rules from <style>
	rules := textContent(node)
	rules = strings.TrimSpace(rules)
	if rules == "" {
		return nil
	}

	// Extract resource URLs from the rules and update them accordingly.
	reader := strings.NewReader(rules)
	newRules, resources := processCSS(reader, pageURL)
	setTextContent(node, newRules)

	return resources
}

// extractScriptTag extracts the archive's resources from inside a <script> tag.
// Once finished, all URLs inside it will be updated to use the archival URL.
func extractScriptTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
	// Also get the URL from the `src` attribute
	resources := extractGenericTag(node, "src", pageURL)

	// Extract JS code from the <script> itself
	script := textContent(node)
	script = strings.TrimSpace(script)
	if script == "" {
		return resources
	}

	reader := strings.NewReader(script)
	newScript, scriptResources := processJS(reader, pageURL)
	setTextContent(node, newScript)
	resources = append(resources, scriptResources...)

	return resources
}

// extractMetaTag extracts the archive's resources from inside a <meta>.
// Normally a <meta> doesn't carry any resource URLs. However, as social
// media has grown, new metadata has been added to contain the hero image
// for a web page, e.g. og:image, twitter:image, etc.
// Once finished, all image URLs in <meta> will be updated
// to use the archival URL.
func extractMetaTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
	// Get the needed attributes
	name := getAttribute(node, "name")
	property := getAttribute(node, "property")
	content := getAttribute(node, "content")

	// If this <meta> is not for an image, don't process it
	if !rxImageMeta.MatchString(name + " " + property) {
		return nil
	}

	// If the URL is not valid, skip
	tmp, err := nurl.ParseRequestURI(content)
	if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
		return nil
	}

	// Create archive resource and update the href URL
	res := ToResourceURL(content, pageURL)
	if res.ArchivalURL == "" {
		return nil
	}

	setAttribute(node, "content", res.ArchivalURL)
	return []ResourceURL{res}
}

// extractMediaTag extracts resources from inside a media tag, e.g.
// <img>, <video>, <audio>, <source>. Once finished, all URLs will be
// updated to use the archival URL.
func extractMediaTag(node *html.Node, pageURL *nurl.URL) []ResourceURL {
	// Get the needed attributes
	src := getAttribute(node, "src")
	poster := getAttribute(node, "poster")
	strSrcSets := getAttribute(node, "srcset")

	// Create initial resources
	resources := []ResourceURL{}

	// Save `src` and `poster` to resources
	if src != "" {
		res := ToResourceURL(src, pageURL)
		if res.ArchivalURL != "" {
			setAttribute(node, "src", res.ArchivalURL)
			resources = append(resources, res)
		}
	}

	if poster != "" {
		res := ToResourceURL(poster, pageURL)
		if res.ArchivalURL != "" {
			setAttribute(node, "poster", res.ArchivalURL)
			resources = append(resources, res)
		}
	}

	// Split srcset by comma, then process each entry like any URL
	srcSets := strings.Split(strSrcSets, ",")
	for i, srcSet := range srcSets {
		srcSet = strings.TrimSpace(srcSet)
		parts := strings.SplitN(srcSet, " ", 2)
		if parts[0] == "" {
			continue
		}

		res := ToResourceURL(parts[0], pageURL)
		if res.ArchivalURL == "" {
			continue
		}

		srcSets[i] = strings.Replace(srcSets[i], parts[0], res.ArchivalURL, 1)
		resources = append(resources, res)
	}

	if len(srcSets) > 0 {
		setAttribute(node, "srcset", strings.Join(srcSets, ","))
	}

	return resources
}

// extractGenericTag extracts a resource from the specified attribute.
// This is used for tags where the URL obviously exists in the tag,
// without any additional processing needed to extract it; for example
// <link> with its href, <object> with its data, etc.
// Once finished, the URL attribute will be updated to use the
// archival URL.
func extractGenericTag(node *html.Node, attrName string, pageURL *nurl.URL) []ResourceURL {
	// Get the needed attributes
	attrValue := getAttribute(node, attrName)
	if attrValue == "" {
		return nil
	}

	res := ToResourceURL(attrValue, pageURL)
	if res.ArchivalURL == "" {
		return nil
	}

	setAttribute(node, attrName, res.ArchivalURL)
	return []ResourceURL{res}
}

// processCSS extracts resource URLs from the specified CSS input.
// Returns the new rules with all CSS URLs updated to the archival link.
func processCSS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
	// Prepare buffer
	buffer := bytes.NewBuffer(nil)

	// Scan the CSS file and process the resources' URLs
	lexer := css.NewLexer(input)
	resources := []ResourceURL{}

	for {
		token, bt := lexer.Next()

		// Check for error
		if token == css.ErrorToken {
			break
		}

		// If it's not a URL, just write it to the buffer as it is
		if token != css.URLToken {
			buffer.Write(bt)
			continue
		}

		// Sanitize the URL by removing `url()`, quotation marks and trailing slashes
		cssURL := string(bt)
		cssURL = rxStyleURL.ReplaceAllString(cssURL, "$1")
		cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
		cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")

		// Save the CSS URL and replace it with the archival URL
		res := ToResourceURL(cssURL, baseURL)
		if res.ArchivalURL == "" {
			continue
		}

		cssURL = `url("` + res.ArchivalURL + `")`
		buffer.WriteString(cssURL)
		resources = append(resources, res)
	}

	// Return the new rules after all URLs have been processed
	return buffer.String(), resources
}

// processJS extracts resource URLs from the specified JS input.
// Returns the new code with all URLs updated to the archival link.
func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
	// Prepare buffer
	buffer := bytes.NewBuffer(nil)

	// Scan the JS file and process the resources' URLs
	lexer := js.NewLexer(input)
	resources := []ResourceURL{}

	for {
		token, bt := lexer.Next()

		// Check for error
		if token == js.ErrorToken {
			break
		}

		// If it's not a string, just write it to the buffer as it is
		if token != js.StringToken {
			buffer.Write(bt)
			continue
		}

		// Process the string.
		// Unlike CSS, JS doesn't have its own URL token, so we can only guess
		// whether a string is a URL or not. There are three criteria to decide
		// whether it's a URL:
		// - it starts with http(s):// for an absolute URL
		// - it starts with a slash (/) for a relative URL
		// - it is surrounded by `url()` just like in CSS
		// If it doesn't fulfill any of the criteria above, just write it as it is.
		var res ResourceURL
		var newURL string

		text := string(bt)
		text = rxSingleQuote.ReplaceAllString(text, "$1")
		text = rxDoubleQuote.ReplaceAllString(text, "$1")

		if strings.HasPrefix(text, "url(") {
			cssURL := rxStyleURL.ReplaceAllString(text, "$1")
			cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
			cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")

			res = ToResourceURL(cssURL, baseURL)
			newURL = fmt.Sprintf("\"url('%s')\"", res.ArchivalURL)
		} else {
			buffer.Write(bt)
			continue
		}

		if res.ArchivalURL == "" {
			continue
		}

		buffer.WriteString(newURL)
		resources = append(resources, res)
	}

	// Return the new code after all URLs have been processed
	return buffer.String(), resources
}
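To make the CSS rewriting concrete, here is a hand-traced example derived from the code above, not output from a real run (written as if inside this package, since processCSS is unexported):

// Suppose the stylesheet was fetched from https://example.com/css/style.css.
base, _ := nurl.Parse("https://example.com/css/style.css")
rules, resources := processCSS(strings.NewReader(
	`body { background: url("../img/bg.png"); }`), base)

// rules     == `body { background: url("https-example.com-img-bg.png"); }`
// resources == one ResourceURL with
//   DownloadURL: "https://example.com/img/bg.png"
//   ArchivalURL: "https-example.com-img-bg.png"

The lexer passes every non-URL token through verbatim, so only the url(...) token is rewritten to the flattened archival name under which the image will later be stored.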
pkg/warc/internal/archiver/resource-url.go | 50 (new file)

@@ -0,0 +1,50 @@
package archiver

import (
	nurl "net/url"
	"regexp"
	"strings"
)

var (
	rxHTTPScheme    = regexp.MustCompile(`(?i)^https?:\/{2}`)
	rxTrailingSlash = regexp.MustCompile(`(?i)/+$`)
	rxRepeatedStrip = regexp.MustCompile(`(?i)-+`)
)

// ResourceURL is a struct that contains the URLs for downloading
// and archiving a resource.
type ResourceURL struct {
	DownloadURL string
	ArchivalURL string
	Parent      string
}

// ToResourceURL converts a URI into a ResourceURL.
func ToResourceURL(uri string, base *nurl.URL) ResourceURL {
	// Make sure the URL has a valid scheme
	uri = strings.TrimSpace(uri)
	switch {
	case uri == "",
		strings.Contains(uri, ":") && !rxHTTPScheme.MatchString(uri):
		return ResourceURL{}
	}

	// Create archive URL
	downloadURL := toAbsoluteURI(uri, base)
	downloadURL = rxTrailingSlash.ReplaceAllString(downloadURL, "")
	downloadURL = strings.ReplaceAll(downloadURL, " ", "+")

	archivalURL := strings.Replace(downloadURL, "://", "/", 1)
	archivalURL = strings.ReplaceAll(archivalURL, "?", "-")
	archivalURL = strings.ReplaceAll(archivalURL, "#", "-")
	archivalURL = strings.ReplaceAll(archivalURL, "/", "-")
	archivalURL = strings.ReplaceAll(archivalURL, " ", "-")
	archivalURL = rxRepeatedStrip.ReplaceAllString(archivalURL, "-")

	return ResourceURL{
		DownloadURL: downloadURL,
		ArchivalURL: archivalURL,
		Parent:      base.String(),
	}
}
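Hand-traced examples of the filtering and flattening above (written as if inside this package; the URLs are made up):

base, _ := nurl.Parse("https://example.com/post/hello")

ToResourceURL("mailto:user@example.com", base)     // => ResourceURL{} (non-HTTP scheme)
ToResourceURL("data:image/png;base64,AAAA", base)  // => ResourceURL{} (contains ":" and is not http(s))

res := ToResourceURL("/assets/app.js?v=1.2", base)
// res.DownloadURL == "https://example.com/assets/app.js?v=1.2"
// res.ArchivalURL == "https-example.com-assets-app.js-v=1.2"
// res.Parent      == "https://example.com/post/hello"

So every resource collapses to a flat, filesystem-friendly name: the scheme separator, slashes, query marks and hashes all become dashes, which is exactly the bucket name the archiver stores it under.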
pkg/warc/internal/archiver/utils-dom.go | 334 (new file)

@@ -0,0 +1,334 @@
package archiver

import (
	"bytes"
	"strings"

	"golang.org/x/net/html"
)

// getElementsByTagName returns a collection of all elements in the document with
// the specified tag name, as an array of Node objects.
// The special tag "*" represents all elements.
func getElementsByTagName(doc *html.Node, tagName string) []*html.Node {
	var results []*html.Node
	var finder func(*html.Node)

	finder = func(node *html.Node) {
		if node.Type == html.ElementNode && (tagName == "*" || node.Data == tagName) {
			results = append(results, node)
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			finder(child)
		}
	}

	for child := doc.FirstChild; child != nil; child = child.NextSibling {
		finder(child)
	}

	return results
}

// createElement creates a new ElementNode with the specified tag.
func createElement(tagName string) *html.Node {
	return &html.Node{
		Type: html.ElementNode,
		Data: tagName,
	}
}

// createTextNode creates a new Text node.
func createTextNode(data string) *html.Node {
	return &html.Node{
		Type: html.TextNode,
		Data: data,
	}
}

// tagName returns the tag name of a Node.
// If it's not an ElementNode, it returns an empty string.
func tagName(node *html.Node) string {
	if node.Type != html.ElementNode {
		return ""
	}
	return node.Data
}

// getAttribute returns the value of a specified attribute on
// the element. If the given attribute does not exist, the value
// returned will be an empty string.
func getAttribute(node *html.Node, attrName string) string {
	for i := 0; i < len(node.Attr); i++ {
		if node.Attr[i].Key == attrName {
			return node.Attr[i].Val
		}
	}
	return ""
}

// setAttribute sets an attribute for the node. If the attribute already exists,
// it will be replaced.
func setAttribute(node *html.Node, attrName string, attrValue string) {
	attrIdx := -1
	for i := 0; i < len(node.Attr); i++ {
		if node.Attr[i].Key == attrName {
			attrIdx = i
			break
		}
	}

	if attrIdx >= 0 {
		node.Attr[attrIdx].Val = attrValue
	} else {
		node.Attr = append(node.Attr, html.Attribute{
			Key: attrName,
			Val: attrValue,
		})
	}
}

// removeAttribute removes the attribute with the given name.
func removeAttribute(node *html.Node, attrName string) {
	attrIdx := -1
	for i := 0; i < len(node.Attr); i++ {
		if node.Attr[i].Key == attrName {
			attrIdx = i
			break
		}
	}

	if attrIdx >= 0 {
		a := node.Attr
		a = append(a[:attrIdx], a[attrIdx+1:]...)
		node.Attr = a
	}
}

// hasAttribute returns a Boolean value indicating whether the
// specified node has the specified attribute or not.
func hasAttribute(node *html.Node, attrName string) bool {
	for i := 0; i < len(node.Attr); i++ {
		if node.Attr[i].Key == attrName {
			return true
		}
	}
	return false
}

// textContent returns the text content of the specified node
// and all its descendants.
func textContent(node *html.Node) string {
	var buffer bytes.Buffer
	var finder func(*html.Node)

	finder = func(n *html.Node) {
		if n.Type == html.TextNode {
			buffer.WriteString(n.Data)
		}

		for child := n.FirstChild; child != nil; child = child.NextSibling {
			finder(child)
		}
	}

	finder(node)
	return buffer.String()
}

// outerHTML returns an HTML serialization of the element and its descendants.
func outerHTML(node *html.Node) []byte {
	var buffer bytes.Buffer
	err := html.Render(&buffer, node)
	if err != nil {
		return []byte{}
	}
	return buffer.Bytes()
}

// innerHTML returns the HTML content (inner HTML) of an element.
func innerHTML(node *html.Node) string {
	var err error
	var buffer bytes.Buffer

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		err = html.Render(&buffer, child)
		if err != nil {
			return ""
		}
	}

	return strings.TrimSpace(buffer.String())
}

// documentElement returns the Element that is the root element
// of the document. Since we are working with HTML documents,
// the root will be the <html> element.
func documentElement(doc *html.Node) *html.Node {
	if nodes := getElementsByTagName(doc, "html"); len(nodes) > 0 {
		return nodes[0]
	}
	return nil
}

// id returns the value of the id attribute of the specified element.
func id(node *html.Node) string {
	id := getAttribute(node, "id")
	id = strings.TrimSpace(id)
	return id
}

// className returns the value of the class attribute of
// the specified element.
func className(node *html.Node) string {
	className := getAttribute(node, "class")
	className = strings.TrimSpace(className)
	className = strings.Join(strings.Fields(className), " ")
	return className
}

// children returns an HTMLCollection of the child elements of the Node.
func children(node *html.Node) []*html.Node {
	var children []*html.Node
	if node == nil {
		return nil
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if child.Type == html.ElementNode {
			children = append(children, child)
		}
	}
	return children
}

// childNodes returns a list of a node's direct children.
func childNodes(node *html.Node) []*html.Node {
	var childNodes []*html.Node
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		childNodes = append(childNodes, child)
	}
	return childNodes
}

// firstElementChild returns the object's first child Element,
// or nil if there are no child elements.
func firstElementChild(node *html.Node) *html.Node {
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if child.Type == html.ElementNode {
			return child
		}
	}
	return nil
}

// nextElementSibling returns the Element immediately following
// the specified one in its parent's children list, or nil if the
// specified Element is the last one in the list.
func nextElementSibling(node *html.Node) *html.Node {
	for sibling := node.NextSibling; sibling != nil; sibling = sibling.NextSibling {
		if sibling.Type == html.ElementNode {
			return sibling
		}
	}
	return nil
}

// appendChild adds a node to the end of the list of children of a
// specified parent node. If the given child is a reference to an
// existing node in the document, appendChild() moves it from its
// current position to the new position.
func appendChild(node *html.Node, child *html.Node) {
	if child.Parent != nil {
		temp := cloneNode(child)
		node.AppendChild(temp)
		child.Parent.RemoveChild(child)
	} else {
		node.AppendChild(child)
	}
}

// replaceNode replaces an OldNode with a NewNode.
func replaceNode(oldNode *html.Node, newNode *html.Node) {
	if oldNode.Parent == nil {
		return
	}

	newNode.Parent = nil
	newNode.PrevSibling = nil
	newNode.NextSibling = nil
	oldNode.Parent.InsertBefore(newNode, oldNode)
	oldNode.Parent.RemoveChild(oldNode)
}

// includeNode determines if node is included inside nodeList.
func includeNode(nodeList []*html.Node, node *html.Node) bool {
	for i := 0; i < len(nodeList); i++ {
		if nodeList[i] == node {
			return true
		}
	}
	return false
}

// cloneNode returns a deep clone of the node and its children.
// However, it will be detached from the original's parents
// and siblings.
func cloneNode(src *html.Node) *html.Node {
	clone := &html.Node{
		Type:     src.Type,
		DataAtom: src.DataAtom,
		Data:     src.Data,
		Attr:     make([]html.Attribute, len(src.Attr)),
	}

	copy(clone.Attr, src.Attr)
	for child := src.FirstChild; child != nil; child = child.NextSibling {
		clone.AppendChild(cloneNode(child))
	}

	return clone
}

// getAllNodesWithTag returns all nodes that match any of the given tag names.
func getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
	var result []*html.Node
	for i := 0; i < len(tagNames); i++ {
		result = append(result, getElementsByTagName(node, tagNames[i])...)
	}
	return result
}

// forEachNode iterates over a NodeList and runs fn on each node.
func forEachNode(nodeList []*html.Node, fn func(*html.Node, int)) {
	for i := 0; i < len(nodeList); i++ {
		fn(nodeList[i], i)
	}
}

// removeNodes iterates over a NodeList, calls `filterFn` for each node
// and removes the node if the function returned `true`. If no function is
// passed, it removes all the nodes in the node list.
func removeNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) {
	for i := len(nodeList) - 1; i >= 0; i-- {
		node := nodeList[i]
		parentNode := node.Parent
		if parentNode != nil && (filterFn == nil || filterFn(node)) {
			parentNode.RemoveChild(node)
		}
	}
}

// setTextContent sets the text content of the specified node.
func setTextContent(node *html.Node, text string) {
	// Remove existing children first. Grab the next sibling before removing,
	// since RemoveChild clears the sibling pointers.
	for child := node.FirstChild; child != nil; {
		next := child.NextSibling
		node.RemoveChild(child)
		child = next
	}

	node.AppendChild(&html.Node{
		Type: html.TextNode,
		Data: text,
	})
}
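A quick sketch of how these helpers compose, written as if inside this package (the fragment and function name are illustrative): it performs by hand the same promotion of a lazy-load attribute that fixLazyImages does.

package archiver

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func exampleDOM() {
	doc, _ := html.Parse(strings.NewReader(
		`<html><body><img class="lazy" data-src="pic.jpg"></body></html>`))

	// Copy data-src into src on every <img> that lacks one.
	for _, img := range getElementsByTagName(doc, "img") {
		if getAttribute(img, "src") == "" {
			setAttribute(img, "src", getAttribute(img, "data-src"))
		}
	}

	// The serialized document now carries src="pic.jpg" on the <img> node.
	fmt.Println(string(outerHTML(doc)))
}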
pkg/warc/internal/archiver/utils.go | 54 (new file)

@@ -0,0 +1,54 @@
package archiver

import (
	nurl "net/url"
	"regexp"
	"strings"
)

var (
	rxStyleURL      = regexp.MustCompile(`(?i)^url\((.+)\)$`)
	rxSingleQuote   = regexp.MustCompile(`(?i)^'(.*)'$`)
	rxDoubleQuote   = regexp.MustCompile(`(?i)^"(.*)"$`)
	rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`)
)

// clearUTMParams removes all utm_* query parameters from the URL.
func clearUTMParams(url *nurl.URL) {
	queries := url.Query()

	for key := range queries {
		if strings.HasPrefix(key, "utm_") {
			queries.Del(key)
		}
	}

	url.RawQuery = queries.Encode()
}

// toAbsoluteURI converts uri to an absolute path based on base.
// However, if uri is prefixed with hash (#), the uri won't be changed.
func toAbsoluteURI(uri string, base *nurl.URL) string {
	if uri == "" || base == nil {
		return ""
	}

	// If it is a hash tag, return it as it is
	if uri[:1] == "#" {
		return uri
	}

	// If it is already an absolute URL, return it as it is
	tmp, err := nurl.ParseRequestURI(uri)
	if err == nil && tmp.Scheme != "" && tmp.Hostname() != "" {
		return uri
	}

	// Otherwise, resolve against the base URI.
	tmp, err = nurl.Parse(uri)
	if err != nil {
		return uri
	}

	clearUTMParams(tmp)
	return base.ResolveReference(tmp).String()
}
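Hand-traced examples of the resolution rules above (as if inside this package; the URLs are made up):

base, _ := nurl.Parse("https://example.com/blog/post")

toAbsoluteURI("#comments", base)
// => "#comments" (fragment references are kept as-is)

toAbsoluteURI("../img/pic.jpg?utm_source=feed&id=4", base)
// => "https://example.com/img/pic.jpg?id=4"
//    (resolved against base, utm_* tracking parameters stripped)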
pkg/warc/reader.go | 76 (new file)

@@ -0,0 +1,76 @@
package warc

import (
	"fmt"
	"os"

	"go.etcd.io/bbolt"
)

// Archive is the storage for archiving the web page.
type Archive struct {
	db *bbolt.DB
}

// Open opens the archive from the specified path.
func Open(path string) (*Archive, error) {
	// Make sure the archive exists. Checking err before info also avoids a
	// nil dereference when Stat fails for reasons other than non-existence.
	info, err := os.Stat(path)
	if err != nil || info.IsDir() {
		return nil, fmt.Errorf("archive doesn't exist")
	}

	// Open database
	options := &bbolt.Options{
		ReadOnly: true,
	}

	db, err := bbolt.Open(path, os.ModePerm, options)
	if err != nil {
		return nil, err
	}

	return &Archive{db: db}, nil
}

// Close closes the storage.
func (arc *Archive) Close() {
	arc.db.Close()
}

// Read fetches the resource with the specified name from the archive.
func (arc *Archive) Read(name string) ([]byte, string, error) {
	// An empty name refers to the archive root
	if name == "" {
		name = "archive-root"
	}

	var content []byte
	var strContentType string

	err := arc.db.View(func(tx *bbolt.Tx) error {
		bucket := tx.Bucket([]byte(name))
		if bucket == nil {
			return fmt.Errorf("%s doesn't exist", name)
		}

		contentType := bucket.Get([]byte("type"))
		if contentType == nil {
			return fmt.Errorf("%s doesn't exist", name)
		}
		strContentType = string(contentType)

		content = bucket.Get([]byte("content"))
		if content == nil {
			return fmt.Errorf("%s doesn't exist", name)
		}

		return nil
	})

	if err != nil {
		return nil, "", err
	}

	return content, strContentType, nil
}
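Reading an archive back is a two-call affair through the exported API. A minimal usage sketch — the archive path here is illustrative; point it at a file previously created by warc.FromURL or warc.FromReader:

package main

import (
	"fmt"
	"log"

	"github.com/go-shiori/shiori/pkg/warc"
)

func main() {
	// Hypothetical path; adjust to a real archive on your machine.
	archive, err := warc.Open("data/archive/1")
	if err != nil {
		log.Fatal(err)
	}
	defer archive.Close()

	// An empty name resolves to "archive-root", i.e. the archived page itself.
	content, contentType, err := archive.Read("")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s, %d bytes\n", contentType, len(content))
}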
pkg/warc/writer.go | 105 (new file)

@@ -0,0 +1,105 @@
package warc

import (
	"fmt"
	"io"
	nurl "net/url"
	"os"
	fp "path/filepath"
	"strings"
	"time"

	"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
	"go.etcd.io/bbolt"
)

// FromReader creates an archive from the specified io.Reader.
func FromReader(input io.Reader, url, contentType, dstPath string) error {
	// Make sure the URL is valid
	parsedURL, err := nurl.ParseRequestURI(url)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
		return fmt.Errorf("url %s is not valid", url)
	}

	// Generate resource URL
	res := archiver.ToResourceURL(url, parsedURL)
	res.ArchivalURL = "archive-root"

	// Create database for the archive
	os.MkdirAll(fp.Dir(dstPath), os.ModePerm)

	db, err := bbolt.Open(dstPath, os.ModePerm, nil)
	if err != nil {
		return fmt.Errorf("failed to create archive: %v", err)
	}

	// Create archiver
	arc := &archiver.Archiver{
		DB:          db,
		ChDone:      make(chan struct{}),
		ChErrors:    make(chan error),
		ChWarnings:  make(chan error),
		ChRequest:   make(chan archiver.ResourceURL, 10),
		ResourceMap: make(map[string]struct{}),
		LogEnabled:  true,
	}
	defer arc.Close()

	// Process the input depending on its type.
	// If it's HTML, we need to extract the subresources it uses, e.g. CSS or JS files.
	// If it's not HTML, we can just save it to the archive.
	var result archiver.ProcessResult
	var subResources []archiver.ResourceURL

	if strings.Contains(contentType, "text/html") {
		result, subResources, err = arc.ProcessHTMLFile(res, input)
	} else {
		result, err = arc.ProcessOtherFile(res, input)
	}

	if err != nil {
		return fmt.Errorf("archival failed: %v", err)
	}

	// Add this url to the resource map to mark it as processed
	arc.ResourceMap[res.DownloadURL] = struct{}{}

	// Save content to storage
	arc.Logf(0, "Downloaded %s", res.DownloadURL)

	result.ContentType = contentType
	err = arc.SaveToStorage(result)
	if err != nil {
		return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
	}

	// If there are no subresources found, our job is finished.
	if len(subResources) == 0 {
		return nil
	}

	// However, if there are, we need to run the archiver in the background to
	// process the subresources concurrently.
	go func() {
		for _, subRes := range subResources {
			arc.ChRequest <- subRes
		}
	}()

	time.Sleep(time.Second)
	arc.StartArchiver()
	return nil
}

// FromURL creates an archive from the specified URL.
func FromURL(url, dstPath string) error {
	// Download URL
	resp, err := archiver.DownloadData(url)
	if err != nil {
		return fmt.Errorf("failed to download %s: %v", url, err)
	}
	defer resp.Body.Close()

	contentType := resp.Header.Get("Content-Type")
	return FromReader(resp.Body, url, contentType, dstPath)
}
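FromURL is the one-call entry point the rest of the codebase (and external users) can build on. A minimal usage sketch — the destination path is illustrative:

package main

import (
	"log"

	"github.com/go-shiori/shiori/pkg/warc"
)

func main() {
	// Download https://example.com and archive it, together with its
	// subresources, into a single bbolt file.
	err := warc.FromURL("https://example.com", "data/archive/example")
	if err != nil {
		log.Fatal(err)
	}
}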