2019-06-09 15:54:07 +08:00
|
|
|
package warc
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
nurl "net/url"
|
|
|
|
"os"
|
|
|
|
fp "path/filepath"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
|
|
|
|
"go.etcd.io/bbolt"
|
|
|
|
)
|
|
|
|
|
2019-06-09 23:49:25 +08:00
|
|
|
// ArchivalRequest describes a single web page that should be archived.
// The page content may be supplied directly through Reader, or left out
// so that it is fetched from URL instead.
type ArchivalRequest struct {
	// URL is the address of the page being archived.
	URL string

	// Reader optionally supplies the page content.
	Reader io.Reader

	// ContentType is the media type of the content supplied by Reader.
	ContentType string

	// LogEnabled is forwarded to the archiver's LogEnabled setting.
	LogEnabled bool
}
|
|
|
|
|
|
|
|
// NewArchive creates new archive based on submitted request,
|
|
|
|
// then save it to specified path.
|
|
|
|
func NewArchive(req ArchivalRequest, dstPath string) error {
|
2019-06-09 15:54:07 +08:00
|
|
|
// Make sure URL is valid
|
2019-06-09 23:49:25 +08:00
|
|
|
parsedURL, err := nurl.ParseRequestURI(req.URL)
|
2019-06-09 15:54:07 +08:00
|
|
|
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
|
2019-06-09 23:49:25 +08:00
|
|
|
return fmt.Errorf("url %s is not valid", req.URL)
|
2019-06-09 15:54:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Generate resource URL
|
2019-06-09 23:49:25 +08:00
|
|
|
res := archiver.ToResourceURL(req.URL, parsedURL)
|
2019-06-09 15:54:07 +08:00
|
|
|
res.ArchivalURL = "archive-root"
|
|
|
|
|
2019-06-09 23:49:25 +08:00
|
|
|
// Download URL if needed
|
|
|
|
if req.Reader == nil || req.ContentType == "" {
|
|
|
|
resp, err := archiver.DownloadData(res.DownloadURL)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to download %s: %v", req.URL, err)
|
|
|
|
}
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
req.Reader = resp.Body
|
|
|
|
req.ContentType = resp.Header.Get("Content-Type")
|
|
|
|
}
|
|
|
|
|
2019-06-09 15:54:07 +08:00
|
|
|
// Create database for archive
|
|
|
|
os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
|
|
|
|
|
|
|
|
db, err := bbolt.Open(dstPath, os.ModePerm, nil)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to create archive: %v", err)
|
|
|
|
}
|
2019-08-06 21:51:06 +08:00
|
|
|
defer db.Close()
|
2019-06-09 15:54:07 +08:00
|
|
|
|
|
|
|
// Create archiver
|
|
|
|
arc := &archiver.Archiver{
|
|
|
|
DB: db,
|
|
|
|
ChDone: make(chan struct{}),
|
|
|
|
ChErrors: make(chan error),
|
|
|
|
ChWarnings: make(chan error),
|
|
|
|
ChRequest: make(chan archiver.ResourceURL, 10),
|
|
|
|
ResourceMap: make(map[string]struct{}),
|
2019-06-09 23:49:25 +08:00
|
|
|
LogEnabled: req.LogEnabled,
|
2019-06-09 15:54:07 +08:00
|
|
|
}
|
2019-08-12 23:06:36 +08:00
|
|
|
|
|
|
|
// TODO: investigate whether the channel must be closed or not.
|
|
|
|
// At first, I thought the channels must be closed. Unfortunately, it leads to
|
|
|
|
// a panic when error message is accidentally sent after error channels closed.
|
|
|
|
// defer arc.Close()
|
2019-06-09 15:54:07 +08:00
|
|
|
|
|
|
|
// Process input depending on its type.
|
|
|
|
// If it's HTML, we need to extract the sub resources that used by it, e.g some CSS or JS files.
|
|
|
|
// If it's not HTML, we can just save it to archive.
|
|
|
|
var result archiver.ProcessResult
|
|
|
|
var subResources []archiver.ResourceURL
|
|
|
|
|
2019-06-09 23:49:25 +08:00
|
|
|
if strings.Contains(req.ContentType, "text/html") {
|
|
|
|
result, subResources, err = arc.ProcessHTMLFile(res, req.Reader)
|
2019-06-09 15:54:07 +08:00
|
|
|
} else {
|
2019-06-09 23:49:25 +08:00
|
|
|
result, err = arc.ProcessOtherFile(res, req.Reader)
|
2019-06-09 15:54:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("archival failed: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add this url to resource map to mark it as processed
|
|
|
|
arc.ResourceMap[res.DownloadURL] = struct{}{}
|
|
|
|
|
|
|
|
// Save content to storage
|
|
|
|
arc.Logf(0, "Downloaded %s", res.DownloadURL)
|
|
|
|
|
2019-06-09 23:49:25 +08:00
|
|
|
result.ContentType = req.ContentType
|
2019-06-09 15:54:07 +08:00
|
|
|
err = arc.SaveToStorage(result)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// If there are no sub resources found, our job is finished.
|
|
|
|
if len(subResources) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// However, if there are, we need to run the archiver in background to
|
|
|
|
// process the sub resources concurrently.
|
|
|
|
go func() {
|
|
|
|
for _, subRes := range subResources {
|
|
|
|
arc.ChRequest <- subRes
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
|
|
|
time.Sleep(time.Second)
|
|
|
|
arc.StartArchiver()
|
|
|
|
return nil
|
|
|
|
}
|