Refactor archiver

This commit is contained in:
Radhi Fadlillah 2019-06-09 22:49:25 +07:00
parent 89cc8caa23
commit 94b59a29a7
3 changed files with 47 additions and 30 deletions

View file

@ -96,11 +96,15 @@ func addHandler(cmd *cobra.Command, args []string) {
// Save as archive // Save as archive
buffer := bytes.NewBuffer(nil) buffer := bytes.NewBuffer(nil)
tee := io.TeeReader(resp.Body, buffer)
contentType := resp.Header.Get("Content-Type")
archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID)) archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
err = warc.FromReader(tee, url, contentType, archivePath) archivalRequest := warc.ArchivalRequest{
URL: url,
Reader: io.TeeReader(resp.Body, buffer),
ContentType: resp.Header.Get("Content-Type"),
}
err = warc.NewArchive(archivalRequest, archivePath)
if err != nil { if err != nil {
cError.Printf("Failed to create archive: %v\n", err) cError.Printf("Failed to create archive: %v\n", err)
return return

View file

@ -180,14 +180,18 @@ func updateHandler(cmd *cobra.Command, args []string) {
defer resp.Body.Close() defer resp.Body.Close()
// Save as archive, make sure to delete the old one first // Save as archive, make sure to delete the old one first
buffer := bytes.NewBuffer(nil)
archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID)) archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
os.Remove(archivePath) os.Remove(archivePath)
buffer := bytes.NewBuffer(nil) archivalRequest := warc.ArchivalRequest{
tee := io.TeeReader(resp.Body, buffer) URL: book.URL,
Reader: io.TeeReader(resp.Body, buffer),
ContentType: resp.Header.Get("Content-Type"),
}
contentType := resp.Header.Get("Content-Type") err = warc.NewArchive(archivalRequest, archivePath)
err = warc.FromReader(tee, book.URL, contentType, archivePath)
if err != nil { if err != nil {
chProblem <- book.ID chProblem <- book.ID
chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err) chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err)

View file

@ -13,18 +13,40 @@ import (
"go.etcd.io/bbolt" "go.etcd.io/bbolt"
) )
// FromReader creates an archive from the specified io.Reader. // ArchivalRequest is a request for archiving a web page,
func FromReader(input io.Reader, url, contentType, dstPath string) error { // either from URL or from an io.Reader.
// An ArchivalRequest with a nil Reader or an empty ContentType makes
// NewArchive download the content itself, using the resource URL it
// derives from URL.
type ArchivalRequest struct {
// URL is the address of the page to archive. NewArchive rejects it unless
// it parses as a request URI with both a scheme and a hostname.
URL string
// Reader supplies the page content when the caller has already fetched it.
Reader io.Reader
// ContentType is the content type of the data behind Reader. A value
// containing "text/html" selects HTML processing; anything else is
// processed as a non-HTML file.
ContentType string
// LogEnabled is passed through to the archiver's LogEnabled setting.
LogEnabled bool
}
// NewArchive creates a new archive based on the submitted request,
// then saves it to the specified path.
func NewArchive(req ArchivalRequest, dstPath string) error {
// Make sure URL is valid // Make sure URL is valid
parsedURL, err := nurl.ParseRequestURI(url) parsedURL, err := nurl.ParseRequestURI(req.URL)
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" { if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
return fmt.Errorf("url %s is not valid", url) return fmt.Errorf("url %s is not valid", req.URL)
} }
// Generate resource URL // Generate resource URL
res := archiver.ToResourceURL(url, parsedURL) res := archiver.ToResourceURL(req.URL, parsedURL)
res.ArchivalURL = "archive-root" res.ArchivalURL = "archive-root"
// Download URL if needed
if req.Reader == nil || req.ContentType == "" {
resp, err := archiver.DownloadData(res.DownloadURL)
if err != nil {
return fmt.Errorf("failed to download %s: %v", req.URL, err)
}
defer resp.Body.Close()
req.Reader = resp.Body
req.ContentType = resp.Header.Get("Content-Type")
}
// Create database for archive // Create database for archive
os.MkdirAll(fp.Dir(dstPath), os.ModePerm) os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
@ -41,7 +63,7 @@ func FromReader(input io.Reader, url, contentType, dstPath string) error {
ChWarnings: make(chan error), ChWarnings: make(chan error),
ChRequest: make(chan archiver.ResourceURL, 10), ChRequest: make(chan archiver.ResourceURL, 10),
ResourceMap: make(map[string]struct{}), ResourceMap: make(map[string]struct{}),
LogEnabled: true, LogEnabled: req.LogEnabled,
} }
defer arc.Close() defer arc.Close()
@ -51,10 +73,10 @@ func FromReader(input io.Reader, url, contentType, dstPath string) error {
var result archiver.ProcessResult var result archiver.ProcessResult
var subResources []archiver.ResourceURL var subResources []archiver.ResourceURL
if strings.Contains(contentType, "text/html") { if strings.Contains(req.ContentType, "text/html") {
result, subResources, err = arc.ProcessHTMLFile(res, input) result, subResources, err = arc.ProcessHTMLFile(res, req.Reader)
} else { } else {
result, err = arc.ProcessOtherFile(res, input) result, err = arc.ProcessOtherFile(res, req.Reader)
} }
if err != nil { if err != nil {
@ -67,7 +89,7 @@ func FromReader(input io.Reader, url, contentType, dstPath string) error {
// Save content to storage // Save content to storage
arc.Logf(0, "Downloaded %s", res.DownloadURL) arc.Logf(0, "Downloaded %s", res.DownloadURL)
result.ContentType = contentType result.ContentType = req.ContentType
err = arc.SaveToStorage(result) err = arc.SaveToStorage(result)
if err != nil { if err != nil {
return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err) return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
@ -90,16 +112,3 @@ func FromReader(input io.Reader, url, contentType, dstPath string) error {
arc.StartArchiver() arc.StartArchiver()
return nil return nil
} }
// FromURL downloads the given URL and hands the response body, together
// with its Content-Type header, to FromReader so the archive is written
// to dstPath. A download failure is returned as a wrapped message; any
// other error is whatever FromReader returns.
func FromURL(url, dstPath string) error {
	// Fetch the page first: archiving needs both the body and its content type.
	resp, downloadErr := archiver.DownloadData(url)
	if downloadErr != nil {
		return fmt.Errorf("failed to download %s: %v", url, downloadErr)
	}
	// Keep the body open until FromReader has finished consuming it.
	defer resp.Body.Close()

	return FromReader(resp.Body, url, resp.Header.Get("Content-Type"), dstPath)
}