shiori/pkg/warc/writer.go
2019-06-09 14:54:07 +07:00

105 lines
2.7 KiB
Go

package warc
import (
"fmt"
"io"
nurl "net/url"
"os"
fp "path/filepath"
"strings"
"time"
"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
"go.etcd.io/bbolt"
)
// FromReader creates an archive at dstPath from the content read from input.
//
// url identifies the content and must be an absolute URL with both a scheme
// and a host name. contentType is recorded on the saved content and also
// selects how input is processed: a value containing "text/html" is processed
// as an HTML file, whose sub resources are then queued for the archiver;
// anything else is processed as an opaque file.
//
// An error is returned when url is not valid, when the destination directory
// or the archive database cannot be created, or when processing or saving the
// root content fails. Once the root content is saved the function returns nil,
// even when sub resources were handed to the archiver.
func FromReader(input io.Reader, url, contentType, dstPath string) error {
	// Make sure URL is valid
	parsedURL, err := nurl.ParseRequestURI(url)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
		return fmt.Errorf("url %s is not valid", url)
	}

	// Generate resource URL. The root document is always stored under a
	// fixed archival name.
	res := archiver.ToResourceURL(url, parsedURL)
	res.ArchivalURL = "archive-root"

	// Create database for archive. The parent directory must exist before
	// the database file can be opened, so report a failure here instead of
	// letting it surface later as a less specific open error.
	if err = os.MkdirAll(fp.Dir(dstPath), os.ModePerm); err != nil {
		return fmt.Errorf("failed to create archive directory: %v", err)
	}

	db, err := bbolt.Open(dstPath, os.ModePerm, nil)
	if err != nil {
		return fmt.Errorf("failed to create archive: %v", err)
	}
	// NOTE(review): db is never closed in this function; presumably
	// arc.Close() releases it — confirm against the archiver package.

	// Create archiver
	arc := &archiver.Archiver{
		DB:          db,
		ChDone:      make(chan struct{}),
		ChErrors:    make(chan error),
		ChWarnings:  make(chan error),
		ChRequest:   make(chan archiver.ResourceURL, 10),
		ResourceMap: make(map[string]struct{}),
		LogEnabled:  true,
	}
	defer arc.Close()

	// Process input depending on its type.
	// If it's HTML, we need to extract the sub resources used by it,
	// e.g. some CSS or JS files.
	// If it's not HTML, we can just save it to the archive.
	var result archiver.ProcessResult
	var subResources []archiver.ResourceURL

	if strings.Contains(contentType, "text/html") {
		result, subResources, err = arc.ProcessHTMLFile(res, input)
	} else {
		result, err = arc.ProcessOtherFile(res, input)
	}

	if err != nil {
		return fmt.Errorf("archival failed: %v", err)
	}

	// Add this url to resource map to mark it as processed
	arc.ResourceMap[res.DownloadURL] = struct{}{}

	// Save content to storage. The caller-supplied content type overrides
	// whatever the processing step put in the result.
	arc.Logf(0, "Downloaded %s", res.DownloadURL)

	result.ContentType = contentType
	err = arc.SaveToStorage(result)
	if err != nil {
		return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
	}

	// If there are no sub resources found, our job is finished.
	if len(subResources) == 0 {
		return nil
	}

	// However, if there are, we need to run the archiver to process the
	// sub resources. They are fed from a separate goroutine because the
	// request channel only buffers 10 entries and sending more than that
	// would block until something receives from it.
	go func() {
		for _, subRes := range subResources {
			arc.ChRequest <- subRes
		}
	}()

	// NOTE(review): presumably this pause lets the feeder goroutine queue
	// some requests before the archiver starts — confirm against
	// StartArchiver; a fixed sleep is fragile as a synchronisation mechanism.
	time.Sleep(time.Second)
	arc.StartArchiver()

	return nil
}
// FromURL downloads the content at url and archives it to dstPath, using the
// Content-Type header of the response as the content type of the archive.
func FromURL(url, dstPath string) error {
	// Fetch the remote content; give up before touching dstPath on failure.
	response, downloadErr := archiver.DownloadData(url)
	if downloadErr != nil {
		return fmt.Errorf("failed to download %s: %v", url, downloadErr)
	}
	defer response.Body.Close()

	// Hand the body to FromReader together with the reported content type.
	return FromReader(response.Body, url, response.Header.Get("Content-Type"), dstPath)
}