mirror of
https://github.com/go-shiori/shiori.git
synced 2024-11-15 21:55:31 +08:00
105 lines
2.7 KiB
Go
105 lines
2.7 KiB
Go
package warc
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
nurl "net/url"
|
|
"os"
|
|
fp "path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
|
|
"go.etcd.io/bbolt"
|
|
)
|
|
|
|
// FromReader create archive from the specified io.Reader.
|
|
func FromReader(input io.Reader, url, contentType, dstPath string) error {
|
|
// Make sure URL is valid
|
|
parsedURL, err := nurl.ParseRequestURI(url)
|
|
if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
|
|
return fmt.Errorf("url %s is not valid", url)
|
|
}
|
|
|
|
// Generate resource URL
|
|
res := archiver.ToResourceURL(url, parsedURL)
|
|
res.ArchivalURL = "archive-root"
|
|
|
|
// Create database for archive
|
|
os.MkdirAll(fp.Dir(dstPath), os.ModePerm)
|
|
|
|
db, err := bbolt.Open(dstPath, os.ModePerm, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create archive: %v", err)
|
|
}
|
|
|
|
// Create archiver
|
|
arc := &archiver.Archiver{
|
|
DB: db,
|
|
ChDone: make(chan struct{}),
|
|
ChErrors: make(chan error),
|
|
ChWarnings: make(chan error),
|
|
ChRequest: make(chan archiver.ResourceURL, 10),
|
|
ResourceMap: make(map[string]struct{}),
|
|
LogEnabled: true,
|
|
}
|
|
defer arc.Close()
|
|
|
|
// Process input depending on its type.
|
|
// If it's HTML, we need to extract the sub resources that used by it, e.g some CSS or JS files.
|
|
// If it's not HTML, we can just save it to archive.
|
|
var result archiver.ProcessResult
|
|
var subResources []archiver.ResourceURL
|
|
|
|
if strings.Contains(contentType, "text/html") {
|
|
result, subResources, err = arc.ProcessHTMLFile(res, input)
|
|
} else {
|
|
result, err = arc.ProcessOtherFile(res, input)
|
|
}
|
|
|
|
if err != nil {
|
|
return fmt.Errorf("archival failed: %v", err)
|
|
}
|
|
|
|
// Add this url to resource map to mark it as processed
|
|
arc.ResourceMap[res.DownloadURL] = struct{}{}
|
|
|
|
// Save content to storage
|
|
arc.Logf(0, "Downloaded %s", res.DownloadURL)
|
|
|
|
result.ContentType = contentType
|
|
err = arc.SaveToStorage(result)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
|
|
}
|
|
|
|
// If there are no sub resources found, our job is finished.
|
|
if len(subResources) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// However, if there are, we need to run the archiver in background to
|
|
// process the sub resources concurrently.
|
|
go func() {
|
|
for _, subRes := range subResources {
|
|
arc.ChRequest <- subRes
|
|
}
|
|
}()
|
|
|
|
time.Sleep(time.Second)
|
|
arc.StartArchiver()
|
|
return nil
|
|
}
|
|
|
|
// FromURL create archive from the specified URL.
|
|
func FromURL(url, dstPath string) error {
|
|
// Download URL
|
|
resp, err := archiver.DownloadData(url)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to download %s: %v", url, err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
return FromReader(resp.Body, url, contentType, dstPath)
|
|
}
|