package warc

import (
	"fmt"
	"io"
	nurl "net/url"
	"os"
	fp "path/filepath"
	"strings"
	"time"

	"github.com/go-shiori/shiori/pkg/warc/internal/archiver"
	"go.etcd.io/bbolt"
)

// ArchivalRequest is a request for archiving a web page,
// either from a URL or from an io.Reader.
type ArchivalRequest struct {
	// URL is the address of the page. It must parse as a request URI and
	// have both a scheme and a hostname, even when Reader is supplied.
	URL string
	// Reader optionally supplies the page content. When it is nil (or when
	// ContentType is empty) the content is downloaded instead.
	Reader io.Reader
	// ContentType is the MIME type of the data in Reader. Content whose type
	// contains "text/html" is processed as HTML; anything else is stored as is.
	ContentType string
	// LogEnabled is forwarded to the archiver's LogEnabled field.
	LogEnabled bool
}

// NewArchive creates a new archive based on the submitted request,
// then saves it to the bbolt database file at dstPath.
//
// The parent directory of dstPath is created if it does not exist yet.
// If req.Reader is nil or req.ContentType is empty, the page is downloaded
// and the content type is taken from the response's Content-Type header.
//
// It returns an error when the URL is invalid, when the download fails, when
// the destination directory or database cannot be created, or when processing
// or saving the main resource fails. Underlying errors are wrapped with %w so
// callers can inspect them with errors.Is / errors.As.
func NewArchive(req ArchivalRequest, dstPath string) error {
	// Make sure URL is valid
	parsedURL, err := nurl.ParseRequestURI(req.URL)
	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
		return fmt.Errorf("url %s is not valid", req.URL)
	}

	// Generate resource URL
	res := archiver.ToResourceURL(req.URL, parsedURL)
	res.ArchivalURL = "archive-root"

	// Download URL if needed
	if req.Reader == nil || req.ContentType == "" {
		resp, err := archiver.DownloadData(res.DownloadURL)
		if err != nil {
			return fmt.Errorf("failed to download %s: %w", req.URL, err)
		}
		defer resp.Body.Close()

		req.Reader = resp.Body
		req.ContentType = resp.Header.Get("Content-Type")
	}

	// Create database for archive. A failure to create the parent directory
	// is reported here instead of surfacing later as an opaque open error.
	if err := os.MkdirAll(fp.Dir(dstPath), os.ModePerm); err != nil {
		return fmt.Errorf("failed to create archive dir: %w", err)
	}

	db, err := bbolt.Open(dstPath, os.ModePerm, nil)
	if err != nil {
		return fmt.Errorf("failed to create archive: %w", err)
	}
	defer db.Close()

	// Create archiver
	arc := &archiver.Archiver{
		DB:          db,
		ChDone:      make(chan struct{}),
		ChErrors:    make(chan error),
		ChWarnings:  make(chan error),
		ChRequest:   make(chan archiver.ResourceURL, 10),
		ResourceMap: make(map[string]struct{}),
		LogEnabled:  req.LogEnabled,
	}

	// TODO: investigate whether the channels must be closed or not.
	// At first, I thought the channels must be closed. Unfortunately, it leads
	// to a panic when an error message is accidentally sent after the error
	// channels are closed.
	// defer arc.Close()

	// Process input depending on its type.
	// If it's HTML, we need to extract the sub resources that are used by it,
	// e.g. some CSS or JS files.
	// If it's not HTML, we can just save it to the archive.
	var result archiver.ProcessResult
	var subResources []archiver.ResourceURL

	if strings.Contains(req.ContentType, "text/html") {
		result, subResources, err = arc.ProcessHTMLFile(res, req.Reader)
	} else {
		result, err = arc.ProcessOtherFile(res, req.Reader)
	}

	if err != nil {
		return fmt.Errorf("archival failed: %w", err)
	}

	// Add this url to resource map to mark it as processed
	arc.ResourceMap[res.DownloadURL] = struct{}{}

	// Save content to storage
	arc.Logf(0, "Downloaded %s", res.DownloadURL)

	result.ContentType = req.ContentType
	if err = arc.SaveToStorage(result); err != nil {
		return fmt.Errorf("failed to save %s: %w", res.DownloadURL, err)
	}

	// If there are no sub resources found, our job is finished.
	if len(subResources) == 0 {
		return nil
	}

	// However, if there are, we need to run the archiver to process the
	// sub resources concurrently. The requests are fed from a separate
	// goroutine because ChRequest only buffers 10 items.
	// NOTE(review): if StartArchiver can return before every request has been
	// received, this goroutine stays blocked on the send — confirm.
	go func() {
		for _, subRes := range subResources {
			arc.ChRequest <- subRes
		}
	}()

	// NOTE(review): presumably this pause lets the feeder goroutine queue some
	// requests before the archiver starts — confirm against StartArchiver.
	time.Sleep(time.Second)
	arc.StartArchiver()

	return nil
}