diff --git a/go.mod b/go.mod
index 7c09076b..31fb1a9a 100644
--- a/go.mod
+++ b/go.mod
@@ -20,9 +20,11 @@ require (
 	github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd
 	github.com/sirupsen/logrus v1.4.2
 	github.com/spf13/cobra v0.0.4
+	github.com/tdewolff/parse/v2 v2.3.7
+	go.etcd.io/bbolt v1.3.2
 	golang.org/x/crypto v0.0.0-20190513172903-22d7a77e9e5f
 	golang.org/x/image v0.0.0-20190523035834-f03afa92d3ff // indirect
-	golang.org/x/net v0.0.0-20190522155817-f3200d17e092 // indirect
+	golang.org/x/net v0.0.0-20190522155817-f3200d17e092
 	golang.org/x/sys v0.0.0-20190526052359-791d8a0f4d09 // indirect
 	golang.org/x/tools v0.0.0-20190525145741-7be61e1b0e51 // indirect
 	google.golang.org/appengine v1.6.0 // indirect
diff --git a/go.sum b/go.sum
index 9eefbb6c..4407900c 100644
--- a/go.sum
+++ b/go.sum
@@ -80,8 +80,14 @@ github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/tdewolff/parse/v2 v2.3.7 h1:DXoTUgrUE2Eap0m7zg1ljCO5C78vhEi7HTc4YnJWrRk=
+github.com/tdewolff/parse/v2 v2.3.7/go.mod h1:HansaqmN4I/U7L6/tUp0NcwT2tFO0F4EAWYGSDzkYNk=
+github.com/tdewolff/test v1.0.0 h1:jOwzqCXr5ePXEPGJaq2ivoR6HOCi+D5TPfpoyg8yvmU=
+github.com/tdewolff/test v1.0.0/go.mod h1:DiQUlutnqlEvdvhSn2LPGy4TFwRauAaYDsL+683RNX4=
 github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
 github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
+go.etcd.io/bbolt v1.3.2 h1:Z/90sZLPOeCy2PwprqkFa25PdkusRzaj9P8zm/KNyvk=
+go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU=
 golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 h1:VklqNMn3ovrHsnt90PveolxSbWFaJdECFbxSq0Mqo2M=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
diff --git a/internal/cmd/add.go b/internal/cmd/add.go
index 42d7cc7b..a81d102f 100644
--- a/internal/cmd/add.go
+++ b/internal/cmd/add.go
@@ -1,12 +1,17 @@
 package cmd
 
 import (
+	"bytes"
 	"fmt"
+	"io"
+	"net/http"
 	nurl "net/url"
 	fp "path/filepath"
 	"strings"
 	"time"
 
+	"github.com/go-shiori/shiori/pkg/warc"
+
 	"github.com/go-shiori/go-readability"
 	"github.com/go-shiori/shiori/internal/model"
 	"github.com/spf13/cobra"
@@ -73,14 +78,36 @@ func addHandler(cmd *cobra.Command, args []string) {
 	func() {
 		cInfo.Println("Downloading article...")
 
-		resp, err := httpClient.Get(url)
+		// Prepare request
+		req, err := http.NewRequest("GET", url, nil)
+		if err != nil {
+			cError.Printf("Failed to download article: %v\n", err)
+			return
+		}
+
+		// Send request
+		req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
+		resp, err := httpClient.Do(req)
 		if err != nil {
 			cError.Printf("Failed to download article: %v\n", err)
 			return
 		}
 		defer resp.Body.Close()
 
-		article, err := readability.FromReader(resp.Body, url)
+		// Save as archive
+		buffer := bytes.NewBuffer(nil)
+		tee := io.TeeReader(resp.Body, buffer)
+
+		contentType := resp.Header.Get("Content-Type")
+		archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
+		err = warc.FromReader(tee, url, contentType, archivePath)
+		if err != nil {
+			cError.Printf("Failed to create archive: %v\n", err)
+			return
+		}
+
+		// Parse article
+		article, err := readability.FromReader(buffer, url)
 		if err != nil {
 			cError.Printf("Failed to parse article: %v\n", err)
 			return
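The add flow above deliberately downloads each page only once: io.TeeReader splits the response body so the archiver and the readability parser consume the same bytes. A minimal, self-contained sketch of that pattern, with processArchive and parseArticle as hypothetical stand-ins for warc.FromReader and readability.FromReader:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"os"
)

func processArchive(r io.Reader) error {
	// Stand-in for warc.FromReader: drain everything it is given.
	_, err := io.Copy(io.Discard, r)
	return err
}

func parseArticle(r io.Reader) error {
	// Stand-in for readability.FromReader.
	_, err := io.Copy(io.Discard, r)
	return err
}

func saveAndParse(url string) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Everything the archiver reads from tee is also written to buffer,
	// so the parser can consume the same bytes without a second download.
	buffer := bytes.NewBuffer(nil)
	tee := io.TeeReader(resp.Body, buffer)

	if err := processArchive(tee); err != nil {
		return err
	}
	return parseArticle(buffer)
}

func main() {
	if err := saveAndParse("https://example.com"); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```

One subtlety of TeeReader: it only copies what the first consumer reads, so the parser gets a complete document only if the archiver drains the whole body.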
diff --git a/internal/cmd/delete.go b/internal/cmd/delete.go
index cd445506..838ff7d7 100644
--- a/internal/cmd/delete.go
+++ b/internal/cmd/delete.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	fp "path/filepath"
+	"strconv"
 	"strings"
 
 	"github.com/spf13/cobra"
@@ -57,18 +58,20 @@ func deleteHandler(cmd *cobra.Command, args []string) {
 		return
 	}
 
-	// Delete thumbnail image from local disk
+	// Delete thumbnail image and archives from local disk
 	if len(ids) == 0 {
 		thumbDir := fp.Join(DataDir, "thumb")
+		archiveDir := fp.Join(DataDir, "archive")
 		os.RemoveAll(thumbDir)
+		os.RemoveAll(archiveDir)
 	} else {
 		for _, id := range ids {
-			imgPath := fp.Join(DataDir, "thumb", fmt.Sprintf("%d.*", id))
-			matchedFiles, _ := fp.Glob(imgPath)
+			strID := strconv.Itoa(id)
+			imgPath := fp.Join(DataDir, "thumb", strID)
+			archivePath := fp.Join(DataDir, "archive", strID)
 
-			for _, f := range matchedFiles {
-				os.Remove(f)
-			}
+			os.Remove(imgPath)
+			os.Remove(archivePath)
 		}
 	}
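With archives stored under the bookmark's plain numeric ID, per-bookmark cleanup no longer needs filepath.Glob — delete.go now removes two well-known paths directly. A small sketch of that pattern (data directory and ID are illustrative):

```go
package main

import (
	"fmt"
	"os"
	fp "path/filepath"
	"strconv"
)

// removeBookmarkFiles mirrors the per-ID branch of deleteHandler: both the
// thumbnail and the archive live under the bookmark's numeric ID.
func removeBookmarkFiles(dataDir string, id int) {
	strID := strconv.Itoa(id)
	for _, dir := range []string{"thumb", "archive"} {
		path := fp.Join(dataDir, dir, strID)
		if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
			fmt.Printf("failed to remove %s: %v\n", path, err)
		}
	}
}

func main() {
	removeBookmarkFiles("/tmp/shiori-data", 42) // hypothetical data dir and ID
}
```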
diff --git a/internal/cmd/open.go b/internal/cmd/open.go
index e390591e..43a01932 100644
--- a/internal/cmd/open.go
+++ b/internal/cmd/open.go
@@ -2,9 +2,15 @@ package cmd
 
 import (
 	"fmt"
+	"net"
+	"net/http"
+	fp "path/filepath"
+	"strconv"
 	"strings"
 
 	"github.com/go-shiori/shiori/internal/database"
+	"github.com/go-shiori/shiori/pkg/warc"
+	"github.com/julienschmidt/httprouter"
 	"github.com/spf13/cobra"
 )
@@ -20,6 +26,7 @@ func openCmd() *cobra.Command {
 	}
 
 	cmd.Flags().BoolP("yes", "y", false, "Skip confirmation prompt and open ALL bookmarks")
+	cmd.Flags().BoolP("archive", "a", false, "Open the bookmark's archived content")
 	cmd.Flags().BoolP("text-cache", "t", false, "Open the bookmark's text cache in terminal")
 
 	return cmd
@@ -28,8 +35,22 @@ func openHandler(cmd *cobra.Command, args []string) {
 	// Parse flags
 	skipConfirm, _ := cmd.Flags().GetBool("yes")
+	archiveMode, _ := cmd.Flags().GetBool("archive")
 	textCacheMode, _ := cmd.Flags().GetBool("text-cache")
 
+	// Convert args to ids
+	ids, err := parseStrIndices(args)
+	if err != nil {
+		cError.Println(err)
+		return
+	}
+
+	// If in archive mode, only one bookmark allowed
+	if len(ids) > 1 && archiveMode {
+		cError.Println("In archive mode, only one bookmark allowed")
+		return
+	}
+
 	// If no arguments (i.e all bookmarks will be opened),
 	// confirm to user
 	if len(args) == 0 && !skipConfirm {
@@ -42,13 +63,6 @@ func openHandler(cmd *cobra.Command, args []string) {
 		}
 	}
 
-	// Convert args to ids
-	ids, err := parseStrIndices(args)
-	if err != nil {
-		cError.Println(err)
-		return
-	}
-
 	// Read bookmarks from database
 	getOptions := database.GetBookmarksOptions{
 		IDs:         ids,
@@ -62,17 +76,16 @@
 	}
 
 	if len(bookmarks) == 0 {
-		switch {
-		case len(ids) > 0:
+		if len(ids) > 0 {
 			cError.Println("No matching index found")
-		default:
+		} else {
 			cError.Println("No bookmarks saved yet")
 		}
 		return
 	}
 
-	// If not text cache mode, open bookmarks in browser
-	if !textCacheMode {
+	// If not text cache mode nor archive mode, open bookmarks in browser
+	if !textCacheMode && !archiveMode {
 		for _, book := range bookmarks {
 			err = openBrowser(book.URL)
 			if err != nil {
@@ -83,22 +96,74 @@
 	}
 
 	// Show bookmarks content in terminal
-	termWidth := getTerminalWidth()
+	if textCacheMode {
+		termWidth := getTerminalWidth()
 
-	for _, book := range bookmarks {
-		cIndex.Printf("%d. ", book.ID)
-		cTitle.Println(book.Title)
-		fmt.Println()
+		for _, book := range bookmarks {
+			cIndex.Printf("%d. ", book.ID)
+			cTitle.Println(book.Title)
+			fmt.Println()
 
-		if book.Content == "" {
-			cError.Println("This bookmark doesn't have any cached content")
-		} else {
-			book.Content = strings.Join(strings.Fields(book.Content), " ")
-			fmt.Println(book.Content)
+			if book.Content == "" {
+				cError.Println("This bookmark doesn't have any cached content")
+			} else {
+				book.Content = strings.Join(strings.Fields(book.Content), " ")
+				fmt.Println(book.Content)
+			}
+
+			fmt.Println()
+			cSymbol.Println(strings.Repeat("=", termWidth))
+			fmt.Println()
+		}
+	}
+
+	// Open archive
+	id := strconv.Itoa(bookmarks[0].ID)
+	archivePath := fp.Join(DataDir, "archive", id)
+
+	archive, err := warc.Open(archivePath)
+	if err != nil {
+		cError.Printf("Failed to open archive: %v\n", err)
+		return
+	}
+	defer archive.Close()
+
+	// Create simple server
+	router := httprouter.New()
+	router.GET("/*filename", func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
+		filename := ps.ByName("filename")
+		resourceName := fp.Base(filename)
+		if resourceName == "/" {
+			resourceName = ""
 		}
 
-		fmt.Println()
-		cSymbol.Println(strings.Repeat("=", termWidth))
-		fmt.Println()
+		content, contentType, err := archive.Read(resourceName)
+		if err != nil {
+			panic(err)
+		}
+
+		w.Header().Set("Content-Type", contentType)
+		if _, err = w.Write(content); err != nil {
+			panic(err)
+		}
+	})
+
+	router.PanicHandler = func(w http.ResponseWriter, r *http.Request, arg interface{}) {
+		http.Error(w, fmt.Sprint(arg), 500)
+	}
+
+	// Choose random port
+	listener, err := net.Listen("tcp", ":0")
+	if err != nil {
+		cError.Printf("Failed to serve archive: %v\n", err)
+		return
+	}
+
+	portNumber := listener.Addr().(*net.TCPAddr).Port
+	cInfo.Printf("Archive served in http://localhost:%d\n", portNumber)
+
+	err = http.Serve(listener, router)
+	if err != nil {
+		cError.Printf("Failed to serve archive: %v\n", err)
 	}
 }
diff --git a/internal/cmd/root.go b/internal/cmd/root.go
index 61b945d5..97ea13b6 100644
--- a/internal/cmd/root.go
+++ b/internal/cmd/root.go
@@ -1,7 +1,9 @@
 package cmd
 
 import (
+	"crypto/tls"
 	"net/http"
+	"net/http/cookiejar"
 	"time"
 
 	"github.com/go-shiori/shiori/internal/database"
@@ -15,9 +17,22 @@ var (
 	// DataDir is directory for downloaded data
 	DataDir string
 
-	httpClient = &http.Client{Timeout: time.Minute}
+	httpClient *http.Client
 )
 
+func init() {
+	jar, _ := cookiejar.New(nil)
+	httpClient = &http.Client{
+		Timeout: time.Minute,
+		Transport: &http.Transport{
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: true,
+			},
+		},
+		Jar: jar,
+	}
+}
+
 // ShioriCmd returns the root command for shiori
 func ShioriCmd() *cobra.Command {
 	rootCmd := &cobra.Command{
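open.go serves the archive over HTTP on an ephemeral port: net.Listen("tcp", ":0") lets the OS pick any free port, and the bound port is read back from the listener before http.Serve blocks. A stripped-down sketch of that pattern, using the standard library mux in place of httprouter for brevity:

```go
package main

import (
	"fmt"
	"net"
	"net/http"
)

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "requested %s\n", r.URL.Path)
	})

	// Port 0 asks the OS for any free port; the listener then reports
	// which one was actually bound.
	listener, err := net.Listen("tcp", ":0")
	if err != nil {
		panic(err)
	}

	port := listener.Addr().(*net.TCPAddr).Port
	fmt.Printf("Serving at http://localhost:%d\n", port)

	// http.Serve blocks until the listener fails or is closed.
	if err := http.Serve(listener, mux); err != nil {
		panic(err)
	}
}
```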
diff --git a/internal/cmd/update.go b/internal/cmd/update.go
index 5f64eddf..1dff291f 100644
--- a/internal/cmd/update.go
+++ b/internal/cmd/update.go
@@ -1,7 +1,10 @@
 package cmd
 
 import (
+	"bytes"
 	"fmt"
+	"io"
+	"net/http"
 	nurl "net/url"
 	fp "path/filepath"
 	"sort"
@@ -12,6 +15,7 @@ import (
 	"github.com/go-shiori/go-readability"
 	"github.com/go-shiori/shiori/internal/database"
 	"github.com/go-shiori/shiori/internal/model"
+	"github.com/go-shiori/shiori/pkg/warc"
 	"github.com/spf13/cobra"
 )
@@ -139,8 +143,17 @@ func updateHandler(cmd *cobra.Command, args []string) {
 				<-semaphore
 			}()
 
-			// Download article
-			resp, err := httpClient.Get(book.URL)
+			// Prepare request
+			req, err := http.NewRequest("GET", book.URL, nil)
+			if err != nil {
+				chProblem <- book.ID
+				chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
+				return
+			}
+
+			// Send request
+			req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
+			resp, err := httpClient.Do(req)
 			if err != nil {
 				chProblem <- book.ID
 				chMessage <- fmt.Errorf("Failed to download %s: %v", book.URL, err)
@@ -148,7 +161,21 @@
 			}
 			defer resp.Body.Close()
 
-			article, err := readability.FromReader(resp.Body, book.URL)
+			// Save as archive
+			buffer := bytes.NewBuffer(nil)
+			tee := io.TeeReader(resp.Body, buffer)
+
+			contentType := resp.Header.Get("Content-Type")
+			archivePath := fp.Join(DataDir, "archive", fmt.Sprintf("%d", book.ID))
+			err = warc.FromReader(tee, book.URL, contentType, archivePath)
+			if err != nil {
+				chProblem <- book.ID
+				chMessage <- fmt.Errorf("Failed to create archive %s: %v", book.URL, err)
+				return
+			}
+
+			// Parse article
+			article, err := readability.FromReader(buffer, book.URL)
 			if err != nil {
 				chProblem <- book.ID
 				chMessage <- fmt.Errorf("Failed to parse %s: %v", book.URL, err)
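updateHandler runs its downloads concurrently, bounded by a buffered channel used as a semaphore, and reports failures over dedicated channels (chProblem, chMessage). A self-contained model of that shape — the job list and the failure condition here are invented:

```go
package main

import (
	"fmt"
	"strings"
	"sync"
)

// download is a stub for the real fetch+archive+parse work.
func download(url string) error {
	if strings.HasSuffix(url, "c.example") {
		return fmt.Errorf("simulated failure")
	}
	return nil
}

func main() {
	jobs := []string{"https://a.example", "https://b.example", "https://c.example"}

	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 2)       // at most 2 jobs in flight
	chProblem := make(chan string, len(jobs)) // collects failed URLs

	for _, url := range jobs {
		wg.Add(1)
		semaphore <- struct{}{} // acquire a slot

		go func(url string) {
			defer func() {
				wg.Done()
				<-semaphore // release the slot
			}()

			if err := download(url); err != nil {
				chProblem <- url
			}
		}(url)
	}

	wg.Wait()
	close(chProblem)

	for url := range chProblem {
		fmt.Println("failed:", url)
	}
}
```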
diff --git a/pkg/warc/internal/archiver/archiver.go b/pkg/warc/internal/archiver/archiver.go
new file mode 100644
index 00000000..cdfba0b3
--- /dev/null
+++ b/pkg/warc/internal/archiver/archiver.go
@@ -0,0 +1,173 @@
+package archiver
+
+import (
+	"fmt"
+	"strings"
+	"sync"
+	"time"
+
+	"go.etcd.io/bbolt"
+)
+
+// Archiver is a struct for archiving a URL and its resources.
+type Archiver struct {
+	sync.RWMutex
+	sync.WaitGroup
+
+	DB          *bbolt.DB
+	ChDone      chan struct{}
+	ChErrors    chan error
+	ChWarnings  chan error
+	ChRequest   chan ResourceURL
+	ResourceMap map[string]struct{}
+	LogEnabled  bool
+}
+
+// Close closes the channels used by the Archiver.
+func (arc *Archiver) Close() {
+	close(arc.ChErrors)
+	close(arc.ChWarnings)
+	close(arc.ChRequest)
+}
+
+// StartArchiver starts the archival process.
+func (arc *Archiver) StartArchiver() []error {
+	go func() {
+		time.Sleep(time.Second)
+		arc.Wait()
+		close(arc.ChDone)
+	}()
+
+	// Download the URLs concurrently. After a download finishes, the
+	// response is parsed to extract the resource URLs inside it, which
+	// are then sent back to the channel to be downloaded as well.
+	errors := make([]error, 0)
+	warnings := make([]error, 0)
+
+	func() {
+		for {
+			select {
+			case <-arc.ChDone:
+				return
+			case err := <-arc.ChErrors:
+				errors = append(errors, err)
+			case err := <-arc.ChWarnings:
+				warnings = append(warnings, err)
+			case res := <-arc.ChRequest:
+				arc.RLock()
+				_, exist := arc.ResourceMap[res.DownloadURL]
+				arc.RUnlock()
+
+				if !exist {
+					arc.Add(1)
+					go arc.archive(res)
+				}
+			}
+		}
+	}()
+
+	// Print log message if required
+	if arc.LogEnabled {
+		nErrors := len(errors)
+		nWarnings := len(warnings)
+		arc.Logf(infoLog, "Download finished with %d warnings and %d errors\n", nWarnings, nErrors)
+
+		if nWarnings > 0 {
+			fmt.Println()
+			for _, warning := range warnings {
+				arc.Log(warningLog, warning)
+			}
+		}
+
+		if nErrors > 0 {
+			for _, err := range errors {
+				arc.Log(errorLog, err)
+			}
+		}
+	}
+
+	return nil
+}
+
+// archive downloads a subresource and saves it to storage.
+func (arc *Archiver) archive(res ResourceURL) {
+	// Make sure to decrease wait group once finished
+	defer arc.Done()
+
+	// Download resource
+	resp, err := DownloadData(res.DownloadURL)
+	if err != nil {
+		arc.ChErrors <- fmt.Errorf("failed to download %s: %v", res.DownloadURL, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	// Process the resource depending on its type. Since this `archive`
+	// method is only used for subresources, only CSS files get special
+	// processing; any other file is simply downloaded as it is.
+	var result ProcessResult
+	var subResources []ResourceURL
+	cType := resp.Header.Get("Content-Type")
+
+	switch {
+	case strings.Contains(cType, "text/css"):
+		result, subResources, err = arc.ProcessCSSFile(res, resp.Body)
+	default:
+		result, err = arc.ProcessOtherFile(res, resp.Body)
+	}
+
+	if err != nil {
+		arc.ChErrors <- fmt.Errorf("failed to process %s: %v", res.DownloadURL, err)
+		return
+	}
+
+	// Add this url to resource map
+	arc.Lock()
+	arc.ResourceMap[res.DownloadURL] = struct{}{}
+	arc.Unlock()
+
+	// Save content to storage
+	arc.Logf(infoLog, "Downloaded %s, parent %s", res.DownloadURL, res.Parent)
+
+	result.ContentType = cType
+	err = arc.SaveToStorage(result)
+	if err != nil {
+		arc.ChErrors <- fmt.Errorf("failed to save %s: %v", res.DownloadURL, err)
+		return
+	}
+
+	// Send sub resources to the request channel
+	for _, subRes := range subResources {
+		arc.ChRequest <- subRes
+	}
+}
+
+// SaveToStorage saves the processing result to storage.
+func (arc *Archiver) SaveToStorage(result ProcessResult) error {
+	err := arc.DB.Batch(func(tx *bbolt.Tx) error {
+		bucket := tx.Bucket([]byte(result.Name))
+		if bucket != nil {
+			return nil
+		}
+
+		bucket, err := tx.CreateBucketIfNotExists([]byte(result.Name))
+		if err != nil {
+			return err
+		}
+
+		err = bucket.Put([]byte("content"), result.Content)
+		if err != nil {
+			return err
+		}
+
+		err = bucket.Put([]byte("type"), []byte(result.ContentType))
+		if err != nil {
+			return err
+		}
+
+		return nil
+	})
+
+	return err
+}
diff --git a/pkg/warc/internal/archiver/http-client.go b/pkg/warc/internal/archiver/http-client.go
new file mode 100644
index 00000000..eb9c6b65
--- /dev/null
+++ b/pkg/warc/internal/archiver/http-client.go
@@ -0,0 +1,38 @@
+package archiver
+
+import (
+	"crypto/tls"
+	"net/http"
+	"net/http/cookiejar"
+	"time"
+)
+
+var (
+	defaultClient *http.Client
+)
+
+func init() {
+	jar, _ := cookiejar.New(nil)
+	defaultClient = &http.Client{
+		Timeout: time.Minute,
+		Transport: &http.Transport{
+			TLSClientConfig: &tls.Config{
+				InsecureSkipVerify: true,
+			},
+		},
+		Jar: jar,
+	}
+}
+
+// DownloadData downloads data from the specified URL.
+func DownloadData(url string) (*http.Response, error) {
+	// Prepare request
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// Send request
+	req.Header.Set("User-Agent", "Shiori/2.0.0 (+https://github.com/go-shiori/shiori)")
+	return defaultClient.Do(req)
+}
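SaveToStorage above gives each resource its own bbolt bucket holding a "content" and a "type" key. Reading an archived resource back is presumably the mirror image (an assumption about what warc.Open/Read do internally); a hedged sketch, with an illustrative path and the empty name following open.go's mapping of "/" to the root document:

```go
package main

import (
	"fmt"

	"go.etcd.io/bbolt"
)

func readResource(dbPath, name string) (content []byte, contentType string, err error) {
	db, err := bbolt.Open(dbPath, 0600, nil)
	if err != nil {
		return nil, "", err
	}
	defer db.Close()

	err = db.View(func(tx *bbolt.Tx) error {
		bucket := tx.Bucket([]byte(name))
		if bucket == nil {
			return fmt.Errorf("resource %q not found", name)
		}

		// Copy the values out: slices returned by bbolt are only
		// valid while the transaction is open.
		content = append([]byte(nil), bucket.Get([]byte("content"))...)
		contentType = string(bucket.Get([]byte("type")))
		return nil
	})
	return content, contentType, err
}

func main() {
	// Path and resource name are illustrative.
	content, cType, err := readResource("/tmp/shiori-data/archive/1", "")
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("%s, %d bytes\n", cType, len(content))
}
```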
diff --git a/pkg/warc/internal/archiver/log.go b/pkg/warc/internal/archiver/log.go
new file mode 100644
index 00000000..ef1039b4
--- /dev/null
+++ b/pkg/warc/internal/archiver/log.go
@@ -0,0 +1,43 @@
+package archiver
+
+import "github.com/sirupsen/logrus"
+
+type logType int
+
+const (
+	infoLog logType = iota
+	errorLog
+	warningLog
+)
+
+// Log prints the log, ended with a newline.
+func (arc *Archiver) Log(tp logType, msgs ...interface{}) {
+	if !arc.LogEnabled {
+		return
+	}
+
+	switch tp {
+	case errorLog:
+		logrus.Errorln(msgs...)
+	case warningLog:
+		logrus.Warnln(msgs...)
+	default:
+		logrus.Infoln(msgs...)
+	}
+}
+
+// Logf prints the log with the specified format.
+func (arc *Archiver) Logf(tp logType, format string, msgs ...interface{}) {
+	if !arc.LogEnabled {
+		return
+	}
+
+	switch tp {
+	case errorLog:
+		logrus.Errorf(format, msgs...)
+	case warningLog:
+		logrus.Warnf(format, msgs...)
+	default:
+		logrus.Infof(format, msgs...)
+	}
+}
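The processor below leans on DOM helpers such as getElementsByTagName, getAttribute and tagName that live elsewhere in the package and are not part of this diff. A minimal sketch of what that kind of traversal over golang.org/x/net/html nodes might look like (helper names here are hypothetical):

```go
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

// walk visits every element node in depth-first order.
func walk(n *html.Node, visit func(*html.Node)) {
	if n.Type == html.ElementNode {
		visit(n)
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		walk(child, visit)
	}
}

// getAttribute returns the value of the named attribute, or "".
func getAttribute(n *html.Node, key string) string {
	for _, attr := range n.Attr {
		if attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

func main() {
	page := `<html><body><img src="a.png"><link href="style.css"></body></html>`
	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		panic(err)
	}

	walk(doc, func(n *html.Node) {
		switch n.Data {
		case "img":
			fmt.Println("image:", getAttribute(n, "src"))
		case "link":
			fmt.Println("link:", getAttribute(n, "href"))
		}
	})
}
```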
diff --git a/pkg/warc/internal/archiver/processor.go b/pkg/warc/internal/archiver/processor.go
new file mode 100644
index 00000000..96fa0324
--- /dev/null
+++ b/pkg/warc/internal/archiver/processor.go
@@ -0,0 +1,468 @@
+package archiver
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	nurl "net/url"
+	"regexp"
+	"strings"
+
+	"github.com/tdewolff/parse/v2/css"
+	"github.com/tdewolff/parse/v2/js"
+	"golang.org/x/net/html"
+)
+
+// ProcessResult is the result from content processing.
+type ProcessResult struct {
+	Name        string
+	ContentType string
+	Content     []byte
+}
+
+var (
+	rxImageMeta       = regexp.MustCompile(`(?i)image|thumbnail`)
+	rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
+	rxLazyImageSrc    = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
+)
+
+// ProcessHTMLFile processes an HTML file submitted through the io.Reader.
+func (arc *Archiver) ProcessHTMLFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
+	// Parse HTML document
+	doc, err := html.Parse(input)
+	if err != nil {
+		return ProcessResult{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", res.DownloadURL, err)
+	}
+
+	// Parse URL
+	parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
+	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
+		return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
+	}
+
+	// Convert lazy loaded images to normal
+	fixLazyImages(doc)
+
+	// Convert hyperlinks with relative URLs
+	fixRelativeURIs(doc, parsedURL)
+
+	// Extract resources from each node
+	for _, node := range getElementsByTagName(doc, "*") {
+		// First, extract resources from the inline style
+		cssResources := extractInlineCSS(node, parsedURL)
+		resources = append(resources, cssResources...)
+
+		// Next, extract resources from the tag's specific attribute
+		nodeResources := []ResourceURL{}
+		switch tagName(node) {
+		case "style":
+			nodeResources = extractStyleTag(node, parsedURL)
+		case "script":
+			nodeResources = extractScriptTag(node, parsedURL)
+		case "meta":
+			nodeResources = extractMetaTag(node, parsedURL)
+		case "img", "picture", "figure", "video", "audio", "source":
+			nodeResources = extractMediaTag(node, parsedURL)
+		case "link":
+			nodeResources = extractGenericTag(node, "href", parsedURL)
+		case "iframe":
+			nodeResources = extractGenericTag(node, "src", parsedURL)
+		case "object":
+			nodeResources = extractGenericTag(node, "data", parsedURL)
+		default:
+			continue
+		}
+		resources = append(resources, nodeResources...)
+	}
+
+	// Get the outer HTML of the doc
+	result = ProcessResult{
+		Name:    res.ArchivalURL,
+		Content: outerHTML(doc),
+	}
+
+	return result, resources, nil
+}
+
+// ProcessCSSFile processes a CSS file submitted through the io.Reader.
+func (arc *Archiver) ProcessCSSFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) {
+	// Parse URL
+	parsedURL, err := nurl.ParseRequestURI(res.DownloadURL)
+	if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" {
+		return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL)
+	}
+
+	// Extract CSS rules
+	rules, resources := processCSS(input, parsedURL)
+
+	result = ProcessResult{
+		Name:    res.ArchivalURL,
+		Content: []byte(rules),
+	}
+
+	return result, resources, nil
+}
+
+// ProcessOtherFile processes files that are not HTML, JS or CSS, submitted through the io.Reader.
+func (arc *Archiver) ProcessOtherFile(res ResourceURL, input io.Reader) (result ProcessResult, err error) {
+	// Copy data to buffer
+	buffer := bytes.NewBuffer(nil)
+
+	_, err = io.Copy(buffer, input)
+	if err != nil {
+		return ProcessResult{}, fmt.Errorf("failed to copy data: %v", err)
+	}
+
+	// Create result
+	result = ProcessResult{
+		Name:    res.ArchivalURL,
+		Content: buffer.Bytes(),
+	}
+
+	return result, nil
+}
+
+// fixRelativeURIs converts each hyperlink in the given element
+// to an absolute URI, ignoring #ref URIs.
+func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) {
+	links := getAllNodesWithTag(doc, "a")
+	forEachNode(links, func(link *html.Node, _ int) {
+		href := getAttribute(link, "href")
+		if href == "" {
+			return
+		}
+
+		// Replace links with javascript: URIs with text content,
+		// since they won't work after scripts have been removed
+		// from the page.
+		if strings.HasPrefix(href, "javascript:") {
+			text := createTextNode(textContent(link))
+			replaceNode(link, text)
+		} else {
+			newHref := toAbsoluteURI(href, pageURL)
+			if newHref == "" {
+				removeAttribute(link, "href")
+			} else {
+				setAttribute(link, "href", newHref)
+			}
+		}
+	})
+}
+
+// fixLazyImages converts images and figures that have properties like data-src into
+// images that can be loaded without JS.
+func fixLazyImages(root *html.Node) {
+	imageNodes := getAllNodesWithTag(root, "img", "picture", "figure")
+	forEachNode(imageNodes, func(elem *html.Node, _ int) {
+		src := getAttribute(elem, "src")
+		srcset := getAttribute(elem, "srcset")
+		nodeTag := tagName(elem)
+		nodeClass := className(elem)
+
+		if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") {
+			for i := 0; i < len(elem.Attr); i++ {
+				attr := elem.Attr[i]
+				if attr.Key == "src" || attr.Key == "srcset" {
+					continue
+				}
+
+				copyTo := ""
+				if rxLazyImageSrcset.MatchString(attr.Val) {
+					copyTo = "srcset"
+				} else if rxLazyImageSrc.MatchString(attr.Val) {
+					copyTo = "src"
+				}
+
+				if copyTo == "" {
+					continue
+				}
+
+				if nodeTag == "img" || nodeTag == "picture" {
+					// if this is an img or picture, set the attribute directly
+					setAttribute(elem, copyTo, attr.Val)
+				} else if nodeTag == "figure" && len(getAllNodesWithTag(elem, "img", "picture")) == 0 {
+					// if the item is a <figure> that does not contain an image or picture,
+					// create one and place it inside the figure; see the nytimes-3
+					// testcase for an example
+					img := createElement("img")
+					setAttribute(img, copyTo, attr.Val)
+					appendChild(elem, img)
+				}
+			}
+		}
+	})
+}
+
+// extractInlineCSS extracts the archive's resources from the CSS rules inside
+// a style attribute. Once finished, all CSS URLs in the style attribute
+// will be updated to use the archival URL.
+func extractInlineCSS(node *html.Node, pageURL *nurl.URL) []ResourceURL {
+	// Make sure this node has inline style
+	styleAttr := getAttribute(node, "style")
+	if styleAttr == "" {
+		return nil
+	}
+
+	// Extract resource URLs from the inline style
+	// and update the CSS rules accordingly.
+	reader := strings.NewReader(styleAttr)
+	newStyleAttr, resources := processCSS(reader, pageURL)
+	setAttribute(node, "style", newStyleAttr)
+
+	return resources
+}
+
+// extractStyleTag extracts the archive's resources from inside a <style> tag.
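Taken together, the new pkg/warc API is used by the commands above as a simple round trip: warc.FromReader writes the archive, and warc.Open plus Read serve it back. A sketch of that flow with an illustrative URL and path:

```go
package main

import (
	"fmt"
	"net/http"

	"github.com/go-shiori/shiori/pkg/warc"
)

func main() {
	resp, err := http.Get("https://example.com/")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Write the archive, exactly as add.go and update.go do.
	archivePath := "/tmp/example-archive" // hypothetical location
	contentType := resp.Header.Get("Content-Type")
	if err := warc.FromReader(resp.Body, "https://example.com/", contentType, archivePath); err != nil {
		panic(err)
	}

	// Read it back, as open.go does when serving the archive.
	archive, err := warc.Open(archivePath)
	if err != nil {
		panic(err)
	}
	defer archive.Close()

	// The root document is stored under the empty name, matching how
	// open.go's handler maps the "/" route to "".
	content, cType, err := archive.Read("")
	if err != nil {
		panic(err)
	}
	fmt.Printf("archived root: %s, %d bytes\n", cType, len(content))
}
```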