package archiver import ( "bytes" "fmt" "io" nurl "net/url" "regexp" "strings" "github.com/tdewolff/parse/v2/css" "github.com/tdewolff/parse/v2/js" "golang.org/x/net/html" ) // ProcessResult is the result from content processing. type ProcessResult struct { Name string ContentType string Content []byte } var ( rxImageMeta = regexp.MustCompile(`(?i)image|thumbnail`) rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`) rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`) ) // ProcessHTMLFile process HTML file that submitted through the io.Reader. func (arc *Archiver) ProcessHTMLFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) { // Parse HTML document doc, err := html.Parse(input) if err != nil { return ProcessResult{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", res.DownloadURL, err) } // Parse URL parsedURL, err := nurl.ParseRequestURI(res.DownloadURL) if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" { return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL) } // Convert lazy loaded image to normal fixLazyImages(doc) // Convert hyperlinks rith relative URL fixRelativeURIs(doc, parsedURL) // Extract resources from each nodes for _, node := range getElementsByTagName(doc, "*") { // First extract resources from inline style cssResources := extractInlineCSS(node, parsedURL) resources = append(resources, cssResources...) // Next extract resources from tag's specific attribute nodeResources := []ResourceURL{} switch tagName(node) { case "style": nodeResources = extractStyleTag(node, parsedURL) case "script": nodeResources = extractScriptTag(node, parsedURL) case "meta": nodeResources = extractMetaTag(node, parsedURL) case "img", "picture", "figure", "video", "audio", "source": nodeResources = extractMediaTag(node, parsedURL) case "link": nodeResources = extractGenericTag(node, "href", parsedURL) case "iframe": nodeResources = extractGenericTag(node, "src", parsedURL) case "object": nodeResources = extractGenericTag(node, "data", parsedURL) default: continue } resources = append(resources, nodeResources...) } // Get outer HTML of the doc result = ProcessResult{ Name: res.ArchivalURL, Content: outerHTML(doc), } return result, resources, nil } // ProcessCSSFile process CSS file that submitted through the io.Reader. func (arc *Archiver) ProcessCSSFile(res ResourceURL, input io.Reader) (result ProcessResult, resources []ResourceURL, err error) { // Parse URL parsedURL, err := nurl.ParseRequestURI(res.DownloadURL) if err != nil || parsedURL.Scheme == "" || parsedURL.Hostname() == "" { return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL) } // Extract CSS rules rules, resources := processCSS(input, parsedURL) result = ProcessResult{ Name: res.ArchivalURL, Content: []byte(rules), } return result, resources, nil } // ProcessOtherFile process files that not HTML, JS or CSS that submitted through the io.Reader. func (arc *Archiver) ProcessOtherFile(res ResourceURL, input io.Reader) (result ProcessResult, err error) { // Copy data to buffer buffer := bytes.NewBuffer(nil) _, err = io.Copy(buffer, input) if err != nil { return ProcessResult{}, fmt.Errorf("failed to copy data: %v", err) } // Create result result = ProcessResult{ Name: res.ArchivalURL, Content: buffer.Bytes(), } return result, nil } // fixRelativeURIs converts each in the given element // to an absolute URI, ignoring #ref URIs. func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) { links := getAllNodesWithTag(doc, "a") forEachNode(links, func(link *html.Node, _ int) { href := getAttribute(link, "href") if href == "" { return } // Replace links with javascript: URIs with text content, // since they won't work after scripts have been removed // from the page. if strings.HasPrefix(href, "javascript:") { text := createTextNode(textContent(link)) replaceNode(link, text) } else { newHref := toAbsoluteURI(href, pageURL) if newHref == "" { removeAttribute(link, "href") } else { setAttribute(link, "href", newHref) } } }) } // fixLazyImages convert images and figures that have properties like data-src into // images that can be loaded without JS. func fixLazyImages(root *html.Node) { imageNodes := getAllNodesWithTag(root, "img", "picture", "figure") forEachNode(imageNodes, func(elem *html.Node, _ int) { src := getAttribute(elem, "src") srcset := getAttribute(elem, "srcset") nodeTag := tagName(elem) nodeClass := className(elem) if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") { for i := 0; i < len(elem.Attr); i++ { attr := elem.Attr[i] if attr.Key == "src" || attr.Key == "srcset" { continue } copyTo := "" if rxLazyImageSrcset.MatchString(attr.Val) { copyTo = "srcset" } else if rxLazyImageSrc.MatchString(attr.Val) { copyTo = "src" } if copyTo == "" { continue } if nodeTag == "img" || nodeTag == "picture" { // if this is an img or picture, set the attribute directly setAttribute(elem, copyTo, attr.Val) } else if nodeTag == "figure" && len(getAllNodesWithTag(elem, "img", "picture")) == 0 { // if the item is a
that does not contain an image or picture, // create one and place it inside the figure see the nytimes-3 // testcase for an example img := createElement("img") setAttribute(img, copyTo, attr.Val) appendChild(elem, img) } } } }) } // extractInlineCSS extract archive's resource from the CSS rules inside // style attribute. Once finished, all CSS URLs in the style attribute // will be updated to use the archival URL. func extractInlineCSS(node *html.Node, pageURL *nurl.URL) []ResourceURL { // Make sure this node has inline style styleAttr := getAttribute(node, "style") if styleAttr == "" { return nil } // Extract resource URLs from the inline style // and update the CSS rules accordingly. reader := strings.NewReader(styleAttr) newStyleAttr, resources := processCSS(reader, pageURL) setAttribute(node, "style", newStyleAttr) return resources } // extractStyleTag extract archive's resource from inside a