From 96d4a4cff35db7ff38206d4108796ae992c3568c Mon Sep 17 00:00:00 2001 From: Radhi Fadlillah Date: Sun, 9 Jun 2019 23:59:45 +0700 Subject: [PATCH] Better(?) archival process for iframe and js --- pkg/warc/internal/archiver/archiver.go | 4 ++- pkg/warc/internal/archiver/processor.go | 37 ++++++++++++++++++++-- pkg/warc/internal/archiver/resource-url.go | 1 + pkg/warc/internal/archiver/utils.go | 8 ----- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pkg/warc/internal/archiver/archiver.go b/pkg/warc/internal/archiver/archiver.go index cdfba0b3..d3a91ccb 100644 --- a/pkg/warc/internal/archiver/archiver.go +++ b/pkg/warc/internal/archiver/archiver.go @@ -104,13 +104,15 @@ func (arc *Archiver) archive(res ResourceURL) { // Process resource depending on its type. // Since this `archive` method only used for processing sub - // resource, we will only process the CSS sub resources. + // resource, we will only process the CSS and HTML sub resources. // For other file, we will simply download it as it is. 
var result ProcessResult var subResources []ResourceURL cType := resp.Header.Get("Content-Type") switch { + case strings.Contains(cType, "text/html") && res.IsEmbedded: + result, subResources, err = arc.ProcessHTMLFile(res, resp.Body) case strings.Contains(cType, "text/css"): result, subResources, err = arc.ProcessCSSFile(res, resp.Body) default: diff --git a/pkg/warc/internal/archiver/processor.go b/pkg/warc/internal/archiver/processor.go index e37cca59..0b122f19 100644 --- a/pkg/warc/internal/archiver/processor.go +++ b/pkg/warc/internal/archiver/processor.go @@ -4,7 +4,9 @@ import ( "bytes" "fmt" "io" + "mime" nurl "net/url" + "path" "regexp" "strings" @@ -24,6 +26,10 @@ var ( rxImageMeta = regexp.MustCompile(`(?i)image|thumbnail`) rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`) rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`) + rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`) + rxSingleQuote = regexp.MustCompile(`(?i)^'([^']*)'$`) + rxDoubleQuote = regexp.MustCompile(`(?i)^"([^"]*)"$`) + rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`) ) // ProcessHTMLFile process HTML file that submitted through the io.Reader. @@ -356,6 +362,11 @@ func extractGenericTag(node *html.Node, attrName string, pageURL *nurl.URL) []Re return nil } + // If this node is iframe, mark it as embedded + if tagName(node) == "iframe" { + res.IsEmbedded = true + } + setAttribute(node, attrName, res.ArchivalURL) return []ResourceURL{res} } @@ -431,9 +442,12 @@ func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) { // Process the string. // Unlike CSS, JS doesn't have it's own URL token. So, we can only guess whether - // a string is URL or not. For simplicity, we only catch those that wrapped in `url()` - // because it's usually CSS resource which downloaded via JS. However, - // if it doesn't fulfill the criteria above, just write it as it is. + // a string is URL or not. 
There are several criteria to decide if it's a URL: + // - It is surrounded by `url()` just like CSS + // - It starts with http(s):// for an absolute URL + // - It starts with a slash (/) for a relative URL + // - + // If it doesn't fulfill any of the criteria above, just write it as it is. var res ResourceURL var newURL string @@ -448,6 +462,23 @@ func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) { res = ToResourceURL(cssURL, baseURL) newURL = fmt.Sprintf("\"url('%s')\"", res.ArchivalURL) + } else if strings.HasPrefix(text, "/") || rxHTTPScheme.MatchString(text) { + res = ToResourceURL(text, baseURL) + + tmp, err := nurl.Parse(res.DownloadURL) + if err != nil { + buffer.Write(bt) + continue + } + + ext := path.Ext(tmp.Path) + cType := mime.TypeByExtension(ext) + if !strings.Contains(cType, "text/css") && !rxJSContentType.MatchString(cType) { + buffer.Write(bt) + continue + } + + newURL = fmt.Sprintf("\"%s\"", res.ArchivalURL) } else { buffer.Write(bt) continue diff --git a/pkg/warc/internal/archiver/resource-url.go b/pkg/warc/internal/archiver/resource-url.go index ac30191f..fdf69a6d 100644 --- a/pkg/warc/internal/archiver/resource-url.go +++ b/pkg/warc/internal/archiver/resource-url.go @@ -18,6 +18,7 @@ type ResourceURL struct { DownloadURL string ArchivalURL string Parent string + IsEmbedded bool } // ToResourceURL generates an uri into a Resource URL. diff --git a/pkg/warc/internal/archiver/utils.go b/pkg/warc/internal/archiver/utils.go index 956ed025..2d51e933 100644 --- a/pkg/warc/internal/archiver/utils.go +++ b/pkg/warc/internal/archiver/utils.go @@ -2,17 +2,9 @@ package archiver import ( nurl "net/url" - "regexp" "strings" ) -var ( - rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`) - rxSingleQuote = regexp.MustCompile(`(?i)^'(.*)'$`) - rxDoubleQuote = regexp.MustCompile(`(?i)^"(.*)"$`) - rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`) -) - func clearUTMParams(url *nurl.URL) { queries := url.Query()