From 598ea9476d9cdcc29eaf74725a41e9159376e113 Mon Sep 17 00:00:00 2001 From: Radhi Fadlillah Date: Mon, 10 Jun 2019 14:24:45 +0700 Subject: [PATCH] Fix: unescaped archive URL can't be opened --- pkg/warc/internal/archiver/archiver.go | 2 ++ pkg/warc/internal/archiver/processor.go | 26 ++++++++++++++-------- pkg/warc/internal/archiver/resource-url.go | 11 +++++---- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/pkg/warc/internal/archiver/archiver.go b/pkg/warc/internal/archiver/archiver.go index 8929ce3..8bf4b4d 100644 --- a/pkg/warc/internal/archiver/archiver.go +++ b/pkg/warc/internal/archiver/archiver.go @@ -133,9 +133,11 @@ func (arc *Archiver) archive(res ResourceURL) { // Save content to storage arc.Logf(infoLog, "Downloaded %s\n"+ + "\tArchive name %s\n"+ "\tParent %s\n"+ "\tSize %d Bytes\n", res.DownloadURL, + res.ArchivalURL, res.Parent, resp.ContentLength) diff --git a/pkg/warc/internal/archiver/processor.go b/pkg/warc/internal/archiver/processor.go index e47ebd8..10f90e9 100644 --- a/pkg/warc/internal/archiver/processor.go +++ b/pkg/warc/internal/archiver/processor.go @@ -27,8 +27,6 @@ var ( rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`) rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`) rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`) - rxSingleQuote = regexp.MustCompile(`(?i)^'([^']*)'$`) - rxDoubleQuote = regexp.MustCompile(`(?i)^"([^"]*)"$`) rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`) ) @@ -398,8 +396,9 @@ func processCSS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) { // Sanitize the URL by removing `url()`, quotation mark and trailing slash cssURL := string(bt) cssURL = rxStyleURL.ReplaceAllString(cssURL, "$1") - cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1") - cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1") + cssURL = strings.TrimSpace(cssURL) + cssURL = strings.Trim(cssURL, `'`) + cssURL = strings.Trim(cssURL, `"`) // Save the CSS URL and replace it with archival URL res := ToResourceURL(cssURL, baseURL) @@ -453,13 +452,15 @@ func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) { var newURL string text := string(bt) - text = rxSingleQuote.ReplaceAllString(text, "$1") - text = rxDoubleQuote.ReplaceAllString(text, "$1") + text = strings.TrimSpace(text) + text = strings.Trim(text, `'`) + text = strings.Trim(text, `"`) if strings.HasPrefix(text, "url(") { cssURL := rxStyleURL.ReplaceAllString(text, "$1") - cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1") - cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1") + cssURL = strings.TrimSpace(cssURL) + cssURL = strings.Trim(cssURL, `'`) + cssURL = strings.Trim(cssURL, `"`) res = ToResourceURL(cssURL, baseURL) newURL = fmt.Sprintf("\"url('%s')\"", res.ArchivalURL) @@ -474,7 +475,14 @@ func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) { ext := path.Ext(tmp.Path) cType := mime.TypeByExtension(ext) - if !strings.Contains(cType, "text/css") && !rxJSContentType.MatchString(cType) { + + switch { + case rxJSContentType.MatchString(cType), + strings.Contains(cType, "text/css"), + strings.Contains(cType, "image/"), + strings.Contains(cType, "audio/"), + strings.Contains(cType, "video/"): + default: buffer.Write(bt) continue } diff --git a/pkg/warc/internal/archiver/resource-url.go b/pkg/warc/internal/archiver/resource-url.go index 606223a..239abf3 100644 --- a/pkg/warc/internal/archiver/resource-url.go +++ b/pkg/warc/internal/archiver/resource-url.go @@ -39,22 +39,25 @@ func ToResourceURL(uri string, base *nurl.URL) ResourceURL { // Create archival URL archivalURL := downloadURL - // Some URL have its query escaped. + // Some URL have its query or path escaped, e.g. Wikipedia and Dev.to. // For example, Wikipedia's stylesheet looks like this : // load.php?lang=en&modules=ext.3d.styles%7Cext.cite.styles%7Cext.uls.interlanguage // However, when browser download it, it will be registered as unescaped query : // load.php?lang=en&modules=ext.3d.styles|ext.cite.styles|ext.uls.interlanguage - // So, for archival URL, we need to unescape the query first. + // So, for archival URL, we need to unescape the query and path first. tmp, err := nurl.Parse(downloadURL) if err == nil { + tmp.RawPath = tmp.Path + newQuery, _ := nurl.QueryUnescape(tmp.RawQuery) if newQuery != "" { tmp.RawQuery = newQuery - archivalURL = tmp.String() } + + archivalURL = tmp.String() } - archivalURL = strings.Replace(archivalURL, "://", "/", 1) + archivalURL = strings.ReplaceAll(archivalURL, "://", "/") archivalURL = strings.ReplaceAll(archivalURL, "?", "-") archivalURL = strings.ReplaceAll(archivalURL, "#", "-") archivalURL = strings.ReplaceAll(archivalURL, "/", "-")