Fix: unescaped archive URL can't be opened

This commit is contained in:
Radhi Fadlillah 2019-06-10 14:24:45 +07:00
parent 7137c0693a
commit 598ea9476d
3 changed files with 26 additions and 13 deletions

View file

@ -133,9 +133,11 @@ func (arc *Archiver) archive(res ResourceURL) {
// Save content to storage
arc.Logf(infoLog, "Downloaded %s\n"+
"\tArchive name %s\n"+
"\tParent %s\n"+
"\tSize %d Bytes\n",
res.DownloadURL,
res.ArchivalURL,
res.Parent,
resp.ContentLength)

View file

@ -27,8 +27,6 @@ var (
rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
rxStyleURL = regexp.MustCompile(`(?i)^url\((.+)\)$`)
rxSingleQuote = regexp.MustCompile(`(?i)^'([^']*)'$`)
rxDoubleQuote = regexp.MustCompile(`(?i)^"([^"]*)"$`)
rxJSContentType = regexp.MustCompile(`(?i)(text|application)/(java|ecma)script`)
)
@ -398,8 +396,9 @@ func processCSS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
// Sanitize the URL by removing `url()`, quotation mark and trailing slash
cssURL := string(bt)
cssURL = rxStyleURL.ReplaceAllString(cssURL, "$1")
cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")
cssURL = strings.TrimSpace(cssURL)
cssURL = strings.Trim(cssURL, `'`)
cssURL = strings.Trim(cssURL, `"`)
// Save the CSS URL and replace it with archival URL
res := ToResourceURL(cssURL, baseURL)
@ -453,13 +452,15 @@ func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
var newURL string
text := string(bt)
text = rxSingleQuote.ReplaceAllString(text, "$1")
text = rxDoubleQuote.ReplaceAllString(text, "$1")
text = strings.TrimSpace(text)
text = strings.Trim(text, `'`)
text = strings.Trim(text, `"`)
if strings.HasPrefix(text, "url(") {
cssURL := rxStyleURL.ReplaceAllString(text, "$1")
cssURL = rxSingleQuote.ReplaceAllString(cssURL, "$1")
cssURL = rxDoubleQuote.ReplaceAllString(cssURL, "$1")
cssURL = strings.TrimSpace(cssURL)
cssURL = strings.Trim(cssURL, `'`)
cssURL = strings.Trim(cssURL, `"`)
res = ToResourceURL(cssURL, baseURL)
newURL = fmt.Sprintf("\"url('%s')\"", res.ArchivalURL)
@ -474,7 +475,14 @@ func processJS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) {
ext := path.Ext(tmp.Path)
cType := mime.TypeByExtension(ext)
if !strings.Contains(cType, "text/css") && !rxJSContentType.MatchString(cType) {
switch {
case rxJSContentType.MatchString(cType),
strings.Contains(cType, "text/css"),
strings.Contains(cType, "image/"),
strings.Contains(cType, "audio/"),
strings.Contains(cType, "video/"):
default:
buffer.Write(bt)
continue
}

View file

@ -39,22 +39,25 @@ func ToResourceURL(uri string, base *nurl.URL) ResourceURL {
// Create archival URL
archivalURL := downloadURL
// Some URLs have their query escaped.
// Some URLs have their query or path escaped, e.g. Wikipedia and Dev.to.
// For example, Wikipedia's stylesheet URL looks like this:
// load.php?lang=en&modules=ext.3d.styles%7Cext.cite.styles%7Cext.uls.interlanguage
// However, when a browser downloads it, it is registered with an unescaped query:
// load.php?lang=en&modules=ext.3d.styles|ext.cite.styles|ext.uls.interlanguage
// So, for the archival URL, we need to unescape the query first.
// So, for the archival URL, we need to unescape the query and path first.
tmp, err := nurl.Parse(downloadURL)
if err == nil {
tmp.RawPath = tmp.Path
newQuery, _ := nurl.QueryUnescape(tmp.RawQuery)
if newQuery != "" {
tmp.RawQuery = newQuery
archivalURL = tmp.String()
}
archivalURL = tmp.String()
}
archivalURL = strings.Replace(archivalURL, "://", "/", 1)
archivalURL = strings.ReplaceAll(archivalURL, "://", "/")
archivalURL = strings.ReplaceAll(archivalURL, "?", "-")
archivalURL = strings.ReplaceAll(archivalURL, "#", "-")
archivalURL = strings.ReplaceAll(archivalURL, "/", "-")