diff --git a/pkg/warc/internal/archiver/processor.go b/pkg/warc/internal/archiver/processor.go index 0b122f19..e47ebd83 100644 --- a/pkg/warc/internal/archiver/processor.go +++ b/pkg/warc/internal/archiver/processor.go @@ -404,6 +404,7 @@ func processCSS(input io.Reader, baseURL *nurl.URL) (string, []ResourceURL) { // Save the CSS URL and replace it with archival URL res := ToResourceURL(cssURL, baseURL) if res.ArchivalURL == "" { + buffer.Write(bt) continue } diff --git a/pkg/warc/internal/archiver/resource-url.go b/pkg/warc/internal/archiver/resource-url.go index fdf69a6d..606223a6 100644 --- a/pkg/warc/internal/archiver/resource-url.go +++ b/pkg/warc/internal/archiver/resource-url.go @@ -31,12 +31,30 @@ func ToResourceURL(uri string, base *nurl.URL) ResourceURL { return ResourceURL{} } - // Create archive URL + // Create download URL downloadURL := toAbsoluteURI(uri, base) downloadURL = rxTrailingSlash.ReplaceAllString(downloadURL, "") downloadURL = strings.ReplaceAll(downloadURL, " ", "+") - archivalURL := strings.Replace(downloadURL, "://", "/", 1) + // Create archival URL + archivalURL := downloadURL + + // Some URLs have their query escaped. + // For example, Wikipedia's stylesheet URL looks like this: + // load.php?lang=en&modules=ext.3d.styles%7Cext.cite.styles%7Cext.uls.interlanguage + // However, when a browser downloads it, the URL is registered with an unescaped query: + // load.php?lang=en&modules=ext.3d.styles|ext.cite.styles|ext.uls.interlanguage + // So, for the archival URL, we need to unescape the query first. + tmp, err := nurl.Parse(downloadURL) + if err == nil { + newQuery, _ := nurl.QueryUnescape(tmp.RawQuery) + if newQuery != "" { + tmp.RawQuery = newQuery + archivalURL = tmp.String() + } + } + + archivalURL = strings.Replace(archivalURL, "://", "/", 1) archivalURL = strings.ReplaceAll(archivalURL, "?", "-") archivalURL = strings.ReplaceAll(archivalURL, "#", "-") archivalURL = strings.ReplaceAll(archivalURL, "/", "-")