shiori/pkg/warc/internal/archiver/resource-url.go
2019-06-09 14:54:07 +07:00

50 lines
1.3 KiB
Go

package archiver
import (
nurl "net/url"
"regexp"
"strings"
)
// Patterns used while turning a raw resource reference into download and
// archival URLs.
var (
	// rxHTTPScheme matches a leading "http://" or "https://", ignoring case.
	rxHTTPScheme = regexp.MustCompile(`(?i)^https?:\/{2}`)

	// rxTrailingSlash matches one or more slashes at the very end of the input.
	rxTrailingSlash = regexp.MustCompile(`(?i)/+$`)

	// rxRepeatedStrip matches a run of one or more consecutive dashes.
	rxRepeatedStrip = regexp.MustCompile(`(?i)-+`)
)
// ResourceURL is a struct that holds the URLs needed for downloading and
// archiving a single resource. Its zero value means "no resource".
type ResourceURL struct {
	// DownloadURL is the absolute URL the resource is fetched from.
	DownloadURL string

	// ArchivalURL is the flattened form of DownloadURL in which the scheme
	// separator, slashes, "?" and "#" have been replaced by dashes.
	ArchivalURL string

	// Parent is the string form of the base URL the resource was resolved
	// against.
	Parent string
}
// ToResourceURL converts uri, a resource reference resolved against base,
// into a ResourceURL.
//
// The zero ResourceURL is returned when the reference cannot be archived:
//   - uri is blank after trimming surrounding whitespace;
//   - uri contains a colon but does not start with "http://" or "https://"
//     (compared case-insensitively), e.g. "data:..." or "mailto:..."; note
//     that this also rejects relative references that contain a colon;
//   - base is nil, since there is nothing to resolve against and no parent
//     to record.
//
// Otherwise the fields are populated as follows:
//   - DownloadURL is uri made absolute against base by toAbsoluteURI, with
//     trailing slashes removed and spaces replaced by "+".
//   - ArchivalURL is DownloadURL flattened into a single name: the first
//     "://" becomes "/", every "?", "#" and "/" becomes "-", and runs of
//     "-" are collapsed into one. For a download URL of
//     "http://example.com/a/b?x=1" this yields "http-example.com-a-b-x=1".
//   - Parent is base.String().
func ToResourceURL(uri string, base *nurl.URL) ResourceURL {
	// Make sure the URL is present and has a supported scheme.
	uri = strings.TrimSpace(uri)
	if uri == "" {
		return ResourceURL{}
	}
	if strings.Contains(uri, ":") && !rxHTTPScheme.MatchString(uri) {
		return ResourceURL{}
	}

	// Guard against a nil base: base.String() below would otherwise panic
	// with a nil pointer dereference.
	if base == nil {
		return ResourceURL{}
	}

	// Create the download URL.
	downloadURL := toAbsoluteURI(uri, base)
	downloadURL = rxTrailingSlash.ReplaceAllString(downloadURL, "")
	downloadURL = strings.ReplaceAll(downloadURL, " ", "+")

	// Create the archival URL from the download URL. Spaces need no handling
	// here because they were all turned into "+" just above.
	archivalURL := strings.Replace(downloadURL, "://", "/", 1)
	for _, sep := range []string{"?", "#", "/"} {
		archivalURL = strings.ReplaceAll(archivalURL, sep, "-")
	}
	archivalURL = rxRepeatedStrip.ReplaceAllString(archivalURL, "-")

	return ResourceURL{
		DownloadURL: downloadURL,
		ArchivalURL: archivalURL,
		Parent:      base.String(),
	}
}