From 01d0df5d58853caeacc4b3ed3f164263d66f58a1 Mon Sep 17 00:00:00 2001 From: Radhi Fadlillah Date: Fri, 2 Aug 2019 21:45:43 +0700 Subject: [PATCH] Remove javascript before archival --- pkg/warc/internal/archiver/processor.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/warc/internal/archiver/processor.go b/pkg/warc/internal/archiver/processor.go index 10f90e97..b0604483 100644 --- a/pkg/warc/internal/archiver/processor.go +++ b/pkg/warc/internal/archiver/processor.go @@ -44,6 +44,9 @@ func (arc *Archiver) ProcessHTMLFile(res ResourceURL, input io.Reader) (result P return ProcessResult{}, nil, fmt.Errorf("url %s is not valid", res.DownloadURL) } + // TODO: I'm still not really sure, but IMHO it's safer to disable Javascript + removeNodes(getElementsByTagName(doc, "script"), nil) + // Convert lazy loaded image to normal fixLazyImages(doc)