theHarvester/discovery/googlesearch.py

import myparser
import time
import requests
import random

class search_google:
    """Fetch Google result pages for a search word and parse them with ``myparser``.

    Raw response bodies are accumulated in ``totalresults``; the ``get_*``
    methods run ``myparser.parser`` over that accumulated text.
    """

    # Per-character escapes applied to every dork before it is appended to
    # the search URL.  A space becomes '+'; a literal '+' becomes %2B.
    _DORK_ESCAPES = {
        ':': "%3A", '+': "%2B", '.': "%2E", '"': "%22", '*': "%2A",
        '[': "%5B", ']': "%5D", '?': "%3F", ' ': '+', '/': "%2F",
        "'": "%27", '&': "%26", '(': "%28", ')': "%29", '|': "%7C",
    }

    def __init__(self, word, limit, start):
        """Store the search parameters and reset all result buffers.

        word  -- search term interpolated into every query URL
        limit -- upper bound compared against the result counter
        start -- initial value of the result counter
        """
        self.word = word
        self.results = ""
        self.totalresults = ""
        # NOTE(review): get_files() reads self.files, but nothing in this
        # class ever assigned it, so the call raised AttributeError.  An empty
        # string is used as a placeholder -- confirm what myparser.fileurls
        # expects here.
        self.files = ""
        self.server = "www.google.com"
        self.dorks = []
        self.links = []
        self.database = "https://www.google.com/search?q="
        self.userAgent = [
            "Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
            "Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; RM-1152) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15254",
            "Mozilla/5.0 (SMART-TV; X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 OPR/43.0.2442.991",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
        ]
        self.quantity = "100"
        self.limit = limit
        self.counter = start

    def _fetch(self, url):
        """GET ``url`` with a random User-Agent and accumulate the body.

        Returns True when the request succeeded and the response body was
        appended to ``totalresults``; on a request error the error is printed
        and False is returned, leaving the buffers untouched.
        """
        # The User-Agent must travel as an HTTP header; passing it through
        # ``params`` would append it to the query string instead.
        headers = {'User-Agent': random.choice(self.userAgent)}
        try:
            response = requests.get(url, headers=headers)
        except requests.RequestException as error:
            print(error)
            return False
        self.results = response.content
        self.totalresults += self.results
        return True

    def do_search(self):
        """Fetch one page of results for ``@"<word>"`` at the current counter."""
        urly = ("http://" + self.server + "/search?num=" + self.quantity +
                "&start=" + str(self.counter) + "&hl=en&meta=&q=%40\"" + self.word + "\"")
        self._fetch(urly)

    def do_search_profiles(self):
        """Fetch one page of the Google-Profile query for the word."""
        urly = ("http://" + self.server + "/search?num=" + self.quantity +
                "&start=" + str(self.counter) +
                "&hl=en&meta=&q=site:www.google.com%20intitle:\"Google%20Profile\"%20\"Companies%20I%27ve%20worked%20for\"%20\"at%20" +
                self.word + "\"")
        self._fetch(urly)

    def get_emails(self):
        """Return ``myparser``'s email extraction over everything fetched so far."""
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        """Return ``myparser``'s hostname extraction over everything fetched so far."""
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def get_files(self):
        """Return ``myparser``'s file-URL extraction over everything fetched so far."""
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.fileurls(self.files)

    def get_profiles(self):
        """Return ``myparser``'s profile extraction over everything fetched so far."""
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.profiles()

    def process(self, google_dorking):
        """Run the search loop.

        With a false ``google_dorking`` the plain search is paged in steps of
        100 while the counter is within ``limit`` and at most 1000.  Otherwise
        the counter is reset and the dork list is sent in batches of 100.
        """
        if not google_dorking:
            while self.counter <= self.limit and self.counter <= 1000:
                self.do_search()
                time.sleep(1)
                print("\tSearching " + str(self.counter) + " results...")
                self.counter += 100
        else:
            self.counter = 0  # dorking always starts from the first batch
            print('\n')
            print("[-] Searching with Google Dorks: ")
            # Only two batches (dorks 0-199) exist, so stop before 200.
            while self.counter <= self.limit and self.counter < 200:
                self.googledork()
                time.sleep(.1)
                print("\tSearching " + str(self.counter) + " results...")
                self.counter += 100

    def process_profiles(self):
        """Page through the profile search in steps of 100 until ``limit``."""
        while self.counter < self.limit:
            self.do_search_profiles()
            time.sleep(0.2)
            self.counter += 100
            print("\tSearching " + str(self.counter) + " results...")

    def append_dorks(self):
        """Load the dork list from ``wordlists/dorks.txt`` into ``self.dorks``.

        An unreadable file is reported on stdout and leaves ``self.dorks``
        unchanged.
        """
        try:  # the relative path may not resolve from the current directory
            with open('wordlists/dorks.txt', mode='r') as fp:
                self.dorks = [dork.strip() for dork in fp]
        except IOError as error:
            print(error)

    def construct_dorks(self):
        """Build ``self.links``: one search URL per entry in ``self.dorks``.

        Each link is ``database + '+' + word + '+' + <escaped dork>``.  Only
        the dork is escaped; ``word`` is inserted as given.
        """
        escapes = self._DORK_ESCAPES
        prefix = self.database + '+' + self.word + '+'
        # One pass per dork: no escape sequence contains a character that is
        # itself escaped, so this matches replacing each character in turn.
        self.links = [
            prefix + "".join(escapes.get(char, char) for char in str(dork))
            for dork in self.dorks
        ]

    def _dork_window(self):
        """Return the ``(start, end)`` dork slice for the current counter, or None."""
        if 0 <= self.counter < 100:
            return (0, 100)
        if 100 <= self.counter < 200:
            return (100, 200)
        return None  # only 200 dorks are sent, to limit the risk of an IP block

    def googledork(self):
        """Reload the dork list and send the batch selected by the counter."""
        self.append_dorks()
        self.construct_dorks()
        window = self._dork_window()
        if window is not None:
            self.send_dork(start=window[0], end=window[1])

    def send_dork(self, start, end):
        """Request the dork links in ``self.links[start:end]`` one by one.

        Slicing tolerates a dork list shorter than ``end``.  Failed requests
        are reported by ``_fetch`` and skipped.
        """
        for link in self.links[start:end]:
            self._fetch(link)
            time.sleep(.2)  # brief pause between consecutive requests
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`import myparser`
Initial commit for version 2.0 2011-05-04 23:07:06 +08:00			`import time`
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`import requests`
Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00			`import random`
Initial commit for version 2.0 2011-05-04 23:07:06 +08:00
			`class search_google:`

Made boolean to be an arguement instead of necessary in class as only used in one spot which is inside function. 2018-10-27 04:15:43 +08:00			`def __init__(self, word, limit, start):`
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`self.word = word`
			`self.results = ""`
			`self.totalresults = ""`
			`self.server = "www.google.com"`
Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00			`self.dorks = []`
			`self.links = []`
			`self.database = "https://www.google.com/search?q="`
			`self.userAgent = ["(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6",`
			`"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`,("Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) " +`
			`"AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36"),`
			`("Mozilla/5.0 (Windows Phone 10.0; Android 6.0.1; Microsoft; RM-1152) " +`
Reworked some more logic, and verified that the flag does get more hostnames and emails; although, the biggest obstacle is google blocking your ip. 2018-10-25 03:38:51 +08:00			`"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Mobile Safari/537.36 Edge/15.15254"),`
			`"Mozilla/5.0 (SMART-TV; X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42"`
			`,"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 OPR/43.0.2442.991"`
			`,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52"`
			`,"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"`
			`,"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"`
			`,"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"]`
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`self.quantity = "100"`
			`self.limit = limit`
			`self.counter = start`

Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00			`def do_search(self):`
Added some more docstring, about to restructure some logic, made it so if user can do normal google harvesting and google dorking as well. 2018-10-25 00:35:53 +08:00			`try: #do normal scraping`
			`urly="http://" + self.server + "/search?num=" + self.quantity + "&start=" + str(self.counter) + "&hl=en&meta=&q=%40\"" + self.word + "\""`
			`except Exception, e:`
			`print e`
			`try:`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`params = {'User-Agent': random.choice(self.userAgent)} #select random user agent`
Reworked some more logic, and verified that the flag does get more hostnames and emails; although, the biggest obstacle is google blocking your ip. 2018-10-25 03:38:51 +08:00			`r=requests.get(urly,params= params)`
Added some more docstring, about to restructure some logic, made it so if user can do normal google harvesting and google dorking as well. 2018-10-25 00:35:53 +08:00			`except Exception,e:`
			`print e`
			`self.results = r.content`
			`self.totalresults += self.results`
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00
			`def do_search_profiles(self):`
			`try:`
			`urly="http://" + self.server + "/search?num=" + self.quantity + "&start=" + str(self.counter) + "&hl=en&meta=&q=site:www.google.com%20intitle:\"Google%20Profile\"%20\"Companies%20I%27ve%20worked%20for\"%20\"at%20" + self.word + "\""`
			`except Exception, e:`
			`print e`
			`try:`
			`r=requests.get(urly)`
			`except Exception,e:`
			`print e`
Verified that google dorking is getting more results, have to modify searches to utilize different dorks! 2018-10-24 21:36:20 +08:00			`self.results = r.content`
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`#'&hl=en&meta=&q=site:www.google.com%20intitle:"Google%20Profile"%20"Companies%20I%27ve%20worked%20for"%20"at%20' + self.word + '"')`
			`self.totalresults += self.results`

			`def get_emails(self):`
			`rawres = myparser.parser(self.totalresults, self.word)`
			`return rawres.emails()`

			`def get_hostnames(self):`
			`rawres = myparser.parser(self.totalresults, self.word)`
			`return rawres.hostnames()`
Initial commit for version 2.0 2011-05-04 23:07:06 +08:00
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`def get_files(self):`
			`rawres = myparser.parser(self.totalresults, self.word)`
			`return rawres.fileurls(self.files)`
Initial commit for version 2.0 2011-05-04 23:07:06 +08:00
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`def get_profiles(self):`
			`rawres = myparser.parser(self.totalresults, self.word)`
			`return rawres.profiles()`
Initial commit for version 2.0 2011-05-04 23:07:06 +08:00
Made boolean to be an arguement instead of necessary in class as only used in one spot which is inside function. 2018-10-27 04:15:43 +08:00			`def process(self,google_dorking):`
			`if google_dorking == False:`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`while self.counter <= self.limit and self.counter <= 1000:`
			`self.do_search()`
			`#more = self.check_next()`
			`time.sleep(1)`
			`print "\tSearching " + str(self.counter) + " results..."`
			`self.counter += 100`
			`else: #google dorking is true`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`self.counter = 0 #reset counter`
			`print '\n'`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`print "[-] Searching with Google Dorks: "`
			`while self.counter <= self.limit and self.counter <= 200: # only 200 dorks in list`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`self.googledork() #call google dorking method if user wanted it!`
			`# more = self.check_next()`
Added hunter search engine and just need to test it, checked to make sure user key was not empty. 2018-11-05 06:26:34 +08:00			`time.sleep(.1)`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`print "\tSearching " + str(self.counter) + " results..."`
			`self.counter += 100`
Initial commit for version 2.0 2011-05-04 23:07:06 +08:00
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`def process_profiles(self):`
			`while self.counter < self.limit:`
			`self.do_search_profiles()`
Reworking hunter search engine into theHarvester.py to work properly. 2018-11-05 08:15:07 +08:00			`time.sleep(0.2)`
2.5 Fixed Google searches, and introduced Requests library 2014-12-17 07:25:12 +08:00			`self.counter += 100`
			`print "\tSearching " + str(self.counter) + " results..."`
Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00
			`def append_dorks(self):`
			`try: # wrap in try-except incase filepaths are messed up`
Cleaned up how to open file, and fixed how to open file in wfuzz. 2018-10-26 10:14:34 +08:00			`with open('wordlists/dorks.txt', mode='r') as fp:`
Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00			`self.dorks = [dork.strip() for dork in fp]`
			`except IOError as error:`
			`print(error)`

			`def construct_dorks(self):`
			`#format is: site:targetwebsite.com + space + inurl:admindork`
			`colon = "%3A"`
			`plus = "%2B"`
			`space = '+'`
			`period = "%2E"`
			`double_quote = "%22"`
			`asterick = "%2A"`
			`left_bracket = "%5B"`
			`right_bracket = "%5D"`
			`question_mark = "%3F"`
			`slash = "%2F"`
			`single_quote = "%27"`
			`ampersand = "%26"`
			`left_peren = "%28"`
			`right_peren = "%29"`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`pipe = '%7C'`
			`# replace links with html encoding`
Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00			`self.links = [self.database + space + self.word + space +`
			`str(dork).replace(':', colon).replace('+', plus).replace('.', period).replace('"', double_quote)`
			`.replace("*", asterick).replace('[', left_bracket).replace(']', right_bracket)`
			`.replace('?', question_mark).replace(' ', space).replace('/', slash).replace("'", single_quote)`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`.replace("&", ampersand).replace('(', left_peren).replace(')', right_peren).replace('\|', pipe)`
Removed google dork class, added more dorks, and added boolean to indicate if user wants google dorks in google search. 2018-10-24 04:54:59 +08:00			`for dork in self.dorks]`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00
			`def googledork(self):`
			`self.append_dorks() # call functions to create list`
			`self.construct_dorks()`
			`if (self.counter >= 0 and self.counter <=100):`
			`self.send_dork(start=0, end=100)`
			`elif (self.counter >= 100 and self.counter <=200):`
			`self.send_dork(start=101, end=200)`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`else: #only 200 dorks to prevent google from blocking ip`
			`pass`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`def send_dork(self, start, end): # helper function to minimize code reusability`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`params = {'User-Agent': random.choice(self.userAgent)}`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`# get random user agent to try and prevent google from blocking ip`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`for i in range(start, end):`
			`try:`
Shrunk dorks.txt to 200 to decrease chances of getting caught by google. 2018-10-25 08:49:48 +08:00			`link = self.links[i] # get link from dork list`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`req = requests.get(link, params=params)`
Reworking hunter search engine into theHarvester.py to work properly. 2018-11-05 08:15:07 +08:00			`time.sleep(.2) # sleep for a short time`
Reworked logic, increased dorks.txt, and need to do a few test runs to verifiy dorking increases output. 2018-10-25 03:09:23 +08:00			`self.results = req.content`
			`self.totalresults += self.results`
			`except:`
			`continue`