theHarvester/discovery/googlesearch.py

151 lines
5.7 KiB
Python

from parsers import myparser
import time
import requests
from discovery.constants import *
class search_google:
    """Scrape Google result pages for a target word and collect the raw HTML.

    Every fetched page body is appended to ``self.totalresults``; the
    ``get_*`` methods hand that accumulated text to ``myparser.Parser``.

    ``googleUA``, ``search`` and ``getDelay`` are not defined in this file;
    they are expected to come from the ``discovery.constants`` star import.
    """

    # One-pass encoding table for dork text. Each character is mapped exactly
    # once, so a literal '+' becomes %2B while a space becomes '+'.
    _DORK_ENCODING = str.maketrans({
        ':': '%3A',
        '+': '%2B',
        '.': '%2E',
        '"': '%22',
        '*': '%2A',
        '[': '%5B',
        ']': '%5D',
        '?': '%3F',
        ' ': '+',
        '/': '%2F',
        "'": '%27',
        '&': '%26',
        '(': '%28',
        ')': '%29',
        '|': '%7C',
    })

    def __init__(self, word, limit, start):
        """Store the search parameters.

        Args:
            word: Target string inserted into every query.
            limit: Upper bound compared against the result offset.
            start: Initial result offset (``self.counter``).
        """
        self.word = word
        self.results = ""  # body of the most recent response
        self.totalresults = ""  # concatenation of every response body
        self.server = "www.google.com"
        self.dorks = []  # raw dork lines read by append_dorks()
        self.links = []  # request URLs built by construct_dorks()
        self.database = "https://www.google.com/search?q="
        self.quantity = "100"  # value of the num= query parameter
        self.limit = limit
        self.counter = start
        # get_files() reads self.files, but nothing ever assigned it, so the
        # call raised AttributeError.
        # NOTE(review): "pdf" is an assumed default - confirm the intended value.
        self.files = "pdf"

    def _fetch(self, url):
        """GET ``url``, throttle, and append the body to ``self.totalresults``.

        A failed request is printed and skipped instead of falling through to
        an unbound response object.

        Returns:
            True if a response body was stored, False if the request failed.
        """
        try:
            response = requests.get(url, headers={'User-Agent': googleUA})
        except Exception as error:
            print(error)
            return False
        self.results = response.text
        # NOTE(review): search() presumably detects Google's block page, which
        # is why a truthy result triggers the longer pause - confirm.
        if search(self.results):
            time.sleep(getDelay() * 5)  # sleep for a longer time
        else:
            time.sleep(getDelay())
        self.totalresults += self.results
        return True

    def do_search(self):
        """Fetch one result page for ``@"<word>"`` at the current offset."""
        url = (
            f'http://{self.server}/search?num={self.quantity}'
            f'&start={self.counter}&hl=en&meta=&q=%40"{self.word}"'
        )
        self._fetch(url)

    def do_search_profiles(self):
        """Fetch one result page of the Google Profile query for the word."""
        url = (
            f'http://{self.server}/search?num={self.quantity}'
            f'&start={self.counter}&hl=en&meta=&q=site:www.google.com'
            '%20intitle:"Google%20Profile"'
            '%20"Companies%20I%27ve%20worked%20for"'
            f'%20"at%20{self.word}"'
        )
        self._fetch(url)

    def get_emails(self):
        """Return ``Parser.emails()`` for the accumulated results."""
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        """Return ``Parser.hostnames()`` for the accumulated results."""
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.hostnames()

    def get_files(self):
        """Return ``Parser.fileurls(self.files)`` for the accumulated results."""
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.fileurls(self.files)

    def get_profiles(self):
        """Return ``Parser.profiles()`` for the accumulated results."""
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.profiles()

    def process(self, google_dorking):
        """Run the plain search loop, or the dork loop.

        Args:
            google_dorking: Exactly ``False`` selects normal scraping; any
                other value selects dorking.
        """
        if google_dorking is False:
            # Step through result offsets 100 at a time, never past 1000.
            while self.counter <= self.limit and self.counter <= 1000:
                self.do_search()
                print(f'\tSearching {self.counter} results...')
                self.counter += 100
        else:  # google dorking is true
            self.counter = 0  # reset counter
            print('\n')
            print("[-] Searching with Google Dorks: ")
            # At most 200 dorks are sent, in batches of 100.
            while self.counter <= self.limit and self.counter <= 200:
                self.googledork()
                print(f'\tSearching {self.counter} results...')
                self.counter += 100

    def process_profiles(self):
        """Fetch profile result pages until the offset reaches the limit."""
        while self.counter < self.limit:
            self.do_search_profiles()
            time.sleep(getDelay())
            self.counter += 100
            print(f'\tSearching {self.counter} results...')

    def append_dorks(self):
        """Load dorks from ``wordlists/dorks.txt``, one stripped line each.

        A missing file is printed and leaves ``self.dorks`` unchanged.
        """
        try:  # the relative path depends on the working directory
            with open('wordlists/dorks.txt', mode='r') as fp:
                self.dorks = [dork.strip() for dork in fp]
        except FileNotFoundError as error:
            print(error)

    def construct_dorks(self):
        """Build ``self.links``: one search URL per entry in ``self.dorks``.

        Each link is ``<database>+<word>+<encoded dork>``; only the dork text
        is encoded, ``self.word`` is inserted as-is.
        """
        prefix = self.database + '+' + self.word + '+'
        self.links = [
            prefix + str(dork).translate(self._DORK_ENCODING)
            for dork in self.dorks
        ]

    def googledork(self):
        """Send the batch of dork links selected by ``self.counter``.

        Offsets 0-99 send links 0-99 and offsets 100-199 send links 100-199.
        The previous ranges overlapped at 100, which re-sent the first batch,
        and the second batch started at 101, skipping link 100.
        """
        if not self.links:
            # Build the link list once instead of re-reading the file on
            # every call.
            self.append_dorks()
            self.construct_dorks()
        if 0 <= self.counter < 100:
            self.send_dork(start=0, end=100)
        elif 100 <= self.counter < 200:
            self.send_dork(start=100, end=200)
        # anything else: only 200 dorks to prevent google from blocking ip

    def send_dork(self, start, end):
        """Request ``self.links[start:end]``, accumulating each response.

        Slicing tolerates a list shorter than ``end``. The batch is
        best-effort: an error on one link is printed and the loop continues.
        """
        for link in self.links[start:end]:
            try:
                self._fetch(link)
            except Exception as error:
                print(error)