mirror of
https://github.com/laramies/theHarvester.git
synced 2024-09-20 15:26:31 +08:00
Merge pull request #8 from NotoriousRebel/master
Introduction of grequests
This commit is contained in:
commit
eb5af2bd34
|
@ -7,5 +7,5 @@ plotly==4.1.0
|
|||
pytest==5.1.0
|
||||
PyYaml==5.1.2
|
||||
requests==2.22.0
|
||||
shodan==1.15.0
|
||||
shodan==1.14.0
|
||||
texttable==1.6.2
|
|
@ -53,7 +53,7 @@ def start():
|
|||
parser.add_argument('-c', '--dns-brute', help='perform a DNS brute force on the domain', default=False, action='store_true')
|
||||
parser.add_argument('-f', '--filename', help='save the results to an HTML and/or XML file', default='', type=str)
|
||||
parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, censys, crtsh, dnsdumpster,
|
||||
dogpile, duckduckgo, github-code, google,
|
||||
dogpile, duckduckgo, exalead, github-code, google,
|
||||
hunter, intelx,
|
||||
linkedin, netcraft, securityTrails, threatcrowd,
|
||||
trello, twitter, vhost, virustotal, yahoo, all''')
|
||||
|
@ -158,6 +158,7 @@ def start():
|
|||
all_hosts.extend(hosts)
|
||||
db = stash.stash_manager()
|
||||
db.store_all(word, all_hosts, 'host', 'CRTsh')
|
||||
|
||||
except Exception:
|
||||
print(f'\033[93m[!] An timeout occurred with crtsh, cannot find {args.domain}\033[0m')
|
||||
|
||||
|
@ -221,6 +222,19 @@ def start():
|
|||
else:
|
||||
pass
|
||||
|
||||
elif engineitem == 'exalead':
|
||||
print('\033[94m[*] Searching Exalead \033[0m')
|
||||
from theHarvester.discovery import exaleadsearch
|
||||
search = exaleadsearch.search_exalead(word, limit, start)
|
||||
search.process()
|
||||
emails = filter(search.get_emails())
|
||||
all_emails.extend(emails)
|
||||
hosts = filter(search.get_hostnames())
|
||||
all_hosts.extend(hosts)
|
||||
db = stash.stash_manager()
|
||||
db.store_all(word, all_hosts, 'host', 'exalead')
|
||||
db.store_all(word, all_emails, 'email', 'exalead')
|
||||
|
||||
elif engineitem == 'google':
|
||||
print('\033[94m[*] Searching Google. \033[0m')
|
||||
from theHarvester.discovery import googlesearch
|
||||
|
@ -491,6 +505,20 @@ def start():
|
|||
db.store_all(word, all_hosts, 'email', 'duckduckgo')
|
||||
db.store_all(word, all_hosts, 'host', 'duckduckgo')
|
||||
|
||||
print('\033[94m[*] Searching Exalead \033[0m')
|
||||
try:
|
||||
from theHarvester.discovery import exaleadsearch
|
||||
search = exaleadsearch.search_exalead(word, limit, start)
|
||||
search.process()
|
||||
emails = filter(search.get_emails())
|
||||
all_emails.extend(emails)
|
||||
hosts = filter(search.get_hostnames())
|
||||
all_hosts.extend(hosts)
|
||||
db = stash.stash_manager()
|
||||
db.store_all(word, all_hosts, 'host', 'exalead')
|
||||
db.store_all(word, all_emails, 'email', 'exalead')
|
||||
except Exception:
|
||||
pass
|
||||
print('\033[94m[*] Searching Google. \033[0m')
|
||||
from theHarvester.discovery import googlesearch
|
||||
search = googlesearch.search_google(word, limit, start)
|
||||
|
@ -977,7 +1005,6 @@ def entry_point():
|
|||
print('\n\n\033[93m[!] ctrl+c detected from user, quitting.\n\n \033[0m')
|
||||
except Exception:
|
||||
import traceback
|
||||
|
||||
print(traceback.print_exc())
|
||||
sys.exit(1)
|
||||
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
from theHarvester.discovery.constants import *
|
||||
from theHarvester.lib.core import *
|
||||
from theHarvester.parsers import myparser
|
||||
import requests
|
||||
import time
|
||||
import grequests
|
||||
|
||||
|
||||
class SearchBaidu:
|
||||
|
@ -13,24 +11,21 @@ def __init__(self, word, limit):
|
|||
self.server = 'www.baidu.com'
|
||||
self.hostname = 'www.baidu.com'
|
||||
self.limit = limit
|
||||
self.counter = 0
|
||||
|
||||
def do_search(self):
|
||||
url = 'http://' + self.server + '/s?wd=%40' + self.word + '&pn=' + str(self.counter) + '&oq=' + self.word
|
||||
url = f'https://{self.server}/s?wd=%40{self.word}&pn{self.counter}&oq={self.word}'
|
||||
headers = {
|
||||
'Host': self.hostname,
|
||||
'User-agent': Core.get_user_agent()
|
||||
}
|
||||
h = requests.get(url=url, headers=headers)
|
||||
time.sleep(getDelay())
|
||||
self.total_results += h.text
|
||||
base_url = f'https://{self.server}/s?wd=%40{self.word}&pnxx&oq={self.word}'
|
||||
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
|
||||
req = (grequests.get(url, headers=headers, timeout=5) for url in urls)
|
||||
responses = grequests.imap(req, size=5)
|
||||
for response in responses:
|
||||
self.total_results += response.content.decode('UTF-8')
|
||||
|
||||
def process(self):
|
||||
while self.counter <= self.limit and self.counter <= 1000:
|
||||
self.do_search()
|
||||
print(f'\tSearching {self.counter} results.')
|
||||
self.counter += 10
|
||||
|
||||
def get_emails(self):
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
|
@ -39,3 +34,5 @@ def get_emails(self):
|
|||
def get_hostnames(self):
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.hostnames()
|
||||
|
||||
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
from theHarvester.discovery.constants import *
|
||||
from theHarvester.lib.core import *
|
||||
from theHarvester.parsers import myparser
|
||||
import requests
|
||||
import time
|
||||
import grequests
|
||||
|
||||
|
||||
class SearchBing:
|
||||
|
@ -10,11 +9,10 @@ class SearchBing:
|
|||
def __init__(self, word, limit, start):
|
||||
self.word = word.replace(' ', '%20')
|
||||
self.results = ""
|
||||
self.totalresults = ""
|
||||
self.total_results = ""
|
||||
self.server = 'www.bing.com'
|
||||
self.apiserver = 'api.search.live.net'
|
||||
self.hostname = 'www.bing.com'
|
||||
self.quantity = '50'
|
||||
self.limit = int(limit)
|
||||
self.bingApi = Core.bing_key()
|
||||
self.counter = start
|
||||
|
@ -26,9 +24,12 @@ def do_search(self):
|
|||
'Accept-Language': 'en-us,en',
|
||||
'User-agent': Core.get_user_agent()
|
||||
}
|
||||
h = requests.get(url=('https://' + self.server + '/search?q=%40"' + self.word + '"&count=50&first=' + str(self.counter)), headers=headers)
|
||||
self.results = h.text
|
||||
self.totalresults += self.results
|
||||
base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
|
||||
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
|
||||
req = (grequests.get(url, headers=headers, timeout=5) for url in urls)
|
||||
responses = grequests.imap(req, size=5)
|
||||
for response in responses:
|
||||
self.total_results += response.content.decode('UTF-8')
|
||||
|
||||
def do_search_api(self):
|
||||
url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?'
|
||||
|
@ -40,9 +41,10 @@ def do_search_api(self):
|
|||
'safesearch': 'Off'
|
||||
}
|
||||
headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi}
|
||||
h = requests.get(url=url, headers=headers, params=params)
|
||||
self.results = h.text
|
||||
self.totalresults += self.results
|
||||
grequests_resp = grequests.get(url=url, headers=headers, params=params)
|
||||
response = grequests.map([grequests_resp])
|
||||
self.results = response[0].content.decode('UTF-8')
|
||||
self.total_results += self.results
|
||||
|
||||
def do_search_vhost(self):
|
||||
headers = {
|
||||
|
@ -51,39 +53,35 @@ def do_search_vhost(self):
|
|||
'Accept-Language': 'en-us,en',
|
||||
'User-agent': Core.get_user_agent()
|
||||
}
|
||||
url = 'http://' + self.server + '/search?q=ip:' + self.word + '&go=&count=50&FORM=QBHL&qs=n&first=' + str(self.counter)
|
||||
h = requests.get(url=url, headers=headers)
|
||||
self.results = h.text
|
||||
self.totalresults += self.results
|
||||
base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx'
|
||||
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
|
||||
req = (grequests.get(url, headers=headers, timeout=5) for url in urls)
|
||||
responses = grequests.imap(req, size=5)
|
||||
for response in responses:
|
||||
self.total_results += response.content.decode('UTF-8')
|
||||
|
||||
def get_emails(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.emails()
|
||||
|
||||
def get_hostnames(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.hostnames()
|
||||
|
||||
def get_allhostnames(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.hostnames_all()
|
||||
|
||||
def process(self, api):
|
||||
if api == 'yes':
|
||||
if self.bingApi is None:
|
||||
raise MissingKey(True)
|
||||
while self.counter < self.limit:
|
||||
else:
|
||||
if api == 'yes':
|
||||
self.do_search_api()
|
||||
time.sleep(getDelay())
|
||||
else:
|
||||
self.do_search()
|
||||
time.sleep(getDelay())
|
||||
self.counter += 50
|
||||
print(f'\tSearching {self.counter} results.')
|
||||
|
||||
def process_vhost(self):
|
||||
# Maybe it is good to use other limit for this.
|
||||
while self.counter < self.limit:
|
||||
self.do_search_vhost()
|
||||
self.counter += 50
|
||||
|
|
|
@ -10,6 +10,9 @@ def filter(lst):
|
|||
:param lst: list to be filtered
|
||||
:return: new filtered list
|
||||
"""
|
||||
if lst is None:
|
||||
return []
|
||||
if not isinstance(lst, set):
|
||||
lst = set(lst) # Remove duplicates.
|
||||
new_lst = []
|
||||
for item in lst:
|
||||
|
|
|
@ -16,8 +16,7 @@ def do_search(self):
|
|||
request = requests.get(url, headers=headers, timeout=15)
|
||||
if request.ok:
|
||||
content = request.json()
|
||||
data = set(
|
||||
[dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value'] for dct in content])
|
||||
data = set([dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value'] for dct in content])
|
||||
return data
|
||||
return data
|
||||
except Exception:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from theHarvester.discovery.constants import *
|
||||
from theHarvester.lib.core import *
|
||||
from theHarvester.parsers import myparser
|
||||
import requests
|
||||
import grequests
|
||||
import time
|
||||
|
||||
|
||||
|
@ -13,31 +13,24 @@ def __init__(self, word, limit):
|
|||
self.server = 'www.dogpile.com'
|
||||
self.hostname = 'www.dogpile.com'
|
||||
self.limit = limit
|
||||
self.counter = 0
|
||||
|
||||
def do_search(self):
|
||||
#import ssl
|
||||
#ssl._create_default_https_context = ssl._create_unverified_context
|
||||
# Dogpile is hardcoded to return 10 results.
|
||||
url = 'https://' + self.server + "/search/web?qsi=" + str(self.counter) \
|
||||
+ "&q=\"%40" + self.word + "\""
|
||||
headers = {
|
||||
'Host': self.hostname,
|
||||
'User-agent': Core.get_user_agent()
|
||||
}
|
||||
try:
|
||||
h = requests.get(url=url, headers=headers, verify=False)
|
||||
#print(h.text)
|
||||
self.total_results += h.text
|
||||
headers = {'User-agent': Core.get_user_agent()}
|
||||
base_url = f'https://{self.server}/search/web?qsi=xx&q=%40{self.word}'
|
||||
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
|
||||
req = (grequests.get(url, headers=headers, verify=False, timeout=5) for url in urls)
|
||||
responses = grequests.imap(req, size=5)
|
||||
for response in responses:
|
||||
self.total_results += response.content.decode('UTF-8')
|
||||
except Exception as e:
|
||||
print(f'Error Occurred: {e}')
|
||||
|
||||
def process(self):
|
||||
while self.counter <= self.limit and self.counter <= 1000:
|
||||
self.do_search()
|
||||
time.sleep(getDelay())
|
||||
print(f'\tSearching {self.counter} results.')
|
||||
self.counter += 10
|
||||
|
||||
def get_emails(self):
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
|
@ -46,3 +39,4 @@ def get_emails(self):
|
|||
def get_hostnames(self):
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.hostnames()
|
||||
|
||||
|
|
|
@ -2,9 +2,9 @@
|
|||
from theHarvester.lib.core import *
|
||||
from theHarvester.parsers import myparser
|
||||
import re
|
||||
import requests
|
||||
import time
|
||||
|
||||
import grequests
|
||||
import requests
|
||||
|
||||
class search_exalead:
|
||||
|
||||
|
@ -12,27 +12,33 @@ def __init__(self, word, limit, start):
|
|||
self.word = word
|
||||
self.files = 'pdf'
|
||||
self.results = ""
|
||||
self.totalresults = ""
|
||||
self.total_results = ""
|
||||
self.server = 'www.exalead.com'
|
||||
self.hostname = 'www.exalead.com'
|
||||
self.limit = limit
|
||||
self.counter = start
|
||||
|
||||
def do_search(self):
|
||||
url = 'http:// ' + self.server + '/search/web/results/?q=%40' + self.word \
|
||||
+ '&elements_per_page=50&start_index=' + str(self.counter)
|
||||
base_url = f'https://{self.server}/search/web/results/?q=%40{self.word}&elements_per_page=50&start_index=xx'
|
||||
headers = {
|
||||
'Host': self.hostname,
|
||||
'Referer': ('http://' + self.hostname + '/search/web/results/?q=%40' + self.word),
|
||||
'User-agent': Core.get_user_agent()
|
||||
}
|
||||
h = requests.get(url=url, headers=headers)
|
||||
self.results = h.text
|
||||
self.totalresults += self.results
|
||||
urls = [base_url.replace("xx", str(num)) for num in range(self.counter, self.limit, 50) if num <= self.limit]
|
||||
req = []
|
||||
for url in urls:
|
||||
req.append(grequests.get(url, headers=headers, timeout=5))
|
||||
time.sleep(3)
|
||||
responses = grequests.imap(tuple(req), size=3)
|
||||
for response in responses:
|
||||
# TODO if decoded content contains information about solving captcha print message to user to visit website
|
||||
# TODO to solve it or use a vpn as it appears to be ip based
|
||||
self.total_results += response.content.decode('UTF-8')
|
||||
|
||||
def do_search_files(self, files):
|
||||
url = 'http:// ' + self.server + '/search/web/results/?q=%40' + self.word \
|
||||
+ 'filetype:' + self.files + '&elements_per_page=50&start_index=' + str(self.counter)
|
||||
url = f'https://{self.server}/search/web/results/?q=%40{self.word}filetype:{self.files}&elements_per_page' \
|
||||
f'=50&start_index={self.counter} '
|
||||
headers = {
|
||||
'Host': self.hostname,
|
||||
'Referer': ('http://' + self.hostname + '/search/web/results/?q=%40' + self.word),
|
||||
|
@ -40,7 +46,7 @@ def do_search_files(self, files):
|
|||
}
|
||||
h = requests.get(url=url, headers=headers)
|
||||
self.results = h.text
|
||||
self.totalresults += self.results
|
||||
self.total_results += self.results
|
||||
|
||||
def check_next(self):
|
||||
renext = re.compile('topNextUrl')
|
||||
|
@ -53,22 +59,20 @@ def check_next(self):
|
|||
return nexty
|
||||
|
||||
def get_emails(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.emails()
|
||||
|
||||
def get_hostnames(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.hostnames()
|
||||
|
||||
def get_files(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
rawres = myparser.Parser(self.total_results, self.word)
|
||||
return rawres.fileurls(self.files)
|
||||
|
||||
def process(self):
|
||||
while self.counter <= self.limit:
|
||||
print('Searching results')
|
||||
self.do_search()
|
||||
self.counter += 50
|
||||
print(f'\tSearching {self.counter} results.')
|
||||
|
||||
def process_files(self, files):
|
||||
while self.counter < self.limit:
|
||||
|
|
|
@ -1,30 +1,27 @@
|
|||
from theHarvester.discovery.constants import *
|
||||
from theHarvester.lib.core import *
|
||||
from theHarvester.parsers import myparser
|
||||
import requests
|
||||
import grequests
|
||||
|
||||
|
||||
class SearchHunter:
|
||||
|
||||
def __init__(self, word, limit, start):
|
||||
self.word = word
|
||||
self.limit = 100
|
||||
self.limit = limit
|
||||
self.start = start
|
||||
self.key = Core.hunter_key()
|
||||
if self.key is None:
|
||||
raise MissingKey(True)
|
||||
self.results = ""
|
||||
self.totalresults = ""
|
||||
self.total_results = ""
|
||||
self.counter = start
|
||||
self.database = "https://api.hunter.io/v2/domain-search?domain=" + word + "&api_key=" + self.key + "&limit=" + str(self.limit)
|
||||
self.database = f'https://api.hunter.io/v2/domain-search?domain={word}&api_key={self.key}&limit={self.limit}'
|
||||
|
||||
def do_search(self):
|
||||
try:
|
||||
r = requests.get(self.database)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
self.results = r.text
|
||||
self.totalresults += self.results
|
||||
request = grequests.get(self.database)
|
||||
response = grequests.map([request])
|
||||
self.total_results = response[0].content.decode('UTF-8')
|
||||
|
||||
|
||||
def process(self):
|
||||
self.do_search() # Only need to do it once.
|
||||
|
@ -40,3 +37,4 @@ def get_hostnames(self):
|
|||
def get_profiles(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
return rawres.profiles()
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ def __init__(self, word):
|
|||
self.word = word.replace(' ', '%20')
|
||||
self.totalresults = ""
|
||||
self.server = 'netcraft.com'
|
||||
self.base_url = 'https://searchdns.netcraft.com/?restriction=site+ends+with&host={domain}'
|
||||
self.base_url = f'https://searchdns.netcraft.com/?restriction=site+ends+with&host={word}'
|
||||
self.session = requests.session()
|
||||
self.headers = {
|
||||
'User-Agent': Core.get_user_agent()
|
||||
|
@ -34,7 +34,7 @@ def get_next(self, resp):
|
|||
link_regx = re.compile('<A href="(.*?)"><b>Next page</b></a>')
|
||||
link = link_regx.findall(resp)
|
||||
link = re.sub(f'host=.*?{self.word}', f'host={self.domain}', link[0])
|
||||
url = f'http://searchdns.netcraft.com{link}'
|
||||
url = f'https://searchdns.netcraft.com{link.replace(" ", "%20")}'
|
||||
return url
|
||||
|
||||
def create_cookies(self, cookie):
|
||||
|
@ -57,13 +57,12 @@ def do_search(self):
|
|||
start_url = self.base_url
|
||||
resp = self.request(start_url)
|
||||
cookies = self.get_cookies(resp.headers)
|
||||
url = self.base_url.format(domain="yale.edu")
|
||||
while True:
|
||||
resp = self.request(url, cookies).text
|
||||
resp = self.request(self.base_url, cookies).text
|
||||
self.totalresults += resp
|
||||
if 'Next page' not in resp or resp is None:
|
||||
break
|
||||
url = self.get_next(resp)
|
||||
self.base_url = self.get_next(resp)
|
||||
|
||||
def get_hostnames(self):
|
||||
rawres = myparser.Parser(self.totalresults, self.word)
|
||||
|
|
|
@ -72,6 +72,7 @@ def get_supportedengines():
|
|||
'dnsdumpster',
|
||||
'dogpile',
|
||||
'duckduckgo',
|
||||
'exalead',
|
||||
'github-code',
|
||||
'google',
|
||||
'hunter',
|
||||
|
|
|
@ -9,26 +9,15 @@ def __init__(self, results, word):
|
|||
self.temp = []
|
||||
|
||||
def genericClean(self):
|
||||
self.results = re.sub('<em>', '', self.results)
|
||||
self.results = re.sub('<b>', '', self.results)
|
||||
self.results = re.sub('</b>', '', self.results)
|
||||
self.results = re.sub('</em>', '', self.results)
|
||||
self.results = re.sub('%2f', ' ', self.results)
|
||||
self.results = re.sub('%3a', ' ', self.results)
|
||||
self.results = re.sub('<strong>', '', self.results)
|
||||
self.results = re.sub('</strong>', '', self.results)
|
||||
self.results = re.sub('<wbr>', '', self.results)
|
||||
self.results = re.sub('</wbr>', '', self.results)
|
||||
self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>',
|
||||
'').replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>','')\
|
||||
.replace('<wbr>','').replace('</wbr>','')
|
||||
|
||||
for e in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
|
||||
self.results = self.results.replace(e, ' ')
|
||||
|
||||
def urlClean(self):
|
||||
self.results = re.sub('<em>', '', self.results)
|
||||
self.results = re.sub('</em>', '', self.results)
|
||||
self.results = re.sub('%2f', ' ', self.results)
|
||||
self.results = re.sub('%3a', ' ', self.results)
|
||||
|
||||
self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
|
||||
for e in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
|
||||
self.results = self.results.replace(e, ' ')
|
||||
|
||||
|
|
Loading…
Reference in a new issue