From 720aa06080df965cd6edb4226006e27679da3da0 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Sun, 11 Aug 2019 22:19:01 -0400
Subject: [PATCH 1/6] Implemented grequests in multiple modules to increase
 speed.

---
 requirements.txt                        |  3 +-
 theHarvester/__main__.py                | 46 ++++++++++++++++++------
 theHarvester/discovery/baidusearch.py   | 23 ++++++------
 theHarvester/discovery/bingsearch.py    | 48 ++++++++++++-------------
 theHarvester/discovery/constants.py     |  3 +-
 theHarvester/discovery/crtsh.py         | 20 ++++++-----
 theHarvester/discovery/dogpilesearch.py | 26 ++++++--------
 theHarvester/discovery/huntersearch.py  | 21 ++++++-----
 theHarvester/discovery/netcraft.py      |  9 +++--
 theHarvester/discovery/yahoosearch.py   | 27 ++++++--------
 theHarvester/lib/core.py                |  1 +
 11 files changed, 120 insertions(+), 107 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 235c5e61..19e17d36 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,5 @@ pytest==5.0.1
 PyYaml==5.1.1
 requests==2.22.0
 shodan==1.14.0
-texttable==1.6.2
\ No newline at end of file
+texttable==1.6.2
+grequests>=0.4.0
\ No newline at end of file

diff --git a/theHarvester/__main__.py b/theHarvester/__main__.py
index fb43e147..105046f7 100644
--- a/theHarvester/__main__.py
+++ b/theHarvester/__main__.py
@@ -53,7 +53,7 @@ def start():
     parser.add_argument('-c', '--dns-brute', help='perform a DNS brute force on the domain', default=False, action='store_true')
     parser.add_argument('-f', '--filename', help='save the results to an HTML and/or XML file', default='', type=str)
     parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, censys, crtsh, dnsdumpster,
-                            dogpile, duckduckgo, github-code, google,
+                            dogpile, duckduckgo, exalead, github-code, google,
                             hunter, intelx, linkedin, netcraft, securityTrails, threatcrowd, trello,
                             twitter, vhost, virustotal, yahoo, all''')

@@ -144,14 +144,16 @@ def start():
                     db.store_all(word, all_ip, 'ip', 'censys')

             elif engineitem == 'crtsh':
-                print('\033[94m[*] Searching CRT.sh. \033[0m')
-                search = crtsh.SearchCrtsh(word)
-                search.process()
-                hosts = filter(search.get_data())
-                all_hosts.extend(hosts)
-                db = stash.stash_manager()
-                db.store_all(word, all_hosts, 'host', 'CRTsh')
-
+                try:
+                    print('\033[94m[*] Searching CRT.sh. \033[0m')
+                    search = crtsh.SearchCrtsh(word)
+                    search.process()
+                    hosts = filter(search.get_data())
+                    all_hosts.extend(hosts)
+                    db = stash.stash_manager()
+                    db.store_all(word, all_hosts, 'host', 'CRTsh')
+                except Exception as e:
+                    pass
             elif engineitem == 'dnsdumpster':
                 try:
                     print('\033[94m[*] Searching DNSdumpster. \033[0m')

@@ -211,6 +213,18 @@ def start():
                 else:
                     pass

+            elif engineitem == 'exalead':
+                print('\033[94m[*] Searching Exalead \033[0m')
+                search = exaleadsearch.search_exalead(word, limit, start)
+                search.process()
+                emails = filter(search.get_emails())
+                all_emails.extend(emails)
+                hosts = filter(search.get_hostnames())
+                all_hosts.extend(hosts)
+                db = stash.stash_manager()
+                db.store_all(word, all_hosts, 'host', 'exalead')
+                db.store_all(word, all_emails, 'email', 'exalead')
+
             elif engineitem == 'google':
                 print('\033[94m[*] Searching Google. \033[0m')
                 search = googlesearch.search_google(word, limit, start)

@@ -363,7 +377,7 @@ def start():
             elif engineitem == 'yahoo':
                 print('\033[94m[*] Searching Yahoo. \033[0m')
-                search = yahoosearch.search_yahoo(word, limit)
+                search = yahoosearch.SearchYahoo(word, limit)
                 search.process()
                 hosts = search.get_hostnames()
                 emails = search.get_emails()

@@ -467,6 +481,17 @@ def start():
         db.store_all(word, all_hosts, 'email', 'duckduckgo')
         db.store_all(word, all_hosts, 'host', 'duckduckgo')

+        print('\033[94m[*] Searching Exalead \033[0m')
+        search = exaleadsearch.search_exalead(word, limit, start)
+        search.process()
+        emails = filter(search.get_emails())
+        all_emails.extend(emails)
+        hosts = filter(search.get_hostnames())
+        all_hosts.extend(hosts)
+        db = stash.stash_manager()
+        db.store_all(word, all_hosts, 'host', 'exalead')
+        db.store_all(word, all_emails, 'email', 'exalead')
+
         print('\033[94m[*] Searching Google. \033[0m')
         search = googlesearch.search_google(word, limit, start)
         search.process(google_dorking)

@@ -945,7 +970,6 @@ def entry_point():
         print('\n\n\033[93m[!] ctrl+c detected from user, quitting.\n\n \033[0m')
     except Exception:
         import traceback
-        print(traceback.print_exc())
         sys.exit(1)

diff --git a/theHarvester/discovery/baidusearch.py b/theHarvester/discovery/baidusearch.py
index a371476a..a1f5becd 100644
--- a/theHarvester/discovery/baidusearch.py
+++ b/theHarvester/discovery/baidusearch.py
@@ -1,8 +1,6 @@
-from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-import time
+import grequests


 class SearchBaidu:
@@ -13,24 +11,21 @@ def __init__(self, word, limit):
         self.server = 'www.baidu.com'
         self.hostname = 'www.baidu.com'
         self.limit = limit
-        self.counter = 0

     def do_search(self):
-        url = 'http://' + self.server + '/s?wd=%40' + self.word + '&pn=' + str(self.counter) + '&oq=' + self.word
-        url = f'https://{self.server}/s?wd=%40{self.word}&pn{self.counter}&oq={self.word}'
         headers = {
             'Host': self.hostname,
             'User-agent': Core.get_user_agent()
         }
-        h = requests.get(url=url, headers=headers)
-        time.sleep(getDelay())
-        self.total_results += h.text
+        base_url = f'https://{self.server}/s?wd=%40{self.word}&pnxx&oq={self.word}'
+        urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
+        req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
+        resp = grequests.imap(req, size=5)
+        for x in resp:
+            self.total_results += x.content.decode('UTF-8')

     def process(self):
-        while self.counter <= self.limit and self.counter <= 1000:
-            self.do_search()
-            print(f'\tSearching {self.counter} results.')
-            self.counter += 10
+        self.do_search()

     def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)
@@ -39,3 +34,5 @@ def get_emails(self):
     def get_hostnames(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames()
+
+

diff --git a/theHarvester/discovery/bingsearch.py b/theHarvester/discovery/bingsearch.py
index 1147f2ab..683da2be 100644
--- a/theHarvester/discovery/bingsearch.py
+++ b/theHarvester/discovery/bingsearch.py
@@ -1,8 +1,7 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-import time
+import grequests


 class SearchBing:
@@ -10,11 +9,10 @@ class SearchBing:
     def __init__(self, word, limit, start):
         self.word = word.replace(' ', '%20')
         self.results = ""
-        self.totalresults = ""
+        self.total_results = ""
         self.server = 'www.bing.com'
         self.apiserver = 'api.search.live.net'
         self.hostname = 'www.bing.com'
-        self.quantity = '50'
         self.limit = int(limit)
         self.bingApi = Core.bing_key()
         self.counter = start

@@ -26,9 +24,12 @@ def do_search(self):
             'Accept-Language': 'en-us,en',
             'User-agent': Core.get_user_agent()
         }
-        h = requests.get(url=('https://' + self.server + '/search?q=%40"' + self.word + '"&count=50&first=' + str(self.counter)), headers=headers)
-        self.results = h.text
-        self.totalresults += self.results
+        base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
+        urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
+        req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
+        resp = grequests.imap(req, size=5)
+        for x in resp:
+            self.total_results += x.content.decode('UTF-8')

     def do_search_api(self):
         url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?'
@@ -40,9 +41,10 @@ def do_search_api(self):
             'safesearch': 'Off'
         }
         headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi}
-        h = requests.get(url=url, headers=headers, params=params)
-        self.results = h.text
-        self.totalresults += self.results
+        h = grequests.get(url=url, headers=headers, params=params)
+        response = grequests.map([h])
+        self.results = response[0].content.decode('UTF-8')
+        self.total_results += self.results

     def do_search_vhost(self):
         headers = {
@@ -51,39 +53,35 @@ def do_search_vhost(self):
             'Accept-Language': 'en-us,en',
             'User-agent': Core.get_user_agent()
         }
-        url = 'http://' + self.server + '/search?q=ip:' + self.word + '&go=&count=50&FORM=QBHL&qs=n&first=' + str(self.counter)
-        h = requests.get(url=url, headers=headers)
-        self.results = h.text
-        self.totalresults += self.results
+        base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx'
+        urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
+        req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
+        resp = grequests.imap(req, size=5)
+        for x in resp:
+            self.total_results += x.content.decode('UTF-8')

     def get_emails(self):
-        rawres = myparser.Parser(self.totalresults, self.word)
+        rawres = myparser.Parser(self.total_results, self.word)
         return rawres.emails()

     def get_hostnames(self):
-        rawres = myparser.Parser(self.totalresults, self.word)
+        rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames()

     def get_allhostnames(self):
-        rawres = myparser.Parser(self.totalresults, self.word)
+        rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames_all()

     def process(self, api):
         if api == 'yes':
             if self.bingApi is None:
                 raise MissingKey(True)
-        while self.counter < self.limit:
+        else:
             if api == 'yes':
                 self.do_search_api()
-                time.sleep(getDelay())
             else:
                 self.do_search()
-                time.sleep(getDelay())
-            self.counter += 50
             print(f'\tSearching {self.counter} results.')

     def process_vhost(self):
-        # Maybe it is good to use other limit for this.
-        while self.counter < self.limit:
-            self.do_search_vhost()
-            self.counter += 50
+        self.do_search_vhost()

diff --git a/theHarvester/discovery/constants.py b/theHarvester/discovery/constants.py
index eeb8dcaf..005d9bca 100644
--- a/theHarvester/discovery/constants.py
+++ b/theHarvester/discovery/constants.py
@@ -10,7 +10,8 @@ def filter(lst):
     :param lst: list to be filtered
     :return: new filtered list
     """
-    lst = set(lst)  # Remove duplicates.
+    if not isinstance(lst, set):
+        lst = set(lst)  # Remove duplicates.
     new_lst = []
     for item in lst:
         item = str(item)

diff --git a/theHarvester/discovery/crtsh.py b/theHarvester/discovery/crtsh.py
index d5dc5b2d..1894a1e5 100644
--- a/theHarvester/discovery/crtsh.py
+++ b/theHarvester/discovery/crtsh.py
@@ -1,6 +1,5 @@
 from theHarvester.lib.core import *
 import requests
-import urllib3


 class SearchCrtsh:
@@ -9,13 +8,18 @@ def __init__(self, word):
         self.data = set()

     def do_search(self):
-        url = f'https://crt.sh/?q=%25.{self.word}&output=json'
-        headers = {'User-Agent': Core.get_user_agent()}
-        request = requests.get(url, params=headers, timeout=30)
-        if request.ok:
-            content = request.json()
-            data = set([dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value'] for dct in content])
-            return data
+        try:
+            data = set()
+            url = f'https://crt.sh/?q=%25.{self.word}&output=json'
+            headers = {'User-Agent': Core.get_user_agent()}
+            request = requests.get(url, headers=headers, timeout=15)
+            if request.ok:
+                content = request.json()
+                data = set([dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value'] for dct in content])
+                return data
+            return data
+        except Exception as e:
+            print(f'An exception has occurred in crtsh: {e}')

     def process(self):
         print('\tSearching results.')

diff --git a/theHarvester/discovery/dogpilesearch.py b/theHarvester/discovery/dogpilesearch.py
index b7769f14..74eaece5 100644
--- a/theHarvester/discovery/dogpilesearch.py
+++ b/theHarvester/discovery/dogpilesearch.py
@@ -1,7 +1,7 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
+import grequests
 import time


@@ -13,31 +13,24 @@ def __init__(self, word, limit):
         self.server = 'www.dogpile.com'
         self.hostname = 'www.dogpile.com'
         self.limit = limit
-        self.counter = 0

     def do_search(self):
         #import ssl
         #ssl._create_default_https_context = ssl._create_unverified_context
         # Dogpile is hardcoded to return 10 results.
-        url = 'https://' + self.server + "/search/web?qsi=" + str(self.counter) \
-            + "&q=\"%40" + self.word + "\""
-        headers = {
-            'Host': self.hostname,
-            'User-agent': Core.get_user_agent()
-        }
         try:
-            h = requests.get(url=url, headers=headers, verify=False)
-            #print(h.text)
-            self.total_results += h.text
+            headers = {'User-agent': Core.get_user_agent()}
+            base_url = f'https://{self.server}/search/web?qsi=xx&q=%40{self.word}'
+            urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
+            req = (grequests.get(u, headers=headers, verify=False, timeout=5) for u in urls)
+            resp = grequests.imap(req, size=5)
+            for x in resp:
+                self.total_results += x.content.decode('UTF-8')
         except Exception as e:
             print(f'Error Occurred: {e}')

     def process(self):
-        while self.counter <= self.limit and self.counter <= 1000:
-            self.do_search()
-            time.sleep(getDelay())
-            print(f'\tSearching {self.counter} results.')
-            self.counter += 10
+        self.do_search()

     def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)
@@ -46,3 +39,4 @@ def get_emails(self):
     def get_hostnames(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames()
+

diff --git a/theHarvester/discovery/huntersearch.py b/theHarvester/discovery/huntersearch.py
index feaee187..f0eb2253 100644
--- a/theHarvester/discovery/huntersearch.py
+++ b/theHarvester/discovery/huntersearch.py
@@ -1,30 +1,28 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
+import grequests


 class SearchHunter:

     def __init__(self, word, limit, start):
         self.word = word
-        self.limit = 100
+        self.limit = limit
         self.start = start
         self.key = Core.hunter_key()
+        #self.key = "e802ef64e560430c3612ab7e9f2d018fd9946177"
         if self.key is None:
             raise MissingKey(True)
-        self.results = ""
-        self.totalresults = ""
+        self.total_results = ""
         self.counter = start
-        self.database = "https://api.hunter.io/v2/domain-search?domain=" + word + "&api_key=" + self.key + "&limit=" + str(self.limit)
+        self.database = f'https://api.hunter.io/v2/domain-search?domain={word}&api_key={self.key}&limit={self.limit}'

     def do_search(self):
-        try:
-            r = requests.get(self.database)
-        except Exception as e:
-            print(e)
-        self.results = r.text
-        self.totalresults += self.results
+        request = grequests.get(self.database)
+        response = grequests.map([request])
+        self.total_results = response[0].content.decode('UTF-8')
+

     def process(self):
         self.do_search()  # Only need to do it once.
@@ -40,3 +38,4 @@ def get_hostnames(self):
     def get_profiles(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.profiles()
+

diff --git a/theHarvester/discovery/netcraft.py b/theHarvester/discovery/netcraft.py
index f3d001d5..a65453b8 100644
--- a/theHarvester/discovery/netcraft.py
+++ b/theHarvester/discovery/netcraft.py
@@ -12,7 +12,7 @@ def __init__(self, word):
         self.word = word.replace(' ', '%20')
         self.totalresults = ""
         self.server = 'netcraft.com'
-        self.base_url = 'https://searchdns.netcraft.com/?restriction=site+ends+with&host={domain}'
+        self.base_url = f'https://searchdns.netcraft.com/?restriction=site+ends+with&host={word}'
         self.session = requests.session()
         self.headers = {
             'User-Agent': Core.get_user_agent()
@@ -33,7 +33,7 @@ def get_next(self, resp):
         link_regx = re.compile('<a href="(.*?)"><b>Next page</b></a>')
         link = link_regx.findall(resp)
         link = re.sub(f'host=.*?{self.word}', f'host={self.domain}', link[0])
-        url = f'http://searchdns.netcraft.com{link}'
+        url = f'https://searchdns.netcraft.com{link.replace(" ", "%20")}'
         return url

     def create_cookies(self, cookie):
@@ -56,13 +56,12 @@ def do_search(self):
         start_url = self.base_url
         resp = self.request(start_url)
         cookies = self.get_cookies(resp.headers)
-        url = self.base_url.format(domain="yale.edu")
         while True:
-            resp = self.request(url, cookies).text
+            resp = self.request(self.base_url, cookies).text
             self.totalresults += resp
             if 'Next page' not in resp or resp is None:
                 break
-            url = self.get_next(resp)
+            self.base_url = self.get_next(resp)

     def get_hostnames(self):
         rawres = myparser.Parser(self.totalresults, self.word)

diff --git a/theHarvester/discovery/yahoosearch.py b/theHarvester/discovery/yahoosearch.py
index 65f510bb..a03a7a97 100644
--- a/theHarvester/discovery/yahoosearch.py
+++ b/theHarvester/discovery/yahoosearch.py
@@ -1,35 +1,30 @@
-from theHarvester.discovery.constants import *
+import grequests
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-import time


-class search_yahoo:
+class SearchYahoo:

     def __init__(self, word, limit):
         self.word = word
         self.total_results = ""
         self.server = 'search.yahoo.com'
-        self.hostname = 'search.yahoo.com'
         self.limit = limit
-        self.counter = 0

     def do_search(self):
-        url = 'http://' + self.server + '/search?p=\"%40' + self.word + '\"&b=' + str(self.counter) + '&pz=10'
+        base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10'
         headers = {
-            'Host': self.hostname,
+            'Host': self.server,
             'User-agent': Core.get_user_agent()
         }
-        h = requests.get(url=url, headers=headers)
-        self.total_results += h.text
+        urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
+        request = (grequests.get(url, headers=headers) for url in urls)
+        response = grequests.imap(request, size=5)
+        for entry in response:
+            self.total_results += entry.content.decode('UTF-8')

     def process(self):
-        while self.counter <= self.limit and self.counter <= 1000:
-            self.do_search()
-            time.sleep(getDelay())
-            print(f'\tSearching {self.counter} results.')
-            self.counter += 10
+        self.do_search()

     def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)
@@ -46,4 +41,4 @@ def get_emails(self):

     def get_hostnames(self):
         rawres = myparser.Parser(self.total_results, self.word)
-        return rawres.hostnames()
+        return rawres.hostnames()
\ No newline at end of file

diff --git a/theHarvester/lib/core.py b/theHarvester/lib/core.py
index 1038a560..1bb3eadc 100644
--- a/theHarvester/lib/core.py
+++ b/theHarvester/lib/core.py
@@ -72,6 +72,7 @@ def get_supportedengines():
 'dnsdumpster',
 'dogpile',
 'duckduckgo',
+'exalead',
 'github-code',
 'google',
 'hunter',

From f30742dbc042b18abf39504209f197a9973fd530 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Sun, 11 Aug 2019 22:26:10 -0400
Subject: [PATCH 2/6] Removed key.

---
 theHarvester/discovery/huntersearch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/theHarvester/discovery/huntersearch.py b/theHarvester/discovery/huntersearch.py
index f0eb2253..40d41a81 100644
--- a/theHarvester/discovery/huntersearch.py
+++ b/theHarvester/discovery/huntersearch.py
@@ -11,7 +11,6 @@ def __init__(self, word, limit, start):
         self.limit = limit
         self.start = start
         self.key = Core.hunter_key()
-        #self.key = "e802ef64e560430c3612ab7e9f2d018fd9946177"
         if self.key is None:
             raise MissingKey(True)
         self.total_results = ""

From 0c3dac58a41d55ac9242c94efa9f0e6e23af5af9 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Thu, 15 Aug 2019 23:15:59 -0400
Subject: [PATCH 3/6] Updated parser to use str.replace instead of re.sub,
 added a small check to constants.py, and wrapped exalead in try/except.

---
 theHarvester/__main__.py            | 22 ++++++++++----------
 theHarvester/discovery/constants.py |  2 ++
 theHarvester/parsers/myparser.py    | 19 ++++---------------
 3 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/theHarvester/__main__.py b/theHarvester/__main__.py
index df041033..162ff856 100644
--- a/theHarvester/__main__.py
+++ b/theHarvester/__main__.py
@@ -482,16 +482,18 @@ def start():
         db.store_all(word, all_hosts, 'host', 'duckduckgo')

         print('\033[94m[*] Searching Exalead \033[0m')
-        search = exaleadsearch.search_exalead(word, limit, start)
-        search.process()
-        emails = filter(search.get_emails())
-        all_emails.extend(emails)
-        hosts = filter(search.get_hostnames())
-        all_hosts.extend(hosts)
-        db = stash.stash_manager()
-        db.store_all(word, all_hosts, 'host', 'exalead')
-        db.store_all(word, all_emails, 'email', 'exalead')
-
+        try:
+            search = exaleadsearch.search_exalead(word, limit, start)
+            search.process()
+            emails = filter(search.get_emails())
+            all_emails.extend(emails)
+            hosts = filter(search.get_hostnames())
+            all_hosts.extend(hosts)
+            db = stash.stash_manager()
+            db.store_all(word, all_hosts, 'host', 'exalead')
+            db.store_all(word, all_emails, 'email', 'exalead')
+        except Exception:
+            pass
         print('\033[94m[*] Searching Google. \033[0m')
         search = googlesearch.search_google(word, limit, start)
         search.process(google_dorking)

diff --git a/theHarvester/discovery/constants.py b/theHarvester/discovery/constants.py
index 005d9bca..7549c4c6 100644
--- a/theHarvester/discovery/constants.py
+++ b/theHarvester/discovery/constants.py
@@ -10,6 +10,8 @@ def filter(lst):
     :param lst: list to be filtered
     :return: new filtered list
     """
+    if lst is None:
+        return []
     if not isinstance(lst, set):
         lst = set(lst)  # Remove duplicates.
     new_lst = []

diff --git a/theHarvester/parsers/myparser.py b/theHarvester/parsers/myparser.py
index a63b0612..0f212730 100644
--- a/theHarvester/parsers/myparser.py
+++ b/theHarvester/parsers/myparser.py
@@ -9,26 +9,15 @@ def __init__(self, results, word):
         self.temp = []

     def genericClean(self):
-        self.results = re.sub('<em>', '', self.results)
-        self.results = re.sub('<b>', '', self.results)
-        self.results = re.sub('</b>', '', self.results)
-        self.results = re.sub('</em>', '', self.results)
-        self.results = re.sub('%2f', ' ', self.results)
-        self.results = re.sub('%3a', ' ', self.results)
-        self.results = re.sub('<strong>', '', self.results)
-        self.results = re.sub('</strong>', '', self.results)
-        self.results = re.sub('<wbr>', '', self.results)
-        self.results = re.sub('</wbr>', '', self.results)
+        self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>', '')\
+            .replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\
+            .replace('<wbr>', '').replace('</wbr>', '')

         for e in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
             self.results = self.results.replace(e, ' ')

     def urlClean(self):
-        self.results = re.sub('<em>', '', self.results)
-        self.results = re.sub('</em>', '', self.results)
-        self.results = re.sub('%2f', ' ', self.results)
-        self.results = re.sub('%3a', ' ', self.results)
-
+        self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
         for e in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
             self.results = self.results.replace(e, ' ')

From a758757229811fa5c5d2322c059225e021a6ae92 Mon Sep 17 00:00:00 2001
From: Matt <36310667+NotoriousRebel@users.noreply.github.com>
Date: Sat, 17 Aug 2019 18:06:14 -0400
Subject: [PATCH 4/6] Update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 37c1891a..241cbe1d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,4 @@ PyYaml==5.1.2
 requests==2.22.0
 shodan==1.14.0
 texttable==1.6.2
-grequests>=0.4.0
\ No newline at end of file
+grequests==0.4.0

From 3478aa3b15d942d48a5f9dc50201899bcc511fe4 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Sat, 17 Aug 2019 23:18:36 -0400
Subject: [PATCH 5/6] Implemented grequests in exaleadsearch and fixed module.
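
The core change here is the same one the rest of this series applies: serial
requests.get calls are swapped for batched grequests calls. As a rough sketch
of the pattern (the function name and the example.com URL below are
illustrative placeholders, not code from this patch), assuming grequests 0.4.x:

    import grequests

    def fetch_pages(base_url, limit, step=50, size=3):
        # One URL per results page; 'xx' marks where the page offset goes.
        urls = [base_url.replace('xx', str(num)) for num in range(0, limit, step)]
        # grequests.get() only builds AsyncRequest objects; nothing is sent
        # until imap() drains them, at most `size` connections at a time.
        reqs = (grequests.get(url, timeout=5) for url in urls)
        combined = ''
        for response in grequests.imap(reqs, size=size):
            combined += response.content.decode('UTF-8')
        return combined

    # e.g. fetch_pages('https://example.com/search?q=term&start_index=xx', 250)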
---
 theHarvester/discovery/exaleadsearch.py | 40 ++++++++++++++-----------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/theHarvester/discovery/exaleadsearch.py b/theHarvester/discovery/exaleadsearch.py
index 5cb1c1d7..6dc3e30e 100644
--- a/theHarvester/discovery/exaleadsearch.py
+++ b/theHarvester/discovery/exaleadsearch.py
@@ -2,9 +2,9 @@
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 import re
-import requests
 import time
-
+import grequests
+import requests

 class search_exalead:

@@ -12,27 +12,33 @@ def __init__(self, word, limit, start):
         self.word = word
         self.files = 'pdf'
         self.results = ""
-        self.totalresults = ""
+        self.total_results = ""
         self.server = 'www.exalead.com'
         self.hostname = 'www.exalead.com'
         self.limit = limit
         self.counter = start

     def do_search(self):
-        url = 'http:// ' + self.server + '/search/web/results/?q=%40' + self.word \
-            + '&elements_per_page=50&start_index=' + str(self.counter)
+        base_url = f'https://{self.server}/search/web/results/?q=%40{self.word}&elements_per_page=50&start_index=xx'
         headers = {
             'Host': self.hostname,
             'Referer': ('http://' + self.hostname + '/search/web/results/?q=%40' + self.word),
             'User-agent': Core.get_user_agent()
         }
-        h = requests.get(url=url, headers=headers)
-        self.results = h.text
-        self.totalresults += self.results
+        urls = [base_url.replace("xx", str(num)) for num in range(self.counter, self.limit, 50) if num <= self.limit]
+        req = []
+        for u in urls:
+            req.append(grequests.get(u, headers=headers, timeout=5))
+            time.sleep(3)
+        resp = grequests.imap(tuple(req), size=3)
+        for x in resp:
+            # TODO if decoded content contains information about solving captcha print message to user to visit website
+            # TODO to solve it or use a vpn as it appears to be ip based
+            self.total_results += x.content.decode('UTF-8')

     def do_search_files(self, files):
-        url = 'http:// ' + self.server + '/search/web/results/?q=%40' + self.word \
-            + 'filetype:' + self.files + '&elements_per_page=50&start_index=' + str(self.counter)
+        url = f'https://{self.server}/search/web/results/?q=%40{self.word}filetype:{self.files}&elements_per_page' \
+              f'=50&start_index={self.counter} '
         headers = {
             'Host': self.hostname,
             'Referer': ('http://' + self.hostname + '/search/web/results/?q=%40' + self.word),
@@ -40,7 +46,7 @@ def do_search_files(self, files):
         }
         h = requests.get(url=url, headers=headers)
         self.results = h.text
-        self.totalresults += self.results
+        self.total_results += self.results

     def check_next(self):
         renext = re.compile('topNextUrl')
@@ -53,22 +59,20 @@ def check_next(self):
         return nexty

     def get_emails(self):
-        rawres = myparser.Parser(self.totalresults, self.word)
+        rawres = myparser.Parser(self.total_results, self.word)
         return rawres.emails()

     def get_hostnames(self):
-        rawres = myparser.Parser(self.totalresults, self.word)
+        rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames()

     def get_files(self):
-        rawres = myparser.Parser(self.totalresults, self.word)
+        rawres = myparser.Parser(self.total_results, self.word)
         return rawres.fileurls(self.files)

     def process(self):
-        while self.counter <= self.limit:
-            self.do_search()
-            self.counter += 50
-            print(f'\tSearching {self.counter} results.')
+        print('Searching 0 results')
+        self.do_search()

     def process_files(self, files):
         while self.counter < self.limit:

From fdfb137eb2d066aa0b355aa4c62859c5b9777c58 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Sun, 18 Aug 2019 21:03:41 -0400
Subject: [PATCH 6/6] Changed variable names to be more helpful
 and added local import for exalead.

---
 theHarvester/__main__.py                |  2 ++
 theHarvester/discovery/baidusearch.py   |  8 ++++----
 theHarvester/discovery/bingsearch.py    | 20 ++++++++++----------
 theHarvester/discovery/dogpilesearch.py |  8 ++++----
 theHarvester/discovery/exaleadsearch.py | 12 ++++++------
 5 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/theHarvester/__main__.py b/theHarvester/__main__.py
index 53a49668..318bf2e4 100644
--- a/theHarvester/__main__.py
+++ b/theHarvester/__main__.py
@@ -224,6 +224,7 @@ def start():
             elif engineitem == 'exalead':
                 print('\033[94m[*] Searching Exalead \033[0m')
+                from theHarvester.discovery import exaleadsearch
                 search = exaleadsearch.search_exalead(word, limit, start)
                 search.process()
                 emails = filter(search.get_emails())
@@ -506,6 +507,7 @@ def start():
         print('\033[94m[*] Searching Exalead \033[0m')
         try:
+            from theHarvester.discovery import exaleadsearch
             search = exaleadsearch.search_exalead(word, limit, start)
             search.process()
             emails = filter(search.get_emails())

diff --git a/theHarvester/discovery/baidusearch.py b/theHarvester/discovery/baidusearch.py
index a1f5becd..b23e01be 100644
--- a/theHarvester/discovery/baidusearch.py
+++ b/theHarvester/discovery/baidusearch.py
@@ -19,10 +19,10 @@ def do_search(self):
         }
         base_url = f'https://{self.server}/s?wd=%40{self.word}&pnxx&oq={self.word}'
         urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-        req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
-        resp = grequests.imap(req, size=5)
-        for x in resp:
-            self.total_results += x.content.decode('UTF-8')
+        req = (grequests.get(url, headers=headers, timeout=5) for url in urls)
+        responses = grequests.imap(req, size=5)
+        for response in responses:
+            self.total_results += response.content.decode('UTF-8')

     def process(self):
         self.do_search()

diff --git a/theHarvester/discovery/bingsearch.py b/theHarvester/discovery/bingsearch.py
index 683da2be..490f5c06 100644
--- a/theHarvester/discovery/bingsearch.py
+++ b/theHarvester/discovery/bingsearch.py
@@ -26,10 +26,10 @@ def do_search(self):
         }
         base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
         urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
-        req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
-        resp = grequests.imap(req, size=5)
-        for x in resp:
-            self.total_results += x.content.decode('UTF-8')
+        req = (grequests.get(url, headers=headers, timeout=5) for url in urls)
+        responses = grequests.imap(req, size=5)
+        for response in responses:
+            self.total_results += response.content.decode('UTF-8')

     def do_search_api(self):
         url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?'
@@ -41,8 +41,8 @@ def do_search_api(self):
             'safesearch': 'Off'
         }
         headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi}
-        h = grequests.get(url=url, headers=headers, params=params)
-        response = grequests.map([h])
+        grequests_resp = grequests.get(url=url, headers=headers, params=params)
+        response = grequests.map([grequests_resp])
         self.results = response[0].content.decode('UTF-8')
         self.total_results += self.results

@@ -55,10 +55,10 @@ def do_search_vhost(self):
         }
         base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx'
         urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
-        req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
-        resp = grequests.imap(req, size=5)
-        for x in resp:
-            self.total_results += x.content.decode('UTF-8')
+        req = (grequests.get(url, headers=headers, timeout=5) for url in urls)
+        responses = grequests.imap(req, size=5)
+        for response in responses:
+            self.total_results += response.content.decode('UTF-8')

     def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)

diff --git a/theHarvester/discovery/dogpilesearch.py b/theHarvester/discovery/dogpilesearch.py
index 74eaece5..5ced83bf 100644
--- a/theHarvester/discovery/dogpilesearch.py
+++ b/theHarvester/discovery/dogpilesearch.py
@@ -22,10 +22,10 @@ def do_search(self):
             headers = {'User-agent': Core.get_user_agent()}
             base_url = f'https://{self.server}/search/web?qsi=xx&q=%40{self.word}'
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            req = (grequests.get(u, headers=headers, verify=False, timeout=5) for u in urls)
-            resp = grequests.imap(req, size=5)
-            for x in resp:
-                self.total_results += x.content.decode('UTF-8')
+            req = (grequests.get(url, headers=headers, verify=False, timeout=5) for url in urls)
+            responses = grequests.imap(req, size=5)
+            for response in responses:
+                self.total_results += response.content.decode('UTF-8')
         except Exception as e:
             print(f'Error Occurred: {e}')

diff --git a/theHarvester/discovery/exaleadsearch.py b/theHarvester/discovery/exaleadsearch.py
index 6dc3e30e..5a8617e6 100644
--- a/theHarvester/discovery/exaleadsearch.py
+++ b/theHarvester/discovery/exaleadsearch.py
@@ -27,14 +27,14 @@ def do_search(self):
         }
         urls = [base_url.replace("xx", str(num)) for num in range(self.counter, self.limit, 50) if num <= self.limit]
         req = []
-        for u in urls:
-            req.append(grequests.get(u, headers=headers, timeout=5))
+        for url in urls:
+            req.append(grequests.get(url, headers=headers, timeout=5))
             time.sleep(3)
-        resp = grequests.imap(tuple(req), size=3)
-        for x in resp:
+        responses = grequests.imap(tuple(req), size=3)
+        for response in responses:
             # TODO if decoded content contains information about solving captcha print message to user to visit website
             # TODO to solve it or use a vpn as it appears to be ip based
-            self.total_results += x.content.decode('UTF-8')
+            self.total_results += response.content.decode('UTF-8')

     def process(self):
-        print('Searching 0 results')
+        print('Searching results')
         self.do_search()

     def process_files(self, files):
         while self.counter < self.limit:
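
A note on the one-shot lookups in this series (hunter.io and the Bing API):
those go through grequests.map with a single request instead of the paged
imap pattern. A minimal sketch of that variant, again with a placeholder URL
(api.example.com) rather than anything from these patches:

    import grequests

    def fetch_once(url, headers=None, params=None):
        request = grequests.get(url, headers=headers, params=params, timeout=5)
        # map() sends the batch and returns a list of responses; a request
        # that fails comes back as None instead of raising, so guard for it.
        response = grequests.map([request])[0]
        return '' if response is None else response.content.decode('UTF-8')

    # e.g. fetch_once('https://api.example.com/v2/domain-search', params={'domain': 'example.com'})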