From 90ed486184e6f6777ce72496fbc8abbad86cee30 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Wed, 25 Dec 2019 17:54:32 -0500
Subject: [PATCH 1/4] Removed old code, ported exalead to use aiohttp.

---
 theHarvester/discovery/bingsearch.py    |  2 --
 theHarvester/discovery/exaleadsearch.py | 43 ++++++++++---------------
 theHarvester/discovery/suip.py          |  9 ++----
 theHarvester/lib/core.py                | 19 ++++++-----
 4 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/theHarvester/discovery/bingsearch.py b/theHarvester/discovery/bingsearch.py
index ef503f4c..cbdb9319 100644
--- a/theHarvester/discovery/bingsearch.py
+++ b/theHarvester/discovery/bingsearch.py
@@ -1,7 +1,6 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-# import grequests
 from theHarvester.lib.core import async_fetcher
 
 
@@ -19,7 +18,6 @@ def __init__(self, word, limit, start):
         self.counter = start
 
     async def do_search(self):
-        print('hello from bing do search')
        headers = {
             'Host': self.hostname,
             'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50',
diff --git a/theHarvester/discovery/exaleadsearch.py b/theHarvester/discovery/exaleadsearch.py
index 9bc4e243..68c55f47 100644
--- a/theHarvester/discovery/exaleadsearch.py
+++ b/theHarvester/discovery/exaleadsearch.py
@@ -1,10 +1,7 @@
-from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 import re
-import time
-import grequests
-import requests
+import asyncio
 
 
 class SearchExalead:
@@ -19,7 +16,7 @@ def __init__(self, word, limit, start):
         self.limit = limit
         self.counter = start
 
-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search/web/results/?q=%40{self.word}&elements_per_page=50&start_index=xx'
         headers = {
             'Host': self.hostname,
@@ -27,29 +24,23 @@ def do_search(self):
             'User-agent': Core.get_user_agent()
         }
         urls = [base_url.replace("xx", str(num)) for num in range(self.counter, self.limit, 50) if num <= self.limit]
-        req = []
-        for url in urls:
-            req.append(grequests.get(url, headers=headers, timeout=5))
-            time.sleep(3)
-        responses = grequests.imap(tuple(req), size=3)
+        responses = await async_fetcher.fetch_all(urls, headers=headers)
         for response in responses:
-            # TODO if decoded content contains information about solving captcha print message to user to visit website
-            # TODO to solve it or use a vpn as it appears to be ip based
-            self.total_results += response.content.decode('UTF-8')
+            self.total_results += response
 
-    def do_search_files(self, files):
+    async def do_search_files(self, files):
         url = f'https://{self.server}/search/web/results/?q=%40{self.word}filetype:{self.files}&elements_per_page' \
-              f'=50&start_index={self.counter} '
+            f'=50&start_index={self.counter} '
         headers = {
             'Host': self.hostname,
             'Referer': ('http://' + self.hostname + '/search/web/results/?q=%40' + self.word),
             'User-agent': Core.get_user_agent()
         }
-        h = requests.get(url=url, headers=headers)
-        self.results = h.text
+        responses = await async_fetcher.fetch_all(url, headers=headers)
+        self.results = responses[0]
         self.total_results += self.results
 
-    def check_next(self):
+    async def check_next(self):
         renext = re.compile('topNextUrl')
         nextres = renext.findall(self.results)
         if nextres != []:
@@ -59,27 +50,27 @@ def check_next(self):
             nexty = '0'
         return nexty
 
-    def get_emails(self):
+    async def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.emails()
 
-    def get_hostnames(self):
+    async def get_hostnames(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames()
 
-    def get_files(self):
+    async def get_files(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.fileurls(self.files)
 
-    def process(self):
+    async def process(self):
         print('Searching results')
-        self.do_search()
+        await self.do_search()
 
-    def process_files(self, files):
+    async def process_files(self, files):
         while self.counter < self.limit:
-            self.do_search_files(files)
-            time.sleep(getDelay())
+            await self.do_search_files(files)
             more = self.check_next()
+            await asyncio.sleep(2)
             if more == '1':
                 self.counter += 50
             else:
diff --git a/theHarvester/discovery/suip.py b/theHarvester/discovery/suip.py
index a8c8e14e..0e735921 100644
--- a/theHarvester/discovery/suip.py
+++ b/theHarvester/discovery/suip.py
@@ -1,6 +1,5 @@
 from theHarvester.lib.core import *
 from bs4 import BeautifulSoup
-import requests
 import aiohttp
 import asyncio
 
@@ -17,7 +16,7 @@ def __init__(self, word: str):
     async def request(self, url, params):
         headers = {'User-Agent': Core.get_user_agent()}
         data = {'url': self.word.replace('www.', ''), 'Submit1': 'Submit'}
-        timeout = aiohttp.ClientTimeout(total=360)
+        timeout = aiohttp.ClientTimeout(total=720)
         # by default timeout is 5 minutes we will change that to 6 minutes
         # Depending on the domain and if it has a lot of subdomains you may want to tweak it
         # The results are well worth the wait :)
@@ -51,9 +50,7 @@ async def do_search(self):
                 hosts: list = str(soup.find('pre')).splitlines()
                 await self.clean_hosts(hosts)
         except Exception as e:
-            print('An exception has occurred: ', e)
-            import traceback as t
-            t.print_exc()
+            print(f'An exception has occurred: {e}')
 
     async def get_hostnames(self) -> set:
         return self.totalhosts
@@ -69,4 +66,4 @@ async def clean_hosts(self, soup_hosts):
             if host[0] == '.':
                 self.totalhosts.add(host[1:])
             else:
-                self.totalhosts.add(host)
+                self.totalhosts.add(host)
\ No newline at end of file
diff --git a/theHarvester/lib/core.py b/theHarvester/lib/core.py
index 07d4c169..dfdb20d9 100644
--- a/theHarvester/lib/core.py
+++ b/theHarvester/lib/core.py
@@ -378,14 +378,17 @@ class async_fetcher:
     async def fetch(session, url, params='', json=False) -> Union[str, dict, list]:
         # This fetch method solely focuses on get requests
         # TODO determine if method for post requests is necessary
-        if len(params) == 0:
-            async with session.get(url, params=params) as response:
-                await asyncio.sleep(2)
-                return await response.text() if json is False else await response.json()
-        else:
-            async with session.get(url) as response:
-                await asyncio.sleep(2)
-                return await response.text() if json is False else await response.json()
+        try:
+            if params != '':
+                async with session.get(url, params=params) as response:
+                    await asyncio.sleep(2)
+                    return await response.text() if json is False else await response.json()
+            else:
+                async with session.get(url) as response:
+                    await asyncio.sleep(2)
+                    return await response.text() if json is False else await response.json()
+        except Exception:
+            return ''
 
     @staticmethod
     async def fetch_all(urls, headers='', params='') -> list:

From b169a00f45ed4c3e6bb3755a9acadaf8f1ca3e9b Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Wed, 25 Dec 2019 23:42:17 -0500
Subject: [PATCH 2/4] Added fix for storing emails.

---
 theHarvester/__main__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/theHarvester/__main__.py b/theHarvester/__main__.py
index c2fea6b9..647589de 100644
--- a/theHarvester/__main__.py
+++ b/theHarvester/__main__.py
@@ -103,6 +103,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
             db_stash.store_all(word, all_hosts, 'host', source)
         if store_emails:
             email_list = filter(await search_engine.get_emails())
+            all_emails.extend(email_list)
             db_stash.store_all(word, email_list, 'email', source)
         if store_ip:
             ips_list = await search_engine.get_ips()
@@ -175,7 +176,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
                     if isinstance(e, MissingKey):
                         print(e)
                     else:
-                        pass
+                        print(e)
 
             elif engineitem == 'certspotter':
                 print('\033[94m[*] Searching CertSpotter. \033[0m')
@@ -664,4 +665,6 @@ async def entry_point():
 
 
 if __name__ == '__main__':
+    #import uvloop
+    #uvloop.install()
     asyncio.run(main=entry_point())

From 35dc8a86325ca94c81d8938260d7998daded3a12 Mon Sep 17 00:00:00 2001
From: Matt <36310667+NotoriousRebel@users.noreply.github.com>
Date: Wed, 25 Dec 2019 23:46:28 -0500
Subject: [PATCH 3/4] Update __main__.py

Added fix to make sure emails are properly stored.
---
 theHarvester/__main__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/theHarvester/__main__.py b/theHarvester/__main__.py
index 2c3cc5b4..c52eb245 100644
--- a/theHarvester/__main__.py
+++ b/theHarvester/__main__.py
@@ -97,6 +97,7 @@ def store(search_engine: Any, source: str, process_param: Any = None, store_host
             db_stash.store_all(word, all_hosts, 'host', source)
         if store_emails:
             email_list = filter(search_engine.get_emails())
+            all_emails.extend(email_list)
             db_stash.store_all(word, email_list, 'email', source)
         if store_ip:
             ips_list = search_engine.get_ips()

From 0a3265860a178e46254fac4c77bd477844b9b091 Mon Sep 17 00:00:00 2001
From: NotoriousRebel
Date: Wed, 25 Dec 2019 23:51:28 -0500
Subject: [PATCH 4/4] Ported hunter+exalead from grequests to aiohttp.

---
 theHarvester/discovery/exaleadsearch.py |  2 +-
 theHarvester/discovery/huntersearch.py  | 18 ++++++++----------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/theHarvester/discovery/exaleadsearch.py b/theHarvester/discovery/exaleadsearch.py
index 68c55f47..fae014e9 100644
--- a/theHarvester/discovery/exaleadsearch.py
+++ b/theHarvester/discovery/exaleadsearch.py
@@ -36,7 +36,7 @@ async def do_search_files(self, files):
             'Referer': ('http://' + self.hostname + '/search/web/results/?q=%40' + self.word),
             'User-agent': Core.get_user_agent()
         }
-        responses = await async_fetcher.fetch_all(url, headers=headers)
+        responses = await async_fetcher.fetch_all([url], headers=headers)
         self.results = responses[0]
         self.total_results += self.results
 
diff --git a/theHarvester/discovery/huntersearch.py b/theHarvester/discovery/huntersearch.py
index e4bbc8c9..8699a4e8 100644
--- a/theHarvester/discovery/huntersearch.py
+++ b/theHarvester/discovery/huntersearch.py
@@ -1,7 +1,6 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import grequests
 
 
 class SearchHunter:
@@ -17,22 +16,21 @@ def __init__(self, word, limit, start):
         self.counter = start
         self.database = f'https://api.hunter.io/v2/domain-search?domain={word}&api_key={self.key}&limit={self.limit}'
 
-    def do_search(self):
-        request = grequests.get(self.database)
-        response = grequests.map([request])
-        self.total_results = response[0].content.decode('UTF-8')
+    async def do_search(self):
+        responses = await async_fetcher.fetch_all([self.database], headers={'User-Agent': Core.get_user_agent()})
+        self.total_results += responses[0]
 
-    def process(self):
-        self.do_search()  # Only need to do it once.
+    async def process(self):
+        await self.do_search()  # Only need to do it once.
 
-    def get_emails(self):
+    async def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.emails()
 
-    def get_hostnames(self):
+    async def get_hostnames(self):
         rawres = myparser.Parser(self.total_results, self.word)
         return rawres.hostnames()
 
-    def get_profiles(self):
+    async def get_profiles(self):
        rawres = myparser.Parser(self.total_results, self.word)
        return rawres.profiles()
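
Note on the helper these patches build on: the hunks above only show the modified body of async_fetcher.fetch and the signature of async_fetcher.fetch_all in theHarvester/lib/core.py, yet every ported module (exalead, hunter) now calls fetch_all with a list of URLs and optional headers and concatenates the returned strings. The sketch below is a minimal, self-contained illustration of how such a helper can be written with aiohttp, consistent with the signatures and call sites shown in this series. The body of the real fetch_all is not part of this diff; the timeout value, fallback User-Agent, and the demo URLs below are assumptions made only for the example.

import asyncio
from typing import Union

import aiohttp


class async_fetcher:
    @staticmethod
    async def fetch(session, url, params='', json=False) -> Union[str, dict, list]:
        # GET-only helper, mirroring the core.py hunk above: on any error return ''.
        try:
            if params != '':
                async with session.get(url, params=params) as response:
                    await asyncio.sleep(2)
                    return await response.text() if json is False else await response.json()
            else:
                async with session.get(url) as response:
                    await asyncio.sleep(2)
                    return await response.text() if json is False else await response.json()
        except Exception:
            return ''

    @staticmethod
    async def fetch_all(urls, headers='', params='') -> list:
        # Assumed body: one shared ClientSession, all GET requests gathered concurrently.
        # Timeout and fallback headers are illustrative, not taken from the diff.
        timeout = aiohttp.ClientTimeout(total=60)
        if headers == '':
            headers = {'User-Agent': 'theHarvester-example'}
        async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
            return await asyncio.gather(*(async_fetcher.fetch(session, url, params) for url in urls))


async def demo():
    # Hypothetical usage mirroring the ported do_search methods:
    # fetch several pages concurrently and concatenate the bodies for parsing.
    pages = await async_fetcher.fetch_all(['https://example.com/a', 'https://example.com/b'])
    return ''.join(pages)


if __name__ == '__main__':
    print(len(asyncio.run(demo())))

Under that assumption, a ported caller such as SearchHunter.do_search simply awaits fetch_all([self.database], ...) and reads responses[0]; fetch_all expects a list of URLs, which is also why patch 4 wraps the single exalead file-search URL in [url].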