From 79d5eaef75c28b2eb5a963a9ed5bf726aaff3df4 Mon Sep 17 00:00:00 2001 From: L1ghtn1ng Date: Mon, 31 May 2021 16:55:13 +0100 Subject: [PATCH] Add new modules and tweaks everywhere --- api-keys.yaml | 8 +- theHarvester/__main__.py | 219 ++++++++++++--------- theHarvester/discovery/binaryedgesearch.py | 40 ++++ theHarvester/discovery/intelxsearch.py | 8 +- theHarvester/discovery/rocketreach.py | 66 +++++-- theHarvester/discovery/threatcrowd.py | 21 +- theHarvester/discovery/threatminer.py | 7 + theHarvester/discovery/urlscan.py | 21 +- theHarvester/discovery/zoomeyesearch.py | 199 +++++++++++++++++++ theHarvester/lib/api/api.py | 33 ++-- theHarvester/lib/core.py | 10 + 11 files changed, 480 insertions(+), 152 deletions(-) create mode 100644 theHarvester/discovery/binaryedgesearch.py create mode 100644 theHarvester/discovery/zoomeyesearch.py diff --git a/api-keys.yaml b/api-keys.yaml index c441cc7b..4306fe37 100644 --- a/api-keys.yaml +++ b/api-keys.yaml @@ -1,4 +1,7 @@ apikeys: + binaryedge: + key: + bing: key: @@ -7,7 +10,7 @@ apikeys: secret: github: - key: + key: hunter: key: @@ -32,3 +35,6 @@ apikeys: spyse: key: + + zoomeye: + key: diff --git a/theHarvester/__main__.py b/theHarvester/__main__.py index 6e3bb1eb..8ecf8b7b 100644 --- a/theHarvester/__main__.py +++ b/theHarvester/__main__.py @@ -1,23 +1,24 @@ #!/usr/bin/env python3 +from typing import Dict, List from theHarvester.discovery import * from theHarvester.discovery import dnssearch, takeover, shodansearch from theHarvester.discovery.constants import * from theHarvester.lib import hostchecker -from theHarvester.lib import reportgraph from theHarvester.lib import stash -from theHarvester.lib import statichtmlgenerator from theHarvester.lib.core import * import argparse import asyncio -import datetime +import aiofiles import json import netaddr import re import sys +import string +import secrets -async def start(): +async def start(rest_args=None): """Main program function""" parser = argparse.ArgumentParser(description='theHarvester is used to gather open source intelligence (OSINT) on a company or domain.') parser.add_argument('-d', '--domain', help='Company name or domain to search.', required=True) @@ -33,36 +34,54 @@ async def start(): parser.add_argument('-r', '--take-over', help='Check for takeovers.', default=False, action='store_true') parser.add_argument('-n', '--dns-lookup', help='Enable DNS server lookup, default False.', default=False, action='store_true') parser.add_argument('-c', '--dns-brute', help='Perform a DNS brute force on the domain.', default=False, action='store_true') - parser.add_argument('-f', '--filename', help='Save the results to an HTML,XML and JSON file.', default='', type=str) - parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, bufferoverun, censys, certspotter, crtsh, + parser.add_argument('-f', '--filename', help='Save the results to an XML and JSON file.', default='', type=str) + parser.add_argument('-b', '--source', help='''baidu, bing, binaryedge, bingapi, bufferoverun, censys, certspotter, crtsh, dnsdumpster, duckduckgo, exalead, github-code, google, hackertarget, hunter, intelx, linkedin, linkedin_links, netcraft, omnisint, otx, pentesttools, projectdiscovery, qwant, rapiddns, rocketreach, securityTrails, spyse, sublist3r, threatcrowd, threatminer, - trello, twitter, urlscan, virustotal, yahoo''') + trello, twitter, urlscan, virustotal, yahoo, zoomeye''') - args = parser.parse_args() - filename: str = args.filename - dnsbrute = (args.dns_brute, False) + # determines if 
filename is coming from rest api or user + rest_filename = '' + # indicates this from the rest API + if rest_args: + if rest_args.source and rest_args.source == "getsources": + return list(sorted(Core.get_supportedengines())) + elif rest_args.dns_brute: + args = rest_args + dnsbrute = (rest_args.dns_brute, True) + else: + args = rest_args + # We need to make sure the filename is random as to not overwrite other files + filename: str = args.filename + alphabet = string.ascii_letters + string.digits + rest_filename += f"{''.join(secrets.choice(alphabet) for _ in range(32))}_{filename}" \ + if len(filename) != 0 else "" + + else: + args = parser.parse_args() + filename: str = args.filename + dnsbrute = (args.dns_brute, False) try: db = stash.StashManager() await db.do_init() except Exception: pass - all_emails: list = [] - all_hosts: list = [] - all_ip: list = [] + all_emails: List = [] + all_hosts: List = [] + all_ip: List = [] dnslookup = args.dns_lookup dnsserver = args.dns_server dnstld = args.dns_tld - engines = [] + engines: List = [] # If the user specifies - full: list = [] - ips: list = [] + full: List = [] + ips: List = [] google_dorking = args.google_dork - host_ip: list = [] + host_ip: List = [] limit: int = args.limit shodan = args.shodan start: int = args.start @@ -72,13 +91,16 @@ async def start(): word: str = args.domain takeover_status = args.take_over use_proxy = args.proxies - linkedin_people_list_tracker: list = [] - linkedin_links_tracker: list = [] - twitter_people_list_tracker: list = [] + linkedin_people_list_tracker: List = [] + linkedin_links_tracker: List = [] + twitter_people_list_tracker: List = [] + interesting_urls: list = [] + total_asns: list = [] async def store(search_engine: Any, source: str, process_param: Any = None, store_host: bool = False, store_emails: bool = False, store_ip: bool = False, store_people: bool = False, - store_links: bool = False, store_results: bool = False) -> None: + store_links: bool = False, store_results: bool = False, + store_interestingurls: bool = False, store_asns: bool = False) -> None: """ Persist details into the database. The details to be stored is controlled by the parameters passed to the method. 
@@ -92,6 +114,8 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor :param store_people: whether to store user details :param store_links: whether to store links :param store_results: whether to fetch details from get_results() and persist + :param store_interestingurls: whether to store interesting urls + :param store_asns: whether to store asns """ await search_engine.process(use_proxy) if process_param is None else await \ search_engine.process(process_param, use_proxy) @@ -128,24 +152,28 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor await db.store_all(word, all_emails, 'email', source) if store_people: people_list = await search_engine.get_people() + if source == 'twitter': + twitter_people_list_tracker.extend(people_list) + if source == 'linkedin': + linkedin_people_list_tracker.extend(people_list) await db_stash.store_all(word, people_list, 'people', source) - if len(people_list) == 0: - print('\n[*] No users found.\n\n') - else: - print('\n[*] Users found: ' + str(len(people_list))) - print('---------------------') - for usr in sorted(list(set(people_list))): - print(usr) + if store_links: links = await search_engine.get_links() - await db.store_all(word, links, 'name', engineitem) - if len(links) == 0: - print('\n[*] No links found.\n\n') - else: - print(f'\n[*] Links found: {len(links)}') - print('---------------------') - for link in sorted(list(set(links))): - print(link) + linkedin_links_tracker.extend(links) + if len(links) > 0: + await db.store_all(word, links, 'linkedinlinks', engineitem) + + if store_interestingurls: + iurls = await search_engine.get_interestingurls() + interesting_urls.extend(iurls) + if len(iurls) > 0: + await db.store_all(word, iurls, 'interestingurl', engineitem) + if store_asns: + fasns = await search_engine.get_asns() + total_asns.extend(fasns) + if len(fasns) > 0: + await db.store_all(word, fasns, 'asns', engineitem) stor_lst = [] if args.source is not None: @@ -163,8 +191,16 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor try: baidu_search = baidusearch.SearchBaidu(word, limit) stor_lst.append(store(baidu_search, engineitem, store_host=True, store_emails=True)) - except Exception: - pass + except Exception as e: + print(e) + + elif engineitem == 'binaryedge': + from theHarvester.discovery import binaryedgesearch + try: + binaryedge_search = binaryedgesearch.SearchBinaryEdge(word, limit) + stor_lst.append(store(binaryedge_search, engineitem, store_host=True)) + except Exception as e: + print(e) elif engineitem == 'bing' or engineitem == 'bingapi': from theHarvester.discovery import bingsearch @@ -220,7 +256,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor try: from theHarvester.discovery import dnsdumpster dns_dumpster_search = dnsdumpster.SearchDnsDumpster(word) - stor_lst.append(store(dns_dumpster_search, engineitem, store_host=True)) + stor_lst.append(store(dns_dumpster_search, engineitem, store_host=True, store_ip=True)) except Exception as e: print(f'\033[93m[!] An error occurred with dnsdumpster: {e} \033[0m') @@ -272,7 +308,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor # Import locally or won't work. 
try: intelx_search = intelxsearch.SearchIntelx(word) - stor_lst.append(store(intelx_search, engineitem, store_host=True, store_emails=True)) + stor_lst.append(store(intelx_search, engineitem, store_interestingurls=True, store_emails=True)) except Exception as e: if isinstance(e, MissingKey): print(e) @@ -387,7 +423,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor from theHarvester.discovery import threatcrowd try: threatcrowd_search = threatcrowd.SearchThreatcrowd(word) - stor_lst.append(store(threatcrowd_search, engineitem, store_host=True)) + stor_lst.append(store(threatcrowd_search, engineitem, store_host=True, store_ip=True)) except Exception as e: print(e) @@ -395,7 +431,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor from theHarvester.discovery import threatminer try: threatminer_search = threatminer.SearchThreatminer(word) - stor_lst.append(store(threatminer_search, engineitem, store_host=True)) + stor_lst.append(store(threatminer_search, engineitem, store_host=True, store_ip=True)) except Exception as e: print(e) @@ -414,7 +450,8 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor from theHarvester.discovery import urlscan try: urlscan_search = urlscan.SearchUrlscan(word) - stor_lst.append(store(urlscan_search, engineitem, store_host=True, store_ip=True)) + stor_lst.append(store(urlscan_search, engineitem, store_host=True, store_ip=True, + store_interestingurls=True, store_asns=True)) except Exception as e: print(e) @@ -424,13 +461,22 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor stor_lst.append(store(virustotal_search, engineitem, store_host=True)) elif engineitem == 'yahoo': - from theHarvester.discovery import yahoosearch yahoo_search = yahoosearch.SearchYahoo(word, limit) stor_lst.append(store(yahoo_search, engineitem, store_host=True, store_emails=True)) + + elif engineitem == 'zoomeye': + from theHarvester.discovery import zoomeyesearch + zoomeye_search = zoomeyesearch.SearchZoomEye(word, limit) + stor_lst.append(store(zoomeye_search, engineitem, store_host=True, store_emails=True, + store_ip=True, store_interestingurls=True, store_asns=True)) else: - print('\033[93m[!] Invalid source.\n\n \033[0m') - sys.exit(1) + try: + # Check if dns_brute is defined + rest_args.dns_brute + except Exception: + print('\033[93m[!] Invalid source.\n\n \033[0m') + sys.exit(1) async def worker(queue): while True: @@ -465,6 +511,15 @@ async def handler(lst): await asyncio.gather(*tasks, return_exceptions=True) await handler(lst=stor_lst) + return_ips: List = [] + if rest_args is not None and len(rest_filename) == 0 and rest_args.dns_brute is False: + # Indicates user is using rest api but not wanting output to be saved to a file + full = [host if ':' in host and word in host else word in host.split(':')[0] and host for host in full] + full = list({host for host in full if host}) + full.sort() + # cast to string so Rest API can understand type + return_ips.extend([str(ip) for ip in sorted([netaddr.IPAddress(ip.strip()) for ip in set(all_ip)])]) + return list(set(all_emails)), return_ips, full, '', '' # Sanity check to see if all_emails and all_hosts are defined. 
try: all_emails @@ -526,6 +581,9 @@ async def handler(lst): hosts, ips = await dns_force.run() hosts = list({host for host in hosts if ':' in host}) hosts.sort(key=lambda el: el.split(':')[0]) + # Check if Rest API is being used if so return found hosts + if dnsbrute[1]: + return hosts print('\n[*] Hosts found after DNS brute force:') db = stash.StashManager() for host in hosts: @@ -677,59 +735,24 @@ async def handler(lst): if args.dns_tld is not False: counter = 0 for word in vhost: - search = googlesearch.SearchGoogle(word, limit, counter) - await search.process(google_dorking) - emails = await search.get_emails() - hosts = await search.get_hostnames() + search_google = googlesearch.SearchGoogle(word, limit, counter) + await search_google.process(google_dorking) + emails = await search_google.get_emails() + hosts = await search_google.get_hostnames() print(emails) print(hosts) else: pass # Reporting - if filename != "": + if filename != '': + print('\n[*] Reporting started.') try: - print('\n[*] Reporting started.') - db = stash.StashManager() - scanboarddata = await db.getscanboarddata() - latestscanresults = await db.getlatestscanresults(word) - previousscanresults = await db.getlatestscanresults(word, previousday=True) - latestscanchartdata = await db.latestscanchartdata(word) - scanhistorydomain = await db.getscanhistorydomain(word) - pluginscanstatistics = await db.getpluginscanstatistics() - generator = statichtmlgenerator.HtmlGenerator(word) - html_code = await generator.beginhtml() - html_code += await generator.generatedashboardcode(scanboarddata) - html_code += await generator.generatelatestscanresults(latestscanresults) - if len(screenshot_tups) > 0: - html_code += await generator.generatescreenshots(screenshot_tups) - html_code += await generator.generatepreviousscanresults(previousscanresults) - graph = reportgraph.GraphGenerator(word) - await graph.init_db() - html_code += await graph.drawlatestscangraph(word, latestscanchartdata) - html_code += await graph.drawscattergraphscanhistory(word, scanhistorydomain) - html_code += await generator.generatepluginscanstatistics(pluginscanstatistics) - html_code += '
<p><span style="color: #000000;">Report generated on ' + str( datetime.datetime.now()) + '</span></p>
' - html_code += ''' - - - ''' - except Exception as e: - print(e) - print('\n\033[93m[!] An error occurred while creating the output file.\n\n \033[0m') - sys.exit(1) - - html_file = open(f'{filename}.html' if '.html' not in filename else filename, 'w') - html_file.write(html_code) - html_file.close() - print('[*] Reporting finished.') - print('[*] Saving files.') - - try: - # XML REPORT SECTION - filename = filename.rsplit('.', 1)[0] + '.xml' - + if len(rest_filename) == 0: + filename = filename.rsplit('.', 1)[0] + '.xml' + else: + filename = 'theHarvester/app/static/' + rest_filename.rsplit('.', 1)[0] + '.xml' + # TODO use aiofiles if user is using rest api with open(filename, 'w+') as file: file.write('') for x in all_emails: @@ -767,16 +790,16 @@ async def handler(lst): file.write('') file.write('') - print('[*] XML File saved.') - except Exception as er: - print(f'\033[93m[!] An error occurred while saving the XML file: {er} \033[0m') + print('[*] XML File saved.') + except Exception as error: + print(f'\033[93m[!] An error occurred while saving the XML file: {error} \033[0m') try: # JSON REPORT SECTION filename = filename.rsplit('.', 1)[0] + '.json' # create dict with values for json output - json_dict = dict() + json_dict: Dict = dict() json_dict["emails"] = [email for email in all_emails] json_dict["hosts"] = [host for host in full] @@ -791,9 +814,9 @@ async def handler(lst): if len(linkedin_links_tracker) > 0: json_dict["linkedin_links"] = [link for link in list(sorted(set(linkedin_links_tracker)))] - shodan_dict = dict() + shodan_dict: Dict = dict() if shodanres != []: - shodanalysis = [] + shodanalysis: List = [] for x in shodanres: res = x.split('SAPO') shodan_dict[res[0]] = [res[2], [res[1]]] diff --git a/theHarvester/discovery/binaryedgesearch.py b/theHarvester/discovery/binaryedgesearch.py new file mode 100644 index 00000000..8382e9c6 --- /dev/null +++ b/theHarvester/discovery/binaryedgesearch.py @@ -0,0 +1,40 @@ +from theHarvester.discovery.constants import * +import asyncio + + +class SearchBinaryEdge: + + def __init__(self, word, limit): + self.word = word + self.totalhosts = set() + self.proxy = False + self.key = Core.binaryedge_key() + self.limit = 501 if limit >= 501 else limit + self.limit = 2 if self.limit == 1 else self.limit + if self.key is None: + raise MissingKey('binaryedge') + + async def do_search(self): + base_url = f'https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}' + headers = {'X-KEY': self.key, 'User-Agent': Core.get_user_agent()} + for page in range(1, self.limit): + params = {'page': page} + response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy, params=params, headers=headers) + responses = response[0] + dct = responses + if ('status' in dct.keys() and 'message' in dct.keys()) and \ + (dct['status'] == 400 or 'Bad Parameter' in dct['message'] or 'Error' in dct['message']): + # 400 status code means no more results + break + if 'events' in dct.keys(): + if len(dct['events']) == 0: + break + self.totalhosts.update({host for host in dct['events']}) + await asyncio.sleep(get_delay()) + + async def get_hostnames(self) -> set: + return self.totalhosts + + async def process(self, proxy=False): + self.proxy = proxy + await self.do_search() diff --git a/theHarvester/discovery/intelxsearch.py b/theHarvester/discovery/intelxsearch.py index 5d2851bb..af32f561 100644 --- a/theHarvester/discovery/intelxsearch.py +++ b/theHarvester/discovery/intelxsearch.py @@ -25,8 +25,8 @@ async def do_search(self): # Based on: 
https://github.com/IntelligenceX/SDK/blob/master/Python/intelxapi.py # API requests self identification # https://intelx.io/integrations - headers: dict = {'x-key': self.key, 'User-Agent': f'{Core.get_user_agent()}-theHarvester'} - data: dict = { + headers = {'x-key': self.key, 'User-Agent': f'{Core.get_user_agent()}-theHarvester'} + data = { "term": self.word, "buckets": [], "lookuplevel": 0, @@ -59,8 +59,8 @@ async def process(self, proxy=False): intelx_parser = intelxparser.Parser() self.info = await intelx_parser.parse_dictionaries(self.results) - async def get_emails(self) -> Set: + async def get_emails(self): return self.info[0] - async def get_hostnames(self) -> Set: + async def get_interestingurls(self): return self.info[1] diff --git a/theHarvester/discovery/rocketreach.py b/theHarvester/discovery/rocketreach.py index ee1ab76f..55fe35a8 100644 --- a/theHarvester/discovery/rocketreach.py +++ b/theHarvester/discovery/rocketreach.py @@ -1,31 +1,61 @@ -from theHarvester.discovery.constants import MissingKey +from theHarvester.discovery.constants import * from theHarvester.lib.core import * -import rocketreach +import asyncio -class SearchRocketreach: +class SearchRocketReach: - def __init__(self, word): + def __init__(self, word, limit): + self.ips = set() self.word = word self.key = Core.rocketreach_key() if self.key is None: - raise MissingKey('Rocketreach') - self.total_results = "" + raise MissingKey('RocketReach') + self.hosts = set() self.proxy = False + self.baseurl = 'https://api.rocketreach.co/v2/api/search' + self.links = set() + self.limit = limit async def do_search(self): - rr = rocketreach.Gateway(rocketreach.GatewayConfig(self.key)) - s = rr.person.search().filter(current_employer=self.word) - result = s.execute() - if result.is_success: - lookup = rr.person.lookup(result.people[0].id) - if lookup.is_success: - print(repr(lookup.person)) + try: + headers = { + 'Api-Key': self.key, + 'Content-Type': 'application/json', + 'User-Agent': Core.get_user_agent() + } + + import pprint as pp + + # linkedin_urls = set() + for page in range(1, self.limit): + data = f'{{"query":{{"company_website_url": ["{self.word}"]}}, "start": {page}}}' + result = await AsyncFetcher.post_fetch(self.baseurl, headers=headers, data=data, json=True) + + if 'detail' in result.keys() and 'error' in result.keys() and 'Subscribe to a plan to access' in result[ + 'detail']: + # No more results can be fetched + break + if 'detail' in result.keys() and 'Request was throttled.' in result['detail']: + # Rate limit has been triggered need to sleep extra + print(f'RocketReach requests have been throttled; ' + f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}') + break + if 'profiles' in dict(result).keys(): + if len(result['profiles']) == 0: + break + for profile in result['profiles']: + if 'linkedin_url' in dict(profile).keys(): + self.links.add(profile['linkedin_url']) + + await asyncio.sleep(get_delay() + 2) + + except Exception as e: + print(f'An exception has occurred: {e}') + + async def get_links(self): + return self.links async def process(self, proxy=False): self.proxy = proxy - await self.do_search() # Only need to do it once. 
- - # async def get_emails(self): - # rawres = myparser.Parser(self.total_results, self.word) - # return await rawres.emails() + await self.do_search() diff --git a/theHarvester/discovery/threatcrowd.py b/theHarvester/discovery/threatcrowd.py index 1cad8f21..78cbbfc3 100644 --- a/theHarvester/discovery/threatcrowd.py +++ b/theHarvester/discovery/threatcrowd.py @@ -1,28 +1,31 @@ -from typing import Coroutine +from typing import List from theHarvester.lib.core import * -from theHarvester.parsers import myparser class SearchThreatcrowd: def __init__(self, word): self.word = word.replace(' ', '%20') - self.results: str = "" - self.totalresults: str = "" + self.hostnames = list() + self.ips = list() self.proxy = False async def do_search(self): base_url = f'https://www.threatcrowd.org/searchApi/v2/domain/report/?domain={self.word}' headers = {'User-Agent': Core.get_user_agent()} try: - responses = await AsyncFetcher.fetch_all([base_url], headers=headers, proxy=self.proxy) - self.results = responses[0] + responses = await AsyncFetcher.fetch_all([base_url], headers=headers, proxy=self.proxy, json=True) + resp = responses[0] + self.ips = {ip['ip_address'] for ip in resp['resolutions'] if len(ip['ip_address']) > 4} + self.hostnames = set(list(resp['subdomains'])) except Exception as e: print(e) - self.totalresults += self.results - async def get_hostnames(self) -> Coroutine: - return await myparser.Parser(self.results, self.word).hostnames() + async def get_ips(self) -> List: + return self.ips + + async def get_hostnames(self) -> List: + return self.hostnames async def process(self, proxy=False): self.proxy = proxy diff --git a/theHarvester/discovery/threatminer.py b/theHarvester/discovery/threatminer.py index e94ab205..d5ee7069 100644 --- a/theHarvester/discovery/threatminer.py +++ b/theHarvester/discovery/threatminer.py @@ -7,16 +7,23 @@ class SearchThreatminer: def __init__(self, word): self.word = word self.totalhosts = list + self.totalips = list self.proxy = False async def do_search(self): url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5' response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) self.totalhosts: set = {host for host in response[0]['results']} + second_url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2' + secondresp = await AsyncFetcher.fetch_all([second_url], json=True, proxy=self.proxy) + self.totalips: set = {resp['ip'] for resp in secondresp[0]['results']} async def get_hostnames(self) -> Type[list]: return self.totalhosts + async def get_ips(self) -> Type[list]: + return self.totalips + async def process(self, proxy=False): self.proxy = proxy await self.do_search() diff --git a/theHarvester/discovery/urlscan.py b/theHarvester/discovery/urlscan.py index 9b08b1c9..24aa3c6e 100644 --- a/theHarvester/discovery/urlscan.py +++ b/theHarvester/discovery/urlscan.py @@ -1,12 +1,14 @@ -from typing import Type +from typing import List from theHarvester.lib.core import * class SearchUrlscan: def __init__(self, word): self.word = word - self.totalhosts = list - self.totalips = list + self.totalhosts = list() + self.totalips = list() + self.interestingurls = list() + self.totalasns = list() self.proxy = False async def do_search(self): @@ -15,13 +17,22 @@ async def do_search(self): resp = response[0] self.totalhosts = {f"{page['page']['domain']}" for page in resp['results']} self.totalips = {f"{page['page']['ip']}" for page in resp['results'] if 'ip' in page['page'].keys()} + self.interestingurls = {f"{page['page']['url']}" for 
page in resp['results'] if self.word in page['page']['url'] + and 'url' in page['page'].keys()} + self.totalasns = {f"{page['page']['asn']}" for page in resp['results'] if 'asn' in page['page'].keys()} - async def get_hostnames(self) -> Type[list]: + async def get_hostnames(self) -> List: return self.totalhosts - async def get_ips(self) -> Type[list]: + async def get_ips(self) -> List: return self.totalips + async def get_interestingurls(self) -> List: + return self.interestingurls + + async def get_asns(self) -> List: + return self.totalasns + async def process(self, proxy=False): self.proxy = proxy await self.do_search() diff --git a/theHarvester/discovery/zoomeyesearch.py b/theHarvester/discovery/zoomeyesearch.py new file mode 100644 index 00000000..5df69d43 --- /dev/null +++ b/theHarvester/discovery/zoomeyesearch.py @@ -0,0 +1,199 @@ +from theHarvester.discovery.constants import * +from theHarvester.lib.core import * +from theHarvester.parsers import myparser +import asyncio +import re + + +class SearchZoomEye: + + def __init__(self, word, limit): + self.word = word + self.limit = limit + self.key = Core.zoomeye_key() + if self.key is None: + raise MissingKey('zoomeye') + self.baseurl = 'https://api.zoomeye.org/host/search' + self.proxy = False + self.totalasns = list() + self.totalhosts = list() + self.interestingurls = list() + self.totalips = list() + self.totalemails = list() + # Regex used is directly from: https://github.com/GerbenJavado/LinkFinder/blob/master/linkfinder.py#L29 + # Maybe one day it will be a pip package + # Regardless LinkFinder is an amazing tool! + self.iurl_regex = r""" + (?:"|') # Start newline delimiter + ( + ((?:[a-zA-Z]{1,10}://|//) # Match a scheme [a-Z]*1-10 or // + [^"'/]{1,}\. # Match a domainname (any character + dot) + [a-zA-Z]{2,}[^"']{0,}) # The domainextension and/or path + | + ((?:/|\.\./|\./) # Start with /,../,./ + [^"'><,;| *()(%%$^/\\\[\]] # Next character can't be... + [^"'><,;|()]{1,}) # Rest of the characters can't be + | + ([a-zA-Z0-9_\-/]{1,}/ # Relative endpoint with / + [a-zA-Z0-9_\-/]{1,} # Resource name + \.(?:[a-zA-Z]{1,4}|action) # Rest + extension (length 1-4 or action) + (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters + | + ([a-zA-Z0-9_\-/]{1,}/ # REST API (no extension) with / + [a-zA-Z0-9_\-/]{3,} # Proper REST endpoints usually have 3+ chars + (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters + | + ([a-zA-Z0-9_\-]{1,} # filename + \.(?:php|asp|aspx|jsp|json| + action|html|js|txt|xml) # . + extension + (?:[\?|#][^"|']{0,}|)) # ? 
or # mark with parameters + ) + (?:"|') # End newline delimiter + """ + self.iurl_regex = re.compile(self.iurl_regex, re.VERBOSE) + + async def do_search(self): + headers = { + 'API-KEY': self.key, + 'User-Agent': Core.get_user_agent() + } + params = ( + ('query', f'site:{self.word}'), + ('page', '1'), + ) + # TODO add: https://www.zoomeye.org/profile/domain to fetch subdomains more easily once + # once api endpoint is created + response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers, + params=params) + # First request determines how many pages there in total + resp = response[0] + total_pages = int(resp['available']) + self.limit = self.limit if total_pages > self.limit else total_pages + self.limit = 3 if self.limit == 2 else self.limit + cur_page = 2 if self.limit >= 2 else -1 + # Means there is only one page + # hostnames, emails, ips, asns, iurls + nomatches_counter = 0 + # cur_page = -1 + if cur_page == -1: + # No need to do loop just parse and leave + if 'matches' in resp.keys(): + hostnames, emails, ips, asns, iurls = await self.parse_matchs(resp['matches']) + self.totalhosts.extend(hostnames) + self.totalemails.extend(emails) + self.totalips.extend(ips) + self.totalasns.extend(asns) + self.interestingurls.extend(iurls) + else: + if 'matches' in resp.keys(): + # Parse out initial results and then continue to loop + hostnames, emails, ips, asns, iurls = await self.parse_matchs(resp['matches']) + self.totalhosts.extend(hostnames) + self.totalemails.extend(emails) + self.totalips.extend(ips) + self.totalasns.extend(asns) + self.interestingurls.extend(iurls) + + for num in range(2, self.limit): + print(f'Currently on page: {num}') + params = ( + ('query', f'site:{self.word}'), + ('page', f'{num}'), + ) + response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers, + params=params) + resp = response[0] + if 'matches' not in resp.keys(): + print(f'Your resp: {resp}') + print('Match not found in keys') + break + + hostnames, emails, ips, asns, iurls = await self.parse_matchs(resp['matches']) + + if len(hostnames) == 0 and len(emails) == 0 and len(ips) == 0 \ + and len(asns) == 0 and len(iurls) == 0: + nomatches_counter += 1 + + if nomatches_counter >= 5: + break + + self.totalhosts.extend(hostnames) + self.totalemails.extend(emails) + self.totalips.extend(ips) + self.totalasns.extend(asns) + self.interestingurls.extend(iurls) + + await asyncio.sleep(get_delay() + 2) + + async def parse_matchs(self, matches): + # Helper function to parse items from match json + # ips = {match["ip"] for match in matches} + ips = set() + iurls = set() + hostnames = set() + asns = set() + emails = set() + for match in matches: + try: + ips.add(match['ip']) + + if 'geoinfo' in match.keys(): + asns.add(int(match['geoinfo']['asn'])) + + if 'rdns_new' in match.keys(): + rdns_new = match['rdns_new'] + + if ',' in rdns_new: + parts = str(rdns_new).split(',') + rdns_new = parts[0] + if len(parts) == 2: + hostnames.add(parts[1]) + rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new + hostnames.add(rdns_new) + else: + rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new + hostnames.add(rdns_new) + + if 'rdns' in match.keys(): + rdns = match['rdns'] + rdns = rdns[:-1] if rdns[-1] == '.' else rdns + hostnames.add(rdns) + + if 'portinfo' in match.keys(): + # re. 
+ temp_emails = set(await self.parse_emails(match['portinfo']['banner'])) + emails.update(temp_emails) + hostnames.update(set(await self.parse_hostnames(match['portinfo']['banner']))) + iurls = {str(iurl.group(1)).replace('"', '') for iurl + in re.finditer(self.iurl_regex, match['portinfo']['banner']) + if self.word in str(iurl.group(1))} + except Exception as e: + print(f'An exception has occurred: {e}') + return hostnames, emails, ips, asns, iurls + + async def process(self, proxy=False): + self.proxy = proxy + await self.do_search() # Only need to do it once. + + async def parse_emails(self, content): + rawres = myparser.Parser(content, self.word) + return await rawres.emails() + + async def parse_hostnames(self, content): + rawres = myparser.Parser(content, self.word) + return await rawres.hostnames() + + async def get_hostnames(self): + return set(self.totalhosts) + + async def get_emails(self): + return set(self.totalemails) + + async def get_ips(self): + return set(self.totalips) + + async def get_asns(self): + return set(self.totalasns) + + async def get_interestingurls(self): + return set(self.interestingurls) diff --git a/theHarvester/lib/api/api.py b/theHarvester/lib/api/api.py index 032017fe..7b6e6373 100644 --- a/theHarvester/lib/api/api.py +++ b/theHarvester/lib/api/api.py @@ -78,20 +78,20 @@ async def dnsbrute(request: Request, user_agent: str = Header(None), if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): response = RedirectResponse(app.url_path_for('bot')) return response - dns_bruteforce = await __main__.entry_point(argparse.Namespace(dns_brute=True, - dns_lookup=False, - dns_server=False, - dns_tld=False, - domain=domain, - filename='', - google_dork=False, - limit=500, - proxies=False, - shodan=False, - source=','.join([]), - start=0, - take_over=False, - virtual_host=False)) + dns_bruteforce = await __main__.start(argparse.Namespace(dns_brute=True, + dns_lookup=False, + dns_server=False, + dns_tld=False, + domain=domain, + filename='', + google_dork=False, + limit=500, + proxies=False, + shodan=False, + source=','.join([]), + start=0, + take_over=False, + virtual_host=False)) return {'dns_bruteforce': dns_bruteforce} @@ -115,7 +115,7 @@ async def query(request: Request, dns_server: str = Query(""), user_agent: str = response = RedirectResponse(app.url_path_for('bot')) return response try: - emails, ips, urls, html_filename, xml_filename = await __main__.start(argparse.Namespace(dns_brute=dns_brute, + emails, ips, urls, xml_filename = await __main__.start(argparse.Namespace(dns_brute=dns_brute, dns_lookup=dns_lookup, dns_server=dns_server, dns_tld=dns_tld, @@ -130,7 +130,6 @@ async def query(request: Request, dns_server: str = Query(""), user_agent: str = take_over=take_over, virtual_host=virtual_host)) - return {'domain': f'{domain}', 'emails': emails, 'ips': ips, 'urls': urls, 'html_file': f'{html_filename}', - 'xml_file': f'{xml_filename}'} + return {'domain': f'{domain}', 'emails': emails, 'ips': ips, 'urls': urls, 'xml_file': f'{xml_filename}'} except Exception as e: return {'exception': f'{e}'} diff --git a/theHarvester/lib/core.py b/theHarvester/lib/core.py index 13935d0e..6d343eb6 100644 --- a/theHarvester/lib/core.py +++ b/theHarvester/lib/core.py @@ -28,6 +28,10 @@ def api_keys() -> dict: keys = yaml.safe_load(api_keys) return keys['apikeys'] + @staticmethod + def binaryedge_key() -> str: + return Core.api_keys()['binaryedge']['key'] + @staticmethod def bing_key() -> str: return 
Core.api_keys()['bing']['key'] @@ -72,6 +76,10 @@ def shodan_key() -> str: def spyse_key() -> str: return Core.api_keys()['spyse']['key'] + @staticmethod + def zoomeye_key() -> str: + return Core.api_keys()['zoomeye']['key'] + @staticmethod def proxy_list() -> List: try: @@ -106,6 +114,7 @@ def banner() -> None: @staticmethod def get_supportedengines() -> Set[Union[str, Any]]: supportedengines = {'baidu', + 'binaryedge', 'bing', 'bingapi', 'bufferoverun', @@ -140,6 +149,7 @@ def get_supportedengines() -> Set[Union[str, Any]]: 'urlscan', 'virustotal', 'yahoo', + 'zoomeye' } return supportedengines
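
Smoke test (not part of the patch): a minimal sketch of driving the two new modules directly, assuming valid binaryedge and zoomeye keys are set in api-keys.yaml. The class names, constructors, and getters are the ones added in this diff; the domain and limit values are placeholders.

    import asyncio
    from theHarvester.discovery import binaryedgesearch, zoomeyesearch

    async def demo():
        # SearchBinaryEdge(word, limit) pages the subdomain endpoint and collects hostnames
        be = binaryedgesearch.SearchBinaryEdge('example.com', 5)
        await be.process(proxy=False)
        print(await be.get_hostnames())

        # SearchZoomEye(word, limit) also yields emails, IPs, ASNs and interesting URLs
        ze = zoomeyesearch.SearchZoomEye('example.com', 3)
        await ze.process(proxy=False)
        print(await ze.get_hostnames(), await ze.get_ips(), await ze.get_asns())

    asyncio.run(demo())

Both classes raise MissingKey when the corresponding key is absent, so the sketch assumes the keys exist before the constructors run.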