diff --git a/.gitignore b/.gitignore
index 3f36fa1e..cdfc67d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,4 @@ venv
 .pytest_cache
 build/
 dist/
-theHarvester.egg-info/
\ No newline at end of file
+api-keys.yaml
diff --git a/theHarvester/discovery/constants.py b/theHarvester/discovery/constants.py
index 0cd242f8..f792b98b 100644
--- a/theHarvester/discovery/constants.py
+++ b/theHarvester/discovery/constants.py
@@ -1,7 +1,12 @@
+from theHarvester.lib.core import *
 from typing import Union
 import random
+import aiohttp
+import re
+from bs4 import BeautifulSoup
 
-googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
+googleUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 ' \
+           'Safari/537.36 '
 
 
 def splitter(links):
@@ -52,7 +57,7 @@ def getDelay() -> float:
     return random.randint(1, 3) - .5
 
 
-def search(text: str) -> bool:
+async def search(text: str) -> bool:
     # Helper function to check if Google has blocked traffic.
     for line in text.strip().splitlines():
         if 'This page appears when Google automatically detects requests coming from your computer network' in line \
@@ -62,13 +67,12 @@
     return False
 
 
-def google_workaround(visit_url: str) -> Union[bool, str]:
+async def google_workaround(visit_url: str) -> Union[bool, str]:
     """
     Function that makes a request on our behalf, if Google starts to block us
    :param visit_url: Url to scrape
    :return: Correct html that can be parsed by BS4
     """
-    import requests
     url = 'https://websniffer.cc/'
     data = {
         'Cookie': '',
@@ -77,12 +81,16 @@
         'type': 'GET&http=1.1',
         'uak': str(random.randint(4, 8))  # select random UA to send to Google
     }
-    resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
-    returned_html = resp.text
-    if search(returned_html):
+    import requests
+    returned_html = requests.post(url, data=data, headers={'User-Agent': Core.get_user_agent()})
+    returned_html = returned_html.text
+    # TODO FIX
+    # returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data)
+    if await search(returned_html):
+        print('going to second method!')
         # indicates that google is serving workaround a captcha
-        # TODO rework workaround with more websites to send requests on our behalf or utilize proxies option in request
-        return True
+        # That means we will try our second option, which utilizes proxies
+        return await second_method(visit_url)
     # the html we get is malformed for BS4 as there are no greater than or less than signs
     if '<html>' in returned_html:
         start_index = returned_html.index('<html>')
@@ -96,6 +104,104 @@
     return correct_html
 
 
+async def request(url, params):
+    headers = {'User-Agent': Core.get_user_agent()}
+    session = aiohttp.ClientSession(headers=headers)
+    results = await AsyncFetcher.fetch(session, url=url, params=params)
+    await session.close()
+    return results
+
+
+async def proxy_fetch(session, url, proxy):
+    try:
+        async with session.get(url, proxy=proxy, ssl=False) as resp:
+            return f'success:{proxy}', await resp.text()
+    except Exception as e:
+        # print(e)
+        return f'failed:{proxy}', proxy
+
+
+async def proxy_test(proxies, url):
+    print('doing proxy test with this number of proxies: ', len(proxies))
+    headers = {'User-Agent': Core.get_user_agent()}
+    timeout = aiohttp.ClientTimeout(total=40)
+    async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
+        texts = await asyncio.gather(*[proxy_fetch(session, url, proxy) for proxy in proxies])
+        return texts
+
+
+async def get_proxies():
+    print('inside get proxies')
+    # ideas borrowed and modified from twitterscraper
+    proxy_url = 'https://free-proxy-list.net/'
+    response = await AsyncFetcher.fetch_all([proxy_url])
+    response = response[0]
+    soup = BeautifulSoup(response, 'lxml')
+    table = soup.find('table', id='proxylisttable')
+    list_tr = table.find_all('tr')
+    list_td = [elem.find_all('td') for elem in list_tr]
+    list_td = [x for x in list_td if x is not None and len(x) > 0]
+    list_ip = [elem[0].text for elem in list_td]
+    list_ports = [elem[1].text for elem in list_td]
+    list_proxies = [f"http://{':'.join(elem)}" for elem in list(zip(list_ip, list_ports))]
+    return list_proxies
+
+
+async def clean_dct(dct: dict, second_test=False):
+    print('cleaning dct and second test is: ', second_test)
+    good_proxies = set()
+    for proxy, text in dct.items():
+        if 'failed' not in proxy:
+            if second_test:
+                if await search(text) is False:
+                    print(text)
+                    return text
+            else:
+                good_proxies.add(proxy[proxy.find(':') + 1:])
+    return good_proxies if second_test is False else True
+
+
+async def create_init_proxies():
+    print('inside create init proxies')
+    url = "https://suip.biz"
+    first_param = [url, (('act', 'proxy1'),), ]
+    second_param = [url, (('act', 'proxy2'),), ]
+    third_param = [url, (('act', 'proxy3'),), ]
+    async_requests = [
+        request(url=url, params=params)
+        for url, params in [first_param, second_param, third_param]
+    ]
+    results = await asyncio.gather(*async_requests)
+    proxy_set = set()
+    for resp in results:
+        ip_candidates = re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', resp)
+        proxy_set.update({f'http://{ip}' for ip in ip_candidates})
+
+    new_proxies = await get_proxies()
+    proxy_set.update({proxy for proxy in new_proxies})
+    return proxy_set
+
+
+async def second_method(url: str) -> Union[str, bool]:
+    print('inside second method')
+    # First visit example.com to filter out bad proxies
+    init_url = "http://example.com"
+    proxy_set = await create_init_proxies()
+    tuples = await proxy_test(proxy_set, init_url)
+    mega_dct = dict((x, y) for x, y in tuples)
+    proxy_set = await clean_dct(mega_dct)
+    # After we clean our proxy set now we use them to visit the url we care about
+    print('got working proxies now onto the juice')
+    tuples = await proxy_test(proxy_set, url)
+    mega_dct = dict((x, y) for x, y in tuples)
+    results = await clean_dct(mega_dct, second_test=True)
+    print('returning the juice')
+    # pass in second_test flag as True to indicate this will be
+    # the text we care about or a bool to indicate it was
+    # not successful
+    return results
+
+
 class MissingKey(Exception):
 
     def __init__(self, identity_flag: bool):
diff --git a/theHarvester/discovery/githubcode.py b/theHarvester/discovery/githubcode.py
index b620e7e3..aa9caa0c 100644
--- a/theHarvester/discovery/githubcode.py
+++ b/theHarvester/discovery/githubcode.py
@@ -1,10 +1,9 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-from requests import Response
-import time
-from typing import List, Dict, Any, Optional, NamedTuple
+from typing import List, Dict, Any, Optional, NamedTuple, Tuple
+import asyncio
+import aiohttp
 import urllib.parse as urlparse
 
 
@@ -40,20 +39,21 @@ def __init__(self, word, limit):
             raise MissingKey(True)
 
     @staticmethod
-    def fragments_from_response(response: Response) -> List[str]:
-        items: List[Dict[str, Any]] = response.json().get('items') or list()
+    async def fragments_from_response(json_data: dict) -> List[str]:
+        items: List[Dict[str, Any]] = json_data.get('items') or list()
         fragments: List[str] = list()
         for item in items:
             matches = item.get("text_matches") or list()
             for match in matches:
                 fragments.append(match.get("fragment"))
+
         return [fragment for fragment in fragments if fragment is not None]
 
     @staticmethod
-    def page_from_response(page: str, response: Response) -> Optional[Any]:
-        page_link = response.links.get(page)
+    async def page_from_response(page: str, links) -> Optional[Any]:
+        page_link = links.get(page)
         if page_link:
-            parsed = urlparse.urlparse(page_link.get("url"))
+            parsed = urlparse.urlparse(str(page_link.get("url")))
             params = urlparse.parse_qs(parsed.query)
             pages: List[Any] = params.get('page', [None])
             page_number = pages[0] and int(pages[0])
@@ -61,21 +61,22 @@
         else:
             return None
 
-    def handle_response(self, response: Response) -> Optional[Any]:
-        if response.ok:
-            results = self.fragments_from_response(response)
-            next_page = self.page_from_response("next", response)
-            last_page = self.page_from_response("last", response)
+    async def handle_response(self, response: Tuple[str, dict, int, Any]):
+        text, json_data, status, links = response
+        if status == 200:
+            results = await self.fragments_from_response(json_data)
+            next_page = await self.page_from_response("next", links)
+            last_page = await self.page_from_response("last", links)
             return SuccessResult(results, next_page, last_page)
-        elif response.status_code == 429 or response.status_code == 403:
+        elif status == 429 or status == 403:
             return RetryResult(60)
         else:
             try:
-                return ErrorResult(response.status_code, response.json())
+                return ErrorResult(status, json_data)
             except ValueError:
-                return ErrorResult(response.status_code, response.text)
+                return ErrorResult(status, text)
 
-    def do_search(self, page: Optional[int]) -> Response:
+    async def do_search(self, page: Optional[int]) -> Tuple[str, dict, int, Any]:
         if page is None:
             url = f'https://{self.server}/search/code?q="{self.word}"'
         else:
@@ -84,37 +85,41 @@
             'Host': self.server,
             'User-agent': Core.get_user_agent(),
             'Accept': "application/vnd.github.v3.text-match+json",
-            'Authorization': 'token {}'.format(self.key)
+            'Authorization': f'token {self.key}'
         }
-        return requests.get(url=url, headers=headers, verify=True)
+        async with aiohttp.ClientSession(headers=headers) as sess:
+            async with sess.get(url) as resp:
+                return await resp.text(), await resp.json(), resp.status, resp.links
 
     @staticmethod
-    def next_page_or_end(result: SuccessResult) -> Optional[int]:
+    async def next_page_or_end(result: SuccessResult) -> Optional[int]:
         if result.next_page is not None:
             return result.next_page
         else:
             return result.last_page
 
-    def process(self):
-        while self.counter <= self.limit and self.page is not None:
-            api_response = self.do_search(self.page)
-            result = self.handle_response(api_response)
-            if type(result) == SuccessResult:
-                print(f'\tSearching {self.counter} results.')
-                for fragment in result.fragments:
-                    self.total_results += fragment
-                    self.counter = self.counter + 1
-
-                self.page = self.next_page_or_end(result)
-                time.sleep(getDelay())
-            elif type(result) == RetryResult:
-                sleepy_time = getDelay() + result.time
-                print(f'\tRetrying page in {sleepy_time} seconds...')
-                time.sleep(sleepy_time)
-            elif type(result) == ErrorResult:
-                raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
-            else:
-                raise Exception("\tUnknown exception occurred")
+    async def process(self):
+        try:
+            while self.counter <= self.limit and self.page is not None:
+                api_response = await self.do_search(self.page)
+                result = await self.handle_response(api_response)
+                if type(result) == SuccessResult:
+                    print(f'\tSearching {self.counter} results.')
+                    for fragment in result.fragments:
+                        self.total_results += fragment
+                        self.counter = self.counter + 1
+                    self.page = await self.next_page_or_end(result)
+                    await asyncio.sleep(getDelay())
+                elif type(result) == RetryResult:
+                    sleepy_time = getDelay() + result.time
+                    print(f'\tRetrying page in {sleepy_time} seconds...')
+                    await asyncio.sleep(sleepy_time)
+                elif type(result) == ErrorResult:
+                    raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
+                else:
+                    raise Exception("\tUnknown exception occurred")
+        except Exception as e:
+            print(f'An exception has occurred: {e}')
 
     async def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)
diff --git a/theHarvester/discovery/googlesearch.py b/theHarvester/discovery/googlesearch.py
index 4eb64843..b7cba3f2 100644
--- a/theHarvester/discovery/googlesearch.py
+++ b/theHarvester/discovery/googlesearch.py
@@ -1,8 +1,6 @@
 from theHarvester.discovery.constants import *
 from theHarvester.parsers import myparser
-import requests
-import time
-
+import asyncio
 
 
 class SearchGoogle:
@@ -18,85 +16,89 @@ def __init__(self, word, limit, start):
         self.limit = limit
         self.counter = start
 
-    def do_search(self):
+    async def do_search(self):
         # Do normal scraping.
         urly = 'http://' + self.server + '/search?num=' + self.quantity + '&start=' + str(
             self.counter) + '&hl=en&meta=&q=%40\"' + self.word + '\"'
         try:
             headers = {'User-Agent': googleUA}
-            r = requests.get(urly, headers=headers)
+            resp = await AsyncFetcher.fetch_all([urly], headers=headers)
         except Exception as e:
             print(e)
-        self.results = r.text
-        if search(self.results):
+        self.results = resp[0]
+        searched = await search(self.results)
+        if searched:
             try:
-                self.results = google_workaround(urly)
+                self.results = await google_workaround(urly)
                 if isinstance(self.results, bool):
                     print('Google is blocking your ip and the workaround, returning')
                     return
-            except Exception:
+            except Exception as e:
+                print(e)
+                import traceback as t
+                t.print_exc()
                 # google blocked, no useful result
                 return
-        time.sleep(getDelay())
+        await asyncio.sleep(getDelay())
         self.totalresults += self.results
 
-    def do_search_profiles(self):
+    async def do_search_profiles(self):
         urly = 'http://' + self.server + '/search?num=' + self.quantity + '&start=' + str(
             self.counter) + '&hl=en&meta=&q=site:www.google.com%20intitle:\"Google%20Profile\"%20\"Companies%20I%27ve%20worked%20for\"%20\"at%20' + self.word + '\"'
         try:
             headers = {'User-Agent': googleUA}
-            r = requests.get(urly, headers=headers)
+            resp = await AsyncFetcher.fetch_all([urly], headers=headers)
         except Exception as e:
             print(e)
-        self.results = r.text
-        if search(self.results):
+        self.results = resp[0]
+        if await search(self.results):
             try:
-                self.results = google_workaround(urly)
+                self.results = await google_workaround(urly)
                 if isinstance(self.results, bool):
                     print('Google is blocking your ip and the workaround, returning')
                     return
             except Exception:
                 # google blocked, no useful result
                 return
-        time.sleep(getDelay())
+        await asyncio.sleep(getDelay())
         self.totalresults += self.results
 
-    def get_emails(self):
+    async def get_emails(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        return rawres.emails()
+        return await rawres.emails()
 
-    def get_hostnames(self):
+    async def get_hostnames(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        return rawres.hostnames()
+        return await rawres.hostnames()
 
-    def get_files(self):
+    async def get_files(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.fileurls(self.files)
 
-    def get_profiles(self):
+    async def get_profiles(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.profiles()
 
-    def process(self, google_dorking):
+    async def process(self, google_dorking):
         if google_dorking is False:
             while self.counter <= self.limit and self.counter <= 1000:
-                self.do_search()
+                await self.do_search()
                 print(f'\tSearching {self.counter} results.')
                 self.counter += 100
         else:  # Google dorking is true.
             self.counter = 0  # Reset counter.
             print('\n')
             print('[-] Searching with Google Dorks: ')
-            self.googledork()  # Call Google dorking method if user wanted it!
+            await self.googledork()  # Call Google dorking method if user wanted it!
 
-    def process_profiles(self):
+    async def process_profiles(self):
         while self.counter < self.limit:
-            self.do_search_profiles()
-            time.sleep(getDelay())
+            await self.do_search_profiles()
+            await asyncio.sleep(getDelay())
             self.counter += 100
             print(f'\tSearching {self.counter} results.')
 
-    def append_dorks(self):
+    async def append_dorks(self):
         # Wrap in try-except incase filepaths are messed up.
         try:
             with open('wordlists/dorks.txt', mode='r') as fp:
@@ -104,7 +106,7 @@
         except FileNotFoundError as error:
             print(error)
 
-    def construct_dorks(self):
+    async def construct_dorks(self):
         # Format is: site:targetwebsite.com + space + inurl:admindork
         colon = '%3A'
         plus = '%2B'
@@ -128,12 +130,12 @@
                               .replace('&', ampersand).replace('(', left_peren).replace(')', right_peren).replace('|', pipe)
                               + space + self.word for dork in self.dorks)
 
-    def googledork(self):
-        self.append_dorks()  # Call functions to create list.
-        self.construct_dorks()
-        self.send_dorks()
+    async def googledork(self):
+        await self.append_dorks()  # Call functions to create list.
+        await self.construct_dorks()
+        await self.send_dorks()
 
-    def send_dorks(self):  # Helper function to minimize code reusability.
+    async def send_dorks(self):  # Helper function to minimize code reusability.
         headers = {'User-Agent': googleUA}
         # Get random user agent to try and prevent google from blocking IP.
         for num in range(len(self.links)):
@@ -141,18 +143,18 @@
                 if num % 10 == 0 and num > 0:
                     print(f'\tSearching through {num} results')
                 link = self.links[num]
-                req = requests.get(link, headers=headers)
-                self.results = req.text
-                if search(self.results):
+                req = await AsyncFetcher.fetch_all([link], headers=headers)
+                self.results = req[0]
+                if await search(self.results):
                     try:
-                        self.results = google_workaround(link)
+                        self.results = await google_workaround(link)
                         if isinstance(self.results, bool):
                             print('Google is blocking your ip and the workaround, returning')
                             return
                     except Exception:
                         # google blocked, no useful result
                         return
-                time.sleep(getDelay())
+                await asyncio.sleep(getDelay())
                 self.totalresults += self.results
             except Exception as e:
                 print(f'\tException Occurred {e}')
diff --git a/theHarvester/discovery/linkedinsearch.py b/theHarvester/discovery/linkedinsearch.py
index a0b165f8..cb69957c 100644
--- a/theHarvester/discovery/linkedinsearch.py
+++ b/theHarvester/discovery/linkedinsearch.py
@@ -1,8 +1,7 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-import time
+import asyncio
 
 
 class SearchLinkedin:
@@ -16,15 +15,15 @@ def __init__(self, word, limit):
         self.limit = int(limit)
         self.counter = 0
 
-    def do_search(self):
+    async def do_search(self):
         urly = 'http://' + self.server + '/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.word
         try:
             headers = {'User-Agent': Core.get_user_agent()}
-            r = requests.get(urly, headers=headers)
-            self.results = r.text
-            if search(self.results):
+            resp = await AsyncFetcher.fetch_all([urly], headers=headers)
+            self.results = resp[0]
+            if await search(self.results):
                 try:
-                    self.results = google_workaround(urly)
+                    self.results = await google_workaround(urly)
                     if isinstance(self.results, bool):
                         print('Google is blocking your ip and the workaround, returning')
                         return
@@ -33,20 +32,20 @@ def do_search(self):
                     return
         except Exception as e:
             print(e)
-        time.sleep(getDelay())
+        await asyncio.sleep(getDelay())
         self.totalresults += self.results
 
-    def get_people(self):
+    async def get_people(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        return rawres.people_linkedin()
+        return await rawres.people_linkedin()
 
-    def get_links(self):
+    async def get_links(self):
         links = myparser.Parser(self.totalresults, self.word)
-        return splitter(links.links_linkedin())
+        return splitter(await links.links_linkedin())
 
-    def process(self):
+    async def process(self):
         while self.counter < self.limit:
-            self.do_search()
-            time.sleep(getDelay())
+            await self.do_search()
+            await asyncio.sleep(getDelay())
             self.counter += 100
             print(f'\tSearching {self.counter} results.')
diff --git a/theHarvester/discovery/trello.py b/theHarvester/discovery/trello.py
index 65834ead..ff97ca29 100644
--- a/theHarvester/discovery/trello.py
+++ b/theHarvester/discovery/trello.py
@@ -2,7 +2,7 @@
 from theHarvester.parsers import myparser
 import requests
 import random
-import time
+import asyncio
 
 
 class SearchTrello:
@@ -18,54 +18,54 @@ def __init__(self, word):
         self.hostnames = []
         self.counter = 0
 
-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search?num=300&start=xx&hl=en&q=site%3Atrello.com%20{self.word}'
         urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 20) if num <= self.limit]
         # limit is 20 as that is the most results google will show per num
         headers = {'User-Agent': googleUA}
         for url in urls:
             try:
-                resp = requests.get(url, headers=headers)
-                self.results = resp.text
-                if search(self.results):
+                resp = await AsyncFetcher.fetch_all([url], headers=headers)
+                self.results = resp[0]
+                if await search(self.results):
                     try:
-                        self.results = google_workaround(base_url)
+                        self.results = await google_workaround(base_url)
                         if isinstance(self.results, bool):
                             print('Google is blocking your ip and the workaround, returning')
                             return
                     except Exception as e:
                         print(e)
                 self.totalresults += self.results
-                time.sleep(getDelay() - .5)
+                await asyncio.sleep(getDelay() - .5)
             except Exception as e:
                 print(f'An exception has occurred in trello: {e}')
 
-    def get_emails(self):
+    async def get_emails(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.emails()
 
-    def get_urls(self):
+    async def get_urls(self):
         try:
             rawres = myparser.Parser(self.totalresults, 'trello.com')
-            self.trello_urls = set(rawres.urls())
+            self.trello_urls = set(await rawres.urls())
             self.totalresults = ''  # reset what totalresults as before it was just google results now it is trello results
             headers = {'User-Agent': random.choice(['curl/7.37.0', 'Wget/1.19.4'])}  # do not change the headers
-            req = (grequests.get(url, headers=headers, timeout=4) for url in self.trello_urls)
-            responses = grequests.imap(req, size=8)
+            print('fetching trello urls')
+            responses = await AsyncFetcher.fetch_all(self.trello_urls, headers=headers)
             for response in responses:
-                self.totalresults += response.content.decode('UTF-8')
+                self.totalresults += response
             rawres = myparser.Parser(self.totalresults, self.word)
-            self.hostnames = rawres.hostnames()
+            self.hostnames = await rawres.hostnames()
         except Exception as e:
             print(f'Error occurred: {e}')
 
-    def process(self):
-        self.do_search()
-        self.get_urls()
+    async def process(self):
+        await self.do_search()
+        await self.get_urls()
         print(f'\tSearching {self.counter} results.')
 
-    def get_results(self) -> tuple:
-        return self.get_emails(), self.hostnames, self.trello_urls
+    async def get_results(self) -> tuple:
+        return await self.get_emails(), self.hostnames, self.trello_urls
diff --git a/theHarvester/discovery/twittersearch.py b/theHarvester/discovery/twittersearch.py
index b5f9f345..2351a5b9 100644
--- a/theHarvester/discovery/twittersearch.py
+++ b/theHarvester/discovery/twittersearch.py
@@ -1,3 +1,4 @@
+from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 import re
@@ -19,9 +20,19 @@ async def do_search(self):
         headers = {'User-Agent': Core.get_user_agent()}
         try:
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            responses = await AsyncFetcher.fetch_all(urls, headers=headers)
-            for response in responses:
-                self.totalresults += response
+            for url in urls:
+                response = await AsyncFetcher.fetch_all([url], headers=headers)
+                self.results = response[0]
+                if await search(self.results):
+                    try:
+                        self.results = await google_workaround(url)
+                        if isinstance(self.results, bool):
+                            print('Google is blocking your ip and the workaround, returning')
+                            return
+                    except Exception:
+                        # google blocked, no useful result
+                        return
+                self.totalresults += self.results
         except Exception as error:
             print(error)
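
With every discovery module above now exposing coroutines, callers have to drive them from an event loop. A minimal usage sketch follows; it is not part of the patch, the module path and constructor signature are taken from the diff itself, and the target domain is only an example:

    import asyncio
    from theHarvester.discovery import googlesearch

    async def run() -> None:
        # word, limit, start -- the same positional arguments the class already takes
        engine = googlesearch.SearchGoogle('example.com', 100, 0)
        await engine.process(google_dorking=False)  # scrape without dorks
        print(await engine.get_emails())
        print(await engine.get_hostnames())

    asyncio.run(run())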