Merge pull request #66 from NotoriousRebel/master

Ported github-code to use aiohttp.
J.Townsend 2020-01-04 05:52:48 +00:00 committed by GitHub
commit c6e8077af9
7 changed files with 257 additions and 128 deletions

.gitignore (vendored): 2 changes

@@ -10,4 +10,4 @@ venv
.pytest_cache
build/
dist/
theHarvester.egg-info/
api-keys.yaml


@@ -1,7 +1,12 @@
from theHarvester.lib.core import *
from typing import Union
import random
import aiohttp
import re
from bs4 import BeautifulSoup
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
googleUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 ' \
'Safari/537.36 '
def splitter(links):
@@ -52,7 +57,7 @@ def getDelay() -> float:
return random.randint(1, 3) - .5
def search(text: str) -> bool:
async def search(text: str) -> bool:
# Helper function to check if Google has blocked traffic.
for line in text.strip().splitlines():
if 'This page appears when Google automatically detects requests coming from your computer network' in line \
@@ -62,13 +67,12 @@ def search(text: str) -> bool:
return False
def google_workaround(visit_url: str) -> Union[bool, str]:
async def google_workaround(visit_url: str) -> Union[bool, str]:
"""
Makes a request on our behalf if Google starts to block us.
:param visit_url: URL to scrape
:return: Correct HTML that can be parsed by BeautifulSoup
"""
import requests
url = 'https://websniffer.cc/'
data = {
'Cookie': '',
@@ -77,12 +81,20 @@ def google_workaround(visit_url: str) -> Union[bool, str]:
'type': 'GET&http=1.1',
'uak': str(random.randint(4, 8)) # select random UA to send to Google
}
resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
returned_html = resp.text
if search(returned_html):
import requests
returned_html = requests.post(url, data=data, headers={'User-Agent': Core.get_user_agent()})
returned_html = returned_html.text
# TODO FIX
#returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data)
import pprint as p
print('returned html')
p.pprint(returned_html, indent=4)
returned_html = "This page appears when Google automatically detects requests coming from your computer network"
if await search(returned_html):
print('going to second method!')
# indicates that Google is serving the workaround a captcha
# TODO rework workaround with more websites that send requests on our behalf, or utilize the proxies option in requests
return True
# That means we will try our second option, which utilizes proxies
return await second_method(visit_url)
# the html we get is malformed for BS4 as there are no greater than or less than signs
if '<html>' in returned_html:
start_index = returned_html.index('<html>')
@@ -96,6 +108,104 @@ def google_workaround(visit_url: str) -> Union[bool, str]:
return correct_html
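The TODO above notes that the async POST helper is not working yet; a minimal aiohttp-only sketch of the same form POST, assuming the websniffer.cc fields shown above stay unchanged (the helper name is hypothetical, not the committed fix):

import aiohttp

async def post_websniffer(url: str, data: dict, user_agent: str) -> str:
    # Plain aiohttp form POST, mirroring the blocking requests.post call above.
    async with aiohttp.ClientSession(headers={'User-Agent': user_agent}) as session:
        async with session.post(url, data=data) as resp:
            return await resp.text()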
async def request(url, params):
headers = {'User-Agent': Core.get_user_agent()}
session = aiohttp.ClientSession(headers=headers)
results = await AsyncFetcher.fetch(session, url=url, params=params)
await session.close()
return results
async def proxy_fetch(session, url, proxy):
try:
async with session.get(url, proxy=proxy, ssl=False) as resp:
return f'success:{proxy}', await resp.text()
except Exception as e:
# print(e)
return f'failed:{proxy}', proxy
async def proxy_test(proxies, url):
print('doing proxy test with this number of proxies: ', len(proxies))
headers = {'User-Agent': Core.get_user_agent()}
timeout = aiohttp.ClientTimeout(total=40)
async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
texts = await asyncio.gather(*[proxy_fetch(session, url, proxy) for proxy in proxies])
return texts
async def get_proxies():
print('inside get proxies')
# ideas borrowed and modified from twitterscraper
proxy_url = 'https://free-proxy-list.net/'
response = await AsyncFetcher.fetch_all([proxy_url])
response = response[0]
soup = BeautifulSoup(response, 'lxml')
table = soup.find('table', id='proxylisttable')
list_tr = table.find_all('tr')
list_td = [elem.find_all('td') for elem in list_tr]
list_td = [x for x in list_td if x is not None and len(x) > 0]
list_ip = [elem[0].text for elem in list_td]
list_ports = [elem[1].text for elem in list_td]
list_proxies = [f"http://{':'.join(elem)}" for elem in list(zip(list_ip, list_ports))]
return list_proxies
async def clean_dct(dct: dict, second_test=False):
print('cleaning dct and second test is: ', second_test)
good_proxies = set()
for proxy, text in dct.items():
if 'failed' not in proxy:
if second_test:
if await search(text) is False:
print(text)
return text
else:
good_proxies.add(proxy[proxy.find(':') + 1:])
return good_proxies if second_test is False else True
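Taken together, these helpers are meant to be driven roughly as in the sketch below; the test URL and the standalone driver function are illustrative, not part of the commit:

import asyncio

async def demo_filter_proxies():
    # Illustrative driver for the helpers above: scrape candidate proxies,
    # probe them against a neutral page, then keep the ones that answered cleanly.
    candidates = await get_proxies()
    checked = await proxy_test(candidates, 'http://example.com')
    return await clean_dct(dict(checked))

# working = asyncio.run(demo_filter_proxies())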
async def create_init_proxies():
print('inside create init proxies')
url = "https://suip.biz"
first_param = [url, (('act', 'proxy1'),), ]
second_param = [url, (('act', 'proxy2'),), ]
third_param = [url, (('act', 'proxy3'),), ]
async_requests = [
request(url=url, params=params)
for url, params in [first_param, second_param, third_param]
]
results = await asyncio.gather(*async_requests)
proxy_set = set()
for resp in results:
ip_candidates = re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', resp)
proxy_set.update({f'http://{ip}' for ip in ip_candidates})
new_proxies = await get_proxies()
proxy_set.update({proxy for proxy in new_proxies})
return proxy_set
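The regular expression above pulls ip:port pairs out of the plain-text proxy lists returned by suip.biz; a quick illustration (the sample text is made up):

import re

sample = '198.51.100.7:8080 alive\n203.0.113.9:3128 alive\n'  # made-up output
print(re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', sample))
# ['198.51.100.7:8080', '203.0.113.9:3128']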
async def second_method(url: str) -> Union[str, bool]:
print('inside second method')
# First visit example.com to filter out bad proxies
init_url = "http://example.com"
proxy_set = await create_init_proxies()
tuples = await proxy_test(proxy_set, init_url)
mega_dct = dict((x, y) for x, y in tuples)
proxy_set = await clean_dct(mega_dct)
# After cleaning our proxy set, use the working proxies to visit the url we care about
print('got working proxies now onto the juice')
tuples = await proxy_test(proxy_set, url)
mega_dct = dict((x, y) for x, y in tuples)
results = await clean_dct(mega_dct, second_test=True)
print('returning the juice')
# pass in second_test flag as True to indicate the return will be
# the text we care about, or a bool to indicate it was
# not successful
return results
class MissingKey(Exception):
def __init__(self, identity_flag: bool):


@@ -1,10 +1,9 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
from requests import Response
import time
from typing import List, Dict, Any, Optional, NamedTuple
from typing import List, Dict, Any, Optional, NamedTuple, Tuple
import asyncio
import aiohttp
import urllib.parse as urlparse
@@ -40,20 +39,21 @@ def __init__(self, word, limit):
raise MissingKey(True)
@staticmethod
def fragments_from_response(response: Response) -> List[str]:
items: List[Dict[str, Any]] = response.json().get('items') or list()
async def fragments_from_response(json_data: dict) -> List[str]:
items: List[Dict[str, Any]] = json_data.get('items') or list()
fragments: List[str] = list()
for item in items:
matches = item.get("text_matches") or list()
for match in matches:
fragments.append(match.get("fragment"))
return [fragment for fragment in fragments if fragment is not None]
@staticmethod
def page_from_response(page: str, response: Response) -> Optional[Any]:
page_link = response.links.get(page)
async def page_from_response(page: str, links) -> Optional[Any]:
page_link = links.get(page)
if page_link:
parsed = urlparse.urlparse(page_link.get("url"))
parsed = urlparse.urlparse(str(page_link.get("url")))
params = urlparse.parse_qs(parsed.query)
pages: List[Any] = params.get('page', [None])
page_number = pages[0] and int(pages[0])
@@ -61,21 +61,22 @@ def page_from_response(page: str, response: Response) -> Optional[Any]:
else:
return None
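The str() cast added in page_from_response above is needed because aiohttp parses the Link header into yarl.URL objects, where requests exposed plain strings; a minimal sketch of the difference (the example URL is made up):

import urllib.parse as urlparse
from yarl import URL

# Illustrative only: aiohttp's resp.links values carry yarl.URL objects,
# which urlparse will not accept directly, hence the str() cast above.
link_url = URL('https://api.github.com/search/code?q=%22example%22&page=2')
parsed = urlparse.urlparse(str(link_url))
print(urlparse.parse_qs(parsed.query).get('page', [None])[0])  # prints 2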
def handle_response(self, response: Response) -> Optional[Any]:
if response.ok:
results = self.fragments_from_response(response)
next_page = self.page_from_response("next", response)
last_page = self.page_from_response("last", response)
async def handle_response(self, response: Tuple[str, dict, int, Any]):
text, json_data, status, links = response
if status == 200:
results = await self.fragments_from_response(json_data)
next_page = await self.page_from_response("next", links)
last_page = await self.page_from_response("last", links)
return SuccessResult(results, next_page, last_page)
elif response.status_code == 429 or response.status_code == 403:
elif status == 429 or status == 403:
return RetryResult(60)
else:
try:
return ErrorResult(response.status_code, response.json())
return ErrorResult(status, json_data)
except ValueError:
return ErrorResult(response.status_code, response.text)
return ErrorResult(status, text)
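SuccessResult, RetryResult, and ErrorResult are NamedTuples defined elsewhere in this module and are not shown in this diff; from the attribute accesses above their shape is roughly the following (a hedged reconstruction, field names and types inferred, not the module's own definitions):

from typing import Any, List, NamedTuple, Optional

# Inferred from usage above (result.fragments, result.next_page, result.last_page,
# result.time, result.status_code, result.body); field order is an assumption.
class SuccessResult(NamedTuple):
    fragments: List[str]
    next_page: Optional[int]
    last_page: Optional[int]

class RetryResult(NamedTuple):
    time: float

class ErrorResult(NamedTuple):
    status_code: int
    body: Any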
def do_search(self, page: Optional[int]) -> Response:
async def do_search(self, page: Optional[int]) -> Tuple[str, dict, int, Any]:
if page is None:
url = f'https://{self.server}/search/code?q="{self.word}"'
else:
@@ -84,37 +85,41 @@ def do_search(self, page: Optional[int]) -> Response:
'Host': self.server,
'User-agent': Core.get_user_agent(),
'Accept': "application/vnd.github.v3.text-match+json",
'Authorization': 'token {}'.format(self.key)
'Authorization': f'token {self.key}'
}
return requests.get(url=url, headers=headers, verify=True)
async with aiohttp.ClientSession(headers=headers) as sess:
async with sess.get(url) as resp:
return await resp.text(), await resp.json(), resp.status, resp.links
@staticmethod
def next_page_or_end(result: SuccessResult) -> Optional[int]:
async def next_page_or_end(result: SuccessResult) -> Optional[int]:
if result.next_page is not None:
return result.next_page
else:
return result.last_page
def process(self):
while self.counter <= self.limit and self.page is not None:
api_response = self.do_search(self.page)
result = self.handle_response(api_response)
if type(result) == SuccessResult:
print(f'\tSearching {self.counter} results.')
for fragment in result.fragments:
self.total_results += fragment
self.counter = self.counter + 1
self.page = self.next_page_or_end(result)
time.sleep(getDelay())
elif type(result) == RetryResult:
sleepy_time = getDelay() + result.time
print(f'\tRetrying page in {sleepy_time} seconds...')
time.sleep(sleepy_time)
elif type(result) == ErrorResult:
raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
else:
raise Exception("\tUnknown exception occurred")
async def process(self):
try:
while self.counter <= self.limit and self.page is not None:
api_response = await self.do_search(self.page)
result = await self.handle_response(api_response)
if type(result) == SuccessResult:
print(f'\tSearching {self.counter} results.')
for fragment in result.fragments:
self.total_results += fragment
self.counter = self.counter + 1
self.page = await self.next_page_or_end(result)
await asyncio.sleep(getDelay())
elif type(result) == RetryResult:
sleepy_time = getDelay() + result.time
print(f'\tRetrying page in {sleepy_time} seconds...')
await asyncio.sleep(sleepy_time)
elif type(result) == ErrorResult:
raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
else:
raise Exception("\tUnknown exception occurred")
except Exception as e:
print(f'An exception has occurred: {e}')
async def get_emails(self):
rawres = myparser.Parser(self.total_results, self.word)


@@ -1,8 +1,6 @@
from theHarvester.discovery.constants import *
from theHarvester.parsers import myparser
import requests
import time
import asyncio
class SearchGoogle:
@@ -18,85 +16,91 @@ def __init__(self, word, limit, start):
self.limit = limit
self.counter = start
def do_search(self):
async def do_search(self):
# Do normal scraping.
urly = 'http://' + self.server + '/search?num=' + self.quantity + '&start=' + str(
self.counter) + '&hl=en&meta=&q=%40\"' + self.word + '\"'
try:
headers = {'User-Agent': googleUA}
r = requests.get(urly, headers=headers)
resp = await AsyncFetcher.fetch_all([urly], headers=headers)
except Exception as e:
print(e)
self.results = r.text
if search(self.results):
self.results = resp[0]
searched = await search(self.results)
if searched:
try:
self.results = google_workaround(urly)
self.results = await google_workaround(urly)
print('self.results: ', self.results)
p.pprint(self.results, indent=4)
if isinstance(self.results, bool):
print('Google is blocking your ip and the workaround, returning')
return
except Exception:
except Exception as e:
print(e)
import traceback as t
t.print_exc()
# google blocked, no useful result
return
time.sleep(getDelay())
await asyncio.sleep(getDelay())
self.totalresults += self.results
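The conversion repeated through this file, blocking requests.get plus time.sleep replaced by AsyncFetcher.fetch_all plus asyncio.sleep, looks like this in isolation; a sketch reusing the helpers imported at the top of the file (the wrapper function itself is hypothetical):

import asyncio

async def fetch_one_page(url: str) -> str:
    # AsyncFetcher.fetch_all takes a list of URLs and returns a list of bodies,
    # hence the resp[0] indexing used throughout this module.
    headers = {'User-Agent': Core.get_user_agent()}
    resp = await AsyncFetcher.fetch_all([url], headers=headers)
    await asyncio.sleep(getDelay())  # keep the randomized delay from constants.py
    return resp[0]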
def do_search_profiles(self):
async def do_search_profiles(self):
urly = 'http://' + self.server + '/search?num=' + self.quantity + '&start=' + str(
self.counter) + '&hl=en&meta=&q=site:www.google.com%20intitle:\"Google%20Profile\"%20\"Companies%20I%27ve%20worked%20for\"%20\"at%20' + self.word + '\"'
try:
headers = {'User-Agent': googleUA}
r = requests.get(urly, headers=headers)
resp = await AsyncFetcher.fetch_all([urly], headers=headers)
except Exception as e:
print(e)
self.results = r.text
if search(self.results):
self.results = resp[0]
if await search(self.results):
try:
self.results = google_workaround(urly)
self.results = await google_workaround(urly)
if isinstance(self.results, bool):
print('Google is blocking your ip and the workaround, returning')
return
except Exception:
# google blocked, no useful result
return
time.sleep(getDelay())
await asyncio.sleep(getDelay())
self.totalresults += self.results
def get_emails(self):
async def get_emails(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.emails()
return await rawres.emails()
def get_hostnames(self):
async def get_hostnames(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.hostnames()
return await rawres.hostnames()
def get_files(self):
async def get_files(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.fileurls(self.files)
def get_profiles(self):
async def get_profiles(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.profiles()
def process(self, google_dorking):
async def process(self, google_dorking):
if google_dorking is False:
while self.counter <= self.limit and self.counter <= 1000:
self.do_search()
await self.do_search()
print(f'\tSearching {self.counter} results.')
self.counter += 100
else: # Google dorking is true.
self.counter = 0 # Reset counter.
print('\n')
print('[-] Searching with Google Dorks: ')
self.googledork() # Call Google dorking method if user wanted it!
await self.googledork() # Call Google dorking method if user wanted it!
def process_profiles(self):
async def process_profiles(self):
while self.counter < self.limit:
self.do_search_profiles()
time.sleep(getDelay())
await self.do_search_profiles()
await asyncio.sleep(getDelay())
self.counter += 100
print(f'\tSearching {self.counter} results.')
def append_dorks(self):
async def append_dorks(self):
# Wrap in try-except in case filepaths are messed up.
try:
with open('wordlists/dorks.txt', mode='r') as fp:
@@ -104,7 +108,7 @@ def append_dorks(self):
except FileNotFoundError as error:
print(error)
def construct_dorks(self):
async def construct_dorks(self):
# Format is: site:targetwebsite.com + space + inurl:admindork
colon = '%3A'
plus = '%2B'
@@ -128,12 +132,12 @@ def construct_dorks(self):
.replace('&', ampersand).replace('(', left_peren).replace(')', right_peren).replace('|', pipe) + space + self.word
for dork in self.dorks)
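To make the encoding in construct_dorks concrete, a hedged example: the dork string is hypothetical, and urllib.parse.quote is used here only because it produces the same percent-escapes as the replace() chain above for the characters it handles, before the target word is appended:

import urllib.parse

dork = 'inurl:admin | intitle:"login"'  # hypothetical dork
encoded = urllib.parse.quote(dork) + '%20' + 'example.com'
print(encoded)  # inurl%3Aadmin%20%7C%20intitle%3A%22login%22%20example.com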
def googledork(self):
self.append_dorks() # Call functions to create list.
self.construct_dorks()
self.send_dorks()
async def googledork(self):
await self.append_dorks() # Call functions to create list.
await self.construct_dorks()
await self.send_dorks()
def send_dorks(self): # Helper function to minimize code duplication.
async def send_dorks(self): # Helper function to minimize code duplication.
headers = {'User-Agent': googleUA}
# Get random user agent to try and prevent google from blocking IP.
for num in range(len(self.links)):
@@ -141,18 +145,18 @@ def send_dorks(self): # Helper function to minimize code duplication.
if num % 10 == 0 and num > 0:
print(f'\tSearching through {num} results')
link = self.links[num]
req = requests.get(link, headers=headers)
self.results = req.text
if search(self.results):
req = await AsyncFetcher.fetch_all([link], headers=headers)
self.results = req[0]
if await search(self.results):
try:
self.results = google_workaround(link)
self.results = await google_workaround(link)
if isinstance(self.results, bool):
print('Google is blocking your ip and the workaround, returning')
return
except Exception:
# google blocked, no useful result
return
time.sleep(getDelay())
await asyncio.sleep(getDelay())
self.totalresults += self.results
except Exception as e:
print(f'\tException Occurred {e}')


@@ -1,8 +1,7 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
import time
import asyncio
class SearchLinkedin:
@@ -16,15 +15,15 @@ def __init__(self, word, limit):
self.limit = int(limit)
self.counter = 0
def do_search(self):
async def do_search(self):
urly = 'http://' + self.server + '/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.word
try:
headers = {'User-Agent': Core.get_user_agent()}
r = requests.get(urly, headers=headers)
self.results = r.text
if search(self.results):
resp = await AsyncFetcher.fetch_all([urly], headers=headers)
self.results = resp[0]
if await search(self.results):
try:
self.results = google_workaround(urly)
self.results = await google_workaround(urly)
if isinstance(self.results, bool):
print('Google is blocking your ip and the workaround, returning')
return
@@ -33,20 +32,20 @@ def do_search(self):
return
except Exception as e:
print(e)
time.sleep(getDelay())
await asyncio.sleep(getDelay())
self.totalresults += self.results
def get_people(self):
async def get_people(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.people_linkedin()
return await rawres.people_linkedin()
def get_links(self):
async def get_links(self):
links = myparser.Parser(self.totalresults, self.word)
return splitter(links.links_linkedin())
return splitter(await links.links_linkedin())
def process(self):
async def process(self):
while self.counter < self.limit:
self.do_search()
time.sleep(getDelay())
await self.do_search()
await asyncio.sleep(getDelay())
self.counter += 100
print(f'\tSearching {self.counter} results.')


@@ -2,7 +2,7 @@
from theHarvester.parsers import myparser
import requests
import random
import time
import asyncio
class SearchTrello:
@@ -18,54 +18,54 @@ def __init__(self, word):
self.hostnames = []
self.counter = 0
def do_search(self):
async def do_search(self):
base_url = f'https://{self.server}/search?num=300&start=xx&hl=en&q=site%3Atrello.com%20{self.word}'
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 20) if num <= self.limit]
# the step is 20 as that is the most results Google will show per num
headers = {'User-Agent': googleUA}
for url in urls:
try:
resp = requests.get(url, headers=headers)
self.results = resp.text
if search(self.results):
resp = await AsyncFetcher.fetch_all([url], headers=headers)
self.results = resp[0]
if await search(self.results):
try:
self.results = google_workaround(base_url)
self.results = await google_workaround(base_url)
if isinstance(self.results, bool):
print('Google is blocking your ip and the workaround, returning')
return
except Exception as e:
print(e)
self.totalresults += self.results
time.sleep(getDelay() - .5)
await asyncio.sleep(getDelay() - .5)
except Exception as e:
print(f'An exception has occurred in trello: {e}')
def get_emails(self):
async def get_emails(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.emails()
def get_urls(self):
async def get_urls(self):
try:
rawres = myparser.Parser(self.totalresults, 'trello.com')
self.trello_urls = set(rawres.urls())
self.trello_urls = set(await rawres.urls())
self.totalresults = ''
# reset totalresults; before it was just google results, now it will be trello results
headers = {'User-Agent': random.choice(['curl/7.37.0', 'Wget/1.19.4'])}
# do not change the headers
req = (grequests.get(url, headers=headers, timeout=4) for url in self.trello_urls)
responses = grequests.imap(req, size=8)
print('fetching trello urls')
responses = await AsyncFetcher.fetch_all(self.trello_urls, headers=headers)
for response in responses:
self.totalresults += response.content.decode('UTF-8')
self.totalresults += response
rawres = myparser.Parser(self.totalresults, self.word)
self.hostnames = rawres.hostnames()
self.hostnames = await rawres.hostnames()
except Exception as e:
print(f'Error occurred: {e}')
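The removed grequests.imap call is replaced by AsyncFetcher.fetch_all, which hands back each body as a string, hence the plain self.totalresults += response with no .content.decode. A bare-aiohttp equivalent of that gather pattern, for reference only (a sketch, not the library helper itself):

import asyncio
import aiohttp

async def fetch_all_texts(urls, headers=None):
    # Fetch every URL concurrently and return the decoded bodies in order,
    # mirroring how the results are consumed above (plain strings).
    async with aiohttp.ClientSession(headers=headers) as session:
        async def fetch(url):
            async with session.get(url) as resp:
                return await resp.text()
        return await asyncio.gather(*[fetch(url) for url in urls])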
def process(self):
self.do_search()
self.get_urls()
async def process(self):
await self.do_search()
await self.get_urls()
print(f'\tSearching {self.counter} results.')
def get_results(self) -> tuple:
return self.get_emails(), self.hostnames, self.trello_urls
async def get_results(self) -> tuple:
return await self.get_emails(), self.hostnames, self.trello_urls


@@ -1,3 +1,4 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import re
@@ -19,9 +20,19 @@ async def do_search(self):
headers = {'User-Agent': Core.get_user_agent()}
try:
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
responses = await AsyncFetcher.fetch_all(urls, headers=headers)
for response in responses:
self.totalresults += response
for url in urls:
response = await AsyncFetcher.fetch_all([url], headers=headers)
self.results = response[0]
if await search(self.results):
try:
self.results = await google_workaround(url)
if isinstance(self.results, bool):
print('Google is blocking your ip and the workaround, returning')
return
except Exception:
# google blocked, no useful result
return
self.totalresults += self.results
except Exception as error:
print(error)
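The loop above trades the earlier single fetch_all(urls) call for one request per URL so the captcha check and the google_workaround fallback can run on each page before its text is accumulated; stripped of the surrounding class, the pattern is roughly as follows (a sketch reusing search and google_workaround from constants.py; the wrapper function is hypothetical):

async def fetch_pages_with_block_check(urls, headers) -> str:
    # Sketch of the per-page pattern above: fetch, test for a Google block,
    # fall back to google_workaround, and only then accumulate the text.
    total = ''
    for url in urls:
        page = (await AsyncFetcher.fetch_all([url], headers=headers))[0]
        if await search(page):
            page = await google_workaround(url)
            if isinstance(page, bool):
                return total  # the workaround failed as well; give up
        total += page
    return total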