mirror of
https://github.com/laramies/theHarvester.git
synced 2024-09-22 08:16:35 +08:00
Merge pull request #17 from NotoriousRebel/master
Implemented Google Work Around and more static typing.
This commit is contained in:
commit
38c9731262
|
@ -1,6 +1,5 @@
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
|
||||||
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
|
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
|
||||||
|
|
||||||
|
|
||||||
|
@ -8,7 +7,7 @@ def splitter(links):
|
||||||
"""
|
"""
|
||||||
Method that tries to remove duplicates
|
Method that tries to remove duplicates
|
||||||
LinkedinLists pulls a lot of profiles with the same name.
|
LinkedinLists pulls a lot of profiles with the same name.
|
||||||
This method triest to remove duplicates from the list.
|
This method tries to remove duplicates from the list.
|
||||||
:param links: list of links to remove duplicates from
|
:param links: list of links to remove duplicates from
|
||||||
:return: unique-ish list
|
:return: unique-ish list
|
||||||
"""
|
"""
|
||||||
|
@ -43,32 +42,62 @@ def filter(lst):
|
||||||
for item in lst:
|
for item in lst:
|
||||||
item = str(item)
|
item = str(item)
|
||||||
if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
|
if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
|
||||||
if '252f' in item:
|
item = item.replace('252f', '').replace('2F', '').replace('2f', '')
|
||||||
item = item.replace('252f', '')
|
|
||||||
if '2F' in item:
|
|
||||||
item = item.replace('2F', '')
|
|
||||||
if '2f' in item:
|
|
||||||
item = item.replace('2f', '')
|
|
||||||
new_lst.append(item.lower())
|
new_lst.append(item.lower())
|
||||||
return new_lst
|
return new_lst
|
||||||
|
|
||||||
|
|
||||||
def getDelay():
|
def getDelay() -> int:
|
||||||
return random.randint(1, 3) - .5
|
return random.randint(1, 3) - .5
|
||||||
|
|
||||||
|
|
||||||
def search(text):
|
def search(text: str) -> bool:
|
||||||
# Helper function to check if Google has blocked traffic.
|
# Helper function to check if Google has blocked traffic.
|
||||||
for line in text.strip().splitlines():
|
for line in text.strip().splitlines():
|
||||||
if 'This page appears when Google automatically detects requests coming from your computer network' in line:
|
if 'This page appears when Google automatically detects requests coming from your computer network' in line \
|
||||||
print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
|
or 'http://www.google.com/sorry/index' in line or 'https://www.google.com/sorry/index' in line:
|
||||||
|
# print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def google_workaround(visit_url: str) -> str or bool:
|
||||||
|
"""
|
||||||
|
Function that makes a request on our behalf, if Google starts to block us
|
||||||
|
:param visit_url: Url to scrape
|
||||||
|
:return: Correct html that can be parsed by BS4
|
||||||
|
"""
|
||||||
|
import requests
|
||||||
|
url = 'https://websniffer.cc/'
|
||||||
|
data = {
|
||||||
|
'Cookie': '',
|
||||||
|
'url': visit_url,
|
||||||
|
'submit': 'Submit',
|
||||||
|
'type': 'GET&http=1.1',
|
||||||
|
'uak': str(random.randint(4, 8)) # select random UA to send to Google
|
||||||
|
}
|
||||||
|
resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
|
||||||
|
returned_html = resp.text
|
||||||
|
if search(returned_html):
|
||||||
|
# indicates that google is serving workaround a captcha
|
||||||
|
# TODO rework workaround with more websites to send requests on our behalf or utilize proxies option in request
|
||||||
|
return True
|
||||||
|
# the html we get is malformed for BS4 as there are no greater than or less than signs
|
||||||
|
if '&lt;html&gt;' in returned_html:
|
||||||
|
start_index = returned_html.index('&lt;html&gt;')
|
||||||
|
else:
|
||||||
|
start_index = returned_html.index('&lt;html')
|
||||||
|
|
||||||
|
end_index = returned_html.index('&lt;/html&gt;') + 1
|
||||||
|
correct_html = returned_html[start_index:end_index]
|
||||||
|
# Slice list to get the response's html
|
||||||
|
correct_html = ''.join([ch.strip().replace('&lt;', '<').replace('&gt;', '>') for ch in correct_html])
|
||||||
|
return correct_html
|
||||||
|
|
||||||
|
|
||||||
class MissingKey(Exception):
|
class MissingKey(Exception):
|
||||||
|
|
||||||
def __init__(self, identity_flag):
|
def __init__(self, identity_flag: bool):
|
||||||
if identity_flag:
|
if identity_flag:
|
||||||
self.message = '\n\033[93m[!] Missing API key. \033[0m'
|
self.message = '\n\033[93m[!] Missing API key. \033[0m'
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -31,9 +31,15 @@ def do_search(self):
|
||||||
print(e)
|
print(e)
|
||||||
self.results = r.text
|
self.results = r.text
|
||||||
if search(self.results):
|
if search(self.results):
|
||||||
time.sleep(getDelay() * 5) # Sleep for a longer time.
|
try:
|
||||||
else:
|
if isinstance(search(self.results), bool):
|
||||||
time.sleep(getDelay())
|
print('Google is blocking your ip and the workaround, returning')
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
self.results = google_workaround(urly)
|
||||||
|
except BaseException:
|
||||||
|
pass
|
||||||
|
time.sleep(getDelay())
|
||||||
self.totalresults += self.results
|
self.totalresults += self.results
|
||||||
|
|
||||||
def do_search_profiles(self):
|
def do_search_profiles(self):
|
||||||
|
@ -49,9 +55,15 @@ def do_search_profiles(self):
|
||||||
print(e)
|
print(e)
|
||||||
self.results = r.text
|
self.results = r.text
|
||||||
if search(self.results):
|
if search(self.results):
|
||||||
time.sleep(getDelay() * 5) # Sleep for a longer time.
|
try:
|
||||||
else:
|
if isinstance(search(self.results), bool):
|
||||||
time.sleep(getDelay())
|
print('Google is blocking your ip and the workaround, returning')
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
self.results = google_workaround(urly)
|
||||||
|
except BaseException:
|
||||||
|
pass
|
||||||
|
time.sleep(getDelay())
|
||||||
self.totalresults += self.results
|
self.totalresults += self.results
|
||||||
|
|
||||||
def get_emails(self):
|
def get_emails(self):
|
||||||
|
@ -137,9 +149,15 @@ def send_dorks(self): # Helper function to minimize code reusability.
|
||||||
req = requests.get(link, headers=headers)
|
req = requests.get(link, headers=headers)
|
||||||
self.results = req.text
|
self.results = req.text
|
||||||
if search(self.results):
|
if search(self.results):
|
||||||
time.sleep(getDelay() * 5) # Sleep for a longer time.
|
try:
|
||||||
else:
|
if isinstance(search(self.results), bool):
|
||||||
time.sleep(getDelay())
|
print('Google is blocking your ip and the workaround, returning')
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
self.results = google_workaround(link)
|
||||||
|
except BaseException:
|
||||||
|
pass
|
||||||
|
time.sleep(getDelay())
|
||||||
self.totalresults += self.results
|
self.totalresults += self.results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f'\tException Occurred {e}')
|
print(f'\tException Occurred {e}')
|
||||||
|
|
|
@ -12,7 +12,6 @@ def __init__(self, word, limit):
|
||||||
self.results = ""
|
self.results = ""
|
||||||
self.totalresults = ""
|
self.totalresults = ""
|
||||||
self.server = 'www.google.com'
|
self.server = 'www.google.com'
|
||||||
self.userAgent = '(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6'
|
|
||||||
self.quantity = '100'
|
self.quantity = '100'
|
||||||
self.limit = int(limit)
|
self.limit = int(limit)
|
||||||
self.counter = 0
|
self.counter = 0
|
||||||
|
@ -25,9 +24,19 @@ def do_search(self):
|
||||||
try:
|
try:
|
||||||
headers = {'User-Agent': Core.get_user_agent()}
|
headers = {'User-Agent': Core.get_user_agent()}
|
||||||
r = requests.get(urly, headers=headers)
|
r = requests.get(urly, headers=headers)
|
||||||
|
self.results = r.text
|
||||||
|
if search(self.results):
|
||||||
|
try:
|
||||||
|
if isinstance(search(self.results), bool):
|
||||||
|
print('Google is blocking your ip and the workaround, returning')
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
self.results = google_workaround(urly)
|
||||||
|
except BaseException:
|
||||||
|
pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
self.results = r.text
|
time.sleep(getDelay())
|
||||||
self.totalresults += self.results
|
self.totalresults += self.results
|
||||||
|
|
||||||
def get_people(self):
|
def get_people(self):
|
||||||
|
|
Loading…
Reference in a new issue