mirror of
https://github.com/laramies/theHarvester.git
synced 2024-09-22 08:16:35 +08:00
Merge pull request #17 from NotoriousRebel/master
Implemented Google Work Around and more static typing.
This commit is contained in:
commit
38c9731262
|
@ -1,6 +1,5 @@
|
|||
import random
|
||||
|
||||
|
||||
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
|
||||
|
||||
|
||||
|
@ -8,7 +7,7 @@ def splitter(links):
|
|||
"""
|
||||
Method that tries to remove duplicates
|
||||
LinkedinLists pulls a lot of profiles with the same name.
|
||||
This method triest to remove duplicates from the list.
|
||||
This method tries to remove duplicates from the list.
|
||||
:param links: list of links to remove duplicates from
|
||||
:return: unique-ish list
|
||||
"""
|
||||
|
@ -43,32 +42,62 @@ def filter(lst):
|
|||
for item in lst:
|
||||
item = str(item)
|
||||
if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
|
||||
if '252f' in item:
|
||||
item = item.replace('252f', '')
|
||||
if '2F' in item:
|
||||
item = item.replace('2F', '')
|
||||
if '2f' in item:
|
||||
item = item.replace('2f', '')
|
||||
item = item.replace('252f', '').replace('2F', '').replace('2f', '')
|
||||
new_lst.append(item.lower())
|
||||
return new_lst
|
||||
|
||||
|
||||
def getDelay():
|
||||
def getDelay() -> int:
|
||||
return random.randint(1, 3) - .5
|
||||
|
||||
|
||||
def search(text):
|
||||
def search(text: str) -> bool:
|
||||
# Helper function to check if Google has blocked traffic.
|
||||
for line in text.strip().splitlines():
|
||||
if 'This page appears when Google automatically detects requests coming from your computer network' in line:
|
||||
print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
|
||||
if 'This page appears when Google automatically detects requests coming from your computer network' in line \
|
||||
or 'http://www.google.com/sorry/index' in line or 'https://www.google.com/sorry/index' in line:
|
||||
# print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def google_workaround(visit_url: str) -> str or bool:
|
||||
"""
|
||||
Function that makes a request on our behalf, if Google starts to block us
|
||||
:param visit_url: Url to scrape
|
||||
:return: Correct html that can be parsed by BS4
|
||||
"""
|
||||
import requests
|
||||
url = 'https://websniffer.cc/'
|
||||
data = {
|
||||
'Cookie': '',
|
||||
'url': visit_url,
|
||||
'submit': 'Submit',
|
||||
'type': 'GET&http=1.1',
|
||||
'uak': str(random.randint(4, 8)) # select random UA to send to Google
|
||||
}
|
||||
resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
|
||||
returned_html = resp.text
|
||||
if search(returned_html):
|
||||
# indicates that google is serving workaround a captcha
|
||||
# TODO rework workaround with more websites to send requests on our behalf or utilize proxies option in request
|
||||
return True
|
||||
# the html we get is malformed for BS4 as there are no greater than or less than signs
|
||||
if '<html>' in returned_html:
|
||||
start_index = returned_html.index('<html>')
|
||||
else:
|
||||
start_index = returned_html.index('<html')
|
||||
|
||||
end_index = returned_html.index('</html>') + 1
|
||||
correct_html = returned_html[start_index:end_index]
|
||||
# Slice list to get the response's html
|
||||
correct_html = ''.join([ch.strip().replace('&lt;', '<').replace('&gt;', '>') for ch in correct_html])
|
||||
return correct_html
|
||||
|
||||
|
||||
class MissingKey(Exception):
|
||||
|
||||
def __init__(self, identity_flag):
|
||||
def __init__(self, identity_flag: bool):
|
||||
if identity_flag:
|
||||
self.message = '\n\033[93m[!] Missing API key. \033[0m'
|
||||
else:
|
||||
|
|
|
@ -31,9 +31,15 @@ def do_search(self):
|
|||
print(e)
|
||||
self.results = r.text
|
||||
if search(self.results):
|
||||
time.sleep(getDelay() * 5) # Sleep for a longer time.
|
||||
else:
|
||||
time.sleep(getDelay())
|
||||
try:
|
||||
if isinstance(search(self.results), bool):
|
||||
print('Google is blocking your ip and the workaround, returning')
|
||||
return
|
||||
else:
|
||||
self.results = google_workaround(urly)
|
||||
except BaseException:
|
||||
pass
|
||||
time.sleep(getDelay())
|
||||
self.totalresults += self.results
|
||||
|
||||
def do_search_profiles(self):
|
||||
|
@ -49,9 +55,15 @@ def do_search_profiles(self):
|
|||
print(e)
|
||||
self.results = r.text
|
||||
if search(self.results):
|
||||
time.sleep(getDelay() * 5) # Sleep for a longer time.
|
||||
else:
|
||||
time.sleep(getDelay())
|
||||
try:
|
||||
if isinstance(search(self.results), bool):
|
||||
print('Google is blocking your ip and the workaround, returning')
|
||||
return
|
||||
else:
|
||||
self.results = google_workaround(urly)
|
||||
except BaseException:
|
||||
pass
|
||||
time.sleep(getDelay())
|
||||
self.totalresults += self.results
|
||||
|
||||
def get_emails(self):
|
||||
|
@ -137,9 +149,15 @@ def send_dorks(self): # Helper function to minimize code reusability.
|
|||
req = requests.get(link, headers=headers)
|
||||
self.results = req.text
|
||||
if search(self.results):
|
||||
time.sleep(getDelay() * 5) # Sleep for a longer time.
|
||||
else:
|
||||
time.sleep(getDelay())
|
||||
try:
|
||||
if isinstance(search(self.results), bool):
|
||||
print('Google is blocking your ip and the workaround, returning')
|
||||
return
|
||||
else:
|
||||
self.results = google_workaround(link)
|
||||
except BaseException:
|
||||
pass
|
||||
time.sleep(getDelay())
|
||||
self.totalresults += self.results
|
||||
except Exception as e:
|
||||
print(f'\tException Occurred {e}')
|
||||
|
|
|
@ -12,7 +12,6 @@ def __init__(self, word, limit):
|
|||
self.results = ""
|
||||
self.totalresults = ""
|
||||
self.server = 'www.google.com'
|
||||
self.userAgent = '(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6'
|
||||
self.quantity = '100'
|
||||
self.limit = int(limit)
|
||||
self.counter = 0
|
||||
|
@ -25,9 +24,19 @@ def do_search(self):
|
|||
try:
|
||||
headers = {'User-Agent': Core.get_user_agent()}
|
||||
r = requests.get(urly, headers=headers)
|
||||
self.results = r.text
|
||||
if search(self.results):
|
||||
try:
|
||||
if isinstance(search(self.results), bool):
|
||||
print('Google is blocking your ip and the workaround, returning')
|
||||
return
|
||||
else:
|
||||
self.results = google_workaround(urly)
|
||||
except BaseException:
|
||||
pass
|
||||
except Exception as e:
|
||||
print(e)
|
||||
self.results = r.text
|
||||
time.sleep(getDelay())
|
||||
self.totalresults += self.results
|
||||
|
||||
def get_people(self):
|
||||
|
|
Loading…
Reference in a new issue