Merge pull request #17 from NotoriousRebel/master

Implemented Google Work Around and more static typing.
This commit is contained in:
J.Townsend 2019-09-12 21:02:07 +01:00 committed by GitHub
commit 38c9731262
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 24 deletions

View file

@ -1,6 +1,5 @@
import random import random
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36' googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
@ -8,7 +7,7 @@ def splitter(links):
""" """
Method that tries to remove duplicates Method that tries to remove duplicates
LinkedinLists pulls a lot of profiles with the same name. LinkedinLists pulls a lot of profiles with the same name.
This method triest to remove duplicates from the list. This method tries to remove duplicates from the list.
:param links: list of links to remove duplicates from :param links: list of links to remove duplicates from
:return: unique-ish list :return: unique-ish list
""" """
@ -43,32 +42,62 @@ def filter(lst):
for item in lst: for item in lst:
item = str(item) item = str(item)
if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item): if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
if '252f' in item: item = item.replace('252f', '').replace('2F', '').replace('2f', '')
item = item.replace('252f', '')
if '2F' in item:
item = item.replace('2F', '')
if '2f' in item:
item = item.replace('2f', '')
new_lst.append(item.lower()) new_lst.append(item.lower())
return new_lst return new_lst
def getDelay() -> float:
    """
    Pick a random pause to use between requests.

    :return: 0.5, 1.5 or 2.5 (a random integer from 1 to 3, minus one half)
    """
    # The subtraction of .5 always yields a float, hence the float annotation.
    return random.randint(1, 3) - .5
def search(text: str) -> bool:
    """
    Helper function to check if Google has blocked traffic.

    :param text: body of an HTTP response
    :return: True if any line of ``text`` contains one of the block-page
             markers below, False otherwise
    """
    blocked_markers = (
        'This page appears when Google automatically detects requests coming from your computer network',
        'http://www.google.com/sorry/index',
        'https://www.google.com/sorry/index',
    )
    return any(
        marker in line
        for line in text.strip().splitlines()
        for marker in blocked_markers
    )
def google_workaround(visit_url: str) -> 'str | bool':
    """
    Function that makes a request on our behalf, if Google starts to block us.

    The request is relayed through https://websniffer.cc/. The html that comes
    back has its angle brackets escaped as ``&lt;`` / ``&gt;``, so they are
    restored before the text is returned.

    :param visit_url: Url to scrape
    :return: Correct html that can be parsed by BS4, or ``True`` when the
             relayed response is itself recognised by ``search`` as a Google
             block page
    :raises ValueError: if the response contains no ``<html`` or ``</html>``
    """
    # The return annotation is quoted so it is never evaluated at runtime
    # (the previous ``str or bool`` expression simply evaluated to ``str``).
    import requests

    url = 'https://websniffer.cc/'
    data = {
        'Cookie': '',
        'url': visit_url,
        'submit': 'Submit',
        'type': 'GET&http=1.1',
        'uak': str(random.randint(4, 8))  # select random UA to send to Google
    }
    # Without a timeout a stalled relay would block the caller indefinitely.
    resp = requests.post(url, headers={'User-Agent': googleUA}, data=data, timeout=30)
    returned_html = resp.text
    if search(returned_html):
        # indicates that google is serving workaround a captcha
        # TODO rework workaround with more websites to send requests on our behalf or utilize proxies option in request
        return True
    # the html we get is malformed for BS4 as there are no greater than or less than signs
    if '<html>' in returned_html:
        start_index = returned_html.index('<html>')
    else:
        start_index = returned_html.index('<html')
    closing_tag = '</html>'
    # Include the whole closing tag; adding just 1 kept only its '<'.
    end_index = returned_html.index(closing_tag) + len(closing_tag)
    correct_html = returned_html[start_index:end_index]
    # Work line by line: iterating the string character by character meant the
    # entity replacements could never match and every whitespace was dropped.
    stripped_lines = (line.strip() for line in correct_html.splitlines())
    return '\n'.join(stripped_lines).replace('&lt;', '<').replace('&gt;', '>')
class MissingKey(Exception): class MissingKey(Exception):
def __init__(self, identity_flag): def __init__(self, identity_flag: bool):
if identity_flag: if identity_flag:
self.message = '\n\033[93m[!] Missing API key. \033[0m' self.message = '\n\033[93m[!] Missing API key. \033[0m'
else: else:

View file

@ -31,9 +31,15 @@ def do_search(self):
print(e) print(e)
self.results = r.text self.results = r.text
if search(self.results): if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time. try:
else: if isinstance(search(self.results), bool):
time.sleep(getDelay()) print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results self.totalresults += self.results
def do_search_profiles(self): def do_search_profiles(self):
@ -49,9 +55,15 @@ def do_search_profiles(self):
print(e) print(e)
self.results = r.text self.results = r.text
if search(self.results): if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time. try:
else: if isinstance(search(self.results), bool):
time.sleep(getDelay()) print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results self.totalresults += self.results
def get_emails(self): def get_emails(self):
@ -137,9 +149,15 @@ def send_dorks(self): # Helper function to minimize code reusability.
req = requests.get(link, headers=headers) req = requests.get(link, headers=headers)
self.results = req.text self.results = req.text
if search(self.results): if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time. try:
else: if isinstance(search(self.results), bool):
time.sleep(getDelay()) print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(link)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results self.totalresults += self.results
except Exception as e: except Exception as e:
print(f'\tException Occurred {e}') print(f'\tException Occurred {e}')

View file

@ -12,7 +12,6 @@ def __init__(self, word, limit):
self.results = "" self.results = ""
self.totalresults = "" self.totalresults = ""
self.server = 'www.google.com' self.server = 'www.google.com'
self.userAgent = '(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6'
self.quantity = '100' self.quantity = '100'
self.limit = int(limit) self.limit = int(limit)
self.counter = 0 self.counter = 0
@ -25,9 +24,19 @@ def do_search(self):
try: try:
headers = {'User-Agent': Core.get_user_agent()} headers = {'User-Agent': Core.get_user_agent()}
r = requests.get(urly, headers=headers) r = requests.get(urly, headers=headers)
self.results = r.text
if search(self.results):
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
except Exception as e: except Exception as e:
print(e) print(e)
self.results = r.text time.sleep(getDelay())
self.totalresults += self.results self.totalresults += self.results
def get_people(self): def get_people(self):