Merge pull request #17 from NotoriousRebel/master

Implemented Google Work Around and more static typing.
This commit is contained in:
J.Townsend 2019-09-12 21:02:07 +01:00 committed by GitHub
commit 38c9731262
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 24 deletions

View file

@@ -1,6 +1,5 @@
import random
from typing import Union
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
@@ -8,7 +7,7 @@ def splitter(links):
"""
Method that tries to remove duplicates
LinkedinLists pulls a lot of profiles with the same name.
This method triest to remove duplicates from the list.
This method tries to remove duplicates from the list.
:param links: list of links to remove duplicates from
:return: unique-ish list
"""
@@ -43,32 +42,62 @@ def filter(lst):
for item in lst:
item = str(item)
if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
if '252f' in item:
item = item.replace('252f', '')
if '2F' in item:
item = item.replace('2F', '')
if '2f' in item:
item = item.replace('2f', '')
item = item.replace('252f', '').replace('2F', '').replace('2f', '')
new_lst.append(item.lower())
return new_lst
def getDelay() -> float:
    """Return a random delay (in seconds) to pace automated requests.

    randint(1, 3) - .5 always yields one of 0.5, 1.5 or 2.5 — a float —
    so the return annotation is ``float``, not ``int``.

    :return: 0.5, 1.5 or 2.5
    """
    return random.randint(1, 3) - .5
def search(text: str) -> bool:
    """Check whether *text* is a Google traffic-block / captcha page.

    Scans each line for Google's automated-traffic notice or for a
    redirect to the ``/sorry/`` endpoint (either scheme).

    :param text: raw HTML returned by a Google request
    :return: True when Google is blocking the requests, False otherwise
    """
    block_markers = (
        'This page appears when Google automatically detects requests coming from your computer network',
        'http://www.google.com/sorry/index',
        'https://www.google.com/sorry/index',
    )
    for line in text.strip().splitlines():
        if any(marker in line for marker in block_markers):
            return True
    return False
def google_workaround(visit_url: str) -> Union[str, bool]:
    """Fetch *visit_url* through websniffer.cc when Google blocks us directly.

    Makes the request on our behalf via a relay site, then extracts and
    unescapes the target page's HTML so BS4 can parse it.

    :param visit_url: URL to scrape via the relay
    :return: the cleaned-up HTML string, or True when the relay itself was
             served a captcha (or no HTML document could be located)
    """
    import requests  # local import so the module loads without requests installed
    url = 'https://websniffer.cc/'
    data = {
        'Cookie': '',
        'url': visit_url,
        'submit': 'Submit',
        'type': 'GET&http=1.1',
        'uak': str(random.randint(4, 8)),  # select random UA to send to Google
    }
    resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
    returned_html = resp.text
    if search(returned_html):
        # Google is serving the workaround a captcha as well.
        # TODO rework workaround with more relay websites or utilize the proxies option in requests
        return True
    # Locate the document; the relay sometimes emits '<html ...>' rather than '<html>'.
    start_index = returned_html.find('<html>')
    if start_index == -1:
        start_index = returned_html.find('<html')
    end_index = returned_html.find('</html>')
    if start_index == -1 or end_index == -1:
        # No recognisable HTML document in the relay output; signal failure
        # instead of raising (callers treat True as "workaround blocked").
        return True
    # Keep the closing tag in full: + len('</html>'), not + 1 which kept only '<'.
    correct_html = returned_html[start_index:end_index + len('</html>')]
    # The relay escapes the page, so angle brackets arrive as '&lt;'/'&gt;'.
    # Unescape on the whole string (the original per-character replace could
    # never match a 4-character entity) and drop per-line edge whitespace.
    correct_html = ''.join(line.strip() for line in correct_html.splitlines())
    return correct_html.replace('&lt;', '<').replace('&gt;', '>')
class MissingKey(Exception):
def __init__(self, identity_flag):
def __init__(self, identity_flag: bool):
if identity_flag:
self.message = '\n\033[93m[!] Missing API key. \033[0m'
else:

View file

@@ -31,9 +31,15 @@ def do_search(self):
print(e)
self.results = r.text
if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time.
else:
time.sleep(getDelay())
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results
def do_search_profiles(self):
@@ -49,9 +55,15 @@ def do_search_profiles(self):
print(e)
self.results = r.text
if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time.
else:
time.sleep(getDelay())
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results
def get_emails(self):
@@ -137,9 +149,15 @@ def send_dorks(self):  # Helper function to minimize code reuse.
req = requests.get(link, headers=headers)
self.results = req.text
if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time.
else:
time.sleep(getDelay())
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(link)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results
except Exception as e:
print(f'\tException Occurred {e}')

View file

@@ -12,7 +12,6 @@ def __init__(self, word, limit):
self.results = ""
self.totalresults = ""
self.server = 'www.google.com'
self.userAgent = '(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6'
self.quantity = '100'
self.limit = int(limit)
self.counter = 0
@@ -25,9 +24,19 @@ def do_search(self):
try:
headers = {'User-Agent': Core.get_user_agent()}
r = requests.get(urly, headers=headers)
self.results = r.text
if search(self.results):
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
except Exception as e:
print(e)
self.results = r.text
time.sleep(getDelay())
self.totalresults += self.results
def get_people(self):