Merge pull request #17 from NotoriousRebel/master

Implemented Google Work Around and more static typing.
2024-09-22 08:16:35 +08:00 · 2019-09-12 21:02:07 +01:00 · 2019-09-12 21:02:07 +01:00 · 38c9731262
parent be2a7e019e a833c1ec18
commit 38c9731262
3 changed files with 80 additions and 24 deletions
--- a/theHarvester/discovery/constants.py
+++ b/theHarvester/discovery/constants.py
@ -1,6 +1,5 @@
 import random
 googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
@ -8,7 +7,7 @@ def splitter(links):
    """
    Method that tries to remove duplicates
    LinkedinLists pulls a lot of profiles with the same name.
-    This method triest to remove duplicates from the list.
+    This method tries to remove duplicates from the list.
    :param links: list of links to remove duplicates from
    :return: unique-ish list
    """
@ -43,32 +42,62 @@ def filter(lst):
    for item in lst:
        item = str(item)
        if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
-            if '252f' in item:
+            item = item.replace('252f', '').replace('2F', '').replace('2f', '')
                item = item.replace('252f', '')
            if '2F' in item:
                item = item.replace('2F', '')
            if '2f' in item:
                item = item.replace('2f', '')
            new_lst.append(item.lower())
    return new_lst
-def getDelay():
+def getDelay() -> int:
    return random.randint(1, 3) - .5
-def search(text):
+def search(text: str) -> bool:
    # Helper function to check if Google has blocked traffic.
    for line in text.strip().splitlines():
-        if 'This page appears when Google automatically detects requests coming from your computer network' in line:
+        if 'This page appears when Google automatically detects requests coming from your computer network' in line \
-            print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
+                or 'http://www.google.com/sorry/index' in line or 'https://www.google.com/sorry/index' in line:
            # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
            return True
    return False
 def google_workaround(visit_url: str) -> str or bool:
    """
    Function that makes a request on our behalf, if Google starts to block us.

    The url is submitted to websniffer.cc, which fetches it and echoes the
    fetched page back inside its own response with the markup entity-escaped
    (``&lt;html&gt;`` instead of ``<html>``). The echoed document is cut out of
    that response and unescaped so it can be parsed normally.

    NOTE: the annotation ``str or bool`` evaluates to plain ``str`` at runtime;
    the function really returns either of the two types described below.

    :param visit_url: Url to scrape
    :return: True if search() reports that the proxied response is itself a
             Google block page, otherwise the html of the page as a string
             that can be parsed by BS4
    :raises ValueError: if the response contains no escaped html document
    """
    from html import unescape
    import requests
    url = 'https://websniffer.cc/'
    data = {
        'Cookie': '',
        'url': visit_url,
        'submit': 'Submit',
        'type': 'GET&http=1.1',
        'uak': str(random.randint(4, 8))  # select random UA to send to Google
    }
    resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
    returned_html = resp.text
    if search(returned_html):
        # indicates that google is serving a captcha to the workaround as well
        # TODO rework workaround with more websites to send requests on our behalf or utilize proxies option in request
        return True
    # the html we get is malformed for BS4 as there are no greater than or less than signs
    closing_tag = '&lt;/html&gt;'
    if '&lt;html&gt;' in returned_html:
        start_index = returned_html.index('&lt;html&gt;')
    else:
        # opening tag carries attributes, e.g. &lt;html lang="en"&gt;
        start_index = returned_html.index('&lt;html')
    # advance past the whole closing tag; adding 1 would keep only its leading '&'
    end_index = returned_html.index(closing_tag) + len(closing_tag)
    # Slice the response to get only the echoed page's html
    escaped_html = returned_html[start_index:end_index]
    # Unescape the string as a whole: a per-character replace can never match
    # a multi-character entity, and per-character strip() would delete all whitespace
    return unescape(escaped_html)
 class MissingKey(Exception):
-    def __init__(self, identity_flag):
+    def __init__(self, identity_flag: bool):
        if identity_flag:
            self.message = '\n\033[93m[!] Missing API key. \033[0m'
        else:
--- a/theHarvester/discovery/googlesearch.py
+++ b/theHarvester/discovery/googlesearch.py
@ -31,9 +31,15 @@ def do_search(self):
            print(e)
        self.results = r.text
        if search(self.results):
-            time.sleep(getDelay() * 5)  # Sleep for a longer time.
+            try:
-        else:
+                if isinstance(search(self.results), bool):
-            time.sleep(getDelay())
+                    print('Google is blocking your ip and the workaround, returning')
                    return
                else:
                    self.results = google_workaround(urly)
            except BaseException:
                pass
        time.sleep(getDelay())
        self.totalresults += self.results
    def do_search_profiles(self):
@ -49,9 +55,15 @@ def do_search_profiles(self):
            print(e)
        self.results = r.text
        if search(self.results):
-            time.sleep(getDelay() * 5)  # Sleep for a longer time.
+            try:
-        else:
+                if isinstance(search(self.results), bool):
-            time.sleep(getDelay())
+                    print('Google is blocking your ip and the workaround, returning')
                    return
                else:
                    self.results = google_workaround(urly)
            except BaseException:
                pass
        time.sleep(getDelay())
        self.totalresults += self.results
    def get_emails(self):
@ -137,9 +149,15 @@ def send_dorks(self):  # Helper function to minimize code reusability.
                req = requests.get(link, headers=headers)
                self.results = req.text
                if search(self.results):
-                    time.sleep(getDelay() * 5)  # Sleep for a longer time.
+                    try:
-                else:
+                        if isinstance(search(self.results), bool):
-                    time.sleep(getDelay())
+                            print('Google is blocking your ip and the workaround, returning')
                            return
                        else:
                            self.results = google_workaround(link)
                    except BaseException:
                        pass
                time.sleep(getDelay())
                self.totalresults += self.results
            except Exception as e:
                print(f'\tException Occurred {e}')
--- a/theHarvester/discovery/linkedinsearch.py
+++ b/theHarvester/discovery/linkedinsearch.py
@ -12,7 +12,6 @@ def __init__(self, word, limit):
        self.results = ""
        self.totalresults = ""
        self.server = 'www.google.com'
        self.userAgent = '(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6'
        self.quantity = '100'
        self.limit = int(limit)
        self.counter = 0
@ -25,9 +24,19 @@ def do_search(self):
        try:
            headers = {'User-Agent': Core.get_user_agent()}
            r = requests.get(urly, headers=headers)
            self.results = r.text
            if search(self.results):
                try:
                    if isinstance(search(self.results), bool):
                        print('Google is blocking your ip and the workaround, returning')
                        return
                    else:
                        self.results = google_workaround(urly)
                except BaseException:
                    pass
        except Exception as e:
            print(e)
-        self.results = r.text
+        time.sleep(getDelay())
        self.totalresults += self.results
    def get_people(self):