Merge pull request #17 from NotoriousRebel/master

Implemented Google Work Around and more static typing.
This commit is contained in:
J.Townsend 2019-09-12 21:02:07 +01:00 committed by GitHub
commit 38c9731262
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 80 additions and 24 deletions

View file

@@ -1,6 +1,5 @@
import random
from typing import Union
googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
@@ -8,7 +7,7 @@ def splitter(links):
"""
Method that tries to remove duplicates
LinkedinLists pulls a lot of profiles with the same name.
This method triest to remove duplicates from the list.
This method tries to remove duplicates from the list.
:param links: list of links to remove duplicates from
:return: unique-ish list
"""
@@ -43,32 +42,62 @@ def filter(lst):
for item in lst:
item = str(item)
if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
if '252f' in item:
item = item.replace('252f', '')
if '2F' in item:
item = item.replace('2F', '')
if '2f' in item:
item = item.replace('2f', '')
item = item.replace('252f', '').replace('2F', '').replace('2f', '')
new_lst.append(item.lower())
return new_lst
def getDelay() -> float:
    """Return a random delay (in seconds) to pace automated requests.

    randint(1, 3) - .5 always yields one of 0.5, 1.5 or 2.5 — a float —
    so the return annotation is ``float``, not ``int``.

    :return: 0.5, 1.5 or 2.5
    """
    return random.randint(1, 3) - .5
def search(text: str) -> bool:
    """Check whether *text* is a Google traffic-block / captcha page.

    Scans each line for Google's automated-traffic notice or for a
    redirect to the ``/sorry/`` endpoint (either scheme).

    :param text: raw HTML returned by a Google request
    :return: True when Google is blocking the requests, False otherwise
    """
    block_markers = (
        'This page appears when Google automatically detects requests coming from your computer network',
        'http://www.google.com/sorry/index',
        'https://www.google.com/sorry/index',
    )
    for line in text.strip().splitlines():
        if any(marker in line for marker in block_markers):
            return True
    return False
def google_workaround(visit_url: str) -> Union[str, bool]:
    """Fetch *visit_url* through websniffer.cc when Google blocks us directly.

    Makes the request on our behalf via a relay site, then extracts and
    unescapes the target page's HTML so BS4 can parse it.

    :param visit_url: URL to scrape via the relay
    :return: the cleaned-up HTML string, or True when the relay itself was
             served a captcha (or no HTML document could be located)
    """
    import requests  # local import so the module loads without requests installed
    url = 'https://websniffer.cc/'
    data = {
        'Cookie': '',
        'url': visit_url,
        'submit': 'Submit',
        'type': 'GET&http=1.1',
        'uak': str(random.randint(4, 8)),  # select random UA to send to Google
    }
    resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
    returned_html = resp.text
    if search(returned_html):
        # Google is serving the workaround a captcha as well.
        # TODO rework workaround with more relay websites or utilize the proxies option in requests
        return True
    # Locate the document; the relay sometimes emits '<html ...>' rather than '<html>'.
    start_index = returned_html.find('<html>')
    if start_index == -1:
        start_index = returned_html.find('<html')
    end_index = returned_html.find('</html>')
    if start_index == -1 or end_index == -1:
        # No recognisable HTML document in the relay output; signal failure
        # instead of raising (callers treat True as "workaround blocked").
        return True
    # Keep the closing tag in full: + len('</html>'), not + 1 which kept only '<'.
    correct_html = returned_html[start_index:end_index + len('</html>')]
    # The relay escapes the page, so angle brackets arrive as '&lt;'/'&gt;'.
    # Unescape on the whole string (the original per-character replace could
    # never match a 4-character entity) and drop per-line edge whitespace.
    correct_html = ''.join(line.strip() for line in correct_html.splitlines())
    return correct_html.replace('&lt;', '<').replace('&gt;', '>')
class MissingKey(Exception):
def __init__(self, identity_flag):
def __init__(self, identity_flag: bool):
if identity_flag:
self.message = '\n\033[93m[!] Missing API key. \033[0m'
else:

View file

@@ -31,9 +31,15 @@ def do_search(self):
print(e)
self.results = r.text
if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time.
else:
time.sleep(getDelay())
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results
def do_search_profiles(self):
@@ -49,9 +55,15 @@ def do_search_profiles(self):
print(e)
self.results = r.text
if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time.
else:
time.sleep(getDelay())
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results
def get_emails(self):
@@ -137,9 +149,15 @@ def send_dorks(self):  # Helper function to minimize code reuse.
req = requests.get(link, headers=headers)
self.results = req.text
if search(self.results):
time.sleep(getDelay() * 5) # Sleep for a longer time.
else:
time.sleep(getDelay())
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(link)
except BaseException:
pass
time.sleep(getDelay())
self.totalresults += self.results
except Exception as e:
print(f'\tException Occurred {e}')

View file

@@ -12,7 +12,6 @@ def __init__(self, word, limit):
self.results = ""
self.totalresults = ""
self.server = 'www.google.com'
self.userAgent = '(Mozilla/5.0 (Windows; U; Windows NT 6.0;en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6'
self.quantity = '100'
self.limit = int(limit)
self.counter = 0
@@ -25,9 +24,19 @@ def do_search(self):
try:
headers = {'User-Agent': Core.get_user_agent()}
r = requests.get(urly, headers=headers)
self.results = r.text
if search(self.results):
try:
if isinstance(search(self.results), bool):
print('Google is blocking your ip and the workaround, returning')
return
else:
self.results = google_workaround(urly)
except BaseException:
pass
except Exception as e:
print(e)
self.results = r.text
time.sleep(getDelay())
self.totalresults += self.results
def get_people(self):