Implemented grequests in multiple modules to increase speed.

NotoriousRebel 2019-08-11 22:19:01 -04:00
parent 951f567bab
commit 720aa06080
11 changed files with 120 additions and 107 deletions

requirements.txt

@@ -6,4 +6,5 @@ pytest==5.0.1
PyYaml==5.1.1
requests==2.22.0
shodan==1.14.0
texttable==1.6.2
texttable==1.6.2
grequests>=0.4.0
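
grequests, pinned here, is what every module below switches to: it schedules ordinary requests calls on gevent greenlets so that one slow search page no longer blocks the next. A minimal sketch of the fetch pattern the discovery modules adopt, with a placeholder endpoint and limits that are not from this commit:

import grequests  # gevent-based concurrency layered over the requests API

urls = [f'https://example.com/search?page={n}' for n in range(0, 100, 10)]
reqs = (grequests.get(u, timeout=5) for u in urls)  # unsent request objects
for resp in grequests.imap(reqs, size=5):  # at most 5 in flight, yielded as each finishes
    print(resp.status_code, len(resp.text))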

theHarvester/__main__.py

@@ -53,7 +53,7 @@ def start():
parser.add_argument('-c', '--dns-brute', help='perform a DNS brute force on the domain', default=False, action='store_true')
parser.add_argument('-f', '--filename', help='save the results to an HTML and/or XML file', default='', type=str)
parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, censys, crtsh, dnsdumpster,
dogpile, duckduckgo, github-code, google,
dogpile, duckduckgo, exalead, github-code, google,
hunter, intelx,
linkedin, netcraft, securityTrails, threatcrowd,
trello, twitter, vhost, virustotal, yahoo, all''')
@@ -144,14 +144,16 @@ def start():
db.store_all(word, all_ip, 'ip', 'censys')
elif engineitem == 'crtsh':
print('\033[94m[*] Searching CRT.sh. \033[0m')
search = crtsh.SearchCrtsh(word)
search.process()
hosts = filter(search.get_data())
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'CRTsh')
try:
print('\033[94m[*] Searching CRT.sh. \033[0m')
search = crtsh.SearchCrtsh(word)
search.process()
hosts = filter(search.get_data())
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'CRTsh')
except Exception as e:
pass
elif engineitem == 'dnsdumpster':
try:
print('\033[94m[*] Searching DNSdumpster. \033[0m')
@@ -211,6 +213,18 @@ def start():
else:
pass
elif engineitem == 'exalead':
print('\033[94m[*] Searching Exalead \033[0m')
search = exaleadsearch.search_exalead(word, limit, start)
search.process()
emails = filter(search.get_emails())
all_emails.extend(emails)
hosts = filter(search.get_hostnames())
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'exalead')
db.store_all(word, all_emails, 'email', 'exalead')
elif engineitem == 'google':
print('\033[94m[*] Searching Google. \033[0m')
search = googlesearch.search_google(word, limit, start)
@@ -363,7 +377,7 @@ def start():
elif engineitem == 'yahoo':
print('\033[94m[*] Searching Yahoo. \033[0m')
search = yahoosearch.search_yahoo(word, limit)
search = yahoosearch.SearchYahoo(word, limit)
search.process()
hosts = search.get_hostnames()
emails = search.get_emails()
@@ -467,6 +481,17 @@ def start():
db.store_all(word, all_hosts, 'email', 'duckduckgo')
db.store_all(word, all_hosts, 'host', 'duckduckgo')
print('\033[94m[*] Searching Exalead \033[0m')
search = exaleadsearch.search_exalead(word, limit, start)
search.process()
emails = filter(search.get_emails())
all_emails.extend(emails)
hosts = filter(search.get_hostnames())
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'exalead')
db.store_all(word, all_emails, 'email', 'exalead')
print('\033[94m[*] Searching Google. \033[0m')
search = googlesearch.search_google(word, limit, start)
search.process(google_dorking)
@@ -945,7 +970,6 @@ def entry_point():
print('\n\n\033[93m[!] ctrl+c detected from user, quitting.\n\n \033[0m')
except Exception:
import traceback
print(traceback.print_exc())
sys.exit(1)
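
Four changes land in this file: the exalead engine joins the -b help text, the single-engine dispatch, and the 'all' pass; the crtsh lookup is wrapped in a broad try/except whose bare pass silently discards failures; the yahoo call site switches to the renamed SearchYahoo class; and entry_point() stops wrapping traceback.print_exc() in print(). That last line mattered because print_exc() writes the traceback itself and returns None, so the old code printed a stray "None" after every crash. (The adjacent duckduckgo context also stores all_hosts under the 'email' label, a pre-existing slip this commit leaves alone.) A minimal sketch of the corrected handler, with a hypothetical failing start() standing in for the real one:

import sys
import traceback

def start():
    raise RuntimeError('demo failure')  # hypothetical stand-in for the real start()

def entry_point():
    try:
        start()
    except KeyboardInterrupt:
        print('\n\n[!] ctrl+c detected from user, quitting.\n\n')
    except Exception:
        traceback.print_exc()  # prints the traceback and returns None, so no outer print()
        sys.exit(1)

entry_point()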

theHarvester/discovery/baidusearch.py

@@ -1,8 +1,6 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
import time
import grequests
class SearchBaidu:
@@ -13,24 +11,21 @@ def __init__(self, word, limit):
self.server = 'www.baidu.com'
self.hostname = 'www.baidu.com'
self.limit = limit
self.counter = 0
def do_search(self):
url = 'http://' + self.server + '/s?wd=%40' + self.word + '&pn=' + str(self.counter) + '&oq=' + self.word
url = f'https://{self.server}/s?wd=%40{self.word}&pn={self.counter}&oq={self.word}'
headers = {
'Host': self.hostname,
'User-agent': Core.get_user_agent()
}
h = requests.get(url=url, headers=headers)
time.sleep(getDelay())
self.total_results += h.text
base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}'
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
resp = grequests.imap(req, size=5)
for x in resp:
self.total_results += x.content.decode('UTF-8')
def process(self):
while self.counter <= self.limit and self.counter <= 1000:
self.do_search()
print(f'\tSearching {self.counter} results.')
self.counter += 10
self.do_search()
def get_emails(self):
rawres = myparser.Parser(self.total_results, self.word)
@@ -39,3 +34,5 @@ def get_emails(self):
def get_hostnames(self):
rawres = myparser.Parser(self.total_results, self.word)
return rawres.hostnames()
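
process() no longer steps a counter by 10 with a page print per iteration; do_search() now builds every page URL up front and lets grequests fetch them five at a time. Note that the if num <= self.limit guard in the comprehension is a no-op, since range(0, limit, 10) already stops below the limit. A standalone sketch of the URL batching, with a placeholder domain and limit:

limit = 50
base_url = 'https://www.baidu.com/s?wd=%40example.com&pn=xx&oq=example.com'
urls = [base_url.replace('xx', str(num)) for num in range(0, limit, 10)]
# range() never reaches `limit`, so also filtering on num <= limit changes nothing
print(urls)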

theHarvester/discovery/bingsearch.py

@@ -1,8 +1,7 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
import time
import grequests
class SearchBing:
@@ -10,11 +9,10 @@ class SearchBing:
def __init__(self, word, limit, start):
self.word = word.replace(' ', '%20')
self.results = ""
self.totalresults = ""
self.total_results = ""
self.server = 'www.bing.com'
self.apiserver = 'api.search.live.net'
self.hostname = 'www.bing.com'
self.quantity = '50'
self.limit = int(limit)
self.bingApi = Core.bing_key()
self.counter = start
@@ -26,9 +24,12 @@ def do_search(self):
'Accept-Language': 'en-us,en',
'User-agent': Core.get_user_agent()
}
h = requests.get(url=('https://' + self.server + '/search?q=%40"' + self.word + '"&count=50&first=' + str(self.counter)), headers=headers)
self.results = h.text
self.totalresults += self.results
base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
resp = grequests.imap(req, size=5)
for x in resp:
self.total_results += x.content.decode('UTF-8')
def do_search_api(self):
url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?'
@@ -40,9 +41,10 @@ def do_search_api(self):
'safesearch': 'Off'
}
headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi}
h = requests.get(url=url, headers=headers, params=params)
self.results = h.text
self.totalresults += self.results
h = grequests.get(url=url, headers=headers, params=params)
response = grequests.map([h])
self.results = response[0].content.decode('UTF-8')
self.total_results += self.results
def do_search_vhost(self):
headers = {
@@ -51,39 +53,35 @@ def do_search_vhost(self):
'Accept-Language': 'en-us,en',
'User-agent': Core.get_user_agent()
}
url = 'http://' + self.server + '/search?q=ip:' + self.word + '&go=&count=50&FORM=QBHL&qs=n&first=' + str(self.counter)
h = requests.get(url=url, headers=headers)
self.results = h.text
self.totalresults += self.results
base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx'
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
req = (grequests.get(u, headers=headers, timeout=5) for u in urls)
resp = grequests.imap(req, size=5)
for x in resp:
self.total_results += x.content.decode('UTF-8')
def get_emails(self):
rawres = myparser.Parser(self.totalresults, self.word)
rawres = myparser.Parser(self.total_results, self.word)
return rawres.emails()
def get_hostnames(self):
rawres = myparser.Parser(self.totalresults, self.word)
rawres = myparser.Parser(self.total_results, self.word)
return rawres.hostnames()
def get_allhostnames(self):
rawres = myparser.Parser(self.totalresults, self.word)
rawres = myparser.Parser(self.total_results, self.word)
return rawres.hostnames_all()
def process(self, api):
if api == 'yes':
if self.bingApi is None:
raise MissingKey(True)
while self.counter < self.limit:
else:
if api == 'yes':
self.do_search_api()
time.sleep(getDelay())
else:
self.do_search()
time.sleep(getDelay())
self.counter += 50
print(f'\tSearching {self.counter} results.')
def process_vhost(self):
# Maybe it would be better to use a different limit for this.
while self.counter < self.limit:
self.do_search_vhost()
self.counter += 50
self.do_search_vhost()
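
All three Bing paths change: do_search() and do_search_vhost() fan their page offsets out through grequests.imap, do_search_api() sends its single request through grequests.map, and totalresults is renamed total_results throughout. One caveat: map() puts None in its result list when a request fails, and response[0] is indexed here without a guard. A hedged sketch of a safer wrapper (fetch_api is an illustrative name, not part of the module):

import grequests

def fetch_api(url, headers=None, params=None):
    response = grequests.map([grequests.get(url, headers=headers, params=params)])
    if response and response[0] is not None:  # None marks a failed request
        return response[0].content.decode('UTF-8')
    return ''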

theHarvester/discovery/constants.py

@@ -10,7 +10,8 @@ def filter(lst):
:param lst: list to be filtered
:return: new filtered list
"""
lst = set(lst) # Remove duplicates.
if not isinstance(lst, set):
lst = set(lst) # Remove duplicates.
new_lst = []
for item in lst:
item = str(item)
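
The guard only matters when a caller has already deduplicated: set() on a list collapses duplicates, while set() on a set merely copies it, and the isinstance check skips that copy. (The function name also shadows the built-in filter, an existing quirk this commit keeps.) A tiny illustration:

hosts = ['mail.example.com', 'mail.example.com', 'www.example.com']
deduped = set(hosts)   # duplicates collapse to two entries
copied = set(deduped)  # already a set: this only copies it, the work the isinstance check avoids
print(sorted(deduped))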

theHarvester/discovery/crtsh.py

@@ -1,6 +1,5 @@
from theHarvester.lib.core import *
import requests
import urllib3
class SearchCrtsh:
@@ -9,13 +8,18 @@ def __init__(self, word):
self.data = set()
def do_search(self):
url = f'https://crt.sh/?q=%25.{self.word}&output=json'
headers = {'User-Agent': Core.get_user_agent()}
request = requests.get(url, params=headers, timeout=30)
if request.ok:
content = request.json()
data = set([dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value'] for dct in content])
return data
try:
data = set()
url = f'https://crt.sh/?q=%25.{self.word}&output=json'
headers = {'User-Agent': Core.get_user_agent()}
request = requests.get(url, headers=headers, timeout=15)
if request.ok:
content = request.json()
data = set([dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value'] for dct in content])
return data
return data
except Exception as e:
print(f'An exception has occurred in crtsh: {e}')
def process(self):
print('\tSearching results.')
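
Besides moving the lookup into a try/except and halving the timeout to 15 seconds, this hunk fixes a real bug: the old call passed the User-Agent dict as params=headers, sending it as query-string parameters instead of request headers. A self-contained sketch of the crt.sh JSON query, with a placeholder domain and UA string and the module's error handling omitted:

import requests

word = 'example.com'
resp = requests.get(f'https://crt.sh/?q=%25.{word}&output=json',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
data = set()
if resp.ok:
    # strip the leading '*.' from wildcard certificate entries
    data = {e['name_value'][2:] if e['name_value'].startswith('*.') else e['name_value']
            for e in resp.json()}
print(len(data))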

theHarvester/discovery/dogpilesearch.py

@@ -1,7 +1,7 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
import grequests
import time
@@ -13,31 +13,24 @@ def __init__(self, word, limit):
self.server = 'www.dogpile.com'
self.hostname = 'www.dogpile.com'
self.limit = limit
self.counter = 0
def do_search(self):
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
# Dogpile is hardcoded to return 10 results.
url = 'https://' + self.server + "/search/web?qsi=" + str(self.counter) \
+ "&q=\"%40" + self.word + "\""
headers = {
'Host': self.hostname,
'User-agent': Core.get_user_agent()
}
try:
h = requests.get(url=url, headers=headers, verify=False)
#print(h.text)
self.total_results += h.text
headers = {'User-agent': Core.get_user_agent()}
base_url = f'https://{self.server}/search/web?qsi=xx&q=%40{self.word}'
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
req = (grequests.get(u, headers=headers, verify=False, timeout=5) for u in urls)
resp = grequests.imap(req, size=5)
for x in resp:
self.total_results += x.content.decode('UTF-8')
except Exception as e:
print(f'Error Occurred: {e}')
def process(self):
while self.counter <= self.limit and self.counter <= 1000:
self.do_search()
time.sleep(getDelay())
print(f'\tSearching {self.counter} results.')
self.counter += 10
self.do_search()
def get_emails(self):
rawres = myparser.Parser(self.total_results, self.word)
@@ -46,3 +39,4 @@ def get_emails(self):
def get_hostnames(self):
rawres = myparser.Parser(self.total_results, self.word)
return rawres.hostnames()
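
Same conversion as Baidu, with the whole batch kept inside the existing try/except. Two things to watch: verify=False still disables TLS verification, and grequests.imap silently skips failed requests unless an exception_handler is supplied. A sketch with a hypothetical handler:

import grequests

def log_failure(request, exception):  # without a handler, imap drops failures silently
    print(f'Request to {request.url} failed: {exception}')

urls = [f'https://www.dogpile.com/search/web?qsi={n}&q=%40example.com' for n in range(0, 50, 10)]
reqs = (grequests.get(u, timeout=5, verify=False) for u in urls)
for resp in grequests.imap(reqs, size=5, exception_handler=log_failure):
    print(len(resp.text))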

theHarvester/discovery/huntersearch.py

@@ -1,30 +1,28 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
import grequests
class SearchHunter:
def __init__(self, word, limit, start):
self.word = word
self.limit = 100
self.limit = limit
self.start = start
self.key = Core.hunter_key()
#self.key = "e802ef64e560430c3612ab7e9f2d018fd9946177"
if self.key is None:
raise MissingKey(True)
self.results = ""
self.totalresults = ""
self.total_results = ""
self.counter = start
self.database = "https://api.hunter.io/v2/domain-search?domain=" + word + "&api_key=" + self.key + "&limit=" + str(self.limit)
self.database = f'https://api.hunter.io/v2/domain-search?domain={word}&api_key={self.key}&limit={self.limit}'
def do_search(self):
try:
r = requests.get(self.database)
except Exception as e:
print(e)
self.results = r.text
self.totalresults += self.results
request = grequests.get(self.database)
response = grequests.map([request])
self.total_results = response[0].content.decode('UTF-8')
def process(self):
self.do_search() # Only need to do it once.
@@ -40,3 +38,4 @@ def get_hostnames(self):
def get_profiles(self):
rawres = myparser.Parser(self.totalresults, self.word)
return rawres.profiles()
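
The Hunter change is mostly mechanical: limit stops being hardcoded to 100, the URL becomes an f-string, and the single API request goes through grequests.map. Two regressions to note: the old try/except around the request is gone, so a failed call leaves response[0] as None and the decode line raises; and get_profiles() still reads self.totalresults even though __init__ now only sets self.total_results. A guarded sketch, with a placeholder key:

import grequests

key = 'HUNTER_API_KEY'  # placeholder; the real key comes from Core.hunter_key()
url = f'https://api.hunter.io/v2/domain-search?domain=example.com&api_key={key}&limit=10'
resp = grequests.map([grequests.get(url, timeout=10)])[0]
if resp is not None:  # None means the request itself failed
    print(resp.status_code)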

theHarvester/discovery/netcraft.py

@@ -12,7 +12,7 @@ def __init__(self, word):
self.word = word.replace(' ', '%20')
self.totalresults = ""
self.server = 'netcraft.com'
self.base_url = 'https://searchdns.netcraft.com/?restriction=site+ends+with&host={domain}'
self.base_url = f'https://searchdns.netcraft.com/?restriction=site+ends+with&host={word}'
self.session = requests.session()
self.headers = {
'User-Agent': Core.get_user_agent()
@@ -33,7 +33,7 @@ def get_next(self, resp):
link_regx = re.compile('<A href="(.*?)"><b>Next page</b></a>')
link = link_regx.findall(resp)
link = re.sub(f'host=.*?{self.word}', f'host={self.domain}', link[0])
url = f'http://searchdns.netcraft.com{link}'
url = f'https://searchdns.netcraft.com{link.replace(" ", "%20")}'
return url
def create_cookies(self, cookie):
@@ -56,13 +56,12 @@ def do_search(self):
start_url = self.base_url
resp = self.request(start_url)
cookies = self.get_cookies(resp.headers)
url = self.base_url.format(domain="yale.edu")
while True:
resp = self.request(url, cookies).text
resp = self.request(self.base_url, cookies).text
self.totalresults += resp
if 'Next page' not in resp or resp is None:
break
url = self.get_next(resp)
self.base_url = self.get_next(resp)
def get_hostnames(self):
rawres = myparser.Parser(self.totalresults, self.word)
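
This one is a bug fix rather than a concurrency change: the old code kept a {domain} placeholder in base_url and then formatted it with a hardcoded "yale.edu", so every scan queried Yale's domain regardless of the target. The fix interpolates the target word up front, follows the "Next page" link over https with spaces percent-encoded, and feeds it back into base_url (the dead resp is None check after the in test remains). A condensed sketch of the fixed loop, with the module's cookie handshake omitted and a placeholder domain:

import re
import requests

word = 'example.com'
url = f'https://searchdns.netcraft.com/?restriction=site+ends+with&host={word}'
totalresults = ''
while True:
    resp = requests.get(url, timeout=10).text
    totalresults += resp
    if 'Next page' not in resp:
        break
    link = re.findall('<A href="(.*?)"><b>Next page</b></a>', resp)[0]
    url = f'https://searchdns.netcraft.com{link.replace(" ", "%20")}'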

theHarvester/discovery/yahoosearch.py

@@ -1,35 +1,30 @@
from theHarvester.discovery.constants import *
import grequests
from theHarvester.lib.core import *
from theHarvester.parsers import myparser
import requests
import time
class search_yahoo:
class SearchYahoo:
def __init__(self, word, limit):
self.word = word
self.total_results = ""
self.server = 'search.yahoo.com'
self.hostname = 'search.yahoo.com'
self.limit = limit
self.counter = 0
def do_search(self):
url = 'http://' + self.server + '/search?p=\"%40' + self.word + '\"&b=' + str(self.counter) + '&pz=10'
base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10'
headers = {
'Host': self.hostname,
'Host': self.server,
'User-agent': Core.get_user_agent()
}
h = requests.get(url=url, headers=headers)
self.total_results += h.text
urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
request = (grequests.get(url, headers=headers) for url in urls)
response = grequests.imap(request, size=5)
for entry in response:
self.total_results += entry.content.decode('UTF-8')
def process(self):
while self.counter <= self.limit and self.counter <= 1000:
self.do_search()
time.sleep(getDelay())
print(f'\tSearching {self.counter} results.')
self.counter += 10
self.do_search()
def get_emails(self):
rawres = myparser.Parser(self.total_results, self.word)
@@ -46,4 +41,4 @@ def get_emails(self):
def get_hostnames(self):
rawres = myparser.Parser(self.total_results, self.word)
return rawres.hostnames()
return rawres.hostnames()
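
Alongside the same batch-fetch conversion, the class is renamed from search_yahoo to the PEP 8-style SearchYahoo, which is exactly what the __main__.py hunk above updates. Hypothetical usage, assuming theHarvester is importable and with a placeholder domain and limit:

from theHarvester.discovery import yahoosearch

search = yahoosearch.SearchYahoo('example.com', 100)
search.process()
print(search.get_hostnames())
print(search.get_emails())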

theHarvester/lib/core.py

@@ -72,6 +72,7 @@ def get_supportedengines():
'dnsdumpster',
'dogpile',
'duckduckgo',
'exalead',
'github-code',
'google',
'hunter',