Merge branch 'dev' into dev

Commit 40eaadfd2b, authored by Matt on 2019-10-02 15:47:08 -04:00, committed via GitHub.
17 changed files with 88 additions and 1942 deletions


@@ -35,10 +35,6 @@ jobs:
run: |
python theHarvester.py -d metasploit.com -b bing
- name: Run theHarvester module censys
run: |
python theHarvester.py -d metasploit.com -b censys
- name: Run theHarvester module crtsh
run: |
python theHarvester.py -d metasploit.com -b crtsh
@@ -112,6 +108,6 @@ jobs:
- name: Test with pytest
run: |
pytest
# - name: Check static type checking with mypy
# - name: Static type checking with mypy
# run: |
# mypy *.py
# mypy --pretty *.py

.gitignore

@@ -9,4 +9,4 @@ debug_results.txt
tests/myparser.py
venv
.mypy_cache
.pytest_cache
.pytest_cache


@@ -14,10 +14,10 @@ before_install:
install:
- python setup.py test
script:
- python theHarvester.py -d metasploit.com -b baidu,bing,censys,crtsh,dnsdumpster,dogpile,duckduckgo,exalead,linkedin,netcraft,otx,intelx,threatcrowd,trello,twitter,virustotal,yahoo -l 200
- python theHarvester.py -d metasploit.com -b baidu,bing,crtsh,dnsdumpster,dogpile,duckduckgo,exalead,linkedin,netcraft,otx,intelx,threatcrowd,trello,twitter,virustotal,yahoo -l 200
- pytest
- flake8 . --count --show-source --statistics
#- mypy *.py
#- mypy --pretty *.py
notifications:
email: false
slack:


@@ -19,8 +19,6 @@ Passive:
* bingapi: Microsoft search engine, through the API (Requires API key, see below.)
* censys: Censys.io search engine - www.censys.io
* crtsh: Comodo Certificate search - www.crt.sh
* dnsdumpster: DNSdumpster search engine - dnsdumpster.com
@@ -51,6 +49,8 @@ Passive:
* shodan: Shodan search engine, will search for ports and banners from discovered<br>
hosts - www.shodanhq.com
* Spyse: Web research tools for professionals (Requires an API key) - https://spyse.com/
* threatcrowd: Open source threat intelligence - www.threatcrowd.org
* trello: Search trello boards (Uses Google search.)
@@ -79,6 +79,7 @@ Add your keys to api-keys.yaml
* intelx
* securityTrails
* shodan
* spyse
Dependencies:
-------------
@@ -89,14 +90,16 @@ Dependencies:
Comments, bugs, or requests?
----------------------------
* [![Twitter Follow](https://img.shields.io/twitter/follow/laramies.svg?style=social&label=Follow)](https://twitter.com/laramies) Christian Martorella @laramies
* cmartorella@edge-security.com
cmartorella@edge-security.com
* [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1
* [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1
Main contributors:
------------------
* [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1
* [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1
* [![LinkedIn](https://static.licdn.com/scds/common/u/img/webpromo/btn_viewmy_160x25.png)](https://www.linkedin.com/in/janoszold/) Janos Zold
* [![Twitter Follow](https://img.shields.io/twitter/follow/discoverscripts.svg?style=social&label=Follow)](https://twitter.com/discoverscripts) Lee Baird @discoverscripts
* [![LinkedIn](https://static.licdn.com/scds/common/u/img/webpromo/btn_viewmy_160x25.png)](https://www.linkedin.com/in/janoszold/) Janos Zold
Thanks:
-------


@@ -16,3 +16,6 @@ apikeys:
shodan:
key: oCiMsgM6rQWqiTvPxFHYcExlZgg7wvTt
spyse:
key:


@@ -1,3 +1,4 @@
[mypy]
ignore_missing_imports = True
show_traceback = True
show_traceback = True
show_error_codes = True


@@ -1,14 +1,13 @@
aiodns==2.0.0
beautifulsoup4==4.8.0
censys==0.0.8
dnspython==1.16.0
flake8==3.7.8
grequests==0.4.0
mypy==0.720
mypy==0.730
netaddr==0.7.19
plotly==4.1.1
pytest==5.1.3
pytest==5.2.0
PyYaml==5.1.2
requests==2.22.0
shodan==1.17.0
shodan==1.19.0
texttable==1.6.2


@@ -1,3 +1,2 @@
[flake8]
ignore = E501, F405, F403, E402
exclude = theHarvester/discovery/IPy.py,theHarvester/discovery/s3_scanner.py
ignore = E501, F405, F403, E402


@@ -32,10 +32,10 @@ def start():
parser.add_argument('-n', '--dns-lookup', help='enable DNS server lookup, default False', default=False, action='store_true')
parser.add_argument('-c', '--dns-brute', help='perform a DNS brute force on the domain', default=False, action='store_true')
parser.add_argument('-f', '--filename', help='save the results to an HTML and/or XML file', default='', type=str)
parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, censys, crtsh, dnsdumpster,
parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, crtsh, dnsdumpster,
dogpile, duckduckgo, github-code, google,
hunter, intelx,
linkedin, linkedin_links, netcraft, otx, securityTrails, threatcrowd,
linkedin, linkedin_links, netcraft, otx, securityTrails, spyse, threatcrowd,
trello, twitter, vhost, virustotal, yahoo''')
args = parser.parse_args()
@@ -112,19 +112,6 @@ def start():
else:
pass
elif engineitem == 'censys':
print('\033[94m[*] Searching Censys. \033[0m')
from theHarvester.discovery import censys
# Import locally or won't work
censys_search = censys.SearchCensys(word, limit)
censys_search.process()
all_ip = censys_search.get_ipaddresses()
hosts = filter(censys_search.get_hostnames())
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'censys')
db.store_all(word, all_ip, 'ip', 'censys')
elif engineitem == 'crtsh':
try:
print('\033[94m[*] Searching CRT.sh. \033[0m')
@@ -356,6 +343,22 @@ def start():
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'suip')
elif engineitem == 'spyse':
print('\033[94m[*] Searching Spyse. \033[0m')
from theHarvester.discovery import spyse
try:
spysesearch_search = spyse.SearchSpyse(word)
spysesearch_search.process()
hosts = filter(spysesearch_search.get_hostnames())
all_hosts.extend(list(hosts))
# ips = filter(spysesearch_search.get_ips())
# all_ip.extend(list(ips))
all_hosts.extend(hosts)
db = stash.stash_manager()
db.store_all(word, all_hosts, 'host', 'spyse')
# db.store_all(word, all_ip, 'ip', 'spyse')
except Exception as e:
print(e)
@@ -615,8 +618,7 @@ def start():
# Here we need to add explosion mode.
# We have to take out the TLDs to do this.
recursion = False
if recursion:
if args.dns_tld is not False:
counter = 0
for word in vhost:
search = googlesearch.SearchGoogle(word, limit, counter)

File diff suppressed because it is too large.


@@ -1,6 +1,5 @@
__all__ = ['baidusearch',
'bingsearch',
'censys',
'crtsh',
'dnssearch',
'dogpilesearch',
@@ -16,6 +15,7 @@
'port_scanner',
'securitytrailssearch',
'shodansearch',
'spyse',
'takeover',
'threatcrowd',
'trello',


@@ -1,133 +0,0 @@
from theHarvester.lib.core import *
from theHarvester.parsers import censysparser
import requests
# TODO rewrite this module to use the censys api as the current way does not work
# TODO And not really that maintainable as it currently stands
class SearchCensys:
def __init__(self, word, limit):
self.word = word
self.urlhost = ""
self.urlcert = ""
self.page = ""
self.resultshosts = ""
self.resultcerts = ""
self.total_resultshosts = ""
self.total_resultscerts = ""
self.server = 'censys.io'
self.ips = []
self.hostnamesall = []
self.limit = limit
def do_searchhosturl(self):
try:
headers = {'user-agent': Core.get_user_agent(), 'Accept': '*/*', 'Referer': self.urlhost}
responsehost = requests.get(self.urlhost, headers=headers)
self.resultshosts = responsehost.text
self.total_resultshosts += self.resultshosts
except Exception as e:
print(f'Error occurred in the Censys module downloading pages from Censys - IP search: + {e}')
def do_searchcertificateurl(self):
try:
headers = {'user-agent': Core.get_user_agent(), 'Accept': '*/*', 'Referer': self.urlcert}
responsecert = requests.get(self.urlcert, headers=headers)
self.resultcerts = responsecert.text
self.total_resultscerts += self.resultcerts
except Exception as e:
print(f'Error occurred in the Censys module downloading pages from Censys - certificates search: {e}')
def process(self):
try:
self.urlhost = 'https://' + self.server + '/ipv4/_search?q=' + str(self.word) + '&page=1'
self.urlcert = 'https://' + self.server + '/certificates/_search?q=' + str(self.word) + '&page=1'
self.do_searchhosturl()
self.do_searchcertificateurl()
counter = 2
pages = censysparser.Parser(self)
totalpages = pages.search_totalpageshosts()
pagestosearch = int(self.limit / 25) # 25 results/page
if totalpages is None:
totalpages = 0
if totalpages <= pagestosearch:
while counter <= totalpages:
try:
self.page = str(counter)
self.urlhost = 'https://' + self.server + '/ipv4/_search?q=' + str(self.word) + '&page=' + str(
self.page)
print('\tSearching IP results page ' + self.page + '.')
self.do_searchhosturl()
counter += 1
except Exception as e:
print(f'Error occurred in the Censys module requesting the pages: {e}')
else:
while counter <= pagestosearch:
try:
self.page = str(counter)
self.urlhost = 'https://' + self.server + '/ipv4/_search?q=' + str(self.word) + '&page=' + str(
self.page)
print(f'\tSearching results page {self.page}.')
self.do_searchhosturl()
counter += 1
except Exception as e:
print(f'Error occurred in the Censys module requesting the pages: {e}')
counter = 2
totalpages = pages.search_totalpagescerts()
if totalpages is None:
totalpages = 0
if totalpages <= pagestosearch:
while counter <= totalpages:
try:
self.page = str(counter)
self.urlhost = 'https://' + self.server + '/certificates/_search?q=' + str(
self.word) + '&page=' + str(self.page)
print(f'\tSearching certificates results page {self.page}.')
self.do_searchcertificateurl()
counter += 1
except Exception as e:
print(f'Error occurred in the Censys module requesting the pages: {e}')
else:
while counter <= pagestosearch:
try:
self.page = str(counter)
self.urlhost = 'https://' + self.server + '/ipv4/_search?q=' + str(self.word) + '&page=' + str(
self.page)
print('\tSearching IP results page ' + self.page + '.')
self.do_searchhosturl()
counter += 1
except Exception as e:
print(f'Error occurred in the Censys module requesting the pages: {e}')
except Exception as e:
print(f'Error occurred in the main Censys module: {e}')
def get_hostnames(self):
try:
ips = self.get_ipaddresses()
headers = {'user-agent': Core.get_user_agent(), 'Accept': '*/*', 'Referer': self.urlcert}
response = requests.post('https://censys.io/ipv4/getdns', json={'ips': ips}, headers=headers)
responsejson = response.json()
domainsfromcensys = []
for key, jdata in responsejson.items():
if jdata is not None:
domainsfromcensys.append(jdata)
else:
pass
matchingdomains = [s for s in domainsfromcensys if str(self.word) in s]
self.hostnamesall.extend(matchingdomains)
hostnamesfromcerts = censysparser.Parser(self)
self.hostnamesall.extend(hostnamesfromcerts.search_hostnamesfromcerts())
return self.hostnamesall
except Exception as e:
print(f'Error occurred in the Censys module - hostname search: {e}')
def get_ipaddresses(self):
try:
ips = censysparser.Parser(self)
self.ips = ips.search_ipaddresses()
return self.ips
except Exception as e:
print(f'Error occurred in the main Censys module - IP address search: {e}')
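
The TODO at the top of this deleted module notes that the scraping approach no longer works and that any replacement should go through the Censys API instead. Purely as an illustration (not part of this commit), a minimal API-based lookup might look like the sketch below; it assumes the Censys Search API v1 endpoint, HTTP basic auth with an API ID/secret pair, and a JSON response carrying a results list.

# Hypothetical sketch of an API-based replacement for the removed scraper.
# The endpoint, auth scheme, and response shape are assumptions, not part of this commit.
import requests

CENSYS_SEARCH_URL = 'https://censys.io/api/v1/search/ipv4'  # assumed v1 endpoint

def censys_api_search(word, api_id, api_secret, pages=2):
    ips = []
    for page in range(1, pages + 1):
        resp = requests.post(
            CENSYS_SEARCH_URL,
            auth=(api_id, api_secret),            # basic auth with API ID/secret
            json={'query': word, 'page': page, 'fields': ['ip']},
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
        # Each hit is expected to carry the requested fields.
        ips.extend(hit.get('ip') for hit in data.get('results', []))
    return ips

A real rewrite would also need to honour the page counts and rate limits reported by the API rather than hard-coding a page range.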


@@ -20,12 +20,10 @@ def __init__(self, word, limit):
self.limit = limit
def do_search(self):
try: # Do normal scraping.
url = self.api.replace('x', self.word)
headers = {'User-Agent': googleUA}
r = requests.get(url, headers=headers)
except Exception as e:
print(e)
# Do normal scraping.
url = self.api.replace('x', self.word)
headers = {'User-Agent': googleUA}
r = requests.get(url, headers=headers)
time.sleep(getDelay())
self.results = r.text
self.totalresults += self.results
@@ -46,8 +44,8 @@ def crawl(self, text):
urls = set()
try:
load = json.loads(text)
for key in load.keys(): # Iterate through keys of dict.
val = load.get(key)
for keys in load.keys(): # Iterate through keys of dict.
val = load.get(keys)
if isinstance(val, int) or isinstance(val, dict) or val is None:
continue
if isinstance(val, list):


@@ -1,46 +0,0 @@
import re
import requests
class s3_scanner:
def __init__(self, host):
self.host = host
self.results = ""
self.totalresults = ""
self.fingerprints = ['www.herokucdn.com/error-pages/no-such-app.html', '<title>Squarespace - No Such Account</title>', "<p> If you're trying to publish one, <a href=\"https://help.github.com/pages/\">read the full documentation</a> to learn how to set up <strong>GitHub Pages</strong> for your repository, organization, or user account. </p>", "<p> If you\'re trying to publish one, <a href=\"https://help.github.com/pages/\">read the full documentation</a> to learn how to set up <strong>GitHub Pages</strong> for your repository, organization, or user account. </p>", "<span class=\"title\">Bummer. It looks like the help center that you are trying to reach no longer exists.</span>", "<head> <title>The page you\'re looking for could not be found (404)</title> <style> body { color: #666; text-align: center; font-family: \"Helvetica Neue\", Helvetica, Arial, sans-serif; margin: 0; width: 800px; margin: auto; font-size: 14px; } h1 { font-size: 56px; line-height: 100px; font-weight: normal; color: #456; } h2 { font-size: 24px; color: #666; line-height: 1.5em; } h3 { color: #456; font-size: 20px; font-weight: normal; line-height: 28px; } hr { margin: 18px 0; border: 0; border-top: 1px solid #EEE; border-bottom: 1px solid white; } </style> </head>"]
def __check_http(self, bucket_url):
check_response = self.session.head(
S3_URL, timeout=3, headers={'Host': bucket_url})
# if not ARGS.ignore_rate_limiting\
# and (check_response.status_code == 503 and check_response.reason == 'Slow Down'):
# self.q.rate_limited = True
# Add it back to the bucket for re-processing.
# self.q.put(bucket_url)
if check_response.status_code == 307: # valid bucket, lets check if its public
new_bucket_url = check_response.headers['Location']
bucket_response = requests.request(
'GET' if ARGS.only_interesting else 'HEAD', new_bucket_url, timeout=3)
if bucket_response.status_code == 200\
and (not ARGS.only_interesting or
(ARGS.only_interesting and any(keyword in bucket_response.text for keyword in KEYWORDS))):
print(f"Found bucket '{new_bucket_url}'")
self.__log(new_bucket_url)
def do_s3(self):
try:
print('\t Searching takeovers for ' + self.host)
r = requests.get('https://' + self.host, verify=False)
for x in self.fingerprints:
take_reg = re.compile(x)
self.temp = take_reg.findall(r.text)
if self.temp != []:
print('\t\033[91m Takeover detected! - ' + self.host + '\033[1;32;40m')
except Exception as e:
print(e)
def process(self):
self.do_take()


@@ -0,0 +1,34 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
import requests
from pprint import pprint
class SearchSpyse:
def __init__(self, word):
self.word = word
self.key = Core.spyse_key()
if self.key is None:
raise MissingKey(True)
self.results = ''
self.totalresults = ''
def do_search(self):
try:
base_url = f'https://api.spyse.com/v1/subdomains?domain={self.word}&api_token={self.key}&page=2'
headers = {'User-Agent': Core.get_user_agent()}
request = requests.get(base_url, headers=headers)
self.results = request.json()
pprint(self.results)
# self.totalresults += self.results
except Exception as e:
print(f'An exception has occurred: {e}')
def get_hostnames(self):
return self.totalresults
def process(self):
self.do_search()
print('\tSearching results.')
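
For context, the new module follows the same minimal interface as the other discovery modules: construct it with the target word, call process(), then read the results. A rough usage sketch, assuming a spyse key has been added to api-keys.yaml as shown in the hunk above ('example.com' is a placeholder domain):

# Hypothetical usage of the new module, not part of this commit.
from theHarvester.discovery import spyse

search = spyse.SearchSpyse('example.com')  # raises MissingKey if no key is configured
search.process()                           # performs the API request and pprints the JSON
hosts = search.get_hostnames()             # note: still returns '' because
                                           # totalresults is never populated here
print(hosts)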


@@ -46,6 +46,12 @@ def shodan_key() -> str:
keys = yaml.safe_load(api_keys)
return keys['apikeys']['shodan']['key']
@staticmethod
def spyse_key() -> str:
with open('api-keys.yaml', 'r') as api_keys:
keys = yaml.safe_load(api_keys)
return keys['apikeys']['spyse']['key']
@staticmethod
def banner() -> None:
print('\n\033[93m*******************************************************************')
@@ -67,7 +73,6 @@ def get_supportedengines() -> Set[Union[str, Any]]:
supportedengines = {'baidu',
'bing',
'bingapi',
'censys',
'crtsh',
'dnsdumpster',
'dogpile',
@@ -83,6 +88,7 @@ def get_supportedengines() -> Set[Union[str, Any]]:
'otx',
'securityTrails',
'suip',
'spyse',
'threatcrowd',
'trello',
'twitter',


@@ -1,67 +0,0 @@
from bs4 import BeautifulSoup
import re
class Parser:
def __init__(self, resultstoparse):
self.ipaddresses = []
self.souphosts = BeautifulSoup(resultstoparse.total_resultshosts, features='html.parser')
self.soupcerts = BeautifulSoup(resultstoparse.total_resultscerts, features='html.parser')
self.hostnames = []
self.hostnamesfromcerts = []
self.urls = []
self.numberofpageshosts = 0
self.numberofpagescerts = 0
self.domain = resultstoparse.word
def search_hostnamesfromcerts(self):
try:
hostnamelist = self.soupcerts.findAll('i', 'fa fa-fw fa-home')
for hostnameitem in hostnamelist:
hostitems = hostnameitem.next_sibling
hostnames = str(hostitems)
hostnamesclean = re.sub(r'[ \'\[\]]', '', hostnames)
hostnamesclean = re.sub(r'\.\.\.', r'', hostnamesclean)
self.hostnamesfromcerts.extend(hostnamesclean.split(','))
self.hostnamesfromcerts = list(filter(None, self.hostnamesfromcerts))
matchingdomains = [s for s in self.hostnamesfromcerts if str(self.domain) in s] # filter out domains issued to other sites
self.hostnamesfromcerts = matchingdomains
return self.hostnamesfromcerts
except Exception as e:
print('Error occurred in the Censys module: certificate hostname parser: ' + str(e))
def search_ipaddresses(self):
try:
ipaddresslist = self.souphosts.findAll('a', 'SearchResult__title-text')
for ipaddressitem in ipaddresslist:
self.ipaddresses.append(ipaddressitem.text.strip())
return self.ipaddresses
except Exception as e:
print('Error occurred in the Censys module: IP address parser: ' + str(e))
def search_totalpageshosts(self):
try:
items = self.souphosts.findAll('span', 'SearchResultSectionHeader__statistic')
if items == [] or items is None:
self.numberofpageshosts = 0
return self.numberofpageshosts
numbers = re.findall(r"/\d*", items[0].text)
pagenumber = numbers[0].replace('/', '')
self.numberofpageshosts = int(pagenumber)
return self.numberofpageshosts
except Exception as e:
print('Error occurred in the Censys module IP search: page parser: ' + str(e))
def search_totalpagescerts(self):
try:
items = self.soupcerts.findAll('span', 'SearchResultSectionHeader__statistic')
if items == [] or items is None:
self.numberofpageshosts = 0
return self.numberofpageshosts
numbers = re.findall(r"/\d*", items[0].text)
pagenumber = numbers[0].replace('/', '')
self.numberofpagescerts = int(pagenumber)
return self.numberofpagescerts
except Exception as e:
print('Error occurred in the Censys module IP search: page parser: ' + str(e))