Mirror of https://github.com/laramies/theHarvester.git
Synced 2025-02-24 14:32:57 +08:00
Fixed VirusTotal module (#1172)
* Added a call to the API endpoint to explicitly gather subdomains from ZoomEye, updated user agents, replaced orjson with ujson, and fixed a "substring not found" error.
* Updated orjson to ujson.
* Fixed a semantic error in the HTML check in the Google workaround.
* Fixed flake8 errors.
* Fixed VirusTotal to use the API.
* Fixed the VirusTotal module.
* Fixed an edge case that could cause an infinite loop.
This commit is contained in:
parent 121e23b3f9
commit c801db6725
5 changed files with 75 additions and 27 deletions
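For reference, the orjson-to-ujson swap mentioned in the commit message is close to a drop-in for the common calls, with one caveat: orjson.dumps returns bytes while ujson.dumps returns str. A minimal illustrative sketch (not taken from this diff):

    import ujson

    payload = ujson.dumps({'host': 'example.com'})  # returns str (orjson.dumps returns bytes)
    data = ujson.loads(payload)                     # round-trips back to a dict
    assert data['host'] == 'example.com'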
@@ -19,4 +19,4 @@ setuptools==63.4.1
shodan==1.28.0
slowapi==0.1.5
uvicorn==0.18.2
uvloop==0.16.0; platform_system != "Windows"
@@ -25,4 +25,4 @@

# As we are not using Windows we can change the spawn method to fork for greater performance
aiomultiprocess.set_context("fork")
asyncio.run(__main__.entry_point())
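The fork comment above refers to aiomultiprocess.set_context, which picks the multiprocessing start method before any Pool is created. A minimal sketch of the pattern under that assumption (the worker function and inputs are illustrative, not theHarvester's):

    import asyncio
    import aiomultiprocess

    async def normalize(host: str) -> str:
        return host.lower()  # stand-in for real per-host work

    async def main():
        aiomultiprocess.set_context("fork")  # 'fork' is unavailable on Windows, hence the platform guard
        async with aiomultiprocess.Pool() as pool:
            print(await pool.map(normalize, ['A.example.com', 'B.example.com']))

    asyncio.run(main())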
@@ -154,13 +154,11 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
    if store_people:
        people_list = await search_engine.get_people()
        await db_stash.store_all(word, people_list, 'people', source)

    if store_links:
        links = await search_engine.get_links()
        linkedin_links_tracker.extend(links)
        if len(links) > 0:
            await db.store_all(word, links, 'linkedinlinks', engineitem)

    if store_interestingurls:
        iurls = await search_engine.get_interestingurls()
        interesting_urls.extend(iurls)
@@ -286,8 +284,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
            stor_lst.append(store(github_search, engineitem, store_host=True, store_emails=True))
        except MissingKey as ex:
            print(ex)
        else:
            pass

    elif engineitem == 'hackertarget':
        from theHarvester.discovery import hackertarget
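This hunk and the ones below it tidy the error handling: catching MissingKey directly makes the isinstance check and the do-nothing else: pass unnecessary, and avoids silently swallowing unrelated errors. A minimal sketch of the two forms, with MissingKey and run_module standing in for theHarvester's names:

    class MissingKey(Exception):
        pass

    def run_module():
        raise MissingKey('virustotal')  # hypothetical module that needs an API key

    # before: catch everything, then filter by type (quietly drops unrelated errors)
    try:
        run_module()
    except Exception as e:
        if isinstance(e, MissingKey):
            print(e)
        else:
            pass

    # after: catch only the error we intend to report
    try:
        run_module()
    except MissingKey as ex:
        print(ex)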
@@ -303,8 +299,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
        except Exception as e:
            if isinstance(e, MissingKey):
                print(e)
            else:
                pass

    elif engineitem == 'intelx':
        from theHarvester.discovery import intelxsearch
@@ -388,8 +382,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
        except Exception as e:
            if isinstance(e, MissingKey):
                print(e)
            else:
                pass

    elif engineitem == 'sublist3r':
        from theHarvester.discovery import sublist3r
@@ -432,8 +424,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
        except Exception as e:
            if isinstance(e, MissingKey):
                print(e)
            else:
                pass

    elif engineitem == 'yahoo':
        from theHarvester.discovery import yahoosearch
@@ -449,8 +439,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
        except Exception as e:
            if isinstance(e, MissingKey):
                print(e)
            else:
                pass
    else:
        try:
            # Check if dns_brute is defined
@@ -836,4 +824,4 @@ async def entry_point():
        print('\n\n[!] ctrl+c detected from user, quitting.\n\n ')
    except Exception as error_entry_point:
        print(error_entry_point)
        sys.exit(1)
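For context, the surrounding entry point follows the usual guard pattern: report ctrl+c cleanly, print anything else, and exit nonzero. A self-contained sketch of that shape (the program body is illustrative):

    import sys
    import asyncio

    async def entry_point():
        ...  # program body

    if __name__ == '__main__':
        try:
            asyncio.run(entry_point())
        except KeyboardInterrupt:
            print('\n\n[!] ctrl+c detected from user, quitting.\n\n')
        except Exception as error_entry_point:
            print(error_entry_point)
            sys.exit(1)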
@@ -1,28 +1,88 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from pprint import pprint


class SearchVirustotal:

    def __init__(self, word):
        self.word = word
        self.key = Core.virustotal_key()
        if self.key is None:
            raise MissingKey('virustotal')
        self.totalhosts = set
        self.word = word
        self.proxy = False
        self.hostnames = []

    async def do_search(self):
        url = f'https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40'
        response = await AsyncFetcher.fetch_all([url], json=True,
                                                headers={'User-Agent': Core.get_user_agent(),
                                                         'X-APIKEY': self.key},
                                                proxy=self.proxy)
        entry = [host for host in response]
        pprint(entry.items())
        # TODO determine if more endpoints can yield useful info given a domain
        # based on: https://developers.virustotal.com/reference/domains-relationships
        # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40"
        headers = {
            'User-Agent': Core.get_user_agent(),
            "Accept": "application/json",
            "x-apikey": self.key
        }
        base_url = f"https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40"
        cursor = ''
        count = 0
        fail_counter = 0
        counter = 0
        breakcon = False
        while True:
            if breakcon:
                break
            # rate limit is 4 per minute
            # TODO add timer logic if proven to be needed
            # in the meantime sleeping 16 seconds should eliminate hitting the rate limit
            # in case the rate limit is hit, a fail counter exists and we sleep for 65 seconds
            send_url = base_url + "&cursor=" + cursor if cursor != '' and len(cursor) > 2 else base_url
            responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True)
            jdata = responses[0]
            if 'data' not in jdata.keys():
                await asyncio.sleep(60 + 5)
                fail_counter += 1
            if 'meta' in jdata.keys():
                cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else ''
                if len(cursor) == 0 and 'data' in jdata.keys():
                    # if the cursor is no longer within the meta field we have hit the last entry
                    breakcon = True
                count += jdata['meta']['count']
            if count == 0 or fail_counter >= 2:
                break
            if 'data' in jdata.keys():
                data = jdata['data']
                self.hostnames.extend(await self.parse_hostnames(data, self.word))
            counter += 1
            await asyncio.sleep(16)
        self.hostnames = list(sorted(set(self.hostnames)))
        # verify domains such as x.x.com.multicdn.x.com are parsed properly
        self.hostnames = [host for host in self.hostnames if ((len(host.split('.')) >= 3)
                          and host.split('.')[-2] == self.word.split('.')[-2])]

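Distilled from the loop above, the pagination contract is: request pages of up to 40 subdomains, follow meta.cursor until it disappears, and stay under the public-tier limit of 4 requests per minute. A self-contained sketch of that contract using aiohttp (the helper name and HTTP client are my choices, not theHarvester's):

    import asyncio
    import aiohttp

    async def vt_subdomains(domain: str, api_key: str) -> list:
        base_url = f'https://www.virustotal.com/api/v3/domains/{domain}/subdomains?limit=40'
        headers = {'x-apikey': api_key, 'Accept': 'application/json'}
        hosts, cursor = [], ''
        async with aiohttp.ClientSession(headers=headers) as session:
            while True:
                url = f'{base_url}&cursor={cursor}' if cursor else base_url
                async with session.get(url) as resp:
                    jdata = await resp.json()
                hosts.extend(item['id'] for item in jdata.get('data', []))
                cursor = jdata.get('meta', {}).get('cursor', '')
                if not cursor:  # no cursor in meta means we hit the last page
                    break
                await asyncio.sleep(16)  # public API allows 4 requests per minute
        return sorted(set(hosts))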
    # async def get_hostnames(self) -> set:
    #     return self.total_results
    async def get_hostnames(self) -> list:
        return self.hostnames

    @staticmethod
    async def parse_hostnames(data, word):
        total_subdomains = set()
        for attribute in data:
            total_subdomains.add(attribute['id'].replace('"', '').replace('www.', ''))
            attributes = attribute['attributes']
            total_subdomains.update(
                {value['value'].replace('"', '').replace('www.', '') for value in attributes['last_dns_records'] if
                 word in value['value']})
            if 'last_https_certificate' in attributes.keys():
                total_subdomains.update({value.replace('"', '').replace('www.', '') for value in
                                         attributes['last_https_certificate']['extensions']['subject_alternative_name']
                                         if word in value})
        total_subdomains = list(sorted(total_subdomains))
        # Other false positives may occur over time and there are other ways to parse this; feel free to implement
        # them and submit a PR, or raise an issue if you run into this filtering not being enough
        # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth it
        total_subdomains = [x for x in total_subdomains if
                            not str(x).endswith('edgekey.net') and not str(x).endswith('akadns.net')
                            and 'include:_spf' not in str(x)]
        total_subdomains.sort()
        return total_subdomains

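To make the false-positive filter above concrete, here is a tiny hypothetical run of the same endswith/substring checks:

    hosts = ['a.example.com',
             'example.com.edgekey.net',          # CDN artifact, dropped
             'v=spf1 include:_spf.example.com']  # SPF record residue, dropped
    kept = [h for h in hosts
            if not h.endswith('edgekey.net') and not h.endswith('akadns.net')
            and 'include:_spf' not in h]
    print(kept)  # ['a.example.com']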
    async def process(self, proxy=False):
        self.proxy = proxy
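Putting the new module together, usage presumably mirrors the other discovery modules: construct with a domain, call process, then read the hostnames. A hedged sketch (the import path and the assumption that process invokes do_search are inferred from the code above, since the hunk cuts off after self.proxy = proxy):

    import asyncio
    from theHarvester.discovery import virustotal  # assumed module path

    async def main():
        search = virustotal.SearchVirustotal('example.com')
        await search.process()  # assumed to invoke do_search internally
        print(await search.get_hostnames())

    asyncio.run(main())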
@@ -357,4 +357,4 @@ async def fetch_all(cls, urls, headers='', params='', json=False, takeover=False
                return texts
            else:
                texts = await asyncio.gather(*[AsyncFetcher.fetch(session, url, params, json) for url in urls])
                return texts
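The fetch_all change above fans requests out concurrently with asyncio.gather. The core of that pattern in standalone form, using aiohttp (names are illustrative, not theHarvester's):

    import asyncio
    import aiohttp

    async def fetch(session: aiohttp.ClientSession, url: str) -> str:
        async with session.get(url) as resp:
            return await resp.text()

    async def fetch_all(urls: list) -> list:
        # one shared session; gather runs all requests concurrently
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch(session, url) for url in urls))

    print(asyncio.run(fetch_all(['https://example.com'])))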