mirror of https://github.com/laramies/theHarvester.git
synced 2025-02-24 14:32:57 +08:00
Screenshot functionality is ready for takeoff 🚀
This commit is contained in:
parent ab39c3402f
commit d3def10167
3 changed files with 90 additions and 84 deletions
@@ -21,9 +21,10 @@
     else:
         import uvloop
         uvloop.install()
-        import aiomultiprocess
-        # As we are not using Windows we can change the spawn method to fork for greater performance
-        aiomultiprocess.set_context("fork")
+        if platform == "linux":
+            import aiomultiprocess
+            # As we are not using Windows we can change the spawn method to fork for greater performance
+            aiomultiprocess.set_context("fork")
     asyncio.run(__main__.entry_point())
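For context on the hunk above: uvloop only swaps in libuv's event loop on non-Windows platforms, and aiomultiprocess is switched to the cheaper fork start method only where fork actually exists, i.e. on Linux. A minimal, self-contained sketch of that startup pattern (the entry() coroutine and the script layout are illustrative, not theHarvester's actual launcher):

import asyncio
import sys


async def entry():
    # stand-in for the real async entry point
    print('running on', sys.platform)

if __name__ == '__main__':
    if sys.platform != 'win32':
        import uvloop
        uvloop.install()  # replace asyncio's default event loop with libuv's
        if sys.platform == 'linux':
            import aiomultiprocess
            # fork starts workers faster than spawn, but is only a safe default on Linux
            aiomultiprocess.set_context("fork")
    asyncio.run(entry())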
# __main__

@@ -14,8 +14,6 @@
 import re
 import sys
 
-Core.banner()
-
 
 async def start(rest_args=None):
     parser = argparse.ArgumentParser(
@@ -128,7 +126,8 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
     else:
         print(f'\033[94m[*] Searching {source[0].upper() + source[1:]}. \033[0m')
     if store_host:
-        host_names = filter(await search_engine.get_hostnames())
+        #host_names = filter(await search_engine.get_hostnames())
+        host_names = [host for host in filter(await search_engine.get_hostnames()) if f'.{word}' in host]
         if source != 'hackertarget' and source != 'pentesttools' and source != 'rapiddns':
             # If source is inside this conditional it means the hosts returned must be resolved to obtain ip
             full_hosts_checker = hostchecker.Checker(host_names)
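The new list comprehension in this hunk (and the matching one in the next hunk) scopes results to the searched domain: a hostname is only kept if '.{word}' occurs somewhere in it. A rough sketch of that idea with a hypothetical helper (theHarvester's own filter() wrapper is not reproduced here):

def keep_in_scope(hostnames, word):
    # hypothetical helper: drop empties, deduplicate, then keep hosts containing '.<word>'
    unique = {h.strip().lower() for h in hostnames if h}
    return sorted(h for h in unique if f'.{word}' in h)


print(keep_in_scope(['mail.example.com', 'cdn.other.org', 'www.example.com.evil.net'], 'example.com'))
# ['mail.example.com', 'www.example.com.evil.net']

Note that the substring test is loose: a look-alike such as www.example.com.evil.net still passes, so out-of-scope hosts are not fully excluded.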
@@ -150,7 +149,7 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
     if store_results:
         email_list, host_names, urls = await search_engine.get_results()
         all_emails.extend(email_list)
-        host_names = filter(host_names)
+        host_names = [host for host in filter(host_names) if f'.{word}' in host]
         all_urls.extend(filter(urls))
         all_hosts.extend(host_names)
         await db.store_all(word, all_hosts, 'host', source)
@@ -631,31 +630,45 @@ async def handler(lst):
 
     # Screenshots
    if len(args.screenshot) > 0:
-        print(f'Screenshots can be found: {args.screenshot}')
-        # screenshot_handler,
-        from theHarvester.screenshot.screenshot import take_screenshot, screenshot_handler, _chunk_list, receive, visit
+        # screenshot_handler
+        #from theHarvester.screenshot.screenshot import take_screenshot, screenshot_handler, _chunk_list, receive, visit
+        from theHarvester.screenshot.screenshot import ScreenShotter
+        # AsyncFetcher.fetch_all([])
+        screen_shotter = ScreenShotter(args.screenshot)
+        print(f'Screenshots can be found: {screen_shotter.output}{screen_shotter.slash}')
         start = time.perf_counter()
+        print('Filtering domains for ones we can reach')
-        #from theHarvester.screenshot import take_screenshot
-        unique_resolved_domains = list(sorted({url.split(':')[0]for url in full if ':' in url and 'wwws' not in url}))
+        unique_resolved_domains = {url.split(':')[0]for url in full if ':' in url and 'www.' not in url}
+        # First filter out ones that didn't resolve
+        #unique_resolved_domains = list(sorted([x for x in unique_resolved_domains
+        # if len(await screen_shotter.visit(x)) > 0]))
+        # Second filter out ones where we can't reach them with an http request
-        # Grab resolved subdomains
-        # coroutines = [take_screenshot(url) for url in unique_resolved_domains]
-        #await screenshot_handler(coroutines)
-        async with Pool() as pool:
+        async with Pool(15) as pool:
+            print('Created pool')
+            print('mapping for unique resolved domains')
+            y = await pool.map(screen_shotter.visit, list(unique_resolved_domains))
+            unique_resolved_domains = list(sorted({x[0] for x in y if len(x[1]) > 0}))
+            print(unique_resolved_domains)
+        async with Pool(3) as pool:
             #serialized_tiles = [take_screenshot(url) for url in unique_resolved_domains]
             #print(f'Length of serialized_tiles: {len(serialized_tiles)} ')
-            for chunk in _chunk_list(unique_resolved_domains, 20):
+            print(f'Length of unique resolved domains: {len(unique_resolved_domains)} chunking now!')
+            for chunk in screen_shotter._chunk_list(unique_resolved_domains, 25):
+                print(f'Chunk: {chunk} and length: {len(chunk)}')
                 try:
                     #resultsss = await pool.map(visit, unique_resolved_domains)
-                    temp = await pool.map(take_screenshot, chunk)
+                    temp = await pool.map(screen_shotter.take_screenshot, chunk)
                     #resultsss = await pool.map(take_screenshot, unique_resolved_domains)
                     #await pool.map(screenshot_handler, chunk)
                 except Exception as ee:
                     print(f'An excpeption has occurred while mapping: {ee}')
                     #continue
-            end = time.perf_counter()
-            print("Pipeline finished in {} seconds".format(end - start))
+        end = time.perf_counter()
+        print("Pipeline finished in {} seconds".format(end - start))
 
     # Shodan
     shodanres = []
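The reworked block above is a two-stage pipeline: one aiomultiprocess pool probes every candidate host over HTTP and keeps only those that answered, then a second, smaller pool takes the screenshots in fixed-size chunks. A self-contained sketch of that shape, assuming aiomultiprocess and aiohttp are installed; probe(), shoot(), chunk() and pipeline() are illustrative stand-ins for ScreenShotter.visit, ScreenShotter.take_screenshot, ScreenShotter._chunk_list and the block in __main__, not the project's code:

import asyncio

import aiohttp
from aiomultiprocess import Pool


async def probe(host):
    # return (host, body); an empty body marks the host as unreachable
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(f'http://{host}') as resp:
                return host, await resp.text()
    except Exception:
        return host, ''


async def shoot(host):
    print(f'would screenshot {host}')  # stand-in for the pyppeteer call


def chunk(items, size):
    return [items[i:i + size] for i in range(0, len(items), size)]


async def pipeline(hosts):
    async with Pool(4) as pool:    # stage 1: reachability filter
        results = await pool.map(probe, hosts)
    reachable = sorted(host for host, body in results if body)
    async with Pool(2) as pool:    # stage 2: screenshots, chunked to bound the work in flight
        for batch in chunk(reachable, 25):
            await pool.map(shoot, batch)


if __name__ == '__main__':
    asyncio.run(pipeline(['example.com', 'doesnotexist.invalid']))

Mapping a coroutine over a Pool runs it in worker processes, so the mapped callable (here probe/shoot, in the diff the ScreenShotter methods) has to be picklable.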
@@ -817,6 +830,7 @@ async def handler(lst):
 
 async def entry_point():
     try:
+        Core.banner()
         await start()
     except KeyboardInterrupt:
         print('\n\n\033[93m[!] ctrl+c detected from user, quitting.\n\n \033[0m')
@@ -2,81 +2,72 @@
 Screenshot module that utilizes pyppeteer in async fashion
 to break urls into list and assign them to workers in a queue
 """
 import asyncio
 
 from pyppeteer import launch
 import aiohttp
+import sys
 
+class ScreenShotter():
 
-def _chunk_list(items, chunk_size):
-    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
+    def __init__(self, output):
+        self.output = output
+        self.slash = "\\" if 'win' in sys.platform else '/'
+        self.slash = "" if (self.output[-1] == "\\" or self.output[-1] == "/") else self.slash
 
+    @staticmethod
+    def _chunk_list(items, chunk_size):
+        return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
 
-async def worker(queue):
-    while True:
-        # Get a "work item" out of the queue.
-        stor = await queue.get()
+    @staticmethod
+    async def visit(url):
         try:
-            await stor
-            queue.task_done()
-            # Notify the queue that the "work item" has been processed.
-        except Exception:
-            queue.task_done()
+            print(f'attempting to visit: {url}')
+            timeout = aiohttp.ClientTimeout(total=45)
+            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                                     'Chrome/83.0.4103.106 Safari/537.36'}
+            url = f'http://{url}' if ('http' not in url and 'https' not in url) else url
+            url = url.replace('www.', '')
+            async with aiohttp.ClientSession(timeout=timeout, headers=headers,
+                                             connector=aiohttp.TCPConnector(verify_ssl=False)) as session:
+                async with session.get(url) as resp:
+                    # TODO fix with origin url I think it's there somewhere
+                    #return str(resp.url.origin()), await resp.text()
+                    text = await resp.text("UTF-8")
+                    print(text)
+                    print('\n\n\n\n')
+                    return f'http://{url}' if ('http' not in url and 'https' not in url) else url, text
+        except Exception as e:
+            print(f'An exception has occurred while attempting to visit: {e}')
+            return "", ""
 
 
-async def screenshot_handler(lst):
-    print('Created screenshot handler')
-    queue = asyncio.Queue()
-
-    for stor_method in lst:
-        # enqueue the coroutines
-        queue.put_nowait(stor_method)
-    # Create ten worker tasks to process the queue concurrently.
-    tasks = []
-    for i in range(10):
-        task = asyncio.create_task(worker(queue))
-        tasks.append(task)
-
-    # Wait until the queue is fully processed.
-    await queue.join()
-
-    # Cancel our worker tasks.
-    for task in tasks:
-        task.cancel()
-    # Wait until all worker tasks are cancelled.
-    await asyncio.gather(*tasks, return_exceptions=True)
-
-
-async def receive(lst):
-    for url in lst:
-        await take_screenshot(url)
-
-
-async def visit(url):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url) as resp:
-            return await resp.text()
-
-
-async def take_screenshot(url):
-    #url = f'http://{url}' if ('http' not in url and 'https' not in url) else url
-    url = f'https://{url}' if ('http' not in url and 'https' not in url) else url
-    url.replace('www.', '')
-    print(f'Taking a screenshot of: {url}')
-    try:
+    async def take_screenshot(self, url):
+        url = f'http://{url}' if ('http' not in url and 'https' not in url) else url
+        # url = f'https://{url}' if ('http' not in url and 'https' not in url) else url
+        url = url.replace('www.', '')
+        print(f'Taking a screenshot of: {url}')
         browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"])
+        context = await browser.createIncognitoBrowserContext()
         page = await browser.newPage()
-        # 50 second timeout
-        page.setDefaultNavigationTimeout(50000)
-        await page.setUserAgent(
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36')
-        # default timeout of 30 seconds
-        #await page.setDefaultNavigationTimeout(20000)
-        await page.goto(url)
-        await page.screenshot({'path': f'D:\\repos\\theHarvester\\theHarvester\\screenshot\\{url.replace("https://", "").replace("http://", "")}.png'})
-        await page.close()
-        await browser.close()
-        # return True
-    except Exception as e:
-        print(f'Exception occurred: {e} for: {url} ')
-        # No matter what happens make sure browser and page are closed
-        return False
+        try:
+
+            # change default timeout from 30 to 35 seconds
+            page.setDefaultNavigationTimeout(35000)
+            await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                                    'Chrome/83.0.4103.106 Safari/537.36')
+            #await page.goto(url, waitUntil='networkidle0')
+            await page.goto(url)
+            await page.screenshot({'path': f'{self.output}{self.slash}{url.replace("http://", "").replace("https://", "")}.png'})
+            #print('inside try and page has been closed')
+            #await page.close()
+            # await browser.close()
+            # return True
+        except Exception as e:
+            print(f'Exception occurred: {e} for: {url} ')
+        finally:
+            await page.close()
+            #await page.close()
+            #print('page is closed')
+            await context.close()
+            #print('context is closed')
+            await browser.close()
+            print('everything is closed!')
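For reference, the pyppeteer calls the new take_screenshot() leans on, shown in isolation: launch a headless browser, open a page inside an incognito context, navigate, write the PNG, and tear everything down in a finally block so a failed navigation cannot leak the browser process. This is a minimal sketch, not theHarvester's code; the URL and output path are placeholders:

import asyncio

from pyppeteer import launch


async def snap(url, out_path):
    browser = await launch(headless=True, ignoreHTTPSErrors=True, args=['--no-sandbox'])
    context = await browser.createIncognitoBrowserContext()
    page = await context.newPage()
    page.setDefaultNavigationTimeout(35000)  # milliseconds
    try:
        await page.goto(url)
        await page.screenshot({'path': out_path})
    finally:
        # close page, context and browser no matter what happened above
        await page.close()
        await context.close()
        await browser.close()


asyncio.run(snap('http://example.com', 'example.com.png'))

One small difference from the diff: the sketch opens the page with context.newPage() so the page actually lives inside the incognito context, whereas the committed code calls browser.newPage() after creating the context.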