WIP for twitter port to async

L1ghtn1ng 2020-01-01 21:59:59 +00:00
parent d6f5a70f7f
commit 16df13775e
3 changed files with 26 additions and 25 deletions

View file

@@ -1,3 +1,4 @@
+from typing import Coroutine
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 
@@ -19,7 +20,7 @@ async def do_search(self):
             print(e)
         self.totalresults += self.results
 
-    async def get_hostnames(self) -> set:
+    async def get_hostnames(self) -> Coroutine:
         return myparser.Parser(self.results, self.word).hostnames()
 
     async def process(self):
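A note on the annotation change above: for an async def, the return annotation names the awaited result, so -> Coroutine is only accurate here because get_hostnames returns the parser's coroutine without awaiting it (hostnames() also becomes async in this commit). A minimal sketch of the more conventional shape, assuming the parser call is awaited inside:

async def get_hostnames(self) -> set:
    # Awaiting the now-async parser lets callers receive the set directly,
    # keeping the original '-> set' annotation accurate.
    return await myparser.Parser(self.results, self.word).hostnames()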

View file

@@ -14,21 +14,21 @@ def __init__(self, word, limit):
         self.limit = int(limit)
         self.counter = 0
 
-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
         headers = {'User-Agent': Core.get_user_agent()}
         try:
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            request = (grequests.get(url, headers=headers) for url in urls)
-            response = grequests.imap(request, size=5)
+            request = (await AsyncFetcher.fetch_all([base_url], headers=headers) for url in urls)
+            response = request
             for entry in response:
-                self.totalresults += entry.content.decode('UTF-8')
+                self.totalresults += entry
         except Exception as error:
             print(error)
 
-    def get_people(self):
+    async def get_people(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        to_parse = rawres.people_twitter()
+        to_parse = await rawres.people_twitter()
         # fix invalid handles that look like @user other_output
         handles = set()
         for handle in to_parse:
@@ -37,5 +37,5 @@ def get_people(self):
                 handles.add(result.group(0))
         return handles
 
-    def process(self):
-        self.do_search()
+    async def process(self):
+        await self.do_search()
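Two hazards in the do_search() rewrite above. First, a generator expression containing await is an async generator, so the plain for loop that follows would raise TypeError (it needs async for). Second, every iteration fetches base_url, which still contains the literal 'xx' placeholder, rather than the paginated url. The diff itself shows AsyncFetcher.fetch_all taking a list of URLs and a headers keyword, so here is a minimal sketch of where this WIP presumably heads (signature assumed from the call above):

async def do_search(self):
    base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
    headers = {'User-Agent': Core.get_user_agent()}
    try:
        # range() already stops below self.limit, so the 'num <= self.limit' filter is redundant
        urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10)]
        # fetch_all accepts the whole URL list, so no per-URL generator is needed
        responses = await AsyncFetcher.fetch_all(urls, headers=headers)
        for entry in responses:
            self.totalresults += entry
    except Exception as error:
        print(error)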

View file

@@ -8,7 +8,7 @@ def __init__(self, results, word):
         self.word = word
         self.temp = []
 
-    def genericClean(self):
+    async def genericClean(self):
         self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>', '')\
             .replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\
             .replace('<wbr>', '').replace('</wbr>', '')
@@ -16,13 +16,13 @@ def genericClean(self):
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
             self.results = self.results.replace(search, ' ')
 
-    def urlClean(self):
+    async def urlClean(self):
         self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
             self.results = self.results.replace(search, ' ')
 
-    def emails(self):
-        self.genericClean()
+    async def emails(self):
+        await self.genericClean()
         # Local part is required, charset is flexible.
         # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly)
         reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', ''))
@@ -33,7 +33,7 @@ def emails(self):
         # if email starts with dot shift email string and make sure all emails are lowercase
         return true_emails
 
-    def fileurls(self, file):
+    async def fileurls(self, file):
         urls = []
         reg_urls = re.compile('<a href="(.*?)"')
         self.temp = reg_urls.findall(self.results)
@@ -45,8 +45,8 @@ def fileurls(self, file):
                 urls.append(iteration)
         return urls
 
-    def hostnames(self):
-        self.genericClean()
+    async def hostnames(self):
+        await self.genericClean()
         reg_hosts = re.compile(r'[a-zA-Z0-9.-]*\.' + self.word)
         self.temp = reg_hosts.findall(self.results)
         hostnames = self.unique()
@@ -55,7 +55,7 @@ def hostnames(self):
             hostnames.extend(self.unique())
         return list(set(hostnames))
 
-    def people_googleplus(self):
+    async def people_googleplus(self):
         self.results = re.sub('</b>', '', self.results)
         self.results = re.sub('<b>', '', self.results)
         reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+')
@@ -71,7 +71,7 @@ def people_googleplus(self):
             resul.append(delete)
         return resul
 
-    def hostnames_all(self):
+    async def hostnames_all(self):
         reg_hosts = re.compile('<cite>(.*?)</cite>')
         temp = reg_hosts.findall(self.results)
         for iteration in temp:
@@ -83,7 +83,7 @@ def hostnames_all(self):
         hostnames = self.unique()
         return hostnames
 
-    def links_linkedin(self):
+    async def links_linkedin(self):
         reg_links = re.compile(r"url=https:\/\/www\.linkedin.com(.*?)&")
         self.temp = reg_links.findall(self.results)
         resul = []
@@ -92,7 +92,7 @@ def links_linkedin(self):
             resul.append("https://www.linkedin.com" + final_url)
         return resul
 
-    def people_linkedin(self):
+    async def people_linkedin(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -106,7 +106,7 @@ def people_linkedin(self):
             resul.append(delete)
         return resul
 
-    def people_twitter(self):
+    async def people_twitter(self):
         reg_people = re.compile(r'(@[a-zA-Z0-9._ -]*)')
         self.temp = reg_people.findall(self.results)
         users = self.unique()
@@ -121,7 +121,7 @@ def people_twitter(self):
             resul.append(delete)
         return resul
 
-    def profiles(self):
+    async def profiles(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* - <em>Google Profile</em>')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -133,7 +133,7 @@ def profiles(self):
             resul.append(delete)
         return resul
 
-    def set(self):
+    async def set(self):
         reg_sets = re.compile(r'>[a-zA-Z0-9]*</a></font>')
         self.temp = reg_sets.findall(self.results)
         sets = []
@@ -143,10 +143,10 @@ def set(self):
             sets.append(delete)
         return sets
 
-    def urls(self):
+    async def urls(self):
         found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z0-9\-_\.]+/?)*', self.results)
         urls = {match.group().strip() for match in found}
         return urls
 
-    def unique(self) -> list:
+    async def unique(self) -> list:
         return list(set(self.temp))
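One more WIP hazard in this parser: unique() is now a coroutine, yet unchanged context lines above (hostnames = self.unique(), users = self.unique(), hostnames.extend(self.unique())) still call it without await, so they would operate on coroutine objects rather than lists, and extend() over a coroutine raises TypeError. A self-contained toy illustrating the pitfall (not project code):

import asyncio

class Demo:
    def __init__(self):
        self.temp = ['a.example.com', 'a.example.com', 'b.example.com']

    async def unique(self) -> list:
        # mirrors the parser's unique(): de-duplicate via set()
        return list(set(self.temp))

async def main():
    d = Demo()
    # wrong = d.unique()          # binds a coroutine object; the body never runs
    hostnames = await d.unique()  # ['a.example.com', 'b.example.com'] (order varies)
    print(hostnames)

asyncio.run(main())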