diff --git a/theHarvester/discovery/threatcrowd.py b/theHarvester/discovery/threatcrowd.py
index 2b396c4e..cbdbcda3 100644
--- a/theHarvester/discovery/threatcrowd.py
+++ b/theHarvester/discovery/threatcrowd.py
@@ -1,3 +1,4 @@
+from typing import Coroutine
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 
@@ -19,7 +20,7 @@ async def do_search(self):
             print(e)
         self.totalresults += self.results
 
-    async def get_hostnames(self) -> set:
+    async def get_hostnames(self) -> Coroutine:
         return myparser.Parser(self.results, self.word).hostnames()
 
     async def process(self):
diff --git a/theHarvester/discovery/twittersearch.py b/theHarvester/discovery/twittersearch.py
index e4a04e7b..0c18e790 100644
--- a/theHarvester/discovery/twittersearch.py
+++ b/theHarvester/discovery/twittersearch.py
@@ -14,21 +14,21 @@ def __init__(self, word, limit):
         self.limit = int(limit)
         self.counter = 0
 
-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
         headers = {'User-Agent': Core.get_user_agent()}
         try:
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            request = (grequests.get(url, headers=headers) for url in urls)
-            response = grequests.imap(request, size=5)
+            request = (await AsyncFetcher.fetch_all([base_url], headers=headers) for url in urls)
+            response = request
             for entry in response:
-                self.totalresults += entry.content.decode('UTF-8')
+                self.totalresults += entry
         except Exception as error:
             print(error)
 
-    def get_people(self):
+    async def get_people(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        to_parse = rawres.people_twitter()
+        to_parse = await rawres.people_twitter()
         # fix invalid handles that look like @user other_output
         handles = set()
         for handle in to_parse:
@@ -37,5 +37,5 @@ def get_people(self):
                 handles.add(result.group(0))
         return handles
 
-    def process(self):
-        self.do_search()
+    async def process(self):
+        await self.do_search()
diff --git a/theHarvester/parsers/myparser.py b/theHarvester/parsers/myparser.py
index 16a74865..63feb6ba 100644
--- a/theHarvester/parsers/myparser.py
+++ b/theHarvester/parsers/myparser.py
@@ -8,7 +8,7 @@ def __init__(self, results, word):
         self.word = word
         self.temp = []
 
-    def genericClean(self):
+    async def genericClean(self):
         self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>', '')\
             .replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\
             .replace('<wbr>', '').replace('</wbr>', '')
@@ -16,13 +16,13 @@ def __init__(self, results, word):
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
             self.results = self.results.replace(search, ' ')
 
-    def urlClean(self):
+    async def urlClean(self):
         self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
             self.results = self.results.replace(search, ' ')
 
-    def emails(self):
-        self.genericClean()
+    async def emails(self):
+        await self.genericClean()
         # Local part is required, charset is flexible.
         # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly)
         reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', ''))
@@ -33,7 +33,7 @@ def emails(self):
         # if email starts with dot shift email string and make sure all emails are lowercase
         return true_emails
 
-    def fileurls(self, file):
+    async def fileurls(self, file):
         urls = []
         reg_urls = re.compile('<a href="(.*?)"')
         temp = reg_urls.findall(self.results)
@@ -44,7 +44,7 @@ def fileurls(self, file):
                 urls.append(iteration)
         return urls
 
-    def people_googleplus(self):
+    async def people_googleplus(self):
         self.results = re.sub('<em>', '', self.results)
         self.results = re.sub('</em>', '', self.results)
         reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+')
@@ -71,7 +71,7 @@ def people_googleplus(self):
                 resul.append(delete)
         return resul
 
-    def hostnames_all(self):
+    async def hostnames_all(self):
         reg_hosts = re.compile('<cite>(.*?)</cite>')
         temp = reg_hosts.findall(self.results)
         for iteration in temp:
@@ -83,7 +83,7 @@ def hostnames_all(self):
         hostnames = self.unique()
         return hostnames
 
-    def links_linkedin(self):
+    async def links_linkedin(self):
         reg_links = re.compile(r"url=https:\/\/www\.linkedin.com(.*?)&")
         self.temp = reg_links.findall(self.results)
         resul = []
@@ -92,7 +92,7 @@ def links_linkedin(self):
             resul.append("https://www.linkedin.com" + final_url)
         return resul
 
-    def people_linkedin(self):
+    async def people_linkedin(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -106,7 +106,7 @@ def people_linkedin(self):
                 resul.append(delete)
         return resul
 
-    def people_twitter(self):
+    async def people_twitter(self):
         reg_people = re.compile(r'(@[a-zA-Z0-9._ -]*)')
         self.temp = reg_people.findall(self.results)
         users = self.unique()
@@ -121,7 +121,7 @@ def people_twitter(self):
                 resul.append(delete)
         return resul
 
-    def profiles(self):
+    async def profiles(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* - Google Profile')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -133,7 +133,7 @@ def profiles(self):
                 resul.append(delete)
         return resul
 
-    def set(self):
+    async def set(self):
         reg_sets = re.compile(r'>[a-zA-Z0-9]*')
         self.temp = reg_sets.findall(self.results)
         sets = []
@@ -143,10 +143,10 @@ def set(self):
             sets.append(delete)
         return sets
 
-    def urls(self):
+    async def urls(self):
         found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z0-9\-_\.]+/?)*', self.results)
         urls = {match.group().strip() for match in found}
         return urls
 
-    def unique(self) -> list:
+    async def unique(self) -> list:
         return list(set(self.temp))
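
Usage note (not part of the diff): after this change every search module and parser method is a coroutine, so callers have to drive them from an event loop. The sketch below is illustrative only; it assumes the class defined in twittersearch.py is named SearchTwitter (the diff itself only shows its __init__(self, word, limit), do_search, get_people and process methods), and the domain and limit values are placeholders.

    import asyncio

    from theHarvester.discovery import twittersearch

    async def main():
        # word and limit mirror the __init__(self, word, limit) signature shown in the diff
        search = twittersearch.SearchTwitter('example.com', 100)
        await search.process()               # process() now awaits do_search()
        people = await search.get_people()   # get_people() is a coroutine after this change
        print(people)

    if __name__ == '__main__':
        asyncio.run(main())

The same pattern applies to the parser: every myparser.Parser method now returns a coroutine, which is presumably why get_hostnames() in threatcrowd.py is re-annotated as -> Coroutine; it returns the un-awaited hostnames() coroutine for its caller to await.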