WIP for twitter port to async

L1ghtn1ng 2020-01-01 21:59:59 +00:00
parent d6f5a70f7f
commit 16df13775e
3 changed files with 26 additions and 25 deletions

View file

@@ -1,3 +1,4 @@
+from typing import Coroutine
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 
@@ -19,7 +20,7 @@ async def do_search(self):
             print(e)
         self.totalresults += self.results
 
-    async def get_hostnames(self) -> set:
+    async def get_hostnames(self) -> Coroutine:
         return myparser.Parser(self.results, self.word).hostnames()
 
     async def process(self):
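A note on the annotation change above: for an async def, the return annotation names the awaited result, so -> Coroutine is only accurate here because get_hostnames returns the parser's coroutine without awaiting it (hostnames() also becomes async in this commit). A minimal sketch of the more conventional shape, assuming the parser call is awaited inside:

async def get_hostnames(self) -> set:
    # Awaiting the now-async parser lets callers receive the set directly,
    # keeping the original '-> set' annotation accurate.
    return await myparser.Parser(self.results, self.word).hostnames()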

View file

@@ -14,21 +14,21 @@ def __init__(self, word, limit):
         self.limit = int(limit)
         self.counter = 0
 
-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
         headers = {'User-Agent': Core.get_user_agent()}
         try:
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            request = (grequests.get(url, headers=headers) for url in urls)
-            response = grequests.imap(request, size=5)
+            request = (await AsyncFetcher.fetch_all([base_url], headers=headers) for url in urls)
+            response = request
             for entry in response:
-                self.totalresults += entry.content.decode('UTF-8')
+                self.totalresults += entry
         except Exception as error:
             print(error)
 
-    def get_people(self):
+    async def get_people(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        to_parse = rawres.people_twitter()
+        to_parse = await rawres.people_twitter()
         # fix invalid handles that look like @user other_output
         handles = set()
         for handle in to_parse:
@@ -37,5 +37,5 @@ def get_people(self):
                 handles.add(result.group(0))
         return handles
 
-    def process(self):
-        self.do_search()
+    async def process(self):
+        await self.do_search()
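Two hazards in the do_search() rewrite above. First, a generator expression containing await is an async generator, so the plain for loop that follows would raise TypeError (it needs async for). Second, every iteration fetches base_url, which still contains the literal 'xx' placeholder, rather than the paginated url. The diff itself shows AsyncFetcher.fetch_all taking a list of URLs and a headers keyword, so here is a minimal sketch of where this WIP presumably heads (signature assumed from the call above):

async def do_search(self):
    base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
    headers = {'User-Agent': Core.get_user_agent()}
    try:
        # range() already stops below self.limit, so the 'num <= self.limit' filter is redundant
        urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10)]
        # fetch_all accepts the whole URL list, so no per-URL generator is needed
        responses = await AsyncFetcher.fetch_all(urls, headers=headers)
        for entry in responses:
            self.totalresults += entry
    except Exception as error:
        print(error)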

View file

@@ -8,7 +8,7 @@ def __init__(self, results, word):
         self.word = word
         self.temp = []
 
-    def genericClean(self):
+    async def genericClean(self):
         self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>', '')\
             .replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\
             .replace('<wbr>', '').replace('</wbr>', '')
@@ -16,13 +16,13 @@ def genericClean(self):
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
             self.results = self.results.replace(search, ' ')
 
-    def urlClean(self):
+    async def urlClean(self):
         self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
             self.results = self.results.replace(search, ' ')
 
-    def emails(self):
-        self.genericClean()
+    async def emails(self):
+        await self.genericClean()
         # Local part is required, charset is flexible.
         # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly)
         reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', ''))
@@ -33,7 +33,7 @@ def emails(self):
         # if email starts with dot shift email string and make sure all emails are lowercase
         return true_emails
 
-    def fileurls(self, file):
+    async def fileurls(self, file):
         urls = []
         reg_urls = re.compile('<a href="(.*?)"')
         self.temp = reg_urls.findall(self.results)
@@ -45,8 +45,8 @@ def fileurls(self, file):
                 urls.append(iteration)
         return urls
 
-    def hostnames(self):
-        self.genericClean()
+    async def hostnames(self):
+        await self.genericClean()
         reg_hosts = re.compile(r'[a-zA-Z0-9.-]*\.' + self.word)
         self.temp = reg_hosts.findall(self.results)
         hostnames = self.unique()
@@ -55,7 +55,7 @@ def hostnames(self):
             hostnames.extend(self.unique())
         return list(set(hostnames))
 
-    def people_googleplus(self):
+    async def people_googleplus(self):
         self.results = re.sub('</b>', '', self.results)
         self.results = re.sub('<b>', '', self.results)
         reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+')
@@ -71,7 +71,7 @@ def people_googleplus(self):
             resul.append(delete)
         return resul
 
-    def hostnames_all(self):
+    async def hostnames_all(self):
         reg_hosts = re.compile('<cite>(.*?)</cite>')
         temp = reg_hosts.findall(self.results)
         for iteration in temp:
@@ -83,7 +83,7 @@ def hostnames_all(self):
         hostnames = self.unique()
         return hostnames
 
-    def links_linkedin(self):
+    async def links_linkedin(self):
         reg_links = re.compile(r"url=https:\/\/www\.linkedin.com(.*?)&")
         self.temp = reg_links.findall(self.results)
         resul = []
@@ -92,7 +92,7 @@ def links_linkedin(self):
             resul.append("https://www.linkedin.com" + final_url)
         return resul
 
-    def people_linkedin(self):
+    async def people_linkedin(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -106,7 +106,7 @@ def people_linkedin(self):
             resul.append(delete)
         return resul
 
-    def people_twitter(self):
+    async def people_twitter(self):
         reg_people = re.compile(r'(@[a-zA-Z0-9._ -]*)')
         self.temp = reg_people.findall(self.results)
         users = self.unique()
@@ -121,7 +121,7 @@ def people_twitter(self):
             resul.append(delete)
         return resul
 
-    def profiles(self):
+    async def profiles(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* - <em>Google Profile</em>')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -133,7 +133,7 @@ def profiles(self):
             resul.append(delete)
         return resul
 
-    def set(self):
+    async def set(self):
         reg_sets = re.compile(r'>[a-zA-Z0-9]*</a></font>')
         self.temp = reg_sets.findall(self.results)
         sets = []
@@ -143,10 +143,10 @@ def set(self):
             sets.append(delete)
         return sets
 
-    def urls(self):
+    async def urls(self):
         found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z0-9\-_\.]+/?)*', self.results)
         urls = {match.group().strip() for match in found}
         return urls
 
-    def unique(self) -> list:
+    async def unique(self) -> list:
         return list(set(self.temp))
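One more WIP hazard in this parser: unique() is now a coroutine, yet unchanged context lines above (hostnames = self.unique(), users = self.unique(), hostnames.extend(self.unique())) still call it without await, so they would operate on coroutine objects rather than lists, and extend() over a coroutine raises TypeError. A self-contained toy illustrating the pitfall (not project code):

import asyncio

class Demo:
    def __init__(self):
        self.temp = ['a.example.com', 'a.example.com', 'b.example.com']

    async def unique(self) -> list:
        # mirrors the parser's unique(): de-duplicate via set()
        return list(set(self.temp))

async def main():
    d = Demo()
    # wrong = d.unique()          # binds a coroutine object; the body never runs
    hostnames = await d.unique()  # ['a.example.com', 'b.example.com'] (order varies)
    print(hostnames)

asyncio.run(main())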