mirror of https://github.com/laramies/theHarvester.git
WIP for twitter port to async

commit 16df13775e
parent d6f5a70f7f

3 changed files with 26 additions and 25 deletions
@@ -1,3 +1,4 @@
+from typing import Coroutine
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 
@@ -19,7 +20,7 @@ async def do_search(self):
             print(e)
         self.totalresults += self.results
 
-    async def get_hostnames(self) -> set:
+    async def get_hostnames(self) -> Coroutine:
         return myparser.Parser(self.results, self.word).hostnames()
 
     async def process(self):
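The set -> Coroutine annotation change is a knock-on effect of the port: myparser's hostnames() becomes a coroutine function in this same commit, and get_hostnames returns it without awaiting it, so the returned value really is a coroutine object. The more conventional shape, sketched here rather than taken from the commit, would be to await inside and keep the concrete return type:

    async def get_hostnames(self) -> set:
        # awaiting here keeps the annotation describing the parsed result,
        # not the un-awaited coroutine object
        return await myparser.Parser(self.results, self.word).hostnames()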
@@ -14,21 +14,21 @@ def __init__(self, word, limit):
         self.limit = int(limit)
         self.counter = 0
 
-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
         headers = {'User-Agent': Core.get_user_agent()}
         try:
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            request = (grequests.get(url, headers=headers) for url in urls)
-            response = grequests.imap(request, size=5)
+            request = (await AsyncFetcher.fetch_all([base_url], headers=headers) for url in urls)
+            response = request
             for entry in response:
-                self.totalresults += entry.content.decode('UTF-8')
+                self.totalresults += entry
         except Exception as error:
             print(error)
 
-    def get_people(self):
+    async def get_people(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        to_parse = rawres.people_twitter()
+        to_parse = await rawres.people_twitter()
         # fix invalid handles that look like @user other_output
         handles = set()
         for handle in to_parse:
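As ported, the fetch still has two WIP rough edges: a generator expression containing await is an async generator, which the plain `for entry in response` loop cannot iterate, and it fetches base_url on every pass instead of the paginated URL built for each `num`. A minimal sketch of where this could land, assuming AsyncFetcher.fetch_all takes a list of URLs plus a headers keyword and returns decoded bodies as strings (both implied by the diff itself):

    async def do_search(self) -> None:
        base_url = f'https://{self.server}/search?num=100&start=xx&hl=en&meta=&q=site%3Atwitter.com%20intitle%3A%22on+Twitter%22%20{self.word}'
        headers = {'User-Agent': Core.get_user_agent()}
        try:
            # range() already stops below self.limit, so no extra guard is needed
            urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10)]
            # fetch every paginated URL concurrently and accumulate the bodies
            for entry in await AsyncFetcher.fetch_all(urls, headers=headers):
                self.totalresults += entry
        except Exception as error:
            print(error)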
@@ -37,5 +37,5 @@ def get_people(self):
                 handles.add(result.group(0))
         return handles
 
-    def process(self):
-        self.do_search()
+    async def process(self):
+        await self.do_search()
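With process() now a coroutine, callers have to drive it from an event loop. A hypothetical usage sketch; the module and class names below are assumptions, since the diff does not show them (only the __init__(self, word, limit) signature is):

    import asyncio
    from theHarvester.discovery import twittersearch  # assumed module name

    async def main() -> None:
        search = twittersearch.SearchTwitter('example.com', 100)  # assumed class name
        await search.process()
        print(await search.get_people())

    asyncio.run(main())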
@@ -8,7 +8,7 @@ def __init__(self, results, word):
         self.word = word
         self.temp = []
 
-    def genericClean(self):
+    async def genericClean(self):
         self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>', '')\
             .replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\
             .replace('<wbr>', '').replace('</wbr>', '')
@@ -16,13 +16,13 @@ def genericClean(self):
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
             self.results = self.results.replace(search, ' ')
 
-    def urlClean(self):
+    async def urlClean(self):
         self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
         for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
             self.results = self.results.replace(search, ' ')
 
-    def emails(self):
-        self.genericClean()
+    async def emails(self):
+        await self.genericClean()
         # Local part is required, charset is flexible.
         # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly)
         reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', ''))
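For reference, the emails() pattern anchors on the target word, so it only returns addresses whose domain ends with it. A standalone check with hypothetical inputs:

    import re

    word = 'example.com'  # hypothetical target domain
    reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + word.replace('www.', ''))
    print(reg_emails.findall('contact john.doe@mail.example.com or admin@example.com'))
    # ['john.doe@mail.example.com', 'admin@example.com']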
@@ -33,7 +33,7 @@ def emails(self):
         # if email starts with dot shift email string and make sure all emails are lowercase
         return true_emails
 
-    def fileurls(self, file):
+    async def fileurls(self, file):
         urls = []
         reg_urls = re.compile('<a href="(.*?)"')
         self.temp = reg_urls.findall(self.results)
@@ -45,8 +45,8 @@ def fileurls(self, file):
             urls.append(iteration)
         return urls
 
-    def hostnames(self):
-        self.genericClean()
+    async def hostnames(self):
+        await self.genericClean()
         reg_hosts = re.compile(r'[a-zA-Z0-9.-]*\.' + self.word)
         self.temp = reg_hosts.findall(self.results)
         hostnames = self.unique()
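One caveat this commit does not touch: self.word is interpolated into the pattern unescaped, so the dots in the domain behave as wildcards and can over-match. Wrapping it in re.escape would pin them down, e.g.:

    import re

    word = 'example.com'  # hypothetical target domain
    reg_hosts = re.compile(r'[a-zA-Z0-9.-]*\.' + re.escape(word))
    print(reg_hosts.findall('www.example.com and mail.examplexcom'))
    # ['www.example.com'] -- without re.escape, 'mail.examplexcom' would also match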
@@ -55,7 +55,7 @@ def hostnames(self):
         hostnames.extend(self.unique())
         return list(set(hostnames))
 
-    def people_googleplus(self):
+    async def people_googleplus(self):
         self.results = re.sub('</b>', '', self.results)
         self.results = re.sub('<b>', '', self.results)
         reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+')
@@ -71,7 +71,7 @@ def people_googleplus(self):
             resul.append(delete)
         return resul
 
-    def hostnames_all(self):
+    async def hostnames_all(self):
         reg_hosts = re.compile('<cite>(.*?)</cite>')
         temp = reg_hosts.findall(self.results)
         for iteration in temp:
@@ -83,7 +83,7 @@ def hostnames_all(self):
         hostnames = self.unique()
         return hostnames
 
-    def links_linkedin(self):
+    async def links_linkedin(self):
         reg_links = re.compile(r"url=https:\/\/www\.linkedin.com(.*?)&")
         self.temp = reg_links.findall(self.results)
         resul = []
@@ -92,7 +92,7 @@ def links_linkedin(self):
             resul.append("https://www.linkedin.com" + final_url)
         return resul
 
-    def people_linkedin(self):
+    async def people_linkedin(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -106,7 +106,7 @@ def people_linkedin(self):
             resul.append(delete)
         return resul
 
-    def people_twitter(self):
+    async def people_twitter(self):
         reg_people = re.compile(r'(@[a-zA-Z0-9._ -]*)')
         self.temp = reg_people.findall(self.results)
         users = self.unique()
@@ -121,7 +121,7 @@ def people_twitter(self):
             resul.append(delete)
         return resul
 
-    def profiles(self):
+    async def profiles(self):
         reg_people = re.compile(r'">[a-zA-Z0-9._ -]* - <em>Google Profile</em>')
         self.temp = reg_people.findall(self.results)
         resul = []
@@ -133,7 +133,7 @@ def profiles(self):
             resul.append(delete)
         return resul
 
-    def set(self):
+    async def set(self):
         reg_sets = re.compile(r'>[a-zA-Z0-9]*</a></font>')
         self.temp = reg_sets.findall(self.results)
         sets = []
@@ -143,10 +143,10 @@ def set(self):
             sets.append(delete)
         return sets
 
-    def urls(self):
+    async def urls(self):
         found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z0-9\-_\.]+/?)*', self.results)
         urls = {match.group().strip() for match in found}
         return urls
 
-    def unique(self) -> list:
+    async def unique(self) -> list:
         return list(set(self.temp))
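The parser methods perform no I/O, so marking them async adds no concurrency by itself; the point is presumably a uniform await-based call convention across the now-async pipeline. Note that as of this WIP the unchanged context lines still call self.unique() without await (e.g. in hostnames() and people_twitter()), which would now hand back a coroutine instead of a list. Every call site changes accordingly; a hypothetical snippet using the Parser signature shown above:

    import asyncio
    from theHarvester.parsers import myparser

    async def main() -> None:
        parser = myparser.Parser('<cite>mail.example.com</cite>', 'example.com')  # hypothetical inputs
        print(await parser.hostnames())  # was: parser.hostnames()

    asyncio.run(main())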