Mirror of https://github.com/laramies/theHarvester.git, synced 2024-09-22 00:06:30 +08:00
Merge pull request #66 from NotoriousRebel/master
Ported github-code to use aiohttp.
This commit is contained in: c6e8077af9

.gitignore (vendored): 2 changes
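The port replaces blocking requests calls with aiohttp coroutines across the affected modules. As a rough sketch of the pattern being applied (illustrative only, not code from this diff; the function name and arguments are placeholders):

    import aiohttp

    async def fetch(url: str, headers: dict):
        # Where the old code called requests.get(url, headers=headers),
        # the async version opens a ClientSession and awaits the response.
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url) as resp:
                return await resp.text(), resp.status, resp.links

Callers become coroutines as well and are driven with asyncio, for example asyncio.run(fetch(url, headers)).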
@@ -10,4 +10,4 @@ venv
.pytest_cache
build/
dist/
theHarvester.egg-info/
api-keys.yaml

@@ -1,7 +1,12 @@
 from theHarvester.lib.core import *
 from typing import Union
 import random
+import aiohttp
+import re
+from bs4 import BeautifulSoup

-googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'
+googleUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 ' \
+           'Safari/537.36 '


 def splitter(links):

@@ -52,7 +57,7 @@ def getDelay() -> float:
     return random.randint(1, 3) - .5


-def search(text: str) -> bool:
+async def search(text: str) -> bool:
     # Helper function to check if Google has blocked traffic.
     for line in text.strip().splitlines():
         if 'This page appears when Google automatically detects requests coming from your computer network' in line \

@@ -62,13 +67,12 @@ def search(text: str) -> bool:
     return False


-def google_workaround(visit_url: str) -> Union[bool, str]:
+async def google_workaround(visit_url: str) -> Union[bool, str]:
     """
     Function that makes a request on our behalf, if Google starts to block us
     :param visit_url: Url to scrape
     :return: Correct html that can be parsed by BS4
     """
-    import requests
     url = 'https://websniffer.cc/'
     data = {
         'Cookie': '',

@@ -77,12 +81,20 @@ def google_workaround(visit_url: str) -> Union[bool, str]:
         'type': 'GET&http=1.1',
         'uak': str(random.randint(4, 8)) # select random UA to send to Google
     }
-    resp = requests.post(url, headers={'User-Agent': googleUA}, data=data)
-    returned_html = resp.text
-    if search(returned_html):
+    import requests
+    returned_html = requests.post(url, data=data, headers={'User-Agent': Core.get_user_agent()})
+    returned_html = returned_html.text
+    # TODO FIX
+    #returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data)
+    import pprint as p
+    print('returned html')
+    p.pprint(returned_html, indent=4)
+    returned_html = "This page appears when Google automatically detects requests coming from your computer network"
+    if await search(returned_html):
+        print('going to second method!')
         # indicates that google is serving workaround a captcha
         # TODO rework workaround with more websites to send requests on our behalf or utilize proxies option in request
-        return True
+        # That means we will try out second option which will utilize proxies
+        return await second_method(visit_url)
     # the html we get is malformed for BS4 as there are no greater than or less than signs
     if '<html>' in returned_html:
         start_index = returned_html.index('<html>')

@@ -96,6 +108,104 @@ def google_workaround(visit_url: str) -> Union[bool, str]:
     return correct_html


+async def request(url, params):
+    headers = {'User-Agent': Core.get_user_agent()}
+    session = aiohttp.ClientSession(headers=headers)
+    results = await AsyncFetcher.fetch(session, url=url, params=params)
+    await session.close()
+    return results
+
+
+async def proxy_fetch(session, url, proxy):
+    try:
+        async with session.get(url, proxy=proxy, ssl=False) as resp:
+            return f'success:{proxy}', await resp.text()
+    except Exception as e:
+        # print(e)
+        return f'failed:{proxy}', proxy
+
+
+async def proxy_test(proxies, url):
+    print('doing proxy test with this number of proxies: ', len(proxies))
+    headers = {'User-Agent': Core.get_user_agent()}
+    timeout = aiohttp.ClientTimeout(total=40)
+    async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
+        texts = await asyncio.gather(*[proxy_fetch(session, url, proxy) for proxy in proxies])
+        return texts
+
+
+async def get_proxies():
+    print('inside get proxies')
+    # ideas borrowed and modified from twitterscraper
+    proxy_url = 'https://free-proxy-list.net/'
+    response = await AsyncFetcher.fetch_all([proxy_url])
+    response = response[0]
+    soup = BeautifulSoup(response, 'lxml')
+    table = soup.find('table', id='proxylisttable')
+    list_tr = table.find_all('tr')
+    list_td = [elem.find_all('td') for elem in list_tr]
+    list_td = [x for x in list_td if x is not None and len(x) > 0]
+    list_ip = [elem[0].text for elem in list_td]
+    list_ports = [elem[1].text for elem in list_td]
+    list_proxies = [f"http://{':'.join(elem)}" for elem in list(zip(list_ip, list_ports))]
+    return list_proxies
+
+
+async def clean_dct(dct: dict, second_test=False):
+    print('cleaning dct and second test is: ', second_test)
+    good_proxies = set()
+    for proxy, text in dct.items():
+        if 'failed' not in proxy:
+            if second_test:
+                if await search(text) is False:
+                    print(text)
+                    return text
+            else:
+                good_proxies.add(proxy[proxy.find(':') + 1:])
+    return good_proxies if second_test is False else True
+
+
+async def create_init_proxies():
+    print('inside create init proxies')
+    url = "https://suip.biz"
+    first_param = [url, (('act', 'proxy1'),), ]
+    second_param = [url, (('act', 'proxy2'),), ]
+    third_param = [url, (('act', 'proxy3'),), ]
+    async_requests = [
+        request(url=url, params=params)
+        for url, params in [first_param, second_param, third_param]
+    ]
+    results = await asyncio.gather(*async_requests)
+    proxy_set = set()
+    for resp in results:
+        ip_candidates = re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', resp)
+        proxy_set.update({f'http://{ip}' for ip in ip_candidates})
+
+    new_proxies = await get_proxies()
+    proxy_set.update({proxy for proxy in new_proxies})
+    return proxy_set
+
+
+async def second_method(url: str) -> Union[str, bool]:
+    print('inside second method')
+    # First visit example.com to make to filter out bad proxies
+    init_url = "http://example.com"
+    proxy_set = await create_init_proxies()
+    tuples = await proxy_test(proxy_set, init_url)
+    mega_dct = dict((x, y) for x, y in tuples)
+    proxy_set = await clean_dct(mega_dct)
+    # After we clean our proxy set now we use them to visit the url we care about
+    print('got working proxies now onto the juice')
+    tuples = await proxy_test(proxy_set, url)
+    mega_dct = dict((x, y) for x, y in tuples)
+    results = await clean_dct(mega_dct, second_test=True)
+    print('returning the juice')
+    # pass in second_test flag as True to indicate this will
+    # the text we care about or a bool to indicate it was
+    # not successful
+    return results
+
+
 class MissingKey(Exception):

     def __init__(self, identity_flag: bool):

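The second_method helpers added above scrape candidate proxies, probe them all concurrently against a known-good site, and reuse the survivors against the real target. A minimal standalone sketch of that gather-and-filter step (illustrative only; these names are not from the diff):

    import asyncio
    import aiohttp

    async def probe(session: aiohttp.ClientSession, proxy: str, url: str):
        # Return the proxy if it can fetch the url, otherwise None.
        try:
            async with session.get(url, proxy=proxy, ssl=False) as resp:
                await resp.text()
                return proxy
        except Exception:
            return None

    async def working_proxies(proxies, url: str = 'http://example.com'):
        timeout = aiohttp.ClientTimeout(total=40)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            results = await asyncio.gather(*(probe(session, proxy, url) for proxy in proxies))
        return {proxy for proxy in results if proxy}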
@@ -1,10 +1,9 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-from requests import Response
-import time
-from typing import List, Dict, Any, Optional, NamedTuple
+from typing import List, Dict, Any, Optional, NamedTuple, Tuple
+import asyncio
+import aiohttp
 import urllib.parse as urlparse

@@ -40,20 +39,21 @@ def __init__(self, word, limit):
             raise MissingKey(True)

     @staticmethod
-    def fragments_from_response(response: Response) -> List[str]:
-        items: List[Dict[str, Any]] = response.json().get('items') or list()
+    async def fragments_from_response(json_data: dict) -> List[str]:
+        items: List[Dict[str, Any]] = json_data.get('items') or list()
         fragments: List[str] = list()
         for item in items:
             matches = item.get("text_matches") or list()
             for match in matches:
                 fragments.append(match.get("fragment"))

         return [fragment for fragment in fragments if fragment is not None]

     @staticmethod
-    def page_from_response(page: str, response: Response) -> Optional[Any]:
-        page_link = response.links.get(page)
+    async def page_from_response(page: str, links) -> Optional[Any]:
+        page_link = links.get(page)
         if page_link:
-            parsed = urlparse.urlparse(page_link.get("url"))
+            parsed = urlparse.urlparse(str(page_link.get("url")))
             params = urlparse.parse_qs(parsed.query)
             pages: List[Any] = params.get('page', [None])
             page_number = pages[0] and int(pages[0])

@@ -61,21 +61,22 @@ def page_from_response(page: str, response: Response) -> Optional[Any]:
         else:
             return None

-    def handle_response(self, response: Response) -> Optional[Any]:
-        if response.ok:
-            results = self.fragments_from_response(response)
-            next_page = self.page_from_response("next", response)
-            last_page = self.page_from_response("last", response)
+    async def handle_response(self, response: Tuple[str, dict, int, Any]):
+        text, json_data, status, links = response
+        if status == 200:
+            results = await self.fragments_from_response(json_data)
+            next_page = await self.page_from_response("next", links)
+            last_page = await self.page_from_response("last", links)
             return SuccessResult(results, next_page, last_page)
-        elif response.status_code == 429 or response.status_code == 403:
+        elif status == 429 or status == 403:
             return RetryResult(60)
         else:
             try:
-                return ErrorResult(response.status_code, response.json())
+                return ErrorResult(status, json_data)
             except ValueError:
-                return ErrorResult(response.status_code, response.text)
+                return ErrorResult(status, text)

-    def do_search(self, page: Optional[int]) -> Response:
+    async def do_search(self, page: Optional[int]) -> Tuple[str, dict, int, Any]:
         if page is None:
             url = f'https://{self.server}/search/code?q="{self.word}"'
         else:

@@ -84,37 +85,41 @@ def do_search(self, page: Optional[int]) -> Response:
             'Host': self.server,
             'User-agent': Core.get_user_agent(),
             'Accept': "application/vnd.github.v3.text-match+json",
-            'Authorization': 'token {}'.format(self.key)
+            'Authorization': f'token {self.key}'
         }
-        return requests.get(url=url, headers=headers, verify=True)
+        async with aiohttp.ClientSession(headers=headers) as sess:
+            async with sess.get(url) as resp:
+                return await resp.text(), await resp.json(), resp.status, resp.links

     @staticmethod
-    def next_page_or_end(result: SuccessResult) -> Optional[int]:
+    async def next_page_or_end(result: SuccessResult) -> Optional[int]:
         if result.next_page is not None:
             return result.next_page
         else:
             return result.last_page

-    def process(self):
-        while self.counter <= self.limit and self.page is not None:
-            api_response = self.do_search(self.page)
-            result = self.handle_response(api_response)
-            if type(result) == SuccessResult:
-                print(f'\tSearching {self.counter} results.')
-                for fragment in result.fragments:
-                    self.total_results += fragment
-                    self.counter = self.counter + 1
-                self.page = self.next_page_or_end(result)
-                time.sleep(getDelay())
-            elif type(result) == RetryResult:
-                sleepy_time = getDelay() + result.time
-                print(f'\tRetrying page in {sleepy_time} seconds...')
-                time.sleep(sleepy_time)
-            elif type(result) == ErrorResult:
-                raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
-            else:
-                raise Exception("\tUnknown exception occurred")
+    async def process(self):
+        try:
+            while self.counter <= self.limit and self.page is not None:
+                api_response = await self.do_search(self.page)
+                result = await self.handle_response(api_response)
+                if type(result) == SuccessResult:
+                    print(f'\tSearching {self.counter} results.')
+                    for fragment in result.fragments:
+                        self.total_results += fragment
+                        self.counter = self.counter + 1
+                    self.page = await self.next_page_or_end(result)
+                    await asyncio.sleep(getDelay())
+                elif type(result) == RetryResult:
+                    sleepy_time = getDelay() + result.time
+                    print(f'\tRetrying page in {sleepy_time} seconds...')
+                    await asyncio.sleep(sleepy_time)
+                elif type(result) == ErrorResult:
+                    raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
+                else:
+                    raise Exception("\tUnknown exception occurred")
+        except Exception as e:
+            print(f'An exception has occurred: {e}')

     async def get_emails(self):
         rawres = myparser.Parser(self.total_results, self.word)

@@ -1,8 +1,6 @@
 from theHarvester.discovery.constants import *
 from theHarvester.parsers import myparser
-import requests
-import time
+import asyncio


 class SearchGoogle:

@@ -18,85 +16,91 @@ def __init__(self, word, limit, start):
         self.limit = limit
         self.counter = start

-    def do_search(self):
+    async def do_search(self):
         # Do normal scraping.
         urly = 'http://' + self.server + '/search?num=' + self.quantity + '&start=' + str(
             self.counter) + '&hl=en&meta=&q=%40\"' + self.word + '\"'
         try:
             headers = {'User-Agent': googleUA}
-            r = requests.get(urly, headers=headers)
+            resp = await AsyncFetcher.fetch_all([urly], headers=headers)
         except Exception as e:
             print(e)
-        self.results = r.text
-        if search(self.results):
+        self.results = resp[0]
+        searched = await search(self.results)
+        if searched:
             try:
-                self.results = google_workaround(urly)
+                self.results = await google_workaround(urly)
+                print('self.results: ', self.results)
+                p.pprint(self.results, indent=4)
                 if isinstance(self.results, bool):
                     print('Google is blocking your ip and the workaround, returning')
                     return
-            except Exception:
+            except Exception as e:
+                print(e)
+                import traceback as t
+                t.print_exc()
                 # google blocked, no useful result
                 return
-        time.sleep(getDelay())
+        await asyncio.sleep(getDelay())
         self.totalresults += self.results

-    def do_search_profiles(self):
+    async def do_search_profiles(self):
         urly = 'http://' + self.server + '/search?num=' + self.quantity + '&start=' + str(
             self.counter) + '&hl=en&meta=&q=site:www.google.com%20intitle:\"Google%20Profile\"%20\"Companies%20I%27ve%20worked%20for\"%20\"at%20' + self.word + '\"'
         try:
             headers = {'User-Agent': googleUA}
-            r = requests.get(urly, headers=headers)
+            resp = await AsyncFetcher.fetch_all([urly], headers=headers)
         except Exception as e:
             print(e)
-        self.results = r.text
-        if search(self.results):
+        self.results = resp[0]
+        if await search(self.results):
             try:
-                self.results = google_workaround(urly)
+                self.results = await google_workaround(urly)
                 if isinstance(self.results, bool):
                     print('Google is blocking your ip and the workaround, returning')
                     return
             except Exception:
                 # google blocked, no useful result
                 return
-        time.sleep(getDelay())
+        await asyncio.sleep(getDelay())
         self.totalresults += self.results

-    def get_emails(self):
+    async def get_emails(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        return rawres.emails()
+        return await rawres.emails()

-    def get_hostnames(self):
+    async def get_hostnames(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        return rawres.hostnames()
+        return await rawres.hostnames()

-    def get_files(self):
+    async def get_files(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.fileurls(self.files)

-    def get_profiles(self):
+    async def get_profiles(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.profiles()

-    def process(self, google_dorking):
+    async def process(self, google_dorking):
         if google_dorking is False:
             while self.counter <= self.limit and self.counter <= 1000:
-                self.do_search()
+                await self.do_search()
                 print(f'\tSearching {self.counter} results.')
                 self.counter += 100
         else: # Google dorking is true.
             self.counter = 0 # Reset counter.
             print('\n')
             print('[-] Searching with Google Dorks: ')
-            self.googledork() # Call Google dorking method if user wanted it!
+            await self.googledork() # Call Google dorking method if user wanted it!

-    def process_profiles(self):
+    async def process_profiles(self):
         while self.counter < self.limit:
-            self.do_search_profiles()
-            time.sleep(getDelay())
+            await self.do_search_profiles()
+            await asyncio.sleep(getDelay())
             self.counter += 100
             print(f'\tSearching {self.counter} results.')

-    def append_dorks(self):
+    async def append_dorks(self):
         # Wrap in try-except incase filepaths are messed up.
         try:
             with open('wordlists/dorks.txt', mode='r') as fp:

@@ -104,7 +108,7 @@ def append_dorks(self):
         except FileNotFoundError as error:
             print(error)

-    def construct_dorks(self):
+    async def construct_dorks(self):
         # Format is: site:targetwebsite.com + space + inurl:admindork
         colon = '%3A'
         plus = '%2B'

@@ -128,12 +132,12 @@ def construct_dorks(self):
             .replace('&', ampersand).replace('(', left_peren).replace(')', right_peren).replace('|', pipe) + space + self.word
             for dork in self.dorks)

-    def googledork(self):
-        self.append_dorks() # Call functions to create list.
-        self.construct_dorks()
-        self.send_dorks()
+    async def googledork(self):
+        await self.append_dorks() # Call functions to create list.
+        await self.construct_dorks()
+        await self.send_dorks()

-    def send_dorks(self): # Helper function to minimize code reusability.
+    async def send_dorks(self): # Helper function to minimize code reusability.
         headers = {'User-Agent': googleUA}
         # Get random user agent to try and prevent google from blocking IP.
         for num in range(len(self.links)):

@@ -141,18 +145,18 @@ def send_dorks(self): # Helper function to minimize code reusability.
                 if num % 10 == 0 and num > 0:
                     print(f'\tSearching through {num} results')
                 link = self.links[num]
-                req = requests.get(link, headers=headers)
-                self.results = req.text
-                if search(self.results):
+                req = await AsyncFetcher.fetch_all([link], headers=headers)
+                self.results = req[0]
+                if await search(self.results):
                     try:
-                        self.results = google_workaround(link)
+                        self.results = await google_workaround(link)
                         if isinstance(self.results, bool):
                             print('Google is blocking your ip and the workaround, returning')
                             return
                     except Exception:
                         # google blocked, no useful result
                         return
-                time.sleep(getDelay())
+                await asyncio.sleep(getDelay())
                 self.totalresults += self.results
             except Exception as e:
                 print(f'\tException Occurred {e}')

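With do_search, process, and the getters now coroutines, callers have to await them from an event loop. A hedged usage sketch (the module path and argument values are assumed, not shown in this diff):

    import asyncio
    from theHarvester.discovery import googlesearch  # assumed module path

    async def main():
        engine = googlesearch.SearchGoogle('example.com', 100, 0)  # word, limit, start
        await engine.process(google_dorking=False)
        print(await engine.get_emails())

    asyncio.run(main())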
@@ -1,8 +1,7 @@
 from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
-import requests
-import time
+import asyncio


 class SearchLinkedin:

@@ -16,15 +15,15 @@ def __init__(self, word, limit):
         self.limit = int(limit)
         self.counter = 0

-    def do_search(self):
+    async def do_search(self):
         urly = 'http://' + self.server + '/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.word
         try:
             headers = {'User-Agent': Core.get_user_agent()}
-            r = requests.get(urly, headers=headers)
-            self.results = r.text
-            if search(self.results):
+            resp = await AsyncFetcher.fetch_all([urly], headers=headers)
+            self.results = resp[0]
+            if await search(self.results):
                 try:
-                    self.results = google_workaround(urly)
+                    self.results = await google_workaround(urly)
                     if isinstance(self.results, bool):
                         print('Google is blocking your ip and the workaround, returning')
                         return

@@ -33,20 +32,20 @@ def do_search(self):
                     return
         except Exception as e:
             print(e)
-        time.sleep(getDelay())
+        await asyncio.sleep(getDelay())
         self.totalresults += self.results

-    def get_people(self):
+    async def get_people(self):
         rawres = myparser.Parser(self.totalresults, self.word)
-        return rawres.people_linkedin()
+        return await rawres.people_linkedin()

-    def get_links(self):
+    async def get_links(self):
         links = myparser.Parser(self.totalresults, self.word)
-        return splitter(links.links_linkedin())
+        return splitter(await links.links_linkedin())

-    def process(self):
+    async def process(self):
         while self.counter < self.limit:
-            self.do_search()
-            time.sleep(getDelay())
+            await self.do_search()
+            await asyncio.sleep(getDelay())
             self.counter += 100
             print(f'\tSearching {self.counter} results.')

@@ -2,7 +2,7 @@
 from theHarvester.parsers import myparser
-import requests
 import random
-import time
+import asyncio


 class SearchTrello:

@@ -18,54 +18,54 @@ def __init__(self, word):
         self.hostnames = []
         self.counter = 0

-    def do_search(self):
+    async def do_search(self):
         base_url = f'https://{self.server}/search?num=300&start=xx&hl=en&q=site%3Atrello.com%20{self.word}'
         urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 20) if num <= self.limit]
         # limit is 20 as that is the most results google will show per num
         headers = {'User-Agent': googleUA}
         for url in urls:
             try:
-                resp = requests.get(url, headers=headers)
-                self.results = resp.text
-                if search(self.results):
+                resp = await AsyncFetcher.fetch_all([url], headers=headers)
+                self.results = resp[0]
+                if await search(self.results):
                     try:
-                        self.results = google_workaround(base_url)
+                        self.results = await google_workaround(base_url)
                         if isinstance(self.results, bool):
                             print('Google is blocking your ip and the workaround, returning')
                             return
                     except Exception as e:
                         print(e)
                 self.totalresults += self.results
-                time.sleep(getDelay() - .5)
+                await asyncio.sleep(getDelay() - .5)
             except Exception as e:
                 print(f'An exception has occurred in trello: {e}')

-    def get_emails(self):
+    async def get_emails(self):
         rawres = myparser.Parser(self.totalresults, self.word)
         return rawres.emails()

-    def get_urls(self):
+    async def get_urls(self):
         try:
             rawres = myparser.Parser(self.totalresults, 'trello.com')
-            self.trello_urls = set(rawres.urls())
+            self.trello_urls = set(await rawres.urls())
             self.totalresults = ''
             # reset what totalresults as before it was just google results now it is trello results
             headers = {'User-Agent': random.choice(['curl/7.37.0', 'Wget/1.19.4'])}
             # do not change the headers
-            req = (grequests.get(url, headers=headers, timeout=4) for url in self.trello_urls)
-            responses = grequests.imap(req, size=8)
+            print('fetching trello urls')
+            responses = await AsyncFetcher.fetch_all(self.trello_urls, headers=headers)
             for response in responses:
-                self.totalresults += response.content.decode('UTF-8')
+                self.totalresults += response

             rawres = myparser.Parser(self.totalresults, self.word)
-            self.hostnames = rawres.hostnames()
+            self.hostnames = await rawres.hostnames()
         except Exception as e:
             print(f'Error occurred: {e}')

-    def process(self):
-        self.do_search()
-        self.get_urls()
+    async def process(self):
+        await self.do_search()
+        await self.get_urls()
         print(f'\tSearching {self.counter} results.')

-    def get_results(self) -> tuple:
-        return self.get_emails(), self.hostnames, self.trello_urls
+    async def get_results(self) -> tuple:
+        return await self.get_emails(), self.hostnames, self.trello_urls

@@ -1,3 +1,4 @@
+from theHarvester.discovery.constants import *
 from theHarvester.lib.core import *
 from theHarvester.parsers import myparser
 import re

@@ -19,9 +20,19 @@ async def do_search(self):
         headers = {'User-Agent': Core.get_user_agent()}
         try:
             urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
-            responses = await AsyncFetcher.fetch_all(urls, headers=headers)
-            for response in responses:
-                self.totalresults += response
+            for url in urls:
+                response = await AsyncFetcher.fetch_all([url], headers=headers)
+                self.results = response[0]
+                if await search(self.results):
+                    try:
+                        self.results = await google_workaround(url)
+                        if isinstance(self.results, bool):
+                            print('Google is blocking your ip and the workaround, returning')
+                            return
+                    except Exception:
+                        # google blocked, no useful result
+                        return
+                self.totalresults += self.results
         except Exception as error:
             print(error)