2016-04-18 19:40:01 +08:00
#!/usr/bin/env python3
# encoding: UTF-8
2016-04-17 01:20:38 +08:00
"""
This file is part of EmailHarvester
Copyright ( C ) 2016 @maldevel
https : / / github . com / maldevel / EmailHarvester
EmailHarvester - A tool to retrieve Domain email addresses from Search Engines .
This program is free software : you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation , either version 3 of the License , or
( at your option ) any later version .
This program is distributed in the hope that it will be useful ,
but WITHOUT ANY WARRANTY ; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
GNU General Public License for more details .
You should have received a copy of the GNU General Public License
along with this program . If not , see < http : / / www . gnu . org / licenses / > .
For more see the file ' LICENSE ' for copying permission .
"""
2016-04-18 19:40:01 +08:00
__author__ = " maldevel "
__copyright__ = " Copyright (c) 2016 @maldevel "
2016-04-22 21:07:14 +08:00
__credits__ = [ " maldevel " , " PaulSec " , " cclauss " , " Christian Martorella " ]
2016-04-18 19:40:01 +08:00
__license__ = " GPLv3 "
2016-04-22 21:46:56 +08:00
__version__ = " 1.2.7 "
2016-04-18 19:40:01 +08:00
__maintainer__ = " maldevel "
################################
import argparse
import sys
2016-04-19 18:20:42 +08:00
import time
import requests
import re
2016-04-22 21:07:14 +08:00
import os
2016-04-18 19:40:01 +08:00
2016-04-19 18:20:42 +08:00
from termcolor import colored
2016-04-18 19:40:01 +08:00
from argparse import RawTextHelpFormatter
2016-04-19 18:20:42 +08:00
from sys import platform as _platform
2016-04-20 22:55:26 +08:00
from urllib . parse import urlparse
2016-04-18 19:40:01 +08:00
################################
2016-04-19 18:20:42 +08:00
# Windows only: initialise colorama at import time, before any coloured
# output is printed. Other platforms skip this entirely.
if _platform == 'win32':
    import colorama

    colorama.init()
class myparser:
    """Extract e-mail addresses of one domain from raw search-result text.

    Attributes:
        results: Text to scan; rewritten in place by ``genericClean``.
        word: Domain that every extracted address must end with.
        temp: All regex matches from the last ``emails`` call (with duplicates).
        new: De-duplicated matches, set by ``unique``.
    """

    # Markup fragments that are deleted outright, so an address that a search
    # engine split with highlighting tags is joined back together.
    _MARKUP = ('<KW>', '</KW>', '</a>', '<b>', '</b>', '</div>', '<em>',
               '</em>', '<p>', '</span>', '<strong>', '</strong>', '<title>',
               '<wbr>', '</wbr>')

    # URL-encoded and literal separators that are turned into a blank so they
    # cannot end up glued to an address.
    _SEPARATORS = ('%2f', '%3a', '%3A', '%3C', '%3D', '&', '/', ':', ';',
                   '<', '=', '>', '\\')

    def __init__(self, results, word):
        self.results = results
        self.word = word
        self.temp = []

    def genericClean(self):
        """Strip known markup and separator noise from ``self.results``."""
        # Order matters: tags must go before '<', '>' and '/' are blanked,
        # otherwise the tag names would be left behind as stray words.
        for fragment in self._MARKUP:
            self.results = self.results.replace(fragment, '')
        for separator in self._SEPARATORS:
            self.results = self.results.replace(separator, ' ')

    def emails(self):
        """Return the unique addresses in ``self.results`` ending in ``self.word``.

        The text is cleaned first (``self.results`` is modified). The order of
        the returned list is unspecified.
        """
        self.genericClean()
        reg_emails = re.compile(
            r"[a-zA-Z0-9.\-_+#~!$&',;=:]+"   # local part
            r"@"
            r"[a-zA-Z0-9.-]*" +              # optional sub-domain labels
            # The domain is literal text, not a pattern: without escaping,
            # the '.' in "example.com" would match any character.
            re.escape(self.word))
        self.temp = reg_emails.findall(self.results)
        return self.unique()

    def unique(self):
        """Store the de-duplicated ``self.temp`` in ``self.new`` and return it."""
        self.new = list(set(self.temp))
        return self.new
###################################################################
2016-04-22 21:07:14 +08:00
class EmailHarvester(object):
    """Plugin host and paged search driver.

    On construction every ``*.py`` module found in ``plugins/`` is imported
    and its ``Plugin`` class is instantiated with this object; a plugin makes
    itself available by calling ``register_plugin``. Plugins then drive a
    search through ``init_search`` -> ``process`` -> ``get_emails``.

    Attributes:
        plugins: Mapping filled through ``register_plugin``.
        proxy: Parsed proxy URL (object with ``scheme``/``netloc``) or a
            falsy value for a direct connection.
        userAgent: Value sent as the ``User-Agent`` request header.
    """

    def __init__(self, userAgent, proxy):
        self.plugins = {}
        self.proxy = proxy
        self.userAgent = userAgent

        # NOTE: relative path, so plugins are looked up in the current
        # working directory.
        path = "plugins/"
        sys.path.insert(0, path)
        for f in os.listdir(path):
            fname, ext = os.path.splitext(f)
            if ext == '.py':
                mod = __import__(fname)
                # The instance is not kept here; the plugin is expected to
                # hand its callables over via register_plugin().
                mod.Plugin(self)

    def register_plugin(self, search_method, functions):
        """Register ``functions`` under the name ``search_method``."""
        self.plugins[search_method] = functions

    def get_plugins(self):
        """Return the mapping of registered plugins."""
        return self.plugins

    def show_message(self, msg):
        """Print ``msg`` in bold green."""
        print(green(msg))

    def init_search(self, urlPattern, word, limit, counterInit, counterStep):
        """Reset the search state for a new query.

        Args:
            urlPattern: ``str.format`` template with ``{counter}`` and
                ``{word}`` placeholders.
            word: Domain to search for.
            limit: Paging stops once the counter reaches this value.
            counterInit: Initial value of the paging counter.
            counterStep: Increment applied to the counter after each request.
        """
        self.results = ""
        self.totalresults = ""
        self.limit = int(limit)
        self.counter = int(counterInit)
        self.urlPattern = urlPattern
        self.step = int(counterStep)
        self.word = word

    def do_search(self):
        """Fetch one result page and append its text to ``totalresults``.

        Any error while building or sending the request is printed and the
        program exits with status 4.
        """
        proxies = None
        if self.proxy:
            # Keyed by the proxy URL's own scheme, as in the original code.
            proxies = {self.proxy.scheme: "http://" + self.proxy.netloc}
        try:
            urly = self.urlPattern.format(counter=str(self.counter), word=self.word)
            headers = {'User-Agent': self.userAgent}
            r = requests.get(urly, headers=headers, proxies=proxies)
        except Exception as e:
            print(e)
            sys.exit(4)

        # r.text copes with a missing charset (r.encoding is None) and with
        # undecodable bytes, both of which made r.content.decode(r.encoding)
        # raise outside the try block above.
        self.results = r.text
        self.totalresults += self.results

    def process(self):
        """Request pages until the counter reaches the limit.

        Sleeps one second after every request.
        """
        while self.counter < self.limit:
            self.do_search()
            time.sleep(1)
            self.counter += self.step
            print("\tSearching " + str(self.counter) + " results...")

    def get_emails(self):
        """Return the unique addresses found in all pages fetched so far."""
        rawres = myparser(self.totalresults, self.word)
        return rawres.emails()
###################################################################
def yellow(text):
    """Return ``text`` formatted by termcolor as bold yellow."""
    style = ['bold']
    return colored(text, color='yellow', attrs=style)
def green(text):
    """Return ``text`` formatted by termcolor as bold green."""
    style = ['bold']
    return colored(text, color='green', attrs=style)
def red(text):
    """Return ``text`` formatted by termcolor as bold red."""
    style = ['bold']
    return colored(text, color='red', attrs=style)
def unique(data):
    """Return the distinct items of ``data`` as a list, in first-seen order.

    Going through ``set`` alone would return the items in an order that can
    change between runs, which makes printed and saved results hard to compare.
    Items must be hashable.
    """
    seen = set()
    ordered = []
    for item in data:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered
2016-04-20 22:55:26 +08:00
2016-04-22 21:07:14 +08:00
###################################################################
2016-04-20 22:55:26 +08:00
def checkProxyUrl(url):
    """Parse and validate a proxy URL (used as an argparse ``type``).

    Args:
        url: Proxy URL as typed on the command line.

    Returns:
        The ``urlparse`` result for ``url``.

    Raises:
        argparse.ArgumentTypeError: If the scheme is neither ``http`` nor
            ``https``, or the URL has no network location.
    """
    url_checked = urlparse(url)
    # Logical `or`, not bitwise `|`: these are boolean conditions.
    if url_checked.scheme not in ('http', 'https') or url_checked.netloc == '':
        raise argparse.ArgumentTypeError(
            'Invalid {} Proxy URL (example: http://127.0.0.1:8080).'.format(url))
    return url_checked
2016-04-19 18:20:42 +08:00
2016-04-22 04:52:43 +08:00
def limit_type(x):
    """Convert a ``--limit`` value to a positive integer (argparse ``type``).

    Args:
        x: Value to convert with ``int``.

    Returns:
        The converted integer when it is at least 1.

    Raises:
        ValueError: If ``int(x)`` cannot convert the value.
        argparse.ArgumentTypeError: If the converted value is below 1.
    """
    value = int(x)
    if value <= 0:
        raise argparse.ArgumentTypeError("Minimum results limit is 1.")
    return value
2016-04-19 18:20:42 +08:00
2016-04-23 14:13:32 +08:00
#===============================================================================
# def engine_type(engine):
# engines = 'all ask bing google yahoo'.split()
# if engine in engines:
# return engine
# raise argparse.ArgumentTypeError("Invalid search engine, try with: {}.".format(', '.join(engines)))
#===============================================================================
2016-04-22 15:53:39 +08:00
###################################################################
2016-04-19 18:20:42 +08:00
2016-04-18 19:40:01 +08:00
# Command-line entry point.
#
# Exit statuses used below: 2 = no domain given, 3 = unknown search engine
# plugin, 4 = no e-mail address found.
if __name__ == '__main__':
    # Raw string: the banner is full of backslashes that must be kept as-is
    # rather than being read as (invalid) escape sequences.
    parser = argparse.ArgumentParser(description=r"""

 _____                   _  _   _   _                                _
|  ___|                 (_)| | | | | |                              | |
| |__  _ __ ___    __ _  _ | | | |_| |  __ _  _ __ __   __ ___  ___ | |_  ___  _ __
|  __|| '_ ` _ \  / _` || || | |  _  | / _` || '__|\ \ / // _ \/ __|| __|/ _ \| '__|
| |___| | | | | || (_| || || | | | | || (_| || |    \ V /|  __/\__ \| |_|  __/| |
\____/|_| |_| |_| \__,_||_||_| \_| |_/ \__,_||_|     \_/  \___||___/ \__|\___||_|

    A tool to retrieve Domain email addresses from Search Engines | @maldevel
                                {}: {}
""".format(red('Version'), yellow(__version__)),
                                     formatter_class=RawTextHelpFormatter)

    parser.add_argument("-d", '--domain', action="store", metavar='DOMAIN', dest='domain',
                        default=None, type=str, help="Domain to search.")
    parser.add_argument("-s", '--save', action="store", metavar='FILE', dest='filename',
                        default=None, type=str, help="Save the results into a TXT and XML file (both).")
    # (google, bing, yahoo, ask, all) needs to be fixed/scan plugins folder to show available search engines??
    parser.add_argument("-e", '--engine', action="store", metavar='ENGINE', dest='engine',
                        default="all", type=str, help="Select search engine plugin(google, bing, yahoo, ask, linkedin, all).")
    parser.add_argument("-l", '--limit', action="store", metavar='LIMIT', dest='limit',
                        type=limit_type, default=100, help="Limit the number of results.")
    parser.add_argument('-u', '--user-agent', action="store", metavar='USER-AGENT', dest='uagent',
                        type=str, help="Set the User-Agent request header.")
    parser.add_argument('-x', '--proxy', action="store", metavar='PROXY', dest='proxy',
                        default=None, type=checkProxyUrl, help='Setup proxy server (example: http://127.0.0.1:8080)')
    parser.add_argument('--noprint', action='store_true', default=False,
                        help='EmailHarvester will print discovered emails to terminal. It is possible to tell EmailHarvester not to print results to terminal with this option.')

    # No arguments at all: show the help text instead of an error.
    # (`==`, not `is`: identity of small ints is an implementation detail.)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    args = parser.parse_args()
    if not args.domain:
        print(red("[-] Please specify a domain name to search."))
        sys.exit(2)
    domain = args.domain

    userAgent = (args.uagent or
                 "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1")
    print("User-Agent in use: {}".format(yellow(userAgent)))

    if args.proxy:
        print("Proxy server in use: {}".format(yellow(args.proxy.scheme + "://" + args.proxy.netloc)))

    filename = args.filename or ""
    limit = args.limit
    engine = args.engine
    app = EmailHarvester(userAgent, args.proxy)
    plugins = app.get_plugins()

    all_emails = []
    if engine == "all":
        print(green("[+] Searching everywhere.."))
        for search_engine in plugins:
            all_emails += plugins[search_engine]['search'](domain, limit)
    elif engine not in plugins:
        print(red("Search engine plugin not found"))
        sys.exit(3)
    else:
        all_emails = plugins[engine]['search'](domain, limit)
    all_emails = unique(all_emails)

    if not all_emails:
        print(red("No emails found"))
        sys.exit(4)

    msg = "\n\n[+] {} emails found:".format(len(all_emails))
    print(green(msg))
    print(green("-" * len(msg)))

    if not args.noprint:
        for email in all_emails:
            print(email)

    if filename:
        # Local import: only needed when results are saved.
        from xml.sax.saxutils import escape

        # Plain-text report: one address per line.
        try:
            print(green("\n[+] Saving files..."))
            with open(filename, 'w', encoding='utf-8') as out_file:
                for email in all_emails:
                    try:
                        out_file.write(email + "\n")
                    except UnicodeError:
                        # Best effort: skip an unencodable entry, keep the rest.
                        print(red("Exception " + email))
        except OSError as e:
            # format() instead of `+`: str + exception raises TypeError.
            print(red("Error saving TXT file: {}".format(e)))

        # XML report, written next to the text file with an .xml extension.
        try:
            # splitext keeps directories and dotted names intact, unlike
            # split(".")[0] ("./out/r.txt" used to become ".xml").
            xml_filename = os.path.splitext(filename)[0] + ".xml"
            # The XML declaration promises UTF-8, so write UTF-8 explicitly.
            with open(xml_filename, 'w', encoding='utf-8') as out_file:
                out_file.write('<?xml version="1.0" encoding="UTF-8"?><EmailHarvester>')
                for email in all_emails:
                    # Escape &, < and > so the document stays well-formed.
                    out_file.write('<email>{}</email>'.format(escape(email)))
                out_file.write('</EmailHarvester>')
            print(green("Files saved!"))
        except OSError as er:
            print(red("Error saving XML file: {}".format(er)))
2016-04-21 05:25:45 +08:00