Mirror of https://github.com/laramies/theHarvester.git (synced 2024-09-20)
Commit c8cd85c924
Merge pull request #150 from NotoriousRebel/master

Added DuckDuckGo and updated filter.
Dockerfile
@@ -1,7 +1,7 @@
-FROM python:2-alpine
+FROM python:3.6-alpine
 RUN mkdir /app
 RUN pip install requests beautifulsoup4 texttable plotly shodan
 WORKDIR /app
 COPY . /app
 RUN chmod +x *.py
 ENTRYPOINT ["/app/theHarvester.py"]
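With the base image moved from Python 2 to 3.6, the container builds and runs as before; a minimal sketch (the image tag is illustrative, and the flags pass straight through the ENTRYPOINT to theHarvester.py):

    docker build -t theharvester .
    docker run --rm theharvester -d example.com -b google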
theHarvester.py
@@ -262,9 +262,14 @@ def filter(lst):
     lst = set(lst)  # remove duplicates
     new_lst = []
     for item in lst:
         item = str(item)
         if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
             if '252f' in item:
                 item = item.replace('252f', '')
             if '2F' in item:
                 item = item.replace('2F', '')
             if '2f' in item:
                 item = item.replace('2f', '')
             new_lst.append(item.lower())
     return new_lst
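For reference, a quick sketch of what the updated filter yields on typical scraper output (values are illustrative): duplicates are dropped up front, entries starting with punctuation or containing 'xxx' or '..' are discarded, and leftover '252f'/'2F'/'2f' URL-encoding fragments are stripped:

    hosts = ['2Fwww.Example.com', 'mail.example.com', '*.example.com', 'a..b.example.com']
    print(filter(hosts))
    # ['www.example.com', 'mail.example.com'] -- order may vary, since the input passes through a set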
discovery/duckduckgosearch.py (new file, 90 lines)
@@ -0,0 +1,90 @@
from parsers import myparser
import time
import requests
import json
from discovery.constants import *


class search_duckduckgo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = "https://duckduckgo.com/?q="
        self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1"  # currently using api
        self.quantity = "100"
        self.limit = limit

    def do_search(self):
        try:  # do normal scraping
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': googleUA}
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # r is unbound if the request failed, so bail out here
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': getUserAgent()}).text
                time.sleep(getDelay())
            except Exception:
                continue

    def crawl(self, text):
        """
        Function parses JSON and returns URLs.
        :param text: formatted JSON
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # iterate through keys of dict
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict):
                    continue
                if isinstance(val, list):
                    val = val[0]  # first value should be dict
                    if isinstance(val, dict):  # sanity check
                        for key in val.keys():
                            value = val.get(key)
                            # parentheses added around the 'or': without them the trailing
                            # 'http://' check made the isinstance and non-empty checks meaningless
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # format is <href="https://www.website.com"/>
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 1:]:
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print('Exception occurred: ' + str(e))
            import traceback as t
            t.print_exc()  # print_exc() writes to stderr itself and returns None
            return []

    def get_emails(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # only need to search once since using API
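A short usage sketch of the new module (the domain and limit are illustrative, and it assumes running from the repo root so that parsers and discovery.constants import cleanly):

    from discovery import duckduckgosearch

    search = duckduckgosearch.search_duckduckgo('example.com', 100)
    search.process()               # one API query, then a crawl of the URLs it returned
    print(search.get_hostnames())  # hostnames parsed from the accumulated page text
    print(search.get_emails())

crawl() only walks the top level of the Instant Answer JSON, so a response fragment like {"AbstractURL": "https://example.com/", "Results": [{"FirstURL": "https://example.com/about"}]} yields both URLs.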
requirements.txt
@@ -2,4 +2,4 @@ beautifulsoup4>=4.7.0
 plotly>=3.4.2
 requests>=2.21.0
 texttable>=1.4.0
-shodan>=1.10.0
+shodan>=1.10.0
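The pinned versions install the usual way; nothing new is needed for the DuckDuckGo module, since it only uses requests and the standard library:

    pip install -r requirements.txt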
theHarvester.py
@@ -95,7 +95,7 @@ def start(argv):
     elif opt == '-b':
         engines = set(arg.split(','))
         supportedengines = set(['baidu', 'bing', 'bingapi', 'censys', 'crtsh',
-                                'cymon', 'dogpile', 'google', 'googleCSE', 'google-certificates',
+                                'cymon', 'dogpile', 'duckduckgo', 'google', 'googleCSE', 'google-certificates',
                                 'google-profiles', 'hunter', 'linkedin',
                                 'netcraft', 'pgp', 'securityTrails', 'threatcrowd',
                                 'trello', 'twitter', 'vhost', 'virustotal', 'yahoo', 'all'])
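With 'duckduckgo' added to the supported set, the new engine is selected like any other source; a typical invocation with theHarvester's standard -d/-l/-b flags:

    python theHarvester.py -d example.com -l 100 -b duckduckgo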
@@ -181,6 +181,19 @@ def start(argv):
             db.store_all(word, all_hosts, 'email', 'dogpile')
             db.store_all(word, all_hosts, 'host', 'dogpile')
 
+        elif engineitem == "duckduckgo":
+            print("[-] Searching in DuckDuckGo.")
+            from discovery import duckduckgosearch
+            search = duckduckgosearch.search_duckduckgo(word, limit)
+            search.process()
+            emails = filter(search.get_emails())
+            hosts = filter(search.get_hostnames())
+            all_hosts.extend(hosts)
+            all_emails.extend(emails)
+            db = stash.stash_manager()
+            db.store_all(word, all_hosts, 'email', 'duckduckgo')
+            db.store_all(word, all_hosts, 'host', 'duckduckgo')
+
         elif engineitem == "google":
             print("[-] Searching in Google.")
             search = googlesearch.search_google(word, limit, start)