Merge pull request #150 from NotoriousRebel/master

Added DuckDuckGo and updated filter.
Christian Martorella 2019-01-05 18:55:38 +01:00 committed by GitHub
commit c8cd85c924
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 112 additions and 4 deletions

Dockerfile

@@ -1,7 +1,7 @@
-FROM python:2-alpine
+FROM python:3.6-alpine
 RUN mkdir /app
 RUN pip install requests beautifulsoup4 texttable plotly shodan
 WORKDIR /app
 COPY . /app
 RUN chmod +x *.py
-ENTRYPOINT ["/app/theHarvester.py"]
\ No newline at end of file
+ENTRYPOINT ["/app/theHarvester.py"]
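With the bumped base image the tool now runs under Python 3.6 inside the container. For reference, a typical invocation (assumed standard Docker usage, not part of this PR) would be `docker build -t theharvester .` followed by `docker run theharvester -d example.com -b duckduckgo`, with the arguments passed straight through to the ENTRYPOINT script.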

discovery/constants.py

@@ -262,9 +262,14 @@ def filter(lst):
    lst = set(lst)  # remove duplicates
    new_lst = []
    for item in lst:
        item = str(item)
        if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
            if '252f' in item:
                item = item.replace('252f', '')
            if '2F' in item:
                item = item.replace('2F', '')
            if '2f' in item:
                item = item.replace('2f', '')
            new_lst.append(item.lower())
    return new_lst
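A minimal sketch of the updated helper in action; the inputs are made up (the '252f'/'2F'/'2f' fragments are leftovers of URL-encoded slashes that search engines sometimes leak into scraped hostnames), and note that the function shadows Python's built-in filter:

# Assumes filter() is the function above, e.g. from discovery.constants.
hosts = ['www.example.com', 'www.example.com',
         '252fmail.example.com', 'xxx.example.com']

print(filter(hosts))
# Order varies (set-based dedup), but the result contains
# 'www.example.com' once (deduplicated) and 'mail.example.com'
# ('252f' stripped); the 'xxx' entry is dropped by the junk check.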

discovery/duckduckgosearch.py

@@ -0,0 +1,90 @@
from discovery.constants import *
from parsers import myparser
import json
import requests
import time


class search_duckduckgo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = "https://duckduckgo.com/?q="
        self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1"  # currently using API
        self.quantity = "100"
        self.limit = limit

    def do_search(self):
        try:  # query the API rather than scraping the HTML interface
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': googleUA}
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # no response to parse
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:  # fetch every URL found in the API response
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': getUserAgent()}).text
                time.sleep(getDelay())
            except Exception:
                continue

    def crawl(self, text):
        """
        Parse the JSON response and collect any URLs it contains.
        :param text: JSON-formatted response body
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # iterate through keys of the dict
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict):
                    continue
                if isinstance(val, list):
                    val = val[0]  # first value should be a dict
                if isinstance(val, dict):  # sanity check
                    for subkey in val.keys():
                        value = val.get(subkey)
                        if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                            urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # format is <href="https://www.website.com"/>
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 2:]:  # skip the '=' and the opening quote
                        if ch == '"':  # closing quote ends the URL
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print('Exception occurred: ' + str(e))
            import traceback
            traceback.print_exc()  # print_exc() prints itself; its return value is None
            return []

    def get_emails(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # only need to search once since we use the API
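For context, a rough sketch of what crawl() pulls out of an API response. The JSON fragment below is made up (the field names mimic DuckDuckGo's Instant Answer API), and the snippet assumes the repo's discovery package is importable:

sample = '''{
  "AbstractURL": "https://en.wikipedia.org/wiki/Example",
  "Answer": "",
  "RelatedTopics": [{"FirstURL": "https://duckduckgo.com/Example", "Text": "Example"}]
}'''

s = search_duckduckgo('example.com', 100)
print(s.crawl(sample))
# -> {'https://en.wikipedia.org/wiki/Example', 'https://duckduckgo.com/Example'}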

requirements.txt

@@ -2,4 +2,4 @@ beautifulsoup4>=4.7.0
 plotly>=3.4.2
 requests>=2.21.0
 texttable>=1.4.0
-shodan>=1.10.0
\ No newline at end of file
+shodan>=1.10.0

theHarvester.py

@ -95,7 +95,7 @@ def start(argv):
elif opt == '-b':
engines = set(arg.split(','))
supportedengines = set(['baidu', 'bing', 'bingapi', 'censys', 'crtsh',
'cymon', 'dogpile', 'google', 'googleCSE', 'google-certificates',
'cymon', 'dogpile', 'duckduckgo', 'google', 'googleCSE', 'google-certificates',
'google-profiles', 'hunter', 'linkedin',
'netcraft', 'pgp', 'securityTrails', 'threatcrowd',
'trello', 'twitter', 'vhost', 'virustotal', 'yahoo', 'all'])
@@ -181,6 +181,19 @@ def start(argv):
             db.store_all(word, all_hosts, 'email', 'dogpile')
             db.store_all(word, all_hosts, 'host', 'dogpile')
+        elif engineitem == "duckduckgo":
+            print("[-] Searching in DuckDuckGo.")
+            from discovery import duckduckgosearch
+            search = duckduckgosearch.search_duckduckgo(word, limit)
+            search.process()
+            emails = filter(search.get_emails())
+            hosts = filter(search.get_hostnames())
+            all_hosts.extend(hosts)
+            all_emails.extend(emails)
+            db = stash.stash_manager()
+            db.store_all(word, all_hosts, 'email', 'duckduckgo')
+            db.store_all(word, all_hosts, 'host', 'duckduckgo')
         elif engineitem == "google":
             print("[-] Searching in Google.")
             search = googlesearch.search_google(word, limit, start)
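With the dispatch above in place, the new engine is reachable through the usual CLI, e.g. `python theHarvester.py -d example.com -l 100 -b duckduckgo`. A minimal sketch of the same flow outside start(), with stash/db handling omitted; importing filter from discovery.constants is an assumption based on the hunks above:

from discovery import duckduckgosearch
from discovery.constants import filter  # assumed location of the updated filter()

search = duckduckgosearch.search_duckduckgo('example.com', 100)
search.process()
print(filter(search.get_emails()))
print(filter(search.get_hostnames()))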