theHarvester/discovery/duckduckgosearch.py

from discovery.constants import *
from lib.core import *
from parsers import myparser
import json
import requests
import time
import traceback


class SearchDuckDuckGo:
    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = 'https://duckduckgo.com/?q='
        self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1'  # Instant Answer API; the x placeholder is swapped for the search term in do_search().
        self.quantity = '100'
        self.limit = limit
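        # Illustrative example: with word='example.com', do_search() builds
        #   https://api.duckduckgo.com/?q=example.com&format=json&pretty=1
        # by substituting the search term for the x placeholder above.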

    def do_search(self):
        try:  # Query the Instant Answer API rather than scraping the HTML interface.
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': Core.get_user_agent()}  # Same rotating User-Agent helper as the follow-up requests below.
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # No response to parse.
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': Core.get_user_agent()}).text
                time.sleep(getDelay())
            except Exception:
                continue

    def crawl(self, text):
        """
        Parse the JSON response and return any URLs found in it.
        :param text: JSON string returned by the API
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # Iterate through the keys of the top-level dict.
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict) or val is None:
                    continue
                if isinstance(val, list):
                    if len(val) == 0:  # Make sure not to index an empty list.
                        continue
                    val = val[0]  # First value should be a dict.
                if isinstance(val, dict):  # Sanity check.
                    for subkey in val.keys():
                        value = val.get(subkey)
                        if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                            urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # Format is <a href="https://www.website.com">.
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 2:]:  # + 2 skips the '=' and the opening quote.
                        if ch == '"':  # The closing quote marks the end of the URL.
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print(f'Exception occurred: {e}')
            traceback.print_exc()  # print_exc() writes the trace itself; its return value is None.
            return set()  # Keep the return type consistent with the documented set of URLs.

    def get_emails(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # Only need to search once since using the API.
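
# A minimal usage sketch (hypothetical; assumes the theHarvester package
# layout above so that discovery.duckduckgosearch is importable):
#
#   from discovery import duckduckgosearch
#
#   search = duckduckgosearch.SearchDuckDuckGo('example.com', limit=100)
#   search.process()               # Queries the API once and crawls returned URLs.
#   print(search.get_emails())     # E-mail addresses parsed from the collected text.
#   print(search.get_hostnames())  # Hostnames parsed from the collected text.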