from parsers import myparser
import time
import requests
import json
from discovery.constants import *


class search_duckduckgo:
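    """
    Search DuckDuckGo for a term via its JSON API and accumulate the raw
    responses so that e-mail addresses and hostnames can be parsed out later.
    """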

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = "https://duckduckgo.com/?q="
        self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1"  # currently using api
        self.quantity = "100"
        self.limit = limit

    def do_search(self):
        try:  # request the DuckDuckGo API (self.api) rather than scraping result pages
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': googleUA}
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # no response to parse if the request failed
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:  # fetch every discovered URL and append its body to the results
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': getUserAgent()}).text
                time.sleep(getDelay())
            except Exception:
                continue
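
    # Sketch of the Instant Answer API response that crawl() below walks
    # (illustrative only; the actual field names and values come from DuckDuckGo):
    #
    #   {
    #     "AbstractURL": "https://en.wikipedia.org/wiki/Example",
    #     "RelatedTopics": [
    #       {"FirstURL": "https://duckduckgo.com/Example",
    #        "Result": "<a href=\"https://duckduckgo.com/Example\">Example</a>"}
    #     ],
    #     ...
    #   }
    #
    # crawl() keeps top-level string values containing http(s) URLs, inspects the
    # first dict of each list, and strips href="..." snippets down to the bare URL.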

    def crawl(self, text):
        """
        Parse the JSON response and collect the URLs it contains.

        :param text: JSON response text
        :return: set of URLs found in the response
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # iterate through keys of dict
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict):
                    continue
                if isinstance(val, list):
                    if not val:  # guard against empty lists before indexing
                        continue
                    val = val[0]  # first value should be dict
                    if isinstance(val, dict):  # sanity check
                        for subkey in val.keys():
                            value = val.get(subkey)
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # format is <a href="https://www.website.com">
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 1:]:
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print('Exception occurred: ' + str(e))
            import traceback as t
            t.print_exc()  # print_exc writes the traceback itself; no need to wrap it in print()
            return []

    def get_emails(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # only need to search once since using API
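
# Minimal usage sketch (not part of the original module). It assumes the
# package-level imports above resolve (discovery.constants supplying googleUA,
# getDelay() and getUserAgent(), plus parsers.myparser) and uses a placeholder
# domain and result limit.
if __name__ == '__main__':
    search = search_duckduckgo('example.com', 100)
    search.process()
    print(search.get_emails())
    print(search.get_hostnames())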