from parsers import myparser
import time
import requests
import json
from discovery.constants import *


class search_duckduckgo:
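    """
    Search DuckDuckGo for a term via its JSON API and accumulate the raw
    responses so that e-mail addresses and hostnames can be parsed out later.
    """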

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = "https://duckduckgo.com/?q="
        self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1"  # currently using api
        self.quantity = "100"
        self.limit = limit

    def do_search(self):
        try:  # request the DuckDuckGo API (self.api) rather than scraping result pages
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': googleUA}
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # no response to parse if the request failed
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:  # fetch every discovered URL and append its body to the results
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': getUserAgent()}).text
                time.sleep(getDelay())
            except Exception:
                continue
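
    # Sketch of the Instant Answer API response that crawl() below walks
    # (illustrative only; the actual field names and values come from DuckDuckGo):
    #
    #   {
    #     "AbstractURL": "https://en.wikipedia.org/wiki/Example",
    #     "RelatedTopics": [
    #       {"FirstURL": "https://duckduckgo.com/Example",
    #        "Result": "<a href=\"https://duckduckgo.com/Example\">Example</a>"}
    #     ],
    #     ...
    #   }
    #
    # crawl() keeps top-level string values containing http(s) URLs, inspects the
    # first dict of each list, and strips href="..." snippets down to the bare URL.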

    def crawl(self, text):
        """
        Parse the JSON response and collect the URLs it contains.

        :param text: JSON response text
        :return: set of URLs found in the response
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # iterate through keys of dict
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict):
                    continue
                if isinstance(val, list):
                    if not val:  # guard against empty lists before indexing
                        continue
                    val = val[0]  # first value should be dict
                    if isinstance(val, dict):  # sanity check
                        for subkey in val.keys():
                            value = val.get(subkey)
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # format is <a href="https://www.website.com">
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 1:]:
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print('Exception occurred: ' + str(e))
            import traceback as t
            t.print_exc()  # print_exc writes the traceback itself; no need to wrap it in print()
            return []

    def get_emails(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # only need to search once since using API
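
# Minimal usage sketch (not part of the original module). It assumes the
# package-level imports above resolve (discovery.constants supplying googleUA,
# getDelay() and getUserAgent(), plus parsers.myparser) and uses a placeholder
# domain and result limit.
if __name__ == '__main__':
    search = search_duckduckgo('example.com', 100)
    search.process()
    print(search.get_emails())
    print(search.get_hostnames())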