Mirror of https://github.com/laramies/theHarvester.git (synced 2024-09-20)
Commit c8cd85c924
Merge pull request #150 from NotoriousRebel/master

Added DuckDuckGo and updated filter.
Dockerfile
@@ -1,7 +1,7 @@
-FROM python:2-alpine
+FROM python:3.6-alpine
 RUN mkdir /app
 RUN pip install requests beautifulsoup4 texttable plotly shodan
 WORKDIR /app
 COPY . /app
 RUN chmod +x *.py
 ENTRYPOINT ["/app/theHarvester.py"]
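With the base image moved from Python 2 to 3.6, the container builds and runs as before; a minimal sketch (the image tag is illustrative, and the flags pass straight through the ENTRYPOINT to theHarvester.py):

    docker build -t theharvester .
    docker run --rm theharvester -d example.com -b google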
theHarvester.py
@@ -262,9 +262,14 @@ def filter(lst):
     lst = set(lst)  # remove duplicates
     new_lst = []
     for item in lst:
         item = str(item)
         if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
             if '252f' in item:
                 item = item.replace('252f', '')
             if '2F' in item:
                 item = item.replace('2F', '')
             if '2f' in item:
                 item = item.replace('2f', '')
             new_lst.append(item.lower())
     return new_lst
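For reference, a quick sketch of what the updated filter yields on typical scraper output (values are illustrative): duplicates are dropped up front, entries starting with punctuation or containing 'xxx' or '..' are discarded, and leftover '252f'/'2F'/'2f' URL-encoding fragments are stripped:

    hosts = ['2Fwww.Example.com', 'mail.example.com', '*.example.com', 'a..b.example.com']
    print(filter(hosts))
    # ['www.example.com', 'mail.example.com'] -- order may vary, since the input passes through a set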
discovery/duckduckgosearch.py (new file, 90 lines)
@@ -0,0 +1,90 @@
from parsers import myparser
import time
import requests
import json
from discovery.constants import *


class search_duckduckgo:

    def __init__(self, word, limit):
        self.word = word
        self.results = ""
        self.totalresults = ""
        self.dorks = []
        self.links = []
        self.database = "https://duckduckgo.com/?q="
        self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1"  # currently using api
        self.quantity = "100"
        self.limit = limit

    def do_search(self):
        try:  # do normal scraping
            url = self.api.replace('x', self.word)
            headers = {'User-Agent': googleUA}
            r = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return  # r is unbound if the request failed, so bail out here
        time.sleep(getDelay())
        self.results = r.text
        self.totalresults += self.results
        urls = self.crawl(self.results)
        for url in urls:
            try:
                self.totalresults += requests.get(url, headers={'User-Agent': getUserAgent()}).text
                time.sleep(getDelay())
            except Exception:
                continue

    def crawl(self, text):
        """
        Function parses JSON and returns URLs.
        :param text: formatted JSON
        :return: set of URLs
        """
        urls = set()
        try:
            load = json.loads(text)
            for key in load.keys():  # iterate through keys of dict
                val = load.get(key)
                if isinstance(val, int) or isinstance(val, dict):
                    continue
                if isinstance(val, list):
                    val = val[0]  # first value should be dict
                    if isinstance(val, dict):  # sanity check
                        for key in val.keys():
                            value = val.get(key)
                            # parentheses added around the 'or': without them the trailing
                            # 'http://' check made the isinstance and non-empty checks meaningless
                            if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
                                urls.add(value)
                if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
                    urls.add(val)
            tmp = set()
            for url in urls:
                if '<' in url and 'href=' in url:  # format is <href="https://www.website.com"/>
                    equal_index = url.index('=')
                    true_url = ''
                    for ch in url[equal_index + 1:]:
                        if ch == '"':
                            tmp.add(true_url)
                            break
                        true_url += ch
                else:
                    if url != '':
                        tmp.add(url)
            return tmp
        except Exception as e:
            print('Exception occurred: ' + str(e))
            import traceback as t
            t.print_exc()  # print_exc() writes to stderr itself and returns None
            return []

    def get_emails(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.emails()

    def get_hostnames(self):
        rawres = myparser.parser(self.totalresults, self.word)
        return rawres.hostnames()

    def process(self):
        self.do_search()  # only need to search once since using API
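A short usage sketch of the new module (the domain and limit are illustrative, and it assumes running from the repo root so that parsers and discovery.constants import cleanly):

    from discovery import duckduckgosearch

    search = duckduckgosearch.search_duckduckgo('example.com', 100)
    search.process()               # one API query, then a crawl of the URLs it returned
    print(search.get_hostnames())  # hostnames parsed from the accumulated page text
    print(search.get_emails())

crawl() only walks the top level of the Instant Answer JSON, so a response fragment like {"AbstractURL": "https://example.com/", "Results": [{"FirstURL": "https://example.com/about"}]} yields both URLs.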
requirements.txt
@@ -2,4 +2,4 @@ beautifulsoup4>=4.7.0
 plotly>=3.4.2
 requests>=2.21.0
 texttable>=1.4.0
-shodan>=1.10.0
+shodan>=1.10.0
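The pinned versions install the usual way; nothing new is needed for the DuckDuckGo module, since it only uses requests and the standard library:

    pip install -r requirements.txt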
theHarvester.py
@@ -95,7 +95,7 @@ def start(argv):
     elif opt == '-b':
         engines = set(arg.split(','))
         supportedengines = set(['baidu', 'bing', 'bingapi', 'censys', 'crtsh',
-                                'cymon', 'dogpile', 'google', 'googleCSE', 'google-certificates',
+                                'cymon', 'dogpile', 'duckduckgo', 'google', 'googleCSE', 'google-certificates',
                                 'google-profiles', 'hunter', 'linkedin',
                                 'netcraft', 'pgp', 'securityTrails', 'threatcrowd',
                                 'trello', 'twitter', 'vhost', 'virustotal', 'yahoo', 'all'])
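With 'duckduckgo' added to the supported set, the new engine is selected like any other source; a typical invocation with theHarvester's standard -d/-l/-b flags:

    python theHarvester.py -d example.com -l 100 -b duckduckgo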
@@ -181,6 +181,19 @@ def start(argv):
             db.store_all(word, all_hosts, 'email', 'dogpile')
             db.store_all(word, all_hosts, 'host', 'dogpile')
 
+        elif engineitem == "duckduckgo":
+            print("[-] Searching in DuckDuckGo.")
+            from discovery import duckduckgosearch
+            search = duckduckgosearch.search_duckduckgo(word, limit)
+            search.process()
+            emails = filter(search.get_emails())
+            hosts = filter(search.get_hostnames())
+            all_hosts.extend(hosts)
+            all_emails.extend(emails)
+            db = stash.stash_manager()
+            db.store_all(word, all_hosts, 'email', 'duckduckgo')
+            db.store_all(word, all_hosts, 'host', 'duckduckgo')
+
         elif engineitem == "google":
             print("[-] Searching in Google.")
             search = googlesearch.search_google(word, limit, start)