class TestSearchGithubCode:
    """Unit tests for the response handling of ``githubcode.SearchGithubCode``.

    The nested ``*Response`` classes are canned ``requests.Response`` objects
    whose ``json`` method is replaced by a ``MagicMock`` returning a fixed
    payload, so no network access takes place.
    """

    class OkResponse:
        # 200 response carrying two items with one text-match fragment each.
        response = Response()
        json = {
            "items": [
                {
                    "text_matches": [
                        {
                            "fragment": "test1"
                        }
                    ]
                },
                {
                    "text_matches": [
                        {
                            "fragment": "test2"
                        }
                    ]
                }
            ]
        }
        response.status_code = 200
        response.json = MagicMock(return_value=json)

    class FailureResponse:
        # 401 response with an empty JSON body.
        response = Response()
        response.json = MagicMock(return_value={})
        response.status_code = 401

    class RetryResponse:
        # 403 response with an empty JSON body.
        response = Response()
        response.json = MagicMock(return_value={})
        response.status_code = 403

    class MalformedResponse:
        # 200 response whose items lack usable "text_matches"/"fragment" data.
        response = Response()
        json = {
            "items": [
                {
                    "fail": True
                },
                {
                    "text_matches": []
                },
                {
                    "text_matches": [
                        {
                            "weird": "result"
                        }
                    ]
                }
            ]
        }
        response.json = MagicMock(return_value=json)
        response.status_code = 200

    @pytest.fixture(autouse=True)
    def _fake_github_key(self, monkeypatch):
        """Give every test a dummy API key.

        ``monkeypatch`` restores the real ``Core.github_key`` after each test,
        so the replacement cannot leak into other test modules.
        """
        monkeypatch.setattr(Core, 'github_key', MagicMock(return_value="lol"))

    def test_missing_key(self, monkeypatch):
        # Override the autouse fixture: no key configured at all.
        monkeypatch.setattr(Core, 'github_key', MagicMock(return_value=None))
        with pytest.raises(MissingKey):
            githubcode.SearchGithubCode(word="test", limit=500)

    def test_fragments_from_response(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = test_class_instance.fragments_from_response(self.OkResponse.response)
        assert test_result == ["test1", "test2"]

    def test_invalid_fragments_from_response(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = test_class_instance.fragments_from_response(self.MalformedResponse.response)
        assert test_result == []

    def test_handle_response_ok(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = test_class_instance.handle_response(self.OkResponse.response)
        assert isinstance(test_result, SuccessResult)

    def test_handle_response_retry(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = test_class_instance.handle_response(self.RetryResponse.response)
        assert isinstance(test_result, RetryResult)

    def test_handle_response_fail(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = test_class_instance.handle_response(self.FailureResponse.response)
        assert isinstance(test_result, ErrorResult)

    def test_next_page(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = githubcode.SuccessResult([], next_page=2, last_page=4)
        assert test_class_instance.next_page_or_end(test_result) == 2

    def test_last_page(self):
        test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
        test_result = githubcode.SuccessResult([], None, None)
        assert test_class_instance.next_page_or_end(test_result) is None


if __name__ == '__main__':
    pytest.main()
class RetryResult(NamedTuple):
    """Outcome telling the caller to wait and request the same page again."""
    # Seconds to wait before retrying.
    time: float


class SuccessResult(NamedTuple):
    """Outcome of a successful search request."""
    # Text-match fragments extracted from the response body.
    fragments: List[str]
    # Page numbers parsed from the "next" and "last" link relations, if any.
    next_page: Optional[int]
    last_page: Optional[int]


class ErrorResult(NamedTuple):
    """Outcome of a failed request that is not retried."""
    status_code: int
    # Decoded JSON body when it parses, otherwise the raw response text.
    body: Any


class SearchGithubCode:
    """Query the GitHub code-search endpoint for a word and collect the
    text-match fragments, from which emails and hostnames are parsed."""

    def __init__(self, word, limit):
        """Store the search parameters and load the GitHub API key.

        :param word: term to search for (sent as a quoted phrase).
        :param limit: paging stops once more than this many fragments
            have been collected.
        :raises MissingKey: if ``Core.github_key()`` returns ``None``.
        """
        self.word = word
        self.total_results = ""
        self.server = 'api.github.com'
        self.hostname = 'api.github.com'
        self.limit = limit
        self.counter = 0
        self.page = 1
        self.key = Core.github_key()
        # If you don't have a personal access token, github narrows your search capabilities significantly
        # rate limits you more severely
        # https://developer.github.com/v3/search/#rate-limit
        if self.key is None:
            raise MissingKey(True)

    @staticmethod
    def fragments_from_response(response: 'Response') -> List[str]:
        """Return every non-``None`` ``fragment`` found under
        ``items[*].text_matches[*]`` of the response's JSON body.

        Missing or empty ``items`` / ``text_matches`` entries are skipped.
        """
        items: List[Dict[str, Any]] = response.json().get('items') or []
        fragments: List[str] = []
        for item in items:
            for match in item.get("text_matches") or []:
                fragment = match.get("fragment")
                if fragment is not None:
                    fragments.append(fragment)
        return fragments

    @staticmethod
    def page_from_response(page: str, response: 'Response') -> Optional[int]:
        """Return the ``page`` query parameter of the link relation named
        *page* (e.g. ``"next"`` or ``"last"``) in ``response.links``.

        Returns ``None`` when the relation is absent or its URL carries no
        ``page`` parameter.
        """
        link = response.links.get(page)
        if not link:
            return None
        query = urlparse.urlparse(link.get("url")).query
        values = urlparse.parse_qs(query).get('page')
        if not values or not values[0]:
            return None
        return int(values[0])

    def handle_response(self, response: 'Response') -> Optional[Any]:
        """Classify an API response.

        :return: ``SuccessResult`` when ``response.ok``; ``RetryResult(60)``
            for status 429 or 403; otherwise ``ErrorResult`` carrying the
            status code and the JSON body (or the raw text if the body is
            not valid JSON).
        """
        if response.ok:
            results = self.fragments_from_response(response)
            next_page = self.page_from_response("next", response)
            last_page = self.page_from_response("last", response)
            return SuccessResult(results, next_page, last_page)
        if response.status_code in (429, 403):
            return RetryResult(60)
        try:
            return ErrorResult(response.status_code, response.json())
        except ValueError:
            return ErrorResult(response.status_code, response.text)

    def do_search(self, page: Optional[int]) -> 'Response':
        """Request one page of code-search results for ``self.word``.

        The quoted word and the page number are passed through ``params`` so
        that requests percent-encodes them; characters such as ``&``, ``#``
        or spaces in the word can no longer corrupt the query string.
        """
        params = {'q': f'"{self.word}"'}
        if page is not None:
            params['page'] = page
        headers = {
            'Host': self.hostname,
            'User-agent': Core.get_user_agent(),
            # Media type that makes the API include "text_matches" fragments.
            'Accept': "application/vnd.github.v3.text-match+json",
            'Authorization': 'token {}'.format(self.key)
        }
        return requests.get(url=f'https://{self.server}/search/code', params=params,
                            headers=headers, verify=True)

    @staticmethod
    def next_page_or_end(result: SuccessResult) -> Optional[int]:
        """Return ``result.next_page``, falling back to ``result.last_page``
        (which may itself be ``None``) when there is no next page."""
        if result.next_page is not None:
            return result.next_page
        return result.last_page

    def process(self):
        """Fetch result pages until the limit is passed or no page remains.

        Fragments are appended to ``self.total_results`` one per line, so the
        end of one snippet cannot fuse with the start of the next and produce
        a bogus email or hostname.

        NOTE: a ``RetryResult`` re-requests the same page after sleeping and
        the number of retries is not bounded.

        :raises Exception: on an ``ErrorResult`` or an unrecognised result.
        """
        while self.counter <= self.limit and self.page is not None:
            api_response = self.do_search(self.page)
            result = self.handle_response(api_response)
            if isinstance(result, SuccessResult):
                print(f'\tSearching {self.counter} results.')
                self.total_results += ''.join(f'{fragment}\n' for fragment in result.fragments)
                self.counter += len(result.fragments)
                self.page = self.next_page_or_end(result)
                time.sleep(getDelay())
            elif isinstance(result, RetryResult):
                sleepy_time = getDelay() + result.time
                print(f'\tRetrying page in {sleepy_time} seconds...')
                time.sleep(sleepy_time)
            elif isinstance(result, ErrorResult):
                raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
            else:
                raise Exception("\tUnknown exception occurred")

    def get_emails(self):
        """Return the emails ``myparser.Parser`` extracts from the collected fragments."""
        rawres = myparser.Parser(self.total_results, self.word)
        return rawres.emails()

    def get_hostnames(self):
        """Return the hostnames ``myparser.Parser`` extracts from the collected fragments."""
        rawres = myparser.Parser(self.total_results, self.word)
        return rawres.hostnames()