added option to return linkedin links

2024-09-21 07:46:32 +08:00 · 2019-09-06 02:57:53 +00:00 · 2019-09-06 02:57:53 +00:00 · 4806751470
parent 1f4bc12de4
commit 4806751470
5 changed files with 64 additions and 4 deletions
--- a/theHarvester/main.py
+++ b/theHarvester/main.py
@ -53,9 +53,9 @@ def start():
    parser.add_argument('-c', '--dns-brute', help='perform a DNS brute force on the domain', default=False, action='store_true')
    parser.add_argument('-f', '--filename', help='save the results to an HTML and/or XML file', default='', type=str)
    parser.add_argument('-b', '--source', help='''baidu, bing, bingapi, censys, crtsh, dnsdumpster,
-                        dogpile, duckduckgo, github-code, google, 
+                        dogpile, duckduckgo, github-code, google,
                        hunter, intelx,
-                        linkedin, netcraft, securityTrails, threatcrowd,
+                        linkedin,linkedin_links, netcraft, securityTrails, threatcrowd,
                        trello, twitter, vhost, virustotal, yahoo, all''')
    parser.add_argument('-x', '--exclude', help='exclude options when using all sources', type=str)
    args = parser.parse_args()
@ -279,6 +279,22 @@ def start():
                        for user in sorted(list(set(people))):
                            print(user)

+                elif engineitem == 'linkedin_links':
+                    print('\033[94m[*] Searching Linkedin. \033[0m')
+                    search = linkedinsearch.SearchLinkedin(word, limit)
+                    search.process()
+                    people = search.get_links()
+                    db = stash.stash_manager()
+                    db.store_all(word, people, 'name', 'linkedin')
+
+                    if len(people) == 0:
+                        print('\n[*] No links found Linkedin.\n\n')
+                    else:
+                        print(f'\n[*] Links found: {len(people)}')
+                        print('---------------------')
+                        for user in sorted(list(set(people))):
+                            print(user)
+
                elif engineitem == 'netcraft':
                    print('\033[94m[*] Searching Netcraft. \033[0m')
                    search = netcraft.SearchNetcraft(word)
--- a/theHarvester/discovery/constants.py
+++ b/theHarvester/discovery/constants.py
@ -4,6 +4,28 @@
 googleUA = 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'


+def splitter(links):
+    """
+    Method that tries to remove duplicate links
+
+    Two links count as duplicates when the first two dash-separated parts of
+    their last path segment are equal. A trailing locale segment (any
+    two-character segment, or "zh-cn") is skipped, so the segment before it
+    is used to build the comparison key instead.
+    :param links: iterable of links to remove duplicates from
+    :return: unique-ish list, keeping the first link seen for each key
+    """
+    unique_list = []
+    seen_names = set()
+    for url in links:
+        segments = url.split("/")
+        tail = segments[-1]
+        # Locale suffix: the key comes from the segment that precedes it.
+        if (len(tail) == 2 or tail == "zh-cn") and len(segments) > 1:
+            tail = segments[-2]
+        # Build the key for every link, not only for the ones with a locale
+        # suffix; otherwise links without a suffix would never be returned.
+        joined_name = "".join(tail.split("-")[:2])
+        if joined_name not in seen_names:
+            unique_list.append(url)
+            seen_names.add(joined_name)
+    return unique_list
+
 def filter(lst):
    """
    Method that filters list
--- a/theHarvester/discovery/linkedinsearch.py
+++ b/theHarvester/discovery/linkedinsearch.py
@ -4,7 +4,6 @@
 import requests
 import time

-
 class SearchLinkedin:

    def __init__(self, word, limit):
@ -34,9 +33,20 @@ def get_people(self):
        rawres = myparser.Parser(self.totalresults, self.word)
        return rawres.people_linkedin()

+    def get_links(self):
+        """
+        Parse the collected results for LinkedIn links
+        :return: the parsed links after they have been passed through splitter()
+        """
+        parser = myparser.Parser(self.totalresults, self.word)
+        found_links = parser.links_linkedin()
+        return splitter(found_links)
+
+
    def process(self):
        """
        Run do_search() repeatedly until self.counter reaches self.limit,
        sleeping for getDelay() seconds after each search and printing the
        running counter.
        """
        while self.counter < self.limit:
            self.do_search()
            # Pause between requests for the duration returned by getDelay().
            time.sleep(getDelay())
            # The counter advances by 100 per search.
            self.counter += 100
            print(f'\tSearching {self.counter} results.')
+
+
+
+
+
+
--- a/theHarvester/lib/core.py
+++ b/theHarvester/lib/core.py
@ -77,6 +77,7 @@ def get_supportedengines():
                            'hunter',
                            'intelx',
                            'linkedin',
+                            'linkedin_links',
                            'netcraft',
                            'securityTrails',
                            'threatcrowd',
--- a/theHarvester/parsers/myparser.py
+++ b/theHarvester/parsers/myparser.py
@ -92,11 +92,22 @@ def hostnames_all(self):
        hostnames = self.unique()
        return hostnames

+    def links_linkedin(self):
+        """
+        Extract LinkedIn links from the raw results
+
+        Finds every "url=https://www.linkedin.com<path>&" occurrence in
+        self.results and rebuilds the absolute link from the captured path.
+        The captured paths are also stored in self.temp.
+        :return: set of unique LinkedIn links
+        """
+        # Raw string so the pattern holds no invalid escape sequences; the
+        # dots are escaped so they only match a literal ".".
+        reg_links = re.compile(r"url=https://www\.linkedin\.com(.*?)&")
+        self.temp = reg_links.findall(self.results)
+        return {
+            "https://www.linkedin.com" + path.replace("url=", "")
+            for path in self.temp
+        }
+
    def people_linkedin(self):
        reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
        self.temp = reg_people.findall(self.results)
        resul = []
-        for x in self.temp:
+
+
+        for x in (self.temp):
            y = x.replace(' | LinkedIn', '')
            y = y.replace(' profiles ', '')
            y = y.replace('LinkedIn', '')