import string
import re
class parser:
    """Pull e-mail addresses, host names, people and URLs out of raw result text.

    ``results`` is the raw text/HTML to scan and ``word`` is the literal
    search term (typically a domain name) that e-mail addresses and host
    names must end with.  Extraction methods store their raw matches in
    ``self.temp``; ``unique()`` de-duplicates that list.

    NOTE(review): the reviewed copy of this class had been run through an
    HTML tag stripper: every ``<...>`` span inside string literals was gone
    (empty ``re.sub`` patterns, method bodies fused together) and all
    indentation was lost.  The tag literals, the ``fileurls`` URL filter and
    the methods ``people_googleplus``, ``hostnames``, ``set`` and
    ``hostnames_all`` were reconstructed -- confirm them against version
    control before relying on them.
    """

    # Ordered (old, new) substitutions applied by genericClean().  The tags
    # are the highlight markup search engines wrap around matched keywords.
    _GENERIC_SUBSTITUTIONS = (
        ('<em>', ''),
        ('<b>', ''),
        ('</b>', ''),
        ('</em>', ''),
        ('%2f', ' '),
        ('%3a', ' '),
        ('<strong>', ''),
        ('</strong>', ''),
    )
    # Separators genericClean() turns into spaces so tokens stand alone.
    _GENERIC_SEPARATORS = (
        '>', ':', '=', '<', '/', '\\', ';', '&', '%3A', '%3D', '%3C',
    )

    # Same idea for urlClean(), which keeps '/' and '\\' intact.
    _URL_SUBSTITUTIONS = (
        ('<em>', ''),
        ('</em>', ''),
        ('%2f', ' '),
        ('%3a', ' '),
    )
    _URL_SEPARATORS = ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C')

    # Fragments removed from every matched "people" entry.
    _PEOPLE_NOISE = (' | LinkedIn', ' profiles ', 'LinkedIn', '"', '>')

    # URLs containing any of these are search-engine plumbing, not results.
    _IGNORED_URL_PARTS = ('webcache', 'google.com', 'search?hl')

    def __init__(self, results, word):
        """Store the text to scan (``results``) and the search term (``word``)."""
        self.results = results
        self.word = word
        self.temp = []

    def genericClean(self):
        """Strip highlight tags from ``self.results`` and blank out separators.

        Rewrites ``self.results`` in place; returns nothing.
        """
        for old, new in self._GENERIC_SUBSTITUTIONS:
            self.results = self.results.replace(old, new)
        for separator in self._GENERIC_SEPARATORS:
            self.results = self.results.replace(separator, ' ')

    def urlClean(self):
        """Like ``genericClean`` but leaves ``/`` and ``\\`` so URLs stay whole.

        Rewrites ``self.results`` in place; returns nothing.
        """
        for old, new in self._URL_SUBSTITUTIONS:
            self.results = self.results.replace(old, new)
        for separator in self._URL_SEPARATORS:
            self.results = self.results.replace(separator, ' ')

    def emails(self):
        """Return the unique e-mail addresses ending in ``self.word``.

        Calls ``genericClean()`` first, so ``self.results`` is modified.
        """
        self.genericClean()
        reg_emails = re.compile(
            # Local part is required, charset is flexible
            # https://tools.ietf.org/html/rfc6531
            # (removed * and () as they mostly produce false positives)
            r"[a-zA-Z0-9.\-_+#~!$&',;=:]+"
            r"@"
            r"[a-zA-Z0-9.-]*"
            # Escaped so the dots of a domain only match literal dots.
            + re.escape(self.word))
        self.temp = reg_emails.findall(self.results)
        return self.unique()

    def fileurls(self, file):
        """Return the unique ``<a href="...">`` targets found in the results.

        ``file`` is accepted for interface compatibility and is not used.

        NOTE(review): the href pattern and the exclusion of search-engine
        URLs were reconstructed (see the class docstring) -- confirm.
        """
        reg_urls = re.compile(r'<a href="(.*?)"')
        self.temp = reg_urls.findall(self.results)
        return [
            url for url in self.unique()
            if not any(part in url for part in self._IGNORED_URL_PARTS)
        ]

    def _clean_people(self, matches):
        """Strip markup/site noise from each match; drop single-space entries."""
        cleaned = []
        for entry in matches:
            for noise in self._PEOPLE_NOISE:
                entry = entry.replace(noise, '')
            if entry != " ":
                cleaned.append(entry)
        return cleaned

    def people_googleplus(self):
        """Return entries matching ``>Name - Google+`` (duplicates kept).

        Removes ``<b>``/``</b>`` from ``self.results`` first.  The
        `` - Google+`` suffix is left on each entry, as before.

        NOTE(review): the method name and the two tag literals were
        reconstructed (see the class docstring) -- confirm.
        """
        self.results = self.results.replace('</b>', '')
        self.results = self.results.replace('<b>', '')
        reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+')
        self.temp = reg_people.findall(self.results)
        return self._clean_people(self.temp)

    def people_twitter(self):
        """Return the unique ``@handle`` runs found in the results.

        The character class includes spaces, so a match extends over the
        words following the handle until another character is met.
        """
        reg_people = re.compile(r'(@[a-zA-Z0-9._ -]*)')
        self.temp = reg_people.findall(self.results)
        return self._clean_people(self.unique())

    def people_linkedin(self):
        """Return names taken from ``">Name | LinkedIn`` (duplicates kept)."""
        reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
        self.temp = reg_people.findall(self.results)
        return self._clean_people(self.temp)

    def profiles(self):
        """Return names taken from ``">Name - Google Profile`` (duplicates kept).

        Every ``-`` is removed from the name and the space that preceded the
        dash is kept, so entries end with a trailing space.
        """
        reg_people = re.compile(r'">[a-zA-Z0-9._ -]* - Google Profile')
        self.temp = reg_people.findall(self.results)
        resul = []
        for match in self.temp:
            name = match.replace(' Google Profile', '')
            name = name.replace('-', '')
            name = name.replace('">', '')
            if name != " ":
                resul.append(name)
        return resul

    def people_jigsaw(self):
        """Return the link texts of ``showContact(...)`` anchors.

        NOTE(review): the closing ``</a></span>`` in the pattern and the
        ``</a`` clean-up were reconstructed (see the class docstring).
        """
        reg_people = re.compile(
            r"href=javascript:showContact\('[0-9]*'\)>[a-zA-Z0-9., ]*</a></span>")
        self.temp = reg_people.findall(self.results)
        # Text between the first '>' and the next one is "Name</a".
        return [m.split('>')[1].replace("</a", "") for m in self.temp]

    def hostnames(self):
        """Return the unique host names ending in ``.<word>``.

        Calls ``genericClean()`` first, so ``self.results`` is modified.

        NOTE(review): this whole method was reconstructed (see the class
        docstring) -- confirm.
        """
        self.genericClean()
        reg_hosts = re.compile(r'[a-zA-Z0-9.-]*\.' + re.escape(self.word))
        self.temp = reg_hosts.findall(self.results)
        return self.unique()

    def set(self):
        """Return the alphanumeric link texts of ``>text</a></font>`` anchors.

        NOTE(review): the method name and the closing-tag literals were
        reconstructed (see the class docstring) -- confirm.
        """
        reg_sets = re.compile(r'>[a-zA-Z0-9]*</a></font>')
        self.temp = reg_sets.findall(self.results)
        sets = []
        for match in self.temp:
            text = match.replace('>', '')
            # With every '>' gone, the closing tags read "</a</font".
            text = text.replace('</a</font', '')
            sets.append(text)
        return sets

    def _host_from_cite(self, cite):
        """Return the host part of one ``<cite>`` text."""
        if ':' not in cite:
            return cite.split('/')[0]
        # "scheme://host/..." -> after the first ':' comes "//host/...".
        pieces = cite.split(':')[1].split('/')
        if len(pieces) > 2:
            return pieces[2]
        # No "//" after the colon (e.g. "host:8080/path"): the host is the
        # text before the colon.  Indexing [2] here used to raise IndexError.
        return cite.split(':')[0].split('/')[0]

    def hostnames_all(self):
        """Return the unique hosts named in ``<cite>...</cite>`` elements.

        ``self.temp`` is rebuilt from scratch so matches left over from an
        earlier extraction cannot leak into the host list.

        NOTE(review): the method name and the ``<cite>`` pattern were
        reconstructed (see the class docstring) -- confirm.
        """
        cites = re.findall(r'<cite>(.*?)</cite>', self.results)
        self.temp = [self._host_from_cite(cite) for cite in cites]
        return self.unique()

    def unique(self):
        """Return ``self.temp`` without duplicates, first occurrences first.

        The result is also stored in ``self.new``.
        """
        seen = set()
        self.new = []
        for item in self.temp:
            if item not in seen:
                seen.add(item)
                self.new.append(item)
        return self.new