Rename the variables in myparser.py

This commit is contained in:
Adithan 2019-10-05 14:06:52 +05:30
parent 1821ce48a1
commit 30ba3e5b86

View file

@ -13,13 +13,13 @@ def genericClean(self):
.replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\ .replace('%2f', '').replace('%3a', '').replace('<strong>', '').replace('</strong>', '')\
.replace('<wbr>', '').replace('</wbr>', '') .replace('<wbr>', '').replace('</wbr>', '')
for e in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'): for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '/', '\\'):
self.results = self.results.replace(e, ' ') self.results = self.results.replace(search, ' ')
def urlClean(self):
    """Strip markup noise from ``self.results`` so URLs can be extracted.

    First removes the ``<em>``/``</em>`` tags and the lowercase ``%2f`` /
    ``%3a`` escapes, then turns the remaining delimiter characters and
    uppercase escapes into single spaces. The cleaned text is written back
    to ``self.results``; nothing is returned.
    """
    text = self.results

    # Tokens that are dropped entirely.
    for token in ('<em>', '</em>', '%2f', '%3a'):
        text = text.replace(token, '')

    # Tokens that become a separating space (order kept as in the original).
    for token in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
        text = text.replace(token, ' ')

    self.results = text
def emails(self): def emails(self):
self.genericClean() self.genericClean()
@ -38,11 +38,11 @@ def fileurls(self, file):
reg_urls = re.compile('<a href="(.*?)"') reg_urls = re.compile('<a href="(.*?)"')
self.temp = reg_urls.findall(self.results) self.temp = reg_urls.findall(self.results)
allurls = self.unique() allurls = self.unique()
for x in allurls: for iteration in allurls:
if x.count('webcache') or x.count('google.com') or x.count('search?hl'): if iteration.count('webcache') or iteration.count('google.com') or iteration.count('search?hl'):
pass pass
else: else:
urls.append(x) urls.append(iteration)
return urls return urls
def hostnames(self): def hostnames(self):
@ -61,24 +61,24 @@ def people_googleplus(self):
reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+') reg_people = re.compile(r'>[a-zA-Z0-9._ ]* - Google\+')
self.temp = reg_people.findall(self.results) self.temp = reg_people.findall(self.results)
resul = [] resul = []
for x in self.temp: for iteration in self.temp:
y = x.replace(' | LinkedIn', '') delete = iteration.replace(' | LinkedIn', '')
y = y.replace(' profiles ', '') delete = delete.replace(' profiles ', '')
y = y.replace('LinkedIn', '') delete = delete.replace('LinkedIn', '')
y = y.replace('"', '') delete = delete.replace('"', '')
y = y.replace('>', '') delete = delete.replace('>', '')
if y != " ": if delete != " ":
resul.append(y) resul.append(delete)
return resul return resul
def hostnames_all(self):
    """Collect the host part of every ``<cite>...</cite>`` entry in the results.

    For each cite text an optional ``scheme://`` prefix is dropped, then
    everything from the first ``/`` (path) and the first ``:`` (port or
    credentials separator) onward is discarded. The previous implementation
    indexed ``split(':')[1].split('/')[2]`` and raised ``IndexError`` for
    cite text that contained a colon without a ``scheme://`` prefix
    (for example ``host:8080/path``).

    Hosts are appended to the existing ``self.temp`` list (it is not reset
    here), and the de-duplicated list produced by ``self.unique()`` is
    returned.
    """
    cite_pattern = re.compile('<cite>(.*?)</cite>')
    for cite in cite_pattern.findall(self.results):
        if '://' in cite:
            # Keep only what follows the scheme separator.
            cite = cite.split('://', 1)[1]
        # Cut the path first, then any ":port" / ":password@..." suffix.
        host = cite.split('/')[0].split(':')[0]
        self.temp.append(host)
    return self.unique()
@ -96,14 +96,14 @@ def people_linkedin(self):
reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn') reg_people = re.compile(r'">[a-zA-Z0-9._ -]* \| LinkedIn')
self.temp = reg_people.findall(self.results) self.temp = reg_people.findall(self.results)
resul = [] resul = []
for x in (self.temp): for iteration in (self.temp):
y = x.replace(' | LinkedIn', '') delete = iteration.replace(' | LinkedIn', '')
y = y.replace(' profiles ', '') delete = delete.replace(' profiles ', '')
y = y.replace('LinkedIn', '') delete = delete.replace('LinkedIn', '')
y = y.replace('"', '') delete = delete.replace('"', '')
y = y.replace('>', '') delete = delete.replace('>', '')
if y != " ": if delete != " ":
resul.append(y) resul.append(delete)
return resul return resul
def people_twitter(self): def people_twitter(self):
@ -111,36 +111,36 @@ def people_twitter(self):
self.temp = reg_people.findall(self.results) self.temp = reg_people.findall(self.results)
users = self.unique() users = self.unique()
resul = [] resul = []
for x in users: for iteration in users:
y = x.replace(' | LinkedIn', '') delete = iteration.replace(' | LinkedIn', '')
y = y.replace(' profiles ', '') delete = delete.replace(' profiles ', '')
y = y.replace('LinkedIn', '') delete = delete.replace('LinkedIn', '')
y = y.replace('"', '') delete = delete.replace('"', '')
y = y.replace('>', '') delete = delete.replace('>', '')
if y != " ": if delete != " ":
resul.append(y) resul.append(delete)
return resul return resul
def profiles(self):
    """Return the names found in ``... - <em>Google Profile</em>`` titles.

    The raw regex matches are stored in ``self.temp``. Each match then has
    the `` <em>Google Profile</em>`` suffix, every ``-`` and the leading
    ``">`` removed; entries that reduce to a single space are skipped.
    """
    pattern = re.compile(r'">[a-zA-Z0-9._ -]* - <em>Google Profile</em>')
    self.temp = pattern.findall(self.results)

    cleaned = (
        hit.replace(' <em>Google Profile</em>', '').replace('-', '').replace('">', '')
        for hit in self.temp
    )
    return [name for name in cleaned if name != " "]
def set(self):
    """Return the alphanumeric labels that sit directly before ``</a></font>``.

    The raw regex matches are stored in ``self.temp``. Every ``>`` is
    removed from each match first, which turns the trailing
    ``</a></font>`` into ``</a</font``; that remainder is then removed too,
    leaving only the label (possibly an empty string).
    """
    pattern = re.compile(r'>[a-zA-Z0-9]*</a></font>')
    self.temp = pattern.findall(self.results)
    return [
        match.replace('>', '').replace('</a</font', '')
        for match in self.temp
    ]
def urls(self): def urls(self):
@ -150,7 +150,7 @@ def urls(self):
def unique(self) -> list:
    """Return the entries of ``self.temp`` without duplicates, first-seen order.

    The result is also stored in ``self.new``. De-duplication uses
    ``dict.fromkeys`` (insertion-ordered) instead of repeated list
    membership tests, so it is linear rather than quadratic in the number
    of entries. Entries must be hashable; the producers of ``self.temp``
    visible in this file store ``re.findall`` output or plain strings.
    """
    self.new = list(dict.fromkeys(self.temp))
    return self.new