2018-02-08 02:13:19 +08:00
|
|
|
#!/usr/bin/python
|
|
|
|
# -*- coding: utf-8 -*-
|
2018-02-09 03:53:13 +08:00
|
|
|
# Comentarios en Español
|
|
|
|
# Blog: www.pythondiario.com
|
2018-02-08 02:13:19 +08:00
|
|
|
|
2018-02-08 22:29:19 +08:00
|
|
|
from googlesearch import search
|
2018-02-09 06:20:27 +08:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
import urllib.request
|
2018-02-08 02:13:19 +08:00
|
|
|
import random
|
|
|
|
import os
|
|
|
|
import time
|
2018-02-08 11:15:41 +08:00
|
|
|
import sqlite3
|
|
|
|
from sqlite3 import Error
|
2018-02-09 03:53:13 +08:00
|
|
|
import sys
|
2018-02-09 08:28:13 +08:00
|
|
|
import re
|
2018-02-08 02:13:19 +08:00
|
|
|
|
2018-02-09 01:12:58 +08:00
|
|
|
# Menú Principal
|
2018-02-08 02:13:19 +08:00
|
|
|
def menu():
    """Show the main menu, read one option from stdin and run it.

    Options 1-3 run an extraction, wait for Enter and then show the menu
    again by calling ``menu()`` recursively. Option 4 opens a listing
    sub-menu, option 5 only prints its sub-menu (saving is not implemented
    here), option 6 exits the process, and anything else prints an error
    and shows the menu again.

    Any ``Exception`` raised while handling the option (for example a
    non-numeric result count in option 3) is printed and the function
    returns. ``sys.exit`` raises ``SystemExit``, which is not caught here.
    """
    try:
        clear()
        print("###################################################################")
        print("# #")
        print("# EMAIL EXTRACTOR #")
        print("# #")
        print("###################################################################")
        print("")
        print(" ENGLISH - ESPAÑOL ")
        print("-------------------------------------------------------------------")
        print("1 - Search only in the entered URL - Buscar solo en la URL ingresada")
        print("2 - Search in a url (Two Levels) - Buscar en una URL(Dos Niveles) **Enter the url and the ones you find inside**")
        print("3 - Search phrase in google - Buscar frase en Google")
        print("4 - List emails - Listar correos")
        print("5 - Save emails in .txt file - Guardar correos en archivo .txt")
        print("6 - Exit - Salir")
        print("")

        opcion = input("Enter option - Ingrese Opcion: ")

        if opcion == "1":
            print("Example URL: http://www.pythondiario.com")
            url = input("Enter URL - Ingrese URL: ")
            extractOnlyUrl(url)
            input("Press enter key to continue")
            menu()

        # This must be part of the same chain as option 1: as a separate
        # ``if`` the final ``else`` also ran after option 1 had finished,
        # printing "Select a correct option" for a valid choice.
        elif opcion == "2":
            print("Example URL: http://www.pythondiario.com")
            url = input("Enter URL - Ingrese URL: ")
            extractUrl(url)
            input("Press enter key to continue")
            menu()

        elif opcion == "3":
            frase = input("Enter a phrase to search - Ingrese una frase a buscar: ")
            print("***Warning: The amount of results chosen impacts the execution time***")
            print("*** Advertencia: La cantidad de resultados elejidos impacta el tiempo de ejecucion")
            # int() raises ValueError on non-numeric input; the outer
            # handler prints it.
            cantRes = int(input("Number of results in Google - Cantiad de resultados en Google: "))
            print("")
            extractFraseGoogle(frase, cantRes)
            input("Press enter key to continue")
            menu()

        elif opcion == "4":
            print("")
            print("1 - Select a phrase - Seleccionar una frase")
            print("2 - All emails - Todos los correos")
            opcListar = input("Enter option - Ingrese Opcion: ")

            if opcListar == "1":
                listarPorFrase()
            elif opcListar == "2":
                listarTodo("Emails.db")

        elif opcion == "5":
            # Only the sub-menu is printed; no choice is read or acted on.
            print("")
            print("1 - Save emails from a phrase - Guardar correos de una frase")
            print("2 - Save all emails - Guardar todos los correos")

        elif opcion == "6":
            sys.exit(0)

        else:
            print("Select a correct option - Seleccione un opcion correcta")
            time.sleep(2)
            clear()
            menu()

    except Exception as e:
        print(e)
|
2018-02-08 08:03:48 +08:00
|
|
|
|
2018-02-09 01:12:58 +08:00
|
|
|
# Insertar correo, frase y Url en base de datos
|
|
|
|
def insertEmail(db_file, email, frase, url):
    """Insert one row into the ``emails`` table of a SQLite database.

    Args:
        db_file: Path of the SQLite database file.
        email: Value stored in the ``email`` column.
        frase: Value stored in the ``frase`` column.
        url: Value stored in the ``url`` column.

    Returns:
        None. ``sqlite3.Error`` exceptions are printed rather than raised.
    """
    # Bind the name before the try so the finally block is safe even when
    # sqlite3.connect() itself fails (e.g. the file cannot be opened).
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        c = conn.cursor()
        # Parameterized query: the values are never interpolated into SQL.
        c.execute("INSERT INTO emails (frase, email, url) VALUES (?,?,?)", (frase, email, url))
        conn.commit()
    except Error as e:
        print(e)
    finally:
        if conn is not None:
            conn.close()
|
2018-02-08 11:15:41 +08:00
|
|
|
|
|
|
|
# Buscar correo en la base de datos
|
|
|
|
def searchEmail(db_file):
    """Placeholder for an e-mail lookup in the database; does nothing yet.

    Args:
        db_file: Path of the SQLite database file (currently unused).

    Returns:
        None.
    """
    # TODO: not implemented. An earlier commented-out draft only sketched
    # opening a sqlite3 connection to db_file, printing any sqlite3 Error
    # and closing the connection.
    return None
|
|
|
|
|
|
|
|
# Crea tabla principal
|
2018-02-08 11:49:02 +08:00
|
|
|
def crearTabla(db_file):
    """Create the ``emails`` table in the given SQLite database if missing.

    The table has an auto-incrementing ``ID`` primary key and three
    NOT NULL text columns: ``frase``, ``email`` and ``url``.

    Args:
        db_file: Path of the SQLite database file.

    Returns:
        None. ``sqlite3.Error`` exceptions are printed rather than raised.
    """
    # Bind the name before the try so the finally block is safe even when
    # sqlite3.connect() itself fails.
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        c = conn.cursor()

        sql = '''create table if not exists emails
                 (ID INTEGER PRIMARY KEY AUTOINCREMENT,
                 frase varchar(500) NOT NULL,
                 email varchar(200) NOT NULL,
                 url varchar(500) NOT NULL)'''

        c.execute(sql)
        c.close()
        # Commit explicitly so the table creation does not depend on the
        # driver's implicit transaction handling for DDL statements.
        conn.commit()
    except Error as e:
        print(e)
    finally:
        if conn is not None:
            conn.close()
|
|
|
|
|
2018-02-09 01:12:58 +08:00
|
|
|
# Lista correos por frase
|
|
|
|
def listarPorFrase():
    """Placeholder for listing e-mails by search phrase; does nothing yet.

    Returns:
        None.
    """
    # TODO: not implemented.
    return None
|
|
|
|
|
2018-02-08 12:53:25 +08:00
|
|
|
# Lista todos los correos
|
|
|
|
def listarTodo(db_file):
    """Print every row of the ``emails`` table, then return to the menu.

    Each row is printed as its first four columns labelled Number, Search,
    Email and Url, followed by a separator line. After a successful
    listing the user is asked to press Enter and ``menu()`` is called.

    Args:
        db_file: Path of the SQLite database file.

    Returns:
        None. On a ``sqlite3.Error`` the error is printed and the function
        returns without prompting or re-entering the menu.
    """
    # Bind the name before the try so the finally block is safe even when
    # sqlite3.connect() itself fails.
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        c = conn.cursor()
        c.execute("SELECT * FROM emails")

        for row in c:
            print("")
            print("Number: " + str(row[0]))
            print("Search: " + str(row[1]))
            print("Email: " + str(row[2]))
            print("Url: " + str(row[3]))
            print("-------------------------------------------------------------------------------")

        c.close()
    except Error as e:
        print(e)
        return
    finally:
        if conn is not None:
            conn.close()

    # The connection is already closed here, so it is not held open for as
    # long as the nested menu keeps running.
    print("")
    input("Press enter key to continue")
    menu()
|
2018-02-08 11:15:41 +08:00
|
|
|
|
2018-02-09 08:28:13 +08:00
|
|
|
|
|
|
|
def extractOnlyUrl(url):
    """Download one URL and print every e-mail address found in its body.

    Matches are printed as ``<n> - <address>`` in order of appearance,
    numbered from 1; duplicates are not removed.

    Args:
        url: URL to open with ``urllib.request.urlopen``.

    Returns:
        None. Any exception (bad URL, network error, ...) is printed
        rather than raised.
    """
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(url) as conn:
            charset = conn.headers.get_content_charset() or 'utf-8'
            raw = conn.read()

        # Decode leniently: one undecodable byte should not discard the
        # whole page. Fall back to UTF-8 if the declared charset is unknown.
        try:
            html = raw.decode(charset, errors='replace')
        except LookupError:
            html = raw.decode('utf-8', errors='replace')

        print("Searching emails... please wait")

        emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", html)

        for count, email in enumerate(emails, start=1):
            print(str(count) + " - " + email)

    except Exception as e:
        print(e)
|
|
|
|
|
2018-02-08 08:03:48 +08:00
|
|
|
# Extrae los correos de una Url - 2 niveles
|
|
|
|
def extractUrl(url):
    """Print e-mail addresses found on the pages linked from ``url``.

    The page at ``url`` is parsed for ``<a>`` tags. Every ``href`` is
    announced with "Searching in ...", but only hrefs starting with
    ``http`` are downloaded and scanned. Matches are printed as
    ``<n> - <address>`` with one counter running across all pages.

    Args:
        url: URL of the starting page.

    Returns:
        None. A failure on the starting page is printed; a failure on an
        individual link is skipped so the remaining links are still tried.
    """
    print("Searching emails... please wait")
    print("This operation may take several minutes")
    try:
        count = 0

        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(url) as conn:
            html = conn.read()

        soup = BeautifulSoup(html, "lxml")

        for tag in soup.find_all('a'):
            link = tag.get('href', None)
            if link is None:
                continue
            try:
                print("Searching in " + link)
                if not link.startswith('http'):
                    continue
                with urllib.request.urlopen(link) as f:
                    # Lenient decoding: a page with a few non-UTF-8 bytes
                    # is still searched instead of being skipped.
                    s = f.read().decode('utf-8', errors='replace')
                emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s)
                for email in emails:
                    count += 1
                    print(str(count) + " - " + email)
            except Exception:
                # Deliberate best effort: carry on with the next link.
                continue

    except Exception as e:
        print(e)
|
2018-02-08 08:03:48 +08:00
|
|
|
|
|
|
|
# Extrae los correos de todas las Url encontradas en las busquedas
|
|
|
|
# De cada Url extrae los correo - 2 niveles
|
|
|
|
def extractFraseGoogle(frase, cantRes):
    """Search for a phrase and print the links found on each result page.

    The result URLs are collected from ``search(frase, stop=cantRes)``.
    Each one is downloaded and the ``href`` of every ``<a>`` tag is
    printed. NOTE: as written, this only prints links; no e-mail
    addresses are extracted here.

    Args:
        frase: Phrase passed to ``search``.
        cantRes: Value passed as the ``stop`` argument of ``search``.

    Returns:
        None. Errors are printed rather than raised.
    """
    try:
        listUrl = list(search(frase, stop=cantRes))
    except Exception as e:
        print(e)
        return

    for result_url in listUrl:
        # Handle errors per result so one unreachable page does not abort
        # the remaining results.
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(result_url) as conn:
                html = conn.read()

            soup = BeautifulSoup(html, "lxml")

            for tag in soup.find_all('a'):
                link = tag.get('href', None)
                if link is not None:
                    print(link)
        except Exception as e:
            print(e)
|
2018-02-09 06:20:27 +08:00
|
|
|
|
2018-02-08 02:13:19 +08:00
|
|
|
# Limpia la pantalla según el sistema operativo
|
|
|
|
def clear():
    """Clear the terminal with the shell command matching ``os.name``.

    Runs ``clear`` when ``os.name`` is "posix" and ``cls`` when it is
    "ce", "nt" or "dos"; any other value does nothing. Exceptions are
    printed rather than raised.
    """
    try:
        platform_name = os.name
        if platform_name in ("ce", "nt", "dos"):
            os.system("cls")
        elif platform_name == "posix":
            os.system("clear")
    except Exception as err:
        print(err)
|
2018-02-08 02:13:19 +08:00
|
|
|
|
2018-02-08 11:49:02 +08:00
|
|
|
# Inicio de Programa
|
2018-02-09 03:53:13 +08:00
|
|
|
def Main():
    """Program entry point: clear the screen, set up the DB, run the menu.

    After ``menu()`` returns, a sample row is inserted into ``Emails.db``.
    """
    clear()
    crearTabla("Emails.db")
    menu()
    # Keyword arguments keep each value in its column: insertEmail's
    # signature is (db_file, email, frase, url), and the earlier positional
    # call passed the phrase as the e-mail and the e-mail as the phrase.
    insertEmail(
        "Emails.db",
        email="prueba@gmail.com",
        frase="Programadores en Uruguay",
        url="www.pythondiario.com",
    )
|
|
|
|
|
|
|
|
# Run the interactive program only when executed as a script, so importing
# this module does not start the menu.
if __name__ == "__main__":
    Main()
|
|
|
|
|