From 847d892e18849c9f58136f180a72eadac509cf18 Mon Sep 17 00:00:00 2001
From: Diego Caraballo <dcaraballo@amsj.com.uy>
Date: Wed, 13 Jul 2022 11:19:21 -0300
Subject: [PATCH] Save emails in .csv

---
 EmailExtractor.py |  72 +++++++++++++++++++++++++++++++++++++---------
 README.md         |  23 +++++++++------
 requirements.txt  | Bin 88 -> 188 bytes
 3 files changed, 72 insertions(+), 23 deletions(-)

diff --git a/EmailExtractor.py b/EmailExtractor.py
index da7e09b..5dcdfd4 100644
--- a/EmailExtractor.py
+++ b/EmailExtractor.py
@@ -19,9 +19,11 @@ import re
 from fake_useragent import UserAgent
 from socket import timeout
 from urllib.error import HTTPError, URLError
+from datetime import datetime
+import csv
 
-imageExt = ["jpeg", "jpg", "exif", "tif", "tiff", "gif", "bmp", "png", "ppm",
-			"pgm", "pbm", "pnm", "webp", "hdr", "heif", "bat", "bpg", "cgm", "svg"]
+imageExt = (".jpeg", ".jpg", ".exif", ".tif", ".tiff", ".gif", ".bmp", ".png", ".ppm",
+			".pgm", ".pbm", ".pnm", ".webp", ".hdr", ".heif", ".bat", ".bpg", ".cgm", ".svg")
 ua = UserAgent()
 
 count_email_in_phrase = 0
@@ -52,8 +54,9 @@ def menu():
 		print("4 - Same as option 3 but with a list of keywords")
 		print("5 - List emails - Listar correos")
 		print("6 - Save emails in .txt file - Guardar correos en archivo .txt")
-		print("7 - Delete Emails from Data Base")
-		print("8 - Exit - Salir")
+		print("7 - Save emails in .csv file - Guardar correos en archivo .csv")
+		print("8 - Delete Emails from Data Base")
+		print("9 - Exit - Salir")
 		print("")
 
 		opcion = input("Enter option - Ingrese Opcion: ")
@@ -136,6 +139,9 @@ def menu():
 				menu()
 
 		elif (opcion == "7"):
+			guardarCsv("Emails.db")
+
+		elif (opcion == "8"):
 			print("")
 			print("1 - Delete emails from a especific URL")
 			print("2 - Delete emails from a especific phrase")
@@ -159,7 +165,7 @@ def menu():
 				time.sleep(2)
 				menu()
 		
-		elif (opcion == "8"):
+		elif (opcion == "9"):
 			sys.exit(0)
 
 		else:			
@@ -680,15 +686,14 @@ def extractOnlyUrl(url):
 		contentType = conn.info().get_content_type()
 
 		if(status != 200 or contentType == "audio/mpeg"):
-    			raise ValueError('Bad Url...')
-
+			raise ValueError('Bad Url...')
 
 		html = conn.read().decode('utf-8')
 
 		emails = re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}', html)
 
 		for email in emails:
-			if (email not in listUrl and email[-3:] not in imageExt):
+			if (email not in listUrl and not email.endswith(imageExt)):
 				count += 1
 				print(str(count) + " - " + email)
 				listUrl.append(email)
@@ -736,7 +741,7 @@ def extractUrl(url):
 		contentType = conn.info().get_content_type()
 
 		if(status != 200 or contentType == "audio/mpeg"):
-    			raise ValueError('Bad Url...')
+			raise ValueError('Bad Url...')
 
 		html = conn.read().decode('utf-8')
 		
@@ -744,7 +749,7 @@ def extractUrl(url):
 		print ("Searching in " + url)
 		
 		for email in emails:
-			if (email not in listUrl and email[-3:] not in imageExt):
+			if (email not in listUrl and not email.endswith(imageExt)):
 					count += 1
 					print(str(count) + " - " + email)
 					listUrl.append(email)
@@ -794,7 +799,7 @@ def extractUrl(url):
 						emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s)
 
 						for email in emails:
-							if (email not in listUrl and email[-3:] not in imageExt):
+							if (email not in listUrl and not email.endswith(imageExt)):
 								count += 1
 								print(str(count) + " - " + email)
 								listUrl.append(email)
@@ -909,8 +914,6 @@ def extractKeywordsList(txtFile):
 	for key in keywordList:
     		print(key)
 
-
-
 # Limpia la pantalla según el sistema operativo
 def clear():
 	try:
@@ -934,7 +937,7 @@ def searchSpecificLink(link, listEmails, frase):
 			s = f.read().decode('utf-8')
 			emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s)
 			for email in emails:
-				if (email not in listEmails and email[-3:] not in imageExt):
+				if (email not in listEmails and not email.endswith(imageExt)):
 					count_email_in_phrase += 1
 					listEmails.append(email)
 					print(str(count_email_in_phrase) + " - " + email)										
@@ -955,6 +958,47 @@ def searchSpecificLink(link, listEmails, frase):
 		print(e)
 		pass
 
+def guardarCsv(db_file):
+	"""Export the emails stored in db_file to a timestamped .csv file.
+
+	Writes a 'Phrase', 'Email', 'Url' header followed by one row per record
+	of the "emails" table, taken from columns 1-3 of each record. Errors are
+	printed rather than raised, and the menu is shown again in every case.
+
+	db_file -- path of the SQLite database to read.
+	"""
+
+	conn = None
+	try:
+		conn = sqlite3.connect(db_file)
+
+		nameFile = datetime.now().strftime('csvemails_%Y_%m_%d_%H_%M_%S.csv')
+		print("")
+		print("Creating csv, please wait...")
+
+		# The with-block closes the file even if the query fails part-way.
+		with open(nameFile, "w", newline="") as f:
+			writer = csv.writer(f)
+			writer.writerow(['Phrase', 'Email', 'Url'])
+
+			# Column 0 is skipped (presumably the row id; confirm against the schema).
+			for i in conn.execute('SELECT * FROM emails'):
+				writer.writerow([str(i[1]), str(i[2]), str(i[3])])
+
+	except Exception as e:
+		# Best effort: report the problem and still return to the menu below.
+		print(e)
+
+	finally:
+		# conn is still None when sqlite3.connect() itself failed.
+		if conn is not None:
+			conn.close()
+
+	# Prompt outside the try block so the database is already closed and
+	# errors raised later from menu() are not swallowed by the handler above.
+	input("Press enter to continue")
+	menu()
+
 # Inicio de Programa
 def Main():
 	clear()
diff --git a/README.md b/README.md
index 7432ecd..420eb6e 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,9 @@
   <img width="560" height="400" src="https://github.com/DiegoCaraballo/Email-extractor/blob/master/EmailExtractor.PNG">
 </p>
 
+# Add Feature: 13-07-2022
+- You can save the mailing list in a .csv file
+
 # Fix: 13-09-2019
 - Fix - The script was pasted when searching for phrases on Google.
 - Add Requirements - pip install -r requirements.txt
@@ -12,9 +15,12 @@
 - (1) Extract emails from a single URL
 - (2) Extract emails from a URL (Two Levels) - Search on the page and all its URLs
 - (3) Do a Google search, save the Urls found and search the emails
-- (4) You can list the saved emails
-- (5) You can save the mailing list in a .txt file
-- (6) Delete Emails from data base
+- (4) Same as option 3 but with a list of keywords (TODO)
+- (5) You can list the saved emails
+- (6) You can save the mailing list in a .txt file
+- (7) You can save the mailing list in a .csv file
+- (8) Delete Emails from data base
+- (9) Exit
 
 - The emails are stored in a Sqlite database ("Emails.db")
 
@@ -22,9 +28,12 @@
 - (1) Extraer los correos de una única URL
 - (2) Extraer los correos de una Url (Dos Niveles) - Busca sobre la página y todas sus URL
 - (3) Hacer una busqueda en Google, guardar las Urls encontradas y buscar los correos en dichas Urls
-- (4) Los correos son guardados en una base de datos Sqlite
-- (5) Se pueden listar los correos guardados
+- (4) Igual que la opción 3 pero con una lista de palabras (TODO)
+- (5) Listar correos guardados
 - (6) Se pueden guardar los correo en un archivo .txt
+- (7) Se pueden guardar los correos en un archivo .csv
+- (8) Eliminar correos de la base de datos
+- (9) Salir
 
 - Todos los correos son guardados en una base de datos Sqlite ("Emails.db")
 
@@ -32,12 +41,8 @@
 
 # Required modules - Modulos necesarios
 
-**Update 13-09-2019**
-
 pip install -r requirements.txt
 
-[Software para Empresas](https://www.softtero.com)
-
 [Extraer correos de paginas web con Python](http://www.pythondiario.com/2018/04/extraer-correos-electronicos-de-paginas.html)
 
 ## Docker
diff --git a/requirements.txt b/requirements.txt
index 20dca2f9ba13abd6476afb6fccf1187fce7de107..1e872d832cbdeb369d382b1d71487b4561461bf1 100644
GIT binary patch
literal 188
zcmXwzK?=e^5CrQi_>_di<e&#16Cnn1HR7uHdbMUT3^UC#)xGom92gmR^5Dc#O=M>E
zM$g{rsoB~MYgQ5iXC0M9R`0$i^{PMUMzSs47Jf2omPj(O-;|_UjkLs@i*xyBSDdj~
QZpdCW^7^|YIhd}ve=0^DH~;_u

literal 88
zcmWm5F%Ezr3_#Jncd?<!phFLXNg>24MyT=l%KCraCeeCHrmfl)m;)~8AjMrKYjVdd
gXJ>XuNCdgotitTkp$B~r9biR*p<9vXpY(?rKKW7_`Tzg`