From e7da7d1f4b9120a47398b7967f623ecdfa750f55 Mon Sep 17 00:00:00 2001
From: DiegoCaraballo <diegocaraballo84@gmail.com>
Date: Thu, 8 Feb 2018 19:20:27 -0300
Subject: [PATCH] Comienzo con scraping para sacar todas las url de una pagina

---
 .google-cookie    |   3 +++
 EmailExtractor.py |  30 ++++++++++++++++++++++++++----
 Emails.db         | Bin 12288 -> 12288 bytes
 3 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 .google-cookie

diff --git a/.google-cookie b/.google-cookie
new file mode 100644
index 0000000..2e9b332
--- /dev/null
+++ b/.google-cookie
@@ -0,0 +1,3 @@
+#LWP-Cookies-2.0
+Set-Cookie3: 1P_JAR="2018-02-08-21"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2018-03-10 21:39:28Z"; version=0
+Set-Cookie3: NID="123=CbKGqAltj16C1zTWkq2BlUKeiOVZqivMs6OhZr687_2tU660CpirwrDwNJOUHbFVNspWDchk-RAys4X7ozXmBMM9l62e2x-RgAMWNPaLbZwivMs0VTDuc6kw-GqEbQ4B"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2018-08-10 21:39:28Z"; HttpOnly=None; version=0
diff --git a/EmailExtractor.py b/EmailExtractor.py
index ff04ed1..f42a0f1 100644
--- a/EmailExtractor.py
+++ b/EmailExtractor.py
@@ -4,6 +4,8 @@
 # Blog: www.pythondiario.com
 
 from googlesearch import search
+from bs4 import BeautifulSoup
+import urllib.request
 import random
 import os
 import time
@@ -32,10 +34,11 @@ def menu():
 
 		opcion = input("Enter option - Ingrese Opcion: ")
 		if (opcion == "1"):
+			print ("Example URL: http://www.pythondiario.com")
 			url = str(input("Enter URL - Ingrese URL: "))
-			searchEmail("Emails.db")
+			#searchEmail("Emails.db")
 			extractUrl(url)
-			input("Press any key to continue")
+			input("Press enter key to continue")
 			menu()
 
 		elif (opcion == "2"):
@@ -43,7 +46,10 @@ def menu():
 			print ("***Warning: The amount of results chosen impacts the execution time***")
 			print ("*** Advertencia: La cantidad de resultados elejidos impacta el tiempo de ejecucion")
 			cantRes = int(input("Number of results in Google - Cantiad de resultados en Google: ")) 
+			print ("")
 			extractFraseGoogle(frase, cantRes)
+			input("Press enter key to continue")
+			menu()
 		
 		elif (opcion == "3"):
 			print ("")
@@ -143,7 +149,7 @@ def listarTodo(db_file):
 		c.close()
 		
 		print ("")
-		input("Press any key to continue")
+		input("Press enter key to continue")
 		menu()
 
 	except Error as e:
@@ -153,7 +159,21 @@ def listarTodo(db_file):
 
 # Extrae los correos de una Url - 2 niveles
 def extractUrl(url):
-	pass
+	"""Fetch url and print the href of every <a> tag in the page."""
+	print ("Entro en url....")
+	try:
+		# The with-block closes the HTTP response once the body is read.
+		with urllib.request.urlopen(url) as conn:
+			html = conn.read()
+		soup = BeautifulSoup(html, "lxml")
+		for tag in soup.find_all('a'):
+			link = tag.get('href', None)
+			if link is not None:
+				print (link)
+
+	except Exception as e:
+		# Best effort: report the failure and return to the caller.
+		print(e)
 
 # Extrae los correos de todas las Url encontradas en las busquedas
 # De cada Url extrae los correo - 2 niveles
@@ -162,6 +182,8 @@ def extractFraseGoogle(frase, cantRes):
 	for url in search(frase, stop=cantRes):
             print(url)
 
+	input("Press enter key to continue")
+
 # Limpia la pantalla según el sistema operativo
 def clear():
 	try:
diff --git a/Emails.db b/Emails.db
index 2357921ae5da5d0c6f81fdb1a7059d2b53feb8e0..6a6ce29c863275c0a916dca60b41b24238253ca7 100644
GIT binary patch
delta 86
zcmV-c0IC0gV1Qtd8v!zr977Q@04^BF4G%p40uJi{4(ks=vk@SB4zueH837y&4BP+@
sy$^a1HxBd;u?}tyD-Gxkr43>Y9}L{Fal8xyK@5{29YV7p9p4fG3=uIHH2?qr

delta 77
zcmV-T0J8spV1Qtd8v!bj977Q*04@D;4G%p40uJi{4(ks=vk@SB4zueH837su4)g#I
jy$^a1HxBd;u?}tyD-Gxkr43>YAF*)(4YMB|;1U4@TPzi~