Starting with scraping to extract all the URLs from a page

DiegoCaraballo 2018-02-08 19:20:27 -03:00
parent 8373a9f5c1
commit e7da7d1f4b
3 changed files with 29 additions and 4 deletions

.google-cookie Normal file

@@ -0,0 +1,3 @@
#LWP-Cookies-2.0
Set-Cookie3: 1P_JAR="2018-02-08-21"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2018-03-10 21:39:28Z"; version=0
Set-Cookie3: NID="123=CbKGqAltj16C1zTWkq2BlUKeiOVZqivMs6OhZr687_2tU660CpirwrDwNJOUHbFVNspWDchk-RAys4X7ozXmBMM9l62e2x-RgAMWNPaLbZwivMs0VTDuc6kw-GqEbQ4B"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2018-08-10 21:39:28Z"; HttpOnly=None; version=0
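
The .google-cookie file above is in LWP-Cookies-2.0 format; it appears to be the cookie jar that the googlesearch module drops next to the script when it queries Google. A minimal sketch of inspecting it with the standard library (the file name is taken from this commit, everything else is stdlib):

from http.cookiejar import LWPCookieJar

jar = LWPCookieJar(".google-cookie")
jar.load(ignore_discard=True, ignore_expires=True)  # parse the LWP-Cookies-2.0 file
for cookie in jar:
    print(cookie.name, cookie.domain, cookie.expires)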


@@ -4,6 +4,8 @@
# Blog: www.pythondiario.com
from googlesearch import search
from bs4 import BeautifulSoup
import urllib.request
import random
import os
import time
@@ -32,10 +34,11 @@ def menu():
    opcion = input("Enter option - Ingrese Opcion: ")
    if (opcion == "1"):
        print ("Example URL: http://www.pythondiario.com")
        url = str(input("Enter URL - Ingrese URL: "))
        searchEmail("Emails.db")
        #searchEmail("Emails.db")
        extractUrl(url)
        input("Press any key to continue")
        input("Press enter key to continue")
        menu()
    elif (opcion == "2"):
@@ -43,7 +46,10 @@ def menu():
print ("***Warning: The amount of results chosen impacts the execution time***")
print ("*** Advertencia: La cantidad de resultados elejidos impacta el tiempo de ejecucion")
cantRes = int(input("Number of results in Google - Cantiad de resultados en Google: "))
print ("")
extractFraseGoogle(frase, cantRes)
input("Press enter key to continue")
menu()
elif (opcion == "3"):
print ("")
@@ -143,7 +149,7 @@ def listarTodo(db_file):
        c.close()
        print ("")
        input("Press any key to continue")
        input("Press enter key to continue")
        menu()
    except Error as e:
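
For context, a minimal sketch of what listarTodo appears to do around the lines shown above, assuming an emails table with a single column (the table and column names are assumptions, not part of this commit):

import sqlite3
from sqlite3 import Error

def listarTodo(db_file):
    try:
        conn = sqlite3.connect(db_file)
        c = conn.cursor()
        for row in c.execute("SELECT * FROM emails"):  # table name is an assumption
            print(row[0])
        c.close()
        conn.close()
    except Error as e:
        print(e)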
@@ -153,7 +159,21 @@ def listarTodo(db_file):
# Extracts the emails from a URL - 2 levels
def extractUrl(url):
    pass
    print ("Entered url....")
    try:
        conn = urllib.request.urlopen(url)
        html = conn.read()
        soup = BeautifulSoup(html, "lxml")
        links = soup.find_all('a')
        for tag in links:
            link = tag.get('href', None)
            if link is not None:
                print (link)
    except Exception as e:
        print(e)
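
The comment above promises 2 levels, but the body committed here only prints the links of the starting page. A minimal sketch of the second level, assuming relative hrefs should be resolved against the base URL before being visited (the extractUrl2 name and the depth handling are illustrative, not part of the commit):

from urllib.parse import urljoin
import urllib.request
from bs4 import BeautifulSoup

def extractUrl2(url, nivel=1):
    try:
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "lxml")
        for tag in soup.find_all('a'):
            link = tag.get('href', None)
            if link is None:
                continue
            absoluto = urljoin(url, link)   # resolve relative hrefs
            print(absoluto)
            if nivel < 2:                   # follow links one level deeper
                extractUrl2(absoluto, nivel + 1)
    except Exception as e:
        print(e)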
# Extracts the emails from all the URLs found in the searches
# From each URL it extracts the emails - 2 levels
@@ -162,6 +182,8 @@ def extractFraseGoogle(frase, cantRes):
    for url in search(frase, stop=cantRes):
        print(url)
    input("Press enter key to continue")
# Clears the screen depending on the operating system
def clear():
    try:
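
Tying it together: a minimal sketch of how the search results and extractUrl can be combined, mirroring the loop in extractFraseGoogle above (the buscarYExtraer name and the example phrase are made up for illustration; extractUrl is the function from this commit):

from googlesearch import search

def buscarYExtraer(frase, cantRes):
    for url in search(frase, stop=cantRes):  # same call used in extractFraseGoogle
        extractUrl(url)                      # print every link found on each result

buscarYExtraer("python email scraping", 5)   # example phrase, an assumption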

BIN Emails.db
Binary file not shown.