From e7da7d1f4b9120a47398b7967f623ecdfa750f55 Mon Sep 17 00:00:00 2001 From: DiegoCaraballo Date: Thu, 8 Feb 2018 19:20:27 -0300 Subject: [PATCH] Comienzo con scraping para sacar todas las url de una pagina --- .google-cookie | 3 +++ EmailExtractor.py | 30 ++++++++++++++++++++++++++---- Emails.db | Bin 12288 -> 12288 bytes 3 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 .google-cookie diff --git a/.google-cookie b/.google-cookie new file mode 100644 index 0000000..2e9b332 --- /dev/null +++ b/.google-cookie @@ -0,0 +1,3 @@ +#LWP-Cookies-2.0 +Set-Cookie3: 1P_JAR="2018-02-08-21"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2018-03-10 21:39:28Z"; version=0 +Set-Cookie3: NID="123=CbKGqAltj16C1zTWkq2BlUKeiOVZqivMs6OhZr687_2tU660CpirwrDwNJOUHbFVNspWDchk-RAys4X7ozXmBMM9l62e2x-RgAMWNPaLbZwivMs0VTDuc6kw-GqEbQ4B"; path="/"; domain=".google.com"; path_spec; domain_dot; expires="2018-08-10 21:39:28Z"; HttpOnly=None; version=0 diff --git a/EmailExtractor.py b/EmailExtractor.py index ff04ed1..f42a0f1 100644 --- a/EmailExtractor.py +++ b/EmailExtractor.py @@ -4,6 +4,8 @@ # Blog: www.pythondiario.com from googlesearch import search +from bs4 import BeautifulSoup +import urllib.request import random import os import time @@ -32,10 +34,11 @@ def menu(): opcion = input("Enter option - Ingrese Opcion: ") if (opcion == "1"): + print ("Example URL: http://www.pythondiario.com") url = str(input("Enter URL - Ingrese URL: ")) - searchEmail("Emails.db") + #searchEmail("Emails.db") extractUrl(url) - input("Press any key to continue") + input("Press enter key to continue") menu() elif (opcion == "2"): @@ -43,7 +46,10 @@ def menu(): print ("***Warning: The amount of results chosen impacts the execution time***") print ("*** Advertencia: La cantidad de resultados elejidos impacta el tiempo de ejecucion") cantRes = int(input("Number of results in Google - Cantiad de resultados en Google: ")) + print ("") extractFraseGoogle(frase, cantRes) + input("Press enter key to continue") + menu() elif (opcion == "3"): print ("") @@ -143,7 +149,7 @@ def listarTodo(db_file): c.close() print ("") - input("Press any key to continue") + input("Press enter key to continue") menu() except Error as e: @@ -153,7 +159,21 @@ def listarTodo(db_file): # Extrae los correos de una Url - 2 niveles def extractUrl(url): - pass + print ("Entro en url....") + try: + conn = urllib.request.urlopen(url) + html = conn.read() + + soup = BeautifulSoup(html, "lxml") + links = soup.find_all('a') + + for tag in links: + link = tag.get('href', None) + if link is not None: + print (link) + + except Exception as e: + print(e) # Extrae los correos de todas las Url encontradas en las busquedas # De cada Url extrae los correo - 2 niveles @@ -162,6 +182,8 @@ def extractFraseGoogle(frase, cantRes): for url in search(frase, stop=cantRes): print(url) + input("Press enter key to continue") + # Limpia la pantalla segĂșn el sistema operativo def clear(): try: diff --git a/Emails.db b/Emails.db index 2357921ae5da5d0c6f81fdb1a7059d2b53feb8e0..6a6ce29c863275c0a916dca60b41b24238253ca7 100644 GIT binary patch delta 86 zcmV-c0IC0gV1Qtd8v!zr977Q@04^BF4G%p40uJi{4(ks=vk@SB4zueH837y&4BP+@ sy$^a1HxBd;u?}tyD-Gxkr43>Y9}L{Fal8xyK@5{29YV7p9p4fG3=uIHH2?qr delta 77 zcmV-T0J8spV1Qtd8v!bj977Q*04@D;4G%p40uJi{4(ks=vk@SB4zueH837su4)g#I jy$^a1HxBd;u?}tyD-Gxkr43>YAF*)(4YMB|;1U4@TPzi~