import requests
from bs4 import BeautifulSoup
import mysql.connector
from datetime import datetime
import time
import random
from dotenv import load_dotenv
import os
# Load environment variables from the .env file
load_dotenv()

# MySQL database configuration
db_config = {
    'host': os.getenv('DB_HOST'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME')
}
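
# Example .env file assumed by this script (placeholder values, adjust to your setup):
#   DB_HOST=localhost
#   DB_USER=scraper_user
#   DB_PASSWORD=change_me
#   DB_NAME=scraping_db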

# Fetch the list of proxies stored in the database
def get_proxies():
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()
        cursor.execute("SELECT proxy_address FROM proxies")  # proxies table in the DB
        proxies = [row[0] for row in cursor.fetchall()]
        cursor.close()
        conn.close()
        return proxies
    except mysql.connector.Error as e:
        print(f"Error fetching proxies: {e}")
        return []
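
# Assumed schema for the proxies table (not defined in this script; adjust to your setup):
#   CREATE TABLE proxies (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       proxy_address VARCHAR(255) NOT NULL  -- e.g. "http://user:pass@host:port"
#   );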

# Insert a search result into the database
def insert_search_result(site, url, keyword, description):
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()
        cursor.execute("""
            INSERT IGNORE INTO search_results (site, url, keyword, description, date_found)
            VALUES (%s, %s, %s, %s, %s)
        """, (site, url, keyword, description, datetime.now()))
        conn.commit()
        print(f"Result inserted: {url}")
    except mysql.connector.Error as e:
        print(f"Error inserting into the database: {e}")
    finally:
        # Only close the cursor/connection if they were actually created
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
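
# Assumed schema for the search_results table (not defined in this script).
# Note: INSERT IGNORE only skips duplicates if a unique key exists, e.g. on url:
#   CREATE TABLE search_results (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       site VARCHAR(255),
#       url VARCHAR(500) UNIQUE,
#       keyword VARCHAR(255),
#       description TEXT,
#       date_found DATETIME
#   );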

# Scrape Google result pages, rotating proxies and pausing between requests
def google_search(query, num_pages=5):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    results = []
    proxies = get_proxies()  # Proxy list pulled from the DB
    for page in range(num_pages):
        start = page * 10
        url = f"https://www.google.com/search?q={query}&start={start}"
        # Pick a random proxy if any are available
        proxy = random.choice(proxies) if proxies else None
        proxies_dict = {"http": proxy, "https": proxy} if proxy else None
        print(f"Requesting Google page: {url} using proxy: {proxy}")
        try:
            response = requests.get(url, headers=headers, proxies=proxies_dict, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # These CSS classes depend on Google's result markup, which changes often
            for g in soup.find_all('div', class_='tF2Cxc'):
                link_tag = g.find('a')
                title_tag = g.find('h3')
                description_tag = g.find('span', class_='aCOpRe')
                if not link_tag or not title_tag:
                    continue  # Skip blocks that do not look like organic results
                link = link_tag['href']
                title = title_tag.text
                description = description_tag.text if description_tag else ""
                results.append((link, title, description))
                insert_search_result("google.com", link, query, description)
        except requests.exceptions.RequestException as e:
            print(f"Request error with proxy {proxy}: {e}")
        except Exception as ex:
            print(f"Error processing page results: {ex}")
        # Pause 10-20 seconds between pages to reduce the risk of blocking
        time.sleep(random.uniform(10, 20))
    return results

# Main entry point for the scraping job
def run_scraping():
    site = 'cheemsporn.com'
    keyword = 'putalocura gratis'
    query = f"site:{site} {keyword}"
    print(f"Running search for query: {query}")
    results = google_search(query)
    print(f"Scraping finished for: {query} ({len(results)} results)")


if __name__ == "__main__":
    run_scraping()