import os
import random
import time
from datetime import datetime
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
import mysql.connector
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# MySQL database configuration
db_config = {
    'host': os.getenv('DB_HOST'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'database': os.getenv('DB_NAME')
}


# Fetch the list of proxies stored in the database
def get_proxies():
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()
        cursor.execute("SELECT proxy_address FROM proxies")  # proxies table in the DB
        proxies = [row[0] for row in cursor.fetchall()]
        cursor.close()
        conn.close()
        return proxies
    except mysql.connector.Error as e:
        print(f"Error fetching proxies: {e}")
        return []


# Insert a search result into the database
def insert_search_result(site, url, keyword, description):
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()
        cursor.execute("""
            INSERT IGNORE INTO search_results (site, url, keyword, description, date_found)
            VALUES (%s, %s, %s, %s, %s)
        """, (site, url, keyword, description, datetime.now()))
        conn.commit()
        print(f"Result inserted: {url}")
    except mysql.connector.Error as e:
        print(f"Database insert error: {e}")
    finally:
        # Only close handles that were actually created (the connect call may fail)
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()


# Scrape Google search results using proxies and long pauses between requests
def google_search(query, num_pages=5):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    results = []
    proxies = get_proxies()  # proxy list from the DB

    for page in range(num_pages):
        start = page * 10
        # URL-encode the query so spaces and operators like "site:" survive
        url = f"https://www.google.com/search?q={quote_plus(query)}&start={start}"

        # Pick a random proxy if any are available
        proxy = random.choice(proxies) if proxies else None
        proxies_dict = {"http": proxy, "https": proxy} if proxy else None

        print(f"Requesting Google page: {url} via proxy: {proxy}")
        try:
            response = requests.get(url, headers=headers, proxies=proxies_dict, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            for g in soup.find_all('div', class_='tF2Cxc'):
                anchor = g.find('a')
                title_tag = g.find('h3')
                desc_tag = g.find('span', class_='aCOpRe')
                if not anchor or not title_tag:
                    continue  # skip blocks that don't match the expected result layout
                link = anchor['href']
                title = title_tag.text
                description = desc_tag.text if desc_tag else ""
                results.append((link, title, description))
                insert_search_result("google.com", link, query, description)

            # Pause 10-20 seconds between pages to reduce the risk of blocking
            time.sleep(random.uniform(10, 20))
        except requests.exceptions.RequestException as e:
            print(f"Request error with proxy {proxy}: {e}")
        except Exception as ex:
            print(f"Error processing page results: {ex}")

    return results


# Main entry point for the scraping run
def run_scraping():
    site = 'cheemsporn.com'
    keyword = 'putalocura gratis'
    query = f"site:{site} {keyword}"
    print(f"Running search for query: {query}")
    results = google_search(query)
    print(f"Scraping finished for: {query} ({len(results)} results)")


if __name__ == "__main__":
    run_scraping()
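

# --- Optional helper (not part of the original script): a minimal sketch of the ---
# --- MySQL schema the functions above assume. Column names and types are        ---
# --- inferred from the SELECT in get_proxies() and the INSERT IGNORE in          ---
# --- insert_search_result(); adjust them to the real schema. The .env file is    ---
# --- expected to define DB_HOST, DB_USER, DB_PASSWORD and DB_NAME.               ---
def create_tables():
    conn = mysql.connector.connect(**db_config)
    cursor = conn.cursor()
    # Assumed proxies table: one usable proxy URL per row
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS proxies (
            id INT AUTO_INCREMENT PRIMARY KEY,
            proxy_address VARCHAR(255) NOT NULL
        )
    """)
    # Assumed search_results table: the UNIQUE key on url is what lets
    # INSERT IGNORE silently skip results that were already stored
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS search_results (
            id INT AUTO_INCREMENT PRIMARY KEY,
            site VARCHAR(255),
            url VARCHAR(512) NOT NULL,
            keyword VARCHAR(255),
            description TEXT,
            date_found DATETIME,
            UNIQUE KEY uq_url (url)
        )
    """)
    conn.commit()
    cursor.close()
    conn.close()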
