diff --git a/bot.py b/bot.py new file mode 100644 index 0000000..25a65b6 --- /dev/null +++ b/bot.py @@ -0,0 +1,537 @@ +# bot.py +# 🚀 Авто-Скрейпер — версия БЕЗ ИИ оценки (Ollama полностью удалена) + +import argparse +import logging +from pathlib import Path +from datetime import datetime +import asyncio +import tempfile +import httpx +import time + +from aiogram import Bot, Dispatcher, types, F +from aiogram.filters import Command +from aiogram.types import InlineKeyboardMarkup, InlineKeyboardButton, FSInputFile, CallbackQuery + +# Импорт только нужного +from config import CONFIG +from link_collector import collect_links +from main import process_batch, save_to_excel, load_urls + +logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO) +logger = logging.getLogger(__name__) + +# Глобальные переменные +last_output_file = None +pending_links = None + +# ==================== ГОРОДА ==================== +CITIES = { + "Нижний Новгород": {"lr": 47, "keywords": ['Нижний Новгород', 'Новгород', 'нижний новгород'], "name": "Нижний Новгород"}, + "Краснодар": {"lr": 35, "keywords": ['Краснодар', 'краснодар'], "name": "Краснодар"}, + "Москва": {"lr": 213, "keywords": ['Москва', 'москва'], "name": "Москва"} +} + +async def simple_progress(message: types.Message, done: int, total: int, last: list, unique_phones: set): + """ + Обновляет сообщение со счётчиком и уникальными номерами не чаще раза в 1 секунду (для более динамичного обновления). + last: [last_time] — mutable list для отслеживания. + unique_phones: set уникальных номеров для добавления. 
def save_config():
    """Persist the in-memory ``CONFIG`` dict to ``config.py`` as Python source.

    The ``httpx.Timeout`` stored under ``CONFIG["http"]["timeout"]`` has no
    ``repr`` that can be evaluated back, so it is rendered through a
    placeholder string that is swapped for an ``httpx.Timeout(...)``
    constructor call after pretty-printing.

    Returns:
        bool: ``True`` when the file was written, ``False`` on any failure
        (the failure is logged with its traceback).
    """
    import os
    import pprint

    placeholder = "PLACEHOLDER_TIMEOUT"
    target = 'config.py'
    tmp_target = target + '.tmp'

    try:
        timeout_obj = CONFIG["http"]["timeout"]

        # Pretty-print a shallow copy instead of mutating CONFIG itself: if
        # formatting fails, the live configuration must not be left holding
        # the placeholder string instead of the real timeout object.
        printable = {**CONFIG, "http": {**CONFIG["http"], "timeout": placeholder}}
        s = pprint.pformat(printable, width=100, sort_dicts=False)

        if isinstance(timeout_obj, httpx.Timeout):
            # Emit all four phases explicitly so write/pool values that
            # differ from the read timeout survive the round trip.
            parts = ", ".join(
                f"{phase}={getattr(timeout_obj, phase, None)!r}"
                for phase in ("connect", "read", "write", "pool")
            )
            timeout_str = f"httpx.Timeout({parts})"
        else:
            timeout_str = repr(timeout_obj)

        s = s.replace(f"'{placeholder}'", timeout_str)

        # Write to a sibling file and rename it over the target so that a
        # failed write never leaves a truncated, unimportable config.py.
        with open(tmp_target, 'w', encoding='utf-8') as f:
            f.write(f'# config.py\nimport httpx\n# 🔧 КОНФИГУРАЦИЯ\nCONFIG = {s}')
        os.replace(tmp_target, target)

        logger.info("✅ CONFIG успешно сохранён")
        return True
    except Exception as e:
        # Top-level boundary for a best-effort save: report and signal failure.
        logger.exception("❌ Ошибка сохранения config.py: %s", e)
        try:
            os.remove(tmp_target)
        except OSError:
            # Nothing to clean up (or cleanup impossible); the error above
            # has already been logged.
            pass
        return False
def output_format_keyboard():
    """Build the inline keyboard used to pick the result output format.

    One button per format, each on its own row, with callback data
    ``set_format_<key>``; the last row is a "back" button that returns to
    the main menu.
    """
    choices = (
        ("phones", "📞 Только номера"),
        ("domains", "🌐 Только домены"),
        ("both", "📞+🌐 Номера и домены"),
        ("excel", "📊 Excel файл"),
    )

    rows = []
    for fmt_key, caption in choices:
        button = InlineKeyboardButton(text=caption, callback_data=f"set_format_{fmt_key}")
        rows.append([button])

    back_button = InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")
    rows.append([back_button])

    return InlineKeyboardMarkup(inline_keyboard=rows)
def format_results(results: list, fmt: str) -> str:
    """Render scrape results as plain text for the chosen output format.

    Args:
        results: Rows shaped ``(org, phone, domain, promo, rating)``; only
            index 1 (phone) and index 2 (domain) are read here.
        fmt: ``'phones'``, ``'domains'`` or ``'both'``. Any other value
            (including ``'excel'``, which has no text form) yields ``""``.

    Returns:
        One entry per line. For ``'both'`` a row is rendered as
        ``"<phone> — <domain>"``; when only one of the two is present, just
        that value is emitted instead of printing the literal ``None``.
    """
    if fmt == "phones":
        lines = [row[1] for row in results if row[1]]
    elif fmt == "domains":
        lines = [row[2] for row in results if row[2]]
    elif fmt == "both":
        lines = [
            " — ".join(part for part in (row[1], row[2]) if part)
            for row in results
            if row[1] or row[2]
        ]
    else:
        # The Excel format is delivered as a file, not as text.
        lines = []
    return "\n".join(lines)
{last_output_file or '—'}\n\n" + f"✅ Работает без ИИ-оценки" + ) + await query.message.edit_text(status_text, reply_markup=back_to_menu_keyboard(), parse_mode="HTML") + await query.answer() + +# ==================== СМЕНА ГОРОДА ==================== +async def city_menu_callback(query: CallbackQuery): + await query.message.edit_text("🌆 Выберите город:", reply_markup=cities_keyboard(), parse_mode="HTML") + await query.answer() + +async def set_city_callback(query: CallbackQuery): + global pending_links + city_name = query.data.replace("set_city_", "") + city = CITIES[city_name] + + CONFIG["region_name"] = city["name"] + CONFIG["region_lr"] = city["lr"] + CONFIG["required_keywords"] = city["keywords"] + + save_config() + + if pending_links: + await query.answer(f"✅ Город изменён на {city_name}. Начинаю обработку ссылок...") + await query.message.edit_text("⏳ Обрабатываю ссылки из файла...", parse_mode="HTML") + + try: + processed = load_urls(pending_links) + + last = [time.time()] # для simple_progress + unique_phones = set() # для уникальных номеров + + async def progress_cb(done: int, total: int): + await simple_progress(query.message, done, total, last, unique_phones) + + raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones) # Передаём unique_phones + + seen = set() + unique = [r for r in raw if r[1] and r[2] not in seen and not seen.add(r[2])] + results = [(o, p, d, promo, "—") for o, p, d, promo in unique] + + # Финальное обновление прогресса (unique_phones уже заполнен инкрементально) + await progress_cb(len(processed), len(processed)) + + fmt = CONFIG.get("output_format", "excel") + + if fmt == "excel" and results: + global last_output_file + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + path = CONFIG["output_file"].format(timestamp=ts) + save_to_excel(results, path) + last_output_file = path + + promo_cnt = sum(1 for r in results if r[3]) + await query.message.answer( + f"✅ Обработка завершена!\n" + f"📊 Обработано 
async def show_search_callback(query: CallbackQuery):
    """Send the contents of the configured search file to the chat.

    Content of up to 3800 characters is sent inline as a message; longer
    content is written to a temporary ``.txt`` file and sent as a document.
    Any failure is reported to the chat as an error message. The callback
    query is acknowledged in all cases.
    """
    try:
        with open(CONFIG["search_file"], encoding="utf-8") as src:
            content = src.read()

        if len(content) <= 3800:
            await query.message.answer(f"📝 search.txt:\n\n{content}")
        else:
            with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                tmp.write(content)
                path = tmp.name
            try:
                await query.message.answer_document(FSInputFile(path), caption="📝 search.txt")
            finally:
                # Remove the temp copy even when sending fails, so failed
                # uploads do not accumulate files in the temp directory.
                Path(path).unlink(missing_ok=True)
    except Exception as e:
        await query.message.answer(f"❌ Ошибка: {e}")
    finally:
        # Acknowledge the callback even if the error report above failed.
        await query.answer()
with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f: + f.write(text) + path = f.name + caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты") + await query.message.answer_document(FSInputFile(path), caption=f"✅ {caption}") + Path(path).unlink() + else: + labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"} + await query.message.answer( + f"✅ {labels.get(fmt, '')} Результаты:\n\n{text}", + parse_mode="HTML" + ) + else: + await query.message.answer("⚠️ Нет данных для отображения") + else: + await query.message.answer("⚠️ Телефоны не найдены") + except Exception as e: + await query.message.answer(f"❌ Ошибка: {e}") + + await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard()) + +async def process_callback(query: CallbackQuery): + await query.answer() + await query.message.answer("📋 Отправьте ссылки для обработки\n(можно несколько через пробел)", parse_mode="HTML") + +async def manual_process(message: types.Message): + urls = [u.strip() for u in message.text.split() if u.strip().startswith("http")] + if not urls: + await message.answer("❌ Не найдено валидных ссылок") + return + + progress_msg = await message.answer(f"⏳ Обрабатываю {len(urls)} URL...") + last = [time.time()] + unique_phones = set() + + async def progress_cb(done: int, total: int): + await simple_progress(progress_msg, done, total, last, unique_phones) + + try: + processed = load_urls(urls) + raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones) # Передаём unique_phones + + seen = set() + unique = [r for r in raw if r[1] and r[2] not in seen and not seen.add(r[2])] # 🔥 Фикс + results = [(o, p, d, promo, "—") for o, p, d, promo in unique] # 🔥 Фикс распаковки + + await progress_cb(len(processed), len(processed)) + + fmt = CONFIG.get("output_format", "excel") + + if fmt == "excel" and results: + global last_output_file + ts = 
async def handle_uploaded_file(message: types.Message, bot: Bot):
    """Accept an uploaded ``.txt`` file of links and stage them for processing.

    The file is downloaded to a temporary path, every line that starts with
    ``http`` is collected into the module-level ``pending_links``, and the
    user is asked to pick a city. ``CONFIG["urls"]`` and
    ``CONFIG["input_file"]`` are cleared so that ``load_urls`` does not mix
    in links from the configuration.

    Args:
        message: Incoming message carrying the document.
        bot: Bot instance used to resolve and download the file.
    """
    global pending_links
    document = message.document

    # file_name may be absent, and ".TXT" should be accepted as well.
    if not (document.file_name or "").lower().endswith('.txt'):
        await message.answer("❌ Файл должен быть .txt")
        return

    tmp_path = None
    try:
        file_info = await bot.get_file(document.file_id)

        # Reserve a temp path and close the handle before downloading into it.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp_path = Path(tmp.name)
        await bot.download_file(file_info.file_path, str(tmp_path))

        # utf-8-sig drops a leading BOM; with plain utf-8 the BOM would stay
        # glued to the first line and that URL would fail the "http" check.
        content = tmp_path.read_text(encoding='utf-8-sig')
        urls = [line.strip() for line in content.splitlines() if line.strip().startswith('http')]

        if not urls:
            await message.answer("❌ Нет валидных ссылок в файле")
            return

        # Clear the config so load_urls does not add anything extra.
        CONFIG["urls"] = []
        CONFIG["input_file"] = ""

        pending_links = urls
        await message.answer(f"✅ Файл загружен! Найдено {len(urls)} ссылок.\nТеперь выберите город:", parse_mode="HTML")
        await message.answer("🌆 Выберите город:", reply_markup=cities_keyboard(), parse_mode="HTML")
    except Exception as e:
        await message.answer(f"❌ Ошибка: {e}")
    finally:
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)
+ parser.add_argument('--token', required=True) + args = parser.parse_args() + main_bot(args.token) \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000..02ef642 --- /dev/null +++ b/config.py @@ -0,0 +1,43 @@ +# config.py +import httpx +# 🔧 КОНФИГУРАЦИЯ +CONFIG = {'input_file': '', + 'output_file': 'C:\\Coding\\auto-scraper\\output\\phones_{timestamp}.xlsx', + 'log_file': 'C:\\Coding\\auto-scraper\\logs\\scraper.log', + 'excluded_domains': {'auto.drom.ru', + 'auto.ru', + 'autocompass-j.ru', + 'autocompass-v.ru', + 'avito.ru', + 'duckduckgo.com', + 'google.com', + 'sberauto.com', + 'sberleasing.ru'}, + 'urls': [], + 'output_format': 'both', + 'http': {'timeout': httpx.Timeout(10.0, connect=5.0), + 'max_redirects': 5, + 'retry_attempts': 3, + 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like ' + 'Gecko) Chrome/120.0.0.0 Safari/537.36', + 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8', + 'Referer': 'https://yandex.ru/', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate'}}, + 'phone': {'patterns': ['href=["\\\']tel:([^"\\\']+)["\\\']', + 'tel["\\\']?\\s*[:=]\\s*["\\\']?([+()0-9\\-\\s]{10,})["\\\']?', + '(?:телефон|phone|контакт)["\\\']?\\s*[:=]?\\s*["\\\']?([+()0-9\\-\\s]{10,})'], + 'country_code': '7', + 'min_digits': 10, + 'max_digits': 12}, + 'required_keywords': ['Краснодар', 'краснодар'], + 'stop_keywords': ['аренда', 'АРЕНДА', 'Аренда', '2311373680', 'autocompass'], + 'headless': False, + 'search_pages': 3, + 'workers': 3, + 'search_template_url': 'https://ya.ru/search/?text={search}&lr={lr}', + 'region_lr': 35, + 'region_name': 'Краснодар', + 'search_file': 'search.txt', + 'profile_dir': 'C:\\Users\\Дмитрий\\chrome_profile_yandex'} \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..2923569 --- /dev/null +++ b/main.py @@ -0,0 
def normalize_domain(url: str) -> str:
    """Return the lower-cased host of *url* without a leading ``www.`` label.

    Used as the key for duplicate detection. Only a *leading* ``www.`` is
    removed; hosts that merely contain the text (``mywww.site.ru``,
    ``shop.www.example.com``) are returned unchanged.

    Returns:
        The normalized host, or ``""`` when *url* has no host or cannot be
        parsed.
    """
    try:
        host = urlparse(url.strip()).hostname or ''
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a malformed netloc (e.g. an unclosed "[").
        return ''
    return host.lower().removeprefix('www.')
CONFIG["excluded_domains"] + + +def normalize_phone(phone: str) -> str | None: + """Нормализация телефона к формату: +7 (XXX) XXX-XX-XX""" + digits = re.sub(r"[^\d+]", "", phone.strip()) + + if digits.startswith('+7'): + digits = digits[2:] + elif digits.startswith('8') and len(digits) == 11: + digits = digits[1:] + elif digits.startswith('7') and len(digits) == 11: + digits = digits[1:] + + if len(digits) != 10: + return None + + return f"+7 ({digits[:3]}) {digits[3:6]}-{digits[6:8]}-{digits[8:10]}" + + +def extract_phone_from_html(html: str) -> str | None: + """Поиск телефона в HTML-контенте по множеству паттернов.""" + for pattern in TEL_PATTERNS: + match = pattern.search(html) + if match: + raw = match.group(1).strip() + normalized = normalize_phone(raw) + if normalized: + return normalized + return None + + +def check_content_filters(html: str) -> bool: + """Проверка HTML на наличие required_keywords и отсутствие stop_keywords (case-insensitive).""" + lower_html = html.lower() + + if CONFIG["required_keywords"]: + required_lower = [kw.lower() for kw in CONFIG["required_keywords"]] + if not any(kw in lower_html for kw in required_lower): + return False + + if CONFIG["stop_keywords"]: + stop_lower = [kw.lower() for kw in CONFIG["stop_keywords"]] + if any(kw in lower_html for kw in stop_lower): + return False + + return True + + +def analyze_redirect_chain(url: str, final_url: str) -> tuple[str, bool]: + """ + Анализирует цепочку редиректов. 
def _retry_after_seconds(header_value, default: int = 5, cap: int = 60) -> int:
    """Turn a ``Retry-After`` header value into a bounded number of seconds.

    The header may be missing or may carry an HTTP-date instead of an
    integer; in both cases *default* is used. The result is clamped to
    ``0..cap`` so one response cannot stall the whole batch.
    """
    try:
        seconds = int(header_value)
    except (TypeError, ValueError):
        seconds = default
    return min(max(seconds, 0), cap)


async def fetch_with_retry(client: httpx.AsyncClient, url: str, retries: int = 0,
                           base_delay: float = 1.0, max_retries: int = 3) -> tuple[str, str | None, str | None, bool]:
    """Fetch *url*, follow redirects and try to extract a phone number.

    Args:
        client: Shared ``httpx.AsyncClient`` used for the request.
        url: Address to fetch.
        retries: Number of attempts already spent (the starting attempt index).
        base_delay: Base of the exponential back-off, in seconds.
        max_retries: Upper bound for the attempt index at which a retry is
            still allowed.

    Returns:
        ``(url, phone, final_domain, is_promo)``. ``phone`` is ``None`` when
        the page is filtered out, contains no phone, or could not be fetched;
        in the failure case ``final_domain`` falls back to the domain of
        *url* and ``is_promo`` is ``False``.

    Retry policy:
        * ``PoolTimeout`` — at most two retries (and never more than
          *max_retries*);
        * ``ConnectTimeout``, other ``RequestError`` and unexpected
          exceptions — up to *max_retries*;
        * ``ReadTimeout`` and HTTP errors other than 429 — no retry;
        * HTTP 429 — up to *max_retries*, waiting for ``Retry-After`` plus
          jitter instead of the exponential back-off.
    """
    attempt = retries

    while True:
        # A fresh User-Agent is picked for every attempt.
        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Cache-Control": "max-age=0",
        }

        retry_allowed = False
        wait = None  # None means "use the exponential back-off"

        try:
            async with client.stream("GET", url, headers=headers, follow_redirects=True) as response:
                if response.status_code >= 400:
                    raise HTTPStatusError(
                        f"Status {response.status_code}",
                        request=response.request,
                        response=response
                    )

                final_url = str(response.url)
                full_html = ''.join([chunk async for chunk in response.aiter_text(chunk_size=8192)])

                final_domain, is_promo = analyze_redirect_chain(url, final_url)

                if not check_content_filters(full_html):
                    return url, None, final_domain, is_promo

                return url, extract_phone_from_html(full_html), final_domain, is_promo

        except PoolTimeout:
            retry_allowed = attempt < min(2, max_retries)

        except ConnectTimeout:
            retry_allowed = attempt < max_retries

        except ReadTimeout:
            retry_allowed = False

        except HTTPStatusError as exc:
            if exc.response.status_code == 429:
                # Bounded like every other branch: a host that keeps
                # answering 429 must not be retried forever.
                retry_allowed = attempt < max_retries
                wait = _retry_after_seconds(exc.response.headers.get('Retry-After')) + random.randint(1, 3)

        except RequestError:
            retry_allowed = attempt < max_retries

        except Exception:
            # Best-effort scraping: any unexpected failure (decoding,
            # parsing) is treated as transient and retried within budget.
            retry_allowed = attempt < max_retries

        if not retry_allowed:
            return url, None, normalize_domain(url), False

        if wait is None:
            wait = base_delay * (2 ** attempt) + random.uniform(0.5, 1.5)
        await asyncio.sleep(wait)
        attempt += 1
def save_to_excel(results: list[tuple], filepath: str):
    """Write scrape results to an Excel workbook at *filepath*.

    Args:
        results: Rows shaped ``(original_url, phone, final_domain, is_promo)``
            optionally followed by a ``rating``. Four-element rows, as
            returned by ``process_batch``, get the rating ``"—"``.
        filepath: Destination ``.xlsx`` path; missing parent directories are
            created.

    The sheet has a styled header row, promo rows highlighted in green,
    column widths fitted to content (capped at 60) and a frozen header.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Phone Numbers"

    headers = ["Original URL", "Phone", "Final Domain", "Promo", "Processed At", "Rating"]
    ws.append(headers)

    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF")
    for cell in ws[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal="center")

    # Loop invariants: one fill object and one timestamp for the whole export.
    promo_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for row in results:
        # Tolerate rows without a rating so callers that pass raw
        # process_batch output do not fail on unpacking.
        original_url, phone, final_domain, is_promo, *extra = row
        rating = extra[0] if extra else "—"
        promo_mark = "YES" if is_promo else "no"

        ws.append([original_url, phone, final_domain, promo_mark, timestamp, rating])

        if is_promo:
            for cell in ws[ws.max_row]:
                cell.fill = promo_fill

    for column in ws.columns:
        max_len = max((len(str(cell.value)) for cell in column if cell.value), default=0)
        ws.column_dimensions[column[0].column_letter].width = min(max_len + 2, 60)

    ws.freeze_panes = 'A2'

    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    wb.save(filepath)
конфигурации.""" + urls = [] + + urls.extend(CONFIG["urls"]) + + if CONFIG["input_file"] and Path(CONFIG["input_file"]).exists(): + try: + with open(CONFIG["input_file"], 'r', encoding='utf-8') as f: + for line in f: + line = line.strip().strip('"\'').rstrip(',') + if line and line.startswith('http'): + urls.append(line.split()[0]) + except Exception: + pass + + urls.extend(additional_urls) + + seen_domains = set() + cleaned = [] + + for url in urls: + url = url.strip() + if not url or not url.startswith('http'): + continue + domain = normalize_domain(url) + if not domain or is_excluded(domain): + continue + if domain == 'yabs.yandex.ru': + cleaned.append(url) + continue + if domain in seen_domains: + continue + seen_domains.add(domain) + cleaned.append(url) + + return cleaned + + +async def main(): + """Точка входа.""" + + parser = argparse.ArgumentParser(description="Phone Scraper") + parser.add_argument('--promo-only', action='store_true', help="Сохранять только promo-записи (yabs.yandex.ru)") + parser.add_argument('urls', nargs='*', help="Дополнительные URL для обработки") + args = parser.parse_args() + + promo_only = args.promo_only + + try: + collected_links = collect_links() + except Exception as e: + print(f"Ошибка в collect_links: {e}. 
Продолжаем без собранных ссылок.") + collected_links = [] + + urls = load_urls(collected_links + args.urls) + + if not urls: + print("\n💡 Использование:") + print(" python script.py [--promo-only] 'https://site1.ru' 'https://site2.ru'") + return + + raw_results = await process_batch(urls) + + seen_final_domains = set() + unique_raw_results = [] + for result in raw_results: + original_url, phone, final_domain, is_promo = result + if final_domain not in seen_final_domains: + seen_final_domains.add(final_domain) + unique_raw_results.append(result) + + results = unique_raw_results + + if promo_only: + results = [r for r in results if r[3]] + + if results: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path = CONFIG["output_file"].format(timestamp=timestamp) + save_to_excel(results, output_path) + + promo_count = sum(1 for r in results if r[3]) + print(f"\n📊 ИТОГИ:") + print(f" 🔍 Обработано: {len(urls)}") + print(f" 📞 Найдено телефонов: {len(results)}") + print(f" 🎯 Из promo (yabs): {promo_count}") + print(f" 📁 Файл: {output_path}") + + print("\n✅ Готово!") + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n⚠ Прервано пользователем") + except Exception as e: + raise \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..26fcd67 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +httpx +selenium +openpyxl +ollama +aiogram \ No newline at end of file