diff --git a/bot.py b/bot.py
new file mode 100644
index 0000000..25a65b6
--- /dev/null
+++ b/bot.py
@@ -0,0 +1,537 @@
+# bot.py
+# 🚀 Авто-Скрейпер — версия БЕЗ ИИ оценки (Ollama полностью удалена)
+
+import argparse
+import logging
+from pathlib import Path
+from datetime import datetime
+import asyncio
+import tempfile
+import httpx
+import time
+
+from aiogram import Bot, Dispatcher, types, F
+from aiogram.filters import Command
+from aiogram.types import InlineKeyboardMarkup, InlineKeyboardButton, FSInputFile, CallbackQuery
+
+# Импорт только нужного
+from config import CONFIG
+from link_collector import collect_links
+from main import process_batch, save_to_excel, load_urls
+
+# Logging setup for the bot process: timestamped records at INFO level.
+logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Module-level state shared by the handlers below.
+# last_output_file: path of the most recent Excel report saved by a handler (None until one is written).
+# pending_links: URLs read from an uploaded .txt file, kept until a city is chosen (None when nothing is pending).
+last_output_file = None
+pending_links = None
+
+# ==================== CITIES ====================
+# Selectable cities. When one is picked, its "name", "lr" and "keywords" are copied into
+# CONFIG["region_name"], CONFIG["region_lr"] and CONFIG["required_keywords"].
+# NOTE(review): "lr" looks like the region id for the search URL template - confirm.
+CITIES = {
+ "Нижний Новгород": {"lr": 47, "keywords": ['Нижний Новгород', 'Новгород', 'нижний новгород'], "name": "Нижний Новгород"},
+ "Краснодар": {"lr": 35, "keywords": ['Краснодар', 'краснодар'], "name": "Краснодар"},
+ "Москва": {"lr": 213, "keywords": ['Москва', 'москва'], "name": "Москва"}
+}
+
+async def simple_progress(message: types.Message, done: int, total: int, last: list, unique_phones: set):
+    """Refresh the progress message with the counter and the phones found so far.
+
+    The message is edited at most once per second, except that the final
+    update (``done == total``) is always attempted.
+
+    Args:
+        message: Message whose text is replaced via ``edit_text``.
+        done: Number of items processed so far.
+        total: Total number of items.
+        last: One-element mutable list holding the time of the last
+            successful edit; updated in place.
+        unique_phones: Phone numbers collected so far; shown sorted so the
+            text keeps a stable order between edits.
+    """
+    now = time.time()
+
+    # Throttle: skip unless a second has passed or this is the last item.
+    if now - last[0] < 1.0 and done != total:
+        return
+
+    try:
+        phones_text = "\n".join(sorted(unique_phones)) if unique_phones else ""
+        text = f"⏳ Обработка: {done}/{total}"
+        if phones_text:
+            text += f"\n\n{phones_text}"
+        await message.edit_text(text, parse_mode="HTML")
+    except Exception as exc:
+        # Best effort: a failed edit must not abort the scraping run, but
+        # unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
+        logger.debug("Progress message update failed: %s", exc)
+    else:
+        last[0] = now  # remember when the message was last refreshed
+
+def save_config():
+    """Write the in-memory ``CONFIG`` back to ``config.py`` as Python source.
+
+    The dictionary is rendered with ``pprint``; the ``http.timeout`` entry is
+    rendered separately as an ``httpx.Timeout(...)`` expression so that the
+    generated file can be imported again.
+
+    Returns:
+        True when the file was written, False when any step failed (the
+        error is logged with its traceback).
+    """
+    import pprint
+
+    try:
+        timeout_obj = CONFIG["http"]["timeout"]
+
+        # Swap the timeout object for a placeholder while formatting. The
+        # ``finally`` guarantees the live CONFIG gets its object back even
+        # if formatting fails.
+        CONFIG["http"]["timeout"] = "PLACEHOLDER_TIMEOUT"
+        try:
+            s = pprint.pformat(CONFIG, width=100, sort_dicts=False)
+        finally:
+            CONFIG["http"]["timeout"] = timeout_obj
+
+        if isinstance(timeout_obj, httpx.Timeout):
+            # NOTE(review): only ``read`` and ``connect`` are written out;
+            # distinct write/pool values would be lost - confirm acceptable.
+            connect = getattr(timeout_obj, 'connect', 5.0)
+            read = getattr(timeout_obj, 'read', 10.0)
+            timeout_str = f"httpx.Timeout({read}, connect={connect})"
+        else:
+            timeout_str = repr(timeout_obj)
+
+        s = s.replace("'PLACEHOLDER_TIMEOUT'", timeout_str)
+
+        with open('config.py', 'w', encoding='utf-8') as f:
+            f.write(f'# config.py\nimport httpx\n# 🔧 КОНФИГУРАЦИЯ\nCONFIG = {s}')
+
+        logger.info("✅ CONFIG успешно сохранён")
+        return True
+    except Exception as e:
+        # logger.exception records the message together with the traceback.
+        logger.exception(f"❌ Ошибка сохранения config.py: {e}")
+        return False
+
+def get_search_queries_count() -> int:
+    """Return the number of non-blank lines in ``CONFIG["search_file"]``.
+
+    Returns 0 when the file cannot be read for any reason.
+    """
+    try:
+        with open(CONFIG["search_file"], encoding="utf-8") as f:
+            return sum(1 for line in f if line.strip())
+    except Exception:
+        # Best effort for a status display; ``Exception`` (not a bare
+        # ``except``) keeps KeyboardInterrupt/SystemExit propagating.
+        return 0
+
+# ==================== КЛАВИАТУРЫ ====================
+def main_menu_keyboard():
+    """Build the main menu: an upload row, then a settings / change-city row."""
+    upload_row = [
+        InlineKeyboardButton(text="📤 Загрузить файл с ссылками", callback_data="upload_links"),
+    ]
+    options_row = [
+        InlineKeyboardButton(text="⚙️ Настройки", callback_data="settings_menu"),
+        InlineKeyboardButton(text="🌆 Сменить город", callback_data="city_menu"),
+    ]
+    return InlineKeyboardMarkup(inline_keyboard=[upload_row, options_row])
+
+def back_to_menu_keyboard():
+    """Build a one-button keyboard that leads back to the main menu."""
+    back_button = InlineKeyboardButton(text="⬅️ Вернуться в меню", callback_data="main_menu")
+    return InlineKeyboardMarkup(inline_keyboard=[[back_button]])
+
+def cities_keyboard():
+    """Build a keyboard with one row per entry in ``CITIES`` plus a back row."""
+    rows = []
+    for city in CITIES:
+        button = InlineKeyboardButton(text=f"🌆 {city}", callback_data=f"set_city_{city}")
+        rows.append([button])
+    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
+    return InlineKeyboardMarkup(inline_keyboard=rows)
+
+def search_menu_keyboard():
+    """Build the search.txt menu: show, edit and back, one button per row."""
+    entries = (
+        ("📖 Показать search.txt", "show_search"),
+        ("✏️ Изменить search.txt", "edit_search"),
+        ("⬅️ Назад", "main_menu"),
+    )
+    rows = [[InlineKeyboardButton(text=label, callback_data=action)] for label, action in entries]
+    return InlineKeyboardMarkup(inline_keyboard=rows)
+
+def output_format_keyboard():
+    """Build a keyboard with one row per output format plus a back row.
+
+    Each format button carries ``set_format_<key>`` as its callback data.
+    """
+    choices = (
+        ("phones", "📞 Только номера"),
+        ("domains", "🌐 Только домены"),
+        ("both", "📞+🌐 Номера и домены"),
+        ("excel", "📊 Excel файл"),
+    )
+    rows = []
+    for key, label in choices:
+        rows.append([InlineKeyboardButton(text=label, callback_data=f"set_format_{key}")])
+    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
+    return InlineKeyboardMarkup(inline_keyboard=rows)
+
+def settings_keyboard():
+    """Build the settings menu: an output-format row and a back row."""
+    format_row = [InlineKeyboardButton(text="📤 Формат вывода", callback_data="output_format_menu")]
+    back_row = [InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")]
+    return InlineKeyboardMarkup(inline_keyboard=[format_row, back_row])
+
+# ==================== НАСТРОЙКИ ВЫВОДА ====================
+
+async def settings_menu_callback(query: CallbackQuery):
+    """Show the settings screen with the currently configured output format."""
+    icons = {"phones": "📞", "domains": "🌐", "both": "📞+🌐", "excel": "📊"}
+    fmt = CONFIG.get("output_format", "excel")
+    icon = icons.get(fmt, '📊')  # unknown formats fall back to the Excel icon
+    body = f"⚙️ Настройки\n\n📤 Текущий формат: {icon} {fmt}"
+
+    await query.message.edit_text(body, reply_markup=settings_keyboard(), parse_mode="HTML")
+    await query.answer()
+
+async def output_format_menu_callback(query: CallbackQuery):
+    """Replace the current message with the output-format chooser."""
+    await query.message.edit_text(
+        "📤 Выберите формат вывода:",
+        reply_markup=output_format_keyboard(),
+        parse_mode="HTML",
+    )
+    await query.answer()
+
+async def set_output_format_callback(query: CallbackQuery):
+    """Store the format named in the callback data and return to the settings screen.
+
+    Callback data has the form ``set_format_<key>``. An unknown key is
+    answered with an alert and nothing is changed.
+    """
+    labels = {
+        "phones": "📞 Только номера",
+        "domains": "🌐 Только домены",
+        "both": "📞+🌐 Номера и домены",
+        "excel": "📊 Excel файл",
+    }
+    fmt = query.data.replace("set_format_", "")
+
+    if fmt not in labels:
+        await query.answer("❌ Неверный формат", show_alert=True)
+        return
+
+    CONFIG["output_format"] = fmt
+    save_config()
+    await query.answer(f"✅ Формат: {labels[fmt]}")
+    await settings_menu_callback(query)
+
+# ==================== ФОРМАТИРОВАНИЕ РЕЗУЛЬТАТОВ ====================
+
+def format_results(results: list, fmt: str) -> str:
+    """Render result rows as newline-separated text for the chosen format.
+
+    Args:
+        results: Rows shaped like ``(org, phone, domain, promo, rating)``;
+            only index 1 (phone) and index 2 (domain) are read.
+        fmt: ``'phones'``, ``'domains'`` or ``'both'``. Any other value
+            (including ``'excel'``) yields an empty string.
+
+    Returns:
+        One line per row that has the requested data.
+    """
+    if fmt == "phones":
+        lines = [row[1] for row in results if row[1]]
+    elif fmt == "domains":
+        lines = [row[2] for row in results if row[2]]
+    elif fmt == "both":
+        lines = [f"{row[1]} — {row[2]}" for row in results if row[1] or row[2]]
+    else:
+        # Excel output has no text form.
+        return ""
+    return "\n".join(lines)
+
+# ==================== ХЕНДЛЕРЫ ====================
+async def start_handler(message: types.Message):
+    """Send the greeting together with the main-menu keyboard."""
+    greeting = "👋 Авто-Скрейпер\n\nГотов к работе!\nВыбери действие ниже:"
+    keyboard = main_menu_keyboard()
+    await message.answer(greeting, reply_markup=keyboard, parse_mode="HTML")
+
+async def menu_callback(query: CallbackQuery):
+    """Replace the current message with the main menu and acknowledge the tap."""
+    keyboard = main_menu_keyboard()
+    await query.message.edit_text("👋 Главное меню", reply_markup=keyboard, parse_mode="HTML")
+    await query.answer()
+
+async def status_handler(query: CallbackQuery):
+    """Replace the current message with a status summary.
+
+    The summary shows the configured city, the number of queries in the
+    search file and the last saved result file (a dash when there is none).
+    """
+    lines = [
+        "📊 Статус скрейпера",
+        "",
+        f"🌆 Город: {CONFIG['region_name']}",
+        f"🔎 Запросов в search.txt: {get_search_queries_count()}",
+        f"📁 Последний результат: {last_output_file or '—'}",
+        "",
+        "✅ Работает без ИИ-оценки",
+    ]
+    status_text = "\n".join(lines)
+    await query.message.edit_text(status_text, reply_markup=back_to_menu_keyboard(), parse_mode="HTML")
+    await query.answer()
+
+# ==================== СМЕНА ГОРОДА ====================
+async def city_menu_callback(query: CallbackQuery):
+    """Replace the current message with the city chooser."""
+    keyboard = cities_keyboard()
+    await query.message.edit_text("🌆 Выберите город:", reply_markup=keyboard, parse_mode="HTML")
+    await query.answer()
+
+async def set_city_callback(query: CallbackQuery):
+    """Apply the chosen city to ``CONFIG`` and process any pending uploaded links.
+
+    Callback data has the form ``set_city_<name>``. The city's name, ``lr``
+    and keywords are copied into ``CONFIG`` and saved. If links from an
+    uploaded file are pending, they are processed right away and the result
+    is sent in the configured output format; otherwise the status screen is
+    shown.
+
+    Side effects: clears ``pending_links`` after processing and updates
+    ``last_output_file`` when an Excel report is written.
+    """
+    global pending_links, last_output_file
+
+    city_name = query.data.replace("set_city_", "")
+    city = CITIES.get(city_name)
+    if city is None:
+        # Callback data naming a city that is not in CITIES used to raise
+        # KeyError; answer with an alert instead.
+        await query.answer("❌ Неизвестный город", show_alert=True)
+        return
+
+    CONFIG["region_name"] = city["name"]
+    CONFIG["region_lr"] = city["lr"]
+    CONFIG["required_keywords"] = city["keywords"]
+    save_config()
+
+    if not pending_links:
+        await query.answer(f"✅ Город изменён на {city_name}")
+        await status_handler(query)
+        return
+
+    await query.answer(f"✅ Город изменён на {city_name}. Начинаю обработку ссылок...")
+    await query.message.edit_text("⏳ Обрабатываю ссылки из файла...", parse_mode="HTML")
+
+    try:
+        processed = load_urls(pending_links)
+
+        last = [time.time()]   # last progress-edit time, shared with simple_progress
+        unique_phones = set()  # filled incrementally by process_batch
+
+        async def progress_cb(done: int, total: int):
+            await simple_progress(query.message, done, total, last, unique_phones)
+
+        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)
+
+        # Keep the first row per final domain and append the rating placeholder.
+        seen = set()
+        results = []
+        for original_url, phone, domain, promo in raw:
+            if not phone or domain in seen:
+                continue
+            seen.add(domain)
+            results.append((original_url, phone, domain, promo, "—"))
+
+        # Final progress refresh (unique_phones is already complete).
+        await progress_cb(len(processed), len(processed))
+
+        fmt = CONFIG.get("output_format", "excel")
+
+        if not results:
+            await query.message.answer("⚠️ Телефоны не найдены")
+        elif fmt == "excel":
+            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+            path = CONFIG["output_file"].format(timestamp=ts)
+            save_to_excel(results, path)
+            last_output_file = path
+
+            promo_cnt = sum(1 for r in results if r[3])
+            await query.message.answer(
+                f"✅ Обработка завершена!\n"
+                f"📊 Обработано URL: {len(pending_links)}\n"
+                f"📞 Найдено телефонов: {len(results)}\n"
+                f"🎯 Из них promo: {promo_cnt}\n"
+                f"📁 Формат: Excel",
+                parse_mode="HTML"
+            )
+            await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
+        else:
+            text = format_results(results, fmt)
+            if not text:
+                await query.message.answer("⚠️ Нет данных для отображения")
+            elif len(text) > 3800:
+                # Too long for a single message: send it as a .txt document.
+                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
+                    f.write(text)
+                    path = f.name
+                try:
+                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
+                    await query.message.answer_document(FSInputFile(path), caption=f"✅ {caption}")
+                finally:
+                    # Remove the temp file even when the upload fails.
+                    Path(path).unlink(missing_ok=True)
+            else:
+                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
+                await query.message.answer(
+                    f"✅ {labels.get(fmt, '')} Результаты:\n"
+                    f"📊 Обработано: {len(results)}\n\n{text}",
+                    parse_mode="HTML"
+                )
+    except Exception as e:
+        logger.error(f"❌ set_city_callback error: {e}")
+        await query.message.answer(f"❌ Ошибка: {e}")
+    finally:
+        pending_links = None
+
+    await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
+
+# ==================== search.txt ====================
+async def search_menu_callback(query: CallbackQuery):
+    """Replace the current message with the search.txt management menu."""
+    keyboard = search_menu_keyboard()
+    await query.message.edit_text("📝 Управление search.txt", reply_markup=keyboard, parse_mode="HTML")
+    await query.answer()
+
+async def show_search_callback(query: CallbackQuery):
+    """Send the contents of ``CONFIG["search_file"]`` to the chat.
+
+    Content longer than 3800 characters is sent as a ``.txt`` document,
+    shorter content as a plain message. Any error is reported to the chat.
+    """
+    try:
+        with open(CONFIG["search_file"], encoding="utf-8") as f:
+            content = f.read()
+
+        if len(content) > 3800:
+            with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
+                tmp.write(content)
+                path = tmp.name
+            try:
+                await query.message.answer_document(FSInputFile(path), caption="📝 search.txt")
+            finally:
+                # Remove the temp copy even when the upload fails.
+                Path(path).unlink(missing_ok=True)
+        else:
+            await query.message.answer(f"📝 search.txt:\n\n{content}")
+    except Exception as e:
+        await query.message.answer(f"❌ Ошибка: {e}")
+    await query.answer()
+
+async def edit_search_callback(query: CallbackQuery):
+    """Answer the callback with a prompt to send new search.txt content."""
+    prompt = "✏️ Отправьте новое содержимое search.txt в следующем сообщении"
+    await query.answer(prompt)
+
+# ==================== ОСНОВНЫЕ ДЕЙСТВИЯ ====================
+
+async def scrape_callback(query: CallbackQuery):
+    """Run the full pipeline: collect links, scrape phones, send the result.
+
+    Progress is shown by editing the callback's message. The result is sent
+    in the configured output format (Excel file, text message, or a ``.txt``
+    document when the text exceeds 3800 characters).
+
+    Side effects: updates ``last_output_file`` when an Excel report is written.
+    """
+    global last_output_file
+
+    await query.answer("🚀 Запуск скрейпинга...")
+    msg = await query.message.edit_text("⏳ Выполняю полный скрейпинг...", parse_mode="HTML")
+
+    last = [time.time()]
+    unique_phones = set()
+
+    async def progress_cb(done: int, total: int):
+        await simple_progress(msg, done, total, last, unique_phones)
+
+    try:
+        # collect_links() is a synchronous call; running it in a worker
+        # thread keeps the event loop free to serve other updates meanwhile.
+        links = await asyncio.to_thread(collect_links)
+        urls = load_urls(links)
+        if not urls:
+            await msg.answer("❌ Нет ссылок для обработки", reply_markup=main_menu_keyboard())
+            return
+
+        raw = await process_batch(urls, progress_callback=progress_cb, unique_phones=unique_phones)
+
+        # Keep the first row per final domain and append the rating placeholder.
+        seen = set()
+        results = []
+        for original_url, phone, domain, promo in raw:
+            if not phone or domain in seen:
+                continue
+            seen.add(domain)
+            results.append((original_url, phone, domain, promo, "—"))
+
+        # Final progress refresh (unique_phones is already complete).
+        await progress_cb(len(urls), len(urls))
+
+        if not results:
+            await query.message.answer("⚠️ Телефоны не найдены")
+        else:
+            fmt = CONFIG.get("output_format", "excel")
+
+            if fmt == "excel":
+                ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+                path = CONFIG["output_file"].format(timestamp=ts)
+                save_to_excel(results, path)
+                last_output_file = path
+
+                promo_cnt = sum(1 for r in results if r[3])
+                await query.message.answer(
+                    f"✅ Скрейпинг завершён!\n"
+                    f"📊 Обработано URL: {len(urls)}\n"
+                    f"📞 Найдено телефонов: {len(results)}\n"
+                    f"🎯 Из них promo: {promo_cnt}",
+                    parse_mode="HTML"
+                )
+                await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
+            else:
+                text = format_results(results, fmt)
+                if not text:
+                    await query.message.answer("⚠️ Нет данных для отображения")
+                elif len(text) > 3800:
+                    # Too long for a single message: send it as a .txt document.
+                    with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
+                        f.write(text)
+                        path = f.name
+                    try:
+                        caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
+                        await query.message.answer_document(FSInputFile(path), caption=f"✅ {caption}")
+                    finally:
+                        # Remove the temp file even when the upload fails.
+                        Path(path).unlink(missing_ok=True)
+                else:
+                    labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
+                    await query.message.answer(
+                        f"✅ {labels.get(fmt, '')} Результаты:\n\n{text}",
+                        parse_mode="HTML"
+                    )
+    except Exception as e:
+        # Log as the other handlers do, then report to the chat.
+        logger.error(f"❌ scrape_callback error: {e}")
+        await query.message.answer(f"❌ Ошибка: {e}")
+
+    await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
+
+async def process_callback(query: CallbackQuery):
+    """Acknowledge the tap and ask the user to send links as a text message."""
+    await query.answer()
+    prompt = "📋 Отправьте ссылки для обработки\n(можно несколько через пробел)"
+    await query.message.answer(prompt, parse_mode="HTML")
+
+async def manual_process(message: types.Message):
+    """Process links pasted as text and send the result.
+
+    Whitespace-separated tokens starting with ``http`` are treated as links.
+    The result is sent in the configured output format; text longer than
+    3800 characters is sent as a ``.txt`` document. The follow-up menu is
+    sent after every run, whatever the outcome.
+
+    Side effects: updates ``last_output_file`` when an Excel report is written.
+    """
+    global last_output_file
+
+    urls = [token for token in message.text.split() if token.startswith("http")]
+    if not urls:
+        await message.answer("❌ Не найдено валидных ссылок")
+        return
+
+    progress_msg = await message.answer(f"⏳ Обрабатываю {len(urls)} URL...")
+    last = [time.time()]
+    unique_phones = set()
+
+    async def progress_cb(done: int, total: int):
+        await simple_progress(progress_msg, done, total, last, unique_phones)
+
+    try:
+        processed = load_urls(urls)
+        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)
+
+        # Keep the first row per final domain and append the rating placeholder.
+        seen = set()
+        results = []
+        for original_url, phone, domain, promo in raw:
+            if not phone or domain in seen:
+                continue
+            seen.add(domain)
+            results.append((original_url, phone, domain, promo, "—"))
+
+        await progress_cb(len(processed), len(processed))
+
+        fmt = CONFIG.get("output_format", "excel")
+
+        if not results:
+            await message.answer("⚠️ Телефоны не найдены")
+        elif fmt == "excel":
+            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+            path = CONFIG["output_file"].format(timestamp=ts)
+            save_to_excel(results, path)
+            last_output_file = path
+
+            await message.answer(
+                f"✅ Готово!\n📊 Обработано: {len(results)}\n📁 Формат: Excel",
+                parse_mode="HTML"
+            )
+            await message.answer_document(FSInputFile(path), caption="📁 Результаты")
+        else:
+            text = format_results(results, fmt)
+            if not text:
+                await message.answer("⚠️ Нет данных для отображения")
+            elif len(text) > 3800:
+                # Too long for a single message: send it as a .txt document.
+                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
+                    f.write(text)
+                    path = f.name
+                try:
+                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
+                    await message.answer_document(FSInputFile(path), caption=f"✅ {caption}")
+                finally:
+                    # Remove the temp file even when the upload fails.
+                    Path(path).unlink(missing_ok=True)
+            else:
+                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
+                await message.answer(
+                    f"✅ {labels.get(fmt, '')} Результаты:\n\n{text}",
+                    parse_mode="HTML"
+                )
+    except Exception as e:
+        logger.error(f"❌ manual_process error: {e}")
+        await message.answer(f"❌ Ошибка: {e}")
+
+    # Same closing step as the other processing handlers: always offer the menu.
+    await message.answer("Что дальше?", reply_markup=main_menu_keyboard())
+
+async def upload_links_callback(query: CallbackQuery):
+    """Acknowledge the tap and ask the user to send a .txt file of links."""
+    await query.answer()
+    prompt = "📤 Отправьте файл .txt с ссылками\n(одна на строку)"
+    await query.message.answer(prompt, parse_mode="HTML")
+
+async def handle_uploaded_file(message: types.Message, bot: Bot):
+    """Read links from an uploaded ``.txt`` document and ask for a city.
+
+    Lines starting with ``http`` are stored in ``pending_links`` until a
+    city is chosen. ``CONFIG["urls"]`` and ``CONFIG["input_file"]`` are
+    cleared so that ``load_urls`` later returns only the uploaded links.
+    Any failure, including a failed download, is reported to the chat.
+    """
+    global pending_links
+    document = message.document
+    # A missing file name is treated as "not a .txt file".
+    if not (document.file_name or "").endswith('.txt'):
+        await message.answer("❌ Файл должен быть .txt")
+        return
+
+    # Reserve a temp path first so that the ``finally`` below can always
+    # clean it up, including when the download itself fails.
+    with tempfile.NamedTemporaryFile(delete=False) as tmp:
+        tmp_name = tmp.name
+
+    try:
+        file_info = await bot.get_file(document.file_id)
+        await bot.download_file(file_info.file_path, tmp_name)
+
+        with open(tmp_name, 'r', encoding='utf-8') as f:
+            content = f.read()
+        urls = [line.strip() for line in content.splitlines() if line.strip().startswith('http')]
+
+        if not urls:
+            await message.answer("❌ Нет валидных ссылок в файле")
+            return
+
+        # Clear config sources so load_urls does not add anything extra.
+        CONFIG["urls"] = []
+        CONFIG["input_file"] = ""
+
+        pending_links = urls
+        await message.answer(f"✅ Файл загружен! Найдено {len(urls)} ссылок.\nТеперь выберите город:", parse_mode="HTML")
+        await message.answer("🌆 Выберите город:", reply_markup=cities_keyboard(), parse_mode="HTML")
+    except Exception as e:
+        await message.answer(f"❌ Ошибка: {e}")
+    finally:
+        Path(tmp_name).unlink(missing_ok=True)
+
+# ==================== ЗАПУСК ====================
+def main_bot(token: str):
+    """Create the bot, register all handlers and start polling.
+
+    Handlers are registered in the listed order.
+    """
+    bot = Bot(token=token)
+    dp = Dispatcher()
+
+    message_routes = (
+        (start_handler, Command("start")),
+        (manual_process, F.text & ~F.text.startswith("/")),
+        (handle_uploaded_file, F.document),
+    )
+    for handler, route_filter in message_routes:
+        dp.message.register(handler, route_filter)
+
+    callback_routes = (
+        (menu_callback, F.data == "main_menu"),
+        (status_handler, F.data == "status"),
+        (scrape_callback, F.data == "scrape"),
+        (process_callback, F.data == "process"),
+        (upload_links_callback, F.data == "upload_links"),
+        (search_menu_callback, F.data == "search_menu"),
+        (show_search_callback, F.data == "show_search"),
+        (edit_search_callback, F.data == "edit_search"),
+        (city_menu_callback, F.data == "city_menu"),
+        (set_city_callback, F.data.startswith("set_city_")),
+        (settings_menu_callback, F.data == "settings_menu"),
+        (output_format_menu_callback, F.data == "output_format_menu"),
+        (set_output_format_callback, F.data.startswith("set_format_")),
+    )
+    for handler, route_filter in callback_routes:
+        dp.callback_query.register(handler, route_filter)
+
+    asyncio.run(dp.start_polling(bot))
+
+if __name__ == '__main__':
+    # Command-line entry point: the bot token is a required option.
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--token', required=True)
+    main_bot(cli.parse_args().token)
\ No newline at end of file
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..02ef642
--- /dev/null
+++ b/config.py
@@ -0,0 +1,43 @@
+# config.py
+import httpx
+# 🔧 CONFIGURATION
+# Settings dictionary imported by bot.py and main.py.
+# NOTE: save_config() in bot.py rewrites this whole file from the in-memory
+# CONFIG, so hand-written comments here do not survive a save from the bot.
+# output_file, log_file and profile_dir are absolute Windows paths.
+CONFIG = {'input_file': '',
+ 'output_file': 'C:\\Coding\\auto-scraper\\output\\phones_{timestamp}.xlsx',
+ 'log_file': 'C:\\Coding\\auto-scraper\\logs\\scraper.log',
+ 'excluded_domains': {'auto.drom.ru',
+ 'auto.ru',
+ 'autocompass-j.ru',
+ 'autocompass-v.ru',
+ 'avito.ru',
+ 'duckduckgo.com',
+ 'google.com',
+ 'sberauto.com',
+ 'sberleasing.ru'},
+ 'urls': [],
+ 'output_format': 'both',
+ 'http': {'timeout': httpx.Timeout(10.0, connect=5.0),
+ 'max_redirects': 5,
+ 'retry_attempts': 3,
+ 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like '
+ 'Gecko) Chrome/120.0.0.0 Safari/537.36',
+ 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+ 'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
+ 'Referer': 'https://yandex.ru/',
+ 'Sec-Fetch-Dest': 'document',
+ 'Sec-Fetch-Mode': 'navigate'}},
+ 'phone': {'patterns': ['href=["\\\']tel:([^"\\\']+)["\\\']',
+ 'tel["\\\']?\\s*[:=]\\s*["\\\']?([+()0-9\\-\\s]{10,})["\\\']?',
+ '(?:телефон|phone|контакт)["\\\']?\\s*[:=]?\\s*["\\\']?([+()0-9\\-\\s]{10,})'],
+ 'country_code': '7',
+ 'min_digits': 10,
+ 'max_digits': 12},
+ 'required_keywords': ['Краснодар', 'краснодар'],
+ 'stop_keywords': ['аренда', 'АРЕНДА', 'Аренда', '2311373680', 'autocompass'],
+ 'headless': False,
+ 'search_pages': 3,
+ 'workers': 3,
+ 'search_template_url': 'https://ya.ru/search/?text={search}&lr={lr}',
+ 'region_lr': 35,
+ 'region_name': 'Краснодар',
+ 'search_file': 'search.txt',
+ 'profile_dir': 'C:\\Users\\Дмитрий\\chrome_profile_yandex'}
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2923569
--- /dev/null
+++ b/main.py
@@ -0,0 +1,419 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+📞 DuckDuckGo/Yandex → Phone Number Scraper v2.2
+Извлекает ссылки → следует редиректам (включая yabs) → парсит телефоны → сохраняет в Excel
++ Исправления: PoolTimeout, экспоненциальный бэк-офф, адаптивные лимиты, рандомизация
+"""
+
+import re
+import sys
+import asyncio
+import random
+import httpx
+from pathlib import Path
+from urllib.parse import urlparse
+from openpyxl import Workbook
+from openpyxl.styles import Font, Alignment, PatternFill
+from datetime import datetime
+import argparse
+import logging
+
+# === Специфичные исключения httpx ===
+from httpx import PoolTimeout, ConnectTimeout, ReadTimeout, HTTPStatusError, RequestError
+
+from config import CONFIG
+from link_collector import collect_links
+
+# Suppress httpx info logs: only WARNING and above are emitted.
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
+# Phone regexes from CONFIG["phone"]["patterns"], compiled once at import time
+# (case-insensitive). Each pattern's group 1 is read as the raw phone text.
+TEL_PATTERNS = [re.compile(p, re.IGNORECASE) for p in CONFIG["phone"]["patterns"]]
+
+# Pool of User-Agent strings; one is picked at random per client and per request.
+USER_AGENTS = [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
+]
+
+
+def normalize_domain(url: str) -> str:
+    """Return the lower-cased host of *url* without a leading ``www.``.
+
+    Used as the key for duplicate detection. Returns ``''`` when the URL has
+    no host or cannot be parsed.
+    """
+    try:
+        domain = urlparse(url.strip()).hostname or ''
+    except Exception:
+        return ''
+    # Strip only a *leading* "www.": str.replace would also cut the first
+    # "www." found inside the host (e.g. "awww.example.com").
+    return domain.lower().removeprefix('www.')
+
+
+def is_excluded(domain: str) -> bool:
+    """Tell whether *domain* is listed in ``CONFIG["excluded_domains"]``.
+
+    The comparison is an exact match; subdomains of a listed domain are not
+    excluded.
+    """
+    excluded = CONFIG["excluded_domains"]
+    return domain in excluded
+
+
+def normalize_phone(phone: str) -> str | None:
+    """Normalize a phone number to ``+7 (XXX) XXX-XX-XX``.
+
+    Accepts numbers written with a ``+7`` prefix, with a leading ``8`` or
+    ``7`` (11 digits), or as 10 bare digits. Separators are ignored.
+
+    Returns:
+        The formatted number, or None when the input does not reduce to
+        exactly ten digits.
+    """
+    digits = re.sub(r"[^\d+]", "", phone.strip())
+
+    if digits.startswith('+7'):
+        digits = digits[2:]
+    elif len(digits) == 11 and digits[0] in '78':
+        digits = digits[1:]
+
+    # The cleanup above keeps '+' characters; any that remain here are not
+    # part of a +7 prefix and must not leak into the formatted result.
+    if len(digits) != 10 or not digits.isdigit():
+        return None
+
+    return f"+7 ({digits[:3]}) {digits[3:6]}-{digits[6:8]}-{digits[8:10]}"
+
+
+def extract_phone_from_html(html: str) -> str | None:
+    """Return the first phone number in *html* that normalizes successfully.
+
+    Patterns are tried in ``TEL_PATTERNS`` order. Every match of a pattern
+    is considered, so an unusable first match does not hide a valid number
+    further down the page.
+
+    Returns:
+        The normalized phone, or None when no match normalizes.
+    """
+    for pattern in TEL_PATTERNS:
+        for match in pattern.finditer(html):
+            normalized = normalize_phone(match.group(1).strip())
+            if normalized:
+                return normalized
+    return None
+
+
+def check_content_filters(html: str) -> bool:
+    """Check *html* against the configured keyword filters, ignoring case.
+
+    Returns:
+        False when ``CONFIG["required_keywords"]`` is non-empty and none of
+        them occurs in the page, or when any of ``CONFIG["stop_keywords"]``
+        occurs; True otherwise.
+    """
+    haystack = html.lower()
+
+    required = CONFIG["required_keywords"]
+    if required and all(kw.lower() not in haystack for kw in required):
+        return False
+
+    stop = CONFIG["stop_keywords"]
+    if stop:
+        for kw in stop:
+            if kw.lower() in haystack:
+                return False
+
+    return True
+
+
+def analyze_redirect_chain(url: str, final_url: str) -> tuple[str, bool]:
+    """Derive the final domain and the promo flag from a followed redirect.
+
+    Args:
+        url: The URL that was requested.
+        final_url: The URL the request ended up at.
+
+    Returns:
+        ``(final_domain, is_promo)``. ``final_domain`` is the lower-cased
+        host of *final_url* without a leading ``www.``; ``is_promo`` is True
+        when the requested host is exactly ``yabs.yandex.ru``. If parsing
+        fails, the normalized domain of *url* and False are returned.
+    """
+    try:
+        original_host = urlparse(url.strip()).hostname or ''
+        is_promo = original_host == 'yabs.yandex.ru'
+
+        final_host = urlparse(final_url.strip()).hostname or ''
+        # Strip only a leading "www."; str.replace would also remove one
+        # found inside the host name.
+        final_domain = final_host.lower().removeprefix('www.')
+        return final_domain, is_promo
+    except Exception:
+        return normalize_domain(url), False
+
+
+def _get_client_config(url: str) -> dict:
+    """Return HTTP client settings chosen by the host of *url*.
+
+    Hosts equal to, or subdomains of, ``yandex.ru`` and ``ya.ru`` get the
+    Yandex profile; everything else gets the default profile.
+
+    Returns:
+        A dict with ``limits``, ``timeout``, ``retry_base_delay`` and
+        ``max_retries``.
+    """
+    try:
+        host = (urlparse(url).hostname or '').lower()
+    except ValueError:
+        host = ''
+
+    # Match on the host, not the whole URL: a substring test would also hit
+    # unrelated sites such as "maya.ru" or a "?ref=ya.ru" query string.
+    is_yandex = any(host == d or host.endswith('.' + d) for d in ('yandex.ru', 'ya.ru'))
+
+    if is_yandex:
+        return {
+            "limits": httpx.Limits(max_connections=30, max_keepalive_connections=20),
+            "timeout": httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=30.0),
+            "retry_base_delay": 2.0,
+            "max_retries": 2,
+        }
+    return {
+        "limits": httpx.Limits(max_connections=20, max_keepalive_connections=10),
+        "timeout": httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0),
+        "retry_base_delay": 1.0,
+        "max_retries": CONFIG["http"]["retry_attempts"],
+    }
+
+
+async def fetch_with_retry(client: httpx.AsyncClient, url: str, retries: int = 0,
+                           base_delay: float = 1.0, max_retries: int = 3) -> tuple[str, str | None, str | None, bool]:
+    """Fetch *url*, follow redirects and extract a phone number from the page.
+
+    Retry policy:
+        * ``ReadTimeout`` - no retry.
+        * ``PoolTimeout`` - at most ``min(2, max_retries)`` retries.
+        * HTTP 429 - waits for ``Retry-After`` (plus jitter) and retries, up
+          to ``max_retries``; other HTTP errors are not retried.
+        * Any other exception - up to ``max_retries`` retries.
+      Retries other than 429 use exponential back-off with random jitter.
+
+    Args:
+        client: Shared ``httpx.AsyncClient``.
+        url: URL to request.
+        retries: Number of attempts already made (used by the recursion).
+        base_delay: Base of the exponential back-off, in seconds.
+        max_retries: Upper bound on retries.
+
+    Returns:
+        ``(url, phone, final_domain, is_promo)``. ``phone`` is None when the
+        page fails the content filters or has no phone. On failure the tuple
+        is ``(url, None, normalize_domain(url), False)``.
+    """
+    def backoff() -> float:
+        return base_delay * (2 ** retries) + random.uniform(0.5, 1.5)
+
+    delay = None  # stays None when the failure must not be retried
+
+    try:
+        # A fresh random User-Agent for every request.
+        headers = {
+            "User-Agent": random.choice(USER_AGENTS),
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Cache-Control": "max-age=0",
+        }
+
+        async with client.stream("GET", url, headers=headers, follow_redirects=True) as response:
+            if response.status_code >= 400:
+                raise HTTPStatusError(
+                    f"Status {response.status_code}",
+                    request=response.request,
+                    response=response
+                )
+
+            final_url = str(response.url)
+            buffer = []
+            async for chunk in response.aiter_text(chunk_size=8192):
+                buffer.append(chunk)
+        full_html = ''.join(buffer)
+
+        final_domain, is_promo = analyze_redirect_chain(url, final_url)
+
+        if not check_content_filters(full_html):
+            return url, None, final_domain, is_promo
+
+        phone = extract_phone_from_html(full_html)
+        return url, phone, final_domain, is_promo
+
+    except ReadTimeout:
+        pass
+
+    except PoolTimeout:
+        if retries < min(2, max_retries):
+            delay = backoff()
+
+    except HTTPStatusError as e:
+        # Bounded by max_retries: a server that keeps answering 429 must not
+        # cause endless recursion.
+        if e.response.status_code == 429 and retries < max_retries:
+            try:
+                retry_after = int(e.response.headers.get('Retry-After', '5'))
+            except ValueError:
+                # Retry-After may also be an HTTP-date; use the default wait.
+                retry_after = 5
+            delay = max(retry_after, 0) + random.randint(1, 3)
+
+    except Exception:
+        # ConnectTimeout, other RequestError subclasses and anything else.
+        if retries < max_retries:
+            delay = backoff()
+
+    if delay is None:
+        return url, None, normalize_domain(url), False
+
+    await asyncio.sleep(delay)
+    return await fetch_with_retry(client, url, retries + 1, base_delay, max_retries)
+
+
+async def process_batch(urls: list[str], batch_size: int = 50, progress_callback=None, unique_phones: set = None):
+    """Fetch *urls* in batches and collect the rows that yielded a phone.
+
+    Each batch gets its own ``httpx.AsyncClient`` configured from the batch's
+    first URL, and all URLs of a batch are fetched concurrently. A random
+    1-2 second pause separates consecutive batches.
+
+    Args:
+        urls: URLs to process.
+        batch_size: Number of URLs fetched concurrently per batch.
+        progress_callback: Optional async callable ``(done, total)``; called
+            once per processed URL and once more at the very end.
+        unique_phones: Optional set that receives every phone found.
+
+    Returns:
+        A list of ``(original_url, phone, final_domain, is_promo)`` tuples.
+    """
+    collected = []
+    total = len(urls)
+    report = progress_callback if (progress_callback and callable(progress_callback)) else None
+
+    for start in range(0, total, batch_size):
+        chunk = urls[start:start + batch_size]
+        settings = _get_client_config(chunk[0] if chunk else "")
+
+        async with httpx.AsyncClient(
+            headers={"User-Agent": random.choice(USER_AGENTS)},
+            timeout=settings["timeout"],
+            follow_redirects=True,
+            limits=settings["limits"]
+        ) as client:
+            pending = [
+                fetch_with_retry(
+                    client, target,
+                    base_delay=settings["retry_base_delay"],
+                    max_retries=settings["max_retries"]
+                )
+                for target in chunk
+            ]
+            outcomes = await asyncio.gather(*pending, return_exceptions=True)
+
+            for position, outcome in enumerate(outcomes, start=start + 1):
+                if not isinstance(outcome, Exception):
+                    original_url, phone, final_domain, is_promo = outcome
+                    print(f"{position}: {final_domain} - {phone if phone else 'нет'}")
+                    if phone:
+                        collected.append((original_url, phone, final_domain, is_promo))
+                        if unique_phones is not None:
+                            unique_phones.add(phone)
+
+                # Progress is reported for failed fetches as well.
+                if report:
+                    await report(position, total)
+
+        if start + batch_size < total:
+            await asyncio.sleep(random.uniform(1.0, 2.0))
+
+    if report:
+        await report(total, total)
+
+    return collected
+
+
+def save_to_excel(results: list[tuple], filepath: str):
+    """Write *results* to an Excel workbook at *filepath*.
+
+    Args:
+        results: Rows of ``(original_url, phone, final_domain, is_promo, rating)``.
+        filepath: Destination path; missing parent directories are created.
+
+    The sheet has a styled, frozen header row, promo rows are filled green,
+    and column widths follow the longest value, capped at 60.
+    """
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "Phone Numbers"
+
+    ws.append(["Original URL", "Phone", "Final Domain", "Promo", "Processed At", "Rating"])
+
+    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
+    header_font = Font(bold=True, color="FFFFFF")
+    for header_cell in ws[1]:
+        header_cell.fill = header_fill
+        header_cell.font = header_font
+        header_cell.alignment = Alignment(horizontal="center")
+
+    for original_url, phone, final_domain, is_promo, rating in results:
+        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        ws.append([original_url, phone, final_domain, "YES" if is_promo else "no", stamp, rating])
+
+        if not is_promo:
+            continue
+        promo_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
+        for cell in ws[ws.max_row]:
+            cell.fill = promo_fill
+
+    for column in ws.columns:
+        widest = max((len(str(cell.value)) for cell in column if cell.value), default=0)
+        ws.column_dimensions[column[0].column_letter].width = min(widest + 2, 60)
+
+    ws.freeze_panes = 'A2'
+
+    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+    wb.save(filepath)
+
+
+def load_urls(additional_urls: list[str]) -> list[str]:
+    """Gather URLs from the config and *additional_urls*, filtered and deduplicated.
+
+    Sources, in order: ``CONFIG["urls"]``, the lines of
+    ``CONFIG["input_file"]`` (if set and present), then *additional_urls*.
+    Only entries starting with ``http`` are kept; entries without a host or
+    on an excluded domain are dropped. Only the first URL per domain is
+    kept, except that every ``yabs.yandex.ru`` URL is kept.
+    """
+    candidates = list(CONFIG["urls"])
+
+    input_file = CONFIG["input_file"]
+    if input_file and Path(input_file).exists():
+        try:
+            with open(input_file, 'r', encoding='utf-8') as f:
+                for raw_line in f:
+                    entry = raw_line.strip().strip('"\'').rstrip(',')
+                    if entry and entry.startswith('http'):
+                        candidates.append(entry.split()[0])
+        except Exception:
+            pass
+
+    candidates.extend(additional_urls)
+
+    seen_domains = set()
+    cleaned = []
+
+    for candidate in candidates:
+        candidate = candidate.strip()
+        if not candidate.startswith('http'):
+            continue
+        domain = normalize_domain(candidate)
+        if not domain or is_excluded(domain):
+            continue
+        if domain != 'yabs.yandex.ru':
+            # Ordinary domains: only the first URL is kept.
+            if domain in seen_domains:
+                continue
+            seen_domains.add(domain)
+        cleaned.append(candidate)
+
+    return cleaned
+
+
+async def main():
+    """Command-line entry point: collect links, scrape phones, save to Excel.
+
+    Options:
+        --promo-only: keep only promo rows (requested via yabs.yandex.ru).
+        urls: extra URLs to process in addition to the collected ones.
+    """
+    parser = argparse.ArgumentParser(description="Phone Scraper")
+    parser.add_argument('--promo-only', action='store_true', help="Сохранять только promo-записи (yabs.yandex.ru)")
+    parser.add_argument('urls', nargs='*', help="Дополнительные URL для обработки")
+    args = parser.parse_args()
+
+    try:
+        collected_links = collect_links()
+    except Exception as e:
+        print(f"Ошибка в collect_links: {e}. Продолжаем без собранных ссылок.")
+        collected_links = []
+
+    urls = load_urls(collected_links + args.urls)
+
+    if not urls:
+        print("\n💡 Использование:")
+        print(" python script.py [--promo-only] 'https://site1.ru' 'https://site2.ru'")
+        return
+
+    raw_results = await process_batch(urls)
+
+    # Keep the first row per final domain. process_batch yields 4-tuples,
+    # while save_to_excel unpacks five fields per row, so the rating
+    # placeholder is appended here.
+    seen_final_domains = set()
+    results = []
+    for original_url, phone, final_domain, is_promo in raw_results:
+        if final_domain in seen_final_domains:
+            continue
+        seen_final_domains.add(final_domain)
+        results.append((original_url, phone, final_domain, is_promo, "—"))
+
+    if args.promo_only:
+        results = [r for r in results if r[3]]
+
+    if results:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        output_path = CONFIG["output_file"].format(timestamp=timestamp)
+        save_to_excel(results, output_path)
+
+        promo_count = sum(1 for r in results if r[3])
+        print(f"\n📊 ИТОГИ:")
+        print(f" 🔍 Обработано: {len(urls)}")
+        print(f" 📞 Найдено телефонов: {len(results)}")
+        print(f" 🎯 Из promo (yabs): {promo_count}")
+        print(f" 📁 Файл: {output_path}")
+
+    print("\n✅ Готово!")
+
+
+if __name__ == "__main__":
+    # Ctrl+C ends the run with a short notice; any other exception propagates.
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\n⚠ Прервано пользователем")
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..26fcd67
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+httpx
+selenium
+openpyxl
+ollama
+aiogram
\ No newline at end of file