feat: upload files

This commit is contained in:
Herzic 2026-03-10 15:59:20 +03:00
parent 4f44480bcf
commit 6325b485b2
4 changed files with 1004 additions and 0 deletions

537
bot.py Normal file
View File

@ -0,0 +1,537 @@
# bot.py
# 🚀 Авто-Скрейпер — версия БЕЗ ИИ оценки (Ollama полностью удалена)
import argparse
import logging
from pathlib import Path
from datetime import datetime
import asyncio
import tempfile
import httpx
import time
from aiogram import Bot, Dispatcher, types, F
from aiogram.filters import Command
from aiogram.types import InlineKeyboardMarkup, InlineKeyboardButton, FSInputFile, CallbackQuery
# Импорт только нужного
from config import CONFIG
from link_collector import collect_links
from main import process_batch, save_to_excel, load_urls
# Root logging configuration: timestamp, logger name, level and message at INFO.
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
# Module-level state shared by the handlers below.
# last_output_file: path of the most recent Excel file written by a handler.
# pending_links: URLs read from the last uploaded file, waiting for a city choice.
last_output_file = None
pending_links = None
# ==================== CITIES ====================
# Selectable cities. When one is chosen, "name", "lr" and "keywords" are copied
# into CONFIG["region_name"], CONFIG["region_lr"] and CONFIG["required_keywords"].
# NOTE(review): "lr" is presumably the Yandex region id used in the search URL - confirm.
CITIES = {
    "Нижний Новгород": {"lr": 47, "keywords": ['Нижний Новгород', 'Новгород', 'нижний новгород'], "name": "Нижний Новгород"},
    "Краснодар": {"lr": 35, "keywords": ['Краснодар', 'краснодар'], "name": "Краснодар"},
    "Москва": {"lr": 213, "keywords": ['Москва', 'москва'], "name": "Москва"}
}
async def simple_progress(message: types.Message, done: int, total: int, last: list, unique_phones: set):
    """Refresh the progress message with the counter and the phones found so far.

    The message is edited at most once per second; the final update
    (``done == total``) is always attempted.

    Args:
        message: Message whose text is replaced via ``edit_text``.
        done: Number of items processed so far.
        total: Total number of items.
        last: One-element mutable list holding the time of the last successful
            edit; updated in place.
        unique_phones: Phones collected so far, shown sorted under the counter.
    """
    now = time.time()
    if now - last[0] < 1.0 and done != total:
        return  # throttled

    # Sorted so the list does not reshuffle between updates.
    phones_text = "\n".join(sorted(unique_phones)) if unique_phones else ""
    text = f"⏳ <b>Обработка:</b> {done}/{total}"
    if phones_text:
        text += f"\n\n{phones_text}"

    # Same safety margin the handlers use before switching to a file: without
    # it an over-long text makes every edit fail and the progress freezes.
    limit = 3800
    if len(text) > limit:
        text = text[:limit].rsplit("\n", 1)[0] + "\n…"

    try:
        await message.edit_text(text, parse_mode="HTML")
    except Exception as exc:
        # Best effort: a failed edit (e.g. unchanged text) must not abort the
        # run. Only Exception is caught so task cancellation still propagates.
        logging.getLogger(__name__).debug("progress update skipped: %s", exc)
    else:
        last[0] = now
def save_config():
    """Write the in-memory CONFIG back to ``config.py`` as Python source.

    ``pprint`` cannot render the ``httpx.Timeout`` object as valid source, so
    it is swapped for a placeholder while formatting and the placeholder is
    then replaced with a constructor call in the generated text.

    Returns:
        True if the file was written, False if anything failed (the error is
        logged with its traceback).
    """
    import pprint

    try:
        http_cfg = CONFIG["http"]
        timeout_obj = http_cfg["timeout"]
        http_cfg["timeout"] = "PLACEHOLDER_TIMEOUT"
        try:
            rendered = pprint.pformat(CONFIG, width=100, sort_dicts=False)
        finally:
            # Always put the real object back, even if formatting fails;
            # otherwise later requests would run with a string as timeout.
            http_cfg["timeout"] = timeout_obj

        if isinstance(timeout_obj, httpx.Timeout):
            connect = getattr(timeout_obj, 'connect', 5.0)
            read = getattr(timeout_obj, 'read', 10.0)
            timeout_str = f"httpx.Timeout({read}, connect={connect})"
        else:
            timeout_str = repr(timeout_obj)
        rendered = rendered.replace("'PLACEHOLDER_TIMEOUT'", timeout_str)

        with open('config.py', 'w', encoding='utf-8') as f:
            f.write(f'# config.py\nimport httpx\n# 🔧 КОНФИГУРАЦИЯ\nCONFIG = {rendered}')
        logger.info("✅ CONFIG успешно сохранён")
        return True
    except Exception as e:
        # logger.exception records the traceback together with the message.
        logger.exception(f"❌ Ошибка сохранения config.py: {e}")
        return False
def get_search_queries_count() -> int:
    """Return the number of non-blank lines in ``CONFIG["search_file"]``.

    Returns 0 when the file cannot be read. Only ``Exception`` is caught, so
    interpreter-level signals such as ``KeyboardInterrupt`` are not swallowed.
    """
    try:
        with open(CONFIG["search_file"], encoding="utf-8") as f:
            return sum(1 for line in f if line.strip())
    except Exception:
        return 0
# ==================== КЛАВИАТУРЫ ====================
def main_menu_keyboard():
    """Build the main-menu inline keyboard: upload row, then settings and city."""
    upload_row = [
        InlineKeyboardButton(text="📤 Загрузить файл с ссылками", callback_data="upload_links"),
    ]
    options_row = [
        InlineKeyboardButton(text="⚙️ Настройки", callback_data="settings_menu"),
        InlineKeyboardButton(text="🌆 Сменить город", callback_data="city_menu"),
    ]
    return InlineKeyboardMarkup(inline_keyboard=[upload_row, options_row])
def back_to_menu_keyboard():
    """Build a one-button keyboard that returns to the main menu."""
    back_button = InlineKeyboardButton(text="⬅️ Вернуться в меню", callback_data="main_menu")
    return InlineKeyboardMarkup(inline_keyboard=[[back_button]])
def cities_keyboard():
    """Build a keyboard with one row per entry of CITIES plus a back button."""
    rows = []
    for city in CITIES:
        button = InlineKeyboardButton(text=f"🌆 {city}", callback_data=f"set_city_{city}")
        rows.append([button])
    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
    return InlineKeyboardMarkup(inline_keyboard=rows)
def search_menu_keyboard():
    """Build the search.txt management keyboard (show, edit, back)."""
    entries = (
        ("📖 Показать search.txt", "show_search"),
        ("✏️ Изменить search.txt", "edit_search"),
        ("⬅️ Назад", "main_menu"),
    )
    rows = [[InlineKeyboardButton(text=label, callback_data=action)] for label, action in entries]
    return InlineKeyboardMarkup(inline_keyboard=rows)
def output_format_keyboard():
    """Build the keyboard listing the available output formats plus a back button."""
    choices = (
        ("phones", "📞 Только номера"),
        ("domains", "🌐 Только домены"),
        ("both", "📞+🌐 Номера и домены"),
        ("excel", "📊 Excel файл"),
    )
    rows = []
    for key, label in choices:
        rows.append([InlineKeyboardButton(text=label, callback_data=f"set_format_{key}")])
    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
    return InlineKeyboardMarkup(inline_keyboard=rows)
def settings_keyboard():
    """Build the settings keyboard: output-format entry and a back button."""
    format_row = [InlineKeyboardButton(text="📤 Формат вывода", callback_data="output_format_menu")]
    back_row = [InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")]
    return InlineKeyboardMarkup(inline_keyboard=[format_row, back_row])
# ==================== НАСТРОЙКИ ВЫВОДА ====================
async def settings_menu_callback(query: CallbackQuery):
    """Show the settings screen with the currently selected output format."""
    icons = {"phones": "📞", "domains": "🌐", "both": "📞+🌐", "excel": "📊"}
    current = CONFIG.get("output_format", "excel")
    icon = icons.get(current, '📊')
    text = (
        "⚙️ <b>Настройки</b>\n\n"
        f"📤 Текущий формат: <b>{icon} {current}</b>"
    )
    await query.message.edit_text(text, reply_markup=settings_keyboard(), parse_mode="HTML")
    await query.answer()
async def output_format_menu_callback(query: CallbackQuery):
    """Replace the current message with the output-format selection menu."""
    # The unused lookup of the current format was removed; it had no effect.
    await query.message.edit_text(
        "📤 <b>Выберите формат вывода:</b>",
        reply_markup=output_format_keyboard(),
        parse_mode="HTML"
    )
    await query.answer()
async def set_output_format_callback(query: CallbackQuery):
    """Store the output format chosen from the callback data and reopen settings.

    An unknown format is rejected with an alert and nothing is changed.
    """
    labels = {
        "phones": "📞 Только номера",
        "domains": "🌐 Только домены",
        "both": "📞+🌐 Номера и домены",
        "excel": "📊 Excel файл",
    }
    fmt = query.data.replace("set_format_", "")
    if fmt not in labels:
        await query.answer("❌ Неверный формат", show_alert=True)
        return

    CONFIG["output_format"] = fmt
    save_config()
    await query.answer(f"✅ Формат: {labels[fmt]}")
    await settings_menu_callback(query)
# ==================== ФОРМАТИРОВАНИЕ РЕЗУЛЬТАТОВ ====================
def format_results(results: list, fmt: str) -> str:
    """Render results as plain text for the chosen output format.

    Args:
        results: Rows shaped ``(org, phone, domain, promo, rating)``.
        fmt: ``'phones'``, ``'domains'``, ``'both'`` or ``'excel'``.

    Returns:
        One line per row that has the requested field(s). For ``'both'`` the
        phone and domain are joined with `` - ``; a row with only one of them
        yields just that value. Returns ``""`` for ``'excel'`` and any other
        format, which have no text rendering.
    """
    if fmt == "phones":
        return "\n".join(r[1] for r in results if r[1])
    if fmt == "domains":
        return "\n".join(r[2] for r in results if r[2])
    if fmt == "both":
        # Previously the two values were glued together with no separator
        # (and a missing value was rendered as the text "None").
        return "\n".join(
            " - ".join(part for part in (r[1], r[2]) if part)
            for r in results
            if r[1] or r[2]
        )
    return ""
# ==================== ХЕНДЛЕРЫ ====================
async def start_handler(message: types.Message):
    """Answer the /start command with a greeting and the main menu."""
    greeting = "👋 <b>Авто-Скрейпер</b>\n\nГотов к работе!\nВыбери действие ниже:"
    await message.answer(greeting, reply_markup=main_menu_keyboard(), parse_mode="HTML")
async def menu_callback(query: CallbackQuery):
    """Replace the current message with the main menu and acknowledge the tap."""
    await query.message.edit_text(
        "👋 <b>Главное меню</b>",
        reply_markup=main_menu_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
async def status_handler(query: CallbackQuery):
    """Show the current city, the search-query count and the last result file."""
    region = CONFIG['region_name']
    query_count = get_search_queries_count()
    last_file = last_output_file or ''
    status_text = "".join([
        "📊 <b>Статус скрейпера</b>\n\n",
        f"🌆 Город: <b>{region}</b>\n",
        f"🔎 Запросов в search.txt: <b>{query_count}</b>\n",
        f"📁 Последний результат: <code>{last_file}</code>\n\n",
        "✅ Работает без ИИ-оценки",
    ])
    await query.message.edit_text(status_text, reply_markup=back_to_menu_keyboard(), parse_mode="HTML")
    await query.answer()
# ==================== СМЕНА ГОРОДА ====================
async def city_menu_callback(query: CallbackQuery):
    """Replace the current message with the city selection keyboard."""
    await query.message.edit_text(
        "🌆 <b>Выберите город:</b>",
        reply_markup=cities_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
async def set_city_callback(query: CallbackQuery):
    """Apply the chosen city and, if uploaded links are pending, process them.

    The city name is taken from the callback data (``set_city_<name>``) and
    its settings are copied into CONFIG and saved. With no pending links the
    status screen is shown. Otherwise the links are processed and the result
    is delivered in the configured output format: an Excel file, a text
    message, or a text file when the text is longer than 3800 characters.
    """
    global pending_links, last_output_file

    city_name = query.data.replace("set_city_", "")
    city = CITIES.get(city_name)
    if city is None:
        # Stale or malformed callback data: refuse instead of raising KeyError.
        await query.answer("❌ Неизвестный город", show_alert=True)
        return

    CONFIG["region_name"] = city["name"]
    CONFIG["region_lr"] = city["lr"]
    CONFIG["required_keywords"] = city["keywords"]
    save_config()

    if not pending_links:
        await query.answer(f"✅ Город изменён на {city_name}")
        await status_handler(query)
        return

    # Take ownership of the queued links immediately: a second tap while this
    # run is in progress must not start a duplicate run, and a file uploaded
    # meanwhile must not be wiped when this run finishes.
    links, pending_links = pending_links, None

    await query.answer(f"✅ Город изменён на {city_name}. Начинаю обработку ссылок...")
    await query.message.edit_text("⏳ <b>Обрабатываю ссылки из файла...</b>", parse_mode="HTML")
    try:
        processed = load_urls(links)
        last = [time.time()]  # throttle state for simple_progress
        unique_phones = set()  # filled incrementally by process_batch

        async def progress_cb(done: int, total: int):
            await simple_progress(query.message, done, total, last, unique_phones)

        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)
        # Keep the first row per final domain; seen.add() returns None, so
        # "not seen.add(...)" records the domain and is always true.
        seen = set()
        unique = [r for r in raw if r[1] and r[2] not in seen and not seen.add(r[2])]
        results = [(o, p, d, promo, "") for o, p, d, promo in unique]
        await progress_cb(len(processed), len(processed))

        fmt = CONFIG.get("output_format", "excel")
        if not results:
            await query.message.answer("⚠️ Телефоны не найдены")
        elif fmt == "excel":
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            path = CONFIG["output_file"].format(timestamp=ts)
            save_to_excel(results, path)
            last_output_file = path
            promo_cnt = sum(1 for r in results if r[3])
            await query.message.answer(
                f"✅ <b>Обработка завершена!</b>\n"
                f"📊 Обработано URL: {len(links)}\n"
                f"📞 Найдено телефонов: {len(results)}\n"
                f"🎯 Из них promo: {promo_cnt}\n"
                f"📁 Формат: Excel",
                parse_mode="HTML"
            )
            await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
        else:
            text = format_results(results, fmt)
            if not text:
                await query.message.answer("⚠️ Нет данных для отображения")
            elif len(text) > 3800:
                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                    tmp.write(text)
                    path = tmp.name
                caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                try:
                    await query.message.answer_document(FSInputFile(path), caption=f"{caption}")
                finally:
                    # Remove the temp file even if sending fails.
                    Path(path).unlink(missing_ok=True)
            else:
                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                await query.message.answer(
                    f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n"
                    f"📊 Обработано: {len(results)}\n\n{text}",
                    parse_mode="HTML"
                )
    except Exception as e:
        logger.exception(f"❌ set_city_callback error: {e}")
        await query.message.answer(f"❌ Ошибка: {e}")
    finally:
        await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
# ==================== search.txt ====================
async def search_menu_callback(query: CallbackQuery):
    """Replace the current message with the search.txt management menu."""
    await query.message.edit_text(
        "📝 <b>Управление search.txt</b>",
        reply_markup=search_menu_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
async def show_search_callback(query: CallbackQuery):
    """Send the contents of the search file to the chat.

    Content longer than 3800 characters is sent as a ``.txt`` document,
    shorter content as a message. Any error is reported in the chat.
    """
    import html

    try:
        with open(CONFIG["search_file"], encoding="utf-8") as src:
            content = src.read()
        if len(content) > 3800:
            with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                tmp.write(content)
                path = tmp.name
            try:
                await query.message.answer_document(FSInputFile(path), caption="📝 search.txt")
            finally:
                # Remove the temp file even if sending fails.
                Path(path).unlink(missing_ok=True)
        else:
            # The heading uses <b> markup, so HTML parse mode is required for
            # it to render; the file content is escaped so that stray "<" or
            # "&" characters cannot break the markup.
            await query.message.answer(
                f"📝 <b>search.txt</b>:\n\n{html.escape(content, quote=False)}",
                parse_mode="HTML"
            )
    except Exception as e:
        await query.message.answer(f"❌ Ошибка: {e}")
    await query.answer()
async def edit_search_callback(query: CallbackQuery):
    """Ask the user to send the new search.txt content in the next message."""
    prompt = "✏️ Отправьте новое содержимое search.txt в следующем сообщении"
    await query.answer(prompt)
# ==================== ОСНОВНЫЕ ДЕЙСТВИЯ ====================
async def scrape_callback(query: CallbackQuery):
    """Run a full scrape: collect links, process them and deliver the result.

    The result is sent as an Excel file, a text message, or a text file
    (text longer than 3800 characters), depending on
    ``CONFIG["output_format"]``. Errors are logged and reported in the chat.
    """
    global last_output_file

    await query.answer("🚀 Запуск скрейпинга...")
    msg = await query.message.edit_text("⏳ <b>Выполняю полный скрейпинг...</b>", parse_mode="HTML")
    last = [time.time()]  # throttle state for simple_progress
    unique_phones = set()  # filled incrementally by process_batch

    async def progress_cb(done: int, total: int):
        await simple_progress(msg, done, total, last, unique_phones)

    try:
        # NOTE(review): collect_links() is called synchronously; if it is
        # long-running it blocks the event loop for its duration - confirm.
        links = collect_links()
        urls = load_urls(links)
        if not urls:
            await msg.answer("❌ Нет ссылок для обработки", reply_markup=main_menu_keyboard())
            return

        raw = await process_batch(urls, progress_callback=progress_cb, unique_phones=unique_phones)
        # Keep the first row per final domain; seen.add() returns None, so
        # "not seen.add(...)" records the domain and is always true.
        seen = set()
        unique = [r for r in raw if r[1] and r[2] not in seen and not seen.add(r[2])]
        results = [(o, p, d, promo, "") for o, p, d, promo in unique]
        await progress_cb(len(urls), len(urls))

        if not results:
            await query.message.answer("⚠️ Телефоны не найдены")
        else:
            fmt = CONFIG.get("output_format", "excel")
            if fmt == "excel":
                ts = datetime.now().strftime("%Y%m%d_%H%M%S")
                path = CONFIG["output_file"].format(timestamp=ts)
                save_to_excel(results, path)
                last_output_file = path
                promo_cnt = sum(1 for r in results if r[3])
                await query.message.answer(
                    f"✅ <b>Скрейпинг завершён!</b>\n"
                    f"📊 Обработано URL: {len(urls)}\n"
                    f"📞 Найдено телефонов: {len(results)}\n"
                    f"🎯 Из них promo: {promo_cnt}",
                    parse_mode="HTML"
                )
                await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
            else:
                text = format_results(results, fmt)
                if not text:
                    await query.message.answer("⚠️ Нет данных для отображения")
                elif len(text) > 3800:
                    with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                        tmp.write(text)
                        path = tmp.name
                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                    try:
                        await query.message.answer_document(FSInputFile(path), caption=f"{caption}")
                    finally:
                        # Remove the temp file even if sending fails.
                        Path(path).unlink(missing_ok=True)
                else:
                    labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                    await query.message.answer(
                        f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n\n{text}",
                        parse_mode="HTML"
                    )
    except Exception as e:
        logger.exception(f"❌ scrape_callback error: {e}")
        await query.message.answer(f"❌ Ошибка: {e}")
    await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
async def process_callback(query: CallbackQuery):
    """Acknowledge the tap and ask the user to send links for processing."""
    await query.answer()
    prompt = "📋 <b>Отправьте ссылки для обработки</b>\n(можно несколько через пробел)"
    await query.message.answer(prompt, parse_mode="HTML")
async def manual_process(message: types.Message):
    """Process links typed in a text message and reply with the result.

    Whitespace-separated tokens starting with ``http`` are treated as URLs.
    The result is sent as an Excel file, a text message, or a text file
    (text longer than 3800 characters), depending on
    ``CONFIG["output_format"]``. The main menu is offered after every
    outcome, matching the other processing handlers.
    """
    global last_output_file

    urls = [u.strip() for u in message.text.split() if u.strip().startswith("http")]
    if not urls:
        await message.answer("Не найдено валидных ссылок")
        return

    progress_msg = await message.answer(f"⏳ Обрабатываю {len(urls)} URL...")
    last = [time.time()]  # throttle state for simple_progress
    unique_phones = set()  # filled incrementally by process_batch

    async def progress_cb(done: int, total: int):
        await simple_progress(progress_msg, done, total, last, unique_phones)

    try:
        processed = load_urls(urls)
        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)
        # Keep the first row per final domain; seen.add() returns None, so
        # "not seen.add(...)" records the domain and is always true.
        seen = set()
        unique = [r for r in raw if r[1] and r[2] not in seen and not seen.add(r[2])]
        results = [(o, p, d, promo, "") for o, p, d, promo in unique]
        await progress_cb(len(processed), len(processed))

        fmt = CONFIG.get("output_format", "excel")
        if not results:
            await message.answer("⚠️ Телефоны не найдены")
        elif fmt == "excel":
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            path = CONFIG["output_file"].format(timestamp=ts)
            save_to_excel(results, path)
            last_output_file = path
            await message.answer(
                f"✅ <b>Готово!</b>\n📊 Обработано: {len(results)}\n📁 Формат: Excel",
                parse_mode="HTML"
            )
            await message.answer_document(FSInputFile(path), caption="📁 Результаты")
        else:
            text = format_results(results, fmt)
            if not text:
                await message.answer("⚠️ Нет данных для отображения")
            elif len(text) > 3800:
                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                    tmp.write(text)
                    path = tmp.name
                caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                try:
                    await message.answer_document(FSInputFile(path), caption=f"{caption}")
                finally:
                    # Remove the temp file even if sending fails.
                    Path(path).unlink(missing_ok=True)
            else:
                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                await message.answer(
                    f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n\n{text}",
                    parse_mode="HTML"
                )
    except Exception as e:
        logger.exception(f"❌ manual_process error: {e}")
        await message.answer(f"❌ Ошибка: {e}")
    await message.answer("Что дальше?", reply_markup=main_menu_keyboard())
async def upload_links_callback(query: CallbackQuery):
    """Acknowledge the tap and ask the user to send a .txt file with links."""
    await query.answer()
    prompt = "📤 <b>Отправьте файл .txt с ссылками</b>\n(одна на строку)"
    await query.message.answer(prompt, parse_mode="HTML")
async def handle_uploaded_file(message: types.Message, bot: Bot):
    """Accept an uploaded .txt file of links and queue it for processing.

    Lines starting with ``http`` are stored in the module-level
    ``pending_links``; the user is then asked to choose a city. Download and
    decoding errors are reported in the chat.
    """
    global pending_links

    document = message.document
    # file_name may be missing; compare the extension case-insensitively.
    file_name = document.file_name or ""
    if not file_name.lower().endswith('.txt'):
        await message.answer("❌ Файл должен быть .txt")
        return

    tmp_path = None
    try:
        file_info = await bot.get_file(document.file_id)
        # Reserve a temp path and close the handle before downloading into it.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp_path = tmp.name
        await bot.download_file(file_info.file_path, tmp_path)

        # utf-8-sig strips a leading BOM; with plain utf-8 the BOM stays glued
        # to the first line, which then fails the "http" prefix check.
        with open(tmp_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()
        urls = [line.strip() for line in content.splitlines() if line.strip().startswith('http')]
        if not urls:
            await message.answer("❌ Нет валидных ссылок в файле")
            return

        # Clear the configured sources so load_urls only sees the uploaded links.
        CONFIG["urls"] = []
        CONFIG["input_file"] = ""
        pending_links = urls
        await message.answer(f"✅ Файл загружен! Найдено {len(urls)} ссылок.\nТеперь выберите город:", parse_mode="HTML")
        await message.answer("🌆 <b>Выберите город:</b>", reply_markup=cities_keyboard(), parse_mode="HTML")
    except Exception as e:
        await message.answer(f"❌ Ошибка: {e}")
    finally:
        # Covers the download-failure path as well.
        if tmp_path is not None:
            Path(tmp_path).unlink(missing_ok=True)
# ==================== ЗАПУСК ====================
def main_bot(token: str):
    """Create the bot, register all handlers and run polling until stopped.

    Message handlers are registered in the original order: /start, free text
    (anything not starting with "/"), then documents.
    """
    bot = Bot(token=token)
    dp = Dispatcher()

    message_routes = [
        (start_handler, Command("start")),
        (manual_process, F.text & ~F.text.startswith("/")),
        (handle_uploaded_file, F.document),
    ]
    for handler, rule in message_routes:
        dp.message.register(handler, rule)

    callback_routes = [
        (menu_callback, F.data == "main_menu"),
        (status_handler, F.data == "status"),
        (scrape_callback, F.data == "scrape"),
        (process_callback, F.data == "process"),
        (upload_links_callback, F.data == "upload_links"),
        (search_menu_callback, F.data == "search_menu"),
        (show_search_callback, F.data == "show_search"),
        (edit_search_callback, F.data == "edit_search"),
        (city_menu_callback, F.data == "city_menu"),
        (set_city_callback, F.data.startswith("set_city_")),
        (settings_menu_callback, F.data == "settings_menu"),
        (output_format_menu_callback, F.data == "output_format_menu"),
        (set_output_format_callback, F.data.startswith("set_format_")),
    ]
    for handler, rule in callback_routes:
        dp.callback_query.register(handler, rule)

    asyncio.run(dp.start_polling(bot))
if __name__ == '__main__':
    # Command-line entry point: the bot token is a required argument.
    cli = argparse.ArgumentParser()
    cli.add_argument('--token', required=True)
    main_bot(cli.parse_args().token)

43
config.py Normal file
View File

@ -0,0 +1,43 @@
# config.py
import httpx
# 🔧 CONFIGURATION
# NOTE(review): bot.py's save_config() regenerates this file with pprint, so
# comments written here are lost on the next save.
# NOTE(review): the absolute Windows paths below are machine-specific.
CONFIG = {'input_file': '',
          # "{timestamp}" is filled in with str.format(timestamp=...) by the callers.
          'output_file': 'C:\\Coding\\auto-scraper\\output\\phones_{timestamp}.xlsx',
          'log_file': 'C:\\Coding\\auto-scraper\\logs\\scraper.log',
          # Compared against the normalized domain by exact match (main.is_excluded).
          'excluded_domains': {'auto.drom.ru',
                               'auto.ru',
                               'autocompass-j.ru',
                               'autocompass-v.ru',
                               'avito.ru',
                               'duckduckgo.com',
                               'google.com',
                               'sberauto.com',
                               'sberleasing.ru'},
          'urls': [],
          # One of 'phones', 'domains', 'both', 'excel' (set from the bot settings menu).
          'output_format': 'both',
          'http': {'timeout': httpx.Timeout(10.0, connect=5.0),
                   'max_redirects': 5,
                   'retry_attempts': 3,
                   'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like '
                                 'Gecko) Chrome/120.0.0.0 Safari/537.36',
                   'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                               'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
                               'Referer': 'https://yandex.ru/',
                               'Sec-Fetch-Dest': 'document',
                               'Sec-Fetch-Mode': 'navigate'}},
          # Regex sources compiled case-insensitively in main.py; group 1 is the raw number.
          'phone': {'patterns': ['href=["\\\']tel:([^"\\\']+)["\\\']',
                                 'tel["\\\']?\\s*[:=]\\s*["\\\']?([+()0-9\\-\\s]{10,})["\\\']?',
                                 '(?:телефон|phone|контакт)["\\\']?\\s*[:=]?\\s*["\\\']?([+()0-9\\-\\s]{10,})'],
                    'country_code': '7',
                    'min_digits': 10,
                    'max_digits': 12},
          # A page is accepted only if it contains at least one required keyword
          # and none of the stop keywords (case-insensitive, main.check_content_filters).
          'required_keywords': ['Краснодар', 'краснодар'],
          'stop_keywords': ['аренда', 'АРЕНДА', 'Аренда', '2311373680', 'autocompass'],
          'headless': False,
          'search_pages': 3,
          'workers': 3,
          'search_template_url': 'https://ya.ru/search/?text={search}&lr={lr}',
          # region_lr / region_name / required_keywords are overwritten when a
          # city is selected in the bot.
          'region_lr': 35,
          'region_name': 'Краснодар',
          'search_file': 'search.txt',
          'profile_dir': 'C:\\Users\\Дмитрий\\chrome_profile_yandex'}

419
main.py Normal file
View File

@ -0,0 +1,419 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
📞 DuckDuckGo/Yandex Phone Number Scraper v2.2
Collects links, follows redirects (including yabs), parses phone numbers, saves to Excel.
+ Fixes: PoolTimeout handling, exponential backoff, adaptive limits, randomization
"""
import re
import sys
import asyncio
import random
import httpx
from pathlib import Path
from urllib.parse import urlparse
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from datetime import datetime
import argparse
import logging
# === Специфичные исключения httpx ===
from httpx import PoolTimeout, ConnectTimeout, ReadTimeout, HTTPStatusError, RequestError
from config import CONFIG
from link_collector import collect_links
# Suppress httpx info logs
logging.getLogger("httpx").setLevel(logging.WARNING)
# Compile the phone regexes once at import time (case-insensitive).
TEL_PATTERNS = [re.compile(p, re.IGNORECASE) for p in CONFIG["phone"]["patterns"]]
# Pool of User-Agent strings; one is picked at random per client and per request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
]
def normalize_domain(url: str) -> str:
    """Return the lower-cased host of *url* without a leading ``www.``.

    Used as the key for duplicate detection. Only a leading ``www.`` is
    removed; the previous ``replace`` call removed the first ``www.`` found
    anywhere in the host, so ``awww.site.ru`` was turned into ``asite.ru``.

    Returns:
        The normalized host, or ``''`` when the URL has no host or cannot
        be parsed.
    """
    try:
        host = urlparse(url.strip()).hostname or ''
    except Exception:
        return ''
    return host.lower().removeprefix('www.')
def is_excluded(domain: str) -> bool:
    """Tell whether *domain* is listed in ``CONFIG["excluded_domains"]``.

    The comparison is an exact match; subdomains of an excluded domain are
    not excluded.
    """
    excluded = CONFIG["excluded_domains"]
    return domain in excluded
def normalize_phone(phone: str) -> str | None:
"""Нормализация телефона к формату: +7 (XXX) XXX-XX-XX"""
digits = re.sub(r"[^\d+]", "", phone.strip())
if digits.startswith('+7'):
digits = digits[2:]
elif digits.startswith('8') and len(digits) == 11:
digits = digits[1:]
elif digits.startswith('7') and len(digits) == 11:
digits = digits[1:]
if len(digits) != 10:
return None
return f"+7 ({digits[:3]}) {digits[3:6]}-{digits[6:8]}-{digits[8:10]}"
def extract_phone_from_html(html: str) -> str | None:
    """Return the first phone number in *html* that normalizes successfully.

    Patterns are tried in their configured order and every match of a
    pattern is examined. Previously only the first match of each pattern was
    checked, so a valid number was missed whenever an earlier match of the
    same pattern could not be normalized.

    Returns:
        The normalized phone number, or ``None`` if none is found.
    """
    for pattern in TEL_PATTERNS:
        for match in pattern.finditer(html):
            normalized = normalize_phone(match.group(1).strip())
            if normalized:
                return normalized
    return None
def check_content_filters(html: str) -> bool:
    """Check *html* against the configured keyword filters, ignoring case.

    Returns:
        False if required keywords are configured and none occurs in the
        page, or if any configured stop keyword occurs; True otherwise.
    """
    haystack = html.lower()

    required = CONFIG["required_keywords"]
    if required:
        wanted = [kw.lower() for kw in required]
        if not any(kw in haystack for kw in wanted):
            return False

    stop_words = CONFIG["stop_keywords"]
    if stop_words:
        banned = [kw.lower() for kw in stop_words]
        if any(kw in haystack for kw in banned):
            return False

    return True
def analyze_redirect_chain(url: str, final_url: str) -> tuple[str, bool]:
    """Derive the final domain and the promo flag from a followed redirect.

    Args:
        url: The URL that was originally requested.
        final_url: The URL the request ended up at.

    Returns:
        ``(final_domain, is_promo)``. ``final_domain`` is the lower-cased
        host of *final_url* without a leading ``www.``; ``is_promo`` is True
        when the original host is ``yabs.yandex.ru``. If parsing fails, the
        normalized domain of *url* and ``False`` are returned.
    """
    try:
        original_host = urlparse(url.strip()).hostname or ''
        is_promo = original_host == 'yabs.yandex.ru'
        final_host = urlparse(final_url.strip()).hostname or ''
        # Strip only a leading "www."; replace() used to remove the first
        # "www." found anywhere in the host.
        return final_host.lower().removeprefix('www.'), is_promo
    except Exception:
        return normalize_domain(url), False
def _get_client_config(url: str) -> dict:
    """Return HTTP client settings for *url*, gentler for Yandex hosts.

    The decision is based on the URL's host (``yandex.ru``, ``ya.ru`` and
    their subdomains). The previous substring test on the whole URL also
    matched unrelated hosts such as ``playa.ru`` and any URL that merely
    mentioned ``yandex.ru`` in its path or query string.

    Returns:
        A dict with ``limits``, ``timeout``, ``retry_base_delay`` and
        ``max_retries``.
    """
    try:
        host = (urlparse(url.strip()).hostname or '').lower()
    except ValueError:
        host = ''  # unparsable URL: use the default profile
    yandex_roots = ('yandex.ru', 'ya.ru')
    is_yandex = any(host == root or host.endswith('.' + root) for root in yandex_roots)

    if is_yandex:
        return {
            "limits": httpx.Limits(max_connections=30, max_keepalive_connections=20),
            "timeout": httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=30.0),
            "retry_base_delay": 2.0,
            "max_retries": 2,
        }
    return {
        "limits": httpx.Limits(max_connections=20, max_keepalive_connections=10),
        "timeout": httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0),
        "retry_base_delay": 1.0,
        "max_retries": CONFIG["http"]["retry_attempts"],
    }
# Upper bound for a server-supplied Retry-After, so one response cannot
# stall the whole batch.
_MAX_RETRY_AFTER_SECONDS = 60


def _backoff_delay(base_delay: float, retries: int) -> float:
    """Return the exponential backoff delay with jitter for attempt *retries*."""
    return base_delay * (2 ** retries) + random.uniform(0.5, 1.5)


def _retry_after_seconds(value: str | None, default: int = 5) -> int:
    """Parse a Retry-After header given in seconds, capped; *default* if unusable."""
    try:
        seconds = int(value)
    except (TypeError, ValueError):
        # Missing header or the HTTP-date form of Retry-After.
        return default
    return min(max(seconds, 0), _MAX_RETRY_AFTER_SECONDS)


async def fetch_with_retry(client: httpx.AsyncClient, url: str, retries: int = 0,
                           base_delay: float = 1.0, max_retries: int = 3) -> tuple[str, str | None, str | None, bool]:
    """Fetch *url*, follow redirects and extract a phone number from the page.

    Transient failures are retried with exponential backoff plus jitter:
    pool timeouts at most twice, connect timeouts and other request errors up
    to *max_retries* times. Read timeouts and HTTP errors other than 429 are
    not retried. A 429 response is retried after the server's ``Retry-After``
    delay, now also limited by *max_retries* (it used to retry without limit)
    and tolerant of a non-numeric header value.

    Args:
        client: Shared ``httpx.AsyncClient`` used for the request.
        url: URL to fetch.
        retries: Number of attempts already made (used by the recursion).
        base_delay: Base of the exponential backoff, in seconds.
        max_retries: Maximum number of retries.

    Returns:
        ``(url, phone, final_domain, is_promo)``. ``phone`` is ``None`` when
        the page fails the content filters, has no phone, or the fetch failed.
    """
    async def _retry():
        return await fetch_with_retry(client, url, retries + 1, base_delay, max_retries)

    try:
        # A fresh random User-Agent for every request.
        # No explicit Accept-Encoding: httpx advertises the encodings it can
        # decode itself. Advertising "br" without a brotli decoder installed
        # (none is listed in requirements.txt) risks undecodable bodies.
        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Cache-Control": "max-age=0",
        }
        async with client.stream("GET", url, headers=headers, follow_redirects=True) as response:
            if response.status_code >= 400:
                raise HTTPStatusError(
                    f"Status {response.status_code}",
                    request=response.request,
                    response=response
                )
            final_url = str(response.url)
            chunks = []
            async for chunk in response.aiter_text(chunk_size=8192):
                chunks.append(chunk)
            full_html = ''.join(chunks)
            final_domain, is_promo = analyze_redirect_chain(url, final_url)
            if not check_content_filters(full_html):
                return url, None, final_domain, is_promo
            phone = extract_phone_from_html(full_html)
            return url, phone, final_domain, is_promo
    except PoolTimeout:
        if retries < min(2, max_retries):
            await asyncio.sleep(_backoff_delay(base_delay, retries))
            return await _retry()
    except ConnectTimeout:
        if retries < max_retries:
            await asyncio.sleep(_backoff_delay(base_delay, retries))
            return await _retry()
    except ReadTimeout:
        pass  # not retried
    except HTTPStatusError as e:
        if e.response.status_code == 429 and retries < max_retries:
            wait = _retry_after_seconds(e.response.headers.get('Retry-After'))
            await asyncio.sleep(wait + random.randint(1, 3))
            return await _retry()
    except RequestError:
        if retries < max_retries:
            await asyncio.sleep(_backoff_delay(base_delay, retries))
            return await _retry()
    except Exception:
        if retries < max_retries:
            await asyncio.sleep(_backoff_delay(base_delay, retries))
            return await _retry()
    # All failure paths end here: no phone, domain taken from the original URL.
    return url, None, normalize_domain(url), False
async def process_batch(urls: list[str], batch_size: int = 50, progress_callback=None, unique_phones: set = None):
    """Fetch *urls* in batches and collect the rows that yielded a phone.

    Each batch gets its own ``httpx.AsyncClient`` configured from the batch's
    first URL, and its requests run concurrently. A random 1-2 second pause
    is inserted between batches.

    :param urls: URLs to process.
    :param batch_size: Number of URLs fetched concurrently per batch.
    :param progress_callback: async function ``(done, total)`` called after
        each processed result and once more at the end.
    :param unique_phones: optional set that receives every phone found.
    :return: list of ``(original_url, phone, final_domain, is_promo)``.
    """
    total_urls = len(urls)
    found = []

    async def _report(done: int) -> None:
        if progress_callback and callable(progress_callback):
            await progress_callback(done, total_urls)

    for start in range(0, total_urls, batch_size):
        batch = urls[start:start + batch_size]
        sample_url = batch[0] if batch else ""
        settings = _get_client_config(sample_url)

        async with httpx.AsyncClient(
            headers={"User-Agent": random.choice(USER_AGENTS)},
            timeout=settings["timeout"],
            follow_redirects=True,
            limits=settings["limits"]
        ) as client:
            pending = [
                fetch_with_retry(
                    client, url,
                    base_delay=settings["retry_base_delay"],
                    max_retries=settings["max_retries"]
                )
                for url in batch
            ]
            outcomes = await asyncio.gather(*pending, return_exceptions=True)

        for position, outcome in enumerate(outcomes, start=start + 1):
            if isinstance(outcome, Exception):
                # A failed task still counts towards progress.
                await _report(position)
                continue
            original_url, phone, final_domain, is_promo = outcome
            print(f"{position}: {final_domain} - {phone if phone else 'нет'}")
            if phone:
                found.append((original_url, phone, final_domain, is_promo))
                if unique_phones is not None:
                    unique_phones.add(phone)
            await _report(position)

        if start + batch_size < total_urls:
            await asyncio.sleep(random.uniform(1.0, 2.0))

    await _report(total_urls)
    return found
def save_to_excel(results: list[tuple], filepath: str):
    """Write the results to an Excel workbook at *filepath*.

    Args:
        results: Rows shaped ``(original_url, phone, final_domain, is_promo)``
            with an optional fifth ``rating`` field. Rows without a rating
            (as produced by the command-line entry point) get an empty one
            instead of failing to unpack.
        filepath: Destination path; missing parent directories are created.

    Promo rows are highlighted and column widths are fitted to their content
    (capped at 60).
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Phone Numbers"
    headers = ["Original URL", "Phone", "Final Domain", "Promo", "Processed At", "Rating"]
    ws.append(headers)

    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF")
    for cell in ws[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal="center")

    promo_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    for row in results:
        original_url, phone, final_domain, is_promo, *extra = row
        rating = extra[0] if extra else ""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        promo_mark = "YES" if is_promo else "no"
        ws.append([original_url, phone, final_domain, promo_mark, timestamp, rating])
        if is_promo:
            for cell in ws[ws.max_row]:
                cell.fill = promo_fill

    for column in ws.columns:
        max_len = 0
        for cell in column:
            if cell.value:
                max_len = max(max_len, len(str(cell.value)))
        ws.column_dimensions[column[0].column_letter].width = min(max_len + 2, 60)

    ws.freeze_panes = 'A2'  # keep the header row visible while scrolling
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    wb.save(filepath)
def load_urls(additional_urls: list[str]) -> list[str]:
    """Merge URLs from the configuration with *additional_urls* and filter them.

    Sources, in order: ``CONFIG["urls"]``, the lines of
    ``CONFIG["input_file"]`` (if set and present), then *additional_urls*.
    Entries that do not start with ``http``, have no host, or belong to an
    excluded domain are dropped. Only the first URL per domain is kept,
    except ``yabs.yandex.ru`` links, which are all kept.

    A failure to read the input file is logged and the remaining sources are
    still used.
    """
    urls = []
    urls.extend(CONFIG["urls"])

    input_file = CONFIG["input_file"]
    if input_file and Path(input_file).exists():
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                for line in f:
                    # Tolerate entries pasted from a Python/JSON list.
                    line = line.strip().strip('"\'').rstrip(',')
                    if line and line.startswith('http'):
                        urls.append(line.split()[0])
        except Exception as exc:
            # Best effort, but no longer silent.
            logging.getLogger(__name__).warning("Could not read input file %s: %s", input_file, exc)

    urls.extend(additional_urls)

    seen_domains = set()
    cleaned = []
    for url in urls:
        url = url.strip()
        if not url or not url.startswith('http'):
            continue
        domain = normalize_domain(url)
        if not domain or is_excluded(domain):
            continue
        if domain == 'yabs.yandex.ru':
            # Promo redirect links share one host but lead to different sites.
            cleaned.append(url)
            continue
        if domain in seen_domains:
            continue
        seen_domains.add(domain)
        cleaned.append(url)
    return cleaned
async def main():
    """Command-line entry point: collect links, scrape phones, save to Excel.

    Options:
        --promo-only: keep only rows that came from yabs.yandex.ru links.
        urls: additional URLs to process.
    """
    parser = argparse.ArgumentParser(description="Phone Scraper")
    parser.add_argument('--promo-only', action='store_true', help="Сохранять только promo-записи (yabs.yandex.ru)")
    parser.add_argument('urls', nargs='*', help="Дополнительные URL для обработки")
    args = parser.parse_args()
    promo_only = args.promo_only

    try:
        collected_links = collect_links()
    except Exception as e:
        print(f"Ошибка в collect_links: {e}. Продолжаем без собранных ссылок.")
        collected_links = []

    urls = load_urls(collected_links + args.urls)
    if not urls:
        print("\n💡 Использование:")
        print(" python script.py [--promo-only] 'https://site1.ru' 'https://site2.ru'")
        return

    raw_results = await process_batch(urls)

    # Keep the first row per final domain.
    seen_final_domains = set()
    results = []
    for result in raw_results:
        final_domain = result[2]
        if final_domain not in seen_final_domains:
            seen_final_domains.add(final_domain)
            results.append(result)

    if promo_only:
        results = [r for r in results if r[3]]

    if results:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = CONFIG["output_file"].format(timestamp=timestamp)
        # save_to_excel unpacks five fields (the last is the rating), while
        # process_batch yields four: add an empty rating.
        save_to_excel([(*r, "") for r in results], output_path)
        promo_count = sum(1 for r in results if r[3])
        # The summary refers to output_path, so it is printed only when a
        # file was actually written.
        print(f"\n📊 ИТОГИ:")
        print(f" 🔍 Обработано: {len(urls)}")
        print(f" 📞 Найдено телефонов: {len(results)}")
        print(f" 🎯 Из promo (yabs): {promo_count}")
        print(f" 📁 Файл: {output_path}")
    print("\n✅ Готово!")
if __name__ == "__main__":
    # Ctrl+C ends the run with a short notice; any other exception propagates
    # unchanged.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n⚠ Прервано пользователем")

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
httpx
selenium
openpyxl
# NOTE(review): bot.py states that Ollama was fully removed and neither bot.py
# nor main.py imports it - confirm whether this dependency is still needed.
ollama
aiogram