feat: upload files
This commit is contained in:
parent
4f44480bcf
commit
6325b485b2
537
bot.py
Normal file
537
bot.py
Normal file
@ -0,0 +1,537 @@
|
|||||||
|
# bot.py
|
||||||
|
# 🚀 Авто-Скрейпер — версия БЕЗ ИИ оценки (Ollama полностью удалена)
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import asyncio
|
||||||
|
import tempfile
|
||||||
|
import httpx
|
||||||
|
import time
|
||||||
|
|
||||||
|
from aiogram import Bot, Dispatcher, types, F
|
||||||
|
from aiogram.filters import Command
|
||||||
|
from aiogram.types import InlineKeyboardMarkup, InlineKeyboardButton, FSInputFile, CallbackQuery
|
||||||
|
|
||||||
|
# Импорт только нужного
|
||||||
|
from config import CONFIG
|
||||||
|
from link_collector import collect_links
|
||||||
|
from main import process_batch, save_to_excel, load_urls
|
||||||
|
|
||||||
|
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Module-level state shared by the handlers in this file.
# Path of the most recently written Excel result (None until one is saved).
last_output_file = None
# URLs read from the last uploaded .txt file, waiting for a city to be picked.
pending_links = None
|
||||||
|
|
||||||
|
# ==================== ГОРОДА ====================
|
||||||
|
# Selectable cities. When a city is chosen its "name", "lr" and "keywords"
# values are copied into CONFIG["region_name"], CONFIG["region_lr"] and
# CONFIG["required_keywords"] respectively.
CITIES = {
    "Нижний Новгород": {
        "lr": 47,
        "keywords": ['Нижний Новгород', 'Новгород', 'нижний новгород'],
        "name": "Нижний Новгород",
    },
    "Краснодар": {
        "lr": 35,
        "keywords": ['Краснодар', 'краснодар'],
        "name": "Краснодар",
    },
    "Москва": {
        "lr": 213,
        "keywords": ['Москва', 'москва'],
        "name": "Москва",
    },
}
|
||||||
|
|
||||||
|
async def simple_progress(message: types.Message, done: int, total: int, last: list, unique_phones: set):
    """Refresh the progress message with the counter and the phones found so far.

    The message is edited at most once per second; the final update
    (``done == total``) is always attempted.

    Args:
        message: Message whose text is replaced via ``edit_text``.
        done: Number of items processed so far.
        total: Total number of items.
        last: One-element mutable list holding the ``time.time()`` value of
            the last successful edit; updated in place on success.
        unique_phones: Phones collected so far; rendered sorted, one per line.
    """
    now = time.time()
    if now - last[0] < 1.0 and done != total:
        return

    text = f"⏳ <b>Обработка:</b> {done}/{total}"
    if unique_phones:
        # Sorted so the list does not jump around between updates.
        text += "\n\n" + "\n".join(sorted(unique_phones))

    # Telegram limits message text to 4096 characters; without trimming,
    # every edit would fail once the phone list grows past that size.
    max_len = 4096
    if len(text) > max_len:
        text = text[:max_len - 1] + "…"

    try:
        await message.edit_text(text, parse_mode="HTML")
    except Exception as exc:
        # Best effort: a failed edit must not abort the batch. Catching
        # Exception (not a bare except) lets task cancellation propagate.
        logger.debug("progress update skipped: %s", exc)
    else:
        last[0] = now  # remember when the message was last refreshed
|
||||||
|
|
||||||
|
def save_config():
    """Write the in-memory ``CONFIG`` back to ``config.py`` as Python source.

    The ``httpx.Timeout`` value is swapped for a string placeholder while
    ``pprint`` formats the dict, and the placeholder is then replaced in the
    text by an ``httpx.Timeout(...)`` constructor expression.

    Returns:
        True when the file was written, False when anything failed (the
        error is logged with its traceback).
    """
    import pprint

    try:
        http_cfg = CONFIG["http"]
        timeout_obj = http_cfg["timeout"]

        # Temporarily replace the object with a placeholder for pprint and
        # always restore it, even if formatting raises.
        http_cfg["timeout"] = "PLACEHOLDER_TIMEOUT"
        try:
            s = pprint.pformat(CONFIG, width=100, sort_dicts=False)
        finally:
            http_cfg["timeout"] = timeout_obj

        if isinstance(timeout_obj, httpx.Timeout):
            connect = getattr(timeout_obj, 'connect', 5.0)
            read = getattr(timeout_obj, 'read', 10.0)
            timeout_str = f"httpx.Timeout({read}, connect={connect})"
        else:
            timeout_str = repr(timeout_obj)

        s = s.replace("'PLACEHOLDER_TIMEOUT'", timeout_str)

        # Write to a sibling file first and swap it in, so a failed write
        # cannot leave a truncated config.py behind.
        tmp_path = Path('config.py.tmp')
        tmp_path.write_text(
            f'# config.py\nimport httpx\n# 🔧 КОНФИГУРАЦИЯ\nCONFIG = {s}',
            encoding='utf-8',
        )
        tmp_path.replace('config.py')

        logger.info("✅ CONFIG успешно сохранён")
        return True
    except Exception as e:
        # logger.exception appends the traceback to the message.
        logger.exception("❌ Ошибка сохранения config.py: %s", e)
        return False
|
||||||
|
|
||||||
|
def get_search_queries_count() -> int:
    """Count the non-blank lines of the file named by ``CONFIG["search_file"]``.

    Returns:
        The number of non-blank lines, or 0 when the file cannot be read.
    """
    try:
        with open(CONFIG["search_file"], encoding="utf-8") as f:
            return sum(1 for line in f if line.strip())
    except Exception:
        # Used only for a status line, so any read problem degrades to 0.
        # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
        return 0
|
||||||
|
|
||||||
|
# ==================== КЛАВИАТУРЫ ====================
|
||||||
|
def main_menu_keyboard():
    """Build the main menu: upload button on row one, settings and city on row two."""
    upload_btn = InlineKeyboardButton(text="📤 Загрузить файл с ссылками", callback_data="upload_links")
    settings_btn = InlineKeyboardButton(text="⚙️ Настройки", callback_data="settings_menu")
    city_btn = InlineKeyboardButton(text="🌆 Сменить город", callback_data="city_menu")
    return InlineKeyboardMarkup(inline_keyboard=[[upload_btn], [settings_btn, city_btn]])
|
||||||
|
|
||||||
|
def back_to_menu_keyboard():
    """Build a one-button keyboard that leads back to the main menu."""
    back_btn = InlineKeyboardButton(text="⬅️ Вернуться в меню", callback_data="main_menu")
    return InlineKeyboardMarkup(inline_keyboard=[[back_btn]])
|
||||||
|
|
||||||
|
def cities_keyboard():
    """Build a keyboard with one row per entry of ``CITIES`` plus a back row."""
    rows = []
    for city in CITIES:
        # The city key travels in the callback data as "set_city_<city>".
        rows.append([InlineKeyboardButton(text=f"🌆 {city}", callback_data=f"set_city_{city}")])
    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
    return InlineKeyboardMarkup(inline_keyboard=rows)
|
||||||
|
|
||||||
|
def search_menu_keyboard():
    """Build the search.txt menu: show, edit and back, one button per row."""
    entries = (
        ("📖 Показать search.txt", "show_search"),
        ("✏️ Изменить search.txt", "edit_search"),
        ("⬅️ Назад", "main_menu"),
    )
    rows = [[InlineKeyboardButton(text=label, callback_data=data)] for label, data in entries]
    return InlineKeyboardMarkup(inline_keyboard=rows)
|
||||||
|
|
||||||
|
def output_format_keyboard():
    """Build the output-format picker: one row per format plus a back row."""
    options = (
        ("phones", "📞 Только номера"),
        ("domains", "🌐 Только домены"),
        ("both", "📞+🌐 Номера и домены"),
        ("excel", "📊 Excel файл"),
    )
    rows = []
    for key, label in options:
        # The format key travels in the callback data as "set_format_<key>".
        rows.append([InlineKeyboardButton(text=label, callback_data=f"set_format_{key}")])
    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
    return InlineKeyboardMarkup(inline_keyboard=rows)
|
||||||
|
|
||||||
|
def settings_keyboard():
    """Build the settings menu: output-format row and back row."""
    format_row = [InlineKeyboardButton(text="📤 Формат вывода", callback_data="output_format_menu")]
    back_row = [InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")]
    return InlineKeyboardMarkup(inline_keyboard=[format_row, back_row])
|
||||||
|
|
||||||
|
# ==================== НАСТРОЙКИ ВЫВОДА ====================
|
||||||
|
|
||||||
|
async def settings_menu_callback(query: CallbackQuery):
    """Show the settings screen with the currently selected output format."""
    icons = {"phones": "📞", "domains": "🌐", "both": "📞+🌐", "excel": "📊"}
    fmt = CONFIG.get("output_format", "excel")
    icon = icons.get(fmt, '📊')
    body = f"⚙️ <b>Настройки</b>\n\n📤 Текущий формат: <b>{icon} {fmt}</b>"
    await query.message.edit_text(body, reply_markup=settings_keyboard(), parse_mode="HTML")
    await query.answer()
|
||||||
|
|
||||||
|
async def output_format_menu_callback(query: CallbackQuery):
    """Replace the current message with the output-format picker."""
    await query.message.edit_text(
        "📤 <b>Выберите формат вывода:</b>",
        reply_markup=output_format_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
|
||||||
|
|
||||||
|
async def set_output_format_callback(query: CallbackQuery):
    """Store the format chosen via a "set_format_<key>" button and reopen settings.

    Unknown keys are rejected with an alert and leave CONFIG untouched.
    """
    labels = {
        "phones": "📞 Только номера",
        "domains": "🌐 Только домены",
        "both": "📞+🌐 Номера и домены",
        "excel": "📊 Excel файл",
    }
    fmt = query.data.replace("set_format_", "")
    if fmt not in labels:
        await query.answer("❌ Неверный формат", show_alert=True)
        return

    CONFIG["output_format"] = fmt
    save_config()
    await query.answer(f"✅ Формат: {labels[fmt]}")
    await settings_menu_callback(query)
|
||||||
|
|
||||||
|
# ==================== ФОРМАТИРОВАНИЕ РЕЗУЛЬТАТОВ ====================
|
||||||
|
|
||||||
|
def format_results(results: list, fmt: str) -> str:
    """Render result rows as plain text for the chosen output format.

    Args:
        results: Rows shaped ``(org, phone, domain, promo, rating)``; only
            ``phone`` (index 1) and ``domain`` (index 2) are used here.
        fmt: ``'phones'``, ``'domains'`` or ``'both'``. Any other value
            (including ``'excel'``) has no text form.

    Returns:
        One line per row that has data for the format, joined by newlines;
        an empty string for formats without a text form.
    """
    if fmt == "phones":
        return "\n".join(r[1] for r in results if r[1])
    if fmt == "domains":
        return "\n".join(r[2] for r in results if r[2])
    if fmt == "both":
        lines = []
        for r in results:
            # Join only the parts that exist, so a missing phone or domain
            # never shows up as "None" or as a dangling separator.
            parts = [part for part in (r[1], r[2]) if part]
            if parts:
                lines.append(" — ".join(parts))
        return "\n".join(lines)
    return ""  # excel output is delivered as a file, not as text
|
||||||
|
|
||||||
|
# ==================== ХЕНДЛЕРЫ ====================
|
||||||
|
async def start_handler(message: types.Message):
    """Reply to /start with a greeting and the main menu."""
    greeting = "👋 <b>Авто-Скрейпер</b>\n\nГотов к работе!\nВыбери действие ниже:"
    await message.answer(greeting, reply_markup=main_menu_keyboard(), parse_mode="HTML")
|
||||||
|
|
||||||
|
async def menu_callback(query: CallbackQuery):
    """Replace the current message with the main menu."""
    keyboard = main_menu_keyboard()
    await query.message.edit_text("👋 <b>Главное меню</b>", reply_markup=keyboard, parse_mode="HTML")
    await query.answer()
|
||||||
|
|
||||||
|
async def status_handler(query: CallbackQuery):
    """Show the configured city, the search-query count and the last result file."""
    lines = [
        "📊 <b>Статус скрейпера</b>",
        "",
        f"🌆 Город: <b>{CONFIG['region_name']}</b>",
        f"🔎 Запросов в search.txt: <b>{get_search_queries_count()}</b>",
        f"📁 Последний результат: <code>{last_output_file or '—'}</code>",
        "",
        "✅ Работает без ИИ-оценки",
    ]
    await query.message.edit_text("\n".join(lines), reply_markup=back_to_menu_keyboard(), parse_mode="HTML")
    await query.answer()
|
||||||
|
|
||||||
|
# ==================== СМЕНА ГОРОДА ====================
|
||||||
|
async def city_menu_callback(query: CallbackQuery):
    """Replace the current message with the city picker."""
    keyboard = cities_keyboard()
    await query.message.edit_text("🌆 <b>Выберите город:</b>", reply_markup=keyboard, parse_mode="HTML")
    await query.answer()
|
||||||
|
|
||||||
|
async def set_city_callback(query: CallbackQuery):
    """Apply the chosen city to CONFIG and process an uploaded link list, if any.

    The city key comes from callback data shaped "set_city_<city>". When no
    uploaded links are pending, only the status screen is shown. Otherwise
    the links are processed and delivered in the configured output format
    (Excel file, text message, or a .txt attachment for long text).
    """
    global pending_links, last_output_file

    city_name = query.data.removeprefix("set_city_")
    city = CITIES.get(city_name)
    if city is None:
        # Callback data that matches no known city must not raise KeyError.
        await query.answer("❌ Неизвестный город", show_alert=True)
        return

    CONFIG["region_name"] = city["name"]
    CONFIG["region_lr"] = city["lr"]
    CONFIG["required_keywords"] = city["keywords"]
    save_config()

    if not pending_links:
        await query.answer(f"✅ Город изменён на {city_name}")
        await status_handler(query)
        return

    # Claim the list before the first await: a second button press while
    # this batch runs then finds nothing pending, and a list uploaded in the
    # meantime is not wiped when this batch finishes.
    links, pending_links = pending_links, None

    await query.answer(f"✅ Город изменён на {city_name}. Начинаю обработку ссылок...")
    await query.message.edit_text("⏳ <b>Обрабатываю ссылки из файла...</b>", parse_mode="HTML")

    try:
        processed = load_urls(links)

        last = [time.time()]   # last progress refresh, mutated by simple_progress
        unique_phones = set()  # passed to process_batch and rendered by simple_progress

        async def progress_cb(done: int, total: int):
            await simple_progress(query.message, done, total, last, unique_phones)

        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)

        # Keep the first row per domain (index 2) that has a phone (index 1)
        # and append the placeholder rating column.
        seen = set()
        results = []
        for row in raw:
            if row[1] and row[2] not in seen:
                seen.add(row[2])
                org, phone, domain, promo = row
                results.append((org, phone, domain, promo, "—"))

        # Final progress refresh.
        await progress_cb(len(processed), len(processed))

        fmt = CONFIG.get("output_format", "excel")

        if not results:
            await query.message.answer("⚠️ Телефоны не найдены")
        elif fmt == "excel":
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            path = CONFIG["output_file"].format(timestamp=ts)
            save_to_excel(results, path)
            last_output_file = path

            promo_cnt = sum(1 for r in results if r[3])
            await query.message.answer(
                f"✅ <b>Обработка завершена!</b>\n"
                f"📊 Обработано URL: {len(links)}\n"
                f"📞 Найдено телефонов: {len(results)}\n"
                f"🎯 Из них promo: {promo_cnt}\n"
                f"📁 Формат: Excel",
                parse_mode="HTML"
            )
            await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
        else:
            text = format_results(results, fmt)
            if not text:
                await query.message.answer("⚠️ Нет данных для отображения")
            elif len(text) > 3800:
                # Long output is sent as a .txt attachment instead of a message.
                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
                    f.write(text)
                    path = f.name
                try:
                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                    await query.message.answer_document(FSInputFile(path), caption=f"✅ {caption}")
                finally:
                    # Remove the temp file even when sending fails.
                    Path(path).unlink(missing_ok=True)
            else:
                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                await query.message.answer(
                    f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n"
                    f"📊 Обработано: {len(results)}\n\n{text}",
                    parse_mode="HTML"
                )
    except Exception as e:
        logger.exception("❌ set_city_callback error: %s", e)
        await query.message.answer(f"❌ Ошибка: {e}")

    await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
|
||||||
|
|
||||||
|
# ==================== search.txt ====================
|
||||||
|
async def search_menu_callback(query: CallbackQuery):
    """Replace the current message with the search.txt management menu."""
    keyboard = search_menu_keyboard()
    await query.message.edit_text("📝 <b>Управление search.txt</b>", reply_markup=keyboard, parse_mode="HTML")
    await query.answer()
|
||||||
|
|
||||||
|
async def show_search_callback(query: CallbackQuery):
    """Send the contents of the file named by ``CONFIG["search_file"]``.

    Content longer than 3800 characters is sent as a .txt attachment;
    shorter content is sent inline as an HTML message. Any failure is
    reported to the user as an error message.
    """
    import html

    try:
        with open(CONFIG["search_file"], encoding="utf-8") as src:
            content = src.read()

        if len(content) > 3800:
            with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                tmp.write(content)
                path = tmp.name
            try:
                await query.message.answer_document(FSInputFile(path), caption="📝 search.txt")
            finally:
                # Remove the temp file even when sending fails.
                Path(path).unlink(missing_ok=True)
        else:
            # The header uses <b> markup, so the message needs HTML parse
            # mode, and the file content must be escaped so stray "<" or "&"
            # characters are not treated as markup.
            await query.message.answer(
                f"📝 <b>search.txt</b>:\n\n{html.escape(content, quote=False)}",
                parse_mode="HTML",
            )
    except Exception as e:
        await query.message.answer(f"❌ Ошибка: {e}")
    await query.answer()
|
||||||
|
|
||||||
|
async def edit_search_callback(query: CallbackQuery):
    """Ask the user to send the new search.txt content in their next message.

    NOTE(review): this handler only shows the prompt; no message handler
    registered in main_bot appears to store the follow-up text — confirm
    where it is meant to be saved.
    """
    prompt = "✏️ Отправьте новое содержимое search.txt в следующем сообщении"
    await query.answer(prompt)
|
||||||
|
|
||||||
|
# ==================== ОСНОВНЫЕ ДЕЙСТВИЯ ====================
|
||||||
|
|
||||||
|
async def scrape_callback(query: CallbackQuery):
    """Run a full scrape: collect links, process them and deliver the results.

    Results are delivered in the configured output format (Excel file, text
    message, or a .txt attachment for long text), followed by the main menu.
    """
    global last_output_file

    await query.answer("🚀 Запуск скрейпинга...")
    msg = await query.message.edit_text("⏳ <b>Выполняю полный скрейпинг...</b>", parse_mode="HTML")

    last = [time.time()]   # last progress refresh, mutated by simple_progress
    unique_phones = set()  # passed to process_batch and rendered by simple_progress

    async def progress_cb(done: int, total: int):
        await simple_progress(msg, done, total, last, unique_phones)

    try:
        # NOTE(review): collect_links() is called without await; if it blocks
        # for long, the event loop is stalled meanwhile — confirm.
        links = collect_links()
        urls = load_urls(links)
        if not urls:
            await msg.answer("❌ Нет ссылок для обработки", reply_markup=main_menu_keyboard())
            return

        raw = await process_batch(urls, progress_callback=progress_cb, unique_phones=unique_phones)

        # Keep the first row per domain (index 2) that has a phone (index 1)
        # and append the placeholder rating column.
        seen = set()
        results = []
        for row in raw:
            if row[1] and row[2] not in seen:
                seen.add(row[2])
                org, phone, domain, promo = row
                results.append((org, phone, domain, promo, "—"))

        # Final progress refresh.
        await progress_cb(len(urls), len(urls))

        if not results:
            await query.message.answer("⚠️ Телефоны не найдены")
        else:
            fmt = CONFIG.get("output_format", "excel")

            if fmt == "excel":
                ts = datetime.now().strftime("%Y%m%d_%H%M%S")
                path = CONFIG["output_file"].format(timestamp=ts)
                save_to_excel(results, path)
                last_output_file = path

                promo_cnt = sum(1 for r in results if r[3])
                await query.message.answer(
                    f"✅ <b>Скрейпинг завершён!</b>\n"
                    f"📊 Обработано URL: {len(urls)}\n"
                    f"📞 Найдено телефонов: {len(results)}\n"
                    f"🎯 Из них promo: {promo_cnt}",
                    parse_mode="HTML"
                )
                await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
            else:
                # Text output.
                text = format_results(results, fmt)
                if not text:
                    await query.message.answer("⚠️ Нет данных для отображения")
                elif len(text) > 3800:
                    with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
                        f.write(text)
                        path = f.name
                    try:
                        caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                        await query.message.answer_document(FSInputFile(path), caption=f"✅ {caption}")
                    finally:
                        # Remove the temp file even when sending fails.
                        Path(path).unlink(missing_ok=True)
                else:
                    labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                    await query.message.answer(
                        f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n\n{text}",
                        parse_mode="HTML"
                    )
    except Exception as e:
        # Log with traceback; previously the error was only shown to the user.
        logger.exception("❌ scrape_callback error: %s", e)
        await query.message.answer(f"❌ Ошибка: {e}")

    await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
|
||||||
|
|
||||||
|
async def process_callback(query: CallbackQuery):
    """Acknowledge the button press and ask the user to send links as text."""
    await query.answer()
    prompt = "📋 <b>Отправьте ссылки для обработки</b>\n(можно несколько через пробел)"
    await query.message.answer(prompt, parse_mode="HTML")
|
||||||
|
|
||||||
|
async def manual_process(message: types.Message):
    """Process the URLs contained in a plain text message.

    Whitespace-separated tokens starting with ``http://`` or ``https://``
    are treated as URLs. Results are delivered in the configured output
    format, and the main menu is offered afterwards in every outcome.
    """
    global last_output_file

    # An explicit scheme check keeps words that merely start with "http"
    # from being treated as links.
    urls = [u for u in message.text.split() if u.startswith(("http://", "https://"))]
    if not urls:
        await message.answer("❌ Не найдено валидных ссылок")
        return

    progress_msg = await message.answer(f"⏳ Обрабатываю {len(urls)} URL...")
    last = [time.time()]   # last progress refresh, mutated by simple_progress
    unique_phones = set()  # passed to process_batch and rendered by simple_progress

    async def progress_cb(done: int, total: int):
        await simple_progress(progress_msg, done, total, last, unique_phones)

    try:
        processed = load_urls(urls)
        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)

        # Keep the first row per domain (index 2) that has a phone (index 1)
        # and append the placeholder rating column.
        seen = set()
        results = []
        for row in raw:
            if row[1] and row[2] not in seen:
                seen.add(row[2])
                org, phone, domain, promo = row
                results.append((org, phone, domain, promo, "—"))

        # Final progress refresh.
        await progress_cb(len(processed), len(processed))

        fmt = CONFIG.get("output_format", "excel")

        if not results:
            await message.answer("⚠️ Телефоны не найдены")
        elif fmt == "excel":
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            path = CONFIG["output_file"].format(timestamp=ts)
            save_to_excel(results, path)
            last_output_file = path

            await message.answer(
                f"✅ <b>Готово!</b>\n📊 Обработано: {len(results)}\n📁 Формат: Excel",
                parse_mode="HTML"
            )
            await message.answer_document(FSInputFile(path), caption="📁 Результаты")
        else:
            text = format_results(results, fmt)
            if not text:
                await message.answer("⚠️ Нет данных для отображения")
            elif len(text) > 3800:
                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
                    f.write(text)
                    path = f.name
                try:
                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                    await message.answer_document(FSInputFile(path), caption=f"✅ {caption}")
                finally:
                    # Remove the temp file even when sending fails.
                    Path(path).unlink(missing_ok=True)
            else:
                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                await message.answer(
                    f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n\n{text}",
                    parse_mode="HTML"
                )
    except Exception as e:
        logger.exception("❌ manual_process error: %s", e)
        await message.answer(f"❌ Ошибка: {e}")

    # Offered once, after every outcome, as the other processing handlers do.
    await message.answer("Что дальше?", reply_markup=main_menu_keyboard())
|
||||||
|
|
||||||
|
async def upload_links_callback(query: CallbackQuery):
    """Acknowledge the button press and ask the user to upload a .txt file of links."""
    await query.answer()
    prompt = "📤 <b>Отправьте файл .txt с ссылками</b>\n(одна на строку)"
    await query.message.answer(prompt, parse_mode="HTML")
|
||||||
|
|
||||||
|
async def handle_uploaded_file(message: types.Message, bot: Bot):
    """Read URLs from an uploaded .txt document and ask the user for a city.

    Lines starting with ``http://`` or ``https://`` are stored in the
    module-level ``pending_links``; they are processed once a city button
    is pressed. ``CONFIG["urls"]`` and ``CONFIG["input_file"]`` are cleared
    so only the uploaded links are used.
    """
    global pending_links

    document = message.document
    # file_name may be missing; treat that like a wrong extension. The check
    # is case-insensitive so "LINKS.TXT" is accepted too.
    file_name = document.file_name or ""
    if not file_name.lower().endswith('.txt'):
        await message.answer("❌ Файл должен быть .txt")
        return

    # Reserve a temp path, then close the handle before downloading into it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp_name = tmp.name

    try:
        file_info = await bot.get_file(document.file_id)
        await bot.download_file(file_info.file_path, tmp_name)

        # utf-8-sig drops a leading BOM, which would otherwise hide the
        # first URL from the startswith() check below.
        with open(tmp_name, 'r', encoding='utf-8-sig') as f:
            content = f.read()
        urls = [
            line.strip()
            for line in content.splitlines()
            if line.strip().startswith(('http://', 'https://'))
        ]

        if not urls:
            await message.answer("❌ Нет валидных ссылок в файле")
            return

        # Clear the config so load_urls does not add anything extra.
        CONFIG["urls"] = []
        CONFIG["input_file"] = ""

        pending_links = urls
        await message.answer(f"✅ Файл загружен! Найдено {len(urls)} ссылок.\nТеперь выберите город:", parse_mode="HTML")
        await message.answer("🌆 <b>Выберите город:</b>", reply_markup=cities_keyboard(), parse_mode="HTML")
    except Exception as e:
        logger.exception("❌ handle_uploaded_file error: %s", e)
        await message.answer(f"❌ Ошибка: {e}")
    finally:
        # The temp file is removed on every path, including a failed download.
        Path(tmp_name).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
# ==================== ЗАПУСК ====================
|
||||||
|
def main_bot(token: str):
    """Create the bot, register all handlers and run long polling until stopped.

    Args:
        token: Telegram bot token passed to ``Bot``.
    """
    bot = Bot(token=token)
    dp = Dispatcher()

    # Handlers are registered in the order listed here.
    message_routes = (
        (start_handler, Command("start")),
        (manual_process, F.text & ~F.text.startswith("/")),
        (handle_uploaded_file, F.document),
    )
    for handler, route_filter in message_routes:
        dp.message.register(handler, route_filter)

    callback_routes = (
        (menu_callback, F.data == "main_menu"),
        (status_handler, F.data == "status"),
        (scrape_callback, F.data == "scrape"),
        (process_callback, F.data == "process"),
        (upload_links_callback, F.data == "upload_links"),
        (search_menu_callback, F.data == "search_menu"),
        (show_search_callback, F.data == "show_search"),
        (edit_search_callback, F.data == "edit_search"),
        (city_menu_callback, F.data == "city_menu"),
        (set_city_callback, F.data.startswith("set_city_")),
        (settings_menu_callback, F.data == "settings_menu"),
        (output_format_menu_callback, F.data == "output_format_menu"),
        (set_output_format_callback, F.data.startswith("set_format_")),
    )
    for handler, route_filter in callback_routes:
        dp.callback_query.register(handler, route_filter)

    asyncio.run(dp.start_polling(bot))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: the bot token is a required --token argument.
    cli = argparse.ArgumentParser()
    cli.add_argument('--token', required=True)
    main_bot(cli.parse_args().token)
|
||||||
43
config.py
Normal file
43
config.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
# config.py
|
||||||
|
import httpx
|
||||||
|
# 🔧 КОНФИГУРАЦИЯ
|
||||||
|
# NOTE: bot.save_config() rewrites this whole file from the in-memory CONFIG
# using pprint, so comments added here do not survive the next save.
CONFIG = {'input_file': '',
 # '{timestamp}' is filled in with a %Y%m%d_%H%M%S stamp when results are saved.
 'output_file': 'C:\\Coding\\auto-scraper\\output\\phones_{timestamp}.xlsx',
 'log_file': 'C:\\Coding\\auto-scraper\\logs\\scraper.log',
 # Domains skipped by exact match (see is_excluded in main.py).
 'excluded_domains': {'auto.drom.ru',
                      'auto.ru',
                      'autocompass-j.ru',
                      'autocompass-v.ru',
                      'avito.ru',
                      'duckduckgo.com',
                      'google.com',
                      'sberauto.com',
                      'sberleasing.ru'},
 'urls': [],
 # One of 'phones', 'domains', 'both', 'excel' (set from the bot's settings menu).
 'output_format': 'both',
 'http': {'timeout': httpx.Timeout(10.0, connect=5.0),
          'max_redirects': 5,
          # Used as max_retries for non-Yandex hosts in main._get_client_config.
          'retry_attempts': 3,
          'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like '
                        'Gecko) Chrome/120.0.0.0 Safari/537.36',
          'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
                      'Referer': 'https://yandex.ru/',
                      'Sec-Fetch-Dest': 'document',
                      'Sec-Fetch-Mode': 'navigate'}},
 # 'patterns' are compiled case-insensitively into main.TEL_PATTERNS at import.
 'phone': {'patterns': ['href=["\\\']tel:([^"\\\']+)["\\\']',
                        'tel["\\\']?\\s*[:=]\\s*["\\\']?([+()0-9\\-\\s]{10,})["\\\']?',
                        '(?:телефон|phone|контакт)["\\\']?\\s*[:=]?\\s*["\\\']?([+()0-9\\-\\s]{10,})'],
           'country_code': '7',
           'min_digits': 10,
           'max_digits': 12},
 # Page filters (see check_content_filters in main.py): at least one required
 # keyword must be present and no stop keyword may be present.
 'required_keywords': ['Краснодар', 'краснодар'],
 'stop_keywords': ['аренда', 'АРЕНДА', 'Аренда', '2311373680', 'autocompass'],
 'headless': False,
 'search_pages': 3,
 'workers': 3,
 'search_template_url': 'https://ya.ru/search/?text={search}&lr={lr}',
 # region_lr / region_name / required_keywords are overwritten when a city
 # is selected in the bot.
 'region_lr': 35,
 'region_name': 'Краснодар',
 'search_file': 'search.txt',
 'profile_dir': 'C:\\Users\\Дмитрий\\chrome_profile_yandex'}
|
||||||
419
main.py
Normal file
419
main.py
Normal file
@ -0,0 +1,419 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
📞 DuckDuckGo/Yandex → Phone Number Scraper v2.2
|
||||||
|
Извлекает ссылки → следует редиректам (включая yabs) → парсит телефоны → сохраняет в Excel
|
||||||
|
+ Исправления: PoolTimeout, экспоненциальный бэк-офф, адаптивные лимиты, рандомизация
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import asyncio
|
||||||
|
import random
|
||||||
|
import httpx
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from openpyxl import Workbook
|
||||||
|
from openpyxl.styles import Font, Alignment, PatternFill
|
||||||
|
from datetime import datetime
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# === Специфичные исключения httpx ===
|
||||||
|
from httpx import PoolTimeout, ConnectTimeout, ReadTimeout, HTTPStatusError, RequestError
|
||||||
|
|
||||||
|
from config import CONFIG
|
||||||
|
from link_collector import collect_links
|
||||||
|
|
||||||
|
# Suppress httpx info logs
|
||||||
|
logging.getLogger("httpx").setLevel(logging.WARNING)

# Phone regexes from the config, compiled once at import time.
TEL_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in CONFIG["phone"]["patterns"]
]

# Pool of User-Agent strings; one is picked at random per request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0",
]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_domain(url: str) -> str:
    """Return the lower-cased host of *url* without a leading ``www.``.

    Used to compare domains when looking for duplicates.

    Returns:
        The normalized host, or an empty string when *url* has no host or
        cannot be parsed.
    """
    try:
        domain = urlparse(url.strip()).hostname or ''
    except (ValueError, AttributeError):
        # ValueError: malformed URL; AttributeError: *url* is not a string.
        return ''
    # Strip only a leading "www." — str.replace would also cut it out of the
    # middle of a host such as "awww.site.ru".
    return domain.lower().removeprefix('www.')
|
||||||
|
|
||||||
|
|
||||||
|
def is_excluded(domain: str) -> bool:
    """Tell whether *domain* is listed in ``CONFIG["excluded_domains"]``.

    The comparison is an exact match; subdomains of a listed domain are
    not treated as excluded.
    """
    excluded = CONFIG["excluded_domains"]
    return domain in excluded
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_phone(phone: str) -> str | None:
|
||||||
|
"""Нормализация телефона к формату: +7 (XXX) XXX-XX-XX"""
|
||||||
|
digits = re.sub(r"[^\d+]", "", phone.strip())
|
||||||
|
|
||||||
|
if digits.startswith('+7'):
|
||||||
|
digits = digits[2:]
|
||||||
|
elif digits.startswith('8') and len(digits) == 11:
|
||||||
|
digits = digits[1:]
|
||||||
|
elif digits.startswith('7') and len(digits) == 11:
|
||||||
|
digits = digits[1:]
|
||||||
|
|
||||||
|
if len(digits) != 10:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return f"+7 ({digits[:3]}) {digits[3:6]}-{digits[6:8]}-{digits[8:10]}"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_phone_from_html(html: str) -> str | None:
    """Return the first phone number found in *html*, normalized.

    Patterns from ``TEL_PATTERNS`` are tried in order. Within a pattern,
    every match is tried in order of appearance until one normalizes, so a
    malformed first match no longer hides a valid later one.

    Returns:
        The normalized phone, or None when no match can be normalized.
    """
    for pattern in TEL_PATTERNS:
        for match in pattern.finditer(html):
            normalized = normalize_phone(match.group(1).strip())
            if normalized:
                return normalized
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def check_content_filters(html: str) -> bool:
    """Check *html* against the required and stop keywords, case-insensitively.

    Returns:
        False when required keywords are configured and none occurs in the
        page, or when any configured stop keyword occurs; True otherwise.
    """
    haystack = html.lower()

    required = CONFIG["required_keywords"]
    if required:
        wanted = [kw.lower() for kw in required]
        hit = False
        for kw in wanted:
            if kw in haystack:
                hit = True
                break
        if not hit:
            return False

    stops = CONFIG["stop_keywords"]
    if stops:
        banned = [kw.lower() for kw in stops]
        for kw in banned:
            if kw in haystack:
                return False

    return True
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_redirect_chain(url: str, final_url: str) -> tuple[str, bool]:
    """Derive the final domain and the promo flag from a followed redirect.

    Args:
        url: The URL that was requested originally.
        final_url: The URL reached after following redirects.

    Returns:
        ``(final_domain, is_promo)``: the lower-cased host of *final_url*
        without a leading ``www.``, and True when the original host is
        exactly ``yabs.yandex.ru``. If parsing fails, the normalized domain
        of *url* and False are returned instead.
    """
    try:
        original_host = urlparse(url.strip()).hostname or ''
        is_promo = (original_host == 'yabs.yandex.ru')

        final_domain = urlparse(final_url.strip()).hostname or ''
        # Strip only a leading "www." — str.replace would also cut it out of
        # the middle of a host such as "awww.shop.ru".
        final_domain = final_domain.lower().removeprefix('www.')
        return final_domain, is_promo
    except Exception:
        return normalize_domain(url), False
|
||||||
|
|
||||||
|
|
||||||
|
def _get_client_config(url: str) -> dict:
    """Return HTTP client settings for *url* (gentler mode for Yandex hosts).

    A URL counts as Yandex when its hostname is ``yandex.ru`` or ``ya.ru`` or
    a subdomain of either (which covers ``yabs.yandex.ru``). The hostname is
    matched rather than the raw URL text, so hosts such as ``maya.ru`` or a
    ``?ref=yandex.ru`` query parameter do not trigger the Yandex profile.

    Returns:
        A dict with the keys ``limits`` (httpx.Limits), ``timeout``
        (httpx.Timeout), ``retry_base_delay`` (float) and ``max_retries``
        (int).
    """
    try:
        host = (urlparse(url.strip()).hostname or '').lower()
    except ValueError:
        # A malformed URL cannot be classified; use the default profile.
        host = ''

    is_yandex = any(
        host == domain or host.endswith('.' + domain)
        for domain in ('yandex.ru', 'ya.ru')
    )

    if is_yandex:
        return {
            "limits": httpx.Limits(max_connections=30, max_keepalive_connections=20),
            "timeout": httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=30.0),
            "retry_base_delay": 2.0,
            "max_retries": 2,
        }

    return {
        "limits": httpx.Limits(max_connections=20, max_keepalive_connections=10),
        "timeout": httpx.Timeout(connect=10.0, read=30.0, write=10.0, pool=10.0),
        "retry_base_delay": 1.0,
        "max_retries": CONFIG["http"]["retry_attempts"],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _backoff_delay(base_delay: float, retries: int) -> float:
    """Return an exponential back-off delay in seconds with random jitter."""
    return base_delay * (2 ** retries) + random.uniform(0.5, 1.5)


def _retry_after_seconds(raw_value, default: int = 5) -> int:
    """Parse a Retry-After header value given as whole seconds.

    Returns *default* when the header is missing or is not an integer (the
    header may also carry an HTTP-date); negative values are clamped to 0.
    """
    try:
        return max(int(raw_value), 0)
    except (TypeError, ValueError):
        return default


async def fetch_with_retry(client: httpx.AsyncClient, url: str, retries: int = 0,
                           base_delay: float = 1.0, max_retries: int = 3) -> tuple[str, str | None, str | None, bool]:
    """Download *url*, extract a phone number and retry on transient errors.

    Args:
        client: Shared httpx client used for the request.
        url: Page to download.
        retries: Number of attempts already made (used by the recursion).
        base_delay: Base of the exponential back-off, in seconds.
        max_retries: Maximum number of retries after the first attempt.

    Returns:
        ``(url, phone, final_domain, is_promo)``. ``phone`` is None when the
        page fails the content filters, contains no phone, or could not be
        downloaded; in the last case ``final_domain`` is
        ``normalize_domain(url)`` and ``is_promo`` is False.

    Retry policy:
        * PoolTimeout: at most ``min(2, max_retries)`` retries.
        * ReadTimeout: never retried.
        * HTTP 429: retried after the Retry-After delay plus jitter, bounded
          by ``max_retries``; other HTTP errors are not retried.
        * Anything else (connect timeouts, other request errors, unexpected
          exceptions): retried with back-off up to ``max_retries``.
    """
    def _failed():
        return url, None, normalize_domain(url), False

    async def _retry(delay: float):
        await asyncio.sleep(delay)
        return await fetch_with_retry(client, url, retries + 1, base_delay, max_retries)

    try:
        # Rotate the User-Agent on every request.
        headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Cache-Control": "max-age=0",
        }

        async with client.stream("GET", url, headers=headers, follow_redirects=True) as response:
            if response.status_code >= 400:
                raise HTTPStatusError(
                    f"Status {response.status_code}",
                    request=response.request,
                    response=response
                )

            final_url = str(response.url)
            full_html = ''.join(
                [chunk async for chunk in response.aiter_text(chunk_size=8192)]
            )

        final_domain, is_promo = analyze_redirect_chain(url, final_url)

        if not check_content_filters(full_html):
            return url, None, final_domain, is_promo

        phone = extract_phone_from_html(full_html)
        return url, phone, final_domain, is_promo

    except PoolTimeout:
        # Pool exhaustion is retried at most twice, even if max_retries is larger.
        if retries < min(2, max_retries):
            return await _retry(_backoff_delay(base_delay, retries))
        return _failed()

    except ReadTimeout:
        # A server that stalls while sending the body is not retried.
        return _failed()

    except HTTPStatusError as e:
        # Bound the 429 retries as well; otherwise a server that always
        # answers 429 would make this function recurse without limit.
        if e.response.status_code == 429 and retries < max_retries:
            wait = _retry_after_seconds(e.response.headers.get('Retry-After'))
            return await _retry(wait + random.randint(1, 3))
        return _failed()

    except Exception:
        # Connect timeouts, other request errors and unexpected failures
        # all share the same bounded exponential back-off.
        if retries < max_retries:
            return await _retry(_backoff_delay(base_delay, retries))
        return _failed()
|
||||||
|
|
||||||
|
|
||||||
|
async def process_batch(urls: list[str], batch_size: int = 50, progress_callback=None, unique_phones: set = None):
    """Fetch *urls* in consecutive batches and collect the rows that have a phone.

    Each batch gets its own httpx.AsyncClient configured by
    _get_client_config from the first URL of the batch; all URLs of a batch
    are fetched concurrently, and a random 1-2 second pause separates batches.

    :param urls: URLs to process.
    :param batch_size: number of URLs fetched concurrently per batch.
    :param progress_callback: optional async callable ``(done, total)``,
        awaited once per processed URL and once more at the very end.
    :param unique_phones: optional set that receives every phone found.
    :return: list of ``(original_url, phone, final_domain, is_promo)`` tuples
        for the URLs where a phone was found.
    """
    total_urls = len(urls)
    results = []

    async def _report(done: int) -> None:
        if progress_callback and callable(progress_callback):
            await progress_callback(done, total_urls)

    for start in range(0, total_urls, batch_size):
        chunk = urls[start:start + batch_size]
        settings = _get_client_config(chunk[0] if chunk else "")

        async with httpx.AsyncClient(
            headers={"User-Agent": random.choice(USER_AGENTS)},
            timeout=settings["timeout"],
            follow_redirects=True,
            limits=settings["limits"]
        ) as client:
            outcomes = await asyncio.gather(
                *(
                    fetch_with_retry(
                        client, link,
                        base_delay=settings["retry_base_delay"],
                        max_retries=settings["max_retries"]
                    )
                    for link in chunk
                ),
                return_exceptions=True
            )

            for position, outcome in enumerate(outcomes, start=start + 1):
                if not isinstance(outcome, Exception):
                    original_url, phone, final_domain, is_promo = outcome
                    print(f"{position}: {final_domain} - {phone if phone else 'нет'}")
                    if phone:
                        results.append((original_url, phone, final_domain, is_promo))
                        if unique_phones is not None:
                            # Incrementally record each phone that was found.
                            unique_phones.add(phone)
                # Failed and successful URLs both advance the progress counter.
                await _report(position)

            # Pause between batches, but not after the last one.
            if start + batch_size < total_urls:
                await asyncio.sleep(random.uniform(1.0, 2.0))

    await _report(total_urls)
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_excel(results: list[tuple], filepath: str):
    """Write the results to an Excel workbook with promo rows highlighted.

    Args:
        results: Rows shaped ``(original_url, phone, final_domain, is_promo)``
            or ``(original_url, phone, final_domain, is_promo, rating)``.
            The rating is optional, so the 4-tuples returned by
            process_batch can be saved directly; its cell is left empty
            when absent.
        filepath: Destination ``.xlsx`` path; missing parent directories
            are created.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Phone Numbers"

    headers = ["Original URL", "Phone", "Final Domain", "Promo", "Processed At", "Rating"]
    ws.append(headers)

    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF")
    for cell in ws[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = Alignment(horizontal="center")

    promo_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")

    for row in results:
        # Accept rows with or without the trailing rating element.
        original_url, phone, final_domain, is_promo, *extra = row
        rating = extra[0] if extra else None

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        promo_mark = "YES" if is_promo else "no"

        ws.append([original_url, phone, final_domain, promo_mark, timestamp, rating])

        if is_promo:
            for cell in ws[ws.max_row]:
                cell.fill = promo_fill

    # Size each column to its longest value, capped at 60 characters.
    for column in ws.columns:
        widest = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )
        ws.column_dimensions[column[0].column_letter].width = min(widest + 2, 60)

    ws.freeze_panes = 'A2'

    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    wb.save(filepath)
|
||||||
|
|
||||||
|
|
||||||
|
def load_urls(additional_urls: list[str]) -> list[str]:
    """Collect, clean and de-duplicate the URLs to process.

    Sources, in order: CONFIG["urls"], the lines of CONFIG["input_file"]
    (when it is set and exists) and *additional_urls*.

    Filtering: entries that do not start with ``http``, have no normalized
    domain, or whose domain is excluded by is_excluded are dropped. Only the
    first URL per domain is kept, except ``yabs.yandex.ru`` links, which are
    all kept.

    Args:
        additional_urls: Extra URLs appended after the configured ones.

    Returns:
        The filtered list of URLs in their original order.
    """
    urls = list(CONFIG["urls"])

    input_file = CONFIG["input_file"]
    if input_file and Path(input_file).exists():
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                for line in f:
                    # Drop the trailing comma before the quotes, so that a
                    # line such as `"https://a.ru",` loses both quotes.
                    line = line.strip().rstrip(',').strip().strip('"\'')
                    if line and line.startswith('http'):
                        urls.append(line.split()[0])
        except (OSError, UnicodeError) as e:
            # Reading the file is best-effort, but the failure is reported
            # instead of being silently ignored.
            print(f"Не удалось прочитать {input_file}: {e}")

    urls.extend(additional_urls)

    seen_domains = set()
    cleaned = []

    for url in urls:
        url = url.strip()
        if not url or not url.startswith('http'):
            continue
        domain = normalize_domain(url)
        if not domain or is_excluded(domain):
            continue
        if domain == 'yabs.yandex.ru':
            # Promo redirect links share one host but lead to different
            # sites, so they bypass the per-domain de-duplication.
            cleaned.append(url)
            continue
        if domain in seen_domains:
            continue
        seen_domains.add(domain)
        cleaned.append(url)

    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Command-line entry point: gather URLs, scrape phones, save a report.

    URLs come from collect_links() (a failure there is reported and
    ignored), the configuration, and the positional command-line arguments.
    Results are reduced to one row per final domain, optionally filtered to
    promo rows with ``--promo-only``, and written to the path built from
    CONFIG["output_file"].
    """
    cli = argparse.ArgumentParser(description="Phone Scraper")
    cli.add_argument('--promo-only', action='store_true', help="Сохранять только promo-записи (yabs.yandex.ru)")
    cli.add_argument('urls', nargs='*', help="Дополнительные URL для обработки")
    options = cli.parse_args()

    try:
        collected_links = collect_links()
    except Exception as e:
        print(f"Ошибка в collect_links: {e}. Продолжаем без собранных ссылок.")
        collected_links = []

    urls = load_urls(collected_links + options.urls)
    if not urls:
        print("\n💡 Использование:")
        print(" python script.py [--promo-only] 'https://site1.ru' 'https://site2.ru'")
        return

    # Keep only the first row seen for every final domain.
    first_per_domain = {}
    for row in await process_batch(urls):
        first_per_domain.setdefault(row[2], row)
    results = list(first_per_domain.values())

    if options.promo_only:
        results = [row for row in results if row[3]]

    if results:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = CONFIG["output_file"].format(timestamp=stamp)
        save_to_excel(results, output_path)

        promo_count = len([row for row in results if row[3]])
        print(f"\n📊 ИТОГИ:")
        print(f" 🔍 Обработано: {len(urls)}")
        print(f" 📞 Найдено телефонов: {len(results)}")
        print(f" 🎯 Из promo (yabs): {promo_count}")
        print(f" 📁 Файл: {output_path}")

    print("\n✅ Готово!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the async entry point. Ctrl+C ends the program with a short
    # notice; every other exception propagates unchanged.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n⚠ Прервано пользователем")
|
||||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
httpx
|
||||||
|
selenium
|
||||||
|
openpyxl
|
||||||
|
ollama
|
||||||
|
aiogram
|
||||||
Loading…
Reference in New Issue
Block a user