# bot.py
# 🚀 Авто-Скрейпер — версия БЕЗ ИИ оценки (Ollama полностью удалена)
|
||
|
||
import argparse
import asyncio
import html
import logging
import tempfile
import time
from datetime import datetime
from pathlib import Path

import httpx
from aiogram import Bot, Dispatcher, types, F
from aiogram.filters import Command
from aiogram.types import InlineKeyboardMarkup, InlineKeyboardButton, FSInputFile, CallbackQuery

# Import only what is needed from the project
from config import CONFIG
from main import process_batch, save_to_excel, load_urls
|
||
|
||
# Every log record carries a timestamp, the logger name and the level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Module-level state shared between handlers
last_output_file = None  # path of the most recently written Excel result
pending_links = None  # URLs from an uploaded file, waiting for a city choice

# ==================== CITIES ====================
# Per-city settings; the handlers copy "name", "lr" and "keywords" into CONFIG
# when a city is selected.
CITIES = {
    "Нижний Новгород": {
        "lr": 47,
        "keywords": ['Нижний Новгород', 'Новгород', 'нижний новгород'],
        "name": "Нижний Новгород",
    },
    "Краснодар": {
        "lr": 35,
        "keywords": ['Краснодар', 'краснодар'],
        "name": "Краснодар",
    },
    "Москва": {
        "lr": 213,
        "keywords": ['Москва', 'москва'],
        "name": "Москва",
    },
}
|
||
|
||
async def simple_progress(message: types.Message, done: int, total: int, last: list, unique_phones: set):
    """Refresh the progress message with the counter and the phones found so far.

    The message is edited at most once per second, except that the final
    update (``done == total``) is always attempted.

    Args:
        message: the message whose text is replaced via ``edit_text``.
        done: number of items processed so far.
        total: total number of items.
        last: one-element mutable list ``[last_update_time]``; updated in
            place after a successful edit.
        unique_phones: set of phone strings appended (sorted) under the counter.

    Editing is best-effort: ordinary errors are logged at debug level and
    ignored. Cancellation and interpreter-exit signals are NOT swallowed.
    """
    now = time.time()

    # Throttle: skip unless 1+ second has passed or this is the last element.
    if done != total and now - last[0] < 1.0:
        return

    try:
        text = f"⏳ <b>Обработка:</b> {done}/{total}"
        if unique_phones:
            # Sorted so the list does not jump around between updates.
            text += "\n\n" + "\n".join(sorted(unique_phones))
        await message.edit_text(text, parse_mode="HTML")
    except Exception as exc:
        # Progress display must never break the processing itself.
        logging.getLogger(__name__).debug("progress update skipped: %s", exc)
    else:
        last[0] = now  # remember when the message was last refreshed
|
||
|
||
def save_config():
    """Serialise the in-memory CONFIG back into ``config.py``.

    CONFIG is rendered with ``pprint``. The ``CONFIG["http"]["timeout"]``
    object is temporarily replaced by a string placeholder during rendering
    and then written out as a constructor expression, so the generated file
    recreates it on import.

    Returns:
        True when the file was written, False on any failure (the error and
        traceback are logged).
    """
    import pprint

    try:
        http_cfg = CONFIG["http"]
        timeout_obj = http_cfg["timeout"]

        # Swap the object for a placeholder only while rendering; the finally
        # guarantees the live CONFIG is restored even if pformat fails.
        http_cfg["timeout"] = "PLACEHOLDER_TIMEOUT"
        try:
            rendered = pprint.pformat(CONFIG, width=100, sort_dicts=False)
        finally:
            http_cfg["timeout"] = timeout_obj

        if isinstance(timeout_obj, httpx.Timeout):
            # Spell out all four phases so write/pool values are not lost.
            timeout_str = (
                f"httpx.Timeout(connect={timeout_obj.connect!r}, read={timeout_obj.read!r}, "
                f"write={timeout_obj.write!r}, pool={timeout_obj.pool!r})"
            )
        else:
            timeout_str = repr(timeout_obj)

        rendered = rendered.replace("'PLACEHOLDER_TIMEOUT'", timeout_str)

        with open('config.py', 'w', encoding='utf-8') as f:
            f.write(f'# config.py\nimport httpx\n# 🔧 КОНФИГУРАЦИЯ\nCONFIG = {rendered}')

        logger.info("✅ CONFIG успешно сохранён")
        return True
    except Exception as e:
        # logger.exception records the message at ERROR level with traceback.
        logger.exception("❌ Ошибка сохранения config.py: %s", e)
        return False
|
||
|
||
def get_search_queries_count() -> int:
    """Return the number of non-blank lines in the file at CONFIG["search_file"].

    Best-effort: any ordinary failure (missing key, unreadable file, decoding
    error) yields 0. Interpreter-exit signals are not swallowed.
    """
    try:
        with open(CONFIG["search_file"], encoding="utf-8") as f:
            # Count lazily instead of building a throwaway list.
            return sum(1 for line in f if line.strip())
    except Exception as exc:
        logger.debug("could not count search queries: %s", exc)
        return 0
|
||
|
||
# ==================== КЛАВИАТУРЫ ====================
|
||
def main_menu_keyboard():
    """Build the main-menu inline keyboard: upload-links row, then settings row."""
    upload_row = [
        InlineKeyboardButton(text="📤 Загрузить файл с ссылками", callback_data="upload_links"),
    ]
    settings_row = [
        InlineKeyboardButton(text="⚙️ Настройки", callback_data="settings_menu"),
    ]
    return InlineKeyboardMarkup(inline_keyboard=[upload_row, settings_row])
|
||
|
||
def back_to_menu_keyboard():
    """Build a one-button keyboard that leads back to the main menu."""
    back_button = InlineKeyboardButton(text="⬅️ Вернуться в меню", callback_data="main_menu")
    return InlineKeyboardMarkup(inline_keyboard=[[back_button]])
|
||
|
||
def cities_keyboard():
    """Build a keyboard with one row per entry of CITIES plus a back row."""
    rows = []
    for city in CITIES:
        # callback_data carries the city key after the "set_city_" prefix.
        button = InlineKeyboardButton(text=f"🌆 {city}", callback_data=f"set_city_{city}")
        rows.append([button])
    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
    return InlineKeyboardMarkup(inline_keyboard=rows)
|
||
|
||
def search_menu_keyboard():
    """Build the search.txt menu keyboard: show, edit, back (one button per row)."""
    entries = (
        ("📖 Показать search.txt", "show_search"),
        ("✏️ Изменить search.txt", "edit_search"),
        ("⬅️ Назад", "main_menu"),
    )
    rows = [[InlineKeyboardButton(text=label, callback_data=data)] for label, data in entries]
    return InlineKeyboardMarkup(inline_keyboard=rows)
|
||
|
||
def output_format_keyboard():
    """Build the output-format chooser: one row per format plus a back row."""
    # (format key, button label) pairs, in display order.
    choices = (
        ("phones", "📞 Только номера"),
        ("domains", "🌐 Только домены"),
        ("both", "📞+🌐 Номера и домены"),
        ("excel", "📊 Excel файл"),
    )
    rows = []
    for key, label in choices:
        rows.append([InlineKeyboardButton(text=label, callback_data=f"set_format_{key}")])
    rows.append([InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")])
    return InlineKeyboardMarkup(inline_keyboard=rows)
|
||
|
||
def settings_keyboard():
    """Build the settings keyboard: output-format row, then back row."""
    format_row = [InlineKeyboardButton(text="📤 Формат вывода", callback_data="output_format_menu")]
    back_row = [InlineKeyboardButton(text="⬅️ Назад", callback_data="main_menu")]
    return InlineKeyboardMarkup(inline_keyboard=[format_row, back_row])
|
||
|
||
# ==================== НАСТРОЙКИ ВЫВОДА ====================
|
||
|
||
async def settings_menu_callback(query: CallbackQuery):
    """Show the settings screen with the currently configured output format."""
    icons = {"phones": "📞", "domains": "🌐", "both": "📞+🌐", "excel": "📊"}
    fmt = CONFIG.get("output_format", "excel")
    # Unknown format values fall back to the Excel icon.
    icon = icons.get(fmt, '📊')

    screen = (
        f"⚙️ <b>Настройки</b>\n\n"
        f"📤 Текущий формат: <b>{icon} {fmt}</b>"
    )
    await query.message.edit_text(screen, reply_markup=settings_keyboard(), parse_mode="HTML")
    await query.answer()
|
||
|
||
async def output_format_menu_callback(query: CallbackQuery):
    """Replace the current message with the output-format chooser."""
    await query.message.edit_text(
        "📤 <b>Выберите формат вывода:</b>",
        reply_markup=output_format_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
|
||
|
||
async def set_output_format_callback(query: CallbackQuery):
    """Store the output format chosen via a ``set_format_<key>`` button.

    A valid key is written to CONFIG["output_format"], persisted with
    save_config(), confirmed to the user, and the settings screen is shown
    again. Any other key produces an alert.
    """
    names = {
        "phones": "📞 Только номера",
        "domains": "🌐 Только домены",
        "both": "📞+🌐 Номера и домены",
        "excel": "📊 Excel файл",
    }
    chosen = query.data.replace("set_format_", "")

    if chosen not in names:
        await query.answer("❌ Неверный формат", show_alert=True)
        return

    CONFIG["output_format"] = chosen
    save_config()
    await query.answer(f"✅ Формат: {names[chosen]}")
    await settings_menu_callback(query)
|
||
|
||
# ==================== ФОРМАТИРОВАНИЕ РЕЗУЛЬТАТОВ ====================
|
||
|
||
def format_results(results: list, fmt: str) -> str:
    """Render results as plain text for the chosen output format.

    Args:
        results: rows shaped ``(org, phone, domain, promo, rating)``; only the
            phone (index 1) and domain (index 2) are used here.
        fmt: ``'phones'``, ``'domains'``, ``'both'`` or ``'excel'``.

    Returns:
        One entry per line. ``'phones'`` / ``'domains'`` list only the
        non-empty values. ``'both'`` lists ``phone — domain`` for rows that
        have at least one of them, showing ``—`` in place of a missing value.
        Any other format (including ``'excel'``) yields an empty string,
        because it is not delivered as text.
    """
    if fmt == "phones":
        return "\n".join(row[1] for row in results if row[1])
    if fmt == "domains":
        return "\n".join(row[2] for row in results if row[2])
    if fmt == "both":
        # A missing half is shown as a dash rather than the literal "None".
        return "\n".join(
            f"{row[1] or '—'} — {row[2] or '—'}"
            for row in results
            if row[1] or row[2]
        )
    return ""  # the excel format has no text representation
|
||
|
||
# ==================== ХЕНДЛЕРЫ ====================
|
||
async def start_handler(message: types.Message):
    """Greet the user and show the main menu (reply to the /start command)."""
    greeting = "👋 <b>Авто-Скрейпер</b>\n\nГотов к работе!\nВыбери действие ниже:"
    keyboard = main_menu_keyboard()
    await message.answer(greeting, reply_markup=keyboard, parse_mode="HTML")
|
||
|
||
async def menu_callback(query: CallbackQuery):
    """Turn the current message into the main menu and acknowledge the click."""
    await query.message.edit_text(
        "👋 <b>Главное меню</b>",
        reply_markup=main_menu_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
|
||
|
||
async def status_handler(query: CallbackQuery):
    """Show the status screen: configured city, query count and last result file."""
    region = CONFIG['region_name']
    query_count = get_search_queries_count()
    # A dash is shown until a result file has been produced.
    last_file = last_output_file or '—'

    lines = [
        f"📊 <b>Статус скрейпера</b>\n\n",
        f"🌆 Город: <b>{region}</b>\n",
        f"🔎 Запросов в search.txt: <b>{query_count}</b>\n",
        f"📁 Последний результат: <code>{last_file}</code>\n\n",
        f"✅ Работает без ИИ-оценки",
    ]
    await query.message.edit_text(
        "".join(lines),
        reply_markup=back_to_menu_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
|
||
|
||
# ==================== СМЕНА ГОРОДА ====================
|
||
async def city_menu_callback(query: CallbackQuery):
    """Replace the current message with the city chooser."""
    await query.message.edit_text(
        "🌆 <b>Выберите город:</b>",
        reply_markup=cities_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
|
||
|
||
async def set_city_callback(query: CallbackQuery):
    """Apply the city chosen via a ``set_city_<name>`` button.

    The city's name, ``lr`` code and keywords are copied into CONFIG and
    persisted with save_config(). If links from an uploaded file are waiting
    in ``pending_links`` they are processed right away and the result is
    delivered in the configured output format; otherwise the status screen
    is shown.

    An unknown city name produces an alert instead of an unhandled KeyError.
    """
    global pending_links, last_output_file

    city_name = query.data.replace("set_city_", "")
    city = CITIES.get(city_name)
    if city is None:
        await query.answer("❌ Неизвестный город", show_alert=True)
        return

    CONFIG["region_name"] = city["name"]
    CONFIG["region_lr"] = city["lr"]
    CONFIG["required_keywords"] = city["keywords"]
    save_config()

    if not pending_links:
        await query.answer(f"✅ Город изменён на {city_name}")
        await status_handler(query)
        return

    # Claim the queued links before the first await: a second click during
    # processing must not start a second run, and finishing this run must not
    # wipe links uploaded in the meantime.
    links, pending_links = pending_links, None

    await query.answer(f"✅ Город изменён на {city_name}. Начинаю обработку ссылок...")
    await query.message.edit_text("⏳ <b>Обрабатываю ссылки из файла...</b>", parse_mode="HTML")

    try:
        processed = load_urls(links)

        last = [time.time()]  # last progress-update time, for simple_progress
        unique_phones = set()  # passed to process_batch and shown in progress

        async def progress_cb(done: int, total: int):
            await simple_progress(query.message, done, total, last, unique_phones)

        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)

        # Keep rows that have a phone, first occurrence per domain only.
        seen = set()
        results = []
        for row in raw:
            if not row[1] or row[2] in seen:
                continue
            seen.add(row[2])
            org, phone, domain, promo = row
            results.append((org, phone, domain, promo, "—"))

        # Final progress refresh (done == total is always attempted).
        await progress_cb(len(processed), len(processed))

        fmt = CONFIG.get("output_format", "excel")

        if not results:
            await query.message.answer("⚠️ Телефоны не найдены")
        elif fmt == "excel":
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            path = CONFIG["output_file"].format(timestamp=ts)
            save_to_excel(results, path)
            last_output_file = path

            promo_cnt = sum(1 for r in results if r[3])
            await query.message.answer(
                f"✅ <b>Обработка завершена!</b>\n"
                f"📊 Обработано URL: {len(links)}\n"
                f"📞 Найдено телефонов: {len(results)}\n"
                f"🎯 Из них promo: {promo_cnt}\n"
                f"📁 Формат: Excel",
                parse_mode="HTML"
            )
            await query.message.answer_document(FSInputFile(path), caption="📁 Результаты")
        else:
            text = format_results(results, fmt)
            if not text:
                await query.message.answer("⚠️ Нет данных для отображения")
            elif len(text) > 3800:
                # Too long for a single message: deliver as a .txt document.
                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
                    f.write(text)
                    tmp_path = f.name
                try:
                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                    await query.message.answer_document(FSInputFile(tmp_path), caption=f"✅ {caption}")
                finally:
                    # Remove the temp file even when sending fails.
                    Path(tmp_path).unlink(missing_ok=True)
            else:
                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                # Scraped text goes into an HTML message, so <, > and & must
                # be escaped or Telegram rejects the message.
                safe_text = html.escape(text, quote=False)
                await query.message.answer(
                    f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n"
                    f"📊 Обработано: {len(results)}\n\n{safe_text}",
                    parse_mode="HTML"
                )

    except Exception as e:
        logger.exception("❌ set_city_callback error: %s", e)
        await query.message.answer(f"❌ Ошибка: {e}")

    await query.message.answer("Что дальше?", reply_markup=main_menu_keyboard())
|
||
|
||
# ==================== search.txt ====================
|
||
async def search_menu_callback(query: CallbackQuery):
    """Replace the current message with the search.txt management menu."""
    await query.message.edit_text(
        "📝 <b>Управление search.txt</b>",
        reply_markup=search_menu_keyboard(),
        parse_mode="HTML",
    )
    await query.answer()
|
||
|
||
async def show_search_callback(query: CallbackQuery):
    """Send the contents of the file at CONFIG["search_file"] to the user.

    Content longer than 3800 characters is delivered as a .txt document,
    shorter content as an HTML-formatted message. Any failure is reported to
    the user as an error message; the callback is acknowledged in all cases.
    """
    try:
        with open(CONFIG["search_file"], encoding="utf-8") as src:
            content = src.read()

        if len(content) > 3800:
            with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as tmp:
                tmp.write(content)
                tmp_path = tmp.name
            try:
                await query.message.answer_document(FSInputFile(tmp_path), caption="📝 search.txt")
            finally:
                # Remove the temp file even when sending fails.
                Path(tmp_path).unlink(missing_ok=True)
        else:
            # The header uses <b>, so HTML mode is required for it to render;
            # the file content is escaped so its own <, > and & stay literal.
            await query.message.answer(
                f"📝 <b>search.txt</b>:\n\n{html.escape(content, quote=False)}",
                parse_mode="HTML",
            )
    except Exception as e:
        logger.exception("❌ show_search_callback error: %s", e)
        await query.message.answer(f"❌ Ошибка: {e}")
    await query.answer()
|
||
|
||
async def edit_search_callback(query: CallbackQuery):
    """Tell the user to send the new search.txt content in the next message.

    NOTE(review): this handler only shows the notification; no code in this
    function stores the follow-up message — confirm where it is consumed.
    """
    prompt = "✏️ Отправьте новое содержимое search.txt в следующем сообщении"
    await query.answer(prompt)
|
||
|
||
# ==================== ОСНОВНЫЕ ДЕЙСТВИЯ ====================
|
||
|
||
|
||
async def process_callback(query: CallbackQuery):
    """Acknowledge the click and ask the user to send links as a text message."""
    await query.answer()
    prompt = "📋 <b>Отправьте ссылки для обработки</b>\n(можно несколько через пробел)"
    await query.message.answer(prompt, parse_mode="HTML")
|
||
|
||
async def manual_process(message: types.Message):
    """Process links sent as a plain text message.

    Whitespace-separated tokens starting with ``http`` are treated as URLs,
    run through load_urls() and process_batch(), de-duplicated by domain and
    delivered in the format from CONFIG["output_format"] (Excel file, text
    message, or .txt document when the text exceeds 3800 characters).

    After processing — successful or not — the main menu is offered, matching
    the behaviour of set_city_callback.
    """
    global last_output_file

    urls = [token for token in message.text.split() if token.startswith("http")]
    if not urls:
        await message.answer("❌ Не найдено валидных ссылок")
        return

    progress_msg = await message.answer(f"⏳ Обрабатываю {len(urls)} URL...")
    last = [time.time()]  # last progress-update time, for simple_progress
    unique_phones = set()  # passed to process_batch and shown in progress

    async def progress_cb(done: int, total: int):
        await simple_progress(progress_msg, done, total, last, unique_phones)

    try:
        processed = load_urls(urls)
        raw = await process_batch(processed, progress_callback=progress_cb, unique_phones=unique_phones)

        # Keep rows that have a phone, first occurrence per domain only.
        seen = set()
        results = []
        for row in raw:
            if not row[1] or row[2] in seen:
                continue
            seen.add(row[2])
            org, phone, domain, promo = row
            results.append((org, phone, domain, promo, "—"))

        # Final progress refresh (done == total is always attempted).
        await progress_cb(len(processed), len(processed))

        fmt = CONFIG.get("output_format", "excel")

        if not results:
            await message.answer("⚠️ Телефоны не найдены")
        elif fmt == "excel":
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            path = CONFIG["output_file"].format(timestamp=ts)
            save_to_excel(results, path)
            last_output_file = path

            await message.answer(
                f"✅ <b>Готово!</b>\n📊 Обработано: {len(results)}\n📁 Формат: Excel",
                parse_mode="HTML"
            )
            await message.answer_document(FSInputFile(path), caption="📁 Результаты")
        else:
            text = format_results(results, fmt)
            if not text:
                await message.answer("⚠️ Нет данных для отображения")
            elif len(text) > 3800:
                # Too long for a single message: deliver as a .txt document.
                with tempfile.NamedTemporaryFile(mode='w+', encoding='utf-8', suffix='.txt', delete=False) as f:
                    f.write(text)
                    tmp_path = f.name
                try:
                    caption = {"phones": "📞 Номера", "domains": "🌐 Домены", "both": "📞+🌐 Результаты"}.get(fmt, "Результаты")
                    await message.answer_document(FSInputFile(tmp_path), caption=f"✅ {caption}")
                finally:
                    # Remove the temp file even when sending fails.
                    Path(tmp_path).unlink(missing_ok=True)
            else:
                labels = {"phones": "📞", "domains": "🌐", "both": "📞+🌐"}
                # Scraped text goes into an HTML message, so <, > and & must
                # be escaped or Telegram rejects the message.
                safe_text = html.escape(text, quote=False)
                await message.answer(
                    f"✅ <b>{labels.get(fmt, '')} Результаты:</b>\n\n{safe_text}",
                    parse_mode="HTML"
                )

    except Exception as e:
        logger.exception("❌ manual_process error: %s", e)
        await message.answer(f"❌ Ошибка: {e}")

    await message.answer("Что дальше?", reply_markup=main_menu_keyboard())
|
||
|
||
async def upload_links_callback(query: CallbackQuery):
    """Acknowledge the click and ask the user to send a .txt file with links."""
    await query.answer()
    prompt = "📤 <b>Отправьте файл .txt с ссылками</b>\n(одна на строку)"
    await query.message.answer(prompt, parse_mode="HTML")
|
||
|
||
async def handle_uploaded_file(message: types.Message, bot: Bot):
    """Accept an uploaded .txt file of links and queue them for processing.

    The document is downloaded to a temporary file and read as UTF-8. Lines
    starting with ``http`` are stored in the module-level ``pending_links``,
    and the user is then asked to pick a city. The temporary file is always
    removed, including when the download or parsing fails.
    """
    global pending_links

    document = message.document
    # file_name can be absent; the extension check ignores letter case.
    file_name = document.file_name or ""
    if not file_name.lower().endswith('.txt'):
        await message.answer("❌ Файл должен быть .txt")
        return

    # Reserve a temp path and close the handle before downloading into it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp_path = tmp.name

    try:
        file_info = await bot.get_file(document.file_id)
        await bot.download_file(file_info.file_path, tmp_path)

        with open(tmp_path, 'r', encoding='utf-8') as f:
            content = f.read()
        urls = [line.strip() for line in content.splitlines() if line.strip().startswith('http')]

        if not urls:
            await message.answer("❌ Нет валидных ссылок в файле")
            return

        # Clear URL sources in CONFIG; per the original author's note this
        # keeps load_urls from adding extra links (load_urls is not visible
        # here — confirm).
        CONFIG["urls"] = []
        CONFIG["input_file"] = ""

        pending_links = urls
        await message.answer(f"✅ Файл загружен! Найдено {len(urls)} ссылок.\nТеперь выберите город:", parse_mode="HTML")
        await message.answer("🌆 <b>Выберите город:</b>", reply_markup=cities_keyboard(), parse_mode="HTML")
    except Exception as e:
        logger.exception("❌ handle_uploaded_file error: %s", e)
        await message.answer(f"❌ Ошибка: {e}")
    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||
|
||
# ==================== ЗАПУСК ====================
|
||
def main_bot(token: str):
    """Create the bot, register all handlers and run polling until stopped.

    Args:
        token: Telegram bot token passed to ``Bot``.
    """
    bot = Bot(token=token)
    dp = Dispatcher()

    # Message handlers, in registration order.
    message_routes = (
        (start_handler, Command("start")),
        (manual_process, F.text & ~F.text.startswith("/")),
        (handle_uploaded_file, F.document),
    )
    for handler, route_filter in message_routes:
        dp.message.register(handler, route_filter)

    # Callback-query handlers, in registration order.
    callback_routes = (
        (menu_callback, F.data == "main_menu"),
        (status_handler, F.data == "status"),
        (process_callback, F.data == "process"),
        (upload_links_callback, F.data == "upload_links"),
        (search_menu_callback, F.data == "search_menu"),
        (show_search_callback, F.data == "show_search"),
        (edit_search_callback, F.data == "edit_search"),
        (city_menu_callback, F.data == "city_menu"),
        (set_city_callback, F.data.startswith("set_city_")),
        (settings_menu_callback, F.data == "settings_menu"),
        (output_format_menu_callback, F.data == "output_format_menu"),
        (set_output_format_callback, F.data.startswith("set_format_")),
    )
    for handler, route_filter in callback_routes:
        dp.callback_query.register(handler, route_filter)

    asyncio.run(dp.start_polling(bot))
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: the bot token is a required argument.
    cli = argparse.ArgumentParser()
    cli.add_argument('--token', required=True)
    options = cli.parse_args()
    main_bot(options.token)