from __future__ import annotations import hashlib import time import signal from abc import ABC, abstractmethod from datetime import datetime from typing import List, Dict, Optional import schedule from app.config import AppConfig from app.services import storage from app.services import notifications as notif class BaseCrawler(ABC): def __init__(self, name: str, config: AppConfig, logger, data_filename: str): self.name = name self.config = config self.logger = logger self.data_path = storage.data_file_path(config.data_dir, data_filename) self.running = True self._first_check_done = False self.stats = { 'start_time': datetime.now().isoformat(), 'total_checks': 0, 'new_picks_found': 0, 'last_check': None, 'last_notification': None, 'errors': 0, } # --- Abstract site-specific hooks --- @abstractmethod def fetch_page(self) -> Optional[str]: ... @abstractmethod def parse_items(self, html_content: str) -> List[Dict]: ... # --- Generic helpers --- def find_new(self, current: List[Dict], previous: List[Dict]) -> List[Dict]: prev_hashes = {p.get('hash') for p in previous if 'hash' in p} return [p for p in current if p.get('hash') not in prev_hashes] # --- Main check --- def run_check(self) -> Optional[List[Dict]]: self.logger.info(f"開始檢查 {self.name}...") self.stats['total_checks'] += 1 self.stats['last_check'] = datetime.now().isoformat() try: html = self.fetch_page() if not html: return [] current = self.parse_items(html) if not current: self.logger.warning("未找到內容") return [] prev = storage.load_json(self.data_path).get('stock_picks', []) new_items = self.find_new(current, prev) if new_items: self.logger.info(f"🚨 發現 {len(new_items)} 條新內容") self.stats['new_picks_found'] += len(new_items) self._send_notifications(new_items) storage.save_json(self.data_path, { 'last_update': datetime.now().isoformat(), 'stock_picks': current, 'stats': self.stats, }) return new_items # Optionally notify on first run if (not self._first_check_done) and self.config.always_notify_on_startup and current: self.logger.info("🟢 啟動首次檢查:無新內容,但依設定寄出目前清單") self._send_notifications(current) storage.save_json(self.data_path, { 'last_update': datetime.now().isoformat(), 'stock_picks': current, 'stats': self.stats, }) return current self.logger.info("✅ 沒有發現新內容") return [] except Exception as e: self.logger.error(f"檢查過程錯誤: {e}") self.stats['errors'] += 1 return None def _send_notifications(self, items: List[Dict]) -> None: sent = False # Build subject/body via hook for consistency across crawlers subject, body = self._build_email(items) if self.config.email: try: notif.send_custom_email(subject, body, self.config.email) sent = True except Exception as e: self.logger.error(f"電子郵件通知失敗: {e}") if self.config.webhook_url: try: notif.send_webhook(items, self.config.webhook_url) sent = True except Exception as e: self.logger.error(f"Webhook 通知失敗: {e}") if self.config.discord_webhook: try: notif.send_discord(items, self.config.discord_webhook) sent = True except Exception as e: self.logger.error(f"Discord 通知失敗: {e}") if sent: self.stats['last_notification'] = datetime.now().isoformat() def _build_email(self, items: List[Dict]): """Construct a generic email subject and body. Subclasses can override to customize content/format. """ subject = f"{self.name} ({len(items)}條)" lines = [] for pick in items: line = f"📊 {pick.get('title','').strip()}\n" if pick.get('link'): line += f"🔗 {pick['link']}\n" line += f"🕒 {pick.get('scraped_at', datetime.now().isoformat())}\n" line += "-" * 60 + "\n" lines.append(line) body = f"發現 {len(items)} 條新內容:\n\n" + "".join(lines) return subject, body # --- Run loop --- def _signal_handler(self, signum, frame): self.logger.info("收到停止信號,正在關閉...") self.running = False def run(self): signal.signal(signal.SIGINT, self._signal_handler) signal.signal(signal.SIGTERM, self._signal_handler) if getattr(self.config, 'run_daily_at', None): schedule.every().day.at(self.config.run_daily_at).do(self.run_check) self.logger.info(f"🚀 爬蟲已啟動,每天 {self.config.run_daily_at} 檢查一次") else: schedule.every(self.config.check_interval).seconds.do(self.run_check) self.logger.info(f"🚀 爬蟲已啟動,每 {self.config.check_interval} 秒檢查一次") self.run_check() self._first_check_done = True while self.running: schedule.run_pending() time.sleep(1) self.logger.info("爬蟲已停止")