# stock-info-crawler/app/crawlers/barrons.py

from __future__ import annotations

import hashlib
from datetime import datetime
from typing import List, Dict, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler
from app.services import notifications as notif


class BarronsCrawler(BaseCrawler):
    """Crawler for the Barron's stock-picks page.

    Downloads the page at ``self.url``, extracts headline/link pairs from the
    HTML and builds the notification e-mail for them. ``self.logger`` and the
    ``self.stats`` counters are presumably set up by ``BaseCrawler`` (not
    visible in this file) -- TODO confirm.
    """

    # Only the first MAX_ELEMENTS matches of a selector are inspected.
    MAX_ELEMENTS = 10
    # A title is kept only when it is strictly longer than this many characters.
    MIN_TITLE_LENGTH = 10
    # CSS selectors, tried in order from most specific to most generic.
    SELECTORS = (
        'article[data-module="ArticleItem"]',
        '.WSJTheme--headline',
        '.MarketDataModule-headline',
        'h3 a, h4 a',
        '[data-module] a[href*="articles"]',
    )

    def __init__(self, config, logger):
        """Configure the crawler's name, data file, target URL and HTTP headers.

        Args:
            config: Passed through unchanged to ``BaseCrawler.__init__``.
            logger: Passed through unchanged to ``BaseCrawler.__init__``.
        """
        # Name used in generic notifications; include emoji to match previous subject
        super().__init__(
            name="📈 Barron's 新股票推薦",
            config=config,
            logger=logger,
            data_filename='barrons_data.json',
        )
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        """Download ``self.url`` and return the response body as text.

        Returns:
            The decoded response text, or ``None`` when the request fails or
            the server answers with an HTTP error status. On failure the error
            is logged and ``self.stats['errors']`` is incremented.
        """
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        """Extract stock-pick items from the page HTML.

        The selectors in ``SELECTORS`` are tried in order; the first one that
        yields at least one usable item wins. A selector that matches elements
        but produces no usable title no longer blocks the remaining fallbacks.

        Args:
            html_content: Raw HTML of the stock-picks page.

        Returns:
            A list of dicts with the keys ``title``, ``link`` (absolute URL, or
            ``None`` when no anchor was found), ``scraped_at`` (ISO timestamp)
            and ``hash`` (first 8 hex digits of the title's MD5). The list is
            empty when nothing usable is found or parsing fails; in the latter
            case the error is logged and ``self.stats['errors']`` is
            incremented.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        try:
            for selector in self.SELECTORS:
                elements = soup.select(selector)
                if not elements:
                    continue
                stock_picks = self._extract_items(elements[:self.MAX_ELEMENTS])
                if stock_picks:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    return stock_picks
            return []
        except Exception as e:
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []

    def _extract_items(self, elements) -> List[Dict]:
        """Convert matched elements to item dicts, dropping short and duplicate titles."""
        items: List[Dict] = []
        seen_hashes = set()
        for element in elements:
            title = element.get_text(strip=True)
            if len(title) <= self.MIN_TITLE_LENGTH:
                continue
            digest = hashlib.md5(title.encode()).hexdigest()[:8]
            # Overlapping selectors can match the same headline more than once;
            # keep only the first occurrence so every emitted hash is unique.
            if digest in seen_hashes:
                continue
            seen_hashes.add(digest)
            items.append({
                'title': title,
                'link': self._resolve_link(element),
                'scraped_at': datetime.now().isoformat(),
                'hash': digest,
            })
        return items

    def _resolve_link(self, element):
        """Return the absolute URL for *element*'s anchor, or the raw href if it cannot be resolved."""
        if element.name == 'a':
            href = element.get('href')
        else:
            anchor = element.find('a', href=True)
            href = anchor.get('href') if anchor is not None else None
        if not isinstance(href, str) or not href:
            return href
        try:
            # urljoin handles root-relative ("/x"), protocol-relative ("//host/x")
            # and page-relative ("x") hrefs, and leaves absolute URLs untouched.
            return urljoin(self.url, href)
        except ValueError:
            # A malformed href must not abort parsing of the remaining items.
            return href

    # Keep Barron's specific email formatting (subject + body)
    def _build_email(self, items: List[Dict]):
        """Build the notification e-mail for *items*.

        Args:
            items: Item dicts as produced by ``parse_items``.

        Returns:
            A ``(subject, body)`` tuple; the subject carries the item count and
            the body comes from ``notif.format_email_body(items)``.
        """
        subject = f"📈 Barron's 新股票推薦 ({len(items)}條)"
        body = notif.format_email_body(items)
        return subject, body