Files
stock-info-crawler/app/crawlers/barrons.py
MH Hung 58cc979b5b refactor: modularize project structure and separate API from crawlers
- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
2025-09-04 21:39:24 +08:00

71 lines
2.7 KiB
Python

from __future__ import annotations
import hashlib
from datetime import datetime
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from app.crawlers.base import BaseCrawler
class BarronsCrawler(BaseCrawler):
    """Crawler for Barron's "Stock Picks" page.

    Downloads the listing page and extracts headline/link pairs. Several CSS
    selectors are tried in order because the page markup changes frequently.
    """

    def __init__(self, config, logger):
        super().__init__(
            name="Barron's 股票推薦",
            config=config,
            logger=logger,
            data_filename='barrons_data.json',
        )
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        # Desktop browser UA: the site may serve a reduced page to unknown clients.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        """Fetch the stock-picks page.

        Returns:
            The HTML body as text, or ``None`` on any request/HTTP error
            (the error is logged and counted in ``self.stats['errors']``).
        """
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        """Parse the page HTML into stock-pick records.

        Args:
            html_content: Raw HTML of the stock-picks page.

        Returns:
            Up to 10 dicts with keys ``title``, ``link``, ``scraped_at`` and
            ``hash`` (short MD5 of the title, used for de-duplication).
            Returns an empty list if parsing fails (error logged and counted).
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks: List[Dict] = []
        try:
            # Ordered from most to least specific; the first selector that
            # matches anything wins.
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]',
            ]
            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    break
            # Cap at 10 items to avoid flooding downstream storage/notifications.
            for element in elements[:10]:
                title = element.get_text(strip=True)
                # An <a> element carries its own href; for container elements
                # look for the first nested anchor instead.
                if element.name == 'a':
                    link = element.get('href')
                else:
                    anchor = element.find('a', href=True)
                    link = anchor.get('href') if anchor is not None else None
                # Resolve site-relative links to absolute URLs.
                if isinstance(link, str) and link.startswith('/'):
                    link = "https://www.barrons.com" + link
                # Skip very short texts (nav labels, section headers, etc.).
                if title and len(title) > 10:
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        # Short content hash keyed on title only — two picks
                        # with identical titles but different links collide.
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                    })
            return stock_picks
        except Exception as e:
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []