from __future__ import annotations

import hashlib
from datetime import datetime
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler


class BarronsCrawler(BaseCrawler):
    def __init__(self, config, logger):
        super().__init__(
            name="Barron's Stock Picks",
            config=config,
            logger=logger,
            data_filename='barrons_data.json',
        )
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            )
        }

    def fetch_page(self) -> Optional[str]:
        """Fetch the stock-picks page, returning its HTML or None on failure."""
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        """Extract up to 10 stock-pick entries from the page HTML."""
        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks: List[Dict] = []
        try:
            # Try increasingly generic selectors until one matches; the page
            # layout changes often, so we fall back from specific modules to
            # bare headline anchors.
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]',
            ]
            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"Found content with selector: {selector}")
                    break

            for element in elements[:10]:
                title = element.get_text(strip=True)

                # The matched element may itself be an <a>, or a container
                # that holds one.
                if element.name == 'a':
                    link = element.get('href')
                else:
                    anchor = element.find('a', href=True)
                    link = anchor.get('href') if anchor else None

                # Resolve site-relative links to absolute URLs.
                if isinstance(link, str) and link.startswith('/'):
                    link = "https://www.barrons.com" + link

                # Skip stray fragments; a short hash of the title serves as a
                # stable dedup key across runs.
                if title and len(title) > 10:
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                    })
            return stock_picks
        except Exception as e:
            self.logger.error(f"Failed to parse page content: {e}")
            self.stats['errors'] += 1
            return []
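
# A minimal usage sketch, assuming BaseCrawler accepts an arbitrary config
# object and a standard `logging` logger (its actual contract lives in
# app.crawlers.base and is not shown here); the empty `config` dict below is
# a hypothetical stand-in.
if __name__ == '__main__':
    import logging

    logging.basicConfig(level=logging.INFO)
    crawler = BarronsCrawler(config={}, logger=logging.getLogger('barrons'))

    html = crawler.fetch_page()
    if html:
        for pick in crawler.parse_items(html):
            print(pick['hash'], pick['title'], pick['link'])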