78 lines
3.1 KiB
Python
78 lines
3.1 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from app.crawlers.base import BaseCrawler
|
|
from app.services import notifications as notif
|
|
|
|
|
|
class BarronsCrawler(BaseCrawler):
    """Crawler that scrapes headline/link pairs from the Barron's stock-picks page.

    The class downloads ``self.url``, extracts up to ``_MAX_ITEMS`` headlines
    using the first CSS selector in ``_SELECTORS`` that matches anything, and
    builds a Barron's-specific e-mail subject/body for the extracted items.

    NOTE(review): ``self.logger`` and ``self.stats`` are not assigned in this
    class; presumably ``BaseCrawler.__init__`` provides them -- confirm.
    """

    # Display name passed to the base class and reused as the e-mail subject
    # prefix (emoji included so the subject matches the previous format).
    _DISPLAY_NAME = "📈 Barron's 新股票推薦"

    # Origin against which relative hrefs found on the page are resolved.
    _BASE_URL = "https://www.barrons.com"

    # Candidate CSS selectors, tried in order; the first one that yields at
    # least one element wins and the remaining ones are not evaluated.
    _SELECTORS = (
        'article[data-module="ArticleItem"]',
        '.WSJTheme--headline',
        '.MarketDataModule-headline',
        'h3 a, h4 a',
        '[data-module] a[href*="articles"]',
    )

    # Only the first N matched elements are inspected.
    _MAX_ITEMS = 10

    # A title is kept only if it is strictly longer than this many characters.
    _MIN_TITLE_LENGTH = 10

    def __init__(self, config, logger):
        """Initialise the crawler with its target URL and request headers.

        Args:
            config: Forwarded unchanged to ``BaseCrawler.__init__``.
            logger: Forwarded unchanged to ``BaseCrawler.__init__``.
        """
        super().__init__(
            name=self._DISPLAY_NAME,
            config=config,
            logger=logger,
            data_filename='barrons_data.json',
        )
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        """Download the stock-picks page.

        Returns:
            The response body as text, or ``None`` if the request failed or
            returned an HTTP error status. On failure the error is logged and
            ``self.stats['errors']`` is incremented.
        """
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None
        return resp.text

    def parse_items(self, html_content: str) -> List[Dict]:
        """Extract stock-pick entries from the page HTML.

        Args:
            html_content: Raw HTML of the stock-picks page. Empty or ``None``
                input yields an empty list.

        Returns:
            A list of dicts with the keys ``title``, ``link``, ``scraped_at``
            (ISO-8601 local timestamp) and ``hash`` (first 8 hex characters of
            the MD5 of the title). ``link`` is an absolute URL when an href was
            found, otherwise ``None``. If parsing raises, the error is logged,
            ``self.stats['errors']`` is incremented and ``[]`` is returned.
        """
        # Imported locally so this block does not depend on a change to the
        # file-level import section.
        from urllib.parse import urljoin

        # fetch_page() returns None on failure; there is nothing to parse then.
        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks: List[Dict] = []
        try:
            elements = []
            for selector in self._SELECTORS:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    break

            for element in elements[:self._MAX_ITEMS]:
                title = element.get_text(strip=True)
                if not title or len(title) <= self._MIN_TITLE_LENGTH:
                    continue

                # The matched element is either the anchor itself or a
                # container whose first <a href=...> holds the article link.
                if element.name == 'a':
                    anchor = element
                else:
                    anchor = element.find('a', href=True)
                href = anchor.get('href') if anchor is not None else None

                # urljoin leaves absolute URLs untouched and correctly resolves
                # root-relative ("/x"), path-relative ("x") and
                # protocol-relative ("//host/x") hrefs. Empty or missing hrefs
                # are passed through unchanged.
                if isinstance(href, str) and href:
                    link = urljoin(self._BASE_URL, href)
                else:
                    link = href

                stock_picks.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    # Short, non-cryptographic fingerprint of the title; the
                    # format is kept stable so it matches previously saved data.
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
            return stock_picks
        except Exception as e:
            # Parsing boundary: any failure is logged and counted, and partial
            # results are discarded.
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []

    # Keep Barron's specific email formatting (subject + body)
    def _build_email(self, items: List[Dict]):
        """Build the notification e-mail for the given items.

        Args:
            items: Entries as produced by ``parse_items``.

        Returns:
            A ``(subject, body)`` tuple; the subject contains the item count
            and the body is produced by ``notif.format_email_body``.
        """
        subject = f"{self._DISPLAY_NAME} ({len(items)}條)"
        body = notif.format_email_body(items)
        return subject, body
|