Files
stock-info-crawler/app/crawlers/barrons.py
MH Hung 58cc979b5b refactor: modularize project structure and separate API from crawlers
- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
2025-09-04 21:39:24 +08:00

71 lines
2.7 KiB
Python

from __future__ import annotations
import hashlib
from datetime import datetime
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from app.crawlers.base import BaseCrawler
class BarronsCrawler(BaseCrawler):
    """Crawler for Barron's "Stock Picks" page.

    Downloads the listing page and extracts headline/link pairs. Several CSS
    selectors are tried in order because the page markup changes frequently.
    """

    def __init__(self, config, logger):
        super().__init__(
            name="Barron's 股票推薦",
            config=config,
            logger=logger,
            data_filename='barrons_data.json',
        )
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        # Desktop browser UA: the site may serve a reduced page to unknown clients.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        """Fetch the stock-picks page.

        Returns:
            The HTML body as text, or ``None`` on any request/HTTP error
            (the error is logged and counted in ``self.stats['errors']``).
        """
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        """Parse the page HTML into stock-pick records.

        Args:
            html_content: Raw HTML of the stock-picks page.

        Returns:
            Up to 10 dicts with keys ``title``, ``link``, ``scraped_at`` and
            ``hash`` (short MD5 of the title, used for de-duplication).
            Returns an empty list if parsing fails (error logged and counted).
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks: List[Dict] = []
        try:
            # Ordered from most to least specific; the first selector that
            # matches anything wins.
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]',
            ]
            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    break
            # Cap at 10 items to avoid flooding downstream storage/notifications.
            for element in elements[:10]:
                title = element.get_text(strip=True)
                # An <a> element carries its own href; for container elements
                # look for the first nested anchor instead.
                if element.name == 'a':
                    link = element.get('href')
                else:
                    anchor = element.find('a', href=True)
                    link = anchor.get('href') if anchor is not None else None
                # Resolve site-relative links to absolute URLs.
                if isinstance(link, str) and link.startswith('/'):
                    link = "https://www.barrons.com" + link
                # Skip very short texts (nav labels, section headers, etc.).
                if title and len(title) > 10:
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        # Short content hash keyed on title only — two picks
                        # with identical titles but different links collide.
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                    })
            return stock_picks
        except Exception as e:
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []