refactor: modularize project structure and separate API from crawlers
- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
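The back-compat entry point mentioned above is not part of the diff shown below. A minimal sketch of what such a delegating enhanced_crawler.py could look like, assuming it only forwards to app.runner.start():

# enhanced_crawler.py (sketch only; the actual shim is not shown in this diff)
# Kept for backward compatibility: delegates straight to the new runner.
from app.runner import start

if __name__ == '__main__':
    start()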

app/__init__.py (new file)
"""Modular crawler package.

This package separates API, crawlers, services, and configuration
to support multiple sites in the future.
"""

app/api/server.py (new file)
from __future__ import annotations

from datetime import datetime
from flask import Flask, jsonify, request

from app.services import notifications as notif


def create_app(crawler) -> Flask:
    app = Flask(__name__)

    @app.get('/health')
    def health():
        return jsonify({"status": "healthy", "timestamp": datetime.now().isoformat()})

    @app.get('/stats')
    def stats():
        if crawler:
            return jsonify(crawler.stats)
        return jsonify({"error": "Crawler not initialized"}), 500

    @app.get('/check')
    def manual_check():
        if not crawler:
            return jsonify({"error": "Crawler not initialized"}), 500
        result = crawler.run_check() or []
        return jsonify({"result": f"Found {len(result)} new picks"})

    @app.get('/notify_test')
    def notify_test():
        if not crawler:
            return jsonify({"error": "Crawler not initialized"}), 500
        channel = (request.args.get('channel') or 'email').lower()
        test_pick = [notif.build_test_pick()]
        try:
            if channel == 'email':
                if not crawler.config.email:
                    return jsonify({"error": "Email config not set"}), 400
                notif.send_email(test_pick, crawler.config.email)
            elif channel == 'webhook':
                if not crawler.config.webhook_url:
                    return jsonify({"error": "Webhook URL not set"}), 400
                notif.send_webhook(test_pick, crawler.config.webhook_url)
            elif channel == 'discord':
                if not crawler.config.discord_webhook:
                    return jsonify({"error": "Discord webhook not set"}), 400
                notif.send_discord(test_pick, crawler.config.discord_webhook)
            else:
                return jsonify({"error": f"Unsupported channel: {channel}"}), 400
            return jsonify({"result": f"Test notification sent via {channel}"})
        except Exception as e:
            crawler.logger.error(f"Failed to send test notification: {e}")
            return jsonify({"error": str(e)}), 500

    return app
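
Once the runner has started the API (app/runner.py binds it to 0.0.0.0:8080), these endpoints can be exercised from a small script. A sketch using requests, assuming the service is reachable locally on port 8080:

import requests

BASE = 'http://localhost:8080'  # port taken from app/runner.py

print(requests.get(f'{BASE}/health').json())   # {"status": "healthy", "timestamp": ...}
print(requests.get(f'{BASE}/stats').json())    # snapshot of crawler.stats
print(requests.get(f'{BASE}/check').json())    # triggers an immediate run_check()
print(requests.get(f'{BASE}/notify_test', params={'channel': 'discord'}).json())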

app/config.py (new file)
import os
import logging
from dataclasses import dataclass


@dataclass
class EmailConfig:
    smtp_server: str
    smtp_port: int
    smtp_security: str  # 'ssl' | 'starttls' | 'none'
    from_email: str
    to_email: str
    username: str
    password: str


@dataclass
class AppConfig:
    check_interval: int
    log_level: str
    always_notify_on_startup: bool
    webhook_url: str | None
    discord_webhook: str | None
    data_dir: str
    log_dir: str
    email: EmailConfig | None


def _resolve_dir(env_key: str, default_subdir: str) -> str:
    # Prefer explicit env var
    val = os.getenv(env_key)
    if val:
        return val
    # Prefer Docker paths if present
    docker_path = f"/app/{default_subdir}"
    if os.path.isdir(docker_path):
        return docker_path
    # Fallback to local ./subdir
    return os.path.join(os.getcwd(), default_subdir)


def load_email_config() -> EmailConfig | None:
    required = [
        'EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD'
    ]
    if not all(os.getenv(k) for k in required):
        return None

    security = os.getenv('EMAIL_SMTP_SECURITY', 'starttls').lower()
    default_port = 465 if security == 'ssl' else 587 if security == 'starttls' else 25
    smtp_port = int(os.getenv('EMAIL_SMTP_PORT', default_port))

    return EmailConfig(
        smtp_server=os.getenv('EMAIL_SMTP_SERVER', ''),
        smtp_port=smtp_port,
        smtp_security=security,
        from_email=os.getenv('EMAIL_FROM', ''),
        to_email=os.getenv('EMAIL_TO', ''),
        username=os.getenv('EMAIL_USERNAME', ''),
        password=os.getenv('EMAIL_PASSWORD', ''),
    )


def setup_logging(level: str, log_dir: str) -> logging.Logger:
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        level=getattr(logging, level.upper(), logging.INFO),
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(os.path.join(log_dir, 'crawler.log')),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)


def load_config() -> AppConfig:
    check_interval = int(os.getenv('CHECK_INTERVAL', 300))
    log_level = os.getenv('LOG_LEVEL', 'INFO')
    always_notify_on_startup = os.getenv('ALWAYS_NOTIFY_ON_STARTUP', 'false').lower() in ('1', 'true', 'yes')
    webhook_url = os.getenv('WEBHOOK_URL')
    discord_webhook = os.getenv('DISCORD_WEBHOOK')
    data_dir = _resolve_dir('DATA_DIR', 'data')
    log_dir = _resolve_dir('LOG_DIR', 'logs')

    return AppConfig(
        check_interval=check_interval,
        log_level=log_level,
        always_notify_on_startup=always_notify_on_startup,
        webhook_url=webhook_url,
        discord_webhook=discord_webhook,
        data_dir=data_dir,
        log_dir=log_dir,
        email=load_email_config(),
    )
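
The variables read above map directly onto the .env.template entries mentioned in the commit message. A sketch of the relevant lines with illustrative values (the actual template file is not shown in this diff):

# Scheduling and logging
CHECK_INTERVAL=300
LOG_LEVEL=INFO
ALWAYS_NOTIFY_ON_STARTUP=false

# Storage locations (new options; fall back to /app/<dir> or ./<dir> when unset)
DATA_DIR=./data
LOG_DIR=./logs

# Notification channels (optional)
WEBHOOK_URL=
DISCORD_WEBHOOK=

# Email (EMAIL_SMTP_SERVER, EMAIL_FROM, EMAIL_TO, EMAIL_USERNAME, EMAIL_PASSWORD must all be set)
EMAIL_SMTP_SERVER=smtp.example.com
EMAIL_SMTP_SECURITY=starttls
EMAIL_SMTP_PORT=587
EMAIL_FROM=bot@example.com
EMAIL_TO=you@example.com
EMAIL_USERNAME=bot@example.com
EMAIL_PASSWORD=change-me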

app/crawlers/barrons.py (new file)
from __future__ import annotations

import hashlib
from datetime import datetime
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler


class BarronsCrawler(BaseCrawler):
    def __init__(self, config, logger):
        super().__init__(name="Barron's stock picks", config=config, logger=logger, data_filename='barrons_data.json')
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks: List[Dict] = []
        try:
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]',
            ]
            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"Found content with selector: {selector}")
                    break

            for element in elements[:10]:
                title = element.get_text(strip=True)
                link = element.get('href') if element.name == 'a' else element.find('a', href=True)
                if link is not None and not isinstance(link, str):
                    # element.find() returned a Tag; extract its href
                    link = link.get('href')
                if link and isinstance(link, str) and link.startswith('/'):
                    link = "https://www.barrons.com" + link
                if title and len(title) > 10:
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                    })
            return stock_picks
        except Exception as e:
            self.logger.error(f"Failed to parse page content: {e}")
            self.stats['errors'] += 1
            return []

app/crawlers/base.py (new file)
from __future__ import annotations

import hashlib
import time
import signal
from abc import ABC, abstractmethod
from datetime import datetime
from typing import List, Dict, Optional

import schedule

from app.config import AppConfig
from app.services import storage
from app.services import notifications as notif


class BaseCrawler(ABC):
    def __init__(self, name: str, config: AppConfig, logger, data_filename: str):
        self.name = name
        self.config = config
        self.logger = logger
        self.data_path = storage.data_file_path(config.data_dir, data_filename)

        self.running = True
        self._first_check_done = False

        self.stats = {
            'start_time': datetime.now().isoformat(),
            'total_checks': 0,
            'new_picks_found': 0,
            'last_check': None,
            'last_notification': None,
            'errors': 0,
        }

    # --- Abstract site-specific hooks ---
    @abstractmethod
    def fetch_page(self) -> Optional[str]:
        ...

    @abstractmethod
    def parse_items(self, html_content: str) -> List[Dict]:
        ...

    # --- Generic helpers ---
    def find_new(self, current: List[Dict], previous: List[Dict]) -> List[Dict]:
        prev_hashes = {p.get('hash') for p in previous if 'hash' in p}
        return [p for p in current if p.get('hash') not in prev_hashes]

    # --- Main check ---
    def run_check(self) -> Optional[List[Dict]]:
        self.logger.info(f"Starting check for {self.name}...")
        self.stats['total_checks'] += 1
        self.stats['last_check'] = datetime.now().isoformat()
        try:
            html = self.fetch_page()
            if not html:
                return []
            current = self.parse_items(html)
            if not current:
                self.logger.warning("No content found")
                return []

            prev = storage.load_json(self.data_path).get('stock_picks', [])
            new_items = self.find_new(current, prev)

            if new_items:
                self.logger.info(f"🚨 Found {len(new_items)} new items")
                self.stats['new_picks_found'] += len(new_items)
                self._send_notifications(new_items)
                storage.save_json(self.data_path, {
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current,
                    'stats': self.stats,
                })
                return new_items

            # Optionally notify on first run
            if (not self._first_check_done) and self.config.always_notify_on_startup and current:
                self.logger.info("🟢 First check after startup: no new items, but sending the current list as configured")
                self._send_notifications(current)
                storage.save_json(self.data_path, {
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current,
                    'stats': self.stats,
                })
                return current

            self.logger.info("✅ No new items found")
            return []
        except Exception as e:
            self.logger.error(f"Error during check: {e}")
            self.stats['errors'] += 1
            return None

    def _send_notifications(self, items: List[Dict]) -> None:
        sent = False
        if self.config.email:
            try:
                notif.send_email(items, self.config.email)
                sent = True
            except Exception as e:
                self.logger.error(f"Email notification failed: {e}")
        if self.config.webhook_url:
            try:
                notif.send_webhook(items, self.config.webhook_url)
                sent = True
            except Exception as e:
                self.logger.error(f"Webhook notification failed: {e}")
        if self.config.discord_webhook:
            try:
                notif.send_discord(items, self.config.discord_webhook)
                sent = True
            except Exception as e:
                self.logger.error(f"Discord notification failed: {e}")
        if sent:
            self.stats['last_notification'] = datetime.now().isoformat()

    # --- Run loop ---
    def _signal_handler(self, signum, frame):
        self.logger.info("Received stop signal, shutting down...")
        self.running = False

    def run(self):
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

        schedule.every(self.config.check_interval).seconds.do(self.run_check)
        self.logger.info(f"🚀 Crawler started, checking every {self.config.check_interval} seconds")
        self.run_check()
        self._first_check_done = True
        while self.running:
            schedule.run_pending()
            time.sleep(1)
        self.logger.info("Crawler stopped")

app/crawlers/template.py (new file)
from __future__ import annotations

import hashlib
from datetime import datetime
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a new site.

    Required implementations: fetch_page and parse_items.
    - parse_items should return a list of dicts with title, an optional link, scraped_at, and hash.
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger, data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []

        # TODO: implement parsing for the target site's structure; the loop below is illustrative
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items

app/runner.py (new file)
from __future__ import annotations

import threading

from app.config import load_config, setup_logging
from app.crawlers.barrons import BarronsCrawler
from app.api.server import create_app


def start():
    # Load configuration and set up logging
    config = load_config()
    logger = setup_logging(config.log_level, config.log_dir)

    # Create crawler instance
    crawler = BarronsCrawler(config, logger)

    # Create and start the API in a background thread
    app = create_app(crawler)

    def run_api():
        app.run(host='0.0.0.0', port=8080, debug=False)

    flask_thread = threading.Thread(target=run_api, daemon=True)
    flask_thread.start()

    # Run crawler loop (blocking)
    crawler.run()
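
The runner currently wires up a single BarronsCrawler. One possible way to support additional sites built from the template, sketched here as a hypothetical variant that is not part of this commit (the CRAWLER_SITE variable is an assumption):

# Hypothetical: choose the crawler class via an environment variable.
import os

from app.config import load_config, setup_logging
from app.crawlers.barrons import BarronsCrawler
from app.crawlers.template import TemplateCrawler

CRAWLERS = {
    'barrons': BarronsCrawler,
    'template': TemplateCrawler,  # replace with real site crawlers as they are added
}


def start_selected():
    config = load_config()
    logger = setup_logging(config.log_level, config.log_dir)
    crawler_cls = CRAWLERS.get(os.getenv('CRAWLER_SITE', 'barrons'), BarronsCrawler)
    crawler_cls(config, logger).run()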

app/services/notifications.py (new file)
from datetime import datetime
import hashlib
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import List, Dict, Optional

import requests

from app.config import EmailConfig


def format_email_body(new_picks: List[Dict]) -> str:
    body = f"Found {len(new_picks)} new stock picks:\n\n"
    for pick in new_picks:
        body += f"📊 {pick['title']}\n"
        if pick.get('link'):
            body += f"🔗 {pick['link']}\n"
        body += f"🕒 {pick.get('scraped_at', datetime.now().isoformat())}\n"
        body += "-" * 60 + "\n"
    return body


def send_email(new_picks: List[Dict], cfg: EmailConfig) -> None:
    msg = MIMEMultipart()
    msg['From'] = cfg.from_email
    msg['To'] = cfg.to_email
    msg['Subject'] = f"📈 New Barron's stock picks ({len(new_picks)})"
    msg.attach(MIMEText(format_email_body(new_picks), 'plain', 'utf-8'))

    if cfg.smtp_security == 'ssl':
        server = smtplib.SMTP_SSL(cfg.smtp_server, cfg.smtp_port)
    else:
        server = smtplib.SMTP(cfg.smtp_server, cfg.smtp_port)
        server.ehlo()
        if cfg.smtp_security == 'starttls':
            server.starttls()
            server.ehlo()

    server.login(cfg.username, cfg.password)
    server.send_message(msg)
    server.quit()


def send_webhook(new_picks: List[Dict], url: str) -> None:
    message = f"🚨 Found {len(new_picks)} new Barron's stock picks!\n\n"
    for pick in new_picks[:5]:
        message += f"• {pick['title']}\n"
        if pick.get('link'):
            message += f"  {pick['link']}\n"
        message += "\n"
    payload = {"text": message}
    requests.post(url, json=payload, timeout=10)


def send_discord(new_picks: List[Dict], webhook: str) -> None:
    embed = {
        "title": "📈 New Barron's stock picks",
        "description": f"Found {len(new_picks)} new picks",
        "color": 0x00ff00,
        "fields": [],
    }
    for pick in new_picks[:5]:
        embed["fields"].append({
            "name": pick['title'][:256],
            "value": (pick.get('link') or 'No link')[:1024],
            "inline": False,
        })
    requests.post(webhook, json={"embeds": [embed]}, timeout=10)


def build_test_pick() -> Dict:
    return {
        'title': f"[TEST] Barron's notification test - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        'link': 'https://example.com/test',
        'scraped_at': datetime.now().isoformat(),
        'hash': hashlib.md5(str(datetime.now().timestamp()).encode()).hexdigest()[:8],
    }

app/services/storage.py (new file)
import json
import os
from typing import Any, Dict


def data_file_path(data_dir: str, name: str) -> str:
    os.makedirs(data_dir, exist_ok=True)
    return os.path.join(data_dir, name)


def load_json(path: str) -> Dict[str, Any]:
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return {'last_update': None, 'stock_picks': [], 'stats': {}}


def save_json(path: str, data: Dict[str, Any]) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
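
For reference, a data file written by save_json for the Barron's crawler would look roughly like this, given the keys used in BaseCrawler.run_check (values are illustrative):

{
  "last_update": "2024-01-01T12:00:00",
  "stock_picks": [
    {
      "title": "Example headline from the picks page",
      "link": "https://www.barrons.com/articles/example",
      "scraped_at": "2024-01-01T12:00:00",
      "hash": "1a2b3c4d"
    }
  ],
  "stats": {
    "start_time": "2024-01-01T11:55:00",
    "total_checks": 1,
    "new_picks_found": 1,
    "last_check": "2024-01-01T12:00:00",
    "last_notification": "2024-01-01T12:00:00",
    "errors": 0
  }
}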