refactor: modularize project structure and separate API from crawlers

- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
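The back-compat entry mentioned above (enhanced_crawler.py) is not shown in this view; given that it only delegates to app.runner, it presumably reduces to a thin shim along these lines (a sketch, not the committed file):

# enhanced_crawler.py -- hypothetical back-compat shim; the actual file is not shown in this diff
from app.runner import start

if __name__ == '__main__':
    start()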
2025-09-04 21:39:24 +08:00
parent 099f156e6f
commit 58cc979b5b
12 changed files with 663 additions and 666 deletions

app/__init__.py (new file, +6 lines)

@@ -0,0 +1,6 @@
"""Modular crawler package.
This package separates API, crawlers, services, and configuration
to support multiple sites in the future.
"""

app/api/server.py (new file, +56 lines)

@@ -0,0 +1,56 @@
from __future__ import annotations

from datetime import datetime

from flask import Flask, jsonify, request

from app.services import notifications as notif


def create_app(crawler) -> Flask:
    app = Flask(__name__)

    @app.get('/health')
    def health():
        return jsonify({"status": "healthy", "timestamp": datetime.now().isoformat()})

    @app.get('/stats')
    def stats():
        if crawler:
            return jsonify(crawler.stats)
        return jsonify({"error": "Crawler not initialized"}), 500

    @app.get('/check')
    def manual_check():
        if not crawler:
            return jsonify({"error": "Crawler not initialized"}), 500
        result = crawler.run_check() or []
        return jsonify({"result": f"Found {len(result)} new picks"})

    @app.get('/notify_test')
    def notify_test():
        if not crawler:
            return jsonify({"error": "Crawler not initialized"}), 500
        channel = (request.args.get('channel') or 'email').lower()
        test_pick = [notif.build_test_pick()]
        try:
            if channel == 'email':
                if not crawler.config.email:
                    return jsonify({"error": "Email config not set"}), 400
                notif.send_email(test_pick, crawler.config.email)
            elif channel == 'webhook':
                if not crawler.config.webhook_url:
                    return jsonify({"error": "Webhook URL not set"}), 400
                notif.send_webhook(test_pick, crawler.config.webhook_url)
            elif channel == 'discord':
                if not crawler.config.discord_webhook:
                    return jsonify({"error": "Discord webhook not set"}), 400
                notif.send_discord(test_pick, crawler.config.discord_webhook)
            else:
                return jsonify({"error": f"Unsupported channel: {channel}"}), 400
            return jsonify({"result": f"Test notification sent via {channel}"})
        except Exception as e:
            crawler.logger.error(f"測試通知發送失敗: {e}")
            return jsonify({"error": str(e)}), 500

    return app
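A quick way to exercise these endpoints once the runner is up is a small client script (a sketch, not part of the commit; it assumes the default 0.0.0.0:8080 binding from app/runner.py and a reachable localhost):

# smoke_test_api.py -- hypothetical manual check of the routes defined in create_app()
import requests

BASE = "http://localhost:8080"  # app/runner.py starts Flask on port 8080

print(requests.get(f"{BASE}/health", timeout=5).json())
print(requests.get(f"{BASE}/stats", timeout=5).json())
# Returns a 400 JSON error if the chosen channel is not configured
print(requests.get(f"{BASE}/notify_test", params={"channel": "discord"}, timeout=30).json())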

app/config.py (new file, +96 lines)

@@ -0,0 +1,96 @@
import os
import logging
from dataclasses import dataclass


@dataclass
class EmailConfig:
    smtp_server: str
    smtp_port: int
    smtp_security: str  # 'ssl' | 'starttls' | 'none'
    from_email: str
    to_email: str
    username: str
    password: str


@dataclass
class AppConfig:
    check_interval: int
    log_level: str
    always_notify_on_startup: bool
    webhook_url: str | None
    discord_webhook: str | None
    data_dir: str
    log_dir: str
    email: EmailConfig | None


def _resolve_dir(env_key: str, default_subdir: str) -> str:
    # Prefer explicit env var
    val = os.getenv(env_key)
    if val:
        return val
    # Prefer Docker paths if present
    docker_path = f"/app/{default_subdir}"
    if os.path.isdir(docker_path):
        return docker_path
    # Fallback to local ./subdir
    return os.path.join(os.getcwd(), default_subdir)


def load_email_config() -> EmailConfig | None:
    required = [
        'EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD'
    ]
    if not all(os.getenv(k) for k in required):
        return None
    security = os.getenv('EMAIL_SMTP_SECURITY', 'starttls').lower()
    default_port = 465 if security == 'ssl' else 587 if security == 'starttls' else 25
    smtp_port = int(os.getenv('EMAIL_SMTP_PORT', default_port))
    return EmailConfig(
        smtp_server=os.getenv('EMAIL_SMTP_SERVER', ''),
        smtp_port=smtp_port,
        smtp_security=security,
        from_email=os.getenv('EMAIL_FROM', ''),
        to_email=os.getenv('EMAIL_TO', ''),
        username=os.getenv('EMAIL_USERNAME', ''),
        password=os.getenv('EMAIL_PASSWORD', ''),
    )


def setup_logging(level: str, log_dir: str) -> logging.Logger:
    os.makedirs(log_dir, exist_ok=True)
    logging.basicConfig(
        level=getattr(logging, level.upper(), logging.INFO),
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(os.path.join(log_dir, 'crawler.log')),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)


def load_config() -> AppConfig:
    check_interval = int(os.getenv('CHECK_INTERVAL', 300))
    log_level = os.getenv('LOG_LEVEL', 'INFO')
    always_notify_on_startup = os.getenv('ALWAYS_NOTIFY_ON_STARTUP', 'false').lower() in ('1', 'true', 'yes')
    webhook_url = os.getenv('WEBHOOK_URL')
    discord_webhook = os.getenv('DISCORD_WEBHOOK')
    data_dir = _resolve_dir('DATA_DIR', 'data')
    log_dir = _resolve_dir('LOG_DIR', 'logs')
    return AppConfig(
        check_interval=check_interval,
        log_level=log_level,
        always_notify_on_startup=always_notify_on_startup,
        webhook_url=webhook_url,
        discord_webhook=discord_webhook,
        data_dir=data_dir,
        log_dir=log_dir,
        email=load_email_config(),
    )
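For reference, a minimal environment that load_config() would accept might look as follows. This is an illustrative sketch only: the keys are the ones read above, the values are placeholders, and the EMAIL_* group (omitted here) is optional unless email notifications are wanted.

# hypothetical_env.py -- illustrative values only; normally these live in .env / the container env
import os

os.environ.update({
    "CHECK_INTERVAL": "300",
    "LOG_LEVEL": "INFO",
    "ALWAYS_NOTIFY_ON_STARTUP": "false",
    "DATA_DIR": "./data",   # new in this commit; otherwise falls back to /app/data or ./data
    "LOG_DIR": "./logs",    # new in this commit; otherwise falls back to /app/logs or ./logs
    "DISCORD_WEBHOOK": "https://discord.com/api/webhooks/<id>/<token>",  # optional channel
})

from app.config import load_config

cfg = load_config()
print(cfg.data_dir, cfg.log_dir, cfg.check_interval)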

app/crawlers/barrons.py (new file, +70 lines)

@@ -0,0 +1,70 @@
from __future__ import annotations

import hashlib
from datetime import datetime
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler


class BarronsCrawler(BaseCrawler):
    def __init__(self, config, logger):
        super().__init__(name="Barron's 股票推薦", config=config, logger=logger, data_filename='barrons_data.json')
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks: List[Dict] = []
        try:
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]',
            ]
            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    break
            for element in elements[:10]:
                title = element.get_text(strip=True)
                link = element.get('href') if element.name == 'a' else element.find('a', href=True)
                if isinstance(link, dict):
                    link = link.get('href')
                elif hasattr(link, 'get'):
                    link = link.get('href')
                if link and isinstance(link, str) and link.startswith('/'):
                    link = "https://www.barrons.com" + link
                if title and len(title) > 10:
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                    })
            return stock_picks
        except Exception as e:
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []

app/crawlers/base.py (new file, +136 lines)

@@ -0,0 +1,136 @@
from __future__ import annotations

import hashlib
import time
import signal
from abc import ABC, abstractmethod
from datetime import datetime
from typing import List, Dict, Optional

import schedule

from app.config import AppConfig
from app.services import storage
from app.services import notifications as notif


class BaseCrawler(ABC):
    def __init__(self, name: str, config: AppConfig, logger, data_filename: str):
        self.name = name
        self.config = config
        self.logger = logger
        self.data_path = storage.data_file_path(config.data_dir, data_filename)
        self.running = True
        self._first_check_done = False
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'total_checks': 0,
            'new_picks_found': 0,
            'last_check': None,
            'last_notification': None,
            'errors': 0,
        }

    # --- Abstract site-specific hooks ---
    @abstractmethod
    def fetch_page(self) -> Optional[str]:
        ...

    @abstractmethod
    def parse_items(self, html_content: str) -> List[Dict]:
        ...

    # --- Generic helpers ---
    def find_new(self, current: List[Dict], previous: List[Dict]) -> List[Dict]:
        prev_hashes = {p.get('hash') for p in previous if 'hash' in p}
        return [p for p in current if p.get('hash') not in prev_hashes]

    # --- Main check ---
    def run_check(self) -> Optional[List[Dict]]:
        self.logger.info(f"開始檢查 {self.name}...")
        self.stats['total_checks'] += 1
        self.stats['last_check'] = datetime.now().isoformat()
        try:
            html = self.fetch_page()
            if not html:
                return []
            current = self.parse_items(html)
            if not current:
                self.logger.warning("未找到內容")
                return []
            prev = storage.load_json(self.data_path).get('stock_picks', [])
            new_items = self.find_new(current, prev)
            if new_items:
                self.logger.info(f"🚨 發現 {len(new_items)} 條新內容")
                self.stats['new_picks_found'] += len(new_items)
                self._send_notifications(new_items)
                storage.save_json(self.data_path, {
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current,
                    'stats': self.stats,
                })
                return new_items
            # Optionally notify on first run
            if (not self._first_check_done) and self.config.always_notify_on_startup and current:
                self.logger.info("🟢 啟動首次檢查:無新內容,但依設定寄出目前清單")
                self._send_notifications(current)
                storage.save_json(self.data_path, {
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current,
                    'stats': self.stats,
                })
                return current
            self.logger.info("✅ 沒有發現新內容")
            return []
        except Exception as e:
            self.logger.error(f"檢查過程錯誤: {e}")
            self.stats['errors'] += 1
            return None

    def _send_notifications(self, items: List[Dict]) -> None:
        sent = False
        if self.config.email:
            try:
                notif.send_email(items, self.config.email)
                sent = True
            except Exception as e:
                self.logger.error(f"電子郵件通知失敗: {e}")
        if self.config.webhook_url:
            try:
                notif.send_webhook(items, self.config.webhook_url)
                sent = True
            except Exception as e:
                self.logger.error(f"Webhook 通知失敗: {e}")
        if self.config.discord_webhook:
            try:
                notif.send_discord(items, self.config.discord_webhook)
                sent = True
            except Exception as e:
                self.logger.error(f"Discord 通知失敗: {e}")
        if sent:
            self.stats['last_notification'] = datetime.now().isoformat()

    # --- Run loop ---
    def _signal_handler(self, signum, frame):
        self.logger.info("收到停止信號,正在關閉...")
        self.running = False

    def run(self):
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)
        schedule.every(self.config.check_interval).seconds.do(self.run_check)
        self.logger.info(f"🚀 爬蟲已啟動,每 {self.config.check_interval} 秒檢查一次")
        self.run_check()
        self._first_check_done = True
        while self.running:
            schedule.run_pending()
            time.sleep(1)
        self.logger.info("爬蟲已停止")

app/crawlers/template.py (new file, +53 lines)

@@ -0,0 +1,53 @@
from __future__ import annotations

from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a new site.

    Required overrides: fetch_page and parse_items.
    - parse_items should return a list of dicts with title, an optional link,
      scraped_at, and hash.
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger, data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []
        # TODO: implement site-specific parsing; the loop below is only a placeholder
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
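Once a copy of this template has been renamed and its two hooks implemented, wiring it up mirrors what app/runner.py does for BarronsCrawler. A sketch with a hypothetical MySiteCrawler (the module name and class are placeholders, not part of this commit):

# run_mysite.py -- hypothetical wiring for a crawler derived from the template
from app.config import load_config, setup_logging
from app.crawlers.mysite import MySiteCrawler  # hypothetical module copied from template.py

config = load_config()
logger = setup_logging(config.log_level, config.log_dir)
MySiteCrawler(config, logger).run()  # blocking loop; checks every config.check_interval seconds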

app/runner.py (new file, +29 lines)

@@ -0,0 +1,29 @@
from __future__ import annotations

import threading

from app.config import load_config, setup_logging
from app.crawlers.barrons import BarronsCrawler
from app.api.server import create_app


def start():
    # Load configuration and setup logging
    config = load_config()
    logger = setup_logging(config.log_level, config.log_dir)

    # Create crawler instance
    crawler = BarronsCrawler(config, logger)

    # Create and start API in background
    app = create_app(crawler)

    def run_api():
        app.run(host='0.0.0.0', port=8080, debug=False)

    flask_thread = threading.Thread(target=run_api, daemon=True)
    flask_thread.start()

    # Run crawler loop (blocking)
    crawler.run()

app/services/notifications.py (new file, +79 lines)

@@ -0,0 +1,79 @@
from datetime import datetime
import hashlib
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from typing import List, Dict, Optional

import requests

from app.config import EmailConfig


def format_email_body(new_picks: List[Dict]) -> str:
    body = f"發現 {len(new_picks)} 條新的股票推薦:\n\n"
    for pick in new_picks:
        body += f"📊 {pick['title']}\n"
        if pick.get('link'):
            body += f"🔗 {pick['link']}\n"
        body += f"🕒 {pick.get('scraped_at', datetime.now().isoformat())}\n"
        body += "-" * 60 + "\n"
    return body


def send_email(new_picks: List[Dict], cfg: EmailConfig) -> None:
    msg = MIMEMultipart()
    msg['From'] = cfg.from_email
    msg['To'] = cfg.to_email
    msg['Subject'] = f"📈 Barron's 新股票推薦 ({len(new_picks)}條)"
    msg.attach(MIMEText(format_email_body(new_picks), 'plain', 'utf-8'))
    if cfg.smtp_security == 'ssl':
        server = smtplib.SMTP_SSL(cfg.smtp_server, cfg.smtp_port)
    else:
        server = smtplib.SMTP(cfg.smtp_server, cfg.smtp_port)
        server.ehlo()
        if cfg.smtp_security == 'starttls':
            server.starttls()
            server.ehlo()
    server.login(cfg.username, cfg.password)
    server.send_message(msg)
    server.quit()


def send_webhook(new_picks: List[Dict], url: str) -> None:
    message = f"🚨 發現 {len(new_picks)} 條新的 Barron's 股票推薦!\n\n"
    for pick in new_picks[:5]:
        message += f"{pick['title']}\n"
        if pick.get('link'):
            message += f"  {pick['link']}\n"
        message += "\n"
    payload = {"text": message}
    requests.post(url, json=payload, timeout=10)


def send_discord(new_picks: List[Dict], webhook: str) -> None:
    embed = {
        "title": "📈 Barron's 新股票推薦",
        "description": f"發現 {len(new_picks)} 條新推薦",
        "color": 0x00ff00,
        "fields": [],
    }
    for pick in new_picks[:5]:
        embed["fields"].append({
            "name": pick['title'][:256],
            "value": (pick.get('link') or '無連結')[:1024],
            "inline": False,
        })
    requests.post(webhook, json={"embeds": [embed]}, timeout=10)


def build_test_pick() -> Dict:
    return {
        'title': f"[測試] Barron's 通知發送 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        'link': 'https://example.com/test',
        'scraped_at': datetime.now().isoformat(),
        'hash': hashlib.md5(str(datetime.now().timestamp()).encode()).hexdigest()[:8],
    }

app/services/storage.py (new file, +23 lines)

@@ -0,0 +1,23 @@
import json
import os
from typing import Any, Dict


def data_file_path(data_dir: str, name: str) -> str:
    os.makedirs(data_dir, exist_ok=True)
    return os.path.join(data_dir, name)


def load_json(path: str) -> Dict[str, Any]:
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return {'last_update': None, 'stock_picks': [], 'stats': {}}


def save_json(path: str, data: Dict[str, Any]) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)