From 58cc979b5b8cca294983d43bdd0d199b87f67aea Mon Sep 17 00:00:00 2001
From: MH Hung
Date: Thu, 4 Sep 2025 21:39:24 +0800
Subject: [PATCH] refactor: modularize project structure and separate API from crawlers

- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
---
 .env.template                 |   5 +
 README.md                     | 357 +++++++++--------------------
 app/__init__.py               |   6 +
 app/api/server.py             |  56 +++++
 app/config.py                 |  96 ++++++++
 app/crawlers/barrons.py       |  70 ++++++
 app/crawlers/base.py          | 136 +++++++++++
 app/crawlers/template.py      |  53 +++++
 app/runner.py                 |  29 +++
 app/services/notifications.py |  79 +++++++
 app/services/storage.py       |  23 ++
 enhanced_crawler.py           | 419 +---------------------------------
 12 files changed, 663 insertions(+), 666 deletions(-)
 create mode 100644 app/__init__.py
 create mode 100644 app/api/server.py
 create mode 100644 app/config.py
 create mode 100644 app/crawlers/barrons.py
 create mode 100644 app/crawlers/base.py
 create mode 100644 app/crawlers/template.py
 create mode 100644 app/runner.py
 create mode 100644 app/services/notifications.py
 create mode 100644 app/services/storage.py

diff --git a/.env.template b/.env.template
index 02da85b..6c09325 100644
--- a/.env.template
+++ b/.env.template
@@ -18,3 +18,8 @@ WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK

 # Discord Webhook(可選)
 DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK
+
+# 進階(可選):自訂資料與日誌目錄
+# 預設 Docker 會使用 /app/data、/app/logs;本機則使用 ./data、./logs
+# DATA_DIR=./data
+# LOG_DIR=./logs
diff --git a/README.md b/README.md
index 797e8fa..912df8e 100644
--- a/README.md
+++ b/README.md
@@ -1,281 +1,128 @@
-# Barron's 股票推薦爬蟲 Docker 部署指南
+# Barron's 股票推薦爬蟲(模組化架構)

-## 🚀 快速開始
+一個可擴充的爬蟲服務,內建 HTTP API 與多種通知(Email/Webhook/Discord)。
+現已模組化:API 與爬蟲核心分離,便於未來新增其他網站的爬蟲。

-### 1. 建立專案目錄
+## 功能
+- 定時抓取 Barron's 股票推薦頁面
+- 只在有新內容時發送通知(可設定首次啟動也通知)
+- 內建 `/health`、`/stats`、`/check` 與 `/notify_test` API
+- Docker 化部署,資料與日誌可持久化
+- 架構模組化,易於擴充其他站點
+
+## 專案結構
+```
+app/
+  runner.py                 # 啟動流程(載入設定、啟動 API 與爬蟲)
+  config.py                 # 設定載入與 logging
+  api/server.py             # Flask API
+  crawlers/base.py          # BaseCrawler:通用排程/比對/通知
+  crawlers/barrons.py       # Barron's 爬蟲
+  crawlers/template.py      # 新站點範本(複製後改名擴充)
+  services/storage.py       # JSON 儲存
+  services/notifications.py # Email/Webhook/Discord
+enhanced_crawler.py         # 舊入口,現委派到 app.runner
+Dockerfile
+docker-compose.yml
+requirements.txt
+health_check.py
+.env / .env.template
+data/                       # 持久化資料(預設)
+logs/                       # 持久化日誌(預設)
+```
+
+## 快速開始(Docker)
+1) 建立環境變數檔
 ```bash
-mkdir barrons-crawler
-cd barrons-crawler
+cp .env.template .env
+# 編輯 .env,至少設定你要用到的通知方式(Email/Webhook/Discord 任選)
 ```

-### 2. 創建文件結構
-```
-barrons-crawler/
-├── Dockerfile
-├── docker-compose.yml
-├── requirements.txt
-├── enhanced_crawler.py
-├── health_check.py
-├── .dockerignore
-├── .env              # 環境變數設定檔
-├── data/             # 資料持久化目錄
-└── logs/             # 日誌目錄
-```
-
-### 3. 
設定環境變數 -創建 `.env` 檔案: +2) 啟動服務 ```bash -# 基本設定 -CHECK_INTERVAL=300 -LOG_LEVEL=INFO -ALWAYS_NOTIFY_ON_STARTUP=false # 啟動後第一次必定寄當前清單 - -# 電子郵件通知設定(SMTP) -# 對 Gmail:建議使用應用程式密碼 -# 對學校/企業信箱:請依管理者提供之 SMTP 主機與加密方式設定 -EMAIL_SMTP_SERVER=smtp.gmail.com # 例:mail.ntust.edu.tw -EMAIL_SMTP_PORT=587 # starttls 常用 587;ssl 常用 465 -EMAIL_SMTP_SECURITY=starttls # starttls | ssl | none -EMAIL_FROM=your_email@gmail.com # 例:m10605505@mail.ntust.edu.tw -EMAIL_TO=notification@gmail.com -EMAIL_USERNAME=your_email@gmail.com # 有些伺服器需填完整信箱 -EMAIL_PASSWORD=your_app_specific_password - -# Slack Webhook(可選) -WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK - -# Discord Webhook(可選) -DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK -``` - -### 4. 啟動服務 -```bash -# 使用 Docker Compose 啟動 docker-compose up -d - -# 查看日誌 docker-compose logs -f barrons-crawler ``` -## 📋 詳細設定選項 - -### 電子郵件設定(SMTP) -1. 若使用 Gmail: - - 開啟兩步驟驗證 - - 生成應用程式密碼:https://myaccount.google.com/apppasswords - - 在 `.env` 使用應用程式密碼,而非一般密碼 -2. 若使用學校/企業郵件(如 NTUST): - - 向管理者確認 SMTP 主機、連接埠與加密方式(starttls 或 ssl) - - `EMAIL_USERNAME` 可能需要填完整信箱(例如 `m10605505@mail.ntust.edu.tw`) - -### 啟動後首次通知行為 -- 環境變數 `ALWAYS_NOTIFY_ON_STARTUP` - - `true/1/yes`:服務啟動完成後,第一次檢查即使沒有新內容也會寄出目前清單;之後只在有更新時寄出 - - `false`(預設):只有在偵測到新內容時才寄出 - -### Slack 通知設定 -1. 建立 Slack App: https://api.slack.com/apps -2. 創建 Incoming Webhook -3. 複製 Webhook URL 到 `.env` 檔案 - -### Discord 通知設定 -1. 在 Discord 伺服器創建 Webhook -2. 複製 Webhook URL 到 `.env` 檔案 - -## 🔧 Docker 指令 - -### 基本操作 -```bash -# 建構映像 -docker-compose build - -# 啟動服務 -docker-compose up -d - -# 停止服務 -docker-compose down - -# 重啟服務 -docker-compose restart - -# 查看日誌 -docker-compose logs -f - -# 進入容器 -docker-compose exec barrons-crawler bash -``` - -### 維護指令 -```bash -# 清理停用的容器 -docker system prune - -# 更新並重新建構 -docker-compose down -docker-compose build --no-cache -docker-compose up -d - -# 備份資料 -docker cp barrons-crawler:/app/data ./data_backup -``` - -## 🌐 Web API 端點 - -爬蟲提供了以下 HTTP 端點: - -### 健康檢查 +3) 驗證 API ```bash curl http://localhost:8080/health +curl http://localhost:8080/stats | jq +curl http://localhost:8080/check | jq +curl "http://localhost:8080/notify_test?channel=email" # 或 webhook/discord ``` -回應:`{"status": "healthy", "timestamp": "2024-01-15T10:30:00"}` -### 查看統計資料 +## 本機執行(非 Docker) ```bash -curl http://localhost:8080/stats -``` -回應: -```json -{ - "start_time": "2024-01-15T10:00:00", - "total_checks": 24, - "new_picks_found": 3, - "last_check": "2024-01-15T10:25:00", - "last_notification": "2024-01-15T09:45:00", - "errors": 0 -} +pip install -r requirements.txt +python enhanced_crawler.py ``` -### 手動觸發檢查 +## 環境變數說明 +- 基本 + - `CHECK_INTERVAL`: 檢查間隔(秒),預設 300 + - `LOG_LEVEL`: 日誌等級,預設 `INFO`(可 `DEBUG`) + - `ALWAYS_NOTIFY_ON_STARTUP`: 是否在啟動後第一次就寄出目前清單(true/false),預設 false +- Email(可選) + - `EMAIL_SMTP_SERVER`、`EMAIL_SMTP_PORT`(587/465/25) + - `EMAIL_SMTP_SECURITY`: starttls | ssl | none(預設 starttls) + - `EMAIL_FROM`、`EMAIL_TO`、`EMAIL_USERNAME`、`EMAIL_PASSWORD` +- Webhook(可選) + - `WEBHOOK_URL`: Slack/Teams Incoming Webhook URL +- Discord(可選) + - `DISCORD_WEBHOOK`: Discord Webhook URL +- 進階路徑(可選) + - `DATA_DIR`: 資料輸出路徑(Docker 預設 `/app/data`;本機預設 `./data`) + - `LOG_DIR`: 日誌輸出路徑(Docker 預設 `/app/logs`;本機預設 `./logs`) + +Email 使用建議: +- Gmail 請使用「應用程式密碼」並開啟兩步驟驗證 +- 校園/企業信箱請向管理者確認 SMTP 主機、連接埠與加密方式 + +## Web API 端點 +- `GET /health`: 健康檢查 +- `GET /stats`: 目前統計資訊(啟動時間、檢查次數、錯誤數…) +- `GET /check`: 立即執行一次檢查 +- `GET /notify_test?channel=email|webhook|discord`: 測試通知 + +## 健康檢查與維運 +- 容器內建 HEALTHCHECK(每 30 秒檢查一次 `/health`,連續 3 
次失敗標記為不健康)
+- 常用指令
 ```bash
-# 即時查看日誌
+docker-compose build
+docker-compose up -d
+docker-compose restart
 docker-compose logs -f barrons-crawler
-
-# 查看特定時間的日誌
-docker-compose logs --since "2024-01-15T10:00:00" barrons-crawler
+docker-compose down
 ```
-
-### 資料備份
-```bash
-# 設定定期備份(加到 crontab)
-0 2 * * * docker cp barrons-crawler:/app/data /backup/barrons-$(date +\%Y\%m\%d)
-```
+
+## 資料與日誌
+- 預設位置(Docker):`/app/data`、`/app/logs`(已透過 volume 映射至宿主 `./data`、`./logs`)
+- 檔案格式(以 Barron's 為例:`data/barrons_data.json`)
+  - `last_update`: ISO 時間
+  - `stock_picks`: 文章清單(title/link/hash/scraped_at)
+  - `stats`: 執行統計

-## 🐛 故障排除
+## 擴充新站點(建議流程)
+1) 複製範本:`app/crawlers/template.py` → `app/crawlers/<新站名>.py`(可參考文末附的示意程式)
+2) 實作兩個方法:
+   - `fetch_page(self) -> Optional[str]`:抓取 HTML
+   - `parse_items(self, html: str) -> List[Dict]`:輸出包含 `title`(必要)、建議 `link`、`hash`、`scraped_at`
+3) 啟動方式:
+   - 簡單做法:為新站點建立第二個容器(複製服務段落,指定不同資料檔/埠口)
+   - 進階做法:在 `app/runner.py` 啟動多個爬蟲與多站 API(需擴充 API 路由與執行緒管理)

-### 常見問題
+## 故障排除
+- 取不到網頁:檢查網路、User-Agent、目標網站是否改版
+- Email 失敗:確認 SMTP 設定、應用程式密碼、連接埠與加密方式
+- 解析不到內容:查看日誌,更新選擇器邏輯
+- 服務無回應:檢查容器日誌與健康檢查狀態

-1. **無法獲取網頁內容**
-   ```bash
-   # 檢查網路連線
-   docker-compose exec barrons-crawler curl -I https://www.barrons.com
-   ```
+## 安全建議
+- 不要把密碼放到版本控制;使用 `.env` 並將其列入 `.gitignore`
+- 適度限制通知頻率與內容,避免濫用
+- 若對外開放 API,建議加上認證與 HTTPS

-2. **電子郵件發送失敗**
-   - 檢查 Gmail 應用程式密碼是否正確
-   - 確認兩步驟驗證已開啟
-   - 檢查防火牆設定
+## 版本記事
+- 2025-09:重構為模組化架構,API 與爬蟲邏輯分離,新增擴充範本

-3. **解析內容失敗**
-   - 網頁結構可能已變更
-   - 檢查日誌中的錯誤訊息
-   - 可能需要更新解析邏輯
-
-4. **容器無法啟動**
-   ```bash
-   # 檢查詳細錯誤
-   docker-compose logs barrons-crawler
-
-   # 檢查磁碟空間
-   df -h
-
-   # 檢查埠口占用
-   netstat -tlnp | grep 8080
-   ```
-
-### 調試模式
-```yaml
-# 在 docker-compose.yml 中添加
-environment:
-  - LOG_LEVEL=DEBUG
-
-# 或者進入容器手動執行
-docker-compose exec barrons-crawler python enhanced_crawler.py
-```
-
-## 🔒 安全建議
-
-1. **不要在代碼中硬編碼密碼**
-   - 使用 `.env` 檔案或 Docker secrets
-   - 將 `.env` 加入 `.gitignore`
-
-2. **定期更新依賴**
-   ```bash
-   # 更新基礎映像
-   docker-compose pull
-   docker-compose up -d
-   ```
-
-3. **監控資源使用**
-   ```bash
-   # 查看容器資源使用
-   docker stats barrons-crawler
-   ```
-
-4. **網路安全**
-   - 使用反向代理(如 Nginx)
-   - 設定適當的防火牆規則
-   - 啟用 HTTPS(如果對外開放)
-
-## 📈 擴展功能
-
-### 多實例部署
-```yaml
-# docker-compose.yml
-services:
-  barrons-crawler-1:
-    # ... 設定
-  barrons-crawler-2:
-    # ... 設定
-    environment:
-      - CHECK_INTERVAL=600  # 不同檢查間隔
-```
-
-### 與其他服務整合
-```yaml
-# 加入資料庫
-  postgres:
-    image: postgres:15
-    environment:
-      POSTGRES_DB: barrons
-      POSTGRES_USER: crawler
-      POSTGRES_PASSWORD: password
-```
-
-### 定制通知
-可以擴展 `enhanced_crawler.py` 添加:
-- Line Notify
-- Telegram Bot
-- 推播通知
-- 簡訊通知
-
-## 🎯 最佳實踐
-
-1. **定期監控日誌**
-2. **設定適當的檢查間隔**(避免過於頻繁)
-3. **定期備份資料**
-4. **監控資源使用情況**
-5. **設定適當的通知渠道**
-6. **遵守網站使用條款**
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..332cdc2
--- /dev/null
+++ b/app/__init__.py
@@ -0,0 +1,6 @@
+"""Modular crawler package.
+
+This package separates API, crawlers, services, and configuration
+to support multiple sites in the future.
+""" + diff --git a/app/api/server.py b/app/api/server.py new file mode 100644 index 0000000..40b3816 --- /dev/null +++ b/app/api/server.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from datetime import datetime +from flask import Flask, jsonify, request + +from app.services import notifications as notif + + +def create_app(crawler) -> Flask: + app = Flask(__name__) + + @app.get('/health') + def health(): + return jsonify({"status": "healthy", "timestamp": datetime.now().isoformat()}) + + @app.get('/stats') + def stats(): + if crawler: + return jsonify(crawler.stats) + return jsonify({"error": "Crawler not initialized"}), 500 + + @app.get('/check') + def manual_check(): + if not crawler: + return jsonify({"error": "Crawler not initialized"}), 500 + result = crawler.run_check() or [] + return jsonify({"result": f"Found {len(result)} new picks"}) + + @app.get('/notify_test') + def notify_test(): + if not crawler: + return jsonify({"error": "Crawler not initialized"}), 500 + channel = (request.args.get('channel') or 'email').lower() + test_pick = [notif.build_test_pick()] + try: + if channel == 'email': + if not crawler.config.email: + return jsonify({"error": "Email config not set"}), 400 + notif.send_email(test_pick, crawler.config.email) + elif channel == 'webhook': + if not crawler.config.webhook_url: + return jsonify({"error": "Webhook URL not set"}), 400 + notif.send_webhook(test_pick, crawler.config.webhook_url) + elif channel == 'discord': + if not crawler.config.discord_webhook: + return jsonify({"error": "Discord webhook not set"}), 400 + notif.send_discord(test_pick, crawler.config.discord_webhook) + else: + return jsonify({"error": f"Unsupported channel: {channel}"}), 400 + return jsonify({"result": f"Test notification sent via {channel}"}) + except Exception as e: + crawler.logger.error(f"測試通知發送失敗: {e}") + return jsonify({"error": str(e)}), 500 + + return app + diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..4280b11 --- /dev/null +++ b/app/config.py @@ -0,0 +1,96 @@ +import os +import logging +from dataclasses import dataclass + + +@dataclass +class EmailConfig: + smtp_server: str + smtp_port: int + smtp_security: str # 'ssl' | 'starttls' | 'none' + from_email: str + to_email: str + username: str + password: str + + +@dataclass +class AppConfig: + check_interval: int + log_level: str + always_notify_on_startup: bool + webhook_url: str | None + discord_webhook: str | None + data_dir: str + log_dir: str + email: EmailConfig | None + + +def _resolve_dir(env_key: str, default_subdir: str) -> str: + # Prefer explicit env var + val = os.getenv(env_key) + if val: + return val + # Prefer Docker paths if present + docker_path = f"/app/{default_subdir}" + if os.path.isdir(docker_path): + return docker_path + # Fallback to local ./subdir + return os.path.join(os.getcwd(), default_subdir) + + +def load_email_config() -> EmailConfig | None: + required = [ + 'EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD' + ] + if not all(os.getenv(k) for k in required): + return None + + security = os.getenv('EMAIL_SMTP_SECURITY', 'starttls').lower() + default_port = 465 if security == 'ssl' else 587 if security == 'starttls' else 25 + smtp_port = int(os.getenv('EMAIL_SMTP_PORT', default_port)) + + return EmailConfig( + smtp_server=os.getenv('EMAIL_SMTP_SERVER', ''), + smtp_port=smtp_port, + smtp_security=security, + from_email=os.getenv('EMAIL_FROM', ''), + to_email=os.getenv('EMAIL_TO', ''), + username=os.getenv('EMAIL_USERNAME', 
''), + password=os.getenv('EMAIL_PASSWORD', ''), + ) + + +def setup_logging(level: str, log_dir: str) -> logging.Logger: + os.makedirs(log_dir, exist_ok=True) + logging.basicConfig( + level=getattr(logging, level.upper(), logging.INFO), + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler(os.path.join(log_dir, 'crawler.log')), + logging.StreamHandler(), + ], + ) + return logging.getLogger(__name__) + + +def load_config() -> AppConfig: + check_interval = int(os.getenv('CHECK_INTERVAL', 300)) + log_level = os.getenv('LOG_LEVEL', 'INFO') + always_notify_on_startup = os.getenv('ALWAYS_NOTIFY_ON_STARTUP', 'false').lower() in ('1', 'true', 'yes') + webhook_url = os.getenv('WEBHOOK_URL') + discord_webhook = os.getenv('DISCORD_WEBHOOK') + data_dir = _resolve_dir('DATA_DIR', 'data') + log_dir = _resolve_dir('LOG_DIR', 'logs') + + return AppConfig( + check_interval=check_interval, + log_level=log_level, + always_notify_on_startup=always_notify_on_startup, + webhook_url=webhook_url, + discord_webhook=discord_webhook, + data_dir=data_dir, + log_dir=log_dir, + email=load_email_config(), + ) + diff --git a/app/crawlers/barrons.py b/app/crawlers/barrons.py new file mode 100644 index 0000000..9791700 --- /dev/null +++ b/app/crawlers/barrons.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import hashlib +from datetime import datetime +from typing import List, Dict, Optional + +import requests +from bs4 import BeautifulSoup + +from app.crawlers.base import BaseCrawler + + +class BarronsCrawler(BaseCrawler): + def __init__(self, config, logger): + super().__init__(name="Barron's 股票推薦", config=config, logger=logger, data_filename='barrons_data.json') + self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV" + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + + def fetch_page(self) -> Optional[str]: + try: + resp = requests.get(self.url, headers=self.headers, timeout=30) + resp.raise_for_status() + return resp.text + except requests.RequestException as e: + self.logger.error(f"獲取網頁失敗: {e}") + self.stats['errors'] += 1 + return None + + def parse_items(self, html_content: str) -> List[Dict]: + soup = BeautifulSoup(html_content, 'html.parser') + stock_picks: List[Dict] = [] + try: + selectors = [ + 'article[data-module="ArticleItem"]', + '.WSJTheme--headline', + '.MarketDataModule-headline', + 'h3 a, h4 a', + '[data-module] a[href*="articles"]', + ] + elements = [] + for selector in selectors: + elements = soup.select(selector) + if elements: + self.logger.info(f"使用選擇器找到內容: {selector}") + break + + for element in elements[:10]: + title = element.get_text(strip=True) if element.name != 'a' else element.get_text(strip=True) + link = element.get('href') if element.name == 'a' else element.find('a', href=True) + if isinstance(link, dict): + link = link.get('href') + elif hasattr(link, 'get'): + link = link.get('href') + if link and isinstance(link, str) and link.startswith('/'): + link = "https://www.barrons.com" + link + if title and len(title) > 10: + stock_picks.append({ + 'title': title, + 'link': link, + 'scraped_at': datetime.now().isoformat(), + 'hash': hashlib.md5(title.encode()).hexdigest()[:8], + }) + return stock_picks + except Exception as e: + self.logger.error(f"解析網頁內容失敗: {e}") + self.stats['errors'] += 1 + return [] + diff --git a/app/crawlers/base.py b/app/crawlers/base.py new file mode 100644 index 0000000..472a8c1 --- 
/dev/null +++ b/app/crawlers/base.py @@ -0,0 +1,136 @@ +from __future__ import annotations + +import hashlib +import time +import signal +from abc import ABC, abstractmethod +from datetime import datetime +from typing import List, Dict, Optional + +import schedule + +from app.config import AppConfig +from app.services import storage +from app.services import notifications as notif + + +class BaseCrawler(ABC): + def __init__(self, name: str, config: AppConfig, logger, data_filename: str): + self.name = name + self.config = config + self.logger = logger + self.data_path = storage.data_file_path(config.data_dir, data_filename) + + self.running = True + self._first_check_done = False + + self.stats = { + 'start_time': datetime.now().isoformat(), + 'total_checks': 0, + 'new_picks_found': 0, + 'last_check': None, + 'last_notification': None, + 'errors': 0, + } + + # --- Abstract site-specific hooks --- + @abstractmethod + def fetch_page(self) -> Optional[str]: + ... + + @abstractmethod + def parse_items(self, html_content: str) -> List[Dict]: + ... + + # --- Generic helpers --- + def find_new(self, current: List[Dict], previous: List[Dict]) -> List[Dict]: + prev_hashes = {p.get('hash') for p in previous if 'hash' in p} + return [p for p in current if p.get('hash') not in prev_hashes] + + # --- Main check --- + def run_check(self) -> Optional[List[Dict]]: + self.logger.info(f"開始檢查 {self.name}...") + self.stats['total_checks'] += 1 + self.stats['last_check'] = datetime.now().isoformat() + try: + html = self.fetch_page() + if not html: + return [] + current = self.parse_items(html) + if not current: + self.logger.warning("未找到內容") + return [] + + prev = storage.load_json(self.data_path).get('stock_picks', []) + new_items = self.find_new(current, prev) + + if new_items: + self.logger.info(f"🚨 發現 {len(new_items)} 條新內容") + self.stats['new_picks_found'] += len(new_items) + self._send_notifications(new_items) + storage.save_json(self.data_path, { + 'last_update': datetime.now().isoformat(), + 'stock_picks': current, + 'stats': self.stats, + }) + return new_items + + # Optionally notify on first run + if (not self._first_check_done) and self.config.always_notify_on_startup and current: + self.logger.info("🟢 啟動首次檢查:無新內容,但依設定寄出目前清單") + self._send_notifications(current) + storage.save_json(self.data_path, { + 'last_update': datetime.now().isoformat(), + 'stock_picks': current, + 'stats': self.stats, + }) + return current + + self.logger.info("✅ 沒有發現新內容") + return [] + except Exception as e: + self.logger.error(f"檢查過程錯誤: {e}") + self.stats['errors'] += 1 + return None + + def _send_notifications(self, items: List[Dict]) -> None: + sent = False + if self.config.email: + try: + notif.send_email(items, self.config.email) + sent = True + except Exception as e: + self.logger.error(f"電子郵件通知失敗: {e}") + if self.config.webhook_url: + try: + notif.send_webhook(items, self.config.webhook_url) + sent = True + except Exception as e: + self.logger.error(f"Webhook 通知失敗: {e}") + if self.config.discord_webhook: + try: + notif.send_discord(items, self.config.discord_webhook) + sent = True + except Exception as e: + self.logger.error(f"Discord 通知失敗: {e}") + if sent: + self.stats['last_notification'] = datetime.now().isoformat() + + # --- Run loop --- + def _signal_handler(self, signum, frame): + self.logger.info("收到停止信號,正在關閉...") + self.running = False + + def run(self): + signal.signal(signal.SIGINT, self._signal_handler) + signal.signal(signal.SIGTERM, self._signal_handler) + + 
schedule.every(self.config.check_interval).seconds.do(self.run_check) + self.logger.info(f"🚀 爬蟲已啟動,每 {self.config.check_interval} 秒檢查一次") + self.run_check() + self._first_check_done = True + while self.running: + schedule.run_pending() + time.sleep(1) + self.logger.info("爬蟲已停止") + diff --git a/app/crawlers/template.py b/app/crawlers/template.py new file mode 100644 index 0000000..1fe0976 --- /dev/null +++ b/app/crawlers/template.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from typing import List, Dict, Optional + +import requests +from bs4 import BeautifulSoup +from datetime import datetime +import hashlib + +from app.crawlers.base import BaseCrawler + + +class TemplateCrawler(BaseCrawler): + """範本:建立新站點時複製本檔並改名。 + + 必要實作:fetch_page 與 parse_items + - parse_items 請回傳包含 title、可選 link、scraped_at、hash 的清單 + """ + + def __init__(self, config, logger): + super().__init__(name="Template Site", config=config, logger=logger, data_filename='template_site.json') + self.url = "https://example.com" + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + + def fetch_page(self) -> Optional[str]: + try: + resp = requests.get(self.url, headers=self.headers, timeout=30) + resp.raise_for_status() + return resp.text + except requests.RequestException as e: + self.logger.error(f"獲取網頁失敗: {e}") + self.stats['errors'] += 1 + return None + + def parse_items(self, html_content: str) -> List[Dict]: + soup = BeautifulSoup(html_content, 'html.parser') + items: List[Dict] = [] + + # TODO: 依站點結構實作解析邏輯,以下為示意 + for a in soup.select('a')[:5]: + title = a.get_text(strip=True) + link = a.get('href') + if title and len(title) > 5: + items.append({ + 'title': title, + 'link': link, + 'scraped_at': datetime.now().isoformat(), + 'hash': hashlib.md5(title.encode()).hexdigest()[:8], + }) + return items + diff --git a/app/runner.py b/app/runner.py new file mode 100644 index 0000000..b2ee3ee --- /dev/null +++ b/app/runner.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +import threading + +from app.config import load_config, setup_logging +from app.crawlers.barrons import BarronsCrawler +from app.api.server import create_app + + +def start(): + # Load configuration and setup logging + config = load_config() + logger = setup_logging(config.log_level, config.log_dir) + + # Create crawler instance + crawler = BarronsCrawler(config, logger) + + # Create and start API in background + app = create_app(crawler) + + def run_api(): + app.run(host='0.0.0.0', port=8080, debug=False) + + flask_thread = threading.Thread(target=run_api, daemon=True) + flask_thread.start() + + # Run crawler loop (blocking) + crawler.run() + diff --git a/app/services/notifications.py b/app/services/notifications.py new file mode 100644 index 0000000..76bf104 --- /dev/null +++ b/app/services/notifications.py @@ -0,0 +1,79 @@ +from datetime import datetime +import hashlib +import smtplib +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from typing import List, Dict, Optional + +import requests + +from app.config import EmailConfig + + +def format_email_body(new_picks: List[Dict]) -> str: + body = f"發現 {len(new_picks)} 條新的股票推薦:\n\n" + for pick in new_picks: + body += f"📊 {pick['title']}\n" + if pick.get('link'): + body += f"🔗 {pick['link']}\n" + body += f"🕒 {pick.get('scraped_at', datetime.now().isoformat())}\n" + body += "-" * 60 + "\n" + return body + + +def send_email(new_picks: List[Dict], cfg: 
EmailConfig) -> None: + msg = MIMEMultipart() + msg['From'] = cfg.from_email + msg['To'] = cfg.to_email + msg['Subject'] = f"📈 Barron's 新股票推薦 ({len(new_picks)}條)" + msg.attach(MIMEText(format_email_body(new_picks), 'plain', 'utf-8')) + + if cfg.smtp_security == 'ssl': + server = smtplib.SMTP_SSL(cfg.smtp_server, cfg.smtp_port) + else: + server = smtplib.SMTP(cfg.smtp_server, cfg.smtp_port) + server.ehlo() + if cfg.smtp_security == 'starttls': + server.starttls() + server.ehlo() + + server.login(cfg.username, cfg.password) + server.send_message(msg) + server.quit() + + +def send_webhook(new_picks: List[Dict], url: str) -> None: + message = f"🚨 發現 {len(new_picks)} 條新的 Barron's 股票推薦!\n\n" + for pick in new_picks[:5]: + message += f"• {pick['title']}\n" + if pick.get('link'): + message += f" {pick['link']}\n" + message += "\n" + payload = {"text": message} + requests.post(url, json=payload, timeout=10) + + +def send_discord(new_picks: List[Dict], webhook: str) -> None: + embed = { + "title": "📈 Barron's 新股票推薦", + "description": f"發現 {len(new_picks)} 條新推薦", + "color": 0x00ff00, + "fields": [], + } + for pick in new_picks[:5]: + embed["fields"].append({ + "name": pick['title'][:256], + "value": (pick.get('link') or '無連結')[:1024], + "inline": False, + }) + requests.post(webhook, json={"embeds": [embed]}, timeout=10) + + +def build_test_pick() -> Dict: + return { + 'title': f"[測試] Barron's 通知發送 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", + 'link': 'https://example.com/test', + 'scraped_at': datetime.now().isoformat(), + 'hash': hashlib.md5(str(datetime.now().timestamp()).encode()).hexdigest()[:8], + } + diff --git a/app/services/storage.py b/app/services/storage.py new file mode 100644 index 0000000..aff2ba2 --- /dev/null +++ b/app/services/storage.py @@ -0,0 +1,23 @@ +import json +import os +from typing import Any, Dict + + +def data_file_path(data_dir: str, name: str) -> str: + os.makedirs(data_dir, exist_ok=True) + return os.path.join(data_dir, name) + + +def load_json(path: str) -> Dict[str, Any]: + try: + with open(path, 'r', encoding='utf-8') as f: + return json.load(f) + except FileNotFoundError: + return {'last_update': None, 'stock_picks': [], 'stats': {}} + + +def save_json(path: str, data: Dict[str, Any]) -> None: + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) + diff --git a/enhanced_crawler.py b/enhanced_crawler.py index 2f61c3b..f1276c6 100644 --- a/enhanced_crawler.py +++ b/enhanced_crawler.py @@ -1,416 +1,13 @@ -import requests -from bs4 import BeautifulSoup -import time -import json -import hashlib -from datetime import datetime -import smtplib -from email.mime.text import MIMEText -from email.mime.multipart import MIMEMultipart -import logging -import os -import schedule -from flask import Flask, jsonify, request -import threading -import signal -import sys +"""Back-compat entry point. 
-class EnhancedBarronsCrawler: - def __init__(self): - self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV" - self.headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - self.data_file = "/app/data/barrons_data.json" - self.running = True - - # 從環境變數讀取設定 - self.check_interval = int(os.getenv('CHECK_INTERVAL', 300)) - self.email_config = self.load_email_config() - self.webhook_url = os.getenv('WEBHOOK_URL') - self.discord_webhook = os.getenv('DISCORD_WEBHOOK') - # 啟動時是否強制寄出一次目前內容 - self.always_notify_on_startup = os.getenv('ALWAYS_NOTIFY_ON_STARTUP', 'false').lower() in ('1', 'true', 'yes') - self._first_check_done = False - - # 設定日誌 - log_level = os.getenv('LOG_LEVEL', 'INFO') - logging.basicConfig( - level=getattr(logging, log_level), - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('/app/logs/barrons_crawler.log'), - logging.StreamHandler() - ] - ) - self.logger = logging.getLogger(__name__) - - # 統計資料 - self.stats = { - 'start_time': datetime.now().isoformat(), - 'total_checks': 0, - 'new_picks_found': 0, - 'last_check': None, - 'last_notification': None, - 'errors': 0 - } +The project has been refactored to separate API and crawler logic. +This file now just delegates to the modular runner to keep +Docker and existing commands unchanged. +""" - def load_email_config(self): - """從環境變數載入電子郵件設定""" - if all(os.getenv(key) for key in ['EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD']): - security = os.getenv('EMAIL_SMTP_SECURITY', 'starttls').lower() - # 根據安全機制推導預設連接埠 - default_port = 465 if security == 'ssl' else 587 if security == 'starttls' else 25 - smtp_port = int(os.getenv('EMAIL_SMTP_PORT', default_port)) - return { - 'smtp_server': os.getenv('EMAIL_SMTP_SERVER'), - 'smtp_port': smtp_port, - 'smtp_security': security, # 'ssl' | 'starttls' | 'none' - 'from_email': os.getenv('EMAIL_FROM'), - 'to_email': os.getenv('EMAIL_TO'), - 'username': os.getenv('EMAIL_USERNAME'), - 'password': os.getenv('EMAIL_PASSWORD') - } - return None - - def fetch_page(self): - """獲取網頁內容""" - try: - response = requests.get(self.url, headers=self.headers, timeout=30) - response.raise_for_status() - return response.text - except requests.RequestException as e: - self.logger.error(f"獲取網頁失敗: {e}") - self.stats['errors'] += 1 - return None - - def parse_stock_picks(self, html_content): - """解析股票推薦內容""" - soup = BeautifulSoup(html_content, 'html.parser') - stock_picks = [] - - try: - # 多種選擇器策略 - selectors = [ - 'article[data-module="ArticleItem"]', - '.WSJTheme--headline', - '.MarketDataModule-headline', - 'h3 a, h4 a', - '[data-module] a[href*="articles"]' - ] - - for selector in selectors: - elements = soup.select(selector) - if elements: - self.logger.info(f"使用選擇器找到內容: {selector}") - break - - for element in elements[:10]: # 限制最多10個 - title = element.get_text(strip=True) if element.name != 'a' else element.get_text(strip=True) - link = element.get('href') if element.name == 'a' else element.find('a', href=True) - - if isinstance(link, dict): - link = link.get('href') - elif hasattr(link, 'get'): - link = link.get('href') - - if link and link.startswith('/'): - link = "https://www.barrons.com" + link - - if title and len(title) > 10: # 過濾太短的標題 - stock_picks.append({ - 'title': title, - 'link': link, - 'scraped_at': datetime.now().isoformat(), - 'hash': hashlib.md5(title.encode()).hexdigest()[:8] - }) - - return 
stock_picks - - except Exception as e: - self.logger.error(f"解析網頁內容失敗: {e}") - self.stats['errors'] += 1 - return [] - - def load_previous_data(self): - """載入之前的資料""" - try: - with open(self.data_file, 'r', encoding='utf-8') as f: - return json.load(f) - except FileNotFoundError: - return {'content_hash': None, 'last_update': None, 'stock_picks': []} - - def save_data(self, data): - """儲存資料""" - try: - os.makedirs(os.path.dirname(self.data_file), exist_ok=True) - with open(self.data_file, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=2) - except Exception as e: - self.logger.error(f"儲存資料失敗: {e}") - self.stats['errors'] += 1 - - def send_notifications(self, new_picks): - """發送各種通知""" - notification_sent = False - - # 電子郵件通知 - if self.email_config: - try: - self.send_email_notification(new_picks) - notification_sent = True - except Exception as e: - self.logger.error(f"電子郵件通知失敗: {e}") - - # Slack/Teams Webhook - if self.webhook_url: - try: - self.send_webhook_notification(new_picks) - notification_sent = True - except Exception as e: - self.logger.error(f"Webhook 通知失敗: {e}") - - # Discord Webhook - if self.discord_webhook: - try: - self.send_discord_notification(new_picks) - notification_sent = True - except Exception as e: - self.logger.error(f"Discord 通知失敗: {e}") - - if notification_sent: - self.stats['last_notification'] = datetime.now().isoformat() - - def send_email_notification(self, new_picks): - """發送電子郵件通知""" - msg = MIMEMultipart() - msg['From'] = self.email_config['from_email'] - msg['To'] = self.email_config['to_email'] - msg['Subject'] = f"📈 Barron's 新股票推薦 ({len(new_picks)}條)" - - body = f"發現 {len(new_picks)} 條新的股票推薦:\n\n" - for pick in new_picks: - body += f"📊 {pick['title']}\n" - if pick.get('link'): - body += f"🔗 {pick['link']}\n" - body += f"🕒 {pick['scraped_at']}\n" - body += "-" * 60 + "\n" - - msg.attach(MIMEText(body, 'plain', 'utf-8')) - - smtp_server = self.email_config['smtp_server'] - smtp_port = self.email_config['smtp_port'] - security = self.email_config.get('smtp_security', 'starttls') - - if security == 'ssl': - server = smtplib.SMTP_SSL(smtp_server, smtp_port) - else: - server = smtplib.SMTP(smtp_server, smtp_port) - server.ehlo() - if security == 'starttls': - server.starttls() - server.ehlo() - - server.login(self.email_config['username'], self.email_config['password']) - server.send_message(msg) - server.quit() - - def send_webhook_notification(self, new_picks): - """發送 Webhook 通知(Slack/Teams)""" - message = f"🚨 發現 {len(new_picks)} 條新的 Barron's 股票推薦!\n\n" - for pick in new_picks[:5]: # 限制5條避免訊息太長 - message += f"📊 {pick['title']}\n" - if pick.get('link'): - message += f"🔗 {pick['link']}\n" - - payload = {"text": message} - requests.post(self.webhook_url, json=payload) - - def send_discord_notification(self, new_picks): - """發送 Discord 通知""" - embed = { - "title": f"📈 Barron's 新股票推薦", - "description": f"發現 {len(new_picks)} 條新推薦", - "color": 0x00ff00, - "fields": [] - } - - for pick in new_picks[:5]: - embed["fields"].append({ - "name": pick['title'][:256], - "value": pick.get('link', '無連結')[:1024], - "inline": False - }) - - payload = {"embeds": [embed]} - requests.post(self.discord_webhook, json=payload) - - def find_new_picks(self, current_picks, previous_picks): - """找出新的股票推薦""" - previous_hashes = {pick['hash'] for pick in previous_picks if 'hash' in pick} - return [pick for pick in current_picks if pick['hash'] not in previous_hashes] - - def run_check(self): - """執行一次檢查""" - self.logger.info("開始檢查 Barron's 股票推薦...") - 
self.stats['total_checks'] += 1 - self.stats['last_check'] = datetime.now().isoformat() - - try: - # 獲取和解析內容 - html_content = self.fetch_page() - if not html_content: - return - - current_picks = self.parse_stock_picks(html_content) - if not current_picks: - self.logger.warning("未找到股票推薦內容") - return - - # 載入之前的資料 - previous_data = self.load_previous_data() - previous_picks = previous_data.get('stock_picks', []) - - # 檢查新內容 - new_picks = self.find_new_picks(current_picks, previous_picks) - - if new_picks: - self.logger.info(f"🚨 發現 {len(new_picks)} 條新推薦") - self.stats['new_picks_found'] += len(new_picks) - - # 發送通知 - self.send_notifications(new_picks) - - # 儲存資料 - new_data = { - 'last_update': datetime.now().isoformat(), - 'stock_picks': current_picks, - 'stats': self.stats - } - self.save_data(new_data) - - return new_picks - else: - # 啟動後第一次且啟用 ALWAYS_NOTIFY_ON_STARTUP,則寄出目前內容 - if (not self._first_check_done) and self.always_notify_on_startup and current_picks: - self.logger.info("🟢 啟動首次檢查:沒有新內容,但已依設定寄出目前清單") - # 發送通知(使用全部目前項目) - self.send_notifications(current_picks) - # 儲存資料(仍以目前清單為準) - new_data = { - 'last_update': datetime.now().isoformat(), - 'stock_picks': current_picks, - 'stats': self.stats - } - self.save_data(new_data) - return current_picks - - self.logger.info("✅ 沒有發現新內容") - return [] - - except Exception as e: - self.logger.error(f"檢查過程中發生錯誤: {e}") - self.stats['errors'] += 1 - return None - - def signal_handler(self, signum, frame): - """處理停止信號""" - self.logger.info("收到停止信號,正在關閉...") - self.running = False - - def run(self): - """主運行循環""" - # 註冊信號處理 - signal.signal(signal.SIGINT, self.signal_handler) - signal.signal(signal.SIGTERM, self.signal_handler) - - # 使用 schedule 庫進行調度 - schedule.every(self.check_interval).seconds.do(self.run_check) - - self.logger.info(f"🚀 爬蟲已啟動,每 {self.check_interval} 秒檢查一次") - - # 立即執行一次檢查 - self.run_check() - self._first_check_done = True - - while self.running: - schedule.run_pending() - time.sleep(1) - - self.logger.info("爬蟲已停止") - - -# Flask Web API -app = Flask(__name__) -crawler_instance = None - -@app.route('/health') -def health_check(): - """健康檢查端點""" - return jsonify({"status": "healthy", "timestamp": datetime.now().isoformat()}) - -@app.route('/stats') -def get_stats(): - """獲取統計資料""" - if crawler_instance: - return jsonify(crawler_instance.stats) - return jsonify({"error": "Crawler not initialized"}) - -@app.route('/check') -def manual_check(): - """手動觸發檢查""" - if crawler_instance: - result = crawler_instance.run_check() - return jsonify({"result": f"Found {len(result) if result else 0} new picks"}) - return jsonify({"error": "Crawler not initialized"}) - - -@app.route('/notify_test') -def notify_test(): - """手動測試通知(預設只寄 Email)。可加參數 ?channel=email|webhook|discord""" - if not crawler_instance: - return jsonify({"error": "Crawler not initialized"}), 500 - - channel = (request.args.get('channel') or 'email').lower() - test_pick = [{ - 'title': f"[測試] Barron's 通知發送 - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", - 'link': 'https://example.com/test', - 'scraped_at': datetime.now().isoformat(), - 'hash': hashlib.md5(str(datetime.now().timestamp()).encode()).hexdigest()[:8] - }] - - try: - if channel == 'email': - if not crawler_instance.email_config: - return jsonify({"error": "Email config not set"}), 400 - crawler_instance.send_email_notification(test_pick) - elif channel == 'webhook': - if not crawler_instance.webhook_url: - return jsonify({"error": "Webhook URL not set"}), 400 - crawler_instance.send_webhook_notification(test_pick) - 
elif channel == 'discord':
-            if not crawler_instance.discord_webhook:
-                return jsonify({"error": "Discord webhook not set"}), 400
-            crawler_instance.send_discord_notification(test_pick)
-        else:
-            return jsonify({"error": f"Unsupported channel: {channel}"}), 400
-        return jsonify({"result": f"Test notification sent via {channel}"})
-    except Exception as e:
-        crawler_instance.logger.error(f"測試通知發送失敗: {e}")
-        return jsonify({"error": str(e)}), 500
-
-def run_flask_app():
-    """運行 Flask 應用"""
-    app.run(host='0.0.0.0', port=8080, debug=False)
+from app.runner import start


 if __name__ == "__main__":
-    # 創建爬蟲實例
-    crawler_instance = EnhancedBarronsCrawler()
-
-    # 在背景執行 Flask API
-    flask_thread = threading.Thread(target=run_flask_app, daemon=True)
-    flask_thread.start()
-
-    # 運行主爬蟲
-    crawler_instance.run()
+    start()
+
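
以下為 README「擴充新站點(建議流程)」步驟 1)、2) 的最小示意:只實作 `fetch_page` 與 `parse_items`,排程、比對與通知都交給 `BaseCrawler`。這段程式並非本次提交的內容,其中 `ExampleCrawler`、`example.com` 與選擇器 `h2 a[href]` 均為假設,請依實際站點調整。

```python
# 示意:app/crawlers/example.py(假設的新站點爬蟲)
from __future__ import annotations

import hashlib
from datetime import datetime
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler


class ExampleCrawler(BaseCrawler):
    def __init__(self, config, logger):
        # data_filename 決定此站點的 JSON 資料檔名稱(存於 DATA_DIR 下)
        super().__init__(name="Example Site", config=config, logger=logger,
                         data_filename='example_site.json')
        self.url = "https://example.com/news"  # 假設的目標網址
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def fetch_page(self) -> Optional[str]:
        # 抓取 HTML;失敗時記錄錯誤並回傳 None,讓 BaseCrawler 略過本次檢查
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html: str) -> List[Dict]:
        # 解析出 title(必要)與 link、scraped_at、hash,供 find_new 比對新內容
        soup = BeautifulSoup(html, 'html.parser')
        items: List[Dict] = []
        for a in soup.select('h2 a[href]')[:10]:  # 假設的選擇器,請依站點結構調整
            title = a.get_text(strip=True)
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': a.get('href'),
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
```

檔案放進 `app/crawlers/` 後,「簡單做法」即可比照 `BarronsCrawler`,用第二個容器各自啟動。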
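
至於步驟 3) 的「進階做法」(同一行程跑多個爬蟲),一種可能的寫法是共用同一個 `schedule` 排程器,輪流執行各爬蟲的 `run_check()`;API 仍以 `create_app(crawler)` 綁定單一爬蟲,若要查詢多站統計需另行擴充 `app/api/server.py` 的路由。以下僅為示意,`start_multi` 與 `ExampleCrawler` 皆為假設名稱:

```python
# 示意:多爬蟲排程(可放在 app/runner.py 或獨立腳本)
import threading
import time

import schedule

from app.api.server import create_app
from app.config import load_config, setup_logging
from app.crawlers.barrons import BarronsCrawler
# from app.crawlers.example import ExampleCrawler  # 假設已依範本流程新增


def start_multi():
    config = load_config()
    logger = setup_logging(config.log_level, config.log_dir)

    crawlers = [BarronsCrawler(config, logger)]
    # crawlers.append(ExampleCrawler(config, logger))  # 依需要加入其他站點

    # 背景啟動 API(目前一次只綁定一個爬蟲)
    api = create_app(crawlers[0])
    threading.Thread(target=lambda: api.run(host='0.0.0.0', port=8080),
                     daemon=True).start()

    for c in crawlers:
        c.run_check()                # 啟動時先各跑一次
        c._first_check_done = True   # 比照 BaseCrawler.run() 的行為
        schedule.every(config.check_interval).seconds.do(c.run_check)

    # 共用排程器的主迴圈(示意:未處理 SIGTERM 等停止信號)
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    start_multi()
```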