feat: init project

commit 852f206d2e
2025-09-03 16:47:02 +08:00
8 changed files with 1015 additions and 0 deletions

.env.template (new file, 17 lines)

@@ -0,0 +1,17 @@
# Basic settings
CHECK_INTERVAL=300
LOG_LEVEL=INFO
# Email notification settings (Gmail example)
EMAIL_SMTP_SERVER=smtp.gmail.com
EMAIL_SMTP_PORT=587
EMAIL_FROM=your_email@gmail.com
EMAIL_TO=notification@gmail.com
EMAIL_USERNAME=your_email@gmail.com
EMAIL_PASSWORD=your_app_specific_password
# Slack Webhook (optional)
WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
# Discord Webhook (optional)
DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK

.gitignore (vendored, new file, 276 lines)

@@ -0,0 +1,276 @@
# === Python ===
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# === Crawler project specific ===
# Data files
data/
*.json
*.csv
*.xlsx
*.db
# Log files
logs/
*.log
*.out
# Cache files
cache/
.cache/
# Backup files
backup/
*.backup
*.bak
# Config files (contain sensitive information)
config.ini
settings.json
credentials.json
# Temporary files
temp/
tmp/
*.tmp
# === Docker ===
# Docker-related (keep docker-compose.yml and Dockerfile)
.dockerignore
# === IDE / editors ===
# VSCode
.vscode/
*.code-workspace
# PyCharm
.idea/
*.iws
*.iml
*.ipr
# Sublime Text
*.sublime-project
*.sublime-workspace
# Vim
*.swp
*.swo
*~
# Emacs
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# === System files ===
# macOS
.DS_Store
.AppleDouble
.LSOverride
Icon
._*
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Windows
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
*.stackdump
[Dd]esktop.ini
$RECYCLE.BIN/
*.cab
*.msi
*.msix
*.msm
*.msp
*.lnk
# Linux
*~
.fuse_hidden*
.directory
.Trash-*
.nfs*
# === Security ===
# API keys and passwords
*.key
*.pem
*.p12
*.pfx
secrets.json
.secrets/
# Environment variable files
.env.local
.env.development
.env.test
.env.production
# === Miscellaneous ===
# Archive files
*.zip
*.rar
*.7z
*.tar.gz
*.tar.bz2
# Images (if they do not need version control)
# *.jpg
# *.jpeg
# *.png
# *.gif
# *.bmp
# *.svg
# Document files (if they are generated output)
# *.pdf
# *.doc
# *.docx
# Test output
test_output/
screenshots/
# Profiling
*.prof

Dockerfile (new file, 36 lines)

@@ -0,0 +1,36 @@
# Dockerfile
FROM python:3.11-slim

# Set the working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Create data and log directories
RUN mkdir -p /app/data /app/logs

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python health_check.py || exit 1

# Expose the port (for the health-check API)
EXPOSE 8080

# Run the crawler (the entry point in this repo is enhanced_crawler.py)
CMD ["python", "enhanced_crawler.py"]

README.md (new file, 268 lines)

@@ -0,0 +1,268 @@
# Barron's Stock Picks Crawler: Docker Deployment Guide

## 🚀 Quick Start

### 1. Create the project directory

```bash
mkdir barrons-crawler
cd barrons-crawler
```

### 2. Create the file structure

```
barrons-crawler/
├── Dockerfile
├── docker-compose.yml
├── requirements.txt
├── enhanced_crawler.py
├── health_check.py
├── .dockerignore
├── .env              # environment variable settings
├── data/             # persisted data directory
└── logs/             # log directory
```

### 3. Configure environment variables

Create a `.env` file:

```bash
# Basic settings
CHECK_INTERVAL=300
LOG_LEVEL=INFO

# Email notification settings (Gmail example)
EMAIL_SMTP_SERVER=smtp.gmail.com
EMAIL_SMTP_PORT=587
EMAIL_FROM=your_email@gmail.com
EMAIL_TO=notification@gmail.com
EMAIL_USERNAME=your_email@gmail.com
EMAIL_PASSWORD=your_app_specific_password

# Slack Webhook (optional)
WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK

# Discord Webhook (optional)
DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK
```

### 4. Start the service

```bash
# Start with Docker Compose
docker-compose up -d

# Tail the logs
docker-compose logs -f barrons-crawler
```

## 📋 Detailed Configuration Options

### Email settings (Gmail)

1. Enable two-step verification on the Gmail account
2. Generate an app password: https://myaccount.google.com/apppasswords
3. Use the app password in `.env`, not your regular password

### Slack notification setup

1. Create a Slack App: https://api.slack.com/apps
2. Create an Incoming Webhook
3. Copy the webhook URL into your `.env` file (a minimal sketch follows)
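
Once the URL is in place, posting to an Incoming Webhook is a plain HTTP POST. A minimal sketch of what the crawler's `send_webhook_notification` does, assuming the `requests` package and a placeholder URL:

```python
import requests

# Placeholder: replace with the Incoming Webhook URL from your Slack App.
WEBHOOK_URL = "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"

def notify_slack(text: str) -> None:
    """Post a plain-text message to a Slack Incoming Webhook."""
    response = requests.post(WEBHOOK_URL, json={"text": text}, timeout=10)
    response.raise_for_status()  # Slack answers 200 on success

notify_slack("🚨 Test message from barrons-crawler")
```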
### Discord notification setup

1. Create a webhook on your Discord server
2. Copy the webhook URL into your `.env` file

## 🔧 Docker Commands

### Basic operations

```bash
# Build the image
docker-compose build

# Start the services
docker-compose up -d

# Stop the services
docker-compose down

# Restart the services
docker-compose restart

# View logs
docker-compose logs -f

# Open a shell in the container
docker-compose exec barrons-crawler bash
```

### Maintenance commands

```bash
# Clean up stopped containers
docker system prune

# Update and rebuild
docker-compose down
docker-compose build --no-cache
docker-compose up -d

# Back up data
docker cp barrons-crawler:/app/data ./data_backup
```

## 🌐 Web API Endpoints

The crawler exposes the following HTTP endpoints:

### Health check

```bash
curl http://localhost:8080/health
```

Response: `{"status": "healthy", "timestamp": "2024-01-15T10:30:00"}`

### View statistics

```bash
curl http://localhost:8080/stats
```

Response:

```json
{
  "start_time": "2024-01-15T10:00:00",
  "total_checks": 24,
  "new_picks_found": 3,
  "last_check": "2024-01-15T10:25:00",
  "last_notification": "2024-01-15T09:45:00",
  "errors": 0
}
```

### Trigger a check manually

```bash
curl http://localhost:8080/check
```
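
The same endpoints can be polled from a script. A small hypothetical monitor, assuming the default `8080:8080` port mapping from `docker-compose.yml`:

```python
import requests

BASE_URL = "http://localhost:8080"  # default port mapping from docker-compose.yml

def poll_stats() -> None:
    """Fetch crawler statistics and flag accumulated errors."""
    stats = requests.get(f"{BASE_URL}/stats", timeout=5).json()
    print(f"checks={stats['total_checks']}  new_picks={stats['new_picks_found']}  errors={stats['errors']}")
    if stats["errors"]:
        print("⚠️ the crawler has logged errors, inspect the logs")

if __name__ == "__main__":
    poll_stats()
```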
## 📊 Monitoring and Alerts

### Health check

The Docker container includes an automatic health check:

- Checks every 30 seconds
- Marks the container unhealthy after 3 consecutive failures
- Can drive automatic restart policies

### Log monitoring

```bash
# Tail logs in real time
docker-compose logs -f barrons-crawler

# View logs since a specific time
docker-compose logs --since "2024-01-15T10:00:00" barrons-crawler
```

### Data backup

```bash
# Schedule a periodic backup (add to crontab)
0 2 * * * docker cp barrons-crawler:/app/data /backup/barrons-$(date +\%Y\%m\%d)
```

## 🐛 Troubleshooting

### Common issues

1. **Cannot fetch the page**

   ```bash
   # Check network connectivity
   docker-compose exec barrons-crawler curl -I https://www.barrons.com
   ```

2. **Email delivery fails** (see the SMTP sketch after this list)

   - Check that the Gmail app password is correct
   - Confirm that two-step verification is enabled
   - Check firewall settings

3. **Content parsing fails** (a selector test sketch follows the debug-mode section)

   - The page structure may have changed
   - Check the error messages in the logs
   - The parsing logic may need updating

4. **Container will not start**

   ```bash
   # Check the detailed error
   docker-compose logs barrons-crawler

   # Check disk space
   df -h

   # Check for port conflicts
   netstat -tlnp | grep 8080
   ```
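
To rule the crawler out when email fails (issue 2), the SMTP credentials can be tested on their own. A minimal sketch, assuming the same `EMAIL_*` variables from `.env` are exported in the shell:

```python
import os
import smtplib

# Assumes the EMAIL_* variables from .env are exported in the environment.
server = os.getenv("EMAIL_SMTP_SERVER", "smtp.gmail.com")
port = int(os.getenv("EMAIL_SMTP_PORT", "587"))
username = os.getenv("EMAIL_USERNAME")
password = os.getenv("EMAIL_PASSWORD")

if not (username and password):
    raise SystemExit("EMAIL_USERNAME / EMAIL_PASSWORD are not set")

try:
    with smtplib.SMTP(server, port, timeout=10) as smtp:
        smtp.starttls()                 # Gmail requires TLS on port 587
        smtp.login(username, password)  # fails fast on a wrong app password
    print("SMTP login OK")
except smtplib.SMTPAuthenticationError as e:
    print(f"Authentication failed (check the app password): {e}")
```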
### Debug mode

```yaml
# Add to docker-compose.yml
environment:
  - LOG_LEVEL=DEBUG
```

```bash
# Or run the crawler manually inside the container
docker-compose exec barrons-crawler python enhanced_crawler.py
```
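
When parsing fails (issue 3 above), the selector list from `enhanced_crawler.py` can also be exercised in isolation to see which strategy still matches. A sketch, assuming `requests` and `beautifulsoup4` are installed locally:

```python
import requests
from bs4 import BeautifulSoup

# Same URL and selector strategies as enhanced_crawler.py
URL = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
SELECTORS = [
    'article[data-module="ArticleItem"]',
    '.WSJTheme--headline',
    '.MarketDataModule-headline',
    'h3 a, h4 a',
    '[data-module] a[href*="articles"]',
]

html = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30).text
soup = BeautifulSoup(html, 'html.parser')
for selector in SELECTORS:
    matches = soup.select(selector)
    print(f"{selector!r}: {len(matches)} matches")  # 0 everywhere means the page changed
```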
## 🔒 Security Recommendations

1. **Never hard-code passwords in the code** (see the sketch after this list)

   - Use a `.env` file or Docker secrets
   - Add `.env` to `.gitignore`

2. **Update dependencies regularly**

   ```bash
   # Update the base image
   docker-compose pull
   docker-compose up -d
   ```

3. **Monitor resource usage**

   ```bash
   # Show container resource usage
   docker stats barrons-crawler
   ```

4. **Network security**

   - Use a reverse proxy (e.g. Nginx)
   - Configure appropriate firewall rules
   - Enable HTTPS (if exposed publicly)
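
A minimal sketch of point 1, reading secrets from the environment rather than the source, the same way `enhanced_crawler.py` uses `os.getenv`:

```python
import os

# Read secrets from the environment; never commit real values.
# The second argument is a fallback used when the variable is unset.
smtp_server = os.getenv("EMAIL_SMTP_SERVER", "smtp.gmail.com")
smtp_port = int(os.getenv("EMAIL_SMTP_PORT", "587"))
password = os.getenv("EMAIL_PASSWORD")  # no fallback: fail loudly if missing

if password is None:
    raise RuntimeError("EMAIL_PASSWORD is not set, check your .env file")
```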
## 📈 Extended Features

### Multi-instance deployment

```yaml
# docker-compose.yml
services:
  barrons-crawler-1:
    # ... settings
  barrons-crawler-2:
    # ... settings
    environment:
      - CHECK_INTERVAL=600  # different check interval
```

### Integration with other services

```yaml
# Add a database
postgres:
  image: postgres:15
  environment:
    POSTGRES_DB: barrons
    POSTGRES_USER: crawler
    POSTGRES_PASSWORD: password
```
### Custom notifications

`enhanced_crawler.py` can be extended with:

- LINE Notify
- Telegram Bot (see the sketch below)
- Push notifications
- SMS notifications
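
For example, a hedged sketch of a Telegram notifier using the Bot API's `sendMessage` method; the token and chat ID are placeholders you would obtain from @BotFather and your own chat:

```python
import os
import requests

# Placeholders: create a bot via @BotFather to get a token,
# and use your own chat ID as the destination.
TELEGRAM_TOKEN = os.getenv("TELEGRAM_TOKEN", "YOUR_BOT_TOKEN")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "YOUR_CHAT_ID")

def send_telegram_notification(new_picks: list[dict]) -> None:
    """Send new picks to a Telegram chat via the Bot API."""
    text = f"🚨 Found {len(new_picks)} new Barron's stock picks!\n\n"
    for pick in new_picks[:5]:  # cap at 5 to keep the message short
        text += f"📊 {pick['title']}\n"
        if pick.get("link"):
            text += f"🔗 {pick['link']}\n"
    url = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/sendMessage"
    requests.post(url, json={"chat_id": TELEGRAM_CHAT_ID, "text": text}, timeout=10)
```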
## 🎯 Best Practices

1. **Monitor the logs regularly**
2. **Choose a sensible check interval** (avoid overly frequent checks)
3. **Back up data regularly**
4. **Monitor resource usage**
5. **Configure appropriate notification channels**
6. **Respect the website's terms of service**

docker-compose.yml (new file, 50 lines)

@@ -0,0 +1,50 @@
# docker-compose.yml
version: '3.8'

services:
  barrons-crawler:
    build: .
    container_name: barrons-crawler
    restart: unless-stopped
    environment:
      # Crawler settings
      - CHECK_INTERVAL=300  # 5 minutes
      - LOG_LEVEL=INFO
      # Email settings (optional)
      - EMAIL_SMTP_SERVER=smtp.gmail.com
      - EMAIL_SMTP_PORT=587
      - EMAIL_FROM=your_email@gmail.com
      - EMAIL_TO=notification@gmail.com
      - EMAIL_USERNAME=your_email@gmail.com
      - EMAIL_PASSWORD=your_app_password
      # Webhook settings (optional)
      - WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
      # Discord settings (optional)
      - DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK
    volumes:
      - ./data:/app/data  # data persistence
      - ./logs:/app/logs  # log persistence
    ports:
      - "8080:8080"  # health check and status API
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # Optional: add Redis for caching
  redis:
    image: redis:7-alpine
    container_name: barrons-redis
    restart: unless-stopped
    volumes:
      - redis_data:/data
    command: redis-server --appendonly yes

volumes:
  redis_data:

enhanced_crawler.py (new file, 348 lines)

@@ -0,0 +1,348 @@
import requests
from bs4 import BeautifulSoup
import time
import json
import hashlib
from datetime import datetime
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import logging
import os
import schedule
from flask import Flask, jsonify
import threading
import signal
import sys


class EnhancedBarronsCrawler:
    def __init__(self):
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.data_file = "/app/data/barrons_data.json"
        self.running = True

        # Read settings from environment variables
        self.check_interval = int(os.getenv('CHECK_INTERVAL', 300))
        self.email_config = self.load_email_config()
        self.webhook_url = os.getenv('WEBHOOK_URL')
        self.discord_webhook = os.getenv('DISCORD_WEBHOOK')

        # Configure logging
        log_level = os.getenv('LOG_LEVEL', 'INFO')
        logging.basicConfig(
            level=getattr(logging, log_level),
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('/app/logs/barrons_crawler.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # Run statistics
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'total_checks': 0,
            'new_picks_found': 0,
            'last_check': None,
            'last_notification': None,
            'errors': 0
        }

    def load_email_config(self):
        """Load email settings from environment variables."""
        if all(os.getenv(key) for key in ['EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD']):
            return {
                'smtp_server': os.getenv('EMAIL_SMTP_SERVER'),
                'smtp_port': int(os.getenv('EMAIL_SMTP_PORT', 587)),
                'from_email': os.getenv('EMAIL_FROM'),
                'to_email': os.getenv('EMAIL_TO'),
                'username': os.getenv('EMAIL_USERNAME'),
                'password': os.getenv('EMAIL_PASSWORD')
            }
        return None

    def fetch_page(self):
        """Fetch the page content."""
        try:
            response = requests.get(self.url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_stock_picks(self, html_content):
        """Parse stock picks from the page."""
        soup = BeautifulSoup(html_content, 'html.parser')
        stock_picks = []
        try:
            # Multiple selector strategies, tried in order
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]'
            ]
            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"Found content with selector: {selector}")
                    break
            for element in elements[:10]:  # cap at 10 items
                title = element.get_text(strip=True)
                if element.name == 'a':
                    link = element.get('href')
                else:
                    anchor = element.find('a', href=True)
                    link = anchor.get('href') if anchor else None
                if link and link.startswith('/'):
                    link = "https://www.barrons.com" + link
                if title and len(title) > 10:  # filter out titles that are too short
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8]
                    })
            return stock_picks
        except Exception as e:
            self.logger.error(f"Failed to parse page content: {e}")
            self.stats['errors'] += 1
            return []

    def load_previous_data(self):
        """Load previously saved data."""
        try:
            with open(self.data_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            return {'content_hash': None, 'last_update': None, 'stock_picks': []}

    def save_data(self, data):
        """Persist data to disk."""
        try:
            os.makedirs(os.path.dirname(self.data_file), exist_ok=True)
            with open(self.data_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        except Exception as e:
            self.logger.error(f"Failed to save data: {e}")
            self.stats['errors'] += 1

    def send_notifications(self, new_picks):
        """Send all configured notifications."""
        notification_sent = False
        # Email notification
        if self.email_config:
            try:
                self.send_email_notification(new_picks)
                notification_sent = True
            except Exception as e:
                self.logger.error(f"Email notification failed: {e}")
        # Slack/Teams webhook
        if self.webhook_url:
            try:
                self.send_webhook_notification(new_picks)
                notification_sent = True
            except Exception as e:
                self.logger.error(f"Webhook notification failed: {e}")
        # Discord webhook
        if self.discord_webhook:
            try:
                self.send_discord_notification(new_picks)
                notification_sent = True
            except Exception as e:
                self.logger.error(f"Discord notification failed: {e}")
        if notification_sent:
            self.stats['last_notification'] = datetime.now().isoformat()

    def send_email_notification(self, new_picks):
        """Send an email notification."""
        msg = MIMEMultipart()
        msg['From'] = self.email_config['from_email']
        msg['To'] = self.email_config['to_email']
        msg['Subject'] = f"📈 Barron's new stock picks ({len(new_picks)})"
        body = f"Found {len(new_picks)} new stock picks:\n\n"
        for pick in new_picks:
            body += f"📊 {pick['title']}\n"
            if pick.get('link'):
                body += f"🔗 {pick['link']}\n"
            body += f"🕒 {pick['scraped_at']}\n"
            body += "-" * 60 + "\n"
        msg.attach(MIMEText(body, 'plain', 'utf-8'))
        server = smtplib.SMTP(self.email_config['smtp_server'], self.email_config['smtp_port'])
        server.starttls()
        server.login(self.email_config['username'], self.email_config['password'])
        server.send_message(msg)
        server.quit()

    def send_webhook_notification(self, new_picks):
        """Send a webhook notification (Slack/Teams)."""
        message = f"🚨 Found {len(new_picks)} new Barron's stock picks!\n\n"
        for pick in new_picks[:5]:  # cap at 5 to keep the message short
            message += f"📊 {pick['title']}\n"
            if pick.get('link'):
                message += f"🔗 {pick['link']}\n"
        payload = {"text": message}
        requests.post(self.webhook_url, json=payload)

    def send_discord_notification(self, new_picks):
        """Send a Discord notification."""
        embed = {
            "title": "📈 Barron's new stock picks",
            "description": f"Found {len(new_picks)} new picks",
            "color": 0x00ff00,
            "fields": []
        }
        for pick in new_picks[:5]:
            embed["fields"].append({
                "name": pick['title'][:256],
                "value": pick.get('link', 'no link')[:1024],
                "inline": False
            })
        payload = {"embeds": [embed]}
        requests.post(self.discord_webhook, json=payload)

    def find_new_picks(self, current_picks, previous_picks):
        """Return picks whose hashes have not been seen before."""
        previous_hashes = {pick['hash'] for pick in previous_picks if 'hash' in pick}
        return [pick for pick in current_picks if pick['hash'] not in previous_hashes]

    def run_check(self):
        """Run a single check."""
        self.logger.info("Checking Barron's stock picks...")
        self.stats['total_checks'] += 1
        self.stats['last_check'] = datetime.now().isoformat()
        try:
            # Fetch and parse the page
            html_content = self.fetch_page()
            if not html_content:
                return
            current_picks = self.parse_stock_picks(html_content)
            if not current_picks:
                self.logger.warning("No stock picks found")
                return
            # Load previously saved data
            previous_data = self.load_previous_data()
            previous_picks = previous_data.get('stock_picks', [])
            # Look for new content
            new_picks = self.find_new_picks(current_picks, previous_picks)
            if new_picks:
                self.logger.info(f"🚨 Found {len(new_picks)} new picks")
                self.stats['new_picks_found'] += len(new_picks)
                # Send notifications
                self.send_notifications(new_picks)
                # Persist the data
                new_data = {
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current_picks,
                    'stats': self.stats
                }
                self.save_data(new_data)
                return new_picks
            else:
                self.logger.info("✅ No new content found")
                return []
        except Exception as e:
            self.logger.error(f"Error during check: {e}")
            self.stats['errors'] += 1
            return None

    def signal_handler(self, signum, frame):
        """Handle shutdown signals."""
        self.logger.info("Received stop signal, shutting down...")
        self.running = False

    def run(self):
        """Main run loop."""
        # Register signal handlers
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        # Schedule periodic checks
        schedule.every(self.check_interval).seconds.do(self.run_check)
        self.logger.info(f"🚀 Crawler started, checking every {self.check_interval} seconds")
        # Run one check immediately
        self.run_check()
        while self.running:
            schedule.run_pending()
            time.sleep(1)
        self.logger.info("Crawler stopped")


# Flask Web API
app = Flask(__name__)
crawler_instance = None


@app.route('/health')
def health_check():
    """Health check endpoint."""
    return jsonify({"status": "healthy", "timestamp": datetime.now().isoformat()})


@app.route('/stats')
def get_stats():
    """Return run statistics."""
    if crawler_instance:
        return jsonify(crawler_instance.stats)
    return jsonify({"error": "Crawler not initialized"})


@app.route('/check')
def manual_check():
    """Trigger a check manually."""
    if crawler_instance:
        result = crawler_instance.run_check()
        return jsonify({"result": f"Found {len(result) if result else 0} new picks"})
    return jsonify({"error": "Crawler not initialized"})


def run_flask_app():
    """Run the Flask app."""
    app.run(host='0.0.0.0', port=8080, debug=False)


if __name__ == "__main__":
    # Create the crawler instance
    crawler_instance = EnhancedBarronsCrawler()
    # Run the Flask API in a background thread
    flask_thread = threading.Thread(target=run_flask_app, daemon=True)
    flask_thread.start()
    # Run the main crawler loop
    crawler_instance.run()

health_check.py (new file, 15 lines)

@@ -0,0 +1,15 @@
#!/usr/bin/env python3
import requests
import sys

try:
    response = requests.get('http://localhost:8080/health', timeout=5)
    if response.status_code == 200:
        print("Health check passed")
        sys.exit(0)
    else:
        print(f"Health check failed: {response.status_code}")
        sys.exit(1)
except Exception as e:
    print(f"Health check error: {e}")
    sys.exit(1)

requirements.txt (new file, 5 lines)

@@ -0,0 +1,5 @@
# requirements.txt
requests==2.31.0
beautifulsoup4==4.12.2
schedule==1.2.0
flask==2.3.3