feat: init project
This commit is contained in:
17
.env.template
Normal file
17
.env.template
Normal file
@@ -0,0 +1,17 @@
|
||||
# 基本設定
|
||||
CHECK_INTERVAL=300
|
||||
LOG_LEVEL=INFO
|
||||
|
||||
# 電子郵件通知設定(Gmail 範例)
|
||||
EMAIL_SMTP_SERVER=smtp.gmail.com
|
||||
EMAIL_SMTP_PORT=587
|
||||
EMAIL_FROM=your_email@gmail.com
|
||||
EMAIL_TO=notification@gmail.com
|
||||
EMAIL_USERNAME=your_email@gmail.com
|
||||
EMAIL_PASSWORD=your_app_specific_password
|
||||
|
||||
# Slack Webhook(可選)
|
||||
WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
|
||||
|
||||
# Discord Webhook(可選)
|
||||
DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK
|
276
.gitignore
vendored
Normal file
276
.gitignore
vendored
Normal file
@@ -0,0 +1,276 @@
|
||||
# === Python ===
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# === 爬蟲專案特定 ===
|
||||
# 資料檔案
|
||||
data/
|
||||
*.json
|
||||
*.csv
|
||||
*.xlsx
|
||||
*.db
|
||||
|
||||
# 日誌檔案
|
||||
logs/
|
||||
*.log
|
||||
*.out
|
||||
|
||||
# 快取檔案
|
||||
cache/
|
||||
.cache/
|
||||
|
||||
# 備份檔案
|
||||
backup/
|
||||
*.backup
|
||||
*.bak
|
||||
|
||||
# 設定檔案(包含敏感資訊)
|
||||
config.ini
|
||||
settings.json
|
||||
credentials.json
|
||||
|
||||
# 臨時檔案
|
||||
temp/
|
||||
tmp/
|
||||
*.tmp
|
||||
|
||||
# === Docker ===
|
||||
# Docker相關(保留 docker-compose.yml 和 Dockerfile)
|
||||
.dockerignore
|
||||
|
||||
# === IDE/編輯器 ===
|
||||
# VSCode
|
||||
.vscode/
|
||||
*.code-workspace
|
||||
|
||||
# PyCharm
|
||||
.idea/
|
||||
*.iws
|
||||
*.iml
|
||||
*.ipr
|
||||
|
||||
# Sublime Text
|
||||
*.sublime-project
|
||||
*.sublime-workspace
|
||||
|
||||
# Vim
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Emacs
|
||||
*~
|
||||
\#*\#
|
||||
/.emacs.desktop
|
||||
/.emacs.desktop.lock
|
||||
*.elc
|
||||
auto-save-list
|
||||
tramp
|
||||
.\#*
|
||||
|
||||
# === 系統檔案 ===
|
||||
# macOS
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
Icon
|
||||
._*
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Windows
|
||||
Thumbs.db
|
||||
Thumbs.db:encryptable
|
||||
ehthumbs.db
|
||||
ehthumbs_vista.db
|
||||
*.stackdump
|
||||
[Dd]esktop.ini
|
||||
$RECYCLE.BIN/
|
||||
*.cab
|
||||
*.msi
|
||||
*.msix
|
||||
*.msm
|
||||
*.msp
|
||||
*.lnk
|
||||
|
||||
# Linux
|
||||
*~
|
||||
.fuse_hidden*
|
||||
.directory
|
||||
.Trash-*
|
||||
.nfs*
|
||||
|
||||
# === 安全相關 ===
|
||||
# API 金鑰和密碼
|
||||
*.key
|
||||
*.pem
|
||||
*.p12
|
||||
*.pfx
|
||||
secrets.json
|
||||
.secrets/
|
||||
|
||||
# 環境變數檔案
|
||||
.env.local
|
||||
.env.development
|
||||
.env.test
|
||||
.env.production
|
||||
|
||||
# === 其他 ===
|
||||
# 壓縮檔案
|
||||
*.zip
|
||||
*.rar
|
||||
*.7z
|
||||
*.tar.gz
|
||||
*.tar.bz2
|
||||
|
||||
# 圖片(如果不需要版控)
|
||||
# *.jpg
|
||||
# *.jpeg
|
||||
# *.png
|
||||
# *.gif
|
||||
# *.bmp
|
||||
# *.svg
|
||||
|
||||
# 文件檔案(如果是輸出結果)
|
||||
# *.pdf
|
||||
# *.doc
|
||||
# *.docx
|
||||
|
||||
# 測試輸出
|
||||
test_output/
|
||||
screenshots/
|
||||
|
||||
# 性能分析
|
||||
*.prof
|
36
Dockerfile
Normal file
36
Dockerfile
Normal file
@@ -0,0 +1,36 @@
|
||||
# Dockerfile
FROM python:3.11-slim

# Set the working directory
WORKDIR /app

# Install system dependencies (curl is used by the docker-compose healthcheck)
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements file first so the dependency layer is cached
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application files
COPY . .

# Create the data and log directories the crawler writes to
RUN mkdir -p /app/data /app/logs

# Environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python health_check.py || exit 1

# Expose the port used by the health-check / status API
EXPOSE 8080

# Run the crawler. The entry point shipped in this project is
# enhanced_crawler.py; "crawler.py" does not exist and made the container exit.
CMD ["python", "enhanced_crawler.py"]
|
268
README.md
Normal file
268
README.md
Normal file
@@ -0,0 +1,268 @@
|
||||
# Barron's 股票推薦爬蟲 Docker 部署指南
|
||||
|
||||
## 🚀 快速開始
|
||||
|
||||
### 1. 建立專案目錄
|
||||
```bash
|
||||
mkdir barrons-crawler
|
||||
cd barrons-crawler
|
||||
```
|
||||
|
||||
### 2. 創建文件結構
|
||||
```
|
||||
barrons-crawler/
|
||||
├── Dockerfile
|
||||
├── docker-compose.yml
|
||||
├── requirements.txt
|
||||
├── enhanced_crawler.py
|
||||
├── health_check.py
|
||||
├── .dockerignore
|
||||
├── .env # 環境變數設定檔
|
||||
├── data/ # 資料持久化目錄
|
||||
└── logs/ # 日誌目錄
|
||||
```
|
||||
|
||||
### 3. 設定環境變數
|
||||
創建 `.env` 檔案(可直接複製專案內附的範本:`cp .env.template .env`,再填入實際設定):
|
||||
```bash
|
||||
# 基本設定
|
||||
CHECK_INTERVAL=300
|
||||
LOG_LEVEL=INFO
|
||||
|
||||
# 電子郵件通知設定(Gmail 範例)
|
||||
EMAIL_SMTP_SERVER=smtp.gmail.com
|
||||
EMAIL_SMTP_PORT=587
|
||||
EMAIL_FROM=your_email@gmail.com
|
||||
EMAIL_TO=notification@gmail.com
|
||||
EMAIL_USERNAME=your_email@gmail.com
|
||||
EMAIL_PASSWORD=your_app_specific_password
|
||||
|
||||
# Slack Webhook(可選)
|
||||
WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
|
||||
|
||||
# Discord Webhook(可選)
|
||||
DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK
|
||||
```
|
||||
|
||||
### 4. 啟動服務
|
||||
```bash
|
||||
# 使用 Docker Compose 啟動
|
||||
docker-compose up -d
|
||||
|
||||
# 查看日誌
|
||||
docker-compose logs -f barrons-crawler
|
||||
```
|
||||
|
||||
## 📋 詳細設定選項
|
||||
|
||||
### 電子郵件設定(Gmail)
|
||||
1. 開啟 Gmail 的兩步驟驗證
|
||||
2. 生成應用程式密碼:https://myaccount.google.com/apppasswords
|
||||
3. 在 `.env` 中使用應用程式密碼,不是一般密碼
|
||||
|
||||
### Slack 通知設定
|
||||
1. 建立 Slack App: https://api.slack.com/apps
|
||||
2. 創建 Incoming Webhook
|
||||
3. 複製 Webhook URL 到 `.env` 檔案
|
||||
|
||||
### Discord 通知設定
|
||||
1. 在 Discord 伺服器創建 Webhook
|
||||
2. 複製 Webhook URL 到 `.env` 檔案
|
||||
|
||||
## 🔧 Docker 指令
|
||||
|
||||
### 基本操作
|
||||
```bash
|
||||
# 建構映像
|
||||
docker-compose build
|
||||
|
||||
# 啟動服務
|
||||
docker-compose up -d
|
||||
|
||||
# 停止服務
|
||||
docker-compose down
|
||||
|
||||
# 重啟服務
|
||||
docker-compose restart
|
||||
|
||||
# 查看日誌
|
||||
docker-compose logs -f
|
||||
|
||||
# 進入容器
|
||||
docker-compose exec barrons-crawler bash
|
||||
```
|
||||
|
||||
### 維護指令
|
||||
```bash
|
||||
# 清理停用的容器
|
||||
docker system prune
|
||||
|
||||
# 更新並重新建構
|
||||
docker-compose down
|
||||
docker-compose build --no-cache
|
||||
docker-compose up -d
|
||||
|
||||
# 備份資料
|
||||
docker cp barrons-crawler:/app/data ./data_backup
|
||||
```
|
||||
|
||||
## 🌐 Web API 端點
|
||||
|
||||
爬蟲提供了以下 HTTP 端點:
|
||||
|
||||
### 健康檢查
|
||||
```bash
|
||||
curl http://localhost:8080/health
|
||||
```
|
||||
回應:`{"status": "healthy", "timestamp": "2024-01-15T10:30:00"}`
|
||||
|
||||
### 查看統計資料
|
||||
```bash
|
||||
curl http://localhost:8080/stats
|
||||
```
|
||||
回應:
|
||||
```json
|
||||
{
|
||||
"start_time": "2024-01-15T10:00:00",
|
||||
"total_checks": 24,
|
||||
"new_picks_found": 3,
|
||||
"last_check": "2024-01-15T10:25:00",
|
||||
"last_notification": "2024-01-15T09:45:00",
|
||||
"errors": 0
|
||||
}
|
||||
```
|
||||
|
||||
### 手動觸發檢查
|
||||
```bash
|
||||
curl http://localhost:8080/check
|
||||
```
|
||||
|
||||
## 📊 監控和警報
|
||||
|
||||
### 健康檢查
|
||||
Docker 容器包含自動健康檢查:
|
||||
- 每30秒檢查一次
|
||||
- 3次失敗後標記為不健康
|
||||
- 可用於自動重啟策略
|
||||
|
||||
### 日誌監控
|
||||
```bash
|
||||
# 即時查看日誌
|
||||
docker-compose logs -f barrons-crawler
|
||||
|
||||
# 查看特定時間的日誌
|
||||
docker-compose logs --since "2024-01-15T10:00:00" barrons-crawler
|
||||
```
|
||||
|
||||
### 資料備份
|
||||
```bash
|
||||
# 設定定期備份(加到 crontab)
|
||||
0 2 * * * docker cp barrons-crawler:/app/data /backup/barrons-$(date +\%Y\%m\%d)
|
||||
```
|
||||
|
||||
## 🐛 故障排除
|
||||
|
||||
### 常見問題
|
||||
|
||||
1. **無法獲取網頁內容**
|
||||
```bash
|
||||
# 檢查網路連線
|
||||
docker-compose exec barrons-crawler curl -I https://www.barrons.com
|
||||
```
|
||||
|
||||
2. **電子郵件發送失敗**
|
||||
- 檢查 Gmail 應用程式密碼是否正確
|
||||
- 確認兩步驟驗證已開啟
|
||||
- 檢查防火牆設定
|
||||
|
||||
3. **解析內容失敗**
|
||||
- 網頁結構可能已變更
|
||||
- 檢查日誌中的錯誤訊息
|
||||
- 可能需要更新解析邏輯
|
||||
|
||||
4. **容器無法啟動**
|
||||
```bash
|
||||
# 檢查詳細錯誤
|
||||
docker-compose logs barrons-crawler
|
||||
|
||||
# 檢查磁碟空間
|
||||
df -h
|
||||
|
||||
# 檢查埠口占用
|
||||
netstat -tlnp | grep 8080
|
||||
```
|
||||
|
||||
### 調試模式
|
||||
```yaml
|
||||
# 在 docker-compose.yml 中添加
|
||||
environment:
|
||||
- LOG_LEVEL=DEBUG
|
||||
|
||||
# 或者進入容器手動執行
|
||||
docker-compose exec barrons-crawler python enhanced_crawler.py
|
||||
```
|
||||
|
||||
## 🔒 安全建議
|
||||
|
||||
1. **不要在代碼中硬編碼密碼**
|
||||
- 使用 `.env` 檔案或 Docker secrets
|
||||
- 將 `.env` 加入 `.gitignore`
|
||||
|
||||
2. **定期更新依賴**
|
||||
```bash
|
||||
# 更新基礎映像
|
||||
docker-compose build --pull
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
3. **監控資源使用**
|
||||
```bash
|
||||
# 查看容器資源使用
|
||||
docker stats barrons-crawler
|
||||
```
|
||||
|
||||
4. **網路安全**
|
||||
- 使用反向代理(如 Nginx)
|
||||
- 設定適當的防火牆規則
|
||||
- 啟用 HTTPS(如果對外開放)
|
||||
|
||||
## 📈 擴展功能
|
||||
|
||||
### 多實例部署
|
||||
```yaml
|
||||
# docker-compose.yml
|
||||
services:
|
||||
barrons-crawler-1:
|
||||
# ... 設定
|
||||
barrons-crawler-2:
|
||||
# ... 設定
|
||||
environment:
|
||||
- CHECK_INTERVAL=600 # 不同檢查間隔
|
||||
```
|
||||
|
||||
### 與其他服務整合
|
||||
```yaml
|
||||
# 加入資料庫
|
||||
postgres:
|
||||
image: postgres:15
|
||||
environment:
|
||||
POSTGRES_DB: barrons
|
||||
POSTGRES_USER: crawler
|
||||
POSTGRES_PASSWORD: password
|
||||
```
|
||||
|
||||
### 定制通知
|
||||
可以擴展 `enhanced_crawler.py` 添加:
|
||||
- Line Notify
|
||||
- Telegram Bot
|
||||
- 推播通知
|
||||
- 簡訊通知
|
||||
|
||||
## 🎯 最佳實踐
|
||||
|
||||
1. **定期監控日誌**
|
||||
2. **設定適當的檢查間隔**(避免過於頻繁)
|
||||
3. **定期備份資料**
|
||||
4. **監控資源使用情況**
|
||||
5. **設定適當的通知渠道**
|
||||
6. **遵守網站使用條款**
|
50
docker-compose.yml
Normal file
50
docker-compose.yml
Normal file
@@ -0,0 +1,50 @@
|
||||
# docker-compose.yml
version: '3.8'

services:
  barrons-crawler:
    build: .
    container_name: barrons-crawler
    restart: unless-stopped
    environment:
      # Values are interpolated from the shell environment or from the `.env`
      # file next to this compose file; the text after `:-` is the fallback.
      # Secrets therefore live in `.env` instead of being committed here.

      # Crawler settings
      - CHECK_INTERVAL=${CHECK_INTERVAL:-300}  # seconds (300 = 5 minutes)
      - LOG_LEVEL=${LOG_LEVEL:-INFO}

      # Email settings (optional). The crawler only enables email when
      # server, from, to, username and password are all non-empty.
      - EMAIL_SMTP_SERVER=${EMAIL_SMTP_SERVER:-smtp.gmail.com}
      - EMAIL_SMTP_PORT=${EMAIL_SMTP_PORT:-587}
      - EMAIL_FROM=${EMAIL_FROM:-}
      - EMAIL_TO=${EMAIL_TO:-}
      - EMAIL_USERNAME=${EMAIL_USERNAME:-}
      - EMAIL_PASSWORD=${EMAIL_PASSWORD:-}

      # Slack webhook (optional; empty disables it)
      - WEBHOOK_URL=${WEBHOOK_URL:-}

      # Discord webhook (optional; empty disables it)
      - DISCORD_WEBHOOK=${DISCORD_WEBHOOK:-}

    volumes:
      - ./data:/app/data  # persist scraped data
      - ./logs:/app/logs  # persist logs
    ports:
      - "8080:8080"  # health-check and status API
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s

  # Optional: Redis for caching
  redis:
    image: redis:7-alpine
    container_name: barrons-redis
    restart: unless-stopped
    volumes:
      - redis_data:/data
    command: redis-server --appendonly yes

volumes:
  redis_data:
|
348
enhanced_crawler.py
Normal file
348
enhanced_crawler.py
Normal file
@@ -0,0 +1,348 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import json
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
import smtplib
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
import logging
|
||||
import os
|
||||
import schedule
|
||||
from flask import Flask, jsonify
|
||||
import threading
|
||||
import signal
|
||||
import sys
|
||||
|
||||
class EnhancedBarronsCrawler:
    """Watch the Barron's stock-picks page and notify when new entries appear.

    Configuration comes from environment variables: ``CHECK_INTERVAL``
    (seconds between checks), ``LOG_LEVEL``, the ``EMAIL_*`` group,
    ``WEBHOOK_URL`` and ``DISCORD_WEBHOOK``. The picks seen on the last
    successful check are stored as JSON in ``self.data_file`` so later checks
    can tell which picks are new.
    """

    # Timeout (seconds) for outbound HTTP/SMTP connections so that a stalled
    # remote end cannot block the check loop forever.
    NETWORK_TIMEOUT = 30

    # Serialises run_check(): it is called by the scheduler loop in run() and
    # by the Flask /check endpoint, which is served from another thread.
    _check_lock = threading.Lock()

    def __init__(self):
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.data_file = "/app/data/barrons_data.json"
        self.running = True

        # Settings from environment variables. `or` (rather than a getenv
        # default) also covers variables that are set but empty.
        self.check_interval = int(os.getenv('CHECK_INTERVAL') or 300)
        self.email_config = self.load_email_config()
        self.webhook_url = os.getenv('WEBHOOK_URL')
        self.discord_webhook = os.getenv('DISCORD_WEBHOOK')

        # Logging: accept any letter case for LOG_LEVEL and fall back to INFO
        # for unknown names instead of raising AttributeError at start-up.
        log_level = getattr(logging, (os.getenv('LOG_LEVEL') or 'INFO').upper(), logging.INFO)
        if not isinstance(log_level, int):
            log_level = logging.INFO
        log_file = '/app/logs/barrons_crawler.log'
        # FileHandler does not create missing parent directories.
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        logging.basicConfig(
            level=log_level,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # Runtime statistics (exposed by the /stats endpoint and saved with the data)
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'total_checks': 0,
            'new_picks_found': 0,
            'last_check': None,
            'last_notification': None,
            'errors': 0
        }

    def load_email_config(self):
        """Build the email settings dict from the environment.

        Returns:
            A dict with ``smtp_server``, ``smtp_port``, ``from_email``,
            ``to_email``, ``username`` and ``password``; or ``None`` when any
            required ``EMAIL_*`` variable is missing or empty.
        """
        required = ('EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD')
        if not all(os.getenv(key) for key in required):
            return None
        return {
            'smtp_server': os.getenv('EMAIL_SMTP_SERVER'),
            'smtp_port': int(os.getenv('EMAIL_SMTP_PORT') or 587),
            'from_email': os.getenv('EMAIL_FROM'),
            'to_email': os.getenv('EMAIL_TO'),
            'username': os.getenv('EMAIL_USERNAME'),
            'password': os.getenv('EMAIL_PASSWORD')
        }

    def fetch_page(self):
        """Download the stock-picks page.

        Returns:
            The response body as text, or ``None`` when the request fails
            (the failure is logged and counted in ``stats['errors']``).
        """
        try:
            response = requests.get(self.url, headers=self.headers, timeout=self.NETWORK_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_stock_picks(self, html_content):
        """Extract up to ten stock-pick headlines from the page HTML.

        Several CSS selectors are tried in order and the first one that
        matches anything is used.

        Returns:
            A list of dicts with ``title``, ``link`` (may be ``None``),
            ``scraped_at`` and ``hash``; an empty list on any parse error.
        """
        stock_picks = []

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Selector strategies, most specific first
            selectors = [
                'article[data-module="ArticleItem"]',
                '.WSJTheme--headline',
                '.MarketDataModule-headline',
                'h3 a, h4 a',
                '[data-module] a[href*="articles"]'
            ]

            elements = []
            for selector in selectors:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    break

            for element in elements[:10]:  # at most 10 entries
                title = element.get_text(strip=True)

                # The matched element is either the anchor itself or a
                # container whose first anchor carries the article URL.
                if element.name == 'a':
                    link = element.get('href')
                else:
                    anchor = element.find('a', href=True)
                    link = anchor.get('href') if anchor is not None else None

                if link and link.startswith('/'):
                    link = "https://www.barrons.com" + link

                if title and len(title) > 10:  # skip titles that are too short
                    stock_picks.append({
                        'title': title,
                        'link': link,
                        'scraped_at': datetime.now().isoformat(),
                        # Short fingerprint used only to recognise a title
                        # seen before; not used for security.
                        'hash': hashlib.md5(title.encode()).hexdigest()[:8]
                    })

            return stock_picks

        except Exception as e:
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []

    def load_previous_data(self):
        """Load the data saved by the previous successful check.

        Returns:
            The stored dict, or an empty default record when the file is
            missing, is not valid JSON, or does not contain a JSON object.
        """
        empty = {'content_hash': None, 'last_update': None, 'stock_picks': []}
        try:
            with open(self.data_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return empty
        except ValueError as e:
            # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            # Without this a corrupt file would make every later check fail.
            self.logger.warning(f"資料檔案損毀,將視為沒有歷史資料: {e}")
            return empty

        if not isinstance(data, dict):
            self.logger.warning("資料檔案格式錯誤,將視為沒有歷史資料")
            return empty
        return data

    def save_data(self, data):
        """Persist ``data`` as JSON to ``self.data_file``.

        The file is written to a temporary sibling and then renamed into
        place so that an interrupted write cannot leave a truncated file.
        Failures are logged and counted, not raised.
        """
        try:
            directory = os.path.dirname(self.data_file)
            if directory:  # makedirs('') raises for a bare file name
                os.makedirs(directory, exist_ok=True)
            tmp_path = self.data_file + '.tmp'
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, self.data_file)
        except Exception as e:
            self.logger.error(f"儲存資料失敗: {e}")
            self.stats['errors'] += 1

    def send_notifications(self, new_picks):
        """Send ``new_picks`` through every configured channel.

        A failure in one channel is logged and does not stop the others.
        ``stats['last_notification']`` is updated when at least one channel
        succeeded.
        """
        channels = (
            (self.email_config, self.send_email_notification, "電子郵件通知失敗"),
            (self.webhook_url, self.send_webhook_notification, "Webhook 通知失敗"),
            (self.discord_webhook, self.send_discord_notification, "Discord 通知失敗"),
        )

        notification_sent = False
        for configured, send, failure_label in channels:
            if not configured:
                continue
            try:
                send(new_picks)
                notification_sent = True
            except Exception as e:
                self.logger.error(f"{failure_label}: {e}")

        if notification_sent:
            self.stats['last_notification'] = datetime.now().isoformat()

    def send_email_notification(self, new_picks):
        """Email the list of new picks using ``self.email_config``.

        Raises:
            smtplib.SMTPException or OSError when connecting, logging in or
            sending fails.
        """
        msg = MIMEMultipart()
        msg['From'] = self.email_config['from_email']
        msg['To'] = self.email_config['to_email']
        msg['Subject'] = f"📈 Barron's 新股票推薦 ({len(new_picks)}條)"

        parts = [f"發現 {len(new_picks)} 條新的股票推薦:\n\n"]
        for pick in new_picks:
            parts.append(f"📊 {pick['title']}\n")
            if pick.get('link'):
                parts.append(f"🔗 {pick['link']}\n")
            parts.append(f"🕒 {pick['scraped_at']}\n")
            parts.append("-" * 60 + "\n")

        msg.attach(MIMEText("".join(parts), 'plain', 'utf-8'))

        # The context manager closes the connection even when starttls,
        # login or send_message raises.
        with smtplib.SMTP(
            self.email_config['smtp_server'],
            self.email_config['smtp_port'],
            timeout=self.NETWORK_TIMEOUT,
        ) as server:
            server.starttls()
            server.login(self.email_config['username'], self.email_config['password'])
            server.send_message(msg)

    def send_webhook_notification(self, new_picks):
        """Post a text summary of the new picks to ``self.webhook_url``.

        Raises:
            requests.RequestException on connection errors, timeouts or a
            non-2xx response.
        """
        message = f"🚨 發現 {len(new_picks)} 條新的 Barron's 股票推薦!\n\n"
        for pick in new_picks[:5]:  # cap at 5 so the message stays short
            message += f"📊 {pick['title']}\n"
            if pick.get('link'):
                message += f"🔗 {pick['link']}\n"

        response = requests.post(self.webhook_url, json={"text": message}, timeout=self.NETWORK_TIMEOUT)
        # Without this an HTTP error response would be counted as delivered.
        response.raise_for_status()

    def _build_discord_payload(self, new_picks):
        """Return the Discord webhook JSON payload for ``new_picks``."""
        embed = {
            "title": "📈 Barron's 新股票推薦",
            "description": f"發現 {len(new_picks)} 條新推薦",
            "color": 0x00ff00,
            "fields": []
        }

        for pick in new_picks[:5]:
            embed["fields"].append({
                "name": pick['title'][:256],
                # 'link' is present but None when no anchor was found, so a
                # .get() default is not enough; `or` covers None and ''.
                "value": (pick.get('link') or '無連結')[:1024],
                "inline": False
            })

        return {"embeds": [embed]}

    def send_discord_notification(self, new_picks):
        """Post an embed listing the new picks to ``self.discord_webhook``.

        Raises:
            requests.RequestException on connection errors, timeouts or a
            non-2xx response.
        """
        payload = self._build_discord_payload(new_picks)
        response = requests.post(self.discord_webhook, json=payload, timeout=self.NETWORK_TIMEOUT)
        response.raise_for_status()

    def find_new_picks(self, current_picks, previous_picks):
        """Return the picks in ``current_picks`` whose hash is not in ``previous_picks``."""
        previous_hashes = {pick['hash'] for pick in previous_picks if 'hash' in pick}
        return [pick for pick in current_picks if pick['hash'] not in previous_hashes]

    def run_check(self):
        """Run one fetch / parse / compare / notify cycle.

        Returns:
            The list of new picks (empty when nothing changed), or ``None``
            when the check could not complete (fetch failed, nothing was
            parsed, or an unexpected error occurred).
        """
        # Only one check at a time: the scheduler thread and the /check
        # endpoint both mutate self.stats and the data file.
        with self._check_lock:
            self.logger.info("開始檢查 Barron's 股票推薦...")
            self.stats['total_checks'] += 1
            self.stats['last_check'] = datetime.now().isoformat()

            try:
                # Fetch and parse
                html_content = self.fetch_page()
                if not html_content:
                    return None

                current_picks = self.parse_stock_picks(html_content)
                if not current_picks:
                    self.logger.warning("未找到股票推薦內容")
                    return None

                # Compare with what was stored last time
                previous_data = self.load_previous_data()
                previous_picks = previous_data.get('stock_picks', [])
                new_picks = self.find_new_picks(current_picks, previous_picks)

                if not new_picks:
                    self.logger.info("✅ 沒有發現新內容")
                    return []

                self.logger.info(f"🚨 發現 {len(new_picks)} 條新推薦")
                self.stats['new_picks_found'] += len(new_picks)

                self.send_notifications(new_picks)

                self.save_data({
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current_picks,
                    'stats': self.stats
                })

                return new_picks

            except Exception as e:
                self.logger.error(f"檢查過程中發生錯誤: {e}")
                self.stats['errors'] += 1
                return None

    def signal_handler(self, signum, frame):
        """Handle SIGINT/SIGTERM by asking the main loop to stop."""
        self.logger.info("收到停止信號,正在關閉...")
        self.running = False

    def run(self):
        """Run an immediate check, then one every ``check_interval`` seconds until stopped."""
        # Register signal handlers for a clean shutdown
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        # Periodic checks are driven by the `schedule` library
        schedule.every(self.check_interval).seconds.do(self.run_check)

        self.logger.info(f"🚀 爬蟲已啟動,每 {self.check_interval} 秒檢查一次")

        # Run one check right away instead of waiting a full interval
        self.run_check()

        while self.running:
            schedule.run_pending()
            time.sleep(1)

        self.logger.info("爬蟲已停止")
|
||||
|
||||
|
||||
# Flask Web API
app = Flask(__name__)
# Set by the __main__ block before the API thread starts; handlers treat
# None as "crawler not ready yet".
crawler_instance = None


@app.route('/health')
def health_check():
    """Liveness endpoint: always reports healthy with the current timestamp."""
    return jsonify({"status": "healthy", "timestamp": datetime.now().isoformat()})


@app.route('/stats')
def get_stats():
    """Return the crawler's runtime statistics, or 503 if it is not initialized."""
    if crawler_instance:
        return jsonify(crawler_instance.stats)
    # 503 lets HTTP clients detect the condition without parsing the body.
    return jsonify({"error": "Crawler not initialized"}), 503


@app.route('/check')
def manual_check():
    """Trigger one check immediately and report how many new picks it found.

    ``completed`` is false when run_check() returned None, i.e. the check
    could not finish, which is otherwise indistinguishable from "0 new picks".
    """
    if crawler_instance:
        result = crawler_instance.run_check()
        return jsonify({
            "result": f"Found {len(result) if result else 0} new picks",
            "completed": result is not None,
        })
    return jsonify({"error": "Crawler not initialized"}), 503


def run_flask_app():
    """Serve the API on all interfaces, port 8080 (blocking call)."""
    app.run(host='0.0.0.0', port=8080, debug=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the crawler first: the API handlers read the module-level
    # crawler_instance, so it must exist before requests can arrive.
    crawler_instance = EnhancedBarronsCrawler()

    # Serve the Flask API from a daemon thread so it ends with the process.
    threading.Thread(target=run_flask_app, daemon=True).start()

    # Block in the crawler's main loop until a stop signal is received.
    crawler_instance.run()
|
15
health_check.py
Normal file
15
health_check.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python3
"""Container health probe: exit 0 when the local /health endpoint returns 200, else 1."""
import requests
import sys

HEALTH_URL = 'http://localhost:8080/health'

try:
    status_code = requests.get(HEALTH_URL, timeout=5).status_code
except Exception as e:
    # Connection refused, timeout, etc. all count as unhealthy.
    print(f"Health check error: {e}")
    sys.exit(1)

if status_code == 200:
    print("Health check passed")
    sys.exit(0)

print(f"Health check failed: {status_code}")
sys.exit(1)
|
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
# requirements.txt
|
||||
requests==2.31.0
|
||||
beautifulsoup4==4.12.2
|
||||
schedule==1.2.0
|
||||
flask==2.3.3
|
Reference in New Issue
Block a user