From 852f206d2e3b1301819f464d2a741fb9a3c8990e Mon Sep 17 00:00:00 2001 From: MH Hung Date: Wed, 3 Sep 2025 16:47:02 +0800 Subject: [PATCH] feat: init project --- .env.template | 17 +++ .gitignore | 276 +++++++++++++++++++++++++++++++++++ Dockerfile | 36 +++++ README.md | 268 ++++++++++++++++++++++++++++++++++ docker-compose.yml | 50 +++++++ enhanced_crawler.py | 348 ++++++++++++++++++++++++++++++++++++++++++++ health_check.py | 15 ++ requirements.txt | 5 + 8 files changed, 1015 insertions(+) create mode 100644 .env.template create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 docker-compose.yml create mode 100644 enhanced_crawler.py create mode 100644 health_check.py create mode 100644 requirements.txt diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..0ee00e3 --- /dev/null +++ b/.env.template @@ -0,0 +1,17 @@ +# 基本設定 +CHECK_INTERVAL=300 +LOG_LEVEL=INFO + +# 電子郵件通知設定(Gmail 範例) +EMAIL_SMTP_SERVER=smtp.gmail.com +EMAIL_SMTP_PORT=587 +EMAIL_FROM=your_email@gmail.com +EMAIL_TO=notification@gmail.com +EMAIL_USERNAME=your_email@gmail.com +EMAIL_PASSWORD=your_app_specific_password + +# Slack Webhook(可選) +WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK + +# Discord Webhook(可選) +DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..195286d --- /dev/null +++ b/.gitignore @@ -0,0 +1,276 @@ +# === Python === +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to 
inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# === 爬蟲專案特定 === +# 資料檔案 +data/ +*.json +*.csv +*.xlsx +*.db + +# 日誌檔案 +logs/ +*.log +*.out + +# 快取檔案 +cache/ +.cache/ + +# 備份檔案 +backup/ +*.backup +*.bak + +# 設定檔案(包含敏感資訊) +config.ini +settings.json +credentials.json + +# 臨時檔案 +temp/ +tmp/ +*.tmp + +# === Docker === +# Docker相關(保留 docker-compose.yml 和 Dockerfile) +.dockerignore + +# === IDE/編輯器 === +# VSCode +.vscode/ +*.code-workspace + +# PyCharm +.idea/ +*.iws +*.iml +*.ipr + +# Sublime Text +*.sublime-project +*.sublime-workspace + +# Vim +*.swp +*.swo +*~ + +# Emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# === 系統檔案 === +# macOS +.DS_Store +.AppleDouble +.LSOverride +Icon +._* +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Windows +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db +*.stackdump +[Dd]esktop.ini +$RECYCLE.BIN/ +*.cab +*.msi +*.msix +*.msm +*.msp +*.lnk + +# Linux +*~ +.fuse_hidden* +.directory +.Trash-* +.nfs* + +# === 安全相關 === +# API 金鑰和密碼 +*.key +*.pem +*.p12 +*.pfx +secrets.json +.secrets/ + +# 環境變數檔案 +.env.local +.env.development +.env.test +.env.production + +# === 其他 === +# 壓縮檔案 +*.zip +*.rar +*.7z +*.tar.gz +*.tar.bz2 + +# 圖片(如果不需要版控) +# *.jpg +# *.jpeg +# *.png +# *.gif +# *.bmp +# *.svg + +# 文件檔案(如果是輸出結果) +# *.pdf +# *.doc +# *.docx + +# 測試輸出 +test_output/ +screenshots/ + +# 性能分析 +*.prof \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..a0d0169 --- /dev/null +++ 
b/Dockerfile @@ -0,0 +1,36 @@ +# Dockerfile +FROM python:3.11-slim + +# 設定工作目錄 +WORKDIR /app + +# 安裝系統依賴 +RUN apt-get update && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# 複製需求檔案 +COPY requirements.txt . + +# 安裝 Python 依賴 +RUN pip install --no-cache-dir -r requirements.txt + +# 複製應用程式檔案 +COPY . . + +# 創建資料目錄 +RUN mkdir -p /app/data /app/logs + +# 設定環境變數 +ENV PYTHONPATH=/app +ENV PYTHONUNBUFFERED=1 + +# 健康檢查 +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python health_check.py || exit 1 + +# 暴露端口(用於健康檢查 API) +EXPOSE 8080 + +# 執行爬蟲 +CMD ["python", "crawler.py"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..e7ba03e --- /dev/null +++ b/README.md @@ -0,0 +1,268 @@ +# Barron's 股票推薦爬蟲 Docker 部署指南 + +## 🚀 快速開始 + +### 1. 建立專案目錄 +```bash +mkdir barrons-crawler +cd barrons-crawler +``` + +### 2. 創建文件結構 +``` +barrons-crawler/ +├── Dockerfile +├── docker-compose.yml +├── requirements.txt +├── enhanced_crawler.py +├── health_check.py +├── .dockerignore +├── .env # 環境變數設定檔 +├── data/ # 資料持久化目錄 +└── logs/ # 日誌目錄 +``` + +### 3. 設定環境變數 +創建 `.env` 檔案: +```bash +# 基本設定 +CHECK_INTERVAL=300 +LOG_LEVEL=INFO + +# 電子郵件通知設定(Gmail 範例) +EMAIL_SMTP_SERVER=smtp.gmail.com +EMAIL_SMTP_PORT=587 +EMAIL_FROM=your_email@gmail.com +EMAIL_TO=notification@gmail.com +EMAIL_USERNAME=your_email@gmail.com +EMAIL_PASSWORD=your_app_specific_password + +# Slack Webhook(可選) +WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK + +# Discord Webhook(可選) +DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK +``` + +### 4. 啟動服務 +```bash +# 使用 Docker Compose 啟動 +docker-compose up -d + +# 查看日誌 +docker-compose logs -f barrons-crawler +``` + +## 📋 詳細設定選項 + +### 電子郵件設定(Gmail) +1. 開啟 Gmail 的兩步驟驗證 +2. 生成應用程式密碼:https://myaccount.google.com/apppasswords +3. 在 `.env` 中使用應用程式密碼,不是一般密碼 + +### Slack 通知設定 +1. 建立 Slack App: https://api.slack.com/apps +2. 創建 Incoming Webhook +3. 
複製 Webhook URL 到 `.env` 檔案 + +### Discord 通知設定 +1. 在 Discord 伺服器創建 Webhook +2. 複製 Webhook URL 到 `.env` 檔案 + +## 🔧 Docker 指令 + +### 基本操作 +```bash +# 建構映像 +docker-compose build + +# 啟動服務 +docker-compose up -d + +# 停止服務 +docker-compose down + +# 重啟服務 +docker-compose restart + +# 查看日誌 +docker-compose logs -f + +# 進入容器 +docker-compose exec barrons-crawler bash +``` + +### 維護指令 +```bash +# 清理停用的容器 +docker system prune + +# 更新並重新建構 +docker-compose down +docker-compose build --no-cache +docker-compose up -d + +# 備份資料 +docker cp barrons-crawler:/app/data ./data_backup +``` + +## 🌐 Web API 端點 + +爬蟲提供了以下 HTTP 端點: + +### 健康檢查 +```bash +curl http://localhost:8080/health +``` +回應:`{"status": "healthy", "timestamp": "2024-01-15T10:30:00"}` + +### 查看統計資料 +```bash +curl http://localhost:8080/stats +``` +回應: +```json +{ + "start_time": "2024-01-15T10:00:00", + "total_checks": 24, + "new_picks_found": 3, + "last_check": "2024-01-15T10:25:00", + "last_notification": "2024-01-15T09:45:00", + "errors": 0 +} +``` + +### 手動觸發檢查 +```bash +curl http://localhost:8080/check +``` + +## 📊 監控和警報 + +### 健康檢查 +Docker 容器包含自動健康檢查: +- 每30秒檢查一次 +- 3次失敗後標記為不健康 +- 可用於自動重啟策略 + +### 日誌監控 +```bash +# 即時查看日誌 +docker-compose logs -f barrons-crawler + +# 查看特定時間的日誌 +docker-compose logs --since "2024-01-15T10:00:00" barrons-crawler +``` + +### 資料備份 +```bash +# 設定定期備份(加到 crontab) +0 2 * * * docker cp barrons-crawler:/app/data /backup/barrons-$(date +\%Y\%m\%d) +``` + +## 🐛 故障排除 + +### 常見問題 + +1. **無法獲取網頁內容** + ```bash + # 檢查網路連線 + docker-compose exec barrons-crawler curl -I https://www.barrons.com + ``` + +2. **電子郵件發送失敗** + - 檢查 Gmail 應用程式密碼是否正確 + - 確認兩步驟驗證已開啟 + - 檢查防火牆設定 + +3. **解析內容失敗** + - 網頁結構可能已變更 + - 檢查日誌中的錯誤訊息 + - 可能需要更新解析邏輯 + +4. 
**容器無法啟動** + ```bash + # 檢查詳細錯誤 + docker-compose logs barrons-crawler + + # 檢查磁碟空間 + df -h + + # 檢查埠口占用 + netstat -tlnp | grep 8080 + ``` + +### 調試模式 +```yaml +# 在 docker-compose.yml 中添加 +environment: + - LOG_LEVEL=DEBUG + +# 或者進入容器手動執行 +docker-compose exec barrons-crawler python enhanced_crawler.py +``` + +## 🔒 安全建議 + +1. **不要在代碼中硬編碼密碼** + - 使用 `.env` 檔案或 Docker secrets + - 將 `.env` 加入 `.gitignore` + +2. **定期更新依賴** + ```bash + # 更新基礎映像 + docker-compose pull + docker-compose up -d + ``` + +3. **監控資源使用** + ```bash + # 查看容器資源使用 + docker stats barrons-crawler + ``` + +4. **網路安全** + - 使用反向代理(如 Nginx) + - 設定適當的防火牆規則 + - 啟用 HTTPS(如果對外開放) + +## 📈 擴展功能 + +### 多實例部署 +```yaml +# docker-compose.yml +services: + barrons-crawler-1: + # ... 設定 + barrons-crawler-2: + # ... 設定 + environment: + - CHECK_INTERVAL=600 # 不同檢查間隔 +``` + +### 與其他服務整合 +```yaml +# 加入資料庫 + postgres: + image: postgres:15 + environment: + POSTGRES_DB: barrons + POSTGRES_USER: crawler + POSTGRES_PASSWORD: password +``` + +### 定制通知 +可以擴展 `enhanced_crawler.py` 添加: +- Line Notify +- Telegram Bot +- 推播通知 +- 簡訊通知 + +## 🎯 最佳實踐 + +1. **定期監控日誌** +2. **設定適當的檢查間隔**(避免過於頻繁) +3. **定期備份資料** +4. **監控資源使用情況** +5. **設定適當的通知渠道** +6. **遵守網站使用條款** \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..34476af --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +# docker-compose.yml +version: '3.8' + +services: + barrons-crawler: + build: . 
class EnhancedBarronsCrawler:
    """Poll the Barron's stock-picks page and notify when new headlines appear.

    Each check downloads the page, extracts up to ten headlines, compares
    their hashes with the ones stored in ``data_file`` and, when unseen
    headlines exist, sends e-mail / Slack-style webhook / Discord
    notifications before persisting the new snapshot.

    Configuration is read from environment variables: ``CHECK_INTERVAL``,
    ``LOG_LEVEL``, ``EMAIL_*``, ``WEBHOOK_URL`` and ``DISCORD_WEBHOOK``.
    """

    BASE_URL = "https://www.barrons.com"
    LOG_FILE = '/app/logs/barrons_crawler.log'

    # CSS selectors tried in order; the first one that matches anything wins.
    SELECTORS = (
        'article[data-module="ArticleItem"]',
        '.WSJTheme--headline',
        '.MarketDataModule-headline',
        'h3 a, h4 a',
        '[data-module] a[href*="articles"]',
    )

    MAX_PICKS = 10           # headlines examined per page
    MAX_NOTIFY_PICKS = 5     # headlines listed in a chat notification
    REQUEST_TIMEOUT = 30     # seconds, page download
    NOTIFY_TIMEOUT = 15      # seconds, SMTP / webhook calls

    def __init__(self):
        """Read configuration from the environment and set up logging."""
        self.url = "https://www.barrons.com/market-data/stocks/stock-picks?mod=BOL_TOPNAV"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.data_file = "/app/data/barrons_data.json"
        self.running = True

        # run_check() is invoked both by the scheduler loop and by the Flask
        # /check endpoint (a different thread); serialize the two.
        self._check_lock = threading.Lock()

        # Settings from environment variables.
        self.check_interval = int(os.getenv('CHECK_INTERVAL', 300))
        self.email_config = self.load_email_config()
        self.webhook_url = os.getenv('WEBHOOK_URL')
        self.discord_webhook = os.getenv('DISCORD_WEBHOOK')

        # Logging: accept any letter case and fall back to INFO on an
        # unknown level name instead of crashing at start-up.
        level = getattr(logging, os.getenv('LOG_LEVEL', 'INFO').upper(), None)
        if not isinstance(level, int):
            level = logging.INFO

        handlers = [logging.StreamHandler()]
        file_handler_error = None
        try:
            os.makedirs(os.path.dirname(self.LOG_FILE), exist_ok=True)
            # Messages contain CJK text and emoji, so force UTF-8.
            handlers.insert(0, logging.FileHandler(self.LOG_FILE, encoding='utf-8'))
        except OSError as e:
            # Keep running with console logging only.
            file_handler_error = e

        logging.basicConfig(
            level=level,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=handlers,
        )
        self.logger = logging.getLogger(__name__)
        if file_handler_error is not None:
            self.logger.warning(f"無法建立日誌檔案,僅輸出到主控台: {file_handler_error}")

        # Counters exposed through the /stats endpoint.
        self.stats = {
            'start_time': datetime.now().isoformat(),
            'total_checks': 0,
            'new_picks_found': 0,
            'last_check': None,
            'last_notification': None,
            'errors': 0
        }

    def load_email_config(self):
        """Return the SMTP settings dict, or ``None`` if any required variable is unset.

        ``EMAIL_SMTP_PORT`` is optional and defaults to 587.
        """
        required = {
            'smtp_server': 'EMAIL_SMTP_SERVER',
            'from_email': 'EMAIL_FROM',
            'to_email': 'EMAIL_TO',
            'username': 'EMAIL_USERNAME',
            'password': 'EMAIL_PASSWORD',
        }
        config = {key: os.getenv(env_name) for key, env_name in required.items()}
        if not all(config.values()):
            return None
        config['smtp_port'] = int(os.getenv('EMAIL_SMTP_PORT', 587))
        return config

    def fetch_page(self):
        """Download the stock-picks page; return its HTML or ``None`` on failure."""
        try:
            response = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            self.logger.error(f"獲取網頁失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_stock_picks(self, html_content):
        """Extract headline entries from *html_content*.

        Returns a list of dicts with ``title``, ``link`` (absolute URL or
        ``None``), ``scraped_at`` (ISO timestamp) and ``hash`` (first 8 hex
        digits of the title's MD5). Returns ``[]`` when parsing fails.
        """
        stock_picks = []

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            elements = []
            for selector in self.SELECTORS:
                elements = soup.select(selector)
                if elements:
                    self.logger.info(f"使用選擇器找到內容: {selector}")
                    break

            seen_hashes = set()
            for element in elements[:self.MAX_PICKS]:
                title = element.get_text(strip=True)
                if not title or len(title) <= 10:  # skip labels / too-short titles
                    continue

                if element.name == 'a':
                    link = element.get('href')
                else:
                    anchor = element.find('a', href=True)
                    link = anchor.get('href') if anchor is not None else None

                if link and link.startswith('/'):
                    link = self.BASE_URL + link

                # MD5 is used only as a short content fingerprint, not for security.
                digest = hashlib.md5(title.encode()).hexdigest()[:8]
                if digest in seen_hashes:
                    # Same headline rendered twice on the page; notify once.
                    continue
                seen_hashes.add(digest)

                stock_picks.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': digest
                })

            return stock_picks

        except Exception as e:
            self.logger.error(f"解析網頁內容失敗: {e}")
            self.stats['errors'] += 1
            return []

    def load_previous_data(self):
        """Load the last saved snapshot.

        A missing, unreadable-as-JSON or non-object file yields an empty
        snapshot, so a corrupt data file cannot block every future check.
        """
        empty = {'content_hash': None, 'last_update': None, 'stock_picks': []}
        try:
            with open(self.data_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return empty
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            self.logger.warning(f"資料檔案損毀,將重新建立: {e}")
            return empty

        if not isinstance(data, dict):
            self.logger.warning("資料檔案格式不正確,將重新建立")
            return empty
        return data

    def save_data(self, data):
        """Persist *data* as JSON; return ``True`` on success, ``False`` otherwise.

        The file is written to a temporary sibling and then renamed, so a
        crash mid-write never leaves a truncated data file behind.
        """
        tmp_path = f"{self.data_file}.tmp"
        try:
            directory = os.path.dirname(self.data_file)
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_path, self.data_file)
            return True
        except (OSError, TypeError, ValueError) as e:
            self.logger.error(f"儲存資料失敗: {e}")
            self.stats['errors'] += 1
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing to clean up
            return False

    def send_notifications(self, new_picks):
        """Send *new_picks* through every configured channel.

        Channels are independent: a failure in one is logged and does not
        prevent the others from being tried. Returns ``True`` if at least one
        channel succeeded.
        """
        channels = (
            (self.email_config, self.send_email_notification, "電子郵件通知失敗"),
            (self.webhook_url, self.send_webhook_notification, "Webhook 通知失敗"),
            (self.discord_webhook, self.send_discord_notification, "Discord 通知失敗"),
        )

        notification_sent = False
        for configured, send, failure_message in channels:
            if not configured:
                continue
            try:
                send(new_picks)
            except Exception as e:  # best effort: keep trying other channels
                self.logger.error(f"{failure_message}: {e}")
            else:
                notification_sent = True

        if notification_sent:
            self.stats['last_notification'] = datetime.now().isoformat()
        return notification_sent

    @staticmethod
    def _build_email_body(new_picks):
        """Return the plain-text e-mail body listing every pick."""
        lines = [f"發現 {len(new_picks)} 條新的股票推薦:\n"]
        for pick in new_picks:
            lines.append(f"📊 {pick['title']}")
            if pick.get('link'):
                lines.append(f"🔗 {pick['link']}")
            lines.append(f"🕒 {pick['scraped_at']}")
            lines.append("-" * 60)
        return "\n".join(lines) + "\n"

    def send_email_notification(self, new_picks):
        """E-mail *new_picks* via SMTP with STARTTLS; raises on any SMTP error."""
        msg = MIMEMultipart()
        msg['From'] = self.email_config['from_email']
        msg['To'] = self.email_config['to_email']
        msg['Subject'] = f"📈 Barron's 新股票推薦 ({len(new_picks)}條)"
        msg.attach(MIMEText(self._build_email_body(new_picks), 'plain', 'utf-8'))

        # The context manager closes the connection even when login/send fails.
        with smtplib.SMTP(
            self.email_config['smtp_server'],
            self.email_config['smtp_port'],
            timeout=self.NOTIFY_TIMEOUT,
        ) as server:
            server.starttls()
            server.login(self.email_config['username'], self.email_config['password'])
            server.send_message(msg)

    def _build_webhook_message(self, new_picks):
        """Return the text posted to the Slack/Teams-style webhook."""
        message = f"🚨 發現 {len(new_picks)} 條新的 Barron's 股票推薦!\n\n"
        for pick in new_picks[:self.MAX_NOTIFY_PICKS]:  # keep the message short
            message += f"📊 {pick['title']}\n"
            if pick.get('link'):
                message += f"🔗 {pick['link']}\n"
        return message

    def send_webhook_notification(self, new_picks):
        """POST *new_picks* to ``webhook_url``; raises on timeout or HTTP error."""
        payload = {"text": self._build_webhook_message(new_picks)}
        response = requests.post(self.webhook_url, json=payload, timeout=self.NOTIFY_TIMEOUT)
        response.raise_for_status()

    def _build_discord_embed(self, new_picks):
        """Return the Discord embed dict for *new_picks*.

        Field names are cut to 256 and values to 1024 characters, matching
        the slicing the original code applied; a missing link is shown as a
        placeholder.
        """
        fields = [
            {
                "name": pick['title'][:256],
                # 'link' is always present but may be None, so `or` is needed.
                "value": (pick.get('link') or '無連結')[:1024],
                "inline": False
            }
            for pick in new_picks[:self.MAX_NOTIFY_PICKS]
        ]
        return {
            "title": "📈 Barron's 新股票推薦",
            "description": f"發現 {len(new_picks)} 條新推薦",
            "color": 0x00ff00,
            "fields": fields
        }

    def send_discord_notification(self, new_picks):
        """POST *new_picks* to ``discord_webhook``; raises on timeout or HTTP error."""
        payload = {"embeds": [self._build_discord_embed(new_picks)]}
        response = requests.post(self.discord_webhook, json=payload, timeout=self.NOTIFY_TIMEOUT)
        response.raise_for_status()

    def find_new_picks(self, current_picks, previous_picks):
        """Return the entries of *current_picks* whose hash is not in *previous_picks*.

        Malformed previous entries (non-dicts or dicts without ``hash``) are
        ignored.
        """
        previous_hashes = {
            pick['hash']
            for pick in previous_picks or []
            if isinstance(pick, dict) and 'hash' in pick
        }
        return [pick for pick in current_picks if pick['hash'] not in previous_hashes]

    def run_check(self):
        """Run one fetch / compare / notify / save cycle.

        Returns the list of new picks, ``[]`` when nothing changed, or
        ``None`` when the page could not be fetched or parsed.
        """
        with self._check_lock:
            self.logger.info("開始檢查 Barron's 股票推薦...")
            self.stats['total_checks'] += 1
            self.stats['last_check'] = datetime.now().isoformat()

            try:
                html_content = self.fetch_page()
                if not html_content:
                    return None

                current_picks = self.parse_stock_picks(html_content)
                if not current_picks:
                    self.logger.warning("未找到股票推薦內容")
                    return None

                previous_data = self.load_previous_data()
                previous_picks = previous_data.get('stock_picks', [])

                new_picks = self.find_new_picks(current_picks, previous_picks)
                if not new_picks:
                    self.logger.info("✅ 沒有發現新內容")
                    return []

                self.logger.info(f"🚨 發現 {len(new_picks)} 條新推薦")
                self.stats['new_picks_found'] += len(new_picks)

                self.send_notifications(new_picks)

                self.save_data({
                    'last_update': datetime.now().isoformat(),
                    'stock_picks': current_picks,
                    'stats': self.stats
                })
                return new_picks

            except Exception as e:
                # Top-level boundary: a failed check must not kill the scheduler.
                self.logger.error(f"檢查過程中發生錯誤: {e}")
                self.stats['errors'] += 1
                return None

    def signal_handler(self, signum, frame):
        """Handle SIGINT/SIGTERM by asking the main loop to stop."""
        self.logger.info("收到停止信號,正在關閉...")
        self.running = False

    def run(self):
        """Run an immediate check, then one every ``check_interval`` seconds until stopped.

        Must be called from the main thread because it installs signal handlers.
        """
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)

        schedule.every(self.check_interval).seconds.do(self.run_check)

        self.logger.info(f"🚀 爬蟲已啟動,每 {self.check_interval} 秒檢查一次")

        # Do not wait a full interval for the first result.
        self.run_check()

        while self.running:
            schedule.run_pending()
            time.sleep(1)

        self.logger.info("爬蟲已停止")
#!/usr/bin/env python3
"""Container health probe: exit 0 when the crawler's /health endpoint answers 200.

Intended for the Docker ``HEALTHCHECK`` instruction. Uses only the standard
library so the probe keeps working independently of the application's
third-party dependencies.
"""
import sys
import urllib.error
import urllib.request

HEALTH_URL = 'http://localhost:8080/health'


def check_health(url=HEALTH_URL, timeout=5):
    """Return ``True`` if a GET to *url* answers HTTP 200 within *timeout* seconds.

    Any failure (connection refused, timeout, non-200 status, malformed URL)
    is reported on stdout and yields ``False``; the function never raises.
    """
    # An empty ProxyHandler ignores HTTP(S)_PROXY variables so the loopback
    # probe is never routed through an outbound proxy.
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
    try:
        with opener.open(url, timeout=timeout) as response:
            status = response.status
    except urllib.error.HTTPError as e:
        # urllib raises for 4xx/5xx; report it like any other bad status.
        print(f"Health check failed: {e.code}")
        return False
    except Exception as e:  # top-level boundary: a probe must never traceback
        print(f"Health check error: {e}")
        return False

    if status == 200:
        print("Health check passed")
        return True
    print(f"Health check failed: {status}")
    return False


def main():
    """Probe the default endpoint and return the process exit code (0 or 1)."""
    return 0 if check_health() else 1


if __name__ == "__main__":
    sys.exit(main())