From f708f3bf1d5cf237c8ce0ede20f02c45fca00d7f Mon Sep 17 00:00:00 2001
From: MH Hung
Date: Tue, 9 Sep 2025 21:17:50 +0800
Subject: [PATCH] feat: add OpenInsider Top-of-day crawler and multi-CRAWLER_TYPE support

New crawler: app/crawlers/openinsider_top.py
- Scrapes three pages (sales/purchases/officer purchases)
- Filters rows with Value/Amount >= $1,000,000 (configurable via INSIDER_MIN_AMOUNT)
- Builds concise notifications; saves to data/openinsider_top.json

Runner: support comma-separated CRAWLER_TYPE and new openinsider_top type
- Accepts e.g., CRAWLER_TYPE=openinsider_top,openinsider,barrons
- Preserves order, removes duplicates; warns on unknown types
- Uses shared schedule: RUN_DAILY_AT or CHECK_INTERVAL; initial run per crawler

Entrypoint: rename enhanced_crawler.py -> main.py
- Update Dockerfile CMD and README references

Config & docs:
- Reorganize .env.template into clear sections with examples
- Update .env with multi-crawler example and INSIDER_MIN_AMOUNT
- README: document new crawler, usage, and multi-type CRAWLER_TYPE
---
 .env.template                   |  37 +++--
 Dockerfile                      |   2 +-
 README.md                       |  18 ++-
 app/crawlers/openinsider_top.py | 230 ++++++++++++++++++++++++++++++++
 app/runner.py                   |  42 ++++--
 enhanced_crawler.py => main.py  |   6 +-
 6 files changed, 308 insertions(+), 27 deletions(-)
 create mode 100644 app/crawlers/openinsider_top.py
 rename enhanced_crawler.py => main.py (53%)

diff --git a/.env.template b/.env.template
index 83709df..e5acdb6 100644
--- a/.env.template
+++ b/.env.template
@@ -1,10 +1,19 @@
-# 基本設定
+#############################
+# 基本設定(排程與日誌)
+#############################
+# 每 N 秒檢查一次(若設定 RUN_DAILY_AT 則忽略)
 CHECK_INTERVAL=300
+# 每天固定時間檢查(例如 12:00);若不需固定時間請保留此值或註解
 RUN_DAILY_AT=12:00
+# 日誌等級:DEBUG | INFO | WARNING | ERROR
 LOG_LEVEL=INFO
+# 啟動後第一次是否也發送目前清單(true/false)
 ALWAYS_NOTIFY_ON_STARTUP=false
 
-# 電子郵件通知設定(Gmail 範例)
+#############################
+# 通知設定(擇一或多個)
+#############################
+# Email(範例以 Gmail)
 EMAIL_SMTP_SERVER=smtp.gmail.com
 EMAIL_SMTP_PORT=587
 # 可選: starttls | ssl | none
@@ -20,15 +29,27 @@ WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
 # Discord Webhook(可選)
 DISCORD_WEBHOOK=https://discord.com/api/webhooks/YOUR/DISCORD/WEBHOOK
 
-# 進階(可選):自訂資料與日誌目錄
-# 預設 Docker 會使用 /app/data、/app/logs;本機則使用 ./data、./logs
+#############################
+# 進階:資料與日誌路徑(可選)
+#############################
+# Docker 預設 /app/data、/app/logs;本機預設 ./data、./logs
 # DATA_DIR=./data
 # LOG_DIR=./logs
 
-# 選擇爬蟲類型與參數
-# 可選: barrons | openinsider
+#############################
+# 爬蟲選擇與參數
+#############################
+# 可選:barrons | openinsider | openinsider_top
+# 可逗號同時啟多個(例如同時跑三種):
+# CRAWLER_TYPE=openinsider_top,openinsider,barrons
 CRAWLER_TYPE=openinsider
-# 針對內部人交易爬蟲的股票代號(單一)
+
+# OpenInsider(依個別股票查詢)
+# - 單一標的使用 SYMBOL
+# - 多個標的使用 SYMBOLS(以逗號分隔)
 SYMBOL=PLTR
-# 或一次追多個:以逗號分隔
 # SYMBOLS=PLTR,NVDA,TSLA
+
+# OpenInsider 當日大額(三頁合併)
+# - 金額門檻(整數,單位:美元),僅通知 >= 該金額的交易
+# INSIDER_MIN_AMOUNT=1000000
diff --git a/Dockerfile b/Dockerfile
index 6ebca2f..c707c3a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -33,4 +33,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
 EXPOSE 8080
 
 # 執行爬蟲主程式
-CMD ["python", "enhanced_crawler.py"]
+CMD ["python", "main.py"]
diff --git a/README.md b/README.md
index 1eac77a..594f788 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # 股票爬蟲服務(模組化架構)
 
 可擴充的股票爬蟲服務,內建 HTTP API 與多種通知(Email/Webhook/Discord)。
-目前提供兩類爬蟲:
+目前提供三類爬蟲:
 - Barron's 股票推薦
 - OpenInsider 內部人交易(支援多標的)
+- OpenInsider 當日大額內部人交易(跨三頁合併、金額過濾)
 
 ## 功能
 - 定時抓取(支援每 N 秒或每日固定時間)
@@ -24,7 +25,7 @@ app/
   crawlers/template.py # 新站點範本(複製後改名擴充)
   services/storage.py # JSON 儲存
   services/notifications.py # Email/Webhook/Discord
-enhanced_crawler.py # 舊入口,現委派到 app.runner
+ main.py # 入口點,委派到 app.runner
 Dockerfile
 docker-compose.yml
 requirements.txt
@@ -58,7 +59,7 @@ curl "http://localhost:8080/notify_test?channel=email"  # 或 webhook/discord
 
 ## 本機執行(非 Docker)
 ```bash
 pip install -r requirements.txt
-python enhanced_crawler.py
+python main.py
 ```
 
@@ -80,11 +81,17 @@ python enhanced_crawler.py
 - `LOG_DIR`: 日誌輸出路徑(Docker 預設 `/app/logs`;本機預設 `./logs`)
 
 - 爬蟲選擇與參數
-  - `CRAWLER_TYPE`: `barrons` | `openinsider`
+  - `CRAWLER_TYPE`: `barrons` | `openinsider` | `openinsider_top`(可用逗號同時啟多種,例如:`openinsider_top,openinsider`)
   - Barron's:無額外參數
-  - OpenInsider:
+  - OpenInsider(依個別股票查詢):
     - 單一標的:`SYMBOL=PLTR`
     - 多個標的:`SYMBOLS=PLTR,NVDA,TSLA`
+  - OpenInsider 當日大額(跨頁合併):
+    - 來源頁面:
+      - `http://openinsider.com/top-insider-sales-of-the-day`
+      - `http://openinsider.com/top-insider-purchases-of-the-day`
+      - `http://openinsider.com/top-officer-purchases-of-the-day`
+    - 金額門檻(含千分位、自動去 `$`):`INSIDER_MIN_AMOUNT`,預設 `1000000`
 
 Email 使用建議:
 - Gmail 請使用「應用程式密碼」並開啟兩步驟驗證
@@ -141,3 +148,4 @@ docker-compose down
 - 2025-09:
   - 重構為模組化架構,API 與爬蟲邏輯分離
   - 新增 OpenInsider 內部人交易爬蟲與多標的支援
+  - 新增 OpenInsider 當日大額內部人交易(≥$1,000,000)爬蟲
diff --git a/app/crawlers/openinsider_top.py b/app/crawlers/openinsider_top.py
new file mode 100644
index 0000000..9a1d1ee
--- /dev/null
+++ b/app/crawlers/openinsider_top.py
@@ -0,0 +1,230 @@
+from __future__ import annotations
+
+import hashlib
+import os
+from datetime import datetime
+from typing import List, Dict, Optional
+
+import requests
+from bs4 import BeautifulSoup
+
+from app.crawlers.base import BaseCrawler
+
+
+class OpenInsiderTopCrawler(BaseCrawler):
+    """Crawler for OpenInsider Top-of-the-day pages.
+
+    Pages:
+    - http://openinsider.com/top-insider-sales-of-the-day
+    - http://openinsider.com/top-insider-purchases-of-the-day
+    - http://openinsider.com/top-officer-purchases-of-the-day
+
+    Filters rows where Value/Amount >= 1,000,000 (default, configurable via
+    env var INSIDER_MIN_AMOUNT).
+ """ + + DEFAULT_URLS = [ + "http://openinsider.com/top-insider-sales-of-the-day", + "http://openinsider.com/top-insider-purchases-of-the-day", + "http://openinsider.com/top-officer-purchases-of-the-day", + ] + + def __init__(self, config, logger, urls: Optional[List[str]] = None, min_amount: Optional[int] = None): + super().__init__( + name="OpenInsider 當日大額內部人交易", + config=config, + logger=logger, + data_filename="openinsider_top.json", + ) + self.urls = urls or self.DEFAULT_URLS + # Allow override via env var INSIDER_MIN_AMOUNT + env_min = os.getenv('INSIDER_MIN_AMOUNT') + self.min_amount = ( + int(env_min) if (env_min and env_min.isdigit()) else (min_amount if min_amount is not None else 1_000_000) + ) + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/114.0 Safari/537.36' + } + + # For multi-page crawler, fetch_page is not used; keep for interface + def fetch_page(self) -> Optional[str]: + return "" + + def _fetch_one(self, url: str) -> Optional[str]: + try: + resp = requests.get(url, headers=self.headers, timeout=30) + resp.raise_for_status() + return resp.text + except requests.RequestException as e: + self.logger.error(f"獲取 OpenInsider 頁面失敗: {e} ({url})") + self.stats['errors'] += 1 + return None + + @staticmethod + def _parse_money(val: str) -> Optional[int]: + if not val: + return None + s = val.strip() + # Remove $ and commas and any parentheses + for ch in ['$', ',', '(', ')', '+']: + s = s.replace(ch, '') + # Some cells may include text like "$1,234,567 (incl. options)" + # Keep only leading numeric part + num = '' + for c in s: + if c.isdigit(): + num += c + elif c in ' .': + continue + else: + break + if not num: + return None + try: + return int(num) + except ValueError: + return None + + def parse_items_from_html(self, html_content: str, url: str) -> List[Dict]: + soup = BeautifulSoup(html_content, 'html.parser') + items: List[Dict] = [] + + # Find table with headers we care about + tables = soup.find_all('table') + target_table = None + expected_any = {'value', 'amount', 'qty', 'ticker', 'transaction', 'trans type', 'trade date'} + for tbl in tables: + headers = [th.get_text(strip=True).lower() for th in tbl.find_all('th')] + if not headers: + continue + hset = set(headers) + if any(h in hset for h in expected_any): + target_table = tbl + break + if not target_table: + return items + + header_map = {} + headers = [th.get_text(strip=True).lower() for th in target_table.find_all('th')] + for idx, h in enumerate(headers): + header_map[h] = idx + + def find_idx(possible): + for k in possible: + if k in header_map: + return header_map[k] + for k, v in header_map.items(): + if any(p in k for p in possible): + return v + return None + + idx_ticker = find_idx(['ticker']) + idx_insider = find_idx(['insider', 'insider name', 'name']) + idx_type = find_idx(['trans type', 'transaction', 'type']) + idx_qty = find_idx(['qty', 'quantity', 'shares']) + idx_price = find_idx(['price']) + idx_value = find_idx(['value', 'amount']) + idx_trade_date = find_idx(['trade date', 'date']) + + rows = [r for r in target_table.find_all('tr') if r.find('td')] + for row in rows: + cols = row.find_all('td') + + def cell(i): + if i is None or i >= len(cols): + return '' + return cols[i].get_text(strip=True) + + ticker = (cell(idx_ticker) or '').upper() + insider = cell(idx_insider) + trans_type = cell(idx_type) + qty = cell(idx_qty) + price = cell(idx_price) + value_text = cell(idx_value) + trade_date = 
cell(idx_trade_date) + + amount = self._parse_money(value_text) + if amount is None or amount < self.min_amount: + continue + + title = f"{ticker} {trans_type} - {insider} qty {qty} @ {price} value ${amount:,} on {trade_date}" + hash_src = f"{ticker}|{insider}|{trans_type}|{qty}|{price}|{trade_date}|{amount}|{url}" + items.append({ + 'title': title, + 'link': url, + 'scraped_at': datetime.now().isoformat(), + 'hash': hashlib.md5(hash_src.encode('utf-8')).hexdigest()[:12], + }) + + return items + + def parse_items(self, html_content: str) -> List[Dict]: + # Not used; we fetch multiple pages in run_check + return [] + + # Override run_check to handle multiple pages and combine results + def run_check(self) -> Optional[List[Dict]]: + self.logger.info(f"開始檢查 {self.name} (閾值 ${self.min_amount:,}) ...") + self.stats['total_checks'] += 1 + self.stats['last_check'] = datetime.now().isoformat() + try: + combined: List[Dict] = [] + for url in self.urls: + html = self._fetch_one(url) + if not html: + continue + items = self.parse_items_from_html(html, url) + combined.extend(items) + if not combined: + self.logger.info("✅ 沒有符合金額門檻的交易") + return [] + + prev = self._load_previous() + new_items = self.find_new(combined, prev) + + if new_items: + self.logger.info(f"🚨 發現 {len(new_items)} 筆新交易(>= ${self.min_amount:,})") + self.stats['new_picks_found'] += len(new_items) + self._send_notifications(new_items) + self._save_current(combined) + return new_items + + # Notify on first run if requested + if (not self._first_check_done) and self.config.always_notify_on_startup and combined: + self.logger.info("🟢 啟動首次檢查:無新內容,但依設定寄出目前清單") + self._send_notifications(combined) + self._save_current(combined) + return combined + + self.logger.info("✅ 沒有發現新內容") + return [] + except Exception as e: + self.logger.error(f"檢查過程錯誤: {e}") + self.stats['errors'] += 1 + return None + + def _load_previous(self) -> List[Dict]: + from app.services import storage + return storage.load_json(self.data_path).get('stock_picks', []) + + def _save_current(self, items: List[Dict]) -> None: + from app.services import storage + storage.save_json(self.data_path, { + 'last_update': datetime.now().isoformat(), + 'stock_picks': items, + 'stats': self.stats, + }) + + def _build_email(self, items: List[Dict]): + subject = f"OpenInsider 當日大額內部人交易(≥${self.min_amount:,}) - {len(items)} 筆" + lines = [] + for it in items[:10]: + lines.append(f"• {it.get('title','')}") + body = ( + f"發現 {len(items)} 筆符合金額門檻的內部人交易(OpenInsider):\n\n" + "\n".join(lines) + "\n\n" + f"抓取時間:{datetime.now().isoformat()}\n來源:\n- " + "\n- ".join(self.urls) + ) + return subject, body + diff --git a/app/runner.py b/app/runner.py index 4b66148..b8fd228 100644 --- a/app/runner.py +++ b/app/runner.py @@ -8,6 +8,7 @@ import schedule from app.config import load_config, setup_logging from app.crawlers.barrons import BarronsCrawler from app.crawlers.openinsider import OpenInsiderCrawler +from app.crawlers.openinsider_top import OpenInsiderTopCrawler from app.api.server import create_app @@ -16,17 +17,38 @@ def start(): config = load_config() logger = setup_logging(config.log_level, config.log_dir) - # Select crawler via env var - crawler_type = (os.getenv('CRAWLER_TYPE') or 'barrons').lower() + # Select crawler(s) via env var (supports comma-separated types) + types_raw = os.getenv('CRAWLER_TYPE') or 'barrons' + type_list = [t.strip().lower() for t in types_raw.split(',') if t.strip()] + # Preserve order, remove duplicates + seen = set() + crawler_types = [] + for t in type_list: + if t not 
in seen: + seen.add(t) + crawler_types.append(t) + + logger.info(f"選擇爬蟲類型: {crawler_types}") crawlers = [] - if crawler_type in ('openinsider', 'open_insider'): - symbols_raw = os.getenv('SYMBOLS') or os.getenv('SYMBOL', 'PLTR') - symbols = [s.strip().upper() for s in symbols_raw.split(',') if s.strip()] - logger.info(f"使用 OpenInsider 內部人交易爬蟲,symbols={symbols}") - for sym in symbols: - crawlers.append(OpenInsiderCrawler(config, logger, symbol=sym)) - else: - logger.info("使用 Barron's 股票推薦爬蟲") + for ctype in crawler_types: + if ctype in ('openinsider', 'open_insider'): + symbols_raw = os.getenv('SYMBOLS') or os.getenv('SYMBOL', 'PLTR') + symbols = [s.strip().upper() for s in symbols_raw.split(',') if s.strip()] + logger.info(f"使用 OpenInsider 內部人交易爬蟲,symbols={symbols}") + for sym in symbols: + crawlers.append(OpenInsiderCrawler(config, logger, symbol=sym)) + elif ctype in ('openinsider_top', 'open_insider_top', 'openinsider_topday'): + logger.info("使用 OpenInsider 當日大額內部人交易爬蟲 (三頁合併,金額>=1,000,000)") + crawlers.append(OpenInsiderTopCrawler(config, logger)) + elif ctype in ('barrons', "barron's", 'barrons_stock_picks'): + logger.info("使用 Barron's 股票推薦爬蟲") + crawlers.append(BarronsCrawler(config, logger)) + else: + logger.warning(f"未知的 CRAWLER_TYPE: {ctype},忽略此項") + + # Fallback when none recognized + if not crawlers: + logger.info("未選到任何爬蟲,預設使用 Barron's") crawlers.append(BarronsCrawler(config, logger)) # Create and start API in background diff --git a/enhanced_crawler.py b/main.py similarity index 53% rename from enhanced_crawler.py rename to main.py index f1276c6..6552a79 100644 --- a/enhanced_crawler.py +++ b/main.py @@ -1,8 +1,8 @@ -"""Back-compat entry point. +"""Entry point. The project has been refactored to separate API and crawler logic. -This file now just delegates to the modular runner to keep -Docker and existing commands unchanged. +This script delegates to the modular runner to keep Docker and +local commands straightforward. """ from app.runner import start
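
For reference, a minimal local run exercising the new multi-type CRAWLER_TYPE support might look like the sketch below. The symbol list, threshold, and schedule are example values only, and it assumes the dependencies from requirements.txt are installed.

```bash
# Example settings: enable all three crawlers at once (order preserved, duplicates removed,
# unknown types are warned about and skipped by the runner).
export CRAWLER_TYPE=openinsider_top,openinsider,barrons
# Targets for the per-symbol OpenInsider crawler (example symbols).
export SYMBOLS=PLTR,NVDA,TSLA
# Raise the top-of-day threshold from the default 1000000 (USD) to 2000000.
export INSIDER_MIN_AMOUNT=2000000
# Check once per day at 12:00 instead of polling every CHECK_INTERVAL seconds.
export RUN_DAILY_AT=12:00
python main.py
```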