feat(openinsider): add OpenInsider insider-trading crawler with multi-symbol support and daily scheduling
- Add app/crawlers/openinsider.py; source: http://openinsider.com/search?q={symbol}
- Multi-symbol support: track several tickers at once via SYMBOLS=PLTR,NVDA,... (or a single ticker via SYMBOL)
- runner: schedules and starts multiple crawler instances (sketched below); /check triggers every crawler in sequence
- API: /info, /stats, /check, /notify_test return multi-crawler responses
- config/base: add RUN_DAILY_AT for a fixed daily run time; falls back to CHECK_INTERVAL when unset
- notifications: add send_custom_email, send_text_webhook, send_text_discord
- Update README and .env.template; .env now uses CRAWLER_TYPE=openinsider
- Remove the quiver_insiders crawler and its related settings

BREAKING CHANGE: CRAWLER_TYPE=quiver_insiders is no longer supported; use openinsider instead.
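The runner changes themselves are not included in the hunks below; the following is a rough sketch of the multi-symbol fan-out. The build_crawlers helper and the PLTR fallback are assumptions for illustration; only SYMBOLS/SYMBOL and OpenInsiderCrawler come from this commit.

import os

from app.crawlers.openinsider import OpenInsiderCrawler


def build_crawlers(config, logger):
    # SYMBOLS (comma-separated) takes precedence; otherwise fall back to a single SYMBOL, then to PLTR
    raw = os.getenv("SYMBOLS") or os.getenv("SYMBOL") or "PLTR"
    symbols = [s.strip().upper() for s in raw.split(",") if s.strip()]
    return [OpenInsiderCrawler(config, logger, symbol=s) for s in symbols]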
@@ -125,12 +125,15 @@ class BaseCrawler(ABC):
         signal.signal(signal.SIGINT, self._signal_handler)
         signal.signal(signal.SIGTERM, self._signal_handler)
 
-        schedule.every(self.config.check_interval).seconds.do(self.run_check)
-        self.logger.info(f"🚀 爬蟲已啟動,每 {self.config.check_interval} 秒檢查一次")
-
+        if getattr(self.config, 'run_daily_at', None):
+            schedule.every().day.at(self.config.run_daily_at).do(self.run_check)
+            self.logger.info(f"🚀 爬蟲已啟動,每天 {self.config.run_daily_at} 檢查一次")
+        else:
+            schedule.every(self.config.check_interval).seconds.do(self.run_check)
+            self.logger.info(f"🚀 爬蟲已啟動,每 {self.config.check_interval} 秒檢查一次")
         self.run_check()
         self._first_check_done = True
         while self.running:
             schedule.run_pending()
             time.sleep(1)
         self.logger.info("爬蟲已停止")
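The run_daily_at attribute read above is added to the config module elsewhere in this commit. A minimal sketch of how it might be exposed; the class shape and the 3600-second default are assumptions, only RUN_DAILY_AT and CHECK_INTERVAL are named by the commit message.

import os

class Config:
    def __init__(self):
        # RUN_DAILY_AT: "HH:MM" wall-clock time passed to schedule.every().day.at(); unset disables daily mode
        self.run_daily_at = os.getenv("RUN_DAILY_AT") or None
        # CHECK_INTERVAL: polling interval in seconds, used only when RUN_DAILY_AT is not set
        self.check_interval = int(os.getenv("CHECK_INTERVAL", "3600"))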
app/crawlers/openinsider.py (new file, 162 lines)
@@ -0,0 +1,162 @@
from __future__ import annotations

import hashlib
from datetime import datetime
from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler
from app.services import notifications as notif


class OpenInsiderCrawler(BaseCrawler):
    """Crawler for OpenInsider search results.

    Source: http://openinsider.com/search?q={symbol}
    Parses the HTML table and emits insider transactions.
    """

    def __init__(self, config, logger, symbol: str = "PLTR"):
        super().__init__(
            name=f"OpenInsider 內部人交易:{symbol}",
            config=config,
            logger=logger,
            data_filename=f"openinsider_{symbol}.json",
        )
        self.symbol = symbol.upper()
        self.url = f"http://openinsider.com/search?q={self.symbol}"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/114.0 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"獲取 OpenInsider 頁面失敗: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find the main results table by looking for expected headers
        best_table = None
        candidate_tables = soup.find_all('table')
        self.logger.info(f"OpenInsider:發現 {len(candidate_tables)} 個 <table>")
        expected_headers = {'insider', 'insider name', 'ticker', 'trans type', 'transaction', 'trade date', 'filing date'}
        for tbl in candidate_tables:
            headers = [th.get_text(strip=True).lower() for th in tbl.find_all('th')]
            if not headers:
                continue
            hset = set(headers)
            if any(h in hset for h in expected_headers):
                best_table = tbl
                break
        if not best_table and candidate_tables:
            best_table = candidate_tables[0]

        if not best_table:
            self.logger.warning("OpenInsider:找不到結果表格")
            return []

        # Build header index map (robust match)
        header_map: Dict[str, int] = {}
        header_texts = [th.get_text(strip=True).lower() for th in best_table.find_all('th')]
        for idx, text in enumerate(header_texts):
            header_map[text] = idx

        def find_idx(possible: List[str]) -> Optional[int]:
            for key in possible:
                if key in header_map:
                    return header_map[key]
            # fuzzy contains
            for k, v in header_map.items():
                if any(p in k for p in possible):
                    return v
            return None

        idx_insider = find_idx(['insider name', 'insider', 'name'])
        idx_type = find_idx(['trans type', 'transaction', 'type'])
        idx_qty = find_idx(['qty', 'quantity', 'shares'])
        idx_price = find_idx(['price'])
        idx_ticker = find_idx(['ticker'])
        idx_trade_date = find_idx(['trade date', 'date'])
        idx_filing_date = find_idx(['filing date', 'filed'])

        rows = best_table.find_all('tr')
        # Skip header rows (those that contain th)
        data_rows = [r for r in rows if r.find('td')]

        items: List[Dict] = []
        for row in data_rows[:100]:
            cols = row.find_all('td')
            def col_text(i: Optional[int]) -> str:
                if i is None or i >= len(cols):
                    return ''
                return cols[i].get_text(strip=True)

            insider = col_text(idx_insider) or 'Unknown Insider'
            trans_type = col_text(idx_type) or 'N/A'
            qty = col_text(idx_qty) or 'N/A'
            price = col_text(idx_price) or 'N/A'
            ticker = (col_text(idx_ticker) or '').upper()
            trade_date = col_text(idx_trade_date)
            filing_date = col_text(idx_filing_date)

            if ticker and self.symbol not in ticker:
                # Keep results aligned to symbol query
                continue

            title = f"{self.symbol} {trans_type} - {insider} qty {qty} @ {price} on {trade_date}"
            if filing_date:
                title += f" (filed {filing_date})"
            hash_src = f"{self.symbol}|{insider}|{trans_type}|{qty}|{price}|{trade_date}|{filing_date}"
            items.append({
                'title': title,
                'link': self.url,
                'scraped_at': datetime.now().isoformat(),
                'hash': hashlib.md5(hash_src.encode('utf-8')).hexdigest()[:12],
            })

        self.logger.info(f"OpenInsider:解析完成,擷取 {len(items)} 筆交易")
        return items

    def _send_notifications(self, items: List[Dict]) -> None:
        subject = f"OpenInsider 內部人交易異動 - {self.symbol} ({len(items)}筆)"
        # Summarise at most the first 10 items in the message body
        lines = []
        for it in items[:10]:
            lines.append(f"• {it['title']}")
        body = (
            f"發現 {len(items)} 筆新的內部人交易異動(OpenInsider):\n\n" + "\n".join(lines) + "\n\n"
            f"抓取時間:{datetime.now().isoformat()}\n來源:{self.url}"
        )

        # Try each configured channel independently; a failure on one channel must not block the others
        sent = False
        if self.config.email:
            try:
                notif.send_custom_email(subject, body, self.config.email)
                sent = True
            except Exception as e:
                self.logger.error(f"電子郵件通知失敗: {e}")
        if self.config.webhook_url:
            try:
                notif.send_text_webhook(subject + "\n\n" + body, self.config.webhook_url)
                sent = True
            except Exception as e:
                self.logger.error(f"Webhook 通知失敗: {e}")
        if self.config.discord_webhook:
            try:
                notif.send_text_discord(
                    title=subject,
                    description=f"{self.symbol} 內部人交易更新(OpenInsider)",
                    lines=lines[:10],
                    webhook=self.config.discord_webhook,
                )
                sent = True
            except Exception as e:
                self.logger.error(f"Discord 通知失敗: {e}")
        if sent:
            self.stats['last_notification'] = datetime.now().isoformat()
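send_custom_email, send_text_webhook and send_text_discord are added in app/services/notifications.py elsewhere in this commit; only their call sites appear here. A hedged sketch of what send_text_discord might look like, inferred from the keyword arguments used above (the body is an assumption, not the committed code):

import requests

def send_text_discord(title: str, description: str, lines: list, webhook: str) -> None:
    # Discord embeds cap the description at 4096 characters; truncate defensively
    text = (description + "\n\n" + "\n".join(lines))[:4000]
    payload = {"embeds": [{"title": title[:256], "description": text}]}
    resp = requests.post(webhook, json=payload, timeout=15)
    resp.raise_for_status()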
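A quick way to exercise parse_items without hitting openinsider.com is to feed it a small HTML fixture. This sketch assumes BaseCrawler.__init__ (not shown in this diff) only records its arguments, and uses a MagicMock config so any attribute access succeeds:

import logging
from unittest.mock import MagicMock

from app.crawlers.openinsider import OpenInsiderCrawler

SAMPLE_HTML = """
<table>
  <tr><th>Filing Date</th><th>Trade Date</th><th>Ticker</th><th>Insider Name</th><th>Trans Type</th><th>Price</th><th>Qty</th></tr>
  <tr><td>2024-05-21</td><td>2024-05-20</td><td>PLTR</td><td>Doe John</td><td>S - Sale</td><td>$21.50</td><td>-50,000</td></tr>
</table>
"""

crawler = OpenInsiderCrawler(config=MagicMock(), logger=logging.getLogger("test"), symbol="PLTR")
items = crawler.parse_items(SAMPLE_HTML)
print(items[0]['title'])
# -> PLTR S - Sale - Doe John qty -50,000 @ $21.50 on 2024-05-20 (filed 2024-05-21)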