refactor: modularize project structure and separate API from crawlers
- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as a back-compat entry point delegating to app.runner
- Add a template crawler for future sites
- Update README with the new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
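The template crawler below only implements fetch_page and parse_items; everything else (stats, logging, the data file name) comes from app/crawlers/base.py, which is not part of this diff. A minimal sketch of the contract the template appears to rely on follows; the attribute names and the run() helper are assumptions, not the actual base class:

# Hypothetical sketch only; the real app/crawlers/base.py in this commit may differ.
from abc import ABC, abstractmethod
from typing import Dict, List, Optional


class BaseCrawler(ABC):
    """Shared plumbing assumed by concrete crawlers: name, config, logger, stats."""

    def __init__(self, name: str, config, logger, data_filename: str):
        self.name = name
        self.config = config
        self.logger = logger
        self.data_filename = data_filename  # where app.services.storage would persist results
        self.stats: Dict[str, int] = {'errors': 0, 'new_items': 0}

    @abstractmethod
    def fetch_page(self) -> Optional[str]:
        """Return the raw HTML for the site, or None on failure."""

    @abstractmethod
    def parse_items(self, html_content: str) -> List[Dict]:
        """Return dicts with title, optional link, scraped_at, and hash."""

    def run(self) -> List[Dict]:
        # Assumed orchestration: fetch, then parse; storage and notifications
        # would be handled by the services extracted in this commit.
        html = self.fetch_page()
        if html is None:
            return []
        items = self.parse_items(html)
        self.stats['new_items'] += len(items)
        return items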
app/crawlers/template.py (new file, 53 lines)
@@ -0,0 +1,53 @@
from __future__ import annotations

from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a new site.

    Required implementations: fetch_page and parse_items
    - parse_items should return a list of dicts with title, optional link, scraped_at, and hash
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger, data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []

        # TODO: implement parsing for the target site's structure; the loop below is illustrative only
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
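A hedged usage sketch for the new template: in the repository the crawlers are driven by app.runner (via the back-compat enhanced_crawler.py), and the config object below is only a placeholder.

# Usage sketch only; substitute the project's real config object from app.config.
import logging

from app.crawlers.template import TemplateCrawler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("template")

config = {}  # placeholder config; the real shape depends on app/config

crawler = TemplateCrawler(config=config, logger=logger)
html = crawler.fetch_page()
if html:
    for item in crawler.parse_items(html):
        logger.info("%s -> %s", item['title'], item.get('link'))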