refactor: modularize project structure and separate API from crawlers

- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as a back-compat entry point delegating to app.runner (a sketch follows this list)
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
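
The back-compat shim can be as small as a delegation import. A minimal sketch, assuming app.runner exposes a main() entry point (the actual name is not shown in this commit):

# enhanced_crawler.py: back-compat entry delegating to app.runner.
# Sketch only; the real delegation target in app.runner is assumed to be main().
from app.runner import main

if __name__ == "__main__":
    main()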
2025-09-04 21:39:24 +08:00
parent 099f156e6f
commit 58cc979b5b
12 changed files with 663 additions and 666 deletions

app/crawlers/template.py (new file)

@@ -0,0 +1,53 @@
from __future__ import annotations
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a new site.

    Required implementations: fetch_page and parse_items.
    - parse_items should return a list of dicts containing title,
      an optional link, scraped_at, and hash.
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger,
                         data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []
        # TODO: implement parsing for the target site's structure;
        # the loop below is illustrative only.
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    # Short stable fingerprint: first 8 hex chars of the title's MD5.
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
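
For orientation, a hypothetical standalone run of the template. BaseCrawler's own orchestration API is not shown in this diff, so the sketch exercises only the two methods above, and the config shape is an assumption:

# Hypothetical usage sketch; in the real project the crawler is driven via app.runner.
import logging

from app.crawlers.template import TemplateCrawler

logger = logging.getLogger('template_site')
crawler = TemplateCrawler(config={}, logger=logger)  # config shape is an assumption

html = crawler.fetch_page()
if html is not None:
    for item in crawler.parse_items(html):
        print(item['title'], item.get('link'))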