from __future__ import annotations

from typing import List, Dict, Optional
import hashlib
from datetime import datetime

import requests
from bs4 import BeautifulSoup

from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a new site.

    Required implementations: fetch_page and parse_items.
    - parse_items must return a list of dicts containing title,
      an optional link, scraped_at, and hash.
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger,
                         data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        # Fetch the raw HTML; return None (and count the error) on any request failure.
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []
        # TODO: implement parsing logic for the target site's structure; the loop below is illustrative only
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
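
# --- Usage sketch (not part of the required interface) ---
# A minimal, hedged example of exercising the two required methods directly.
# Assumptions: a plain dict is an acceptable `config` for BaseCrawler, and a
# standard logging.Logger satisfies the `logger` parameter; adjust this to
# match the real constructor signature in app.crawlers.base.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.INFO)
    crawler = TemplateCrawler(config={}, logger=logging.getLogger("template"))

    html = crawler.fetch_page()
    if html is not None:
        for item in crawler.parse_items(html):
            print(item)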