- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules - Add BaseCrawler and BarronsCrawler; extract notifications and storage - Keep enhanced_crawler.py as back-compat entry delegating to app.runner - Add template crawler for future sites - Update README with new structure and usage - Extend .env.template with DATA_DIR/LOG_DIR options
54 lines
1.8 KiB
Python
54 lines
1.8 KiB
Python
from __future__ import annotations
|
||
|
||
from typing import List, Dict, Optional
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from datetime import datetime
|
||
import hashlib
|
||
|
||
from app.crawlers.base import BaseCrawler
|
||
|
||
|
||
class TemplateCrawler(BaseCrawler):
    """Template crawler: copy this file and rename it when adding a new site.

    Subclasses created from this template must implement ``fetch_page`` and
    ``parse_items``. ``parse_items`` should return a list of dicts holding
    ``title``, an optional ``link``, ``scraped_at`` and ``hash``.
    """

    def __init__(self, config, logger):
        """Set up the crawler with a placeholder URL and browser-like headers.

        ``config`` and ``logger`` are forwarded unchanged to ``BaseCrawler``.
        """
        super().__init__(
            name="Template Site",
            config=config,
            logger=logger,
            data_filename='template_site.json',
        )
        self.url = "https://example.com"
        # Desktop Chrome user agent sent with every request.
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        )
        self.headers = {'User-Agent': user_agent}

    def fetch_page(self) -> Optional[str]:
        """Download ``self.url`` and return the response body as text.

        On any ``requests.RequestException`` (including a non-2xx status
        raised by ``raise_for_status``) the error is logged, the ``'errors'``
        counter in ``self.stats`` is incremented, and ``None`` is returned.
        """
        try:
            response = requests.get(self.url, headers=self.headers, timeout=30)
            response.raise_for_status()
            page_text = response.text
        except requests.RequestException as exc:
            self.logger.error(f"獲取網頁失敗: {exc}")
            # NOTE(review): self.stats is presumably initialised by
            # BaseCrawler with an 'errors' key -- confirm against the base.
            self.stats['errors'] += 1
            return None
        return page_text

    def parse_items(self, html_content: str) -> List[Dict]:
        """Extract item dicts from the page HTML.

        Illustrative implementation: inspects the first five ``<a>`` elements
        and keeps those whose stripped text is longer than five characters.
        Each kept item carries the anchor text, its raw ``href`` (may be
        ``None``), the current local timestamp in ISO format, and the first
        eight hex characters of the MD5 digest of the title.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        parsed: List[Dict] = []

        # TODO: implement the parsing logic for the target site's structure;
        # the code below is only an illustration.
        first_anchors = soup.select('a')[:5]
        for anchor in first_anchors:
            text = anchor.get_text(strip=True)
            if not text or len(text) <= 5:
                continue
            short_digest = hashlib.md5(text.encode()).hexdigest()[:8]
            parsed.append({
                'title': text,
                'link': anchor.get('href'),
                'scraped_at': datetime.now().isoformat(),
                'hash': short_digest,
            })
        return parsed
|
||
|