refactor: modularize project structure and separate API from crawlers
- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as a back-compat entry point delegating to app.runner
- Add a template crawler for future sites
- Update README with the new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
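The template crawler below only implements fetch_page and parse_items; everything else (stats, logging, the data file name) comes from app/crawlers/base.py, which is not part of this diff. A minimal sketch of the contract the template appears to rely on follows; the attribute names and the run() helper are assumptions, not the actual base class:

# Hypothetical sketch only; the real app/crawlers/base.py in this commit may differ.
from abc import ABC, abstractmethod
from typing import Dict, List, Optional


class BaseCrawler(ABC):
    """Shared plumbing assumed by concrete crawlers: name, config, logger, stats."""

    def __init__(self, name: str, config, logger, data_filename: str):
        self.name = name
        self.config = config
        self.logger = logger
        self.data_filename = data_filename  # where app.services.storage would persist results
        self.stats: Dict[str, int] = {'errors': 0, 'new_items': 0}

    @abstractmethod
    def fetch_page(self) -> Optional[str]:
        """Return the raw HTML for the site, or None on failure."""

    @abstractmethod
    def parse_items(self, html_content: str) -> List[Dict]:
        """Return dicts with title, optional link, scraped_at, and hash."""

    def run(self) -> List[Dict]:
        # Assumed orchestration: fetch, then parse; storage and notifications
        # would be handled by the services extracted in this commit.
        html = self.fetch_page()
        if html is None:
            return []
        items = self.parse_items(html)
        self.stats['new_items'] += len(items)
        return items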
app/crawlers/template.py (new file, 53 lines)
@@ -0,0 +1,53 @@
from __future__ import annotations

from typing import List, Dict, Optional

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a new site.

    Required implementations: fetch_page and parse_items
    - parse_items should return a list of dicts with title, optional link, scraped_at, and hash
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger, data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []

        # TODO: implement parsing for the target site's structure; the loop below is illustrative only
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
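A hedged usage sketch for the new template: in the repository the crawlers are driven by app.runner (via the back-compat enhanced_crawler.py), and the config object below is only a placeholder.

# Usage sketch only; substitute the project's real config object from app.config.
import logging

from app.crawlers.template import TemplateCrawler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("template")

config = {}  # placeholder config; the real shape depends on app/config

crawler = TemplateCrawler(config=config, logger=logger)
html = crawler.fetch_page()
if html:
    for item in crawler.parse_items(html):
        logger.info("%s -> %s", item['title'], item.get('link'))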