stock-info-crawler/app/crawlers/template.py
MH Hung 58cc979b5b refactor: modularize project structure and separate API from crawlers
- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
2025-09-04 21:39:24 +08:00
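
The commit message above implies roughly the following layout; filenames not mentioned explicitly (config.py, barrons.py, the api/ and services/ module names) are assumptions for illustration:

stock-info-crawler/
├── enhanced_crawler.py      # back-compat entry delegating to app.runner
├── .env.template            # extended with DATA_DIR / LOG_DIR options
├── README.md                # updated with the new structure and usage
└── app/
    ├── config.py            # assumed filename for the config module
    ├── api/                 # API server
    ├── services/            # storage and notifications
    └── crawlers/
        ├── base.py          # BaseCrawler (imported below as app.crawlers.base)
        ├── barrons.py       # assumed filename for BarronsCrawler
        └── template.py      # this file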

54 lines · 1.8 KiB · Python

from __future__ import annotations
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib
from app.crawlers.base import BaseCrawler


class TemplateCrawler(BaseCrawler):
    """Template: copy this file and rename it when adding a crawler for a new site.

    Required overrides: fetch_page and parse_items.
    - parse_items should return a list of dicts containing title, an optional
      link, scraped_at, and hash.
    """

    def __init__(self, config, logger):
        super().__init__(name="Template Site", config=config, logger=logger, data_filename='template_site.json')
        self.url = "https://example.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self) -> Optional[str]:
        try:
            resp = requests.get(self.url, headers=self.headers, timeout=30)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException as e:
            self.logger.error(f"Failed to fetch page: {e}")
            self.stats['errors'] += 1
            return None

    def parse_items(self, html_content: str) -> List[Dict]:
        soup = BeautifulSoup(html_content, 'html.parser')
        items: List[Dict] = []
        # TODO: implement parsing logic for the target site's structure;
        # the loop below is only an illustration.
        for a in soup.select('a')[:5]:
            title = a.get_text(strip=True)
            link = a.get('href')
            if title and len(title) > 5:
                items.append({
                    'title': title,
                    'link': link,
                    'scraped_at': datetime.now().isoformat(),
                    'hash': hashlib.md5(title.encode()).hexdigest()[:8],
                })
        return items
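
A minimal usage sketch for trying the template on its own (hypothetical wiring: the real entry point is app.runner, and the exact config shape BaseCrawler expects is not shown in this file; only fetch_page and parse_items are taken from the class above):

# Hypothetical standalone run of the template crawler.
import logging

from app.crawlers.template import TemplateCrawler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("template_site")
config = {}  # assumption: BaseCrawler tolerates a minimal config for a quick test

crawler = TemplateCrawler(config=config, logger=logger)
html = crawler.fetch_page()
if html is not None:
    for item in crawler.parse_items(html):
        print(item['title'], item.get('link'))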