refactor: modularize project structure and separate API from crawlers

- Introduce app/ package with config, services (storage, notifications), API server, and crawler modules
- Add BaseCrawler and BarronsCrawler; extract notifications and storage
- Keep enhanced_crawler.py as back-compat entry delegating to app.runner
- Add template crawler for future sites
- Update README with new structure and usage
- Extend .env.template with DATA_DIR/LOG_DIR options
This commit is contained in:
2025-09-04 21:39:24 +08:00
parent 099f156e6f
commit 58cc979b5b
12 changed files with 663 additions and 666 deletions

96
app/config.py Normal file
View File

@@ -0,0 +1,96 @@
import logging
import os
from dataclasses import dataclass, field
@dataclass
class EmailConfig:
    """SMTP connection and addressing settings for outgoing e-mail.

    Instances are populated from the ``EMAIL_*`` environment variables by
    ``load_email_config``. The positional field order is part of the
    constructor interface and must not be changed.
    """

    smtp_server: str
    smtp_port: int
    smtp_security: str  # 'ssl' | 'starttls' | 'none'
    from_email: str
    to_email: str
    username: str
    # Kept out of the generated repr so the secret does not end up in logs or
    # tracebacks when the config object is printed. Equality still compares it.
    password: str = field(repr=False)
@dataclass
class AppConfig:
    """Top-level runtime settings, assembled by ``load_config``.

    Every field is filled from an environment variable (or a derived default);
    the positional order below is part of the constructor interface.
    """

    # Value of CHECK_INTERVAL (presumably seconds between checks; confirm
    # against the code that consumes it).
    check_interval: int
    # Logging level name, e.g. 'INFO'.
    log_level: str
    # Parsed from ALWAYS_NOTIFY_ON_STARTUP.
    always_notify_on_startup: bool
    # Optional notification targets; None when the variable is not set.
    webhook_url: str | None
    discord_webhook: str | None
    # Directories resolved from DATA_DIR / LOG_DIR (or their fallbacks).
    data_dir: str
    log_dir: str
    # None when the required EMAIL_* variables are not all present.
    email: EmailConfig | None
def _resolve_dir(env_key: str, default_subdir: str) -> str:
# Prefer explicit env var
val = os.getenv(env_key)
if val:
return val
# Prefer Docker paths if present
docker_path = f"/app/{default_subdir}"
if os.path.isdir(docker_path):
return docker_path
# Fallback to local ./subdir
return os.path.join(os.getcwd(), default_subdir)
def load_email_config() -> EmailConfig | None:
    """Build the e-mail settings from the ``EMAIL_*`` environment variables.

    Returns:
        An ``EmailConfig`` when EMAIL_SMTP_SERVER, EMAIL_FROM, EMAIL_TO,
        EMAIL_USERNAME and EMAIL_PASSWORD are all set to non-empty values,
        otherwise ``None``.

    Raises:
        ValueError: If EMAIL_SMTP_PORT is set to a non-blank value that is
            not an integer.
    """
    required = (
        'EMAIL_SMTP_SERVER', 'EMAIL_FROM', 'EMAIL_TO', 'EMAIL_USERNAME', 'EMAIL_PASSWORD',
    )
    if not all(os.getenv(key) for key in required):
        return None

    # A variable that is present but blank (common when a .env template is
    # only partly filled in) is treated the same as an unset one.
    security = os.getenv('EMAIL_SMTP_SECURITY', '').strip().lower() or 'starttls'

    # Conventional port for each security mode; anything else falls back to 25.
    default_port = 465 if security == 'ssl' else 587 if security == 'starttls' else 25

    raw_port = os.getenv('EMAIL_SMTP_PORT', '').strip()
    if raw_port:
        try:
            smtp_port = int(raw_port)
        except ValueError as err:
            raise ValueError(
                f"EMAIL_SMTP_PORT must be an integer, got {raw_port!r}"
            ) from err
    else:
        smtp_port = default_port

    return EmailConfig(
        smtp_server=os.getenv('EMAIL_SMTP_SERVER', ''),
        smtp_port=smtp_port,
        smtp_security=security,
        from_email=os.getenv('EMAIL_FROM', ''),
        to_email=os.getenv('EMAIL_TO', ''),
        username=os.getenv('EMAIL_USERNAME', ''),
        password=os.getenv('EMAIL_PASSWORD', ''),
    )
def setup_logging(level: str, log_dir: str) -> logging.Logger:
    """Configure root logging to a file and the console.

    Creates *log_dir* if needed and installs two handlers through
    ``logging.basicConfig``: a file handler writing ``crawler.log`` inside
    *log_dir* and a stream handler. Note that ``basicConfig`` is a no-op when
    the root logger already has handlers.

    Args:
        level: Level name such as ``'INFO'`` or ``'debug'`` (case-insensitive).
            Unknown names fall back to ``logging.INFO``.
        log_dir: Directory that receives ``crawler.log``.

    Returns:
        The logger named after this module.
    """
    os.makedirs(log_dir, exist_ok=True)

    # getattr alone would also accept non-level attributes of the logging
    # module (e.g. BASIC_FORMAT, a string), which basicConfig rejects; only
    # integer level constants are allowed through.
    numeric_level = getattr(logging, level.strip().upper(), None)
    if not isinstance(numeric_level, int):
        numeric_level = logging.INFO

    logging.basicConfig(
        level=numeric_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Explicit UTF-8 so non-ASCII log messages are written correctly
            # regardless of the platform's default encoding.
            logging.FileHandler(os.path.join(log_dir, 'crawler.log'), encoding='utf-8'),
            logging.StreamHandler(),
        ],
    )
    return logging.getLogger(__name__)
def load_config() -> AppConfig:
    """Assemble the application configuration from environment variables.

    Variables read here: CHECK_INTERVAL (default 300), LOG_LEVEL (default
    ``'INFO'``), ALWAYS_NOTIFY_ON_STARTUP (true for ``1``/``true``/``yes``,
    case-insensitive), WEBHOOK_URL and DISCORD_WEBHOOK (``None`` when unset or
    blank), plus DATA_DIR / LOG_DIR via ``_resolve_dir`` and the ``EMAIL_*``
    group via ``load_email_config``.

    A variable that is present but blank is treated like an unset one, so a
    partly filled-in .env file does not crash start-up.

    Raises:
        ValueError: If CHECK_INTERVAL is set to a non-blank value that is not
            an integer.
    """
    raw_interval = os.getenv('CHECK_INTERVAL', '').strip()
    if raw_interval:
        try:
            check_interval = int(raw_interval)
        except ValueError as err:
            raise ValueError(
                f"CHECK_INTERVAL must be an integer, got {raw_interval!r}"
            ) from err
    else:
        check_interval = 300

    log_level = os.getenv('LOG_LEVEL', '').strip() or 'INFO'

    startup_flag = os.getenv('ALWAYS_NOTIFY_ON_STARTUP', 'false').strip().lower()
    always_notify_on_startup = startup_flag in ('1', 'true', 'yes')

    # Normalise blank values to None so the fields honour their `str | None`
    # contract and callers never see an empty URL.
    webhook_url = os.getenv('WEBHOOK_URL', '').strip() or None
    discord_webhook = os.getenv('DISCORD_WEBHOOK', '').strip() or None

    return AppConfig(
        check_interval=check_interval,
        log_level=log_level,
        always_notify_on_startup=always_notify_on_startup,
        webhook_url=webhook_url,
        discord_webhook=discord_webhook,
        data_dir=_resolve_dir('DATA_DIR', 'data'),
        log_dir=_resolve_dir('LOG_DIR', 'logs'),
        email=load_email_config(),
    )