diff --git a/lnp_ml/modeling/benchmark.py b/lnp_ml/modeling/benchmark.py index 2c6be78..42f2ff5 100644 --- a/lnp_ml/modeling/benchmark.py +++ b/lnp_ml/modeling/benchmark.py @@ -1,6 +1,7 @@ """Benchmark 脚本:在 baseline 论文公开的 CV 划分上评估模型(仅 delivery 任务)""" import json +import math from pathlib import Path from typing import Dict, List, Optional @@ -9,6 +10,7 @@ import pandas as pd import torch import torch.nn as nn from torch.utils.data import DataLoader +from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR, SequentialLR from loguru import logger from tqdm import tqdm from sklearn.metrics import mean_squared_error, r2_score @@ -158,6 +160,7 @@ def train_fold( weight_decay: float = 1e-5, epochs: int = 50, patience: int = 10, + warmup_epochs: int = 3, config: Optional[Dict] = None, ) -> Dict: """训练单个 fold""" @@ -166,9 +169,19 @@ def train_fold( logger.info(f"{'='*60}") optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay) - scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( - optimizer, mode="min", factor=0.5, patience=5 + + warmup_scheduler = LambdaLR( + optimizer, lr_lambda=lambda epoch: (epoch + 1) / warmup_epochs ) + cosine_scheduler = CosineAnnealingLR( + optimizer, T_max=epochs - warmup_epochs + ) + scheduler = SequentialLR( + optimizer, + schedulers=[warmup_scheduler, cosine_scheduler], + milestones=[warmup_epochs], + ) + early_stopping = EarlyStopping(patience=patience) best_val_loss = float("inf") @@ -198,7 +211,7 @@ def train_fold( "lr": current_lr, }) - scheduler.step(val_metrics["loss"]) + scheduler.step() if val_metrics["loss"] < best_val_loss: best_val_loss = val_metrics["loss"] @@ -284,11 +297,11 @@ def main( data_dir: Path = PROCESSED_DATA_DIR / "benchmark", output_dir: Path = MODELS_DIR / "benchmark", # 模型参数 - d_model: int = 128, - num_heads: int = 4, - n_attn_layers: int = 2, + d_model: int = 256, + num_heads: int = 8, + n_attn_layers: int = 4, fusion_strategy: str = "attention", - head_hidden_dim: int = 64, + head_hidden_dim: int = 128, dropout: float = 0.1, # MPNN 参数 use_mpnn: bool = False, @@ -416,7 +429,9 @@ def main( model.rdkit_encoder._cache = rdkit_cache logger.info(f"Reusing RDKit cache with {len(rdkit_cache)} entries") - logger.info(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}") + n_params_total = sum(p.numel() for p in model.parameters()) + n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f"Model parameters: {n_params_total:,} total, {n_params_trainable:,} trainable") # 训练 result = train_fold( diff --git a/lnp_ml/modeling/pretrain.py b/lnp_ml/modeling/pretrain.py index abb702d..b7d9d66 100644 --- a/lnp_ml/modeling/pretrain.py +++ b/lnp_ml/modeling/pretrain.py @@ -303,8 +303,9 @@ def main( dropout=dropout, ) - n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - logger.info(f"Model parameters: {n_params:,}") + n_params_total = sum(p.numel() for p in model.parameters()) + n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + logger.info(f"Model parameters: {n_params_total:,} total, {n_params_trainable:,} trainable") # 预热 RDKit 缓存(避免训练时阻塞) all_smiles = train_df["smiles"].tolist() + val_df["smiles"].tolist()