mirror of
https://github.com/RYDE-WORK/lnp_ml.git
synced 2026-03-23 02:35:59 +08:00
Compare commits
No commits in common. "3a45c0641c46e9f45b4a20458bb6fce40df4c626" and "c7c33e3f489a49b20bc760aee85e6e62e0b2552f" have entirely different histories.
3a45c0641c
...
c7c33e3f48
@ -67,7 +67,7 @@ logs/
|
||||
# Models (will be mounted as volume or copied explicitly)
|
||||
# Note: models/final/ is copied in Dockerfile
|
||||
models/finetune_cv/
|
||||
models/benchmark/
|
||||
models/pretrain_cv/
|
||||
models/mpnn/
|
||||
models/*.pt
|
||||
models/*.json
|
||||
|
||||
147
Makefile
147
Makefile
@ -7,8 +7,10 @@ PYTHON_VERSION = 3.8
|
||||
PYTHON_INTERPRETER = python
|
||||
|
||||
# --- CLI flag 变量 ---
|
||||
MPNN_FLAG = $(if $(NO_MPNN),,--use-mpnn)
|
||||
MPNN_FLAG = $(if $(USE_MPNN),--use-mpnn,)
|
||||
FREEZE_FLAG = $(if $(FREEZE_BACKBONE),--freeze-backbone,)
|
||||
DEVICE_FLAG = $(if $(DEVICE),--device $(DEVICE),)
|
||||
SCAFFOLD_SPLIT_FLAG = $(if $(filter 1,$(SCAFFOLD_SPLIT)),--scaffold-split,)
|
||||
SEED_FLAG = $(if $(SEED),--seed $(SEED),)
|
||||
N_TRIALS_FLAG = $(if $(N_TRIALS),--n-trials $(N_TRIALS),)
|
||||
EPOCHS_PER_TRIAL_FLAG = $(if $(EPOCHS_PER_TRIAL),--epochs-per-trial $(EPOCHS_PER_TRIAL),)
|
||||
@ -59,57 +61,136 @@ format:
|
||||
preprocess: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/preprocess_internal.py
|
||||
|
||||
## Process dataset (interim -> processed)
|
||||
.PHONY: data
|
||||
data: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_data.py
|
||||
|
||||
## Process dataset for final training (interim -> processed/final, train:val=9:1, no test)
|
||||
.PHONY: data_final
|
||||
data_final: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_data_final.py
|
||||
|
||||
## Process external data for pretrain (external -> processed)
|
||||
.PHONY: data_pretrain
|
||||
data_pretrain: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_external.py
|
||||
|
||||
## Process baseline CV data for benchmark (external/all_amine_split_for_LiON -> processed/benchmark)
|
||||
.PHONY: data_benchmark
|
||||
data_benchmark: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_benchmark_data.py
|
||||
## Process CV data for cross-validation pretrain (external/all_amine_split_for_LiON -> processed/cv)
|
||||
.PHONY: data_pretrain_cv
|
||||
data_pretrain_cv: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_external_cv.py
|
||||
|
||||
## Process internal data with CV splitting (interim -> processed/cv)
|
||||
## Use SCAFFOLD_SPLIT=1 to enable amine-based scaffold splitting (default: random shuffle)
|
||||
.PHONY: data_cv
|
||||
data_cv: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_data_cv.py $(SCAFFOLD_SPLIT_FLAG)
|
||||
|
||||
#################################################################################
|
||||
# BENCHMARKING #
|
||||
# TRAINING #
|
||||
#################################################################################
|
||||
|
||||
## Benchmark on baseline CV data: 5-fold train + test (delivery only)
|
||||
.PHONY: benchmark
|
||||
benchmark: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.benchmark main $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.benchmark test $(DEVICE_FLAG)
|
||||
|
||||
#################################################################################
|
||||
# TRAINING (Nested CV + Optuna) #
|
||||
#################################################################################
|
||||
# 通用参数:
|
||||
# SEED 随机种子 (默认: 42)
|
||||
# N_TRIALS Optuna 试验数 (默认: 20)
|
||||
# EPOCHS_PER_TRIAL 每个试验的最大 epoch (默认: 30)
|
||||
# MIN_STRATUM_COUNT 复合分层标签的最小样本数 (默认: 5)
|
||||
# OUTPUT_DIR 输出目录 (根据命令有不同默认值)
|
||||
# INIT_PRETRAIN 预训练权重路径 (默认: models/pretrain_delivery.pt)
|
||||
# NO_PRETRAIN=1 禁用预训练权重
|
||||
# USE_SWA=1 启用 SWA (final train 阶段)
|
||||
#
|
||||
# 使用示例:
|
||||
# make pretrain
|
||||
# make train DEVICE=cuda N_TRIALS=30 USE_SWA=1 INIT_PRETRAIN=models/pretrain_delivery.pt
|
||||
|
||||
|
||||
## Pretrain on external data (delivery only)
|
||||
.PHONY: pretrain
|
||||
pretrain: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain main $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Train: nested CV evaluation + final model training
|
||||
## Step 1: 外层 5-fold 产生无偏性能估计,内层 3-fold 做超参搜索
|
||||
## Step 2: 3-fold 调参后用全量数据训练最终模型
|
||||
## Pretrain with cross-validation (5-fold)
|
||||
.PHONY: pretrain_cv
|
||||
pretrain_cv: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain_cv main $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Train model (multi-task, from scratch)
|
||||
.PHONY: train
|
||||
train: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Finetune from pretrained checkpoint (use FREEZE_BACKBONE=1 to freeze backbone)
|
||||
.PHONY: finetune
|
||||
finetune: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train --init-from-pretrain models/pretrain_delivery.pt $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Final training using all data (train:val=9:1, no test set), with pretrained weights
|
||||
.PHONY: train_final
|
||||
train_final: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train \
|
||||
--train-path data/processed/final/train.parquet \
|
||||
--val-path data/processed/final/val.parquet \
|
||||
--output-dir models/final \
|
||||
--init-from-pretrain models/pretrain_delivery.pt \
|
||||
$(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Train with cross-validation on internal data only (5-fold, amine-based split)
|
||||
.PHONY: train_cv
|
||||
train_cv: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train_cv main $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Finetune with cross-validation on internal data (5-fold) with pretrained weights
|
||||
.PHONY: finetune_cv
|
||||
finetune_cv: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train_cv main --init-from-pretrain models/pretrain_delivery.pt $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
#################################################################################
|
||||
# EVALUATION #
|
||||
#################################################################################
|
||||
|
||||
## Evaluate pretrain model (delivery metrics)
|
||||
.PHONY: test_pretrain
|
||||
test_pretrain: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain test $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Evaluate CV pretrain models on test sets (auto-detects MPNN from checkpoint)
|
||||
.PHONY: test_pretrain_cv
|
||||
test_pretrain_cv: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain_cv test $(DEVICE_FLAG)
|
||||
|
||||
## Evaluate CV finetuned models on test sets (auto-detects MPNN from checkpoint)
|
||||
.PHONY: test_cv
|
||||
test_cv: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train_cv test $(DEVICE_FLAG)
|
||||
|
||||
## Test model on test set (with detailed metrics, auto-detects MPNN from checkpoint)
|
||||
.PHONY: test
|
||||
test: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.predict test $(DEVICE_FLAG)
|
||||
|
||||
## Run predictions
|
||||
.PHONY: predict
|
||||
predict: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.predict $(DEVICE_FLAG)
|
||||
|
||||
#################################################################################
|
||||
# HYPERPARAMETER TUNING #
|
||||
#################################################################################
|
||||
# 通用参数:
|
||||
# SEED 随机种子 (默认: 42)
|
||||
# N_TRIALS Optuna 试验数 (默认: 20)
|
||||
# EPOCHS_PER_TRIAL 每个试验的最大 epoch (默认: 30)
|
||||
# MIN_STRATUM_COUNT 复合分层标签的最小样本数 (默认: 5)
|
||||
# OUTPUT_DIR 输出目录 (根据命令有不同默认值)
|
||||
# INIT_PRETRAIN 预训练权重路径 (默认: models/pretrain_delivery.pt)
|
||||
# NO_PRETRAIN=1 禁用预训练权重
|
||||
|
||||
## Train with hyperparameter tuning
|
||||
.PHONY: tune
|
||||
tune: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train --tune $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
## Nested CV with Optuna: outer 5-fold (test) + inner 3-fold (tune)
|
||||
## 用于模型评估:外层 5-fold 产生无偏性能估计,内层 3-fold 做超参搜索
|
||||
## 使用示例: make nested_cv_tune DEVICE=cuda N_TRIALS=30
|
||||
.PHONY: nested_cv_tune
|
||||
nested_cv_tune: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.nested_cv_optuna \
|
||||
$(DEVICE_FLAG) $(MPNN_FLAG) $(SEED_FLAG) $(INIT_PRETRAIN_FLAG) \
|
||||
$(N_TRIALS_FLAG) $(EPOCHS_PER_TRIAL_FLAG) $(MIN_STRATUM_FLAG) $(OUTPUT_DIR_FLAG)
|
||||
|
||||
## Final training with Optuna: 3-fold CV tune + full data train
|
||||
## 用于最终模型训练:3-fold 调参后用全量数据训练(无 early-stop)
|
||||
## 使用示例: make final_optuna DEVICE=cuda N_TRIALS=30 USE_SWA=1
|
||||
.PHONY: final_optuna
|
||||
final_optuna: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.final_train_optuna_cv \
|
||||
$(DEVICE_FLAG) $(MPNN_FLAG) $(SEED_FLAG) $(INIT_PRETRAIN_FLAG) \
|
||||
$(N_TRIALS_FLAG) $(EPOCHS_PER_TRIAL_FLAG) $(MIN_STRATUM_FLAG) $(OUTPUT_DIR_FLAG) $(USE_SWA_FLAG)
|
||||
|
||||
@ -1,55 +0,0 @@
|
||||
smiles
|
||||
Cationic_Lipid_to_mRNA_weight_ratio
|
||||
Cationic_Lipid_Mol_Ratio
|
||||
Phospholipid_Mol_Ratio
|
||||
Cholesterol_Mol_Ratio
|
||||
PEG_Lipid_Mol_Ratio
|
||||
Purity_Pure
|
||||
Purity_Crude
|
||||
Mix_type_Microfluidic
|
||||
Mix_type_Pipetting
|
||||
Cargo_type_mRNA
|
||||
Cargo_type_pDNA
|
||||
Cargo_type_siRNA
|
||||
Target_or_delivered_gene_FFL
|
||||
Target_or_delivered_gene_Peptide_barcode
|
||||
Target_or_delivered_gene_hEPO
|
||||
Target_or_delivered_gene_FVII
|
||||
Target_or_delivered_gene_GFP
|
||||
Helper_lipid_ID_DOPE
|
||||
Helper_lipid_ID_DOTAP
|
||||
Helper_lipid_ID_DSPC
|
||||
Helper_lipid_ID_MDOA
|
||||
Model_type_A549
|
||||
Model_type_BDMC
|
||||
Model_type_BMDM
|
||||
Model_type_HBEC_ALI
|
||||
Model_type_HEK293T
|
||||
Model_type_HeLa
|
||||
Model_type_IGROV1
|
||||
Model_type_Mouse
|
||||
Model_type_RAW264p7
|
||||
Delivery_target_body
|
||||
Delivery_target_dendritic_cell
|
||||
Delivery_target_generic_cell
|
||||
Delivery_target_liver
|
||||
Delivery_target_lung
|
||||
Delivery_target_lung_epithelium
|
||||
Delivery_target_macrophage
|
||||
Delivery_target_muscle
|
||||
Delivery_target_spleen
|
||||
Route_of_administration_in_vitro
|
||||
Route_of_administration_intramuscular
|
||||
Route_of_administration_intratracheal
|
||||
Route_of_administration_intravenous
|
||||
Batch_or_individual_or_barcoded_Barcoded
|
||||
Batch_or_individual_or_barcoded_Individual
|
||||
Value_name_log_luminescence
|
||||
Value_name_luminescence
|
||||
Value_name_FFL_silencing
|
||||
Value_name_Peptide_abundance
|
||||
Value_name_hEPO
|
||||
Value_name_FVII_silencing
|
||||
Value_name_GFP_delivery
|
||||
Value_name_Discretized_luminescence
|
||||
quantified_delivery
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -367,7 +367,7 @@ def run_optuna_cv(
|
||||
@app.command()
|
||||
def main(
|
||||
input_path: Path = INTERIM_DATA_DIR / "internal.csv",
|
||||
output_dir: Path = MODELS_DIR / "final",
|
||||
output_dir: Path = MODELS_DIR / "final_optuna",
|
||||
# CV 参数
|
||||
n_folds: int = 3,
|
||||
min_stratum_count: int = 5,
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""Benchmark 脚本:在 baseline 论文公开的 CV 划分上评估模型(仅 delivery 任务)"""
|
||||
"""基于 Cross-Validation 的预训练脚本"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
@ -232,7 +232,7 @@ def train_fold(
|
||||
plot_loss_curves(
|
||||
history=history,
|
||||
output_path=loss_plot_path,
|
||||
title=f"Benchmark Fold {fold_idx} Loss Curves",
|
||||
title=f"Pretrain Fold {fold_idx} Loss Curves",
|
||||
)
|
||||
logger.info(f"Saved fold {fold_idx} loss curves to {loss_plot_path}")
|
||||
|
||||
@ -281,8 +281,8 @@ def create_model(
|
||||
|
||||
@app.command()
|
||||
def main(
|
||||
data_dir: Path = PROCESSED_DATA_DIR / "benchmark",
|
||||
output_dir: Path = MODELS_DIR / "benchmark",
|
||||
data_dir: Path = PROCESSED_DATA_DIR / "pretrain_cv",
|
||||
output_dir: Path = MODELS_DIR / "pretrain_cv",
|
||||
# 模型参数
|
||||
d_model: int = 256,
|
||||
num_heads: int = 8,
|
||||
@ -305,7 +305,7 @@ def main(
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
):
|
||||
"""
|
||||
在 baseline 论文公开的 5-fold CV 划分上训练模型(仅 delivery 任务)。
|
||||
基于 5-fold Cross-Validation 预训练 LNP 模型(仅 delivery 任务)。
|
||||
|
||||
每个 fold 单独训练一个模型,保存到 output_dir/fold_x/model.pt。
|
||||
使用 --use-mpnn 启用 MPNN encoder。
|
||||
@ -332,7 +332,7 @@ def main(
|
||||
|
||||
if not fold_dirs:
|
||||
logger.error(f"No fold_* directories found in {data_dir}")
|
||||
logger.info("Please run 'make data_benchmark' first to process benchmark CV data.")
|
||||
logger.info("Please run 'make data_pretrain_cv' first to process CV data.")
|
||||
raise typer.Exit(1)
|
||||
|
||||
logger.info(f"Found {len(fold_dirs)} folds: {[d.name for d in fold_dirs]}")
|
||||
@ -430,7 +430,7 @@ def main(
|
||||
|
||||
# 汇总结果
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("BENCHMARK CV TRAINING COMPLETE")
|
||||
logger.info("CROSS-VALIDATION TRAINING COMPLETE")
|
||||
logger.info("=" * 60)
|
||||
|
||||
val_losses = [r["best_val_loss"] for r in fold_results]
|
||||
@ -474,16 +474,16 @@ def main(
|
||||
|
||||
@app.command()
|
||||
def test(
|
||||
data_dir: Path = PROCESSED_DATA_DIR / "benchmark",
|
||||
model_dir: Path = MODELS_DIR / "benchmark",
|
||||
output_path: Path = MODELS_DIR / "benchmark" / "test_results.json",
|
||||
data_dir: Path = PROCESSED_DATA_DIR / "pretrain_cv",
|
||||
model_dir: Path = MODELS_DIR / "pretrain_cv",
|
||||
output_path: Path = MODELS_DIR / "pretrain_cv" / "test_results.json",
|
||||
batch_size: int = 64,
|
||||
device: str = "cuda" if torch.cuda.is_available() else "cpu",
|
||||
):
|
||||
"""
|
||||
在 baseline CV 测试集上评估 benchmark 模型。
|
||||
在测试集上评估 CV 预训练模型。
|
||||
|
||||
使用每个 fold 训练的模型在对应的测试集上评估,汇总跨 fold 结果。
|
||||
使用每个 fold 的模型在对应的测试集上评估。
|
||||
"""
|
||||
logger.info(f"Using device: {device}")
|
||||
device = torch.device(device)
|
||||
@ -609,7 +609,7 @@ def test(
|
||||
r2s = [r["r2"] for r in fold_results]
|
||||
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("BENCHMARK TEST EVALUATION RESULTS")
|
||||
logger.info("CV TEST EVALUATION RESULTS")
|
||||
logger.info("=" * 60)
|
||||
|
||||
logger.info(f"\n[Summary Statistics (across {len(fold_results)} folds)]")
|
||||
423
lnp_ml/modeling/train.py
Normal file
423
lnp_ml/modeling/train.py
Normal file
@ -0,0 +1,423 @@
|
||||
"""训练脚本:支持超参数调优"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from loguru import logger
|
||||
import typer
|
||||
|
||||
from lnp_ml.config import MODELS_DIR, PROCESSED_DATA_DIR
|
||||
from lnp_ml.dataset import LNPDataset, collate_fn
|
||||
from lnp_ml.modeling.models import LNPModel, LNPModelWithoutMPNN
|
||||
from lnp_ml.modeling.trainer import (
|
||||
train_epoch,
|
||||
validate,
|
||||
EarlyStopping,
|
||||
LossWeights,
|
||||
)
|
||||
from lnp_ml.modeling.visualization import plot_multitask_loss_curves
|
||||
|
||||
# MPNN ensemble 默认路径
|
||||
DEFAULT_MPNN_ENSEMBLE_DIR = MODELS_DIR / "mpnn" / "all_amine_split_for_LiON"
|
||||
|
||||
|
||||
def find_mpnn_ensemble_paths(base_dir: Path = DEFAULT_MPNN_ENSEMBLE_DIR) -> List[str]:
    """
    Locate the MPNN ensemble checkpoint files.

    Searches *base_dir* for every ``cv_*/fold_*/model_*/model.pt`` file and
    returns their paths as strings, sorted for a deterministic ordering.

    Raises:
        FileNotFoundError: if no checkpoint matches the expected layout.
    """
    pattern = "cv_*/fold_*/model_*/model.pt"
    checkpoints = sorted(base_dir.glob(pattern))
    if not checkpoints:
        raise FileNotFoundError(f"No model.pt files found in {base_dir}")
    return [str(ckpt) for ckpt in checkpoints]
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def create_model(
    d_model: int = 256,
    num_heads: int = 8,
    n_attn_layers: int = 4,
    fusion_strategy: str = "attention",
    head_hidden_dim: int = 128,
    dropout: float = 0.1,
    # Optional MPNN encoder settings
    mpnn_checkpoint: Optional[str] = None,
    mpnn_ensemble_paths: Optional[List[str]] = None,
    mpnn_device: str = "cpu",
) -> Union[LNPModel, LNPModelWithoutMPNN]:
    """
    Build the LNP model.

    When either *mpnn_checkpoint* or *mpnn_ensemble_paths* is supplied, the
    MPNN-backed :class:`LNPModel` is returned; otherwise the MPNN-free
    :class:`LNPModelWithoutMPNN` variant is used.
    """
    shared_kwargs = dict(
        d_model=d_model,
        num_heads=num_heads,
        n_attn_layers=n_attn_layers,
        fusion_strategy=fusion_strategy,
        head_hidden_dim=head_hidden_dim,
        dropout=dropout,
    )

    if mpnn_checkpoint is None and mpnn_ensemble_paths is None:
        # No MPNN source supplied -> fall back to the variant without MPNN.
        return LNPModelWithoutMPNN(**shared_kwargs)

    return LNPModel(
        mpnn_checkpoint=mpnn_checkpoint,
        mpnn_ensemble_paths=mpnn_ensemble_paths,
        mpnn_device=mpnn_device,
        **shared_kwargs,
    )
|
||||
|
||||
|
||||
def train_model(
    train_loader: DataLoader,
    val_loader: DataLoader,
    model: torch.nn.Module,
    device: torch.device,
    lr: float = 1e-4,
    weight_decay: float = 1e-5,
    epochs: int = 100,
    patience: int = 15,
    loss_weights: Optional[LossWeights] = None,
) -> dict:
    """
    Train *model* with AdamW, ReduceLROnPlateau scheduling and early stopping.

    The best (lowest validation loss) state dict is kept on CPU and restored
    into *model* before returning.

    Args:
        train_loader: training batches.
        val_loader: validation batches.
        model: the network to train (moved to *device* here).
        device: target device.
        lr: initial learning rate.
        weight_decay: AdamW weight decay.
        epochs: maximum number of epochs.
        patience: early-stopping patience (epochs without improvement).
        loss_weights: optional per-task loss weighting passed to
            train_epoch/validate.

    Returns:
        dict with "history" (per-epoch train/val metric dicts) and
        "best_val_loss" (float).
    """
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # NOTE: the `verbose=True` flag was dropped — it is deprecated (and later
    # removed) in recent PyTorch releases. We log the current LR ourselves,
    # matching the sibling train_cv.train_fold implementation.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )
    early_stopping = EarlyStopping(patience=patience)

    history = {"train": [], "val": []}
    best_val_loss = float("inf")
    best_state = None

    for epoch in range(epochs):
        # Train
        train_metrics = train_epoch(model, train_loader, optimizer, device, loss_weights)

        # Validate
        val_metrics = validate(model, val_loader, device, loss_weights)

        # Log progress, including the LR (replaces the scheduler's verbose output).
        current_lr = optimizer.param_groups[0]["lr"]
        logger.info(
            f"Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {train_metrics['loss']:.4f} | "
            f"Val Loss: {val_metrics['loss']:.4f} | "
            f"LR: {current_lr:.2e}"
        )

        history["train"].append(train_metrics)
        history["val"].append(val_metrics)

        # Learning rate scheduling
        scheduler.step(val_metrics["loss"])

        # Save best model (state kept on CPU to avoid holding GPU memory)
        if val_metrics["loss"] < best_val_loss:
            best_val_loss = val_metrics["loss"]
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            logger.info(f" -> New best model (val_loss={best_val_loss:.4f})")

        # Early stopping
        if early_stopping(val_metrics["loss"]):
            logger.info(f"Early stopping at epoch {epoch+1}")
            break

    # Restore best model
    if best_state is not None:
        model.load_state_dict(best_state)

    return {
        "history": history,
        "best_val_loss": best_val_loss,
    }
|
||||
|
||||
|
||||
def run_hyperparameter_tuning(
    train_loader: DataLoader,
    val_loader: DataLoader,
    device: torch.device,
    n_trials: int = 20,
    epochs_per_trial: int = 30,
) -> dict:
    """
    Search model/optimizer hyper-parameters with Optuna.

    Each trial samples an architecture and optimizer configuration, trains a
    fresh model for at most *epochs_per_trial* epochs (patience=10), and is
    scored by its best validation loss.

    Returns:
        The best trial's parameter dict.
    """
    try:
        import optuna
    except ImportError:
        logger.error("Optuna not installed. Run: pip install optuna")
        raise

    def objective(trial: optuna.Trial) -> float:
        # Sample the architecture search space (order matters for Optuna's
        # reproducibility, so it mirrors the declared parameter order).
        arch_params = {
            "d_model": trial.suggest_categorical("d_model", [128, 256, 512]),
            "num_heads": trial.suggest_categorical("num_heads", [4, 8]),
            "n_attn_layers": trial.suggest_int("n_attn_layers", 2, 6),
            "fusion_strategy": trial.suggest_categorical(
                "fusion_strategy", ["attention", "avg", "max"]
            ),
            "head_hidden_dim": trial.suggest_categorical("head_hidden_dim", [64, 128, 256]),
            "dropout": trial.suggest_float("dropout", 0.05, 0.3),
        }
        lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
        weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-4, log=True)

        # Build a fresh candidate model for this trial.
        candidate = create_model(**arch_params)

        # Short training run; score is the best validation loss reached.
        outcome = train_model(
            train_loader=train_loader,
            val_loader=val_loader,
            model=candidate,
            device=device,
            lr=lr,
            weight_decay=weight_decay,
            epochs=epochs_per_trial,
            patience=10,
        )
        return outcome["best_val_loss"]

    # Run the optimization.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    logger.info(f"Best trial: {study.best_trial.number}")
    logger.info(f"Best val_loss: {study.best_trial.value:.4f}")
    logger.info(f"Best params: {study.best_trial.params}")

    return study.best_trial.params
|
||||
|
||||
|
||||
@app.command()
def main(
    train_path: Path = PROCESSED_DATA_DIR / "train.parquet",
    val_path: Path = PROCESSED_DATA_DIR / "val.parquet",
    output_dir: Path = MODELS_DIR,
    # Model hyper-parameters
    d_model: int = 256,
    num_heads: int = 8,
    n_attn_layers: int = 4,
    fusion_strategy: str = "attention",
    head_hidden_dim: int = 128,
    dropout: float = 0.1,
    # Optional MPNN encoder settings
    use_mpnn: bool = False,  # enable MPNN; ensemble auto-loaded from the default path
    mpnn_checkpoint: Optional[str] = None,
    mpnn_ensemble_paths: Optional[str] = None,  # comma-separated list of paths
    mpnn_device: str = "cpu",
    # Training settings
    batch_size: int = 32,
    lr: float = 1e-4,
    weight_decay: float = 1e-5,
    epochs: int = 100,
    patience: int = 15,
    # Hyper-parameter tuning
    tune: bool = False,
    n_trials: int = 20,
    epochs_per_trial: int = 30,
    # Pretrained-weight loading
    init_from_pretrain: Optional[Path] = None,
    load_delivery_head: bool = True,
    freeze_backbone: bool = False,  # freeze backbone, train only the heads
    # Device
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
):
    """
    Train the LNP prediction model (multi-task finetune).

    Use --tune to run Optuna hyper-parameter tuning before the final training.
    Use --init-from-pretrain to initialise the backbone from a pretrain checkpoint.
    Use --use-mpnn to enable the MPNN encoder (auto-loaded from
    models/mpnn/all_amine_split_for_LiON).
    Use --freeze-backbone to freeze the backbone and train only the multi-task heads.
    """
    logger.info(f"Using device: {device}")
    device = torch.device(device)

    # Load data
    logger.info(f"Loading train data from {train_path}")
    train_df = pd.read_parquet(train_path)
    train_dataset = LNPDataset(train_df)
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
    )

    logger.info(f"Loading val data from {val_path}")
    val_df = pd.read_parquet(val_path)
    val_dataset = LNPDataset(val_df)
    val_loader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
    )

    logger.info(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

    output_dir.mkdir(parents=True, exist_ok=True)

    # Hyper-parameter tuning (optional first phase)
    # NOTE(review): tuning trials build models without MPNN regardless of
    # --use-mpnn — presumably intentional for speed; confirm with the authors.
    if tune:
        logger.info(f"Starting hyperparameter tuning with {n_trials} trials...")
        best_params = run_hyperparameter_tuning(
            train_loader=train_loader,
            val_loader=val_loader,
            device=device,
            n_trials=n_trials,
            epochs_per_trial=epochs_per_trial,
        )

        # Persist the best parameters for later inspection/reuse
        params_path = output_dir / "best_params.json"
        with open(params_path, "w") as f:
            json.dump(best_params, f, indent=2)
        logger.success(f"Saved best params to {params_path}")

        # Override CLI values so the full training below uses the tuned ones
        d_model = best_params["d_model"]
        num_heads = best_params["num_heads"]
        n_attn_layers = best_params["n_attn_layers"]
        fusion_strategy = best_params["fusion_strategy"]
        head_hidden_dim = best_params["head_hidden_dim"]
        dropout = best_params["dropout"]
        lr = best_params["lr"]
        weight_decay = best_params["weight_decay"]

    # Resolve the MPNN configuration.
    # Priority: mpnn_checkpoint > mpnn_ensemble_paths > use_mpnn (auto-detect)
    ensemble_paths_list = None
    if mpnn_ensemble_paths:
        ensemble_paths_list = mpnn_ensemble_paths.split(",")
    elif use_mpnn and mpnn_checkpoint is None:
        # --use-mpnn given but no explicit paths: discover the ensemble on disk
        logger.info(f"Auto-detecting MPNN ensemble from {DEFAULT_MPNN_ENSEMBLE_DIR}")
        ensemble_paths_list = find_mpnn_ensemble_paths()
        logger.info(f"Found {len(ensemble_paths_list)} MPNN models")

    enable_mpnn = mpnn_checkpoint is not None or ensemble_paths_list is not None

    # Build the model
    logger.info(f"Creating model (use_mpnn={enable_mpnn})...")
    model = create_model(
        d_model=d_model,
        num_heads=num_heads,
        n_attn_layers=n_attn_layers,
        fusion_strategy=fusion_strategy,
        head_hidden_dim=head_hidden_dim,
        dropout=dropout,
        mpnn_checkpoint=mpnn_checkpoint,
        mpnn_ensemble_paths=ensemble_paths_list,
        mpnn_device=mpnn_device,
    )

    # Load pretrained weights (if requested)
    if init_from_pretrain is not None:
        logger.info(f"Loading pretrain weights from {init_from_pretrain}")
        checkpoint = torch.load(init_from_pretrain, map_location="cpu")

        # Sanity-check that the checkpoint architecture is compatible
        pretrain_config = checkpoint.get("config", {})
        if pretrain_config.get("d_model") != d_model:
            # Incompatible width: skip loading rather than failing mid-load
            logger.warning(
                f"d_model mismatch: pretrain={pretrain_config.get('d_model')}, "
                f"current={d_model}. Skipping pretrain loading."
            )
        else:
            # Load the backbone + (optionally) the delivery head
            model.load_pretrain_weights(
                pretrain_state_dict=checkpoint["model_state_dict"],
                load_delivery_head=load_delivery_head,
                strict=False,
            )
            logger.success(
                f"Loaded pretrain weights (backbone + delivery_head={load_delivery_head})"
            )

    # Freeze the backbone (if requested) so only the task heads get gradients
    if freeze_backbone:
        logger.info("Freezing backbone (token_projector, cross_attention, fusion)...")
        frozen_count = 0
        for name, param in model.named_parameters():
            if name.startswith(("token_projector.", "cross_attention.", "fusion.")):
                param.requires_grad = False
                frozen_count += 1
        logger.info(f"Frozen {frozen_count} parameter tensors")

    # Report model size
    n_params_total = sum(p.numel() for p in model.parameters())
    n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"Model parameters: {n_params_total:,} total, {n_params_trainable:,} trainable")

    # Train
    logger.info("Starting training...")
    result = train_model(
        train_loader=train_loader,
        val_loader=val_loader,
        model=model,
        device=device,
        lr=lr,
        weight_decay=weight_decay,
        epochs=epochs,
        patience=patience,
    )

    # Save the model together with its config for later auto-detection
    model_path = output_dir / "model.pt"
    torch.save({
        "model_state_dict": model.state_dict(),
        "config": {
            "d_model": d_model,
            "num_heads": num_heads,
            "n_attn_layers": n_attn_layers,
            "fusion_strategy": fusion_strategy,
            "head_hidden_dim": head_hidden_dim,
            "dropout": dropout,
            "use_mpnn": enable_mpnn,
        },
        "best_val_loss": result["best_val_loss"],
        "init_from_pretrain": str(init_from_pretrain) if init_from_pretrain else None,
    }, model_path)
    logger.success(f"Saved model to {model_path}")

    # Save the training history
    history_path = output_dir / "history.json"
    with open(history_path, "w") as f:
        json.dump(result["history"], f, indent=2)
    logger.success(f"Saved training history to {history_path}")

    # Plot the multi-task loss curves
    loss_plot_path = output_dir / "loss_curves.png"
    plot_multitask_loss_curves(
        history=result["history"],
        output_path=loss_plot_path,
        title="Multi-task Training Loss Curves",
    )
    logger.success(f"Saved loss curves plot to {loss_plot_path}")

    logger.success(f"Training complete! Best val_loss: {result['best_val_loss']:.4f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
730
lnp_ml/modeling/train_cv.py
Normal file
730
lnp_ml/modeling/train_cv.py
Normal file
@ -0,0 +1,730 @@
|
||||
"""Cross-Validation 训练脚本:在 5-fold 内部数据上进行多任务训练"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import typer
|
||||
|
||||
from lnp_ml.config import MODELS_DIR, PROCESSED_DATA_DIR
|
||||
from lnp_ml.dataset import LNPDataset, collate_fn
|
||||
from lnp_ml.modeling.models import LNPModel, LNPModelWithoutMPNN
|
||||
from lnp_ml.modeling.trainer import (
|
||||
train_epoch,
|
||||
validate,
|
||||
EarlyStopping,
|
||||
LossWeights,
|
||||
)
|
||||
from lnp_ml.modeling.visualization import plot_multitask_loss_curves
|
||||
|
||||
|
||||
# MPNN ensemble 默认路径
|
||||
DEFAULT_MPNN_ENSEMBLE_DIR = MODELS_DIR / "mpnn" / "all_amine_split_for_LiON"
|
||||
|
||||
|
||||
def find_mpnn_ensemble_paths(base_dir: Path = DEFAULT_MPNN_ENSEMBLE_DIR) -> List[str]:
    """Return the sorted paths of all MPNN ensemble ``model.pt`` files under *base_dir*.

    Raises FileNotFoundError when no ``cv_*/fold_*/model_*/model.pt`` file exists.
    """
    found = [str(p) for p in sorted(base_dir.glob("cv_*/fold_*/model_*/model.pt"))]
    if not found:
        raise FileNotFoundError(f"No model.pt files found in {base_dir}")
    return found
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def create_model(
    d_model: int = 256,
    num_heads: int = 8,
    n_attn_layers: int = 4,
    fusion_strategy: str = "attention",
    head_hidden_dim: int = 128,
    dropout: float = 0.1,
    mpnn_checkpoint: Optional[str] = None,
    mpnn_ensemble_paths: Optional[List[str]] = None,
    mpnn_device: str = "cpu",
) -> Union[LNPModel, LNPModelWithoutMPNN]:
    """Instantiate an LNP model; the MPNN-backed variant is used iff an MPNN source is given."""
    common = {
        "d_model": d_model,
        "num_heads": num_heads,
        "n_attn_layers": n_attn_layers,
        "fusion_strategy": fusion_strategy,
        "head_hidden_dim": head_hidden_dim,
        "dropout": dropout,
    }

    wants_mpnn = (mpnn_checkpoint is not None) or (mpnn_ensemble_paths is not None)
    if wants_mpnn:
        return LNPModel(
            mpnn_checkpoint=mpnn_checkpoint,
            mpnn_ensemble_paths=mpnn_ensemble_paths,
            mpnn_device=mpnn_device,
            **common,
        )
    return LNPModelWithoutMPNN(**common)
|
||||
|
||||
|
||||
def train_fold(
    fold_idx: int,
    train_loader: DataLoader,
    val_loader: DataLoader,
    model: nn.Module,
    device: torch.device,
    output_dir: Path,
    lr: float = 1e-4,
    weight_decay: float = 1e-5,
    epochs: int = 100,
    patience: int = 15,
    loss_weights: Optional[LossWeights] = None,
    config: Optional[Dict] = None,
) -> Dict:
    """Train a single cross-validation fold.

    Runs up to ``epochs`` train/validate epochs, tracks the best validation
    loss, applies ReduceLROnPlateau scheduling and early stopping, then saves
    the best checkpoint, the per-epoch history (JSON) and a multi-task
    loss-curve plot under ``output_dir/fold_{fold_idx}``.

    Args:
        fold_idx: Fold index, used for logging and output paths.
        train_loader: DataLoader over the fold's training split.
        val_loader: DataLoader over the fold's validation split.
        model: Model to train; moved onto ``device`` here.
        device: Torch device to train on.
        output_dir: Root output directory; artifacts go in ``fold_{fold_idx}/``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        epochs: Maximum number of epochs.
        patience: Early-stopping patience in epochs.
        loss_weights: Optional per-task loss weights forwarded to
            ``train_epoch``/``validate``.
        config: Optional run configuration embedded in the saved checkpoint.

    Returns:
        Dict with ``fold_idx``, ``best_val_loss``, ``epochs_trained`` and
        ``final_train_loss``.
    """
    logger.info(f"\n{'='*60}")
    logger.info(f"Training Fold {fold_idx}")
    logger.info(f"{'='*60}")

    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # Halve the LR after 5 epochs without validation-loss improvement.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="min", factor=0.5, patience=5
    )
    early_stopping = EarlyStopping(patience=patience)

    history = {"train": [], "val": []}
    best_val_loss = float("inf")
    # Best weights are kept as detached CPU copies (see below).
    best_state = None

    for epoch in range(epochs):
        # Train
        train_metrics = train_epoch(model, train_loader, optimizer, device, loss_weights)

        # Validate
        val_metrics = validate(model, val_loader, device, loss_weights)

        current_lr = optimizer.param_groups[0]["lr"]

        # Log
        logger.info(
            f"Fold {fold_idx} Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {train_metrics['loss']:.4f} | "
            f"Val Loss: {val_metrics['loss']:.4f} | "
            f"LR: {current_lr:.2e}"
        )

        history["train"].append(train_metrics)
        history["val"].append(val_metrics)

        # Learning rate scheduling
        scheduler.step(val_metrics["loss"])

        # Save best model: clone to CPU so subsequent optimizer steps cannot
        # mutate the stored tensors in place.
        if val_metrics["loss"] < best_val_loss:
            best_val_loss = val_metrics["loss"]
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            logger.info(f" -> New best model (val_loss={best_val_loss:.4f})")

        # Early stopping
        if early_stopping(val_metrics["loss"]):
            logger.info(f"Early stopping at epoch {epoch+1}")
            break

    # Save the best checkpoint for this fold.
    fold_output_dir = output_dir / f"fold_{fold_idx}"
    fold_output_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_path = fold_output_dir / "model.pt"
    # NOTE(review): if epochs == 0, best_state is still None here and the
    # checkpoint is written with a None state dict — confirm callers never
    # pass epochs=0.
    torch.save({
        "model_state_dict": best_state,
        "config": config,
        "best_val_loss": best_val_loss,
        "fold_idx": fold_idx,
    }, checkpoint_path)
    logger.success(f"Saved fold {fold_idx} model to {checkpoint_path}")

    # Save the per-epoch training history as JSON.
    history_path = fold_output_dir / "history.json"
    with open(history_path, "w") as f:
        json.dump(history, f, indent=2)

    # Plot the multi-task loss curves for this fold.
    loss_plot_path = fold_output_dir / "loss_curves.png"
    plot_multitask_loss_curves(
        history=history,
        output_path=loss_plot_path,
        title=f"Fold {fold_idx} Multi-task Loss Curves",
    )
    logger.info(f"Saved fold {fold_idx} loss curves to {loss_plot_path}")

    return {
        "fold_idx": fold_idx,
        "best_val_loss": best_val_loss,
        "epochs_trained": len(history["train"]),
        "final_train_loss": history["train"][-1]["loss"] if history["train"] else 0,
    }
|
||||
|
||||
|
||||
@app.command()
def main(
    data_dir: Path = PROCESSED_DATA_DIR / "cv",
    output_dir: Path = MODELS_DIR / "finetune_cv",
    # Model hyperparameters
    d_model: int = 256,
    num_heads: int = 8,
    n_attn_layers: int = 4,
    fusion_strategy: str = "attention",
    head_hidden_dim: int = 128,
    dropout: float = 0.1,
    # MPNN options (optional)
    use_mpnn: bool = False,
    mpnn_checkpoint: Optional[str] = None,
    # Comma-separated list of ensemble checkpoint paths.
    mpnn_ensemble_paths: Optional[str] = None,
    # NOTE(review): mpnn_device appears unused — create_model below receives
    # device.type instead; confirm whether this flag should be forwarded.
    mpnn_device: str = "cpu",
    # Training hyperparameters
    batch_size: int = 32,
    lr: float = 1e-4,
    weight_decay: float = 1e-5,
    epochs: int = 100,
    patience: int = 15,
    # Pretrained-weight initialisation
    init_from_pretrain: Optional[Path] = None,
    load_delivery_head: bool = True,
    freeze_backbone: bool = False,
    # Device (default resolved once, at module import time)
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
):
    """Train the LNP multi-task model with cross-validation.

    Trains one model per fold on the 5-fold internal splits found under
    ``data_dir`` and writes per-fold checkpoints, histories and a summary
    ``cv_results.json`` to ``output_dir``.

    Use --use-mpnn to enable the MPNN encoder.
    Use --init-from-pretrain to initialise from a pretrain checkpoint.
    Use --freeze-backbone to freeze the backbone and train only the heads.
    """
    logger.info(f"Using device: {device}")
    device = torch.device(device)

    # Discover the fold directories produced by the CV data split.
    fold_dirs = sorted([d for d in data_dir.iterdir() if d.is_dir() and d.name.startswith("fold_")])

    if not fold_dirs:
        logger.error(f"No fold_* directories found in {data_dir}")
        logger.info("Please run 'make data_cv' first to process CV data.")
        raise typer.Exit(1)

    logger.info(f"Found {len(fold_dirs)} folds: {[d.name for d in fold_dirs]}")

    output_dir.mkdir(parents=True, exist_ok=True)

    # Resolve the MPNN configuration: an explicit ensemble list wins;
    # otherwise auto-detect the default ensemble when --use-mpnn is set
    # without a single-checkpoint path.
    ensemble_paths_list = None
    if mpnn_ensemble_paths:
        ensemble_paths_list = mpnn_ensemble_paths.split(",")
    elif use_mpnn and mpnn_checkpoint is None:
        logger.info(f"Auto-detecting MPNN ensemble from {DEFAULT_MPNN_ENSEMBLE_DIR}")
        ensemble_paths_list = find_mpnn_ensemble_paths()
        logger.info(f"Found {len(ensemble_paths_list)} MPNN models")

    # MPNN is effectively enabled by either configuration path, regardless
    # of the raw --use-mpnn flag.
    enable_mpnn = mpnn_checkpoint is not None or ensemble_paths_list is not None

    # Run configuration, persisted next to the models for reproducibility.
    config = {
        "d_model": d_model,
        "num_heads": num_heads,
        "n_attn_layers": n_attn_layers,
        "fusion_strategy": fusion_strategy,
        "head_hidden_dim": head_hidden_dim,
        "dropout": dropout,
        "use_mpnn": enable_mpnn,
        "lr": lr,
        "weight_decay": weight_decay,
        "batch_size": batch_size,
        "epochs": epochs,
        "patience": patience,
        "init_from_pretrain": str(init_from_pretrain) if init_from_pretrain else None,
        "freeze_backbone": freeze_backbone,
    }

    # Save the config
    config_path = output_dir / "config.json"
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
    logger.info(f"Saved config to {config_path}")

    # Load pretrain weights once (if requested); reused for every fold.
    pretrain_state = None
    if init_from_pretrain is not None:
        logger.info(f"Loading pretrain weights from {init_from_pretrain}")
        checkpoint = torch.load(init_from_pretrain, map_location="cpu")
        pretrain_config = checkpoint.get("config", {})
        # Only d_model compatibility is checked here; other mismatches are
        # tolerated by the strict=False load further down.
        if pretrain_config.get("d_model") != d_model:
            logger.warning(
                f"d_model mismatch: pretrain={pretrain_config.get('d_model')}, "
                f"current={d_model}. Skipping pretrain loading."
            )
        else:
            pretrain_state = checkpoint["model_state_dict"]

    # Train each fold independently.
    fold_results = []

    for fold_dir in tqdm(fold_dirs, desc="Training folds"):
        fold_idx = int(fold_dir.name.split("_")[1])

        # Load this fold's data splits.
        train_df = pd.read_parquet(fold_dir / "train.parquet")
        val_df = pd.read_parquet(fold_dir / "val.parquet")

        logger.info(f"\nFold {fold_idx}: train={len(train_df)}, val={len(val_df)}")

        # Build Dataset / DataLoader pairs.
        train_dataset = LNPDataset(train_df)
        val_dataset = LNPDataset(val_df)

        train_loader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
        )
        val_loader = DataLoader(
            val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
        )

        # Fresh model per fold (independent initialisation).
        model = create_model(
            d_model=d_model,
            num_heads=num_heads,
            n_attn_layers=n_attn_layers,
            fusion_strategy=fusion_strategy,
            head_hidden_dim=head_hidden_dim,
            dropout=dropout,
            mpnn_checkpoint=mpnn_checkpoint,
            mpnn_ensemble_paths=ensemble_paths_list,
            mpnn_device=device.type,
        )

        # Initialise from the pretrain weights, if available.
        if pretrain_state is not None:
            model.load_pretrain_weights(
                pretrain_state_dict=pretrain_state,
                load_delivery_head=load_delivery_head,
                strict=False,
            )
            logger.info(f"Loaded pretrain weights (backbone + delivery_head={load_delivery_head})")

        # Freeze backbone parameter tensors if requested; only the task
        # heads keep training.
        if freeze_backbone:
            frozen_count = 0
            for name, param in model.named_parameters():
                if name.startswith(("token_projector.", "cross_attention.", "fusion.")):
                    param.requires_grad = False
                    frozen_count += 1
            logger.info(f"Frozen {frozen_count} parameter tensors")

        # Print model size once (first fold only).
        if fold_idx == 0:
            n_params_total = sum(p.numel() for p in model.parameters())
            n_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
            logger.info(f"Model parameters: {n_params_total:,} total, {n_params_trainable:,} trainable")

        # Train this fold.
        result = train_fold(
            fold_idx=fold_idx,
            train_loader=train_loader,
            val_loader=val_loader,
            model=model,
            device=device,
            output_dir=output_dir,
            lr=lr,
            weight_decay=weight_decay,
            epochs=epochs,
            patience=patience,
            config=config,
        )
        fold_results.append(result)

    # Aggregate the per-fold results.
    logger.info("\n" + "=" * 60)
    logger.info("CROSS-VALIDATION TRAINING COMPLETE")
    logger.info("=" * 60)

    val_losses = [r["best_val_loss"] for r in fold_results]

    logger.info(f"\n[Per-Fold Results]")
    for r in fold_results:
        logger.info(
            f" Fold {r['fold_idx']}: "
            f"Val Loss={r['best_val_loss']:.4f}, "
            f"Epochs={r['epochs_trained']}"
        )

    logger.info(f"\n[Summary Statistics]")
    logger.info(f" Val Loss: {np.mean(val_losses):.4f} ± {np.std(val_losses):.4f}")

    # Persist the CV summary.
    cv_results = {
        "fold_results": fold_results,
        "summary": {
            "val_loss_mean": float(np.mean(val_losses)),
            "val_loss_std": float(np.std(val_losses)),
        },
        "config": config,
    }

    results_path = output_dir / "cv_results.json"
    with open(results_path, "w") as f:
        json.dump(cv_results, f, indent=2)
    logger.success(f"Saved CV results to {results_path}")
|
||||
|
||||
|
||||
@app.command()
def test(
    data_dir: Path = PROCESSED_DATA_DIR / "cv",
    model_dir: Path = MODELS_DIR / "finetune_cv",
    output_path: Path = MODELS_DIR / "finetune_cv" / "test_results.json",
    batch_size: int = 64,
    device: str = "cuda" if torch.cuda.is_available() else "cpu",
):
    """Evaluate the CV-trained models on their held-out test sets.

    Each fold's model is evaluated on the matching test split; per-fold
    metrics, cross-fold summary statistics and pooled overall metrics are
    written to ``output_path``.
    """
    from scipy.special import rel_entr
    from sklearn.metrics import (
        mean_squared_error,
        mean_absolute_error,
        r2_score,
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
    )

    def kl_divergence(p: np.ndarray, q: np.ndarray, eps: float = 1e-10) -> float:
        """Mean KL divergence KL(p || q) over the last axis.

        NOTE(review): inputs are clipped to [eps, 1] but not renormalised,
        so heavily-clipped rows are not exact distributions — confirm this
        is acceptable for the biodist targets.
        """
        p = np.clip(p, eps, 1.0)
        q = np.clip(q, eps, 1.0)
        return float(np.sum(rel_entr(p, q), axis=-1).mean())

    def js_divergence(p: np.ndarray, q: np.ndarray, eps: float = 1e-10) -> float:
        """Mean Jensen-Shannon divergence over the last axis (same clipping caveat)."""
        p = np.clip(p, eps, 1.0)
        q = np.clip(q, eps, 1.0)
        m = 0.5 * (p + q)
        return float(0.5 * (np.sum(rel_entr(p, m), axis=-1) + np.sum(rel_entr(q, m), axis=-1)).mean())

    logger.info(f"Using device: {device}")
    device = torch.device(device)

    # Discover the fold directories.
    fold_dirs = sorted([d for d in data_dir.iterdir() if d.is_dir() and d.name.startswith("fold_")])

    if not fold_dirs:
        logger.error(f"No fold_* directories found in {data_dir}")
        raise typer.Exit(1)

    logger.info(f"Found {len(fold_dirs)} folds")

    fold_results = []
    # Pooled predictions/targets across all folds (for overall metrics).
    all_preds = {
        "size": [], "delivery": [], "pdi": [], "ee": [], "toxic": [], "biodist": []
    }
    all_targets = {
        "size": [], "delivery": [], "pdi": [], "ee": [], "toxic": [], "biodist": []
    }

    for fold_dir in tqdm(fold_dirs, desc="Evaluating folds"):
        fold_idx = int(fold_dir.name.split("_")[1])
        model_path = model_dir / f"fold_{fold_idx}" / "model.pt"
        test_path = fold_dir / "test.parquet"

        if not model_path.exists():
            logger.warning(f"Fold {fold_idx}: model not found at {model_path}, skipping")
            continue

        if not test_path.exists():
            logger.warning(f"Fold {fold_idx}: test data not found at {test_path}, skipping")
            continue

        # Load the checkpoint and rebuild the model from its stored config.
        checkpoint = torch.load(model_path, map_location=device)
        config = checkpoint["config"]

        use_mpnn = config.get("use_mpnn", False)

        # Always re-resolve the MPNN ensemble paths (checkpoints may have
        # moved since training).
        if use_mpnn:
            mpnn_paths = find_mpnn_ensemble_paths()
        else:
            mpnn_paths = None

        model = create_model(
            d_model=config["d_model"],
            num_heads=config["num_heads"],
            n_attn_layers=config["n_attn_layers"],
            fusion_strategy=config["fusion_strategy"],
            head_hidden_dim=config["head_hidden_dim"],
            dropout=config["dropout"],
            mpnn_ensemble_paths=mpnn_paths,
            mpnn_device=device.type,
        )
        model.load_state_dict(checkpoint["model_state_dict"])
        model = model.to(device)
        model.eval()

        # Load the test data for this fold.
        test_df = pd.read_parquet(test_path)
        test_dataset = LNPDataset(test_df)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
        )

        # Collect this fold's predictions, per task.
        fold_preds = {k: [] for k in all_preds.keys()}
        fold_targets = {k: [] for k in all_targets.keys()}

        with torch.no_grad():
            pbar = tqdm(test_loader, desc=f"Fold {fold_idx} [Test]", leave=False)
            for batch in pbar:
                smiles = batch["smiles"]
                tabular = {k: v.to(device) for k, v in batch["tabular"].items()}
                targets = batch["targets"]
                masks = batch["mask"]

                outputs = model(smiles, tabular)

                # Size (regression; the mask selects labelled samples)
                if "size" in masks and masks["size"].any():
                    mask = masks["size"]
                    fold_preds["size"].extend(
                        outputs["size"].squeeze(-1)[mask].cpu().numpy().tolist()
                    )
                    fold_targets["size"].extend(
                        targets["size"][mask].cpu().numpy().tolist()
                    )

                # Delivery (regression)
                if "delivery" in masks and masks["delivery"].any():
                    mask = masks["delivery"]
                    fold_preds["delivery"].extend(
                        outputs["delivery"].squeeze(-1)[mask].cpu().numpy().tolist()
                    )
                    fold_targets["delivery"].extend(
                        targets["delivery"][mask].cpu().numpy().tolist()
                    )

                # PDI (classification; argmax over class logits)
                if "pdi" in masks and masks["pdi"].any():
                    mask = masks["pdi"]
                    pdi_preds = outputs["pdi"][mask].argmax(dim=-1).cpu().numpy()
                    pdi_targets = targets["pdi"][mask].cpu().numpy()
                    fold_preds["pdi"].extend(pdi_preds.tolist())
                    fold_targets["pdi"].extend(pdi_targets.tolist())

                # EE (classification)
                if "ee" in masks and masks["ee"].any():
                    mask = masks["ee"]
                    ee_preds = outputs["ee"][mask].argmax(dim=-1).cpu().numpy()
                    ee_targets = targets["ee"][mask].cpu().numpy()
                    fold_preds["ee"].extend(ee_preds.tolist())
                    fold_targets["ee"].extend(ee_targets.tolist())

                # Toxic (classification; targets cast to int for sklearn)
                if "toxic" in masks and masks["toxic"].any():
                    mask = masks["toxic"]
                    toxic_preds = outputs["toxic"][mask].argmax(dim=-1).cpu().numpy()
                    toxic_targets = targets["toxic"][mask].cpu().numpy().astype(int)
                    fold_preds["toxic"].extend(toxic_preds.tolist())
                    fold_targets["toxic"].extend(toxic_targets.tolist())

                # Biodist (distribution prediction)
                if "biodist" in masks and masks["biodist"].any():
                    mask = masks["biodist"]
                    biodist_preds = outputs["biodist"][mask].cpu().numpy()
                    biodist_targets = targets["biodist"][mask].cpu().numpy()
                    fold_preds["biodist"].extend(biodist_preds.tolist())
                    fold_targets["biodist"].extend(biodist_targets.tolist())

        # Compute this fold's metrics.
        fold_metrics = {"fold_idx": fold_idx, "n_samples": len(test_df)}

        # Regression-task metrics
        for task in ["size", "delivery"]:
            if fold_preds[task]:
                p = np.array(fold_preds[task])
                t = np.array(fold_targets[task])
                fold_metrics[task] = {
                    "n": len(p),
                    "rmse": float(np.sqrt(mean_squared_error(t, p))),
                    "mae": float(mean_absolute_error(t, p)),
                    "r2": float(r2_score(t, p)),
                }

        # Classification-task metrics (macro-averaged)
        for task in ["pdi", "ee", "toxic"]:
            if fold_preds[task]:
                p = np.array(fold_preds[task])
                t = np.array(fold_targets[task])
                fold_metrics[task] = {
                    "n": len(p),
                    "accuracy": float(accuracy_score(t, p)),
                    "precision": float(precision_score(t, p, average="macro", zero_division=0)),
                    "recall": float(recall_score(t, p, average="macro", zero_division=0)),
                    "f1": float(f1_score(t, p, average="macro", zero_division=0)),
                }

        # Distribution-task metrics
        if fold_preds["biodist"]:
            p = np.array(fold_preds["biodist"])
            t = np.array(fold_targets["biodist"])
            fold_metrics["biodist"] = {
                "n": len(p),
                "kl_divergence": kl_divergence(t, p),
                "js_divergence": js_divergence(t, p),
            }

        fold_results.append(fold_metrics)

        # Pool into the global accumulators.
        for task in all_preds.keys():
            all_preds[task].extend(fold_preds[task])
            all_targets[task].extend(fold_targets[task])

        # Log this fold's headline numbers.
        log_parts = [f"Fold {fold_idx}: n={len(test_df)}"]
        for task in ["delivery", "size"]:
            if task in fold_metrics and isinstance(fold_metrics[task], dict):
                log_parts.append(f"{task}_RMSE={fold_metrics[task]['rmse']:.4f}")
                log_parts.append(f"{task}_R²={fold_metrics[task]['r2']:.4f}")
        for task in ["pdi", "ee", "toxic"]:
            if task in fold_metrics and isinstance(fold_metrics[task], dict):
                log_parts.append(f"{task}_acc={fold_metrics[task]['accuracy']:.4f}")
                log_parts.append(f"{task}_f1={fold_metrics[task]['f1']:.4f}")
        if "biodist" in fold_metrics and isinstance(fold_metrics["biodist"], dict):
            log_parts.append(f"biodist_KL={fold_metrics['biodist']['kl_divergence']:.4f}")
            log_parts.append(f"biodist_JS={fold_metrics['biodist']['js_divergence']:.4f}")
        logger.info(", ".join(log_parts))

    # Cross-fold summary statistics (mean ± std over folds).
    summary_stats = {}
    for task in ["size", "delivery"]:
        rmses = [r[task]["rmse"] for r in fold_results if task in r and isinstance(r[task], dict)]
        r2s = [r[task]["r2"] for r in fold_results if task in r and isinstance(r[task], dict)]
        if rmses:
            summary_stats[task] = {
                "rmse_mean": float(np.mean(rmses)),
                "rmse_std": float(np.std(rmses)),
                "r2_mean": float(np.mean(r2s)),
                "r2_std": float(np.std(r2s)),
            }

    for task in ["pdi", "ee", "toxic"]:
        accs = [r[task]["accuracy"] for r in fold_results if task in r and isinstance(r[task], dict)]
        f1s = [r[task]["f1"] for r in fold_results if task in r and isinstance(r[task], dict)]
        if accs:
            summary_stats[task] = {
                "accuracy_mean": float(np.mean(accs)),
                "accuracy_std": float(np.std(accs)),
                "f1_mean": float(np.mean(f1s)),
                "f1_std": float(np.std(f1s)),
            }

    # Distribution-task summary
    kls = [r["biodist"]["kl_divergence"] for r in fold_results if "biodist" in r and isinstance(r["biodist"], dict)]
    jss = [r["biodist"]["js_divergence"] for r in fold_results if "biodist" in r and isinstance(r["biodist"], dict)]
    if kls:
        summary_stats["biodist"] = {
            "kl_mean": float(np.mean(kls)),
            "kl_std": float(np.std(kls)),
            "js_mean": float(np.mean(jss)),
            "js_std": float(np.std(jss)),
        }

    # Pooled overall metrics across all folds' samples.
    overall = {}
    for task in ["size", "delivery"]:
        if all_preds[task]:
            p = np.array(all_preds[task])
            t = np.array(all_targets[task])
            overall[task] = {
                "n_samples": len(p),
                "mse": float(mean_squared_error(t, p)),
                "rmse": float(np.sqrt(mean_squared_error(t, p))),
                "mae": float(mean_absolute_error(t, p)),
                "r2": float(r2_score(t, p)),
            }

    for task in ["pdi", "ee", "toxic"]:
        if all_preds[task]:
            p = np.array(all_preds[task])
            t = np.array(all_targets[task])
            overall[task] = {
                "n_samples": len(p),
                "accuracy": float(accuracy_score(t, p)),
                "precision": float(precision_score(t, p, average="macro", zero_division=0)),
                "recall": float(recall_score(t, p, average="macro", zero_division=0)),
                "f1": float(f1_score(t, p, average="macro", zero_division=0)),
            }

    # Distribution task (pooled)
    if all_preds["biodist"]:
        p = np.array(all_preds["biodist"])
        t = np.array(all_targets["biodist"])
        overall["biodist"] = {
            "n_samples": len(p),
            "kl_divergence": kl_divergence(t, p),
            "js_divergence": js_divergence(t, p),
        }

    # Report the aggregated results.
    logger.info("\n" + "=" * 60)
    logger.info("CV TEST EVALUATION RESULTS")
    logger.info("=" * 60)

    logger.info(f"\n[Summary Statistics (across {len(fold_results)} folds)]")
    for task, stats in summary_stats.items():
        if "rmse_mean" in stats:
            logger.info(f" {task}: RMSE={stats['rmse_mean']:.4f}±{stats['rmse_std']:.4f}, R²={stats['r2_mean']:.4f}±{stats['r2_std']:.4f}")
        elif "accuracy_mean" in stats:
            logger.info(f" {task}: Accuracy={stats['accuracy_mean']:.4f}±{stats['accuracy_std']:.4f}, F1={stats['f1_mean']:.4f}±{stats['f1_std']:.4f}")
        elif "kl_mean" in stats:
            logger.info(f" {task}: KL={stats['kl_mean']:.4f}±{stats['kl_std']:.4f}, JS={stats['js_mean']:.4f}±{stats['js_std']:.4f}")

    logger.info(f"\n[Overall (all samples pooled)]")
    for task, metrics in overall.items():
        if "rmse" in metrics:
            logger.info(f" {task} (n={metrics['n_samples']}): RMSE={metrics['rmse']:.4f}, MAE={metrics['mae']:.4f}, R²={metrics['r2']:.4f}")
        elif "accuracy" in metrics:
            logger.info(f" {task} (n={metrics['n_samples']}): Accuracy={metrics['accuracy']:.4f}, Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}, F1={metrics['f1']:.4f}")
        elif "kl_divergence" in metrics:
            logger.info(f" {task} (n={metrics['n_samples']}): KL={metrics['kl_divergence']:.4f}, JS={metrics['js_divergence']:.4f}")

    # Persist the full results.
    results = {
        "fold_results": fold_results,
        "summary_stats": summary_stats,
        "overall": overall,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    logger.success(f"\nSaved test results to {output_path}")
|
||||
|
||||
|
||||
# CLI entry point: dispatch to the registered Typer commands.
if __name__ == "__main__":
    app()
|
||||
|
||||
@ -1,15 +0,0 @@
|
||||
{
|
||||
"d_model": 256,
|
||||
"num_heads": 8,
|
||||
"n_attn_layers": 4,
|
||||
"fusion_strategy": "attention",
|
||||
"head_hidden_dim": 128,
|
||||
"dropout": 0.1,
|
||||
"use_mpnn": false,
|
||||
"mpnn_ensemble_paths": null,
|
||||
"lr": 0.0001,
|
||||
"weight_decay": 1e-05,
|
||||
"batch_size": 64,
|
||||
"epochs": 50,
|
||||
"patience": 10
|
||||
}
|
||||
158
scripts/process_data.py
Normal file
158
scripts/process_data.py
Normal file
@ -0,0 +1,158 @@
|
||||
"""数据处理脚本:将原始数据转换为模型可用的格式"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import typer
|
||||
from loguru import logger
|
||||
|
||||
from lnp_ml.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
|
||||
from lnp_ml.dataset import (
|
||||
process_dataframe,
|
||||
SMILES_COL,
|
||||
COMP_COLS,
|
||||
HELP_COLS,
|
||||
TARGET_REGRESSION,
|
||||
TARGET_CLASSIFICATION_PDI,
|
||||
TARGET_CLASSIFICATION_EE,
|
||||
TARGET_TOXIC,
|
||||
TARGET_BIODIST,
|
||||
get_phys_cols,
|
||||
get_exp_cols,
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
@app.command()
def main(
    input_path: Path = INTERIM_DATA_DIR / "internal.csv",
    output_dir: Path = PROCESSED_DATA_DIR,
    train_ratio: float = 0.56,
    val_ratio: float = 0.14,
    seed: int = 42,
):
    """Process the raw internal data and split it into train/val/test.

    Output files:
        - train.parquet: training split (``train_ratio`` of the data)
        - val.parquet: validation split (``val_ratio`` of the data)
        - test.parquet: test split (the remaining fraction)
        - feature_columns.txt: feature-column configuration

    Args:
        input_path: CSV file produced by the internal preprocessing step.
        output_dir: Directory for the parquet splits and column config.
        train_ratio: Fraction of rows assigned to the training split.
        val_ratio: Fraction of rows assigned to the validation split.
        seed: Shuffle seed for reproducible splits.

    Raises:
        ValueError: if the ratios are out of range or sum to >= 1, which
            would silently produce an empty or invalid test split.
    """
    # Guard against ratios that would leave an empty (or negative-sized)
    # test split — previously this failed silently.
    if not (0.0 < train_ratio < 1.0 and 0.0 <= val_ratio < 1.0 and train_ratio + val_ratio < 1.0):
        raise ValueError(
            f"Invalid split ratios: train_ratio={train_ratio}, val_ratio={val_ratio}; "
            "ratios must be in [0, 1) with train_ratio > 0 and their sum < 1."
        )

    logger.info(f"Loading data from {input_path}")
    df = pd.read_csv(input_path)
    logger.info(f"Loaded {len(df)} samples")

    # Apply the shared feature-engineering pipeline.
    logger.info("Processing dataframe...")
    df = process_dataframe(df)

    # Assemble the list of columns to keep (features + all targets).
    phys_cols = get_phys_cols()
    exp_cols = get_exp_cols()

    keep_cols = (
        [SMILES_COL]
        + COMP_COLS
        + phys_cols
        + HELP_COLS
        + exp_cols
        + TARGET_REGRESSION
        + TARGET_CLASSIFICATION_PDI
        + TARGET_CLASSIFICATION_EE
        + [TARGET_TOXIC]
        + TARGET_BIODIST
    )

    # Keep only the columns actually present in the processed frame.
    keep_cols = [c for c in keep_cols if c in df.columns]
    df = df[keep_cols]

    # Shuffle once and split by position.
    logger.info("Splitting dataset...")
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    n = len(df)
    n_train = int(n * train_ratio)
    n_val = int(n * val_ratio)

    train_df = df.iloc[:n_train]
    val_df = df.iloc[n_train:n_train + n_val]
    test_df = df.iloc[n_train + n_val:]

    logger.info(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

    # Persist the splits.
    output_dir.mkdir(parents=True, exist_ok=True)

    train_path = output_dir / "train.parquet"
    val_path = output_dir / "val.parquet"
    test_path = output_dir / "test.parquet"

    train_df.to_parquet(train_path, index=False)
    val_df.to_parquet(val_path, index=False)
    test_df.to_parquet(test_path, index=False)

    logger.success(f"Saved train to {train_path}")
    logger.success(f"Saved val to {val_path}")
    logger.success(f"Saved test to {test_path}")

    # Write the feature-column configuration for downstream consumers.
    config_path = output_dir / "feature_columns.txt"
    with open(config_path, "w") as f:
        f.write("# Feature columns configuration\n\n")
        f.write(f"# SMILES\n{SMILES_COL}\n\n")
        f.write(f"# comp token [{len(COMP_COLS)}]\n")
        f.write("\n".join(COMP_COLS) + "\n\n")
        f.write(f"# phys token [{len(phys_cols)}]\n")
        f.write("\n".join(phys_cols) + "\n\n")
        f.write(f"# help token [{len(HELP_COLS)}]\n")
        f.write("\n".join(HELP_COLS) + "\n\n")
        f.write(f"# exp token [{len(exp_cols)}]\n")
        f.write("\n".join(exp_cols) + "\n\n")
        f.write("# Targets\n")
        f.write("## Regression\n")
        f.write("\n".join(TARGET_REGRESSION) + "\n")
        f.write("## PDI classification\n")
        f.write("\n".join(TARGET_CLASSIFICATION_PDI) + "\n")
        f.write("## EE classification\n")
        f.write("\n".join(TARGET_CLASSIFICATION_EE) + "\n")
        f.write("## Toxic\n")
        f.write(f"{TARGET_TOXIC}\n")
        f.write("## Biodistribution\n")
        f.write("\n".join(TARGET_BIODIST) + "\n")

    logger.success(f"Saved feature config to {config_path}")

    # Dataset statistics for a quick sanity check.
    logger.info("\n=== Dataset Statistics ===")
    logger.info(f"Total samples: {n}")
    logger.info(f"SMILES unique: {df[SMILES_COL].nunique()}")

    # Missing-value counts per target column.
    logger.info("\nMissing values in targets:")
    for col in TARGET_REGRESSION + [TARGET_TOXIC]:
        if col in df.columns:
            missing = df[col].isna().sum()
            logger.info(f" {col}: {missing} ({100*missing/n:.1f}%)")

    # PDI class distribution (one-hot column sums).
    if all(c in df.columns for c in TARGET_CLASSIFICATION_PDI):
        pdi_sum = df[TARGET_CLASSIFICATION_PDI].sum()
        logger.info(f"\nPDI distribution:")
        for col, count in pdi_sum.items():
            logger.info(f" {col}: {int(count)}")

    # EE class distribution (one-hot column sums).
    if all(c in df.columns for c in TARGET_CLASSIFICATION_EE):
        ee_sum = df[TARGET_CLASSIFICATION_EE].sum()
        logger.info(f"\nEE distribution:")
        for col, count in ee_sum.items():
            logger.info(f" {col}: {int(count)}")


# CLI entry point: dispatch to the registered Typer command.
if __name__ == "__main__":
    app()
|
||||
|
||||
302
scripts/process_data_cv.py
Normal file
302
scripts/process_data_cv.py
Normal file
@ -0,0 +1,302 @@
|
||||
"""内部数据 Cross-Validation 划分脚本:支持随机划分或基于 Amine 的分组划分"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import typer
|
||||
from loguru import logger
|
||||
|
||||
from lnp_ml.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
|
||||
from lnp_ml.dataset import (
|
||||
process_dataframe,
|
||||
SMILES_COL,
|
||||
COMP_COLS,
|
||||
HELP_COLS,
|
||||
TARGET_REGRESSION,
|
||||
TARGET_CLASSIFICATION_PDI,
|
||||
TARGET_CLASSIFICATION_EE,
|
||||
TARGET_TOXIC,
|
||||
TARGET_BIODIST,
|
||||
get_phys_cols,
|
||||
get_exp_cols,
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def random_cv_split(
    df: pd.DataFrame,
    n_folds: int = 5,
    seed: int = 42,
) -> List[dict]:
    """Split *df* into ``n_folds`` rotating train/val/test partitions.

    The rows are shuffled once, cut into ``n_folds`` index containers of
    near-equal size, and for fold ``i`` the validation set is container
    ``i``, the test set is container ``(i + 1) % n_folds`` and the training
    set is the union of every remaining container.

    Args:
        df: Input DataFrame.
        n_folds: Number of folds.
        seed: Random seed for the shuffle.

    Returns:
        List of dicts, each holding "train", "val" and "test" DataFrames.
    """
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_samples = len(shuffled)

    logger.info(f"Total {n_samples} samples for random CV split")

    # Partition the shuffled row indices into n_folds containers.
    containers = np.array_split(np.arange(n_samples), n_folds)

    # Report container sizes.
    for i, container in enumerate(containers):
        logger.info(f" Container {i}: {len(container)} samples")

    # Assemble the rotating splits.
    fold_splits = []
    for i in range(n_folds):
        held_out = {i, (i + 1) % n_folds}
        val_df = shuffled.iloc[containers[i]].reset_index(drop=True)
        test_df = shuffled.iloc[containers[(i + 1) % n_folds]].reset_index(drop=True)
        train_df = shuffled.iloc[
            np.concatenate([containers[j] for j in range(n_folds) if j not in held_out])
        ].reset_index(drop=True)

        fold_splits.append({"train": train_df, "val": val_df, "test": test_df})

        logger.info(
            f"Fold {i}: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}"
        )

    return fold_splits
|
||||
|
||||
|
||||
def amine_based_cv_split(
    df: pd.DataFrame,
    n_folds: int = 5,
    seed: int = 42,
    amine_col: str = "Amine",
) -> List[dict]:
    """
    Cross-validation split grouped by the Amine column.

    Steps:
        1. Group rows by amine_col.
        2. Shuffle the group order.
        3. Assign the groups round-robin to n_folds containers.
        4. For each fold i:
            - validation = container[i]
            - test = container[(i+1) % n_folds]
            - train = all remaining containers

    Args:
        df: Input DataFrame.
        n_folds: Number of folds. Must be >= 3 so that at least one
            container is left over for the training set.
        seed: Random seed for shuffling the amine groups.
        amine_col: Name of the column used for grouping.

    Returns:
        List of dicts; each dict holds "train", "val" and "test" DataFrames.

    Raises:
        ValueError: If n_folds < 3 (no container would remain for train).
    """
    # With fewer than 3 folds every container is claimed by val or test,
    # which previously produced a silently EMPTY training set. Fail fast.
    if n_folds < 3:
        raise ValueError(f"n_folds must be >= 3, got {n_folds}")

    # Collect the unique amines and shuffle them deterministically
    unique_amines = df[amine_col].unique()
    rng = np.random.RandomState(seed)
    rng.shuffle(unique_amines)

    logger.info(f"Found {len(unique_amines)} unique amines")

    # Round-robin assignment of amine groups into n_folds containers
    containers = [[] for _ in range(n_folds)]
    for i, amine in enumerate(unique_amines):
        containers[i % n_folds].append(amine)

    # Log the size of each container (in amines and in samples)
    for i, container in enumerate(containers):
        container_samples = df[df[amine_col].isin(container)]
        logger.info(f"  Container {i}: {len(container)} amines, {len(container_samples)} samples")

    # Build the per-fold splits
    fold_splits = []
    for i in range(n_folds):
        val_amines = set(containers[i])
        test_amines = set(containers[(i + 1) % n_folds])
        train_amines = set()
        for j in range(n_folds):
            if j != i and j != (i + 1) % n_folds:
                train_amines.update(containers[j])

        train_df = df[df[amine_col].isin(train_amines)].reset_index(drop=True)
        val_df = df[df[amine_col].isin(val_amines)].reset_index(drop=True)
        test_df = df[df[amine_col].isin(test_amines)].reset_index(drop=True)

        fold_splits.append({
            "train": train_df,
            "val": val_df,
            "test": test_df,
        })

        logger.info(
            f"Fold {i}: train={len(train_df)} ({len(train_amines)} amines), "
            f"val={len(val_df)} ({len(val_amines)} amines), "
            f"test={len(test_df)} ({len(test_amines)} amines)"
        )

    return fold_splits
|
||||
|
||||
|
||||
@app.command()
def main(
    input_path: Path = INTERIM_DATA_DIR / "internal.csv",
    output_dir: Path = PROCESSED_DATA_DIR / "cv",
    n_folds: int = 5,
    seed: int = 42,
    amine_col: str = "Amine",
    scaffold_split: bool = typer.Option(
        False,
        "--scaffold-split",
        help="使用基于 Amine 的 scaffold splitting(默认:随机 shuffle)",
    ),
):
    """
    Cross-validation data splitting.

    Two splitting strategies are supported:
    - Random split (default): shuffle all samples directly.
    - Scaffold splitting (--scaffold-split): group by Amine so that all
      samples sharing an Amine land in the same partition.

    The split ratio is approximately train:val:test ≈ 3:1:1.

    Output layout:
    - processed/cv/fold_0/train.parquet
    - processed/cv/fold_0/val.parquet
    - processed/cv/fold_0/test.parquet
    - processed/cv/fold_1/...
    - processed/cv/feature_columns.txt
    """
    logger.info(f"Loading data from {input_path}")
    df = pd.read_csv(input_path)
    logger.info(f"Loaded {len(df)} samples")

    # Process the dataframe (column alignment, one-hot generation, etc.)
    logger.info("Processing dataframe...")
    df = process_dataframe(df)

    # When scaffold splitting is requested, make sure the amine column exists
    if scaffold_split:
        # Reload the raw data to recover the Amine column
        # (process_dataframe may not preserve it).
        original_df = pd.read_csv(input_path)
        if amine_col not in original_df.columns:
            logger.error(f"Column '{amine_col}' not found in data. Available columns: {list(original_df.columns)}")
            raise typer.Exit(1)
        if amine_col not in df.columns:
            # NOTE(review): assumes process_dataframe preserves row order and
            # row count relative to the raw CSV — verify before relying on this.
            df[amine_col] = original_df[amine_col].values

    # Columns to keep in the saved splits
    phys_cols = get_phys_cols()
    exp_cols = get_exp_cols()

    keep_cols = (
        [SMILES_COL]
        + COMP_COLS
        + phys_cols
        + HELP_COLS
        + exp_cols
        + TARGET_REGRESSION
        + TARGET_CLASSIFICATION_PDI
        + TARGET_CLASSIFICATION_EE
        + [TARGET_TOXIC]
        + TARGET_BIODIST
    )

    # Keep only the columns that actually exist in the processed dataframe
    keep_cols = [c for c in keep_cols if c in df.columns]

    # Perform the CV split
    if scaffold_split:
        logger.info(f"\nPerforming {n_folds}-fold amine-based scaffold CV split (seed={seed})...")
        fold_splits = amine_based_cv_split(df, n_folds=n_folds, seed=seed, amine_col=amine_col)
        split_method = f"Amine-based scaffold (column: {amine_col})"
    else:
        logger.info(f"\nPerforming {n_folds}-fold random CV split (seed={seed})...")
        fold_splits = random_cv_split(df, n_folds=n_folds, seed=seed)
        split_method = "Random shuffle"

    # Save each fold under its own directory
    output_dir.mkdir(parents=True, exist_ok=True)

    for i, split in enumerate(fold_splits):
        fold_dir = output_dir / f"fold_{i}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        # Keep only the required columns
        train_df = split["train"][keep_cols].reset_index(drop=True)
        val_df = split["val"][keep_cols].reset_index(drop=True)
        test_df = split["test"][keep_cols].reset_index(drop=True)

        # Persist the fold
        train_df.to_parquet(fold_dir / "train.parquet", index=False)
        val_df.to_parquet(fold_dir / "val.parquet", index=False)
        test_df.to_parquet(fold_dir / "test.parquet", index=False)

        logger.success(f"Saved fold {i} to {fold_dir}")

    # Save the feature-column configuration alongside the folds
    config_path = output_dir / "feature_columns.txt"
    with open(config_path, "w") as f:
        f.write("# Feature columns configuration\n\n")
        f.write(f"# SMILES\n{SMILES_COL}\n\n")
        f.write(f"# comp token [{len(COMP_COLS)}]\n")
        f.write("\n".join(COMP_COLS) + "\n\n")
        f.write(f"# phys token [{len(phys_cols)}]\n")
        f.write("\n".join(phys_cols) + "\n\n")
        f.write(f"# help token [{len(HELP_COLS)}]\n")
        f.write("\n".join(HELP_COLS) + "\n\n")
        f.write(f"# exp token [{len(exp_cols)}]\n")
        f.write("\n".join(exp_cols) + "\n\n")
        f.write("# Targets\n")
        f.write("## Regression\n")
        f.write("\n".join(TARGET_REGRESSION) + "\n")
        f.write("## PDI classification\n")
        f.write("\n".join(TARGET_CLASSIFICATION_PDI) + "\n")
        f.write("## EE classification\n")
        f.write("\n".join(TARGET_CLASSIFICATION_EE) + "\n")
        f.write("## Toxic\n")
        f.write(f"{TARGET_TOXIC}\n")
        f.write("## Biodistribution\n")
        f.write("\n".join(TARGET_BIODIST) + "\n")

    logger.success(f"Saved feature config to {config_path}")

    # Print a completion summary
    logger.info("\n" + "=" * 60)
    logger.info("CV DATA PROCESSING COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Output directory: {output_dir}")
    logger.info(f"Number of folds: {n_folds}")
    logger.info(f"Splitting method: {split_method}")
    logger.info(f"Random seed: {seed}")


# CLI entry point
if __name__ == "__main__":
    app()
|
||||
|
||||
153
scripts/process_data_final.py
Normal file
153
scripts/process_data_final.py
Normal file
@ -0,0 +1,153 @@
|
||||
"""最终训练数据处理脚本:train:val = 9:1,无测试集"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import typer
|
||||
from loguru import logger
|
||||
|
||||
from lnp_ml.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
|
||||
from lnp_ml.dataset import (
|
||||
process_dataframe,
|
||||
SMILES_COL,
|
||||
COMP_COLS,
|
||||
HELP_COLS,
|
||||
TARGET_REGRESSION,
|
||||
TARGET_CLASSIFICATION_PDI,
|
||||
TARGET_CLASSIFICATION_EE,
|
||||
TARGET_TOXIC,
|
||||
TARGET_BIODIST,
|
||||
get_phys_cols,
|
||||
get_exp_cols,
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
@app.command()
def main(
    input_path: Path = INTERIM_DATA_DIR / "internal.csv",
    output_dir: Path = PROCESSED_DATA_DIR / "final",
    train_ratio: float = 0.9,
    seed: int = 42,
):
    """
    Process the raw data and split it into train/validation sets (no test set).

    Intended for the final training run, which uses all available data.

    Output files:
    - final/train.parquet: training set (90%)
    - final/val.parquet: validation set (10%)
    - final/feature_columns.txt: feature-column configuration
    """
    logger.info(f"Loading data from {input_path}")
    df = pd.read_csv(input_path)
    logger.info(f"Loaded {len(df)} samples")

    # Process the dataframe (column alignment, one-hot generation, etc.)
    logger.info("Processing dataframe...")
    df = process_dataframe(df)

    # Columns to keep in the saved splits
    phys_cols = get_phys_cols()
    exp_cols = get_exp_cols()

    keep_cols = (
        [SMILES_COL]
        + COMP_COLS
        + phys_cols
        + HELP_COLS
        + exp_cols
        + TARGET_REGRESSION
        + TARGET_CLASSIFICATION_PDI
        + TARGET_CLASSIFICATION_EE
        + [TARGET_TOXIC]
        + TARGET_BIODIST
    )

    # Keep only the columns that actually exist in the processed dataframe
    keep_cols = [c for c in keep_cols if c in df.columns]
    df = df[keep_cols]

    # Shuffle deterministically, then split by position
    logger.info(f"Splitting dataset (train:val = {train_ratio}:{1-train_ratio:.1f})...")
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    n = len(df)
    n_train = int(n * train_ratio)

    train_df = df.iloc[:n_train]
    val_df = df.iloc[n_train:]

    logger.info(f"Train: {len(train_df)}, Val: {len(val_df)}")

    # Save the splits
    output_dir.mkdir(parents=True, exist_ok=True)

    train_path = output_dir / "train.parquet"
    val_path = output_dir / "val.parquet"

    train_df.to_parquet(train_path, index=False)
    val_df.to_parquet(val_path, index=False)

    logger.success(f"Saved train to {train_path}")
    logger.success(f"Saved val to {val_path}")

    # Save the feature-column configuration
    config_path = output_dir / "feature_columns.txt"
    with open(config_path, "w") as f:
        f.write("# Feature columns configuration (final training)\n\n")
        f.write(f"# SMILES\n{SMILES_COL}\n\n")
        f.write(f"# comp token [{len(COMP_COLS)}]\n")
        f.write("\n".join(COMP_COLS) + "\n\n")
        f.write(f"# phys token [{len(phys_cols)}]\n")
        f.write("\n".join(phys_cols) + "\n\n")
        f.write(f"# help token [{len(HELP_COLS)}]\n")
        f.write("\n".join(HELP_COLS) + "\n\n")
        f.write(f"# exp token [{len(exp_cols)}]\n")
        f.write("\n".join(exp_cols) + "\n\n")
        f.write("# Targets\n")
        f.write("## Regression\n")
        f.write("\n".join(TARGET_REGRESSION) + "\n")
        f.write("## PDI classification\n")
        f.write("\n".join(TARGET_CLASSIFICATION_PDI) + "\n")
        f.write("## EE classification\n")
        f.write("\n".join(TARGET_CLASSIFICATION_EE) + "\n")
        f.write("## Toxic\n")
        f.write(f"{TARGET_TOXIC}\n")
        f.write("## Biodistribution\n")
        f.write("\n".join(TARGET_BIODIST) + "\n")

    logger.success(f"Saved feature config to {config_path}")

    # Print dataset statistics
    logger.info("\n=== Dataset Statistics ===")
    logger.info(f"Total samples: {n}")
    logger.info(f"SMILES unique: {df[SMILES_COL].nunique()}")

    # Missing-value statistics for the targets
    logger.info("\nMissing values in targets:")
    for col in TARGET_REGRESSION + [TARGET_TOXIC]:
        if col in df.columns:
            missing = df[col].isna().sum()
            logger.info(f"  {col}: {missing} ({100*missing/n:.1f}%)")

    # PDI class distribution (counts per one-hot column)
    if all(c in df.columns for c in TARGET_CLASSIFICATION_PDI):
        pdi_sum = df[TARGET_CLASSIFICATION_PDI].sum()
        logger.info(f"\nPDI distribution:")
        for col, count in pdi_sum.items():
            logger.info(f"  {col}: {int(count)}")

    # EE class distribution (counts per one-hot column)
    if all(c in df.columns for c in TARGET_CLASSIFICATION_EE):
        ee_sum = df[TARGET_CLASSIFICATION_EE].sum()
        logger.info(f"\nEE distribution:")
        for col, count in ee_sum.items():
            logger.info(f"  {col}: {int(count)}")


# CLI entry point
if __name__ == "__main__":
    app()
|
||||
|
||||
@ -18,7 +18,7 @@ app = typer.Typer()
|
||||
def main(
|
||||
input_path: Path = EXTERNAL_DATA_DIR / "all_data_LiON.csv",
|
||||
output_dir: Path = PROCESSED_DATA_DIR,
|
||||
train_ratio: float = 0.85,
|
||||
train_ratio: float = 0.7,
|
||||
seed: int = 42,
|
||||
):
|
||||
"""
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""处理 benchmark 数据脚本:将 baseline 论文公开的 CV splits 转换为模型所需的 parquet 格式"""
|
||||
"""处理 cross-validation 数据脚本:将 CV splits 转换为模型所需的 parquet 格式"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
@ -151,18 +151,18 @@ def get_feature_columns() -> List[str]:
|
||||
@app.command()
|
||||
def main(
|
||||
data_dir: Path = EXTERNAL_DATA_DIR / "all_amine_split_for_LiON",
|
||||
output_dir: Path = PROCESSED_DATA_DIR / "benchmark",
|
||||
output_dir: Path = PROCESSED_DATA_DIR / "pretrain_cv",
|
||||
n_folds: int = 5,
|
||||
):
|
||||
"""
|
||||
处理 baseline 论文公开的 CV 划分数据,生成 benchmark 所需的 parquet 文件。
|
||||
处理 cross-validation 数据,生成模型所需的 parquet 文件。
|
||||
|
||||
输出结构:
|
||||
- processed/benchmark/fold_0/train.parquet
|
||||
- processed/benchmark/fold_0/valid.parquet
|
||||
- processed/benchmark/fold_0/test.parquet
|
||||
- processed/benchmark/fold_1/...
|
||||
- processed/benchmark/feature_columns.txt
|
||||
- processed/pretrain_cv/fold_0/train.parquet
|
||||
- processed/pretrain_cv/fold_0/valid.parquet
|
||||
- processed/pretrain_cv/fold_0/test.parquet
|
||||
- processed/pretrain_cv/fold_1/...
|
||||
- processed/pretrain_cv/feature_columns.txt
|
||||
"""
|
||||
logger.info(f"Processing CV data from {data_dir}")
|
||||
|
||||
@ -223,7 +223,7 @@ def main(
|
||||
logger.success(f"Saved feature columns to {cols_path}")
|
||||
|
||||
logger.info("\n" + "=" * 60)
|
||||
logger.info("BENCHMARK DATA PROCESSING COMPLETE")
|
||||
logger.info("CV DATA PROCESSING COMPLETE")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f"Output directory: {output_dir}")
|
||||
logger.info(f"Number of folds: {len(cv_dirs)}")
|
||||
Loading…
x
Reference in New Issue
Block a user