# lnp_ml/Makefile
#################################################################################
# GLOBALS #
#################################################################################
PROJECT_NAME = lnp-ml
PYTHON_VERSION = 3.8
PYTHON_INTERPRETER = python
# --- CLI flag variables ---
MPNN_FLAG = $(if $(USE_MPNN),--use-mpnn,)
FREEZE_FLAG = $(if $(FREEZE_BACKBONE),--freeze-backbone,)
DEVICE_FLAG = $(if $(DEVICE),--device $(DEVICE),)
SCAFFOLD_SPLIT_FLAG = $(if $(filter 1,$(SCAFFOLD_SPLIT)),--scaffold-split,)
SEED_FLAG = $(if $(SEED),--seed $(SEED),)
N_TRIALS_FLAG = $(if $(N_TRIALS),--n-trials $(N_TRIALS),)
EPOCHS_PER_TRIAL_FLAG = $(if $(EPOCHS_PER_TRIAL),--epochs-per-trial $(EPOCHS_PER_TRIAL),)
MIN_STRATUM_FLAG = $(if $(MIN_STRATUM_COUNT),--min-stratum-count $(MIN_STRATUM_COUNT),)
OUTPUT_DIR_FLAG = $(if $(OUTPUT_DIR),--output-dir $(OUTPUT_DIR),)
USE_SWA_FLAG = $(if $(USE_SWA),--use-swa,)
INIT_PRETRAIN_FLAG = $(if $(NO_PRETRAIN),,--init-from-pretrain $(or $(INIT_PRETRAIN),models/pretrain_delivery.pt))
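# Illustrative expansion of the flag variables above: with USE_MPNN=1 DEVICE=cuda on the
# command line, a recipe receives "--use-mpnn --device cuda"; with NO_PRETRAIN=1 the
# --init-from-pretrain flag is dropped entirely. For example:
#   make train USE_MPNN=1 DEVICE=cuda
#   make finetune FREEZE_BACKBONE=1 DEVICE=cuda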
#################################################################################
# ENVIRONMENT & CODE QUALITY #
#################################################################################
## Install Python dependencies
.PHONY: requirements
requirements:
pixi install
## Set up Python interpreter environment
.PHONY: create_environment
create_environment:
@echo ">>> Pixi environment will be created when running 'make requirements'"
@echo ">>> Activate with:\npixi shell"
## Delete all compiled Python files
.PHONY: clean
clean:
find . -type f -name "*.py[co]" -delete
find . -type d -name "__pycache__" -delete
## Lint using ruff (use `make format` to do formatting)
.PHONY: lint
lint:
ruff format --check
ruff check
## Format source code with ruff
.PHONY: format
format:
ruff check --fix
ruff format
#################################################################################
# DATA PROCESSING #
#################################################################################
## Preprocess internal data (raw -> interim)
.PHONY: preprocess
preprocess: requirements
$(PYTHON_INTERPRETER) scripts/preprocess_internal.py
## Process dataset (interim -> processed)
.PHONY: data
data: requirements
$(PYTHON_INTERPRETER) scripts/process_data.py
## Process dataset for final training (interim -> processed/final, train:val=9:1, no test)
.PHONY: data_final
data_final: requirements
$(PYTHON_INTERPRETER) scripts/process_data_final.py
## Process external data for pretrain (external -> processed)
.PHONY: data_pretrain
data_pretrain: requirements
$(PYTHON_INTERPRETER) scripts/process_external.py
## Process CV data for cross-validation pretrain (external/all_amine_split_for_LiON -> processed/cv)
.PHONY: data_pretrain_cv
data_pretrain_cv: requirements
$(PYTHON_INTERPRETER) scripts/process_external_cv.py
## Process internal data with CV splitting (interim -> processed/cv)
## Use SCAFFOLD_SPLIT=1 to enable amine-based scaffold splitting (default: random shuffle)
.PHONY: data_cv
data_cv: requirements
$(PYTHON_INTERPRETER) scripts/process_data_cv.py $(SCAFFOLD_SPLIT_FLAG)
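# A typical internal-data preparation sequence (illustrative; paths follow the comments above):
#   make preprocess                  # raw -> interim
#   make data                        # interim -> processed
#   make data_cv SCAFFOLD_SPLIT=1    # interim -> processed/cv (amine-based scaffold split)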
#################################################################################
# TRAINING #
#################################################################################
## Pretrain on external data (delivery only)
.PHONY: pretrain
pretrain: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain main $(MPNN_FLAG) $(DEVICE_FLAG)
## Pretrain with cross-validation (5-fold)
.PHONY: pretrain_cv
pretrain_cv: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain_cv main $(MPNN_FLAG) $(DEVICE_FLAG)
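# Pretraining is expected to consume the outputs of the matching data targets (a sketch):
#   make data_pretrain    && make pretrain    USE_MPNN=1 DEVICE=cuda
#   make data_pretrain_cv && make pretrain_cv DEVICE=cuda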
## Train model (multi-task, from scratch)
.PHONY: train
train: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train $(MPNN_FLAG) $(DEVICE_FLAG)
## Finetune from pretrained checkpoint (use FREEZE_BACKBONE=1 to freeze backbone)
.PHONY: finetune
finetune: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train --init-from-pretrain models/pretrain_delivery.pt $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
## Final training using all data (train:val=9:1, no test set), with pretrained weights
.PHONY: train_final
train_final: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train \
--train-path data/processed/final/train.parquet \
--val-path data/processed/final/val.parquet \
--output-dir models/final \
--init-from-pretrain models/pretrain_delivery.pt \
$(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
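# train_final reads data/processed/final/{train,val}.parquet, which is produced by
# "make data_final", e.g. (illustrative):
#   make data_final && make train_final DEVICE=cuda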
## Train with cross-validation on internal data only (5-fold, amine-based split)
.PHONY: train_cv
train_cv: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train_cv main $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
## Finetune with cross-validation on internal data (5-fold) with pretrained weights
.PHONY: finetune_cv
finetune_cv: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train_cv main --init-from-pretrain models/pretrain_delivery.pt $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
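# Cross-validation fine-tuning workflow (illustrative):
#   make data_cv SCAFFOLD_SPLIT=1   # build the CV splits
#   make finetune_cv DEVICE=cuda    # 5-fold fine-tuning from the pretrained checkpoint
#   make test_cv DEVICE=cuda        # evaluate the CV models on their test sets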
#################################################################################
# EVALUATION #
#################################################################################
## Evaluate pretrain model (delivery metrics)
.PHONY: test_pretrain
test_pretrain: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain test $(MPNN_FLAG) $(DEVICE_FLAG)
## Evaluate CV pretrain models on test sets (auto-detects MPNN from checkpoint)
.PHONY: test_pretrain_cv
test_pretrain_cv: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain_cv test $(DEVICE_FLAG)
## Evaluate CV finetuned models on test sets (auto-detects MPNN from checkpoint)
.PHONY: test_cv
test_cv: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train_cv test $(DEVICE_FLAG)
## Test model on test set (with detailed metrics, auto-detects MPNN from checkpoint)
.PHONY: test
test: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.predict test $(DEVICE_FLAG)
## Run predictions
.PHONY: predict
predict: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.predict $(DEVICE_FLAG)
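# Most evaluation targets auto-detect MPNN from the checkpoint, so usually only DEVICE
# needs to be set (test_pretrain still takes USE_MPNN=1), e.g.: make test DEVICE=cuda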
#################################################################################
# HYPERPARAMETER TUNING #
#################################################################################
# Common parameters:
#   SEED               random seed (default: 42)
#   N_TRIALS           number of Optuna trials (default: 20)
#   EPOCHS_PER_TRIAL   max epochs per trial (default: 30)
#   MIN_STRATUM_COUNT  minimum sample count for composite stratification labels (default: 5)
#   OUTPUT_DIR         output directory (default depends on the command)
#   INIT_PRETRAIN      path to pretrained weights (default: models/pretrain_delivery.pt)
#   NO_PRETRAIN=1      disable pretrained weights
## Train with hyperparameter tuning
.PHONY: tune
tune: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train --tune $(MPNN_FLAG) $(DEVICE_FLAG)
## Nested CV with Optuna: outer 5-fold (test) + inner 3-fold (tune)
## For model evaluation: the outer 5-fold gives an unbiased performance estimate; the inner 3-fold does the hyperparameter search
## Usage example: make nested_cv_tune DEVICE=cuda N_TRIALS=30
.PHONY: nested_cv_tune
nested_cv_tune: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.nested_cv_optuna \
$(DEVICE_FLAG) $(MPNN_FLAG) $(SEED_FLAG) $(INIT_PRETRAIN_FLAG) \
$(N_TRIALS_FLAG) $(EPOCHS_PER_TRIAL_FLAG) $(MIN_STRATUM_FLAG) $(OUTPUT_DIR_FLAG)
## Final training with Optuna: 3-fold CV tune + full data train
## For final model training: tune with 3-fold CV, then train on the full dataset (no early stopping)
## Usage example: make final_optuna DEVICE=cuda N_TRIALS=30 USE_SWA=1
.PHONY: final_optuna
final_optuna: requirements
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.final_train_optuna_cv \
$(DEVICE_FLAG) $(MPNN_FLAG) $(SEED_FLAG) $(INIT_PRETRAIN_FLAG) \
$(N_TRIALS_FLAG) $(EPOCHS_PER_TRIAL_FLAG) $(MIN_STRATUM_FLAG) $(OUTPUT_DIR_FLAG) $(USE_SWA_FLAG)
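# Further illustrative combinations of the parameters listed above (the OUTPUT_DIR value
# here is a hypothetical path):
#   make nested_cv_tune DEVICE=cuda N_TRIALS=30 SEED=123 NO_PRETRAIN=1
#   make final_optuna DEVICE=cuda USE_SWA=1 OUTPUT_DIR=models/final_optuna_swa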
#################################################################################
# SERVING & DEPLOYMENT #
#################################################################################
## Formulation optimization: find optimal LNP formulation for target organ
## Usage: make optimize SMILES="CC(C)..." ORGAN=liver
.PHONY: optimize
optimize: requirements
$(PYTHON_INTERPRETER) -m app.optimize --smiles "$(SMILES)" --organ $(ORGAN) $(DEVICE_FLAG)
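# Illustrative call ("CCO" is only a placeholder SMILES; valid ORGAN values depend on
# app.optimize):
#   make optimize SMILES="CCO" ORGAN=liver DEVICE=cuda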
## Start FastAPI backend server (port 8000)
.PHONY: api
api: requirements
uvicorn app.api:app --host 0.0.0.0 --port 8000 --reload
## Start Streamlit frontend app (port 8501)
.PHONY: webapp
webapp: requirements
streamlit run app/app.py --server.port 8501
## Start both API and webapp (run in separate terminals)
.PHONY: serve
serve:
@echo "请在两个终端分别运行:"
@echo " 终端 1: make api"
@echo " 终端 2: make webapp"
@echo ""
@echo "然后访问: http://localhost:8501"
#################################################################################
# DOCKER #
#################################################################################
## Build Docker images
.PHONY: docker-build
docker-build:
docker compose build
## Start all services with Docker Compose
.PHONY: docker-up
docker-up:
docker compose up -d
## Stop all Docker services
.PHONY: docker-down
docker-down:
docker compose down
## View Docker logs
.PHONY: docker-logs
docker-logs:
docker compose logs -f
## Build and start all services
.PHONY: docker-serve
docker-serve: docker-build docker-up
@echo ""
@echo "🚀 服务已启动!"
@echo " - API: http://localhost:8000"
@echo " - Web 应用: http://localhost:8501"
@echo ""
@echo "查看日志: make docker-logs"
@echo "停止服务: make docker-down"
## Clean Docker resources (images, volumes, etc.)
.PHONY: docker-clean
docker-clean:
docker compose down -v --rmi local
docker system prune -f
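# Typical container workflow (illustrative):
#   make docker-serve   # build images and start the API (8000) + web app (8501)
#   make docker-logs    # follow logs
#   make docker-down    # stop services; use docker-clean to also remove local images/volumes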
#################################################################################
# Self Documenting Commands #
#################################################################################
.DEFAULT_GOAL := help
define PRINT_HELP_PYSCRIPT
import re, sys; \
lines = '\n'.join([line for line in sys.stdin]); \
matches = re.findall(r'\n## (.*)\n[\s\S]+?\n([a-zA-Z_-]+):', lines); \
print('Available rules:\n'); \
print('\n'.join(['{:25}{}'.format(*reversed(match)) for match in matches]))
endef
export PRINT_HELP_PYSCRIPT
.PHONY: help
help:
@$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)
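# The help target scans this Makefile for "## description" lines that precede a target
# and prints them as "<target padded to 25 chars><description>"; e.g. the line
# "## Run predictions" above "predict:" would appear roughly as:
#   predict                  Run predictions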