# lnp_ml/Makefile
# Last modified: 2026-03-03 13:45:46 +08:00
#################################################################################
# GLOBALS                                                                       #
#################################################################################

# Project metadata. PYTHON_INTERPRETER can be overridden on the command line,
# e.g. `make preprocess PYTHON_INTERPRETER=python3`.
# Simply-expanded (:=) so values are fixed once at parse time instead of being
# re-expanded on every reference (command-line variables such as DEVICE are
# already set before the makefile is parsed, so the $(if ...) results are
# identical).
PROJECT_NAME := lnp-ml
PYTHON_VERSION := 3.8
PYTHON_INTERPRETER := python

# --- CLI flag variables ---
# Each *_FLAG expands to a CLI option only when the corresponding variable is
# set (e.g. `make train DEVICE=cuda SEED=7`), otherwise it expands to nothing.
# MPNN and pretrain init are opt-out: pass NO_MPNN=1 / NO_PRETRAIN=1 to disable.
MPNN_FLAG := $(if $(NO_MPNN),,--use-mpnn)
DEVICE_FLAG := $(if $(DEVICE),--device $(DEVICE),)
SEED_FLAG := $(if $(SEED),--seed $(SEED),)
N_TRIALS_FLAG := $(if $(N_TRIALS),--n-trials $(N_TRIALS),)
EPOCHS_PER_TRIAL_FLAG := $(if $(EPOCHS_PER_TRIAL),--epochs-per-trial $(EPOCHS_PER_TRIAL),)
MIN_STRATUM_FLAG := $(if $(MIN_STRATUM_COUNT),--min-stratum-count $(MIN_STRATUM_COUNT),)
OUTPUT_DIR_FLAG := $(if $(OUTPUT_DIR),--output-dir $(OUTPUT_DIR),)
USE_SWA_FLAG := $(if $(USE_SWA),--use-swa,)
PARALLEL_FLAG := $(if $(PARALLEL),--parallel,)
# Defaults to models/pretrain_delivery.pt unless INIT_PRETRAIN overrides it;
# NO_PRETRAIN=1 suppresses the flag entirely.
INIT_PRETRAIN_FLAG := $(if $(NO_PRETRAIN),,--init-from-pretrain $(or $(INIT_PRETRAIN),models/pretrain_delivery.pt))
#################################################################################
# ENVIRONMENT & CODE QUALITY                                                    #
#################################################################################
## Install Python dependencies
# Delegates dependency resolution and installation to pixi; most other targets
# depend on this so the environment is always in sync before running.
.PHONY: requirements
requirements:
	pixi install
## Set up Python interpreter environment
# Informational only: pixi creates the environment lazily via `make requirements`.
# The original used a single echo with "\n", whose rendering depends on which
# /bin/sh is in use (dash interprets \n, bash's builtin echo does not);
# two plain echo lines behave identically everywhere.
.PHONY: create_environment
create_environment:
	@echo ">>> Pixi environment will be created when running 'make requirements'"
	@echo ">>> Activate with:"
	@echo "pixi shell"
## Delete all compiled Python files
# First remove .pyc/.pyo files, then the __pycache__ directories themselves.
# `find -type d -delete` only removes *empty* directories and errors otherwise;
# `-prune -exec rm -rf {} +` removes them unconditionally and stops find from
# descending into (and then complaining about) directories it just deleted.
.PHONY: clean
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
## Lint using ruff (use `make format` to do formatting)
# Check-only: `ruff format --check` fails if formatting would change files,
# `ruff check` reports lint violations. Neither modifies the source tree.
.PHONY: lint
lint:
	ruff format --check
	ruff check
## Format source code with ruff
# Auto-fixes lint violations first, then applies the formatter; both commands
# rewrite files in place.
.PHONY: format
format:
	ruff check --fix
	ruff format
#################################################################################
# DATA PROCESSING                                                               #
#################################################################################
## Preprocess internal data (raw -> interim)
.PHONY: preprocess
preprocess: requirements
	$(PYTHON_INTERPRETER) scripts/preprocess_internal.py
## Process external data for pretrain (external -> processed)
.PHONY: data_pretrain
data_pretrain: requirements
	$(PYTHON_INTERPRETER) scripts/process_external.py
## Process baseline CV data for benchmark (external/all_amine_split_for_LiON -> processed/benchmark)
.PHONY: data_benchmark
data_benchmark: requirements
	$(PYTHON_INTERPRETER) scripts/process_benchmark_data.py
#################################################################################
# BENCHMARKING                                                                  #
#################################################################################
## Benchmark on baseline CV data: 5-fold train + test (delivery only)
# Two sequential phases: `main` trains (honouring MPNN/DEVICE flags), then
# `test` evaluates. DEVICE_FLAG is passed to both; MPNN_FLAG only to training.
.PHONY: benchmark
benchmark: requirements
	$(PYTHON_INTERPRETER) -m lnp_ml.modeling.benchmark main $(MPNN_FLAG) $(DEVICE_FLAG)
	$(PYTHON_INTERPRETER) -m lnp_ml.modeling.benchmark test $(DEVICE_FLAG)
## Evaluate baseline method on public test splits (test.csv vs preds.csv in cv_*)
.PHONY: baseline
baseline: requirements
	$(PYTHON_INTERPRETER) scripts/evaluate_external.py
#################################################################################
# TRAINING (Nested CV + Optuna)                                                 #
#################################################################################
# Common parameters:
#   SEED               random seed (default: 42)
#   N_TRIALS           number of Optuna trials (default: 20)
#   EPOCHS_PER_TRIAL   max epochs per trial (default: 30)
#   MIN_STRATUM_COUNT  minimum sample count for the composite stratification label (default: 5)
#   OUTPUT_DIR         output directory (default differs per command)
#   INIT_PRETRAIN      path to pretrained weights (default: models/pretrain_delivery.pt)
#   NO_PRETRAIN=1      disable pretrained weights
#   USE_SWA=1          enable SWA (final-train stage)
#   PARALLEL=1         run outer folds in parallel (nested-CV stage; needs enough GPU memory)
#
# Usage examples:
#   make pretrain
#   make train DEVICE=cuda N_TRIALS=30 USE_SWA=1 INIT_PRETRAIN=models/pretrain_delivery.pt
#   make train DEVICE=cuda PARALLEL=1
## Pretrain on external data (delivery only)
# Produces the pretrained weights later consumed via INIT_PRETRAIN_FLAG.
.PHONY: pretrain
pretrain: requirements
	$(PYTHON_INTERPRETER) -m lnp_ml.modeling.pretrain main $(MPNN_FLAG) $(DEVICE_FLAG)
## Train: nested CV evaluation + final model training
## Step 1: outer 5-fold gives an unbiased performance estimate; inner 3-fold does the hyperparameter search
## Step 2: after 3-fold tuning, train the final model on the full dataset
# The two steps run sequentially within one recipe; both receive the same
# seed/flag set, so results stay consistent between evaluation and final fit.
.PHONY: train
train: requirements
	$(PYTHON_INTERPRETER) -m lnp_ml.modeling.nested_cv_optuna \
		$(DEVICE_FLAG) $(MPNN_FLAG) $(SEED_FLAG) $(INIT_PRETRAIN_FLAG) \
		$(N_TRIALS_FLAG) $(EPOCHS_PER_TRIAL_FLAG) $(MIN_STRATUM_FLAG) $(OUTPUT_DIR_FLAG) $(PARALLEL_FLAG)
	$(PYTHON_INTERPRETER) -m lnp_ml.modeling.final_train_optuna_cv \
		$(DEVICE_FLAG) $(MPNN_FLAG) $(SEED_FLAG) $(INIT_PRETRAIN_FLAG) \
		$(N_TRIALS_FLAG) $(EPOCHS_PER_TRIAL_FLAG) $(MIN_STRATUM_FLAG) $(OUTPUT_DIR_FLAG) $(USE_SWA_FLAG)
#################################################################################
# INTERPRETABILITY                                                              #
#################################################################################
# Parameters:
#   TASK    target task (delivery, size, pdi, ee, biodist, toxic, all; default: delivery)
#           if 'all' is given, all 6 tasks are computed in turn
#   METHOD  method (ig, ablation, attention, all; default: ig)
#   DATA    data path (default: data/interim/internal.csv, i.e. the full training data of the final model)
#   MODEL   model path (default: models/final/model.pt)
# Simply-expanded (:=) so each $(if ...) is evaluated once at parse time;
# command-line variables are already set by then, so the result is unchanged.
TASK_FLAG := $(if $(TASK),--task $(TASK),)
METHOD_FLAG := $(if $(METHOD),--method $(METHOD),)
DATA_FLAG := $(if $(DATA),--data-path $(DATA),)
MODEL_FLAG := $(if $(MODEL),--model-path $(MODEL),)
## Compute token-level feature importance
# All flags are optional; the defaults documented above apply when unset.
.PHONY: feature_importance
feature_importance: requirements
	$(PYTHON_INTERPRETER) -m lnp_ml.interpretability.token_importance \
		$(TASK_FLAG) $(METHOD_FLAG) $(DATA_FLAG) $(MODEL_FLAG) $(DEVICE_FLAG)
#################################################################################
# SERVING & DEPLOYMENT                                                          #
#################################################################################
## Formulation optimization: find optimal LNP formulation for target organ
## Usage: make optimize SMILES="CC(C)..." ORGAN=liver
# Both SMILES and ORGAN are mandatory; fail fast with a clear message instead
# of invoking app.optimize with empty arguments. ORGAN is quoted like SMILES
# so values containing shell metacharacters cannot break the command line.
.PHONY: optimize
optimize: requirements
	$(if $(strip $(SMILES)),,$(error SMILES is required: make optimize SMILES="CC(C)..." ORGAN=liver))
	$(if $(strip $(ORGAN)),,$(error ORGAN is required: make optimize SMILES="CC(C)..." ORGAN=liver))
	$(PYTHON_INTERPRETER) -m app.optimize --smiles "$(SMILES)" --organ "$(ORGAN)" $(DEVICE_FLAG)
## Start FastAPI backend server (port 8000)
# --reload makes this a development server (auto-restarts on code changes);
# binds to all interfaces.
.PHONY: api
api: requirements
	uvicorn app.api:app --host 0.0.0.0 --port 8000 --reload
## Start Streamlit frontend app (port 8501)
.PHONY: webapp
webapp: requirements
	streamlit run app/app.py --server.port 8501
## Start both API and webapp (run in separate terminals)
# Informational only — prints instructions (in Chinese): run `make api` and
# `make webapp` in two terminals, then open http://localhost:8501.
.PHONY: serve
serve:
	@echo "请在两个终端分别运行:"
	@echo " 终端 1: make api"
	@echo " 终端 2: make webapp"
	@echo ""
	@echo "然后访问: http://localhost:8501"
#################################################################################
# DOCKER                                                                        #
#################################################################################
## Build Docker images
# Uses the Compose v2 plugin (`docker compose`, not `docker-compose`).
.PHONY: docker-build
docker-build:
	docker compose build
## Start all services with Docker Compose
# -d: detached mode; follow output afterwards with `make docker-logs`.
.PHONY: docker-up
docker-up:
	docker compose up -d
## Stop all Docker services
.PHONY: docker-down
docker-down:
	docker compose down
## View Docker logs
# -f: follow mode; blocks until interrupted with Ctrl-C.
.PHONY: docker-logs
docker-logs:
	docker compose logs -f
## Build and start all services
# Build and up must run in that order. Listing both as prerequisites
# (`docker-serve: docker-build docker-up`) gives NO ordering guarantee under
# `make -j`; invoking them sequentially via $(MAKE) does, and still
# propagates -n/-j jobserver settings correctly.
.PHONY: docker-serve
docker-serve:
	$(MAKE) docker-build
	$(MAKE) docker-up
	@echo ""
	@echo "🚀 服务已启动!"
	@echo " - API: http://localhost:8000"
	@echo " - Web 应用: http://localhost:8501"
	@echo ""
	@echo "查看日志: make docker-logs"
	@echo "停止服务: make docker-down"
## Clean Docker resources (images, volumes, etc.)
# Destructive: removes this project's volumes (-v) and locally built images
# (--rmi local), then prunes dangling resources system-wide.
.PHONY: docker-clean
docker-clean:
	docker compose down -v --rmi local
	docker system prune -f
#################################################################################
# Self Documenting Commands                                                     #
#################################################################################
# Running bare `make` prints the help listing produced by the script below.
.DEFAULT_GOAL := help
# Python one-liner (kept as one logical line via backslash continuations, which
# survive both make's recipe expansion and Python's line joining) that reads
# this Makefile from stdin and prints each `## description` comment next to the
# first `name:` rule header that follows it.
define PRINT_HELP_PYSCRIPT
import re, sys; \
lines = '\n'.join([line for line in sys.stdin]); \
matches = re.findall(r'\n## (.*)\n[\s\S]+?\n([a-zA-Z_-]+):', lines); \
print('Available rules:\n'); \
print('\n'.join(['{:25}{}'.format(*reversed(match)) for match in matches]))
endef
# Exported so the script is also available via the environment; the help recipe
# below expands ${PRINT_HELP_PYSCRIPT} through make itself.
export PRINT_HELP_PYSCRIPT
## Show this help message
# .PHONY so a stray file named `help` can never shadow the default goal.
# NOTE(review): `< $(MAKEFILE_LIST)` assumes a single makefile; with includes
# the redirect would receive multiple words — confirm if includes are added.
.PHONY: help
help:
	@$(PYTHON_INTERPRETER) -c "${PRINT_HELP_PYSCRIPT}" < $(MAKEFILE_LIST)