数据预处理时自动存档总荧光强度的均值与方差,以便于预测时反演荧光强度

This commit is contained in:
RYDE-WORK 2026-03-03 15:48:59 +08:00
parent 8e02bf7f05
commit 28b181e194
5 changed files with 79 additions and 5 deletions

View File

@ -124,6 +124,7 @@ class FormulationResult(BaseModel):
all_biodist: Dict[str, float] all_biodist: Dict[str, float]
# 额外预测值 # 额外预测值
quantified_delivery: Optional[float] = None quantified_delivery: Optional[float] = None
unnormalized_delivery: Optional[float] = None # 反推的原始递送值(z-score 逆变换)
size: Optional[float] = None size: Optional[float] = None
pdi_class: Optional[int] = None # PDI 分类 (0: <0.2, 1: 0.2-0.3, 2: 0.3-0.4, 3: >0.4) pdi_class: Optional[int] = None # PDI 分类 (0: <0.2, 1: 0.2-0.3, 2: 0.3-0.4, 3: >0.4)
ee_class: Optional[int] = None # EE 分类 (0: <80%, 1: 80-90%, 2: >90%) ee_class: Optional[int] = None # EE 分类 (0: <80%, 1: 80-90%, 2: >90%)
@ -331,6 +332,7 @@ async def optimize_formulation(request: OptimizeRequest):
}, },
# 额外预测值 # 额外预测值
quantified_delivery=f.quantified_delivery, quantified_delivery=f.quantified_delivery,
unnormalized_delivery=f.unnormalized_delivery,
size=f.size, size=f.size,
pdi_class=f.pdi_class, pdi_class=f.pdi_class,
ee_class=f.ee_class, ee_class=f.ee_class,

View File

@ -227,6 +227,8 @@ def format_results_dataframe(results: dict, smiles_label: str = None) -> pd.Data
# 添加额外预测值 # 添加额外预测值
if f.get("quantified_delivery") is not None: if f.get("quantified_delivery") is not None:
row["量化递送"] = f"{f['quantified_delivery']:.4f}" row["量化递送"] = f"{f['quantified_delivery']:.4f}"
if f.get("unnormalized_delivery") is not None:
row["总荧光强度"] = f"{f['unnormalized_delivery']:.4f}"
if f.get("size") is not None: if f.get("size") is not None:
row["粒径(nm)"] = f"{f['size']:.1f}" row["粒径(nm)"] = f"{f['size']:.1f}"
if f.get("pdi_class") is not None: if f.get("pdi_class") is not None:

View File

@ -0,0 +1,14 @@
{
"intramuscular": {
"mean": 0.7281303554081238,
"std": 0.7006554090148486,
"qd_min": -1.0387570720282182,
"qd_max": 4.73706835052163
},
"intravenous": {
"mean": 0.29940387649347033,
"std": 0.37474351840219583,
"qd_min": -0.7985592911689305,
"qd_max": 4.497814051056962
}
}

View File

@ -8,6 +8,7 @@
""" """
import itertools import itertools
import json
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, field from dataclasses import dataclass, field
@ -124,11 +125,25 @@ HELPER_LIPID_OPTIONS = ["DOPE", "DSPC"]
# Route of administration 选项 # Route of administration 选项
ROUTE_OPTIONS = ["intravenous", "intramuscular"] ROUTE_OPTIONS = ["intravenous", "intramuscular"]
# quantified_delivery 归一化常量(按给药途径) # delivery 统计量(由 preprocess_internal.py 生成)
DELIVERY_NORM = { # 包含: mean/std(z-score 逆变换)、qd_min/qd_max(评分归一化)
"intravenous": {"min": -0.798559291, "max": 4.497814051056962}, _DELIVERY_STATS_PATH = Path(__file__).resolve().parent / "delivery_zscore_stats.json"
"intramuscular": {"min": -0.794912427, "max": 10.220042980012716}, if _DELIVERY_STATS_PATH.exists():
} with open(_DELIVERY_STATS_PATH) as _f:
DELIVERY_ZSCORE_STATS: Dict[str, Dict[str, float]] = json.load(_f)
logger.info(f"Loaded delivery stats from {_DELIVERY_STATS_PATH}")
else:
DELIVERY_ZSCORE_STATS = {}
logger.warning(f"delivery_zscore_stats.json not found at {_DELIVERY_STATS_PATH}, "
"run 'make preprocess' to generate it")
# quantified_delivery 归一化常量(从统计量中提取 qd_min/qd_max,用于评分归一化到 [0,1])
DELIVERY_NORM: Dict[str, Dict[str, float]] = {}
for _route, _stats in DELIVERY_ZSCORE_STATS.items():
if "qd_min" in _stats and "qd_max" in _stats:
DELIVERY_NORM[_route] = {"min": _stats["qd_min"], "max": _stats["qd_max"]}
if not DELIVERY_NORM:
logger.warning("DELIVERY_NORM is empty — scoring normalization for delivery will be disabled")
@dataclass @dataclass
@ -282,6 +297,7 @@ class Formulation:
biodist_predictions: Dict[str, float] = field(default_factory=dict) biodist_predictions: Dict[str, float] = field(default_factory=dict)
# 额外预测值 # 额外预测值
quantified_delivery: Optional[float] = None quantified_delivery: Optional[float] = None
unnormalized_delivery: Optional[float] = None # 反推的原始递送值(z-score 逆变换)
size: Optional[float] = None size: Optional[float] = None
pdi_class: Optional[int] = None # PDI 分类 (0-3) pdi_class: Optional[int] = None # PDI 分类 (0-3)
ee_class: Optional[int] = None # EE 分类 (0-2) ee_class: Optional[int] = None # EE 分类 (0-2)
@ -588,6 +604,16 @@ def predict_all(
df["pred_ee_class"] = ee_preds df["pred_ee_class"] = ee_preds
df["pred_toxic_class"] = toxic_preds df["pred_toxic_class"] = toxic_preds
# 反推 unnormalized_delivery: value = z-score * std + mean
df["pred_unnorm_delivery"] = np.nan
if DELIVERY_ZSCORE_STATS:
for route_name, stats in DELIVERY_ZSCORE_STATS.items():
mask = df["_route"] == route_name
if mask.any():
df.loc[mask, "pred_unnorm_delivery"] = (
delivery_preds[mask.values] * stats["std"] + stats["mean"]
)
return df return df
@ -645,6 +671,9 @@ def select_top_k(
if key not in seen: if key not in seen:
seen.add(key) seen.add(key)
unnorm_val = row.get("pred_unnorm_delivery")
unnorm_delivery = float(unnorm_val) if pd.notna(unnorm_val) else None
formulation = Formulation( formulation = Formulation(
cationic_lipid_to_mrna_ratio=row["Cationic_Lipid_to_mRNA_weight_ratio"], cationic_lipid_to_mrna_ratio=row["Cationic_Lipid_to_mRNA_weight_ratio"],
cationic_lipid_mol_ratio=row["Cationic_Lipid_Mol_Ratio"], cationic_lipid_mol_ratio=row["Cationic_Lipid_Mol_Ratio"],
@ -658,6 +687,7 @@ def select_top_k(
}, },
# 额外预测值 # 额外预测值
quantified_delivery=row.get("pred_delivery"), quantified_delivery=row.get("pred_delivery"),
unnormalized_delivery=unnorm_delivery,
size=row.get("pred_size"), size=row.get("pred_size"),
pdi_class=int(row.get("pred_pdi_class")) if row.get("pred_pdi_class") is not None else None, pdi_class=int(row.get("pred_pdi_class")) if row.get("pred_pdi_class") is not None else None,
ee_class=int(row.get("pred_ee_class")) if row.get("pred_ee_class") is not None else None, ee_class=int(row.get("pred_ee_class")) if row.get("pred_ee_class") is not None else None,

View File

@ -1,5 +1,6 @@
"""数据清洗脚本:修正原始数据中的问题""" """数据清洗脚本:修正原始数据中的问题"""
import json
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
@ -9,6 +10,7 @@ from loguru import logger
from lnp_ml.config import RAW_DATA_DIR, INTERIM_DATA_DIR from lnp_ml.config import RAW_DATA_DIR, INTERIM_DATA_DIR
APP_DIR = Path(__file__).resolve().parents[1] / "app"
app = typer.Typer() app = typer.Typer()
@ -24,6 +26,7 @@ def main(
修正内容 修正内容
1. 按给药途径分组进行 z-score 标准化 1. 按给药途径分组进行 z-score 标准化
2. size 列取 log 2. size 列取 log
3. z-score mean/std 保存到 app/ 供推理时反推
""" """
logger.info(f"Loading data from {input_path}") logger.info(f"Loading data from {input_path}")
df = pd.read_excel(input_path, header=2) df = pd.read_excel(input_path, header=2)
@ -32,11 +35,34 @@ def main(
# 分别对肌肉注射组和静脉注射组重新进行 z-score 标准化 # 分别对肌肉注射组和静脉注射组重新进行 z-score 标准化
logger.info("Z-score normalizing delivery by Route_of_administration...") logger.info("Z-score normalizing delivery by Route_of_administration...")
df["unnormalized_delivery"] = pd.to_numeric(df["unnormalized_delivery"], errors="coerce") df["unnormalized_delivery"] = pd.to_numeric(df["unnormalized_delivery"], errors="coerce")
# 计算并保存 per-route 统计量,用于推理时反推和评分归一化
zscore_stats = {}
for route, group in df.groupby("Route_of_administration"):
vals = group["unnormalized_delivery"].dropna()
if len(vals) > 1:
zscore_stats[route] = {"mean": float(vals.mean()), "std": float(vals.std())}
logger.info(f" {route}: mean={vals.mean():.6f}, std={vals.std():.6f}, n={len(vals)}")
df["quantified_delivery"] = ( df["quantified_delivery"] = (
df.groupby("Route_of_administration")["unnormalized_delivery"] df.groupby("Route_of_administration")["unnormalized_delivery"]
.transform(lambda x: (x - x.mean()) / x.std()) .transform(lambda x: (x - x.mean()) / x.std())
) )
# 补充 quantified_delivery 的 per-route min/max,用于评分时归一化到 [0,1]
for route, group in df.groupby("Route_of_administration"):
qd = group["quantified_delivery"].dropna()
if len(qd) > 0 and route in zscore_stats:
zscore_stats[route]["qd_min"] = float(qd.min())
zscore_stats[route]["qd_max"] = float(qd.max())
logger.info(f" {route}: qd_min={qd.min():.6f}, qd_max={qd.max():.6f}")
stats_path = APP_DIR / "delivery_zscore_stats.json"
stats_path.parent.mkdir(parents=True, exist_ok=True)
with open(stats_path, "w") as f:
json.dump(zscore_stats, f, indent=2)
logger.success(f"Saved delivery stats to {stats_path}")
# 对 size 列取 log # 对 size 列取 log
logger.info("Log-transforming size column...") logger.info("Log-transforming size column...")
df["size"] = pd.to_numeric(df["size"], errors="coerce") df["size"] = pd.to_numeric(df["size"], errors="coerce")