mirror of
https://github.com/RYDE-WORK/lnp_ml.git
synced 2026-03-21 09:36:32 +08:00
Add final models
This commit is contained in:
parent
93a6f8654d
commit
39a14e4274
15
Makefile
15
Makefile
@ -68,6 +68,11 @@ clean_data: requirements
|
||||
data: requirements
|
||||
$(PYTHON_INTERPRETER) scripts/process_data.py
|
||||
|
||||
# data_final: build the dataset used for the final training run.
# Invokes scripts/process_data_final.py with no arguments, so the script's own
# defaults apply (as visible in that script: 0.9 train ratio, seed 42, output
# under the processed-data directory's "final" subfolder).
# `requirements` and PYTHON_INTERPRETER are defined outside this excerpt.
# The `##` line below is kept verbatim; NOTE(review): it is presumably the
# text picked up by the Makefile's help target — confirm before rewording.
## Process dataset for final training (interim -> processed/final, train:val=9:1, no test)
.PHONY: data_final
data_final: requirements
	$(PYTHON_INTERPRETER) scripts/process_data_final.py
|
||||
|
||||
## Process external data for pretrain (external -> processed)
|
||||
.PHONY: data_pretrain
|
||||
data_pretrain: requirements
|
||||
@ -128,6 +133,16 @@ train: requirements
|
||||
finetune: requirements
|
||||
$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train --init-from-pretrain models/pretrain_delivery.pt $(FREEZE_FLAG) $(MPNN_FLAG) $(DEVICE_FLAG)
|
||||
|
||||
# train_final: run lnp_ml.modeling.train on the "final" split.
#   inputs : data/processed/final/train.parquet, data/processed/final/val.parquet
#   init   : models/pretrain_delivery.pt (passed via --init-from-pretrain)
#   output : models/final
# FREEZE_FLAG, MPNN_FLAG and DEVICE_FLAG are expanded as-is at the end of the
# command line; they are defined outside this excerpt.
# The `##` line below is kept verbatim; NOTE(review): it is presumably the
# text picked up by the Makefile's help target — confirm before rewording.
## Final training using all data (train:val=9:1, no test set), with pretrained weights
.PHONY: train_final
train_final: requirements
	$(PYTHON_INTERPRETER) -m lnp_ml.modeling.train \
		--train-path data/processed/final/train.parquet --val-path data/processed/final/val.parquet \
		--output-dir models/final --init-from-pretrain models/pretrain_delivery.pt \
		$(FREEZE_FLAG) \
		$(MPNN_FLAG) \
		$(DEVICE_FLAG)
|
||||
|
||||
## Finetune with cross-validation on internal data (5-fold, amine-based split) with pretrained weights
|
||||
.PHONY: finetune_cv
|
||||
finetune_cv: requirements
|
||||
|
||||
89
data/processed/final/feature_columns.txt
Normal file
89
data/processed/final/feature_columns.txt
Normal file
@ -0,0 +1,89 @@
|
||||
# Feature columns configuration (final training)
|
||||
|
||||
# SMILES
|
||||
smiles
|
||||
|
||||
# comp token [5]
|
||||
Cationic_Lipid_to_mRNA_weight_ratio
|
||||
Cationic_Lipid_Mol_Ratio
|
||||
Phospholipid_Mol_Ratio
|
||||
Cholesterol_Mol_Ratio
|
||||
PEG_Lipid_Mol_Ratio
|
||||
|
||||
# phys token [12]
|
||||
Purity_Pure
|
||||
Purity_Crude
|
||||
Mix_type_Microfluidic
|
||||
Mix_type_Pipetting
|
||||
Cargo_type_mRNA
|
||||
Cargo_type_pDNA
|
||||
Cargo_type_siRNA
|
||||
Target_or_delivered_gene_FFL
|
||||
Target_or_delivered_gene_Peptide_barcode
|
||||
Target_or_delivered_gene_hEPO
|
||||
Target_or_delivered_gene_FVII
|
||||
Target_or_delivered_gene_GFP
|
||||
|
||||
# help token [4]
|
||||
Helper_lipid_ID_DOPE
|
||||
Helper_lipid_ID_DOTAP
|
||||
Helper_lipid_ID_DSPC
|
||||
Helper_lipid_ID_MDOA
|
||||
|
||||
# exp token [32]
|
||||
Model_type_A549
|
||||
Model_type_BDMC
|
||||
Model_type_BMDM
|
||||
Model_type_HBEC_ALI
|
||||
Model_type_HEK293T
|
||||
Model_type_HeLa
|
||||
Model_type_IGROV1
|
||||
Model_type_Mouse
|
||||
Model_type_RAW264p7
|
||||
Delivery_target_body
|
||||
Delivery_target_dendritic_cell
|
||||
Delivery_target_generic_cell
|
||||
Delivery_target_liver
|
||||
Delivery_target_lung
|
||||
Delivery_target_lung_epithelium
|
||||
Delivery_target_macrophage
|
||||
Delivery_target_muscle
|
||||
Delivery_target_spleen
|
||||
Route_of_administration_in_vitro
|
||||
Route_of_administration_intramuscular
|
||||
Route_of_administration_intratracheal
|
||||
Route_of_administration_intravenous
|
||||
Batch_or_individual_or_barcoded_Barcoded
|
||||
Batch_or_individual_or_barcoded_Individual
|
||||
Value_name_log_luminescence
|
||||
Value_name_luminescence
|
||||
Value_name_FFL_silencing
|
||||
Value_name_Peptide_abundance
|
||||
Value_name_hEPO
|
||||
Value_name_FVII_silencing
|
||||
Value_name_GFP_delivery
|
||||
Value_name_Discretized_luminescence
|
||||
|
||||
# Targets
|
||||
## Regression
|
||||
size
|
||||
quantified_delivery
|
||||
## PDI classification
|
||||
PDI_0_0to0_2
|
||||
PDI_0_2to0_3
|
||||
PDI_0_3to0_4
|
||||
PDI_0_4to0_5
|
||||
## EE classification
|
||||
Encapsulation_Efficiency_EE<50
|
||||
Encapsulation_Efficiency_50<=EE<80
|
||||
Encapsulation_Efficiency_80<EE<=100
|
||||
## Toxic
|
||||
toxic
|
||||
## Biodistribution
|
||||
Biodistribution_lymph_nodes
|
||||
Biodistribution_heart
|
||||
Biodistribution_liver
|
||||
Biodistribution_spleen
|
||||
Biodistribution_lung
|
||||
Biodistribution_kidney
|
||||
Biodistribution_muscle
|
||||
BIN
data/processed/final/train.parquet
Normal file
BIN
data/processed/final/train.parquet
Normal file
Binary file not shown.
BIN
data/processed/final/val.parquet
Normal file
BIN
data/processed/final/val.parquet
Normal file
Binary file not shown.
720
models/final/history.json
Normal file
720
models/final/history.json
Normal file
@ -0,0 +1,720 @@
|
||||
{
|
||||
"train": [
|
||||
{
|
||||
"loss": 16.125412720900314,
|
||||
"loss_size": 10.958004199541532,
|
||||
"loss_pdi": 1.2954746576455922,
|
||||
"loss_ee": 1.032385032910567,
|
||||
"loss_delivery": 0.9622580982171572,
|
||||
"loss_biodist": 1.3037926875627959,
|
||||
"loss_toxic": 0.5734981435995835
|
||||
},
|
||||
{
|
||||
"loss": 5.045667318197397,
|
||||
"loss_size": 0.9689683621892562,
|
||||
"loss_pdi": 0.9294195175170898,
|
||||
"loss_ee": 0.9251096065227802,
|
||||
"loss_delivery": 0.8660176992416382,
|
||||
"loss_biodist": 1.0843104399167574,
|
||||
"loss_toxic": 0.27184163607083833
|
||||
},
|
||||
{
|
||||
"loss": 3.8401913826282206,
|
||||
"loss_size": 0.2623294328267758,
|
||||
"loss_pdi": 0.6934819909242483,
|
||||
"loss_ee": 0.8734692564377418,
|
||||
"loss_delivery": 0.9325725705577776,
|
||||
"loss_biodist": 0.8828928974958566,
|
||||
"loss_toxic": 0.1954453088916265
|
||||
},
|
||||
{
|
||||
"loss": 3.3800897414867697,
|
||||
"loss_size": 0.25510963281759846,
|
||||
"loss_pdi": 0.6420884224084707,
|
||||
"loss_ee": 0.8549692860016456,
|
||||
"loss_delivery": 0.8087867188912171,
|
||||
"loss_biodist": 0.6538480222225189,
|
||||
"loss_toxic": 0.1652876244714627
|
||||
},
|
||||
{
|
||||
"loss": 3.126784379665668,
|
||||
"loss_size": 0.24261613419422737,
|
||||
"loss_pdi": 0.6058944624203902,
|
||||
"loss_ee": 0.8256107156093304,
|
||||
"loss_delivery": 0.7888596923305438,
|
||||
"loss_biodist": 0.531504328434284,
|
||||
"loss_toxic": 0.13229898621256536
|
||||
},
|
||||
{
|
||||
"loss": 2.8701502359830418,
|
||||
"loss_size": 0.2547111769135182,
|
||||
"loss_pdi": 0.5767100866024311,
|
||||
"loss_ee": 0.7695629459161025,
|
||||
"loss_delivery": 0.7461442443040701,
|
||||
"loss_biodist": 0.42365796520159793,
|
||||
"loss_toxic": 0.0993638989300682
|
||||
},
|
||||
{
|
||||
"loss": 2.640800641133235,
|
||||
"loss_size": 0.23404385321415389,
|
||||
"loss_pdi": 0.569634735584259,
|
||||
"loss_ee": 0.7578622423685514,
|
||||
"loss_delivery": 0.6383189450089748,
|
||||
"loss_biodist": 0.3649272185105544,
|
||||
"loss_toxic": 0.0760135928598734
|
||||
},
|
||||
{
|
||||
"loss": 2.5832578127200785,
|
||||
"loss_size": 0.23187803878233984,
|
||||
"loss_pdi": 0.5650712022414575,
|
||||
"loss_ee": 0.7505346032289358,
|
||||
"loss_delivery": 0.6600469161684697,
|
||||
"loss_biodist": 0.31232603925925034,
|
||||
"loss_toxic": 0.06340097868815064
|
||||
},
|
||||
{
|
||||
"loss": 2.4268490993059597,
|
||||
"loss_size": 0.2535917151432771,
|
||||
"loss_pdi": 0.49883827567100525,
|
||||
"loss_ee": 0.6963567412816561,
|
||||
"loss_delivery": 0.6287171774758742,
|
||||
"loss_biodist": 0.2783840080866447,
|
||||
"loss_toxic": 0.07096117032835117
|
||||
},
|
||||
{
|
||||
"loss": 2.260506345675542,
|
||||
"loss_size": 0.2080249826495464,
|
||||
"loss_pdi": 0.5252504096581385,
|
||||
"loss_ee": 0.6650349016372974,
|
||||
"loss_delivery": 0.5313630184301963,
|
||||
"loss_biodist": 0.26103301919423616,
|
||||
"loss_toxic": 0.06980002821924594
|
||||
},
|
||||
{
|
||||
"loss": 2.2684748631257277,
|
||||
"loss_size": 0.23266072456653303,
|
||||
"loss_pdi": 0.5147650150152353,
|
||||
"loss_ee": 0.6746502197705783,
|
||||
"loss_delivery": 0.5654611770923321,
|
||||
"loss_biodist": 0.2386850737608396,
|
||||
"loss_toxic": 0.04225267691967579
|
||||
},
|
||||
{
|
||||
"loss": 2.085041247881376,
|
||||
"loss_size": 0.21298694839844337,
|
||||
"loss_pdi": 0.478065183529487,
|
||||
"loss_ee": 0.624563992023468,
|
||||
"loss_delivery": 0.4895915320286384,
|
||||
"loss_biodist": 0.24257883085654333,
|
||||
"loss_toxic": 0.03725476738495322
|
||||
},
|
||||
{
|
||||
"loss": 2.034342252291166,
|
||||
"loss_size": 0.20522931390083754,
|
||||
"loss_pdi": 0.49255690895594084,
|
||||
"loss_ee": 0.6087538760442001,
|
||||
"loss_delivery": 0.480256066012841,
|
||||
"loss_biodist": 0.2110953021507997,
|
||||
"loss_toxic": 0.03645075023031005
|
||||
},
|
||||
{
|
||||
"loss": 1.9308108733250544,
|
||||
"loss_size": 0.17175877896639016,
|
||||
"loss_pdi": 0.45260854638539827,
|
||||
"loss_ee": 0.6115030990197108,
|
||||
"loss_delivery": 0.46177717126332796,
|
||||
"loss_biodist": 0.17210170110830894,
|
||||
"loss_toxic": 0.06106155129292837
|
||||
},
|
||||
{
|
||||
"loss": 1.8926622317387507,
|
||||
"loss_size": 0.18219156162096903,
|
||||
"loss_pdi": 0.45498812886384815,
|
||||
"loss_ee": 0.6137785132114704,
|
||||
"loss_delivery": 0.442503636273054,
|
||||
"loss_biodist": 0.1760710489291411,
|
||||
"loss_toxic": 0.023129337216512516
|
||||
},
|
||||
{
|
||||
"loss": 1.973634958267212,
|
||||
"loss_size": 0.1851495263668207,
|
||||
"loss_pdi": 0.44218484254983753,
|
||||
"loss_ee": 0.5531376554415777,
|
||||
"loss_delivery": 0.5962466007241836,
|
||||
"loss_biodist": 0.1732567583139126,
|
||||
"loss_toxic": 0.023659600159869745
|
||||
},
|
||||
{
|
||||
"loss": 1.834187232531034,
|
||||
"loss_size": 0.1886328814121393,
|
||||
"loss_pdi": 0.417209030343936,
|
||||
"loss_ee": 0.6130589017501245,
|
||||
"loss_delivery": 0.41392402522839034,
|
||||
"loss_biodist": 0.17852016595693734,
|
||||
"loss_toxic": 0.022842258698521897
|
||||
},
|
||||
{
|
||||
"loss": 1.6386324167251587,
|
||||
"loss_size": 0.1619069530413701,
|
||||
"loss_pdi": 0.40920511002723986,
|
||||
"loss_ee": 0.543202471274596,
|
||||
"loss_delivery": 0.3477886743270434,
|
||||
"loss_biodist": 0.15435600338073877,
|
||||
"loss_toxic": 0.02217323684061949
|
||||
},
|
||||
{
|
||||
"loss": 1.6642086597589345,
|
||||
"loss_size": 0.1822590002646813,
|
||||
"loss_pdi": 0.38968331194840944,
|
||||
"loss_ee": 0.5327657415316656,
|
||||
"loss_delivery": 0.38456671570356077,
|
||||
"loss_biodist": 0.15715198791944063,
|
||||
"loss_toxic": 0.01778192028331642
|
||||
},
|
||||
{
|
||||
"loss": 1.561285321529095,
|
||||
"loss_size": 0.1437432708648535,
|
||||
"loss_pdi": 0.38639354361937595,
|
||||
"loss_ee": 0.5043096313109765,
|
||||
"loss_delivery": 0.35284727811813354,
|
||||
"loss_biodist": 0.15738230026685274,
|
||||
"loss_toxic": 0.016609296166839507
|
||||
},
|
||||
{
|
||||
"loss": 1.5598738010113056,
|
||||
"loss_size": 0.14988142280624464,
|
||||
"loss_pdi": 0.41852312133862424,
|
||||
"loss_ee": 0.5153307456236619,
|
||||
"loss_delivery": 0.30808181086411845,
|
||||
"loss_biodist": 0.1537162667283645,
|
||||
"loss_toxic": 0.014340438235264558
|
||||
},
|
||||
{
|
||||
"loss": 1.586504679459792,
|
||||
"loss_size": 0.1579624334206948,
|
||||
"loss_pdi": 0.4118466583582071,
|
||||
"loss_ee": 0.5027729616715357,
|
||||
"loss_delivery": 0.33311148560964143,
|
||||
"loss_biodist": 0.16044681118084833,
|
||||
"loss_toxic": 0.020364307870085422
|
||||
},
|
||||
{
|
||||
"loss": 1.5559651026358972,
|
||||
"loss_size": 0.15422992723492476,
|
||||
"loss_pdi": 0.40416549490048337,
|
||||
"loss_ee": 0.5043272055112399,
|
||||
"loss_delivery": 0.34229428607683915,
|
||||
"loss_biodist": 0.13546629192737433,
|
||||
"loss_toxic": 0.015481884215170374
|
||||
},
|
||||
{
|
||||
"loss": 1.474733068392827,
|
||||
"loss_size": 0.1376458337673774,
|
||||
"loss_pdi": 0.3663271631185825,
|
||||
"loss_ee": 0.5019905475469736,
|
||||
"loss_delivery": 0.3069626304965753,
|
||||
"loss_biodist": 0.1442684789116566,
|
||||
"loss_toxic": 0.01753841727398909
|
||||
},
|
||||
{
|
||||
"loss": 1.4861183900099535,
|
||||
"loss_size": 0.15266291739848944,
|
||||
"loss_pdi": 0.3844867234046643,
|
||||
"loss_ee": 0.49344547666036165,
|
||||
"loss_delivery": 0.292184143112256,
|
||||
"loss_biodist": 0.14641749343046775,
|
||||
"loss_toxic": 0.016921617932474382
|
||||
},
|
||||
{
|
||||
"loss": 1.512233894604903,
|
||||
"loss_size": 0.14189060032367706,
|
||||
"loss_pdi": 0.4098552969785837,
|
||||
"loss_ee": 0.4653961268755106,
|
||||
"loss_delivery": 0.29666711619267094,
|
||||
"loss_biodist": 0.13669535231131774,
|
||||
"loss_toxic": 0.061729387344362646
|
||||
},
|
||||
{
|
||||
"loss": 1.4599237900513868,
|
||||
"loss_size": 0.14220070122526243,
|
||||
"loss_pdi": 0.4150706483767583,
|
||||
"loss_ee": 0.46269946602674633,
|
||||
"loss_delivery": 0.2918037512841133,
|
||||
"loss_biodist": 0.13078988859286675,
|
||||
"loss_toxic": 0.017359334491909698
|
||||
},
|
||||
{
|
||||
"loss": 1.576150316458482,
|
||||
"loss_size": 0.1595412979905422,
|
||||
"loss_pdi": 0.3765770483475465,
|
||||
"loss_ee": 0.5104989432371579,
|
||||
"loss_delivery": 0.3169563438456792,
|
||||
"loss_biodist": 0.16685812347210371,
|
||||
"loss_toxic": 0.04571856157137798
|
||||
},
|
||||
{
|
||||
"loss": 1.4343401743815496,
|
||||
"loss_size": 0.13940962203420126,
|
||||
"loss_pdi": 0.3983347278374892,
|
||||
"loss_ee": 0.4601492996399219,
|
||||
"loss_delivery": 0.27466822358278126,
|
||||
"loss_biodist": 0.1437123933663735,
|
||||
"loss_toxic": 0.018065906255147778
|
||||
},
|
||||
{
|
||||
"loss": 1.3912398769305303,
|
||||
"loss_size": 0.12413510250357482,
|
||||
"loss_pdi": 0.37387907390411085,
|
||||
"loss_ee": 0.4670978830410884,
|
||||
"loss_delivery": 0.2875180955116565,
|
||||
"loss_biodist": 0.1249157666013791,
|
||||
"loss_toxic": 0.01369395311205433
|
||||
},
|
||||
{
|
||||
"loss": 1.424091059427995,
|
||||
"loss_size": 0.15850819360751373,
|
||||
"loss_pdi": 0.3705967183296497,
|
||||
"loss_ee": 0.4417672294836778,
|
||||
"loss_delivery": 0.30434694112493443,
|
||||
"loss_biodist": 0.13330914022830817,
|
||||
"loss_toxic": 0.015562845026859297
|
||||
},
|
||||
{
|
||||
"loss": 1.40188762316337,
|
||||
"loss_size": 0.13292228774382517,
|
||||
"loss_pdi": 0.3610769865604547,
|
||||
"loss_ee": 0.5077145374738253,
|
||||
"loss_delivery": 0.2582832815555426,
|
||||
"loss_biodist": 0.12532487941476014,
|
||||
"loss_toxic": 0.0165656553223156
|
||||
},
|
||||
{
|
||||
"loss": 1.3533825324131892,
|
||||
"loss_size": 0.13495178577991632,
|
||||
"loss_pdi": 0.3524953780265955,
|
||||
"loss_ee": 0.4514327874550453,
|
||||
"loss_delivery": 0.25540869382138437,
|
||||
"loss_biodist": 0.13806180197459,
|
||||
"loss_toxic": 0.021032094086806934
|
||||
},
|
||||
{
|
||||
"loss": 1.3358305784372182,
|
||||
"loss_size": 0.14862898737192154,
|
||||
"loss_pdi": 0.3689935069817763,
|
||||
"loss_ee": 0.4314877482560965,
|
||||
"loss_delivery": 0.24364815193873185,
|
||||
"loss_biodist": 0.13041460055571336,
|
||||
"loss_toxic": 0.01265757203173752
|
||||
}
|
||||
],
|
||||
"val": [
|
||||
{
|
||||
"loss": 9.567692279815674,
|
||||
"loss_size": 4.553072690963745,
|
||||
"loss_pdi": 1.0625805258750916,
|
||||
"loss_ee": 0.9942680895328522,
|
||||
"loss_delivery": 1.47209894657135,
|
||||
"loss_biodist": 1.1432022452354431,
|
||||
"loss_toxic": 0.3424694389104843,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.5909090909090909,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 4.354300379753113,
|
||||
"loss_size": 0.20095597207546234,
|
||||
"loss_pdi": 0.7770168483257294,
|
||||
"loss_ee": 0.9953322410583496,
|
||||
"loss_delivery": 1.3067785501480103,
|
||||
"loss_biodist": 0.9856867492198944,
|
||||
"loss_toxic": 0.08853016048669815,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.5909090909090909,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 4.091485023498535,
|
||||
"loss_size": 0.1840391382575035,
|
||||
"loss_pdi": 0.6579808592796326,
|
||||
"loss_ee": 0.9572223722934723,
|
||||
"loss_delivery": 1.386644184589386,
|
||||
"loss_biodist": 0.8630380034446716,
|
||||
"loss_toxic": 0.04256031848490238,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.5909090909090909,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 3.5865455865859985,
|
||||
"loss_size": 0.16276441887021065,
|
||||
"loss_pdi": 0.6156152784824371,
|
||||
"loss_ee": 0.8769127726554871,
|
||||
"loss_delivery": 1.185390830039978,
|
||||
"loss_biodist": 0.693861186504364,
|
||||
"loss_toxic": 0.05200100876390934,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.5909090909090909,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 3.216905355453491,
|
||||
"loss_size": 0.20165041834115982,
|
||||
"loss_pdi": 0.6010490655899048,
|
||||
"loss_ee": 0.8362293839454651,
|
||||
"loss_delivery": 1.0108550190925598,
|
||||
"loss_biodist": 0.4944496750831604,
|
||||
"loss_toxic": 0.0726719107478857,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.6363636363636364,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 3.0632435083389282,
|
||||
"loss_size": 0.24732542037963867,
|
||||
"loss_pdi": 0.5763635039329529,
|
||||
"loss_ee": 0.8021132349967957,
|
||||
"loss_delivery": 0.8960984945297241,
|
||||
"loss_biodist": 0.4863189309835434,
|
||||
"loss_toxic": 0.05502396076917648,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.8909536600112915,
|
||||
"loss_size": 0.19840088486671448,
|
||||
"loss_pdi": 0.5675790905952454,
|
||||
"loss_ee": 0.789268285036087,
|
||||
"loss_delivery": 0.8245587646961212,
|
||||
"loss_biodist": 0.45656538009643555,
|
||||
"loss_toxic": 0.054581284523010254,
|
||||
"acc_pdi": 0.7272727272727273,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.771771550178528,
|
||||
"loss_size": 0.1412012092769146,
|
||||
"loss_pdi": 0.5785803645849228,
|
||||
"loss_ee": 0.7794539630413055,
|
||||
"loss_delivery": 0.7300527393817902,
|
||||
"loss_biodist": 0.4118006229400635,
|
||||
"loss_toxic": 0.130682535469532,
|
||||
"acc_pdi": 0.7272727272727273,
|
||||
"acc_ee": 0.6363636363636364,
|
||||
"acc_toxic": 0.9090909090909091
|
||||
},
|
||||
{
|
||||
"loss": 2.692520260810852,
|
||||
"loss_size": 0.1475178860127926,
|
||||
"loss_pdi": 0.5785025954246521,
|
||||
"loss_ee": 0.7986349761486053,
|
||||
"loss_delivery": 0.6745082139968872,
|
||||
"loss_biodist": 0.3999589830636978,
|
||||
"loss_toxic": 0.09339761920273304,
|
||||
"acc_pdi": 0.7272727272727273,
|
||||
"acc_ee": 0.5454545454545454,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.8346426486968994,
|
||||
"loss_size": 0.1516040787100792,
|
||||
"loss_pdi": 0.5753654837608337,
|
||||
"loss_ee": 0.8160740733146667,
|
||||
"loss_delivery": 0.7168894708156586,
|
||||
"loss_biodist": 0.47982172667980194,
|
||||
"loss_toxic": 0.09488803520798683,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.6818181818181818,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.7991228103637695,
|
||||
"loss_size": 0.13234595209360123,
|
||||
"loss_pdi": 0.5597751885652542,
|
||||
"loss_ee": 0.8118472993373871,
|
||||
"loss_delivery": 0.8289965093135834,
|
||||
"loss_biodist": 0.3804939240217209,
|
||||
"loss_toxic": 0.0856639239937067,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6136363636363636,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.5899158716201782,
|
||||
"loss_size": 0.10683015361428261,
|
||||
"loss_pdi": 0.5460522472858429,
|
||||
"loss_ee": 0.8205565512180328,
|
||||
"loss_delivery": 0.5881542563438416,
|
||||
"loss_biodist": 0.47443418204784393,
|
||||
"loss_toxic": 0.053888481110334396,
|
||||
"acc_pdi": 0.7954545454545454,
|
||||
"acc_ee": 0.6818181818181818,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.725231647491455,
|
||||
"loss_size": 0.11643071845173836,
|
||||
"loss_pdi": 0.5444842875003815,
|
||||
"loss_ee": 0.8331143856048584,
|
||||
"loss_delivery": 0.7808746695518494,
|
||||
"loss_biodist": 0.3872501105070114,
|
||||
"loss_toxic": 0.06307746656239033,
|
||||
"acc_pdi": 0.8181818181818182,
|
||||
"acc_ee": 0.6818181818181818,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6474616527557373,
|
||||
"loss_size": 0.10570626705884933,
|
||||
"loss_pdi": 0.5306982547044754,
|
||||
"loss_ee": 0.8475350737571716,
|
||||
"loss_delivery": 0.6773415505886078,
|
||||
"loss_biodist": 0.45252545177936554,
|
||||
"loss_toxic": 0.033654999919235706,
|
||||
"acc_pdi": 0.7954545454545454,
|
||||
"acc_ee": 0.7045454545454546,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.7429730892181396,
|
||||
"loss_size": 0.13143501430749893,
|
||||
"loss_pdi": 0.5128554701805115,
|
||||
"loss_ee": 0.8301066756248474,
|
||||
"loss_delivery": 0.7826772928237915,
|
||||
"loss_biodist": 0.4481300711631775,
|
||||
"loss_toxic": 0.03776852134615183,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.7127978801727295,
|
||||
"loss_size": 0.19175250455737114,
|
||||
"loss_pdi": 0.516972616314888,
|
||||
"loss_ee": 0.8794091939926147,
|
||||
"loss_delivery": 0.6612942218780518,
|
||||
"loss_biodist": 0.4495534151792526,
|
||||
"loss_toxic": 0.01381598599255085,
|
||||
"acc_pdi": 0.7954545454545454,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.605361819267273,
|
||||
"loss_size": 0.0976286269724369,
|
||||
"loss_pdi": 0.5153753459453583,
|
||||
"loss_ee": 0.8854215145111084,
|
||||
"loss_delivery": 0.6795051693916321,
|
||||
"loss_biodist": 0.3929741233587265,
|
||||
"loss_toxic": 0.0344569431617856,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6136363636363636,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6061036586761475,
|
||||
"loss_size": 0.13404572382569313,
|
||||
"loss_pdi": 0.5658221244812012,
|
||||
"loss_ee": 0.8649452030658722,
|
||||
"loss_delivery": 0.5898179858922958,
|
||||
"loss_biodist": 0.4181220978498459,
|
||||
"loss_toxic": 0.03335061576217413,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6363636363636364,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.566210627555847,
|
||||
"loss_size": 0.09801355004310608,
|
||||
"loss_pdi": 0.5331112891435623,
|
||||
"loss_ee": 0.8543781340122223,
|
||||
"loss_delivery": 0.663883626461029,
|
||||
"loss_biodist": 0.40053851902484894,
|
||||
"loss_toxic": 0.01628561830148101,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6363636363636364,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.5774325132369995,
|
||||
"loss_size": 0.10316971689462662,
|
||||
"loss_pdi": 0.5015804171562195,
|
||||
"loss_ee": 0.8556492626667023,
|
||||
"loss_delivery": 0.6874183714389801,
|
||||
"loss_biodist": 0.40917880833148956,
|
||||
"loss_toxic": 0.02043590322136879,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6818181818181818,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.605422019958496,
|
||||
"loss_size": 0.09912058711051941,
|
||||
"loss_pdi": 0.5235853344202042,
|
||||
"loss_ee": 0.873578816652298,
|
||||
"loss_delivery": 0.6754274368286133,
|
||||
"loss_biodist": 0.41106243431568146,
|
||||
"loss_toxic": 0.022647667676210403,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6543829441070557,
|
||||
"loss_size": 0.09768814221024513,
|
||||
"loss_pdi": 0.49928364157676697,
|
||||
"loss_ee": 0.887874037027359,
|
||||
"loss_delivery": 0.7190346419811249,
|
||||
"loss_biodist": 0.3921310156583786,
|
||||
"loss_toxic": 0.0583715345710516,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6818181818181818,
|
||||
"acc_toxic": 0.9696969696969697
|
||||
},
|
||||
{
|
||||
"loss": 2.567589282989502,
|
||||
"loss_size": 0.09272187761962414,
|
||||
"loss_pdi": 0.51718769967556,
|
||||
"loss_ee": 0.9185941517353058,
|
||||
"loss_delivery": 0.6525538563728333,
|
||||
"loss_biodist": 0.37214455008506775,
|
||||
"loss_toxic": 0.014387213159352541,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.5968352556228638,
|
||||
"loss_size": 0.08045755326747894,
|
||||
"loss_pdi": 0.5356858968734741,
|
||||
"loss_ee": 0.9407581686973572,
|
||||
"loss_delivery": 0.634858101606369,
|
||||
"loss_biodist": 0.3985549360513687,
|
||||
"loss_toxic": 0.006520765833556652,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6400684118270874,
|
||||
"loss_size": 0.10199694335460663,
|
||||
"loss_pdi": 0.517327144742012,
|
||||
"loss_ee": 0.9348995685577393,
|
||||
"loss_delivery": 0.6469141840934753,
|
||||
"loss_biodist": 0.4060143977403641,
|
||||
"loss_toxic": 0.032916306518018246,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6818181818181818,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.684706211090088,
|
||||
"loss_size": 0.08828861452639103,
|
||||
"loss_pdi": 0.5358051210641861,
|
||||
"loss_ee": 0.9240424036979675,
|
||||
"loss_delivery": 0.7182384729385376,
|
||||
"loss_biodist": 0.4059095233678818,
|
||||
"loss_toxic": 0.012422125786542892,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.665715456008911,
|
||||
"loss_size": 0.09193641319870949,
|
||||
"loss_pdi": 0.5586960315704346,
|
||||
"loss_ee": 0.9341873526573181,
|
||||
"loss_delivery": 0.6672921180725098,
|
||||
"loss_biodist": 0.39184093475341797,
|
||||
"loss_toxic": 0.021762709133327007,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.596559524536133,
|
||||
"loss_size": 0.09265327453613281,
|
||||
"loss_pdi": 0.528680682182312,
|
||||
"loss_ee": 0.919374018907547,
|
||||
"loss_delivery": 0.6490764021873474,
|
||||
"loss_biodist": 0.3949074149131775,
|
||||
"loss_toxic": 0.011867486871778965,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.585421562194824,
|
||||
"loss_size": 0.0936437975615263,
|
||||
"loss_pdi": 0.5345576107501984,
|
||||
"loss_ee": 0.9352433085441589,
|
||||
"loss_delivery": 0.6098059117794037,
|
||||
"loss_biodist": 0.3938410133123398,
|
||||
"loss_toxic": 0.018329923041164875,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.655733346939087,
|
||||
"loss_size": 0.10606781952083111,
|
||||
"loss_pdi": 0.5337405353784561,
|
||||
"loss_ee": 0.9154745638370514,
|
||||
"loss_delivery": 0.7101629674434662,
|
||||
"loss_biodist": 0.38175711035728455,
|
||||
"loss_toxic": 0.00853043282404542,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6239495277404785,
|
||||
"loss_size": 0.0914073996245861,
|
||||
"loss_pdi": 0.5375382900238037,
|
||||
"loss_ee": 0.9078292548656464,
|
||||
"loss_delivery": 0.693512886762619,
|
||||
"loss_biodist": 0.3826553374528885,
|
||||
"loss_toxic": 0.011006365530192852,
|
||||
"acc_pdi": 0.8181818181818182,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6274508237838745,
|
||||
"loss_size": 0.0975069236010313,
|
||||
"loss_pdi": 0.5298525840044022,
|
||||
"loss_ee": 0.9227176904678345,
|
||||
"loss_delivery": 0.6834201812744141,
|
||||
"loss_biodist": 0.3833360821008682,
|
||||
"loss_toxic": 0.010617360007017851,
|
||||
"acc_pdi": 0.7954545454545454,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6211040019989014,
|
||||
"loss_size": 0.108822550624609,
|
||||
"loss_pdi": 0.5202214568853378,
|
||||
"loss_ee": 0.9265661537647247,
|
||||
"loss_delivery": 0.6657091081142426,
|
||||
"loss_biodist": 0.39240916073322296,
|
||||
"loss_toxic": 0.007375639397650957,
|
||||
"acc_pdi": 0.75,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
},
|
||||
{
|
||||
"loss": 2.6309741735458374,
|
||||
"loss_size": 0.10216855630278587,
|
||||
"loss_pdi": 0.5254682153463364,
|
||||
"loss_ee": 0.9219010472297668,
|
||||
"loss_delivery": 0.6798527538776398,
|
||||
"loss_biodist": 0.3923489600419998,
|
||||
"loss_toxic": 0.009234576486051083,
|
||||
"acc_pdi": 0.7727272727272727,
|
||||
"acc_ee": 0.6590909090909091,
|
||||
"acc_toxic": 1.0
|
||||
}
|
||||
]
|
||||
}
|
||||
BIN
models/final/loss_curves.png
Normal file
BIN
models/final/loss_curves.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 260 KiB |
BIN
models/final/model.pt
Normal file
BIN
models/final/model.pt
Normal file
Binary file not shown.
153
scripts/process_data_final.py
Normal file
153
scripts/process_data_final.py
Normal file
@ -0,0 +1,153 @@
|
||||
"""最终训练数据处理脚本:train:val = 9:1,无测试集"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import typer
|
||||
from loguru import logger
|
||||
|
||||
from lnp_ml.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR
|
||||
from lnp_ml.dataset import (
|
||||
process_dataframe,
|
||||
SMILES_COL,
|
||||
COMP_COLS,
|
||||
HELP_COLS,
|
||||
TARGET_REGRESSION,
|
||||
TARGET_CLASSIFICATION_PDI,
|
||||
TARGET_CLASSIFICATION_EE,
|
||||
TARGET_TOXIC,
|
||||
TARGET_BIODIST,
|
||||
get_phys_cols,
|
||||
get_exp_cols,
|
||||
)
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def _write_feature_config(config_path: Path, phys_cols: list, exp_cols: list) -> None:
    """Write the grouped feature and target column names to ``config_path``.

    The file layout (section headers and blank lines) is unchanged from the
    previous inline implementation, so an existing ``feature_columns.txt``
    regenerates byte-for-byte.
    """
    feature_groups = (
        ("comp", COMP_COLS),
        ("phys", phys_cols),
        ("help", HELP_COLS),
        ("exp", exp_cols),
    )
    target_groups = (
        ("Regression", TARGET_REGRESSION),
        ("PDI classification", TARGET_CLASSIFICATION_PDI),
        ("EE classification", TARGET_CLASSIFICATION_EE),
        ("Toxic", [str(TARGET_TOXIC)]),
        ("Biodistribution", TARGET_BIODIST),
    )

    # Pin the encoding so the output does not depend on the platform locale.
    with open(config_path, "w", encoding="utf-8") as f:
        f.write("# Feature columns configuration (final training)\n\n")
        f.write(f"# SMILES\n{SMILES_COL}\n\n")
        # Feature groups are separated by a blank line ...
        for token, cols in feature_groups:
            f.write(f"# {token} token [{len(cols)}]\n")
            f.write("\n".join(cols) + "\n\n")
        # ... target groups are written back to back.
        f.write("# Targets\n")
        for title, cols in target_groups:
            f.write(f"## {title}\n")
            f.write("\n".join(cols) + "\n")


def _log_dataset_statistics(df: pd.DataFrame) -> None:
    """Log sample counts, missing target values and PDI/EE class counts for ``df``."""
    n = len(df)
    logger.info("\n=== Dataset Statistics ===")
    logger.info(f"Total samples: {n}")
    logger.info(f"SMILES unique: {df[SMILES_COL].nunique()}")

    # Missing-value statistics for the scalar targets.
    logger.info("\nMissing values in targets:")
    for col in TARGET_REGRESSION + [TARGET_TOXIC]:
        if col in df.columns:
            missing = df[col].isna().sum()
            # Guard the percentage so an empty frame cannot divide by zero.
            pct = 100 * missing / n if n else 0.0
            logger.info(f" {col}: {missing} ({pct:.1f}%)")

    # Class counts, reported only when every column of the group is present.
    for label, cols in (
        ("PDI", TARGET_CLASSIFICATION_PDI),
        ("EE", TARGET_CLASSIFICATION_EE),
    ):
        if all(c in df.columns for c in cols):
            logger.info(f"\n{label} distribution:")
            for col, count in df[cols].sum().items():
                logger.info(f" {col}: {int(count)}")


@app.command()
def main(
    input_path: Path = INTERIM_DATA_DIR / "internal_corrected.csv",
    output_dir: Path = PROCESSED_DATA_DIR / "final",
    train_ratio: float = 0.9,
    seed: int = 42,
):
    """
    Process the interim data and split it into train/validation sets (no test set).

    Intended for the final training run, which uses all available data.

    Output files (inside ``output_dir``):
    - train.parquet: training set (``train_ratio`` of the shuffled rows, 90% by default)
    - val.parquet: validation set (the remaining rows)
    - feature_columns.txt: feature/target column name configuration

    Args:
        input_path: CSV file to load.
        output_dir: Directory that receives the three output files; created if missing.
        train_ratio: Fraction of rows assigned to the training set; must be
            strictly between 0 and 1.
        seed: Random seed used when shuffling the rows before the split.

    Raises:
        typer.BadParameter: If ``train_ratio`` is not strictly between 0 and 1.
        typer.Exit: With exit code 1 if no samples remain after processing.
    """
    # Reject ratios that would silently yield an empty or meaningless split
    # (NaN fails this comparison too).
    if not 0.0 < train_ratio < 1.0:
        raise typer.BadParameter(
            f"train_ratio must be strictly between 0 and 1, got {train_ratio}"
        )

    logger.info(f"Loading data from {input_path}")
    df = pd.read_csv(input_path)
    logger.info(f"Loaded {len(df)} samples")

    # Process the raw dataframe.
    logger.info("Processing dataframe...")
    df = process_dataframe(df)

    # Columns to keep: SMILES, the four feature groups, then all targets.
    phys_cols = get_phys_cols()
    exp_cols = get_exp_cols()
    requested_cols = (
        [SMILES_COL]
        + COMP_COLS
        + phys_cols
        + HELP_COLS
        + exp_cols
        + TARGET_REGRESSION
        + TARGET_CLASSIFICATION_PDI
        + TARGET_CLASSIFICATION_EE
        + [TARGET_TOXIC]
        + TARGET_BIODIST
    )

    # Keep only the columns that actually exist, but say which ones were
    # dropped: feature_columns.txt below still lists every configured column.
    absent_cols = [c for c in requested_cols if c not in df.columns]
    if absent_cols:
        logger.warning(
            f"{len(absent_cols)} configured columns are absent from the data "
            f"and will not be saved: {absent_cols}"
        )
    keep_cols = [c for c in requested_cols if c in df.columns]
    df = df[keep_cols]

    n = len(df)
    if n == 0:
        # Nothing to split; fail before writing empty parquet files.
        logger.error(f"No samples left after processing {input_path}")
        raise typer.Exit(code=1)

    # Shuffle, then split by position. ``:g`` keeps e.g. 0.85 -> "0.85:0.15"
    # (the former ``.1f`` rounded the validation share to one decimal).
    logger.info(
        f"Splitting dataset (train:val = {train_ratio:g}:{1 - train_ratio:g})..."
    )
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    n_train = int(n * train_ratio)
    train_df = df.iloc[:n_train]
    val_df = df.iloc[n_train:]

    logger.info(f"Train: {len(train_df)}, Val: {len(val_df)}")
    if len(train_df) == 0 or len(val_df) == 0:
        logger.warning(
            f"Dataset of {n} samples is too small for train_ratio={train_ratio:g}: "
            "one of the splits is empty"
        )

    # Save the splits.
    output_dir.mkdir(parents=True, exist_ok=True)

    train_path = output_dir / "train.parquet"
    val_path = output_dir / "val.parquet"

    train_df.to_parquet(train_path, index=False)
    val_df.to_parquet(val_path, index=False)

    logger.success(f"Saved train to {train_path}")
    logger.success(f"Saved val to {val_path}")

    # Save the column-name configuration.
    config_path = output_dir / "feature_columns.txt"
    _write_feature_config(config_path, phys_cols, exp_cols)
    logger.success(f"Saved feature config to {config_path}")

    # Print summary statistics.
    _log_dataset_statistics(df)


if __name__ == "__main__":
    app()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user