diff --git a/.hooks/post-checkout b/.hooks/post-checkout new file mode 100755 index 0000000..5abf8ed --- /dev/null +++ b/.hooks/post-checkout @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-checkout "$@" diff --git a/.hooks/post-commit b/.hooks/post-commit new file mode 100755 index 0000000..b8b76c2 --- /dev/null +++ b/.hooks/post-commit @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-commit "$@" diff --git a/.hooks/post-merge b/.hooks/post-merge new file mode 100755 index 0000000..726f909 --- /dev/null +++ b/.hooks/post-merge @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-merge "$@" diff --git a/.hooks/pre-push b/.hooks/pre-push new file mode 100755 index 0000000..5f26dc4 --- /dev/null +++ b/.hooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs pre-push "$@" diff --git a/Makefile b/Makefile index 1608bbb..96341d0 100644 --- a/Makefile +++ b/Makefile @@ -58,10 +58,10 @@ create_environment: ################################################################################# -## Clean raw data (raw -> interim) -.PHONY: clean_data -clean_data: requirements - $(PYTHON_INTERPRETER) scripts/data_cleaning.py +## Preprocess internal data (raw -> interim) +.PHONY: preprocess +preprocess: requirements + $(PYTHON_INTERPRETER) scripts/preprocess_internal.py ## Process dataset (interim -> processed) .PHONY: data diff --git a/data/interim/internal_corrected.csv b/data/interim/internal_corrected.csv index eb067fc..19987bf 100644 --- a/data/interim/internal_corrected.csv +++ b/data/interim/internal_corrected.csv @@ -1,435 +1,439 @@ smiles,Helper_lipid_ID,Helper_lipid_ID_DOPE,Helper_lipid_ID_DOTAP,Helper_lipid_ID_DSPC,Helper_lipid_ID_MDOA,Cationic_Lipid_to_mRNA_weight_ratio,Cationic_Lipid_Mol_Ratio,Phospholipid_Mol_Ratio,Cholesterol_Mol_Ratio,PEG_Lipid_Mol_Ratio,Purity,Mix_type,Cargo_type,Cargo_type_mRNA,Cargo_type_pDNA,Cargo_type_siRNA,Target_or_delivered_gene,Model_type,Model_type_A549,Model_type_BDMC,Model_type_BMDM,Model_type_HBEC_ALI,Model_type_HEK293T,Model_type_HeLa,Model_type_IGROV1,Model_type_Mouse,Model_type_RAW264p7,Delivery_target,Delivery_target_body,Delivery_target_dendritic_cell,Delivery_target_generic_cell,Delivery_target_liver,Delivery_target_lung,Delivery_target_lung_epithelium,Delivery_target_macrophage,Delivery_target_muscle,Delivery_target_spleen,Route_of_administration,Route_of_administration_in_vitro,Route_of_administration_intramuscular,Route_of_administration_intratracheal,Route_of_administration_intravenous,Batch_or_individual_or_barcoded,Batch_or_individual_or_barcoded_Barcoded,Batch_or_individual_or_barcoded_Individual,Value_name,Formulation_ID,Cationic_Lipid_Mass_Ratio,Phospholipid_Mass_Ratio,Cholesterol_Mass_Ratio,PEG_Lipid_Mass_Ratio,Amine,Tail,Lipid_name,Library_ID,Experiment_ID,Experimenter_ID,Experiment_weight,Goal,screen_id,Source,Sample_weight,Comment,Amine_number,Full_SMILES,Amine_SMILES,Tail_length,Num_tails,Ketone,Isocyanide,Is_cyclic,Formulation,Batch_or_individual,Random_id,4CR_Lipid_name,Aldehyde,Carboxylic_acid,Amine_name,Tail_name,Linker,MolWt,Tail_count,Num tails,Common_name,Ester,Disulfide,Total_radiance,Form_comment,molecular_weight,split_name_for_normalization,Encapsulation_Efficiency_EE<50,Encapsulation_Efficiency_50<=EE<80,Encapsulation_Efficiency_80