├── make_figures
    ├── figures
    │   └── png
    │   │   ├── figure 2 basics-1.png
    │   │   ├── figure s1 data_qc.png
    │   │   ├── figure s8 dcdfg-1.png
    │   │   ├── figure 1 schematic-1.png
    │   │   ├── figure s7 simulation-1.png
    │   │   ├── figure 3 all published-1.png
    │   │   ├── figure s2 basics supp-1.png
    │   │   ├── figure s4 preprocessing-1.png
    │   │   ├── figure s3 stratify targets-1.png
    │   │   ├── figure s6 stratified split-1.png
    │   │   ├── figure s9 all published details-1.png
    │   │   └── figure s5 why is the mean a strong baseline-1.png
    ├── simple_simulation.R
    ├── add_lit_review_to_data_collection.R
    ├── export_benchmark_results.sh
    ├── figure_2_demo.py
    ├── global_effects
    │   ├── dixit.txt
    │   ├── freimer.txt
    │   ├── replogle.txt
    │   ├── adamson.txt
    │   └── frangieh_IFNg_v3.txt
    ├── timeseries_differential_expression.py
    ├── psc_tf_due_diligence.py
    ├── figure_s1_timeseries.py
    ├── variance_decomposition.py
    ├── cross_dataset_correlations.py
    ├── figure_s1_effects.py
    ├── plotting_script_unused.R
    ├── .load_perturbation(dataset)
    └── replicate_correlations_and_stereotypical_effects.py
├── package.json
├── experiments
    ├── 1.4.1_1
    │   └── metadata.json
    ├── 1.4.1_2
    │   └── metadata.json
    ├── 1.0_10
    │   └── metadata.json
    ├── 1.0_8
    │   └── metadata.json
    ├── 1.0_13
    │   └── metadata.json
    ├── 1.0_14
    │   └── metadata.json
    ├── 1.0_2
    │   └── metadata.json
    ├── 1.0_9
    │   └── metadata.json
    ├── 1.4.2_13
    │   └── metadata.json
    ├── 1.0_12
    │   └── metadata.json
    ├── 1.0_15
    │   └── metadata.json
    ├── 1.4.2_12
    │   └── metadata.json
    ├── 1.4.2_2
    │   └── metadata.json
    ├── 1.4.2_3
    │   └── metadata.json
    ├── 1.4.2_4
    │   └── metadata.json
    ├── 1.6.3_4
    │   └── metadata.json
    ├── 1.6.3_7
    │   └── metadata.json
    ├── 1.0_4
    │   └── metadata.json
    ├── 1.0_5
    │   └── metadata.json
    ├── 1.4.2_14
    │   └── metadata.json
    ├── 1.6.3_2
    │   └── metadata.json
    ├── 1.0_11
    │   └── metadata.json
    ├── 1.0_3
    │   └── metadata.json
    ├── 1.4.3_12
    │   └── metadata.json
    ├── 1.6.3_3
    │   └── metadata.json
    ├── 1.0_6
    │   └── metadata.json
    ├── 1.4.3_7
    │   └── metadata.json
    ├── 1.4.3_8
    │   └── metadata.json
    ├── 1.6.3_5
    │   └── metadata.json
    ├── 1.4.3_10
    │   └── metadata.json
    ├── 1.4.3_2
    │   └── metadata.json
    ├── 1.4.3_9
    │   └── metadata.json
    ├── 1.6.3_6
    │   └── metadata.json
    ├── 1.8.3_2
    │   └── metadata.json
    ├── 1.8.3_7
    │   └── metadata.json
    ├── 1.8.3_8
    │   └── metadata.json
    ├── 1.8.3_9
    │   └── metadata.json
    ├── 1.4.3_13
    │   └── metadata.json
    ├── 1.4.3_14
    │   └── metadata.json
    ├── 1.4.3_4
    │   └── metadata.json
    ├── 1.8.2_7
    │   └── metadata.json
    ├── 1.8.3_3
    │   └── metadata.json
    ├── 1.4.3_15
    │   └── metadata.json
    ├── 1.4.3_3
    │   └── metadata.json
    ├── 1.8.3_5
    │   └── metadata.json
    ├── 1.4.3_5
    │   └── metadata.json
    ├── 1.8.3_6
    │   └── metadata.json
    ├── 1.8.2_2
    │   └── metadata.json
    ├── 1.8.2_4
    │   └── metadata.json
    ├── 1.8.2_5
    │   └── metadata.json
    ├── 1.4.3_6
    │   └── metadata.json
    ├── 1.4.3_11
    │   └── metadata.json
    ├── 1.8.2_3
    │   └── metadata.json
    ├── 1.0_0
    │   └── metadata.json
    ├── 1.4.2_8
    │   └── metadata.json
    ├── 1.6.1_4
    │   └── metadata.json
    ├── 1.8.4_1
    │   └── metadata.json
    ├── 1.4.2_10
    │   └── metadata.json
    ├── 1.4.2_11
    │   └── metadata.json
    ├── 1.4.2_6
    │   └── metadata.json
    ├── 1.4.2_5
    │   └── metadata.json
    ├── 1.6.1_19
    │   └── metadata.json
    ├── 1.6.3_8
    │   └── metadata.json
    ├── 1.6.1_3
    │   └── metadata.json
    ├── 1.6.1_2
    │   └── metadata.json
    ├── 1.6.3_1
    │   └── metadata.json
    ├── 1.3.3_7
    │   └── metadata.json
    ├── 1.3.3_8
    │   └── metadata.json
    ├── 1.3.3_9
    │   └── metadata.json
    ├── 1.3.3_10
    │   └── metadata.json
    ├── 1.6.1_6
    │   └── metadata.json
    ├── 1.8.4_7
    │   └── metadata.json
    ├── 1.3.3_3
    │   └── metadata.json
    ├── 1.3.3_5
    │   └── metadata.json
    ├── 1.8.4_2
    │   └── metadata.json
    ├── 1.8.4_8
    │   └── metadata.json
    ├── 1.8.4_9
    │   └── metadata.json
    ├── 1.3.3_6
    │   └── metadata.json
    ├── 1.8.4_3
    │   └── metadata.json
    ├── 1.8.4_5
    │   └── metadata.json
    ├── 1.8.4_6
    │   └── metadata.json
    ├── 1.9_3
    │   └── metadata.json
    ├── 1.9_0
    │   └── metadata.json
    ├── 1.9_4
    │   └── metadata.json
    ├── 1.6.1_16
    │   └── metadata.json
    ├── 1.6.1_7
    │   └── metadata.json
    ├── 1.9_2
    │   └── metadata.json
    ├── 1.2.2_8
    │   └── metadata.json
    ├── 1.6.1_10
    │   └── metadata.json
    ├── 1.6.1_11
    │   └── metadata.json
    ├── 1.6.1_13
    │   └── metadata.json
    ├── 1.6.1_14
    │   └── metadata.json
    ├── 1.6.1_15
    │   └── metadata.json
    ├── 1.6.1_9
    │   └── metadata.json
    ├── 1.2.2_12
    │   └── metadata.json
    ├── 1.2.2_7
    │   └── metadata.json
    ├── 1.6.1_17
    │   └── metadata.json
    ├── 1.6.1_8
    │   └── metadata.json
    ├── 1.2.2_10
    │   └── metadata.json
    ├── 1.2.2_13
    │   └── metadata.json
    ├── 1.2.2_2
    │   └── metadata.json
    ├── 1.2.2_9
    │   └── metadata.json
    ├── 1.2.2_3
    │   └── metadata.json
    ├── 1.2.2_5
    │   └── metadata.json
    ├── 1.6.1_12
    │   └── metadata.json
    ├── 1.2.2_11
    │   └── metadata.json
    ├── 1.3.1_2
    │   └── metadata.json
    ├── 1.2.2_6
    │   └── metadata.json
    ├── 1.4.4_7
    │   └── metadata.json
    ├── 1.4.4_8
    │   └── metadata.json
    ├── 1.4.4_3
    │   └── metadata.json
    ├── 1.4.4_4
    │   └── metadata.json
    ├── 1.4.4_5
    │   └── metadata.json
    ├── 1.4.4_6
    │   └── metadata.json
    ├── 1.3.2_5
    │   └── metadata.json
    ├── 1.3.2_4
    │   └── metadata.json
    ├── 1.5.1_4
    │   └── metadata.json
    ├── 1.3.2_3
    │   └── metadata.json
    ├── 1.5.1_5
    │   └── metadata.json
    ├── 1.5.1_3
    │   └── metadata.json
    ├── 1.5.1_6
    │   └── metadata.json
    ├── 1.2.2_16
    │   └── metadata.json
    ├── 1.3.2_2
    │   └── metadata.json
    ├── 1.0_7
    │   └── metadata.json
    ├── 1.2.2_17
    │   └── metadata.json
    ├── 1.5.1_1
    │   └── metadata.json
    ├── 1.5.1_2
    │   └── metadata.json
    ├── 1.8.5_1
    │   └── metadata.json
    ├── 1.2.2_19
    │   └── metadata.json
    ├── 1.1.1_1
    │   └── metadata.json
    ├── 1.2.2_20
    │   └── metadata.json
    ├── 1.2.2_18
    │   └── metadata.json
    ├── 1.2.2_21
    │   └── metadata.json
    ├── ggrn_docker_backend_celloracle
    │   └── metadata.json
    ├── 1.8.5_2
    │   └── metadata.json
    ├── 1.8.5_7
    │   └── metadata.json
    ├── 1.8.5_8
    │   └── metadata.json
    ├── 1.8.5_9
    │   └── metadata.json
    ├── 1.8.5_3
    │   └── metadata.json
    ├── test
    │   └── metadata.json
    ├── 1.8.5_5
    │   └── metadata.json
    ├── 1.8.5_6
    │   └── metadata.json
    ├── ggrn_docker_backend
    │   └── metadata.json
    ├── 1.4.2_1
    │   └── metadata.json
    ├── 1.3.3_2
    │   └── metadata.json
    ├── 1.3.2_1
    │   └── metadata.json
    ├── 1.2.2_1
    │   └── metadata.json
    ├── 1.2.2_15
    │   └── metadata.json
    ├── 1.2.2_22
    │   └── metadata.json
    ├── 1.4.1_3
    │   └── metadata.json
    ├── 1.4.1_4
    │   └── metadata.json
    ├── 1.1.2_1
    │   └── metadata.json
    ├── 1.6.3_0
    │   └── metadata.json
    ├── 1.3.1_1
    │   └── metadata.json
    ├── 1.5.2_0
    │   └── metadata.json
    ├── 1.8.3_1
    │   └── metadata.json
    ├── 1.3.3_1
    │   └── metadata.json
    ├── 1.2.2_14
    │   └── metadata.json
    ├── 1.8.2_1
    │   └── metadata.json
    ├── 1.6.1_18
    │   └── metadata.json
    ├── 1.3.2_10
    │   └── metadata.json
    ├── 1.3.2_6
    │   └── metadata.json
    ├── 1.3.2_9
    │   └── metadata.json
    ├── 1.3.2_7
    │   └── metadata.json
    ├── 1.3.2_8
    │   └── metadata.json
    ├── 1.0_1
    │   └── metadata.json
    ├── singularity_demo
    │   └── metadata.json
    ├── 1.4.5_1
    │   └── metadata.json
    ├── 1.4.4_1
    │   └── metadata.json
    ├── 1.9_1
    │   └── metadata.json
    ├── 1.5.1_0
    │   └── metadata.json
    ├── 1.6.1_1
    │   └── metadata.json
    ├── 1.4.3_1
    │   └── metadata.json
    ├── 1.4.4_2
    │   └── metadata.json
    ├── 5_0
    │   └── metadata.json
    └── 1.4.1_0
    │   └── metadata.json
├── environment
    ├── preprint version 1 commit hashes used.txt
    ├── conda_inputs.yaml
    ├── install_minimal.sh
    ├── install.sh
    └── install.md
├── .gitignore
├── run_experiments.sh
├── license.md
├── gather_experiment_metadata.py
├── README.md
└── guiding_questions.txt


/make_figures/figures/png/figure 2 basics-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure 2 basics-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s1 data_qc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s1 data_qc.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s8 dcdfg-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s8 dcdfg-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure 1 schematic-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure 1 schematic-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s7 simulation-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s7 simulation-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure 3 all published-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure 3 all published-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s2 basics supp-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s2 basics supp-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s4 preprocessing-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s4 preprocessing-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s3 stratify targets-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s3 stratify targets-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s6 stratified split-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s6 stratified split-1.png


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s9 all published details-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s9 all published details-1.png


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "dependencies": {
 3 |     "bin": "^0.0.0",
 4 |     "canvas": "^2.11.0",
 5 |     "vega": "^5.22.1",
 6 |     "vega-cli": "^5.22.1",
 7 |     "vega-lite": "^5.6.0"
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.1_1/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.4.1_1",
3 |     "is_active": true,
4 |     "refers_to": "1.4.1_0",
5 |     "perturbation_dataset": "fantom4", 
6 |     "visualization_embedding": "X_pca"
7 | }


--------------------------------------------------------------------------------
/make_figures/figures/png/figure s5 why is the mean a strong baseline-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekernf01/perturbation_benchmarking/HEAD/make_figures/figures/png/figure s5 why is the mean a strong baseline-1.png


--------------------------------------------------------------------------------
/experiments/1.4.1_2/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.4.1_2",
3 |     "is_active": true,
4 |     "refers_to": "1.4.1_0",
5 |     "perturbation_dataset": "BETS_A549", 
6 |     "visualization_embedding": "X_pca"
7 | }


--------------------------------------------------------------------------------
/experiments/1.0_10/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_10",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but replogle3 instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle3"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_8",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but replogle instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/environment/preprint version 1 commit hashes used.txt:
--------------------------------------------------------------------------------
1 | geneformer_embeddings@2335437
2 | ggrn@47545673b
3 | ggrn_backend2@40b755b78d
4 | load_networks@768809c
5 | load_perturbations@50732922a
6 | perturbation_benchmarking@f2be496e652
7 | perturbation_benchmarking_package@b8219aceae
8 | 
9 | 


--------------------------------------------------------------------------------
/experiments/1.0_13/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_13",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but different data instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "joung"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_14/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_14",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but different data instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "dixit"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_2",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but replogle2 instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_9",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but replogle4 instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle4"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_13/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_13",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on other datasets.",
 5 |     "refers_to": "1.4.2_1",
 6 |     "perturbation_dataset": "frangieh_IFNg_v2",
 7 |     "num_genes":1000
 8 | }
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.0_12/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_12",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but different data instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "norman"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_15/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_15",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but different data instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "adamson"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_12/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_12",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on other datasets.",
 5 |     "refers_to": "1.4.2_1",    
 6 |     "perturbation_dataset": "frangieh_IFNg_v1",
 7 |     "num_genes":1000
 8 | }
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_2",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on their preferred demo datasets.",
 5 |     "refers_to": "1.4.2_1",
 6 |     "num_genes":10000,
 7 |     "perturbation_dataset": "adamson"
 8 | }
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_3",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on their preferred demo datasets.",
 5 |     "refers_to": "1.4.2_1",
 6 |     "perturbation_dataset": "dixit",
 7 |     "num_genes":10000
 8 | }
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_4",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on their preferred demo datasets.",
 5 |     "refers_to": "1.4.2_1",
 6 |     "num_genes":10000,
 7 |     "perturbation_dataset": "norman"
 8 | }
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_4/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.6.3_4",
3 |     "nickname": "timeseries pilot",
4 |     "question": "1.6.3",
5 |     "refers_to": "1.6.3_0",
6 |     "eligible_regulators": "tf",
7 |     "species": "zebrafish",
8 |     "perturbation_dataset": "saunders_axial_mesoderm"
9 | }


--------------------------------------------------------------------------------
/experiments/1.6.3_7/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.6.3_7",
3 |     "nickname": "timeseries pilot",
4 |     "question": "1.6.3",
5 |     "refers_to": "1.6.3_0",
6 |     "eligible_regulators": "tf",
7 |     "species": "zebrafish",
8 |     "perturbation_dataset": "saunders_pigment_cells"
9 | }


--------------------------------------------------------------------------------
/experiments/1.0_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_4",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 (various sklearn methods) but using only the GFP controls.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": false,
 7 |     "control_subtype": "Emerald"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_5",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but replogle2_tf_only instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2_tf_only"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_14/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_14",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on other datasets.",
 5 |     "refers_to": "1.4.2_1",    
 6 |     "perturbation_dataset": "nakatake_simulated_scrna",
 7 |     "num_genes":1000
 8 | }
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_2",
 3 |     "nickname": "timeseries pilot",
 4 |     "question": "1.6.3",
 5 |     "refers_to": "1.6.3_0",
 6 |     "eligible_regulators": "tf",
 7 |     "species": "human",
 8 |     "perturbation_dataset": "fantom4"
 9 | }
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/experiments/1.0_11/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_11",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but different data instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "nakatake_simulated_scrna"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_3",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but frangieh pseudobulk data instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "frangieh_IFNg_v3"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_12/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_12",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on joung",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "joung"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_3",
 3 |     "nickname": "timeseries pilot",
 4 |     "question": "1.6.3",
 5 |     "refers_to": "1.6.3_0",
 6 |     "eligible_regulators": "tf",
 7 |     "species": "human",
 8 |     "perturbation_dataset": "BETS_A549"
 9 | }
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/experiments/1.0_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_6",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but replogle2_large_effect instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2_large_effect"
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_7",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on freimer",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "freimer"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_8",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on replogle",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "replogle"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_5",
 3 |     "nickname": "timeseries pilot",
 4 |     "question": "1.6.3",
 5 |     "refers_to": "1.6.3_0",
 6 |     "eligible_regulators": "tf",
 7 |     "species": "zebrafish",
 8 |     "perturbation_dataset": "saunders_blood"
 9 | }
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_10/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_10",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on replogle3",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "replogle3"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_2",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on replogle2",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "replogle2"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_9",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on replogle4",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "replogle4"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_6",
 3 |     "nickname": "timeseries pilot",
 4 |     "question": "1.6.3",
 5 |     "refers_to": "1.6.3_0",
 6 |     "eligible_regulators": "tf",
 7 |     "species": "zebrafish",
 8 |     "perturbation_dataset": "saunders_endoderm"
 9 | }
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/experiments/1.8.3_2/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_2",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "replogle2"
9 | }


--------------------------------------------------------------------------------
/experiments/1.8.3_7/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_7",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "freimer" 
9 | }


--------------------------------------------------------------------------------
/experiments/1.8.3_8/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_8",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "replogle"
9 | }


--------------------------------------------------------------------------------
/experiments/1.8.3_9/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_9",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "replogle4"
9 | }


--------------------------------------------------------------------------------
/experiments/1.4.3_13/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_13",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on a different dataset",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "norman"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_14/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_14",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on a different dataset",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "dixit"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_4",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but only using GFP controls.",
 5 |     "question": "1.4.3",
 6 |     "is_active": false,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "control_subtype": "Emerald"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.8.2_7/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.2_7",
3 |     "nickname": "how_much_averaging",
4 |     "readme": "Replicate-averaging experiment but with a different dataset.",
5 |     "question": "1.8.2",
6 |     "is_active": true,
7 |     "refers_to": "1.8.2_1", 
8 |     "perturbation_dataset": "freimer" 
9 | }


--------------------------------------------------------------------------------
/experiments/1.8.3_3/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_3",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "frangieh_IFNg_v3" 
9 | }


--------------------------------------------------------------------------------
/experiments/1.4.3_15/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_15",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on a different dataset",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "adamson"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_3",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on frangieh_IFNg_v3",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "frangieh_IFNg_v3"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.8.3_5/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_5",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "replogle2_tf_only" 
9 | }


--------------------------------------------------------------------------------
/experiments/1.4.3_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_5",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on replogle2_tf_only",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "replogle2_tf_only"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.8.3_6/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.8.3_6",
3 |     "nickname": "gene_selection",
4 |     "readme": "Gene-selection experiment but with a different dataset.",
5 |     "question": "1.8.3",
6 |     "is_active": true,
7 |     "refers_to": "1.8.3_1", 
8 |     "perturbation_dataset": "replogle2_large_effect" 
9 | }


--------------------------------------------------------------------------------
/experiments/1.8.2_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.2_2",
 3 |     "nickname": "how_much_averaging",
 4 |     "readme": "Replicate-averaging experiment but with a different dataset.",
 5 |     "question": "1.8.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.8.2_1", 
 8 |     "perturbation_dataset": "adamson" 
 9 | }
10 | 


--------------------------------------------------------------------------------
/experiments/1.8.2_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.2_4",
 3 |     "nickname": "how_much_averaging",
 4 |     "readme": "Replicate-averaging experiment but with a different dataset.",
 5 |     "question": "1.8.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.8.2_1", 
 8 |     "perturbation_dataset": "dixit" 
 9 | }
10 | 


--------------------------------------------------------------------------------
/experiments/1.8.2_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.2_5",
 3 |     "nickname": "how_much_averaging",
 4 |     "readme": "Replicate-averaging experiment but with a different dataset.",
 5 |     "question": "1.8.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.8.2_1", 
 8 |     "perturbation_dataset": "norman" 
 9 | }
10 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_6",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on replogle2_large_effect",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "replogle2_large_effect"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_11/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_11",
 3 |     "nickname": "base_network",
 4 |     "readme": "Network experiment but on nakatake_simulated_scrna",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "perturbation_dataset": "nakatake_simulated_scrna"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.8.2_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.2_3",
 3 |     "nickname": "how_much_averaging",
 4 |     "readme": "Replicate-averaging experiment but with a different dataset.",
 5 |     "question": "1.8.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.8.2_1", 
 8 |     "perturbation_dataset": "frangieh_IFNg_v2" 
 9 | }
10 | 


--------------------------------------------------------------------------------
/experiments/1.0_0/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_0",
 3 |     "nickname": "ml methods tiny",
 4 |     "readme": "Like 1.0_1 but faster / smaller, mostly for realistic testing.",
 5 |     "refers_to": "1.0_1",
 6 |     "num_genes": 1000,
 7 |     "regression_method":[
 8 |         "mean", 
 9 |         "RidgeCV" 
10 |     ]
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_8",
 3 |     "is_active": false,
 4 |     "nickname": "GEARS",
 5 |     "readme": "Test of GEARS on other datasets.",
 6 |     "refers_to": "1.4.2_1",    
 7 |     "desired_heldout_fraction": 0.2, 
 8 |     "perturbation_dataset": "norman",
 9 |     "num_genes":10000
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_4",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "joung", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.8.4_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_1",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "merge_replicates": false
10 | }


--------------------------------------------------------------------------------
/experiments/1.4.2_10/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_10",
 3 |     "is_active": false,
 4 |     "nickname": "GEARS",
 5 |     "readme": "Test of GEARS on other datasets.",
 6 |     "refers_to": "1.4.2_1",    
 7 |     "desired_heldout_fraction": 0.2, 
 8 |     "perturbation_dataset": "nakatake",
 9 |     "num_genes":6000
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_11/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_11",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Test of GEARS on other datasets.",
 5 |     "refers_to": "1.4.2_1",    
 6 |     "is_active" : true,
 7 |     "type_of_split": "genetic_interaction",
 8 |     "perturbation_dataset": "norman",
 9 |     "num_genes":10000
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_6",
 3 |     "is_active": false,
 4 |     "nickname": "GEARS",
 5 |     "readme": "Test of GEARS on their preferred demo datasets.",
 6 |     "refers_to": "1.4.2_1",
 7 |     "desired_heldout_fraction": 0.2, 
 8 |     "perturbation_dataset": "dixit",
 9 |     "num_genes":10000
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_5",
 3 |     "is_active": false,
 4 |     "nickname": "GEARS",
 5 |     "readme": "Test of GEARS on their preferred demo datasets.",
 6 |     "refers_to": "1.4.2_1",
 7 |     "desired_heldout_fraction": 0.2, 
 8 |     "perturbation_dataset": "adamson",
 9 |     "num_genes":10000
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_19/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_19",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Exact repeat of 1.6.1_15, to check if the code is stochastic.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": false,
 7 |     "perturbation_dataset": "dixit",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_8",
 3 |     "nickname": "timeseries pilot",
 4 |     "question": "1.6.3",
 5 |     "refers_to": "1.6.3_0",
 6 |     "eligible_regulators": "tf",
 7 |     "species": "mouse",
 8 |     "matching_method": "user",
 9 |     "matching_method_for_evaluation": "user",
10 |     "perturbation_dataset": "paul2"
11 | }


--------------------------------------------------------------------------------
/experiments/1.6.1_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_3",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but uses data preprocessed differently.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "frangieh_IFNg_v3", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_2",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but uses data preprocessed differently.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "frangieh_IFNg_v2", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_1",
 3 |     "nickname": "timeseries pilot",
 4 |     "question": "1.6.3",
 5 |     "refers_to": "1.6.3_0",
 6 |     "eligible_regulators": "tf",
 7 |     "species": "mouse",
 8 |     "matching_method": "user",
 9 |     "matching_method_for_evaluation": "user",
10 |     "perturbation_dataset": "paul1"
11 | }
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/experiments/1.3.3_7/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_7",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "freimer"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.3.3_8/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_8",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "replogle"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.3.3_9/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_9",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "replogle4"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.3.3_10/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_10",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "replogle3"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.6.1_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_6",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but with nakatake. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "nakatake", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.8.4_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_7",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "freimer", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.3.3_3/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_3",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "frangieh_IFNg_v3"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.3.3_5/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_5",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "replogle2_tf_only"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.8.4_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_2",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "replogle2", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.8.4_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_8",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "replogle", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.8.4_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_9",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "replogle4", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.3.3_6/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.3.3_6",
3 |     "nickname": "TransferLearning",
4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
5 |     "question": "1.3.3",
6 |     "is_active": true,
7 |     "refers_to": "1.3.3_1",
8 |     "perturbation_dataset": "replogle2_large_effect"
9 | }    


--------------------------------------------------------------------------------
/experiments/1.8.4_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_3",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "frangieh_IFNg_v3", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.8.4_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_5",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "replogle2_tf_only", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.8.4_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.4_6",
 3 |     "nickname": "stratified_split",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "type_of_split": "stratified", 
 9 |     "perturbation_dataset": "replogle2_large_effect", 
10 |     "merge_replicates": false
11 | }


--------------------------------------------------------------------------------
/experiments/1.9_3/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.9_3",
3 |     "nickname": "base_network_simulation",
4 |     "refers_to": "1.9_1",
5 |     "readme": "Testing different network structures like in experiment 1.4.3_1, but with simulated data based on a known network.",
6 |     "question": "1.9",
7 |     "is_active": true,
8 |     "perturbation_dataset": "simulation_TrueNetwork=gtex_rna_S=1_NoiseSD=0"
9 | }


--------------------------------------------------------------------------------
/experiments/1.9_0/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.9_0",
3 |     "nickname": "base_network_simulation",
4 |     "refers_to": "1.9_1",
5 |     "readme": "Testing different network structures like in experiment 1.4.3_1, but with simulated data based on known networks.",
6 |     "question": "1.9",
7 |     "is_active": true,
8 |     "perturbation_dataset":  "simulation_TrueNetwork=MARA_FANTOM4_S=1_NoiseSD=0"
9 | }


--------------------------------------------------------------------------------
/experiments/1.9_4/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.9_4",
3 |     "nickname": "base_network_simulation",
4 |     "refers_to": "1.9_1",
5 |     "readme": "Testing different network structures like in experiment 1.4.3_1, but with simulated data based on a known network.",
6 |     "question": "1.9",
7 |     "is_active": true,
8 |     "perturbation_dataset": "simulation_TrueNetwork=celloracle_human_S=1_NoiseSD=0"
9 | }


--------------------------------------------------------------------------------
/make_figures/simple_simulation.R:
--------------------------------------------------------------------------------
 1 | set.seed(0)  # Toy simulation: score a "mean of training rows" baseline on pure-noise data; seed fixed for repeatability.
 2 | X = matrix(rnorm(10000), ncol = 100)  # 100 x 100 matrix of independent standard-normal draws (no structure at all)
 3 | control_indices = 1  # row 1 plays the role of the control that is subtracted below; note it also lies inside train_indices
 4 | train_indices = 1:50  # rows used to build the baseline
 5 | test_indices = 51:100  # rows the baseline is scored against
 6 | baseline_predictor = colMeans(X[train_indices, ])  # per-column mean over the training rows
 7 | correlations = c()  # filled by the loop below, one entry per test row
 8 | for(i in test_indices){
 9 |   correlations[i-50] = cor(baseline_predictor - X[control_indices, ], X[i, ] - X[control_indices, ])  # i-50 maps rows 51..100 to slots 1..50; both arguments subtract the same control row
10 | }
11 | mean(correlations)  # average correlation over the 50 test rows. NOTE(review): X is pure noise, so a positive average presumably stems from the shared control subtraction -- confirm
12 | # 0.649  (presumably the value printed by the line above when the script was last run; not re-verified)


--------------------------------------------------------------------------------
/experiments/1.6.1_16/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_16",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "nakatake_simulated_scrna",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_7",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.9_2/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "unique_id": "1.9_2",
3 |     "nickname": "base_network_simulation",
4 |     "refers_to": "1.9_1",
5 |     "readme": "Testing different network structures like in experiment 1.4.3_1, but with simulated data based on a known network.",
6 |     "question": "1.9",
7 |     "is_active": true,
8 |     "perturbation_dataset": "simulation_TrueNetwork=cellnet_human_Hugene_S=1_NoiseSD=0"
9 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_8",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "replogle"
 9 | 
10 | }


--------------------------------------------------------------------------------
/experiments/1.6.1_10/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_10",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle4", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_11/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_11",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "freimer",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_13/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_13",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "norman",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_14/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_14",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "adamson",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_15/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_15",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "dixit",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_9",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle3", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_12/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_12",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "joung"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_7",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "freimer"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_17/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_17",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_8",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2_tf_only", 
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_10/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_10",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "replogle3"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_13/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_13",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "norman"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_2",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "replogle2"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_9",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "replogle4"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_3",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "frangieh_IFNg_v3"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_5",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "replogle2_tf_only"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_12/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_12",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Like experiment 1.6.1_1, but different data and starting_expression. Note: only 1000 genes selected, as in 1.6.1_1.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2_large_effect",     
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_11/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_11",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "nakatake_simulated_scrna"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.3.1_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.1_2",
 3 |     "nickname": "CellTypeSpecificRegressionCMAP",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.1_1",
 6 |     "is_active": false,
 7 |     "perturbation_dataset": "cmap",
 8 |     "network_datasets": {
 9 |         "celloracle_human":{}
10 |     }
11 | }
12 | 
13 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_6",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_1",
 8 |     "perturbation_dataset": "replogle2_large_effect"
 9 | 
10 | }
11 | 
12 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_7",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.4_1",
 8 |     "perturbation_dataset": "replogle4"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_8",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.4_2",
 8 |     "perturbation_dataset": "replogle4"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_3",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.4_1",
 8 |     "perturbation_dataset": "frangieh_IFNg_v3"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_4",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.4_2",
 8 |     "perturbation_dataset": "frangieh_IFNg_v3"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_5",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.4_1",
 8 |     "perturbation_dataset": "replogle2_large_effect"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_6",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.4_2",
 8 |     "perturbation_dataset": "replogle2_large_effect"
 9 | }
10 | 
11 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_5",
 3 |     "nickname": "cellTypeSpecificCSNets",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "nakatake",
 8 |     "default_level": "hIPS.parquet",
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "csnets":{}
13 |     }
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_4",
 3 |     "nickname": "cellTypeSpecificANANSE",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "nakatake",
 8 |     "default_level": "hIPS.parquet",
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "ANANSE_0.5":{}
13 |     }
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/experiments/1.5.1_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_4",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "refers_to": "1.5.1_0",
 7 |     "perturbation_dataset": "saunders_blood",
 8 |     "visualization_embedding": "X_umap", 
 9 |     "eligible_regulators": "tf",
10 |     "species": "zebrafish",
11 |     "network_datasets": { 
12 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
13 |     }
14 | }
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_3",
 3 |     "nickname": "cellTypeSpecificCellNetHugene",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "nakatake",
 8 |     "default_level": "hIPS.parquet",
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "cellnet_human_Hugene":{}
13 |     }
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/experiments/1.5.1_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_5",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "refers_to": "1.5.1_0",
 7 |     "perturbation_dataset": "saunders_endoderm",
 8 |     "visualization_embedding": "X_umap", 
 9 |     "eligible_regulators": "tf",
10 |     "species": "zebrafish",
11 |     "network_datasets": { 
12 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
13 |     }
14 | }
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/experiments/1.5.1_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_3",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "refers_to": "1.5.1_0",
 7 |     "perturbation_dataset": "saunders_axial_mesoderm",
 8 |     "visualization_embedding": "X_umap", 
 9 |     "eligible_regulators": "tf",
10 |     "species": "zebrafish",
11 |     "network_datasets": { 
12 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
13 |     }
14 | }
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/experiments/1.5.1_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_6",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "refers_to": "1.5.1_0",
 7 |     "perturbation_dataset": "saunders_pigment_cells",
 8 |     "eligible_regulators": "tf",
 9 |     "visualization_embedding": "X_umap", 
10 |     "species": "zebrafish",
11 |     "network_datasets": { 
12 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
13 |     }
14 | }
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/make_figures/add_lit_review_to_data_collection.R:
--------------------------------------------------------------------------------
1 | # Single-use script: moves manually annotated literature-review notes into the perturbation data collection.
2 | # Both paths below are relative, so they are resolved against the current working directory.
3 | 
4 | # Endoderm
5 | # Every call is namespace-qualified and nothing is piped, so the script runs without attaching any package.
6 | lit_review = readxl::read_excel("timeseries_figures/definitive_endoderm_ps_vs_screen_top_30_manually_annotated.xlsx")
7 | lit_review = dplyr::distinct(lit_review[c("perturbation", "Included in literature review?", "Cell type affected", "PMID", "Notes")])
8 | write.csv(lit_review, "../../perturbation_data/perturbations/definitive_endoderm/lit_review.csv")
9 | # TO DO: blood


--------------------------------------------------------------------------------
/experiments/1.2.2_16/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_16",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "fantom4", 
 9 |     "eligible_regulators": "human_tfs",
10 |     "network_datasets": {
11 |         "celloracle_human": { "do_aggregate_subnets": true }
12 |     }
13 | }


--------------------------------------------------------------------------------
/experiments/1.3.2_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_2",
 3 |     "nickname": "cellTypeSpecificCellNetHg1332",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "merge_replicates": true,
 8 |     "perturbation_dataset": "nakatake",
 9 |     "default_level": "hIPS.parquet",
10 |     "network_datasets": {
11 |         "empty": {},
12 |         "dense": {},
13 |         "cellnet_human_Hg1332":{}
14 |     }
15 | }
16 | 
17 | 


--------------------------------------------------------------------------------
/experiments/1.0_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_7",
 3 |     "nickname": "ml methods",
 4 |     "readme": "Like 1.0_1 but freimer instead of nakatake.",
 5 |     "refers_to": "1.0_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "freimer", 
 8 |     "expand": "ladder", 
 9 |     "kwargs": [
10 |         {},
11 |         {},
12 |         {},
13 |         {},
14 |         {},
15 |         {},
16 |         {},
17 |         {},
18 |         {},
19 |         {},
20 |         {},
21 |         {
22 |             "pca_dim": 3
23 |         }
24 |     ]
25 | }
26 | 
27 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_17/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_17",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "BETS_A549", 
 9 |     "eligible_regulators": "human_tfs",
10 |     "network_datasets": {
11 |         "celloracle_human": { "do_aggregate_subnets": true }
12 |     }
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | slurm*
 2 | *.oracle
 3 | *.parquet
 4 | *.csv
 5 | *.pdf
 6 | *.svg
 7 | *.html
 8 | *.Rhistory
 9 | GEARS_gene_set.pkl
10 | ggrn_gears_input/*
11 | geneformer_loom_data
12 | geneformer_tokenized_data
13 | geneformer_finetuned
14 | from_to_docker
15 | logs
16 | wandb
17 | err.txt
18 | out.txt
19 | stdout.txt
20 | start_time.txt
21 | finish_time.txt
22 | *__pycache__*
23 | *.pyc
24 | old_experiments/*
25 | experiments/*/outputs
26 | experiments/*/old_outputs
27 | experiments/*/outputs_old
28 | node_modules/*
29 | lightning_logs/*
30 | TODO.md
31 | node_modules
32 | make_figures/global_effects/*
33 | 


--------------------------------------------------------------------------------
/experiments/1.5.1_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_1",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "refers_to": "1.5.1_0",
 7 |     "perturbation_dataset": "paul1",
 8 |     "visualization_embedding": "X_draw_graph_fa", 
 9 |     "matching_method": "user",
10 |     "matching_method_for_evaluation": "user",
11 |     "eligible_regulators": "tf",
12 |     "species": "mouse",
13 |     "network_datasets": { 
14 |         "celloracle_mouse_atac_atlas": { "do_aggregate_subnets": true }
15 |     }
16 | }
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/experiments/1.5.1_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_2",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "refers_to": "1.5.1_0",
 7 |     "perturbation_dataset": "paul2",
 8 |     "visualization_embedding": "X_draw_graph_fa", 
 9 |     "eligible_regulators": "tf",
10 |     "species": "mouse",
11 |     "matching_method": "user",
12 |     "matching_method_for_evaluation": "user",
13 |     "network_datasets": { 
14 |         "celloracle_mouse_atac_atlas": { "do_aggregate_subnets": true }
15 |     }
16 | }
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/experiments/1.8.5_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_1",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "merge_replicates": false,
10 |     "network_prior": "restrictive",
11 |     "network_datasets": {
12 |         "empty":     { "do_aggregate_subnets": true }, 
13 |         "dense":     { "do_aggregate_subnets": true },    
14 |         "celloracle_human":      { "do_aggregate_subnets": true }        
15 |     }
16 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_19/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_19",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "saunders_blood", 
 9 |     "visualization_embedding": "X_umap", 
10 |     "eligible_regulators": "tf",
11 |     "species": "zebrafish",
12 |     "network_datasets": {
13 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
14 |     }
15 | }


--------------------------------------------------------------------------------
/experiments/1.1.1_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.1.1_1",
 3 |     "nickname": "hyperparam_sweep",
 4 |     "readme": "This is a simple sweep over the regularization parameter for LASSO regression.",
 5 |     "question": "1.1.1",
 6 |     "is_active": true,
 7 |     "factor_varied": "alpha",
 8 |     "regression_method": "LASSO",
 9 |     "kwargs_to_expand": ["alpha"],
10 |     "kwargs":{
11 |         "alpha": [0, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.01, 0.1, 1, 10, 100, 1000]
12 |     },
13 |     "facet_by": null,
14 |     "color_by": null,
15 |     "perturbation_dataset": "nakatake",
16 |     "eligible_regulators": "human_tfs"
17 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_20/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_20",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "saunders_endoderm", 
 9 |     "visualization_embedding": "X_umap", 
10 |     "eligible_regulators": "tf",
11 |     "species": "zebrafish",
12 |     "network_datasets": {
13 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
14 |     }
15 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_18/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_18",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "saunders_axial_mesoderm", 
 9 |     "visualization_embedding": "X_umap", 
10 |     "eligible_regulators": "tf",
11 |     "species": "zebrafish",
12 |     "network_datasets": {
13 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
14 |     }
15 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_21/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_21",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "saunders_pigment_cells", 
 9 |     "visualization_embedding": "X_umap", 
10 |     "eligible_regulators": "tf",
11 |     "species": "zebrafish",
12 |     "network_datasets": {
13 |         "celloracle_zebrafish": { "do_aggregate_subnets": true }
14 |     }
15 | }


--------------------------------------------------------------------------------
/make_figures/export_benchmark_results.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copy each experiment's metadata and evaluation outputs into ../../evaluation_results,
 3 | # then regenerate the table of all experiments and export it alongside them.
 4 | # All paths are relative: run this script from inside make_figures/.
 5 | results_root=../../evaluation_results
 6 | # -p so that a rerun does not fail because the directory already exists.
 7 | mkdir -p "${results_root}"
 8 | for experiment_path in ../experiments/*
 9 | do
10 |     # An unmatched glob stays literal; skip it instead of creating a directory named '*'.
11 |     [ -e "${experiment_path}" ] || continue
12 |     experiment=$(basename "${experiment_path}")
13 |     mkdir -p "${results_root}/experiments/${experiment}/outputs"
14 |     for path_to_copy in metadata.json outputs/conditions.csv outputs/evaluationPerPert.parquet outputs/evaluationPerTarget.parquet outputs/train_resources outputs/train_walltimes
15 |     do
16 |         # Copy into the destination's parent directory rather than onto the full destination path:
17 |         # on a rerun, an existing directory such as outputs/train_resources is then refreshed in place
18 |         # instead of receiving a nested copy of itself.
19 |         destination_parent=$(dirname "${results_root}/experiments/${experiment}/${path_to_copy}")
20 |         cp -r "../experiments/${experiment}/${path_to_copy}" "${destination_parent}/"
21 |     done
22 | done
23 | # Run in a subshell so the directory change does not leak: the relative paths on the last line
24 | # must still resolve from make_figures/, like every other path in this script.
25 | # NOTE(review): assumes gather_experiment_metadata.py writes all_experiments.tsv into its working directory - confirm.
26 | (cd .. && python gather_experiment_metadata.py)
27 | cp ../all_experiments.tsv "${results_root}/all_experiments.tsv"


--------------------------------------------------------------------------------
/experiments/ggrn_docker_backend_celloracle/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "ggrn_docker_backend_celloracle",
 3 |     "nickname": "ggrn_docker_backend_celloracle",
 4 |     "readme": "Test of the ggrn backend that runs a docker container with CellOracle (CO) installed.",
 5 |     "question": "None", 
 6 |     "is_active": true,
 7 |     "factor_varied": "regression_method",  
 8 |     "color_by": null,
 9 |     "facet_by": null,
10 |     "perturbation_dataset": "freimer",
11 |     "pruning_parameter": 2000,
12 |     "regression_method":"docker____ekernf01/ggrn_docker_backend_celloracle", 
13 |     "network_datasets": {
14 |         "celloracle_human":{}
15 |     }
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/experiments/1.8.5_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_2",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "replogle2", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.8.5_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_7",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "freimer", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.8.5_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_8",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "replogle", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.8.5_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_9",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "replogle4", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.8.5_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_3",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "frangieh_IFNg_v3", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/test/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "test",
 3 |     "nickname": "test",
 4 |     "readme": "This experiment is a sandbox meant to test new features of the benchmarking code.",
 5 |     "question": "1.1",
 6 |     "is_active": true,
 7 |     "factor_varied": "regression_method",
 8 |     "default_level": "mean",
 9 |     "color_by": null,
10 |     "facet_by": null,
11 |     "merge_replicates": true,
12 |     "network_prior": "restrictive",
13 |     "num_genes": 500,
14 |     "pruning_strategy": "none",
15 |     "regression_method": ["mean", "RidgeCV"], 
16 |     "perturbation_dataset": "nakatake",
17 |     "network_datasets": {
18 |         "dense": {}
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/experiments/1.8.5_5/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_5",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "replogle2_tf_only", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.8.5_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.5_6",
 3 |     "nickname": "split_seed_sensitivity",
 4 |     "readme": "Network experiment but with a different data split.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.4.3_1", 
 8 |     "data_split_seed": [1,2,3], 
 9 |     "perturbation_dataset": "replogle2_large_effect", 
10 |     "merge_replicates": false,
11 |     "network_prior": "restrictive",
12 |     "network_datasets": {
13 |         "empty":     { "do_aggregate_subnets": true }, 
14 |         "dense":     { "do_aggregate_subnets": true },    
15 |         "celloracle_human":      { "do_aggregate_subnets": true }        
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/ggrn_docker_backend/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "ggrn_docker_backend",
 3 |     "nickname": "ggrn_docker_backend",
 4 |     "readme": "Test of the ggrn backend that runs a user-specified docker container.",
 5 |     "question": "None", 
 6 |     "is_active": true,
 7 |     "factor_varied": "regression_method",  
 8 |     "color_by": null,
 9 |     "facet_by": null,
10 |     "perturbation_dataset": "nakatake",
11 |     "kwargs": {
12 |         "my_sweepy_hyperparameter": [0, 1],
13 |         "my_constant_hyperparameter": 0
14 |     },
15 |     "kwargs_to_expand": ["my_sweepy_hyperparameter"],
16 |     "regression_method":"docker____ekernf01/ggrn_docker_backend_template" 
17 | }
18 | 
19 | 


--------------------------------------------------------------------------------
/run_experiments.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -l
 2 | #SBATCH --job-name="ericBenchmarking"
 3 | #SBATCH --partition=parallel
 4 | #SBATCH --time=72:00:00
 5 | #SBATCH --nodes=1
 6 | #SBATCH --ntasks-per-node=16
 7 | # Usage: run_experiments.sh <regex>   Runs pereggrn once per entry of experiments/ whose name matches <regex>.
 8 | source "${HOME}/mambaforge/etc/profile.d/conda.sh"
 9 | conda init
10 | conda activate ggrn
11 | 
12 | pattern="${1:?Usage: $0 <extended regex matching experiment names>}"  # abort with a message instead of silently running nothing
13 | for experiment in $(ls -1 experiments | grep -E -- "$pattern")  # quoted so the regex is not word-split or glob-expanded; names are assumed to contain no whitespace
14 | do
15 |     echo "Starting ${experiment}"
16 |     echo "Monitor progress:
17 | less experiments/${experiment}/err.txt
18 | less experiments/${experiment}/stdout.txt
19 | "
20 |     pereggrn --experiment_name "$experiment" --amount_to_do missing_models --verbosity 2 \
21 |         > "experiments/$experiment/stdout.txt" 2> "experiments/$experiment/err.txt"
22 | done
23 | 


--------------------------------------------------------------------------------
/experiments/1.4.2_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.2_1",
 3 |     "nickname": "GEARS",
 4 |     "readme": "Related Q asks, what's the best way of using a given network? GEARS has an interesting take on this. Here we test it out.",
 5 |     "question": "1.4.2",
 6 |     "is_active": true,
 7 |     "data_split_seed": [0, 1, 2],
 8 |     "regression_method":[
 9 |         "median",
10 |         "mean",
11 |         "GEARS" 
12 |     ],
13 |     "num_genes": 1000,
14 |     "facet_by": null,
15 |     "color_by": "data_split_seed",
16 |     "factor_varied": "regression_method",
17 |     "baseline_condition": 0,
18 |     "predict_self" : true,
19 |     "merge_replicates": false,
20 |     "perturbation_dataset": "nakatake"
21 | }
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/experiments/1.3.3_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.3_2",
 3 |     "nickname": "TransferLearning",
 4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
 5 |     "question": "1.3.3",
 6 |     "is_active": true,
 7 |     "refers_to": "1.3.3_1",
 8 |     "feature_extraction": [
 9 |         "mrna",
10 |         "mrna",
11 |         "geneformer_model_/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_benchmarking/geneformer_finetuned/231119_geneformer_CellClassifier_L2048_B12_LR9.707511253364405e-05_LSlinear_WU296.3165900045724_E10_Oadamw_F2"
12 |     ],
13 |     "perturbation_dataset": "replogle2"
14 | }    


--------------------------------------------------------------------------------
/experiments/1.3.2_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_1",
 3 |     "nickname": "cellTypeSpecific",
 4 |     "readme": "Do networks inferred for the cell type of interest work better than global networks or networks from the wrong cell types? Tested here with ANANSE cell-type-specific networks, ESC versus others, on the nakatake ESC perturbation data.",
 5 |     "question": "1.3.2",
 6 |     "is_active": false,
 7 |     "factor_varied": "network",
 8 |     "facet_by": null,    
 9 |     "color_by": null,
10 |     "merge_replicates": true,
11 |     "eligible_regulators": "human_tfs",
12 |     "perturbation_dataset": "nakatake",
13 |     "network_datasets": {
14 |         "empty": {},
15 |         "dense": {},
16 |         "ANANSE_0.5":{}
17 |     }
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_1",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "eligible_regulators": "human_tfs",
 8 |     "data_split_seed": [0],
 9 |     "merge_replicates": true,
10 |     "regression_method":[
11 |         "RidgeCV"
12 |     ],
13 |     "matching_method": ["steady_state", "closest"],
14 |     "prediction_timescale": ["1","3","10"],
15 |     "factor_varied": "matching_method", 
16 |     "color_by": "prediction_timescale", 
17 |     "facet_by": null, 
18 |     "perturbation_dataset": "nakatake"
19 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_15/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_15",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "paul1", 
 9 |     "visualization_embedding": "X_draw_graph_fa",
10 |     "eligible_regulators": "tf",
11 |     "species": "mouse",
12 |     "matching_method": ["steady_state", "closest", "optimal_transport", "random", "user"],
13 |     "matching_method_for_evaluation": "user",
14 |     "network_datasets": {
15 |         "celloracle_mouse_atac_atlas": { "do_aggregate_subnets": true }
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.2.2_22/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_22",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "refers_to": "1.2.2_14",
 8 |     "perturbation_dataset": "paul2",
 9 |     "visualization_embedding": "X_draw_graph_fa", 
10 |     "eligible_regulators": "tf",
11 |     "species": "mouse",
12 |     "matching_method": ["steady_state", "closest", "optimal_transport", "random", "user"],   
13 |     "matching_method_for_evaluation": "user",
14 |     "network_datasets": {
15 |         "celloracle_mouse_atac_atlas": { "do_aggregate_subnets": true }
16 |     }
17 | }


--------------------------------------------------------------------------------
/experiments/1.4.1_3/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.1_3",
 3 |     "is_active": true,
 4 |     "refers_to": "1.4.1_0",
 5 |     "perturbation_dataset": "paul2", 
 6 |     "visualization_embedding": "X_draw_graph_fa", 
 7 |     "matching_method": "user",
 8 |     "matching_method_for_evaluation": "user",
 9 |     "eligible_regulators": "tfs",
10 |     "species"   : "mouse",
11 |     "network_datasets": { 
12 |         "cellnet_mouse_mogene":  { "do_aggregate_subnets": false },
13 |         "cellnet_mouse_4302":  { "do_aggregate_subnets": false },
14 |         "empty": { "do_aggregate_subnets": true }, 
15 |         "dense": { "do_aggregate_subnets": true }, 
16 |         "celloracle_mouse": { "do_aggregate_subnets": true },
17 |         "celloracle_mouse_atac_atlas": { "do_aggregate_subnets": true }
18 |     }
19 | }


--------------------------------------------------------------------------------
/experiments/1.4.1_4/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.1_4",
 3 |     "is_active": true,
 4 |     "refers_to": "1.4.1_0",
 5 |     "perturbation_dataset": "paul1", 
 6 |     "visualization_embedding": "X_draw_graph_fa",
 7 |     "matching_method": "user",
 8 |     "matching_method_for_evaluation": "user",
 9 |     "eligible_regulators": "tfs",
10 |     "species"   : "mouse",
11 |     "network_datasets": { 
12 |         "cellnet_mouse_mogene":  { "do_aggregate_subnets": false },
13 |         "cellnet_mouse_4302":  { "do_aggregate_subnets": false },
14 |         "empty": { "do_aggregate_subnets": true }, 
15 |         "dense": { "do_aggregate_subnets": true }, 
16 |         "celloracle_mouse": { "do_aggregate_subnets": true },
17 |         "celloracle_mouse_atac_atlas": { "do_aggregate_subnets": true }
18 |     }
19 | }


--------------------------------------------------------------------------------
/experiments/1.1.2_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.1.2_1",
 3 |     "nickname": "pruning_nakatake",
 4 |     "readme": "This experiment compares models that use all genes as predictors versus models that only allow TFs to regulate other genes.",
 5 |     "question": "1.1.2",
 6 |     "is_active": false,
 7 |     "factor_varied": "only_tfs_are_regulators",
 8 |     "network_prior": "ignore",
 9 |     "eligible_regulators": ["human_tfs", "all"],
10 |     "regression_method": ["mean", "RidgeCV"],
11 |     "data_split_seed": [0,1,2],
12 |     "facet_by": "data_split_seed",
13 |     "color_by": "regression_method",
14 |     "baseline_condition": [0,1,2,0,1,2,0,1,2,0,1,2],
15 |     "merge_replicates": true,
16 |     "perturbation_dataset": "nakatake",
17 |     "network_datasets": {
18 |         "dense":{}
19 |     }
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/experiments/1.6.3_0/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.3_0",
 3 |     "nickname": "timeseries low rank",
 4 |     "readme": "Comparison of linear models with full-rank and low-rank transition matrices",
 5 |     "question": "1.6.3",
 6 |     "is_active": true,
 7 |     "facet_by": "type_of_split",
 8 |     "color_by": "regression_method",
 9 |     "factor_varied": "data_split_seed",
10 |     "type_of_split": "timeseries",
11 |     "num_genes": 2000,
12 |     "regression_method": "autoregressive",
13 |     "prediction_timescale": "1",
14 |     "low_dimensional_structure" : "dynamics", 
15 |     "low_dimensional_training": ["pca", "supervised"], 
16 |     "low_dimensional_value": [2, 3, 5, 20, 100, 2000],
17 |     "matching_method": "optimal_transport",
18 |     "perturbation_dataset": "definitive_endoderm", 
19 |     "predict_self": true
20 | }


--------------------------------------------------------------------------------
/make_figures/figure_2_demo.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scanpy as sc
 4 | import pereggrn_perturbations
 5 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
 6 | import os 
 7 | import altair as alt
 8 | 
 9 | print(os.listdir("../experiments/1.0_1/outputs"))
10 | X = pd.read_parquet("../experiments/1.0_1/outputs/evaluationPerPert.parquet")
11 | print(X.query("gene=='GATA3' & condition==0").T.to_csv())
12 | 
13 | # Metrics:
14 | # pearsonCorr,0.8628334731048072
15 | # spearmanCorr,0.6939872813635597
16 | # logFCNorm2,116.28345489501952
17 | # mae,0.24543215334415436
18 | # mse,1915.5604908063688
19 | # spearman,0.6103465858719491
20 | # proportion_correct_direction,0.8333679186553227
21 | # mse_top_20,207.51050154170025
22 | # mse_top_100,529.9619580687759
23 | # mse_top_200,752.8383862192422
24 | # cell_type_correct,0.0


--------------------------------------------------------------------------------
/experiments/1.3.1_1/metadata.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |     "unique_id": "1.3.1_1",
 4 |     "nickname": "CellTypeSpecificRegression",
 5 |     "readme": "Q1.3 is about bias versus variance: does it work best to treat cell types as identical (high bias), separate (high variance), or similar (compromise)? This experiment (and sequels that refer to it) investigates the two extreme options by training one regression per cluster with either lots of clusters, or all data in one cluster. This experiment is currently not active and it may require some work to get it running again.",
 6 |     "question": "1.3.1",
 7 |     "is_active": false,
 8 |     "factor_varied": "num_clusters",
 9 |     "color_by": null,
10 |     "facet_by": "network_prior",
11 |     "merge_replicates": true,
12 |     "perturbation_dataset": "nakatake",
13 |     "network_datasets": {
14 |         "celloracle_human":{}
15 |     }
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/experiments/1.5.2_0/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.2_0",
 3 |     "nickname": "onesc",
 4 |     "readme": "This experiment allows colleagues in the Cahan Lab to systematically test a new method called oneSC.",
 5 |     "question": "1.8.1",
 6 |     "is_active": true,
 7 |     "facet_by": null,
 8 |     "color_by": "regression_method",
 9 |     "factor_varied": "regression_method",
10 |     "type_of_split": "timeseries",
11 |     "eligible_regulators": "all",
12 |     "matching_method": "optimal_transport",
13 |     "num_genes": 50,
14 |     "expand": "ladder",
15 |     "regression_method": [ 
16 |         "docker____ekernf01/ggrn_docker_backend_timeseries_baseline",
17 |         "docker____ekernf01/ggrn_docker_backend_onesc"
18 |     ],
19 |     "prediction_timescale": [1, 2, 3, 4, 5, 10, 100, 1000],
20 |     "expand_prediction_timescale": false,
21 |     "perturbation_dataset": "definitive_endoderm"
22 | }


--------------------------------------------------------------------------------
/experiments/1.8.3_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.3_1",
 3 |     "nickname": "gene_selection",
 4 |     "readme": "Gene-selection experiment but with a different dataset.",
 5 |     "question": "1.8.3",
 6 |     "is_active": true,
 7 |     "factor_varied": "num_genes",
 8 |     "data_split_seed": [0],
 9 |     "color_by": "type_of_split",
10 |     "type_of_split": ["interventional"],
11 |     "facet_by": null,
12 |     "merge_replicates": true,
13 |     "regression_method": "RidgeCV",
14 |     "perturbation_dataset": "nakatake",
15 |     "eligible_regulators": "human_tfs",
16 |     "num_genes": [500, 1000, 2000, 5000, 10000],
17 |     "network_prior": "restrictive",
18 |     "network_datasets": {
19 |         "empty":     { "do_aggregate_subnets": true }, 
20 |         "dense":     { "do_aggregate_subnets": true },    
21 |         "celloracle_human":      { "do_aggregate_subnets": true }        
22 |     }
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/experiments/1.3.3_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.3_1",
 3 |     "nickname": "TransferLearning",
 4 |     "readme": "Q1.3.3 is about whether we can learn causal effects by pretraining a big fat transformer on a big fat collection of scRNA data.",
 5 |     "question": "1.3.3",
 6 |     "is_active": true,
 7 |     "regression_method":[
 8 |         "mean", 
 9 |         "median", 
10 |         "RidgeCV"
11 |     ],
12 |     "feature_extraction": [
13 |         "mrna",
14 |         "mrna",
15 |         "geneformer_hyperparam_finetune"
16 |     ],
17 |     "expand": "ladder",
18 |     "eligible_regulators": "all",
19 |     "predict_self": true,
20 |     "data_split_seed": [0],    
21 |     "type_of_split": ["interventional"],
22 |     "num_genes": 10000,
23 |     "facet_by": null,
24 |     "color_by": "type_of_split",
25 |     "factor_varied": "regression_method",
26 |     "baseline_condition": 0,
27 |     "merge_replicates": true,
28 |     "perturbation_dataset": "nakatake",
29 |     "network_datasets": {
30 |         "dense":{}
31 |     }
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/experiments/1.2.2_14/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.2.2_14",
 3 |     "nickname": "matching_and_timescale",
 4 |     "readme": "Testing whether we should use the steady-state assumption or match to controls, and whether we should predict after one, a few, or many time-steps.",
 5 |     "question": "1.2.2",
 6 |     "is_active": true,
 7 |     "eligible_regulators": "human_tfs",
 8 |     "data_split_seed": [0],
 9 |     "type_of_split": "timeseries",
10 |     "regression_method":[
11 |         "RidgeCV"
12 |     ],
13 |     "matching_method": ["steady_state", "closest", "optimal_transport", "random"],
14 |     "num_genes": 2000,
15 |     "prediction_timescale": "1,2,3,10",
16 |     "cell_type_sharing_strategy": "distinct",
17 |     "factor_varied": "matching_method", 
18 |     "color_by": "prediction_timescale", 
19 |     "facet_by": null, 
20 |     "perturbation_dataset": "definitive_endoderm", 
21 |     "visualization_embedding": "X_pca",
22 |     "network_prior": "restrictive",
23 |     "network_datasets": {
24 |         "endoderm": { "do_aggregate_subnets": true }
25 |     }
26 | }


--------------------------------------------------------------------------------
/experiments/1.8.2_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.8.2_1",
 3 |     "nickname": "how_much_averaging",
 4 |     "readme": "Averaging within each perturbation reduces noise and focuses on biological variation due to perturbations. However, it hides potentially useful variation from sample to sample. This experiment tests whether one approach or the other tends to yield better expression forecasting results.",
 5 |     "question": "1.8.3",
 6 |     "is_active": true,
 7 |     "factor_varied": "network_datasets",
 8 |     "data_split_seed": [0],
 9 |     "color_by": "type_of_split",
10 |     "type_of_split": ["interventional"],
11 |     "facet_by": null,
12 |     "merge_replicates": [true, false],
13 |     "regression_method": "RidgeCV",
14 |     "perturbation_dataset": "nakatake",
15 |     "eligible_regulators": "human_tfs",
16 |     "num_genes": [2000],
17 |     "network_prior": "restrictive",
18 |     "network_datasets": {
19 |         "empty":     { "do_aggregate_subnets": true }, 
20 |         "dense":     { "do_aggregate_subnets": true },    
21 |         "celloracle_human":      { "do_aggregate_subnets": true }        
22 |     }
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/license.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Eric Kernfeld
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/environment/conda_inputs.yaml:
--------------------------------------------------------------------------------
 1 | name: ggrn
 2 | channels:
 3 |   - pytorch
 4 |   - pyg
 5 |   - conda-forge
 6 |   - bioconda
 7 |   - defaults
 8 |   - lingfeiwang
 9 | dependencies:
10 |   - pandas[version='>=2.0']  # bracket constraints use key='value' form, as in the numpy entry below
11 |   - python-duckdb
12 |   - pytorch-lightning
13 |   - scipy
14 |   - scanpy
15 |   - pytorch
16 |   - torchtext
17 |   - torchvision
18 |   - torchaudio
19 |   - numpy[version='<=1.23']
20 |   - cython
21 |   - scikit-learn
22 |   - pyarrow
23 |   - python=3.9
24 |   - jupyterlab
25 |   - jupyter
26 |   - ipywidgets
27 |   - yaml
28 |   - wandb
29 |   - selenium=3.141.0
30 |   - altair
31 |   - altair_saver
32 |   - dcor
33 |   - scanpy
34 |   - python-igraph
35 |   - louvain
36 |   - genomepy
37 |   - goatools
38 |   - python-annoy
39 |   - pyreadr
40 |   - regex
41 |   - scikit-misc
42 |   - matplotlib[version='>=3.4,<3.5']
43 |   - rpy2
44 |   - anndata2ri
45 |   - deprecated
46 |   - datasets
47 |   - gseapy
48 |   - dictys
49 |   - velocyto.py
50 |   - pyg
51 |   - tensorboard[version='>=2.4,<2.5']
52 |   - gimmemotifs[version='==0.17.1']
53 |   - transformers
54 |   - accelerate
55 |   - hyperopt
56 |   - grpcio[version='<=1.49']
57 |   - memray
58 |   - fa2
59 | 


--------------------------------------------------------------------------------
/experiments/1.6.1_18/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_18",
 3 |     "nickname": "dcdfg",
 4 |     "readme": "Parameter sweep for a less sparse NO-TEARS model.",
 5 |     "refers_to": "1.6.1_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "dixit",
 8 |     "network_datasets": {
 9 |         "dense":{}
10 |     },
11 |     "starting_expression": ["control"],
12 |     "expand": "ladder", 
13 |     "regression_method":[
14 |         "mean", 
15 |         "median",
16 | 
17 |         "DCDFG-spectral_radius-linearlr-False",   
18 |         "DCDFG-spectral_radius-linearlr-False",   
19 |         "DCDFG-spectral_radius-linearlr-False",   
20 |         "DCDFG-spectral_radius-linearlr-False",   
21 |         "DCDFG-spectral_radius-linearlr-False",   
22 | 
23 |         "DCDFG-spectral_radius-mlplr-False", 
24 |         "DCDFG-spectral_radius-mlplr-False",
25 |         "DCDFG-spectral_radius-mlplr-False",
26 |         "DCDFG-spectral_radius-mlplr-False",
27 |         "DCDFG-spectral_radius-mlplr-False"
28 |     ],
29 |     "pruning_parameter": [
30 |         0, 0, 
31 |         0.0001, 0.001, 0.01, 0.1, 1, 
32 |         0.0001, 0.001, 0.01, 0.1, 1
33 |     ]
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_10/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_10",
 3 |     "nickname": "cellTypeSpecificFANTOM5CMAP",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": false,
 7 |     "perturbation_dataset": "cmap",
 8 |     "default_level": null,
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "magnum_compendium_394": {
13 |             "subnets": [
14 |                 "retinal_pigment_epithelial_cells.parquet",
15 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
16 |                 "teratocarcinoma_cell_line.parquet",
17 |                 "lung_adenocarcinoma_cell_line.parquet",
18 |                 "breast_carcinoma_cell_line.parquet",
19 |                 "embryonic_kidney_cell_line.parquet",
20 |                 "hepatocellular_carcinoma_cell_line.parquet",
21 |                 "epitheloid_cancer_cell_line.parquet",
22 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet"
23 |             ],
24 |             "do_aggregate_subnets": false
25 |         }
26 |     }
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_6/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_6",
 3 |     "nickname": "cellTypeSpecificFANTOM5Replogle1",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle",
 8 |     "merge_replicates": true,
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "magnum_compendium_394": {
13 |             "subnets": [
14 |                 "retinal_pigment_epithelial_cells.parquet",
15 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
16 |                 "teratocarcinoma_cell_line.parquet",
17 |                 "lung_adenocarcinoma_cell_line.parquet",
18 |                 "breast_carcinoma_cell_line.parquet",
19 |                 "embryonic_kidney_cell_line.parquet",
20 |                 "hepatocellular_carcinoma_cell_line.parquet",
21 |                 "epitheloid_cancer_cell_line.parquet",
22 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet"
23 |             ],
24 |             "do_aggregate_subnets": false
25 |         }
26 |     }
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_9/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_9",
 3 |     "nickname": "cellTypeSpecificFANTOM5Replogle4",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "merge_replicates": true,
 8 |     "perturbation_dataset": "replogle4",
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "magnum_compendium_394": {
13 |             "subnets": [
14 |                 "retinal_pigment_epithelial_cells.parquet",
15 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
16 |                 "teratocarcinoma_cell_line.parquet",
17 |                 "lung_adenocarcinoma_cell_line.parquet",
18 |                 "breast_carcinoma_cell_line.parquet",
19 |                 "embryonic_kidney_cell_line.parquet",
20 |                 "hepatocellular_carcinoma_cell_line.parquet",
21 |                 "epitheloid_cancer_cell_line.parquet",
22 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet"
23 |             ],
24 |             "do_aggregate_subnets": false
25 |         }
26 |     }
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_7/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_7",
 3 |     "nickname": "cellTypeSpecificFANTOM5Replogle2",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "perturbation_dataset": "replogle2",    
 8 |     "merge_replicates": true,
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "magnum_compendium_394": {
13 |             "subnets": [
14 |                 "retinal_pigment_epithelial_cells.parquet",
15 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
16 |                 "teratocarcinoma_cell_line.parquet",
17 |                 "lung_adenocarcinoma_cell_line.parquet",
18 |                 "breast_carcinoma_cell_line.parquet",
19 |                 "embryonic_kidney_cell_line.parquet",
20 |                 "hepatocellular_carcinoma_cell_line.parquet",
21 |                 "epitheloid_cancer_cell_line.parquet",
22 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet"
23 |             ],
24 |             "do_aggregate_subnets": false
25 |         }
26 |     }
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/1.3.2_8/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.3.2_8",
 3 |     "nickname": "cellTypeSpecificFANTOM5Replogle3",
 4 |     "readme": "This experiment uses the same logic as experiment <refers_to>, with different networks and perturbation data.",
 5 |     "refers_to": "1.3.2_1",
 6 |     "is_active": true,
 7 |     "merge_replicates": true,
 8 |     "perturbation_dataset": "replogle3",    
 9 |     "network_datasets": {
10 |         "empty": {},
11 |         "dense": {},
12 |         "magnum_compendium_394": {
13 |             "subnets": [
14 |                 "retinal_pigment_epithelial_cells.parquet",
15 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
16 |                 "teratocarcinoma_cell_line.parquet",
17 |                 "lung_adenocarcinoma_cell_line.parquet",
18 |                 "breast_carcinoma_cell_line.parquet",
19 |                 "embryonic_kidney_cell_line.parquet",
20 |                 "hepatocellular_carcinoma_cell_line.parquet",
21 |                 "epitheloid_cancer_cell_line.parquet",
22 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet"
23 |             ],
24 |             "do_aggregate_subnets": false
25 |         }
26 |     }
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/1.0_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.0_1",
 3 |     "nickname": "ml methods",
 4 |     "readme": "We test a slate of regression methods to see if anything can beat ... the mean of the training data.",
 5 |     "question": "1.0",
 6 |     "data_split_seed": [0],    
 7 |     "type_of_split": ["interventional"],
 8 |     "regression_method":[
 9 |         "mean", 
10 |         "median", 
11 |         "GradientBoostingRegressor",
12 |         "ExtraTreesRegressor",
13 |         "KernelRidge",
14 |         "RidgeCV", 
15 |         "RidgeCVExtraPenalty",
16 |         "LassoCV",
17 |         "ElasticNetCV",
18 |         "OrthogonalMatchingPursuitCV",
19 |         "BayesianRidge",         
20 |         "docker____ekernf01/ggrn_docker_backend_ahlmann_eltze"
21 |     ],
22 |     "num_genes": 10000,
23 |     "eligible_regulators": "human_tfs",
24 |     "is_active": true,
25 |     "facet_by": null,
26 |     "color_by": "type_of_split",
27 |     "factor_varied": "regression_method",
28 |     "visualization_embedding": "X_umap",
29 |     "baseline_condition": 0,
30 |     "merge_replicates": true,
31 |     "perturbation_dataset": "nakatake",
32 |     "network_datasets": {
33 |         "dense":{}
34 |     }
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/gather_experiment_metadata.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | import pereggrn.experimenter as experimenter
 4 | import pereggrn_networks
 5 | import pereggrn_perturbations
 6 | pereggrn_networks.set_grn_location("../network_collection/networks")
 7 | pereggrn_perturbations.set_data_path("../perturbation_data/perturbations")
 8 | all_active_experiments = []
 9 | for experiment in os.listdir("experiments"):
10 |     try:
11 |         all_active_experiments.append(pd.DataFrame(
12 |             {
13 |                 k:experimenter.validate_metadata(experiment, input_folder = "experiments")[k]
14 |                 for k in ["nickname", "refers_to", "readme"]
15 |             }, 
16 |             index = [experiment]
17 |         ))
18 |     except:
19 |         all_active_experiments.append(pd.DataFrame(
20 |             {
21 |                 k:"Could not validate the metadata -- likely an inactive experiment."
22 |                 for k in ["nickname", "refers_to", "readme"]
23 |             }, 
24 |             index = [experiment]
25 |         ))
26 | pd.concat(all_active_experiments).sort_index().to_csv("all_experiments.tsv", sep = "\t", index = True)
27 | print("Done. See results in all_experiments.tsv.")
28 |         
29 | 
30 | 


--------------------------------------------------------------------------------
/experiments/singularity_demo/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "singularity_demo",
 3 |     "nickname": "singularity_demo",
 4 |     "readme": "Trying to get backends working with singularity",
 5 |     "question": "1.8.1",
 6 |     "is_active": true,
 7 |     "facet_by": "type_of_split",
 8 |     "color_by": "regression_method",
 9 |     "factor_varied": "data_split_seed",
10 |     "type_of_split": "timeseries",
11 |     "eligible_regulators": "human_tfs",
12 |     "num_genes": 2000,
13 |     "expand": "ladder",
14 |     "regression_method": [ 
15 | 
16 |         "singularity____ekernf01/ggrn_docker_backend_sckinetics",
17 |         "singularity____ekernf01/ggrn_docker_backend_dictys",
18 | 
19 |         "singularity____ekernf01/ggrn_docker_backend_celloracle",
20 |         "singularity____ekernf01/ggrn_docker_backend_timeseries_baseline",
21 |         "singularity____ekernf01/ggrn_docker_backend_prescient"
22 |     ],
23 |     "prediction_timescale": [
24 |         "1", "1", 
25 |         "1,2,3,5,10",  "1,2,3,4",  "1,2,3,4"
26 |     ],
27 |     "network_datasets": { 
28 |         "celloracle_human":{}
29 |     },
30 |     "matching_method": "optimal_transport",
31 |     "perturbation_dataset": "definitive_endoderm"
32 | }
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/environment/install_minimal.sh:
--------------------------------------------------------------------------------
 1 | # Minimal installation: downloads the minimal data collections, clones the
 2 | # experiment repo, and installs the Python packages into a fresh conda env.
 3 | # -p keeps a re-run from failing when the folder already exists.
 4 | mkdir -p expression_forecasting_benchmarks
 5 | cd expression_forecasting_benchmarks
 6 | # Get data collections from Zenodo 
 7 | # accessory data, e.g. pLI and list of TF names
 8 | wget https://zenodo.org/record/13345104/files/accessory_data.zip  && unzip accessory_data.zip 
 9 | # perturbations 
10 | wget https://zenodo.org/records/13785607/files/perturbation_data_minimal.zip && unzip perturbation_data_minimal.zip && mv perturbation_data_minimal perturbation_data
11 | # networks
12 | wget https://zenodo.org/records/13785607/files/network_collection_minimal.zip && unzip network_collection_minimal.zip && mv network_collection_minimal network_collection
13 | 
14 | # Get experiment metadata and project folder layout
15 | git clone https://github.com/ekernf01/perturbation_benchmarking
16 | # Install python packages
17 | # -y avoids an interactive prompt that would stall the remaining commands.
18 | conda create -y -n ggrn_minimal python=3.9
19 | conda activate ggrn_minimal
20 | conda install -y pip
21 | pip install vl-convert-python
22 | # Quoted so the shell does not treat the square brackets as a glob pattern.
23 | pip install 'ray[tune]'
24 | pip install pyarrow
25 | for p in pereggrn_networks pereggrn_perturbations ggrn pereggrn
26 | do
27 |     # pip has no --branch option; a git ref is selected with @<ref> in the URL.
28 |     pip install "git+https://github.com/ekernf01/${p}@v2"
29 | done
30 | echo "Installation has finished. Test your installation:"
31 | echo "    conda activate ggrn_minimal"
32 | echo "    cd perturbation_benchmarking"
33 | echo "    pereggrn -h # see the help page"
34 | echo "    pereggrn  --output example_output --input experiments --experiment_name '1.0_0' --networks ../network_collection/networks --data ../perturbation_data/perturbations --amount_to_do models --no_skip_bad_runs"
35 | 


--------------------------------------------------------------------------------
/make_figures/global_effects/dixit.txt:
--------------------------------------------------------------------------------
 1 | deg,mi,mean,norm2,median
 2 | -999.0,1.910162315127668,0.5038446881503853,52.312931645534476,0.30094033544637744
 3 | -999.0,1.9855050253844826,0.4916380547221156,52.56394543285953,0.29798986083162815
 4 | -999.0,2.0302618566942527,0.4756141754671866,52.976473363440824,0.2845773643314999
 5 | -999.0,2.0796098255382485,0.451007774304164,51.14938597693658,0.26872471232975415
 6 | -999.0,2.0411439603906376,0.4667585060832745,52.93853598842698,0.28042979028179643
 7 | -999.0,2.4129531576200356,0.27371691027161144,30.350002845250383,0.16881920648740878
 8 | -999.0,2.3629876602850333,0.3063420993997507,33.46881620355791,0.1840810704980273
 9 | -999.0,2.01893161752782,0.482825004038108,53.32670494361112,0.29113124258929723
10 | -999.0,2.393002815297529,0.27753547068215606,30.33618567283924,0.17437199764860553
11 | -999.0,2.4777122626670756,0.24799212287514608,26.89714683792748,0.16440947866471348
12 | -999.0,2.70443560791687,0.19210115416587883,21.429255506453305,0.11923406669115186
13 | -999.0,2.3644642097930713,0.29890899266547666,33.13344444757394,0.1838976806801691
14 | -999.0,2.3244762960247396,0.3169443810809533,34.45570125268245,0.19448842193074148
15 | -999.0,2.377290821787801,0.2958279112113571,32.629771020614584,0.18001584414493516
16 | -999.0,2.068408380861639,0.44690851398302267,50.68586283808235,0.2725523351475753
17 | -999.0,1.9784142893048147,0.502264491921718,53.931519971231076,0.29953037704410534
18 | -999.0,1.9605012824830634,0.48614706656257084,50.02607076605626,0.303037074808737
19 | -999.0,2.386350421230154,0.29610498570392146,32.9446849226944,0.1778979505850157
20 | 


--------------------------------------------------------------------------------
/experiments/1.4.5_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.5_1",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "factor_varied": "network_datasets",
 8 |     "type_of_split": "interventional",
 9 |     "data_split_seed": [0],
10 |     "desired_heldout_fraction": [1],
11 |     "color_by": null,
12 |     "facet_by": null,
13 |     "regression_method": "?????",
14 |     "perturbation_dataset": "nakatake",
15 |     "eligible_regulators": "all",    
16 |     "starting_expression": ["heldout"],
17 |     "num_genes": 10000,
18 |     "network_datasets": {
19 |         "celloracle_human":      { "do_aggregate_subnets": true },
20 |         "gtex_rna":              { "do_aggregate_subnets": true },
21 |         "magnum_compendium_32":  { "do_aggregate_subnets": true },   
22 |         "magnum_compendium_ppi": { "do_aggregate_subnets": true },
23 |         "cellnet_human_Hg1332":  { "do_aggregate_subnets": true },
24 |         "cellnet_human_Hugene":  { "do_aggregate_subnets": true },
25 |         "MARA_FANTOM4":          { "do_aggregate_subnets": true },
26 |         "STRING":                { "do_aggregate_subnets": true },
27 |         "ANANSE_0.5":            { "do_aggregate_subnets": true },
28 |         "ANANSE_tissue_0.5":     { "do_aggregate_subnets": true },
29 |         "humanbase":             { "do_aggregate_subnets": true }
30 |     }
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_1",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "factor_varied": "network_datasets",
 8 |     "type_of_split": "interventional",
 9 |     "data_split_seed": [0],
10 |     "desired_heldout_fraction": [1],
11 |     "allowed_regulators_vs_network_regulators": "union",
12 |     "color_by": null,
13 |     "facet_by": null,
14 |     "regression_method": "regulon",
15 |     "perturbation_dataset": "nakatake",
16 |     "eligible_regulators": "all",
17 |     "num_genes": 10000,
18 |     "network_datasets": {
19 |         "celloracle_human":      { "do_aggregate_subnets": true },
20 |         "gtex_rna":              { "do_aggregate_subnets": true },
21 |         "magnum_compendium_32":  { "do_aggregate_subnets": true },   
22 |         "magnum_compendium_ppi": { "do_aggregate_subnets": true },
23 |         "cellnet_human_Hg1332":  { "do_aggregate_subnets": true },
24 |         "cellnet_human_Hugene":  { "do_aggregate_subnets": true },
25 |         "MARA_FANTOM4":          { "do_aggregate_subnets": true },
26 |         "STRING":                { "do_aggregate_subnets": true },
27 |         "ANANSE_0.5":            { "do_aggregate_subnets": true },
28 |         "ANANSE_tissue_0.5":     { "do_aggregate_subnets": true },
29 |         "humanbase":             { "do_aggregate_subnets": true }
30 |     }
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/experiments/1.9_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.9_1",
 3 |     "nickname": "base_network_simulation",
 4 |     "is_active": true,    
 5 |     "readme": "Testing different network structures like in experiment 1.4.3_1, but with simulated data based on a known network. ",
 6 |     "question": "1.9",
 7 |     "factor_varied": "network_datasets",
 8 |     "color_by": "type_of_split",
 9 |     "type_of_split": ["interventional"],
10 |     "facet_by": null,
11 |     "regression_method": "RidgeCV",
12 |     "eligible_regulators": "all",
13 |     "num_genes": 10000,
14 |     "network_prior": "restrictive",
15 |     "network_datasets": {
16 |         "empty":     { "do_aggregate_subnets": true }, 
17 |         "dense":     { "do_aggregate_subnets": true },    
18 |         "celloracle_human":      { "do_aggregate_subnets": true },
19 |         "gtex_rna":              { "do_aggregate_subnets": true },
20 |         "magnum_compendium_32":  { "do_aggregate_subnets": true },   
21 |         "magnum_compendium_ppi": { "do_aggregate_subnets": true },
22 |         "cellnet_human_Hg1332":  { "do_aggregate_subnets": true },
23 |         "cellnet_human_Hugene":  { "do_aggregate_subnets": true },
24 |         "MARA_FANTOM4":          { "do_aggregate_subnets": true },
25 |         "STRING":                { "do_aggregate_subnets": true },
26 |         "ANANSE_0.5":            { "do_aggregate_subnets": true },
27 |         "ANANSE_tissue_0.5":     { "do_aggregate_subnets": true },
28 |         "humanbase":             { "do_aggregate_subnets": true }
29 |     },
30 |     "matching_method": ["user"],
31 |     "predict_self": true,
32 |     "data_split_seed": [0,1,2],
33 |     "prediction_timescale": 1,
34 |     "perturbation_dataset": "simulation_TrueNetwork=cellnet_human_Hg1332_S=1_NoiseSD=0"
35 | }


--------------------------------------------------------------------------------
/experiments/1.5.1_0/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.5.1_0",
 3 |     "nickname": "timeseries pilot",
 4 |     "readme": "Direct comparison of published timeseries methods",
 5 |     "question": "1.5.1",
 6 |     "is_active": true,
 7 |     "facet_by": "type_of_split",
 8 |     "color_by": "regression_method",
 9 |     "factor_varied": "data_split_seed",
10 |     "type_of_split": "timeseries",
11 |     "eligible_regulators": "human_tfs",
12 |     "num_genes": 2000,
13 |     "pruning_parameter": 2000,
14 |     "expand": "ladder",
15 |     "regression_method": [ 
16 |         "median", 
17 |         "mean", 
18 |         "RidgeCV",
19 | 
20 |         "docker____ekernf01/ggrn_docker_backend_sckinetics",
21 |         "docker____ekernf01/ggrn_docker_backend_dictys",
22 | 
23 |         "docker____ekernf01/ggrn_docker_backend_celloracle",
24 |         "docker____ekernf01/ggrn_docker_backend_timeseries_baseline",
25 |         "docker____ekernf01/ggrn_docker_backend_prescient",
26 |         "docker____ekernf01/ggrn_docker_backend_rnaforecaster"
27 |     ],
28 |     "prediction_timescale": [
29 |         "1", "1", "1,2,3,10", 
30 |         "1", "1", 
31 |         "1,2,3,5",  "1,2,3,4",  "1,2,3,4",  "1,2,3,4"
32 |     ],
33 |     "cell_type_sharing_strategy": "distinct",
34 |     "predict_self": [
35 |         false, false, false, 
36 |         false, false, 
37 |         false, false, false, true
38 |     ],
39 |     "kwargs": [
40 |         {}, {}, {}, 
41 |         {}, { "minimum_expression": 0.05 }, 
42 |         {}, {}, {}, {}
43 |     ],
44 |     "matching_method": "optimal_transport",
45 |     "perturbation_dataset": "definitive_endoderm",
46 |     "visualization_embedding": "X_pca", 
47 |     "network_datasets": { 
48 |         "endoderm": { "do_aggregate_subnets": true }
49 |     },
50 |     "network_prior": "restrictive"
51 | }


--------------------------------------------------------------------------------
/experiments/1.6.1_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.6.1_1",
 3 |     "nickname": "published methods",
 4 |     "readme": "Comparison of several published methods (originally focused on DCD-FG, hence the numbering after Q1.6).",
 5 |     "question": "1.6",
 6 |     "is_active": true,
 7 |     "facet_by": "starting_expression",
 8 |     "color_by": null,
 9 |     "factor_varied": "regression_method",
10 |     "expand": "ladder", 
11 |     "kwargs": [
12 |         {},
13 |         {},
14 |         {},
15 |         {},
16 |         {},
17 |         {},
18 |         {},
19 |         {},
20 |         {},
21 |         {}
22 |     ],
23 |     "regression_method":[
24 |         "mean", 
25 |         "mean", 
26 |         "median",
27 |         "median",
28 |         "DCDFG-spectral_radius-linearlr-False",   
29 |         "DCDFG-spectral_radius-linearlr-False",   
30 |         "DCDFG-spectral_radius-mlplr-False" ,
31 |         "DCDFG-spectral_radius-mlplr-False" , 
32 |         "RidgeCV", 
33 |         "GEARS"
34 |     ],
35 |     "feature_extraction": [
36 |         "mrna",
37 |         "mrna",
38 |         "mrna",
39 |         "mrna",
40 |         "mrna",
41 |         "mrna",
42 |         "mrna",
43 |         "mrna",
44 |         "geneformer_hyperparam_finetune",
45 |         "mrna"
46 |     ],
47 |     "predict_self": [
48 |         false, 
49 |         false, 
50 |         false, 
51 |         false, 
52 |         false, 
53 |         false, 
54 |         false, 
55 |         false, 
56 |         true, 
57 |         true
58 |     ],
59 |     "baseline_condition": 0,
60 |     "merge_replicates": false,
61 |     "perturbation_dataset": "frangieh_IFNg_v1",
62 |     "num_genes": 1000,
63 |     "starting_expression": [
64 |         "control", "heldout", 
65 |         "control", "heldout", 
66 |         "control", "heldout", 
67 |         "control", "heldout", 
68 |         "control", 
69 |         "control"
70 |     ],
71 |     "network_datasets": {
72 |         "dense":{}
73 |     }
74 | }
75 | 
76 | 


--------------------------------------------------------------------------------
/make_figures/global_effects/freimer.txt:
--------------------------------------------------------------------------------
 1 | deg,mi,mean,norm2,median
 2 | 0.0,1.7343218862056689,0.19943972016478761,42.16134515105941,0.09792018098009196
 3 | 51.0,1.675038488044664,0.23554822621933932,54.19274235980161,0.09527179002206303
 4 | 10.0,1.8538888108379181,0.1759638531487992,40.833678096526164,0.07031262033430491
 5 | 0.0,1.9558892996805826,0.13874365642647515,31.642192934222216,0.05591693109502488
 6 | 0.0,1.9693811633459988,0.1412092087059149,32.88510834261646,0.05336172285389287
 7 | 4.0,1.9256675359984494,0.15453063646363532,34.8604855165297,0.06749591141784311
 8 | 0.0,2.0317514749918146,0.12513219543449508,30.84630393784162,0.043945878910136536
 9 | 0.0,1.8973835565613582,0.17346884928883674,37.993859804646206,0.07909252330845157
10 | 1.0,1.9886678737179506,0.13537700064122346,32.389979596614204,0.050204269213690804
11 | 6.0,1.9810338022765186,0.1489560543717019,37.17668530146186,0.05115143921775177
12 | 17.0,1.717687025316958,0.21406147981694806,49.087489821323366,0.08944421383182051
13 | 0.0,1.8989885921295004,0.15265015458299994,35.56572306981687,0.06026849941076856
14 | 4.0,1.8496164128167003,0.16461601442126586,37.547038757728295,0.07050215484510483
15 | 0.0,1.8954505998863567,0.16374288981899945,38.20787442150733,0.06554269694455271
16 | 4.0,1.911411814964397,0.16507045070989845,38.19916476743188,0.06351087037860026
17 | 23.0,1.6953779828055762,0.21809904406873054,48.8982368579704,0.09724737698332835
18 | 14.0,1.8347754156298597,0.18634992065106692,43.90672237759967,0.07674399865525601
19 | 1.0,1.8419721926257975,0.17449780062622644,39.97881022321453,0.0696136654723
20 | 0.0,2.046340688316985,0.12249124244804031,28.84714177458835,0.045549462287079054
21 | 1.0,1.9957782113815916,0.1462292994526221,34.403105547709714,0.05586016804821248
22 | 8.0,1.9201383488008854,0.16491124910620736,38.52060870601413,0.06592776566284995
23 | 8.0,1.7241701316419022,0.2075978526237536,46.08914444452816,0.08969086435808828
24 | 3.0,1.7997457523262692,0.1928282665887047,41.71048487408795,0.08552701148685775
25 | 7.0,1.8899789965283627,0.1696674008655979,39.87318625781164,0.0635335694135748
26 | 


--------------------------------------------------------------------------------
/experiments/1.4.3_1/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.3_1",
 3 |     "nickname": "base_network",
 4 |     "readme": "People have published big lists of TF-target or gene-gene relationships, often for GWAS interpretation or reprogramming. Existing benchmarks have limited information content and seldom compare these published network structures directly without introducing confounding factors. For instance, one might ask whether the networks used by CellNet, Mogrify, Irene, and CellOracle are of comparable value in predicting perturbation outcomes. Those methods have been compared, but they each involve many other components that may also affect the outcome, confounding the effect of network structure. This experiment benchmarks many networks using otherwise-equivalent methods to see how much each network helps predict held-out perturbations.",
 5 |     "question": "1.4.3",
 6 |     "is_active": true,
 7 |     "factor_varied": "network_datasets",
 8 |     "data_split_seed": [0],
 9 |     "color_by": "type_of_split",
10 |     "type_of_split": ["interventional"],
11 |     "facet_by": null,
12 |     "merge_replicates": true,
13 |     "regression_method": "RidgeCV",
14 |     "perturbation_dataset": "nakatake",
15 |     "eligible_regulators": "human_tfs",
16 |     "num_genes": 10000,
17 |     "visualization_embedding": "X_umap",
18 |     "network_prior": "restrictive",
19 |     "network_datasets": {
20 |         "empty":     { "do_aggregate_subnets": true }, 
21 |         "dense":     { "do_aggregate_subnets": true },    
22 |         "celloracle_human":      { "do_aggregate_subnets": true },
23 |         "gtex_rna":              { "do_aggregate_subnets": true },
24 |         "magnum_compendium_32":  { "do_aggregate_subnets": true },   
25 |         "magnum_compendium_ppi": { "do_aggregate_subnets": true },
26 |         "cellnet_human_Hg1332":  { "do_aggregate_subnets": true },
27 |         "cellnet_human_Hugene":  { "do_aggregate_subnets": true },
28 |         "MARA_FANTOM4":          { "do_aggregate_subnets": true },
29 |         "STRING":                { "do_aggregate_subnets": true },
30 |         "ANANSE_0.5":            { "do_aggregate_subnets": true },
31 |         "ANANSE_tissue_0.5":     { "do_aggregate_subnets": true },
32 |         "humanbase":             { "do_aggregate_subnets": true }
33 |     }
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/make_figures/timeseries_differential_expression.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scanpy as sc
 4 | import anndata
 5 | import pereggrn_perturbations
 6 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
 7 | import sys 
 8 | import altair as alt
 9 | import os
10 | 
11 | dataset = 'definitive_endoderm'
12 | adata = pereggrn_perturbations.load_perturbation(dataset,   is_timeseries=True)
13 | adata.obs['timepoint'] = adata.obs['timepoint'].astype('str')
14 | adata.uns["log1p"]["base"] = 2
15 | for celltype in ["endoderm", "mesendoderm"]:
16 |     sc.tl.rank_genes_groups(adata, "cell_type", groups=[celltype], reference='pluripotent', method='wilcoxon')
17 |     human_tf = pd.read_csv('../../accessory_data/tf_lists/human.txt', header=None)
18 |     X = sc.get.rank_genes_groups_df(adata, group=[celltype])
19 |     X = X.query("names in @human_tf[0].values")
20 |     X = X.query("pvals_adj < 0.05")
21 |     X.sort_values('scores', ascending=False, inplace=True)
22 |     X.head(30).to_csv(f"timeseries_plots/top30_differential_expression_{celltype}.csv")
23 | 
24 | dataset = 'fantom4'
25 | adata = pereggrn_perturbations.load_perturbation(dataset,   is_timeseries=True)
26 | adata.obs['timepoint'] = adata.obs['timepoint'].astype('str')
27 | sc.tl.rank_genes_groups(adata, "timepoint", groups=['96.0'], reference='0.0', method='wilcoxon')
28 | human_tf = pd.read_csv('../../accessory_data/tf_lists/human.txt', header=None)
29 | X = sc.get.rank_genes_groups_df(adata, group='96.0')
30 | X = X.query("names in @human_tf[0].values")
31 | X.sort_values('logfoldchanges', ascending=False, inplace=True)
32 | X.head(30).to_csv("timeseries_plots/top30_differential_expression_fantom4.csv")
33 | 
34 | dataset = 'paul1'
35 | adata = pereggrn_perturbations.load_perturbation(dataset,   is_timeseries=True)
36 | mouse_tf = pd.read_csv('../../accessory_data/tf_lists/mouse.txt', header=None)
37 | adata.obs["supertype"] = adata.obs["cell_type"].map({
38 |     "MEP": "ME",
39 |     "Erythroids": "ME",
40 |     "Megakaryocytes": "ME",
41 |     "DC": "DC",
42 |     "GMP": "GM",
43 |     "late_GMP": "GM",
44 |     "Monocytes": "GM",
45 |     "Granulocytes": "GM"
46 | })
47 | for st in ["GM","ME", "DC"]:
48 |     sc.tl.rank_genes_groups(adata, "supertype", groups=[st], reference='rest', method='wilcoxon')
49 |     X = sc.get.rank_genes_groups_df(adata, group=[st])
50 |     X = X.query("names in @mouse_tf[0].values")
51 |     X = X.query("pvals_adj < 0.05")
52 |     X.sort_values('logfoldchanges', ascending=False, inplace=True)
53 |     X.head(30).to_csv(f"timeseries_plots/top30_differential_expression_paul_{st}.csv")


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 |  ## A systematic comparison of computational methods for expression forecasting with [PEREGGRN](https://github.com/ekernf01/pereggrn) 
 2 |  
 3 | This repo contains benchmark experiments to evaluate various strategies for predicting gene expression after knockout, knockdown, or overexpression. 
 4 | 
 5 | ![image](https://github.com/ekernf01/perturbation_benchmarking/assets/5271803/ae7a5c86-dca6-49be-b048-743f8e110a18)
 6 | 
 7 | - For context and key results, see our [preprint](https://www.biorxiv.org/content/10.1101/2023.07.28.551039v2). 
 8 | - Install everything using [these instructions](https://github.com/ekernf01/perturbation_benchmarking/blob/main/environment/install.md).
 9 | - To repeat our experiments or run your own, see the [pereggrn](https://github.com/ekernf01/pereggrn) benchmarking software ([tutorial](https://github.com/ekernf01/pereggrn/blob/main/docs/tutorial.md), how to [add your own method](https://github.com/ekernf01/pereggrn/blob/main/docs/how_to.md#how-to-evaluate-a-new-method)).
10 | - If there's something you cannot find, go ahead and file a GitHub issue -- with your input, we hope to improve the project.
11 | 
12 | ### Related infrastructure
13 | 
14 | This project is tightly coupled with our collections of data, our GGRN package for dynamic models of gene regulatory networks, and our PEREGGRN package containing benchmarking infrastructure. 
15 | 
16 | - Install everything using [these instructions](https://github.com/ekernf01/perturbation_benchmarking/blob/main/environment/install.md).
17 | - Perturbation data, the network collection, and some accessory data (e.g. a list of TFs) are on Zenodo with DOI `10.5281/zenodo.15115945`.
18 |     - Our code expects each of those three folders to be unzipped and placed adjacent to this repo.
19 |     - Use our [perturbation loader](https://github.com/ekernf01/pereggrn_perturbations) and [network loader](https://github.com/ekernf01/pereggrn_networks) to easily access and validate data from Python.
20 | - [GGRN](https://github.com/ekernf01/ggrn), the Grammar of Gene Regulatory Networks, offers flexible combination of different features for regulatory network inference.
21 | - [PEREGGRN](https://github.com/ekernf01/pereggrn), PErturbation Response Evaluation via a Grammar of Gene Regulatory Networks, helps conduct the experiments that are specified in this repo.
22 | - To interact with the evaluation results and see the source data for our figures, [download (1GB)](https://zenodo.org/records/15115945/files/perturbation_benchmarking.zip?download=1) them from Zenodo (DOI: 10.5281/zenodo.15115945). 
23 | - Certain additional experiments are implemented in [our fork of DCD-FG](https://github.com/ekernf01/dcdfg).
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/experiments/1.4.4_2/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.4_2",
 3 |     "nickname": "network_only",
 4 |     "readme": "Are network-connected genes enriched for perturbation responses? This experiment uses network structure alone for prediction, with no training data and all perturbations reserved for evaluation.",
 5 |     "question": "1.4.4",
 6 |     "is_active": true,
 7 |     "factor_varied": "network_datasets",
 8 |     "type_of_split": "interventional",
 9 |     "data_split_seed": [0],
10 |     "desired_heldout_fraction": [1],
11 |     "allowed_regulators_vs_network_regulators": "union",
12 |     "color_by": null,
13 |     "facet_by": null,
14 |     "regression_method": "regulon",
15 |     "perturbation_dataset": "nakatake",
16 |     "eligible_regulators": "all",
17 |     "num_genes": 10000,
18 |     "network_datasets": {
19 |         "gtex_rna":              { "do_aggregate_subnets": false },
20 |         "cellnet_human_Hg1332":  { "do_aggregate_subnets": false },
21 |         "cellnet_human_Hugene":  { "do_aggregate_subnets": false },
22 |         "ANANSE_0.5":            { "do_aggregate_subnets": false },
23 |         "ANANSE_tissue_0.5":     { "do_aggregate_subnets": false },
24 |         "humanbase":             { "do_aggregate_subnets": false }, 
25 |         "magnum_compendium_394": { 
26 |             "subnets": [
27 |                 "retinal_pigment_epithelial_cells.parquet",
28 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
29 |                 "teratocarcinoma_cell_line.parquet",
30 |                 "lung_adenocarcinoma_cell_line.parquet",
31 |                 "breast_carcinoma_cell_line.parquet",
32 |                 "embryonic_kidney_cell_line.parquet",
33 |                 "hepatocellular_carcinoma_cell_line.parquet",
34 |                 "epitheloid_cancer_cell_line.parquet",
35 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet",
36 |                 "cd8+_t_cells.parquet", 
37 |                 "adult_t-cell_leukemia_cell_line.parquet",
38 |                 "cd4+cd25-cd45ra-_memory_conventional_t_cells.parquet",
39 |                 "cd4+cd25+cd45ra+_naive_regulatory_t_cells.parquet",
40 |                 "cd4+_t_cells.parquet",
41 |                 "chronic_lymphocytic_leukemia_t-cll_cell_line.parquet",
42 |                 "cd4+cd25+cd45ra-_memory_regulatory_t_cells.parquet",
43 |                 "cd4+cd25-cd45ra+_naive_conventional_t_cells.parquet",
44 |                 "nk_t_cell_leukemia_cell_line.parquet",
45 |                 "melanoma_cell_line.parquet", 
46 |                 "melanocyte.parquet", 
47 |                 "skin_fetal.parquet", 
48 |                 "iris_pigment_epithelial_cells.parquet", 
49 |                 "retina_adult.parquet"
50 |             ],
51 |             "do_aggregate_subnets": false  
52 |         }
53 |     }
54 | }


--------------------------------------------------------------------------------
/experiments/5_0/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "5_0",
 3 |     "nickname": "scaling",
 4 |     "readme": "How do different methods scale in practice?",
 5 |     "question": "1.0",
 6 |     "is_active": true,
 7 |     "data_split_seed": [0],    
 8 |     "type_of_split": ["interventional"],
 9 |     "expand": "ladder",
10 |     "regression_method":[
11 |         "mean",                                     "mean", 
12 |         "median",                                   "median", 
13 |         "GradientBoostingRegressor",                "GradientBoostingRegressor",
14 |         "ExtraTreesRegressor",                      "ExtraTreesRegressor",
15 |         "KernelRidge",                              "KernelRidge",
16 |         "RidgeCV",                                  "RidgeCV", 
17 |         "RidgeCVExtraPenalty",                      "RidgeCVExtraPenalty",
18 |         "LassoCV",                                  "LassoCV",
19 |         "ElasticNetCV",                             "ElasticNetCV",
20 |         "OrthogonalMatchingPursuitCV",              "OrthogonalMatchingPursuitCV",
21 |         "BayesianRidge",                            "BayesianRidge",
22 |         "DCDFG-spectral_radius-linearlr-False",     "DCDFG-spectral_radius-linearlr-False",   
23 |         "DCDFG-spectral_radius-mlplr-False",        "DCDFG-spectral_radius-mlplr-False", 
24 | 
25 |         "GEARS",         "GEARS",                   "RidgeCV",     "RidgeCV"
26 |     ],
27 |     "feature_extraction": [
28 |         "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", 
29 |         "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", "mrna", 
30 |         "mrna", "mrna", "mrna", "mrna", "mrna", "mrna",     
31 |         
32 |         "mrna",       "mrna",                "geneformer", "geneformer"
33 |     ],
34 |     "predict_self": [
35 |         false,  false,  false,  false,  false,  false,  false,  false,  false,  false,  
36 |         false,  false,  false,  false,  false,  false,  false,  false,  false,  false,  
37 |         false,  false,  false,  false,  false,  false,        
38 |         
39 |         true,         true,                true,         true
40 |     ],
41 |     "num_genes":          [ 
42 |         500,  1000,   500,  1000,   500,  1000,   500,  1000,   500,  1000,   
43 |         500,  1000,   500,  1000,   500,  1000,   500,  1000,   500,  1000,   
44 |         500,  1000,   500,  1000,   500,  1000,   
45 |         
46 |            500,  1000,                    500,  1000
47 |     ],
48 |     "eligible_regulators": "all",
49 |     "facet_by": null,
50 |     "color_by": "num_genes",
51 |     "factor_varied": "regression_method",
52 |     "merge_replicates": true,
53 |     "perturbation_dataset": "frangieh_IFNg_v1",
54 |     "network_datasets": {
55 |         "dense":{}
56 |     }
57 | }
58 | 
59 | 


--------------------------------------------------------------------------------
/environment/install.sh:
--------------------------------------------------------------------------------
 1 | # This script sets up a new box (for us, usually an AWS EC2 instance) to run benchmarking analyses. 
 2 |  
 3 | # Get data collections from Zenodo 
 4 | sudo apt install unzip
 5 | # accessory data, e.g. pLI and list of TF names
 6 | wget https://zenodo.org/record/13345104/files/accessory_data.zip && unzip accessory_data.zip &
 7 | # perturbations 
 8 | wget https://zenodo.org/record/13345104/files/perturbation_data.zip && unzip perturbation_data.zip &
 9 | # networks
10 | wget https://zenodo.org/record/13345104/files/network_collection.zip && unzip network_collection.zip &
11 | 
12 | # Get mamba
13 | wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh"
14 | bash Mambaforge-$(uname)-$(uname -m).sh -b
15 | source "${HOME}/mambaforge/etc/profile.d/conda.sh"
16 | 
17 | # Set up Conda env
18 | # If you have a GPU, you can use conda_list_explicit_gpu.txt.
19 | mamba create --name ggrn --file perturbation_benchmarking/environment/conda_list_explicit.txt
20 | conda activate ggrn
21 | # Why --no-deps? Makes sure every version is pinned explicitly and is compatible with the other packages.
22 | pip install vl-convert-python==1.4.0 --no-deps
23 | pip install git+https://github.com/snap-stanford/GEARS@df09d7a --no-deps
24 | # PRESCIENT and CO are now used thru docker, but I am leaving this alone for backwards compatibiliy. 
25 | pip install celloracle==0.12.0 --no-deps
26 | pip install prescient==0.1.0   --no-deps 
27 | pip install geomloss==0.2.3    --no-deps 
28 | pip install git+https://github.com/bowang-lab/scFormer@2df344a --no-deps
29 | pip install 'scib>=1.0.3' --no-deps
30 | pip install biomart==0.9.2 --no-deps
31 | pip install msgpack==1.0.8 --no-deps
32 | pip install tensorboardX>=1.9 --no-deps
33 | pip install ray[tune]==2.6.2 --no-deps
34 | pip install scrublet==0.2.3 --no-deps
35 | pip install pot==0.9.3 --no-deps
36 | pip install wot==1.0.8.post2 --no-deps
37 | 
38 | # We need a specific version of Geneformer. We use `git lfs pull` because we need certain model files locally. 
39 | sudo apt install git-lfs
40 | echo "Cloning geneformer -- this could take a long time."
41 | git lfs install
42 | git clone https://huggingface.co/ctheodoris/Geneformer
43 | cd Geneformer
44 | git checkout 50e921d
45 | pip install . --no-deps
46 | git lfs pull
47 | cd ..
48 | 
49 | # Install our packages
50 | for p in pereggrn_networks pereggrn_perturbations pereggrn ggrn ggrn_backend2 ggrn_backend3 geneformer_embeddings 
51 | do
52 |     git clone http://github.com/ekernf01/${p} --branch v3
53 |     pip install -e $p --no-deps 
54 | done
55 | 
56 | echo "The package installation has finished, but the data download and unzip may still be running in the background, so it may not work right away."
57 | echo "Test your installation:"
58 | echo "    conda activate ggrn"
59 | echo "    pereggrn -h # see the help page"
60 | echo "    pereggrn --experiment_name '1.0_0' --amount_to_do models --no_skip_bad_runs # Run a simple benchmark "
61 | 


--------------------------------------------------------------------------------
/make_figures/psc_tf_due_diligence.py:
--------------------------------------------------------------------------------
 1 | import scanpy as sc
 2 | import pandas as pd
 3 | import numpy as np
 4 | import pereggrn_perturbations
 5 | from pereggrn import experimenter
 6 | from scipy.stats import rankdata
 7 | pereggrn_perturbations.set_data_path("../../perturbation_data/perturbations")
 8 | 
 9 | # This scripe cross-references TF perturbation responses with known targets from Boyer et al. 2005. 
10 | 
11 | distance_to_targets = pd.read_csv("boyer2005targets.csv")[["GENE", "SOX2", "NANOG", "E2F4"]]
12 | distance_to_targets = 
13 | predictions = sc.read_h5ad("../experiments/1.0_1/outputs/predictions/7.h5ad") # condition 7 is LassoCV
14 | observed = pereggrn_perturbations.load_perturbation("nakatake")
15 | observed = experimenter.averageWithinPerturbation(observed)
16 | top_genes = {}
17 | targets = list(set(distance_to_targets["GENE"].unique()).intersection(predictions.var_names))
18 | overlap = pd.DataFrame(index = targets, columns = ["predicted_and_observed", "predicted_and_boyer", "boyer_and_observed"])
19 | for target in targets:
20 |     top_genes[target] = {}
21 |     predicted_logfc     = (predictions[                 :, target].X.mean(axis=1) - observed[observed.obs["is_control"], target].X.mean())
22 |     observed_logfc      = (observed[predictions.obs_names, target].X.mean(axis=1) - observed[observed.obs["is_control"], target].X.mean())
23 |     logfc = pd.DataFrame(
24 |         {
25 |             "observed_logfc": observed_logfc,
26 |             "predicted_logfc": predicted_logfc,
27 |             "observed_absolute_logfc": np.abs(observed_logfc),
28 |             "predicted_absolute_logfc": np.abs(predicted_logfc),
29 |         },
30 |         index = predictions.obs_names, 
31 |     )
32 |     top_genes[target]["predicted"] = logfc.query("@rankdata(-predicted_absolute_logfc)<=5").index
33 |     top_genes[target]["observed"]  = logfc.query("@rankdata(-observed_absolute_logfc)<=5").index
34 |     top_genes[target]["boyer"] = distance_to_targets.query("GENE==@target").T[[False, True, True, True]].set_axis(['distance'], axis = 1).query("distance!='-'").index.values
35 |     overlap.loc[target, "observed_top"] = top_genes[target]["observed"][0]
36 |     try:
37 |         overlap.loc[target, "predicted_top"] = top_genes[target]["predicted"][0]
38 |     except IndexError:
39 |         overlap.loc[target, "predicted_top"] = ""
40 |     overlap.loc[target, "predicted"]              = len(top_genes[target]["predicted"])
41 |     overlap.loc[target, "boyer"]                  = len(top_genes[target]["boyer"])
42 |     overlap.loc[target, "observed"]               = len(top_genes[target]["observed"])
43 |     overlap.loc[target, "predicted_and_observed"] = len(set(top_genes[target]["predicted"]).intersection(top_genes[target]["observed"]))
44 |     overlap.loc[target, "predicted_and_boyer"]    = len(set(top_genes[target]["predicted"]).intersection(top_genes[target]["boyer"]))
45 |     overlap.loc[target, "boyer_and_observed"]     = len(set(top_genes[target]["boyer"]).intersection(top_genes[target]["observed"]))
46 | 
47 | 
48 | [overlap.value_counts(c) for c in overlap.columns]
49 | 
50 | 


--------------------------------------------------------------------------------
/make_figures/figure_s1_timeseries.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scanpy as sc
 4 | import anndata
 5 | import pereggrn_perturbations
 6 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
 7 | import sys 
 8 | import altair as alt
 9 | import os
10 | 
11 | sys.path.append("../../perturbation_data/setup/") # access our reusable data ingestion code
12 | import ingestion
13 | import global_effects
# Collect per-sample perturbation effect summaries for each timeseries dataset, then plot
# (1) the density of the perturbed gene's logFC and (2) that logFC against the mean
# absolute logFC over all genes.
effects = []
for dataset in [
    'definitive_endoderm',
    'fantom4',
    'BETS_A549',
]:    
    print(dataset)
    adata = pereggrn_perturbations.load_perturbation(dataset)
    # .iloc gives the first row by position; plain [0] on a label-indexed Series relies on
    # a deprecated positional fallback.
    pt = adata.obs["perturbation_type"].iloc[0]
    # Saved here and restored after the try block, which may replace adata with a new object.
    uns = adata.uns.copy()
    try:
        if adata.X.sum() == adata.raw.X.sum(): # We filled in log1p normalized data into the .raw slot for datasets obtained from GEARS. 
            adata.raw = anndata.AnnData(X = np.exp(adata.raw.X.toarray()) - 1)
        adata = ingestion.aggregate_by_perturbation(adata, group_by = ["perturbation"], use_raw = True)
        sc.pp.normalize_total(adata)
    except Exception as e:
        # Best effort: if aggregation is not possible, continue with the data as loaded.
        # Report the reason instead of hiding it, and let KeyboardInterrupt/SystemExit through.
        print(f"Could not aggregate {dataset} by perturbation; continuing without aggregation. Reason: {e!r}")
    adata.uns = uns
    adata = ingestion.describe_perturbation_effect(adata, perturbation_type = pt)
    consistency = ingestion.checkConsistency(adata, pt)
    adata.obs["logFC"] = consistency[1]
    print("Consistency:")
    print(pd.Series(consistency[0]).value_counts()) 
    fname = "global_effects/" + dataset + ".txt"
    os.makedirs("global_effects", exist_ok = True)
    global_effects.quantifyEffect(adata, fname = fname, withDEG = False, withMI = False, pseudocount = 1)
    obs = adata.obs[['perturbation', 'is_control', 'logFC', 'logFCNorm2', 'logFCMean', 'expression_level_after_perturbation', 'perturbation_type']].copy()
    
    obs.loc[:, "dataset"] = dataset
    effects.append(obs)

effects = pd.concat(effects)
effects = effects.query("~is_control")
# Rows with logFC == -999 are dropped before plotting.
# NOTE(review): -999 looks like a missing-value sentinel set by ingestion.checkConsistency -- confirm.
# .copy() so that adding the "guide" column below writes to an independent frame.
effects = effects.query("logFC != -999").copy()
effects["guide"] = 0 # x-position of the vertical reference line
alt.data_transformers.disable_max_rows()
# The figures are written into this folder; create it so chart.save does not fail on a fresh checkout.
os.makedirs("timeseries_plots", exist_ok = True)
chart = alt.Chart(effects).transform_density(
    density='logFC',  
    groupby=['dataset', 'perturbation_type'], 
    as_=['logFC', 'density'],  
    extent=[effects['logFC'].min(), effects['logFC'].max()], 
    counts=False  
).mark_area(opacity=0.75).encode(
    x=alt.X('logFC:Q', title='logFC'), 
    y=alt.Y('density:Q', title='Density'),
    color='dataset:N'
).properties(
    width=200,
    height=60
)
vline = alt.Chart(effects).mark_rule(color='black').encode(
    x='guide:Q'
)
chart = (chart + vline).facet(
    row="perturbation_type:N",
)
chart.save('timeseries_plots/fig_effects.svg')


chart = alt.Chart(effects).mark_circle(size=10).encode(
    x=alt.X('logFC:Q', title='logFC of perturbed gene\'s RNA'), 
    y=alt.Y('logFCMean:Q', title='Mean absolute logFC, all genes'),
    color=alt.Color('dataset:N') 
).properties(
    width=200,
    height=170
)
chart.save('timeseries_plots/fig_effects2.svg')
82 | 


--------------------------------------------------------------------------------
/experiments/1.4.1_0/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "unique_id": "1.4.1_0",
 3 |     "nickname": "timeseries celltype networks",
 4 |     "readme": "Comparing cell type specific versus universal networks for timeseries prediction",
 5 |     "question": "1.4.1",
 6 |     "is_active": true,
 7 |     "facet_by": "type_of_split",
 8 |     "color_by": "network_datasets",
 9 |     "factor_varied": "data_split_seed",
10 |     "type_of_split": "timeseries",
11 |     "eligible_regulators": "tfs",
12 |     "num_genes": 2000,
13 |     "regression_method": "RidgeCV",
14 |     "prediction_timescale": ["1,2,3,10"],
15 |     "matching_method": "optimal_transport",
16 |     "network_prior": "restrictive",
17 |     "perturbation_dataset": "definitive_endoderm", 
18 |     "visualization_embedding": "X_pca",
19 |     "network_datasets": { 
20 |         "endoderm":              { "do_aggregate_subnets": true },
21 |         "gtex_rna":              { "do_aggregate_subnets": false },
22 |         "cellnet_human_Hg1332":  { "do_aggregate_subnets": false },
23 |         "cellnet_human_Hugene":  { "do_aggregate_subnets": false },
24 |         "ANANSE_0.5":            { "do_aggregate_subnets": false },
25 |         "ANANSE_tissue_0.5":     { "do_aggregate_subnets": false },
26 |         "humanbase":             { "do_aggregate_subnets": false }, 
27 |         "magnum_compendium_394": { 
28 |             "subnets": [
29 |                 "retinal_pigment_epithelial_cells.parquet",
30 |                 "chronic_myelogenous_leukemia_cml_cell_line.parquet",
31 |                 "teratocarcinoma_cell_line.parquet",
32 |                 "lung_adenocarcinoma_cell_line.parquet",
33 |                 "breast_carcinoma_cell_line.parquet",
34 |                 "embryonic_kidney_cell_line.parquet",
35 |                 "hepatocellular_carcinoma_cell_line.parquet",
36 |                 "epitheloid_cancer_cell_line.parquet",
37 |                 "acute_myeloid_leukemia_fab_m5_cell_line.parquet",
38 |                 "cd8+_t_cells.parquet", 
39 |                 "adult_t-cell_leukemia_cell_line.parquet",
40 |                 "cd4+cd25-cd45ra-_memory_conventional_t_cells.parquet",
41 |                 "cd4+cd25+cd45ra+_naive_regulatory_t_cells.parquet",
42 |                 "cd4+_t_cells.parquet",
43 |                 "chronic_lymphocytic_leukemia_t-cll_cell_line.parquet",
44 |                 "cd4+cd25+cd45ra-_memory_regulatory_t_cells.parquet",
45 |                 "cd4+cd25-cd45ra+_naive_conventional_t_cells.parquet",
46 |                 "nk_t_cell_leukemia_cell_line.parquet",
47 |                 "melanoma_cell_line.parquet", 
48 |                 "melanocyte.parquet", 
49 |                 "skin_fetal.parquet", 
50 |                 "iris_pigment_epithelial_cells.parquet", 
51 |                 "retina_adult.parquet", 
52 |                 "blood_adult.parquet", 
53 |                 "cord_blood_derived_cell_line.parquet",
54 |                 "whole_blood_ribopure.parquet", 
55 |                 "peripheral_blood_mononuclear_cells.parquet",
56 |                 "multipotent_cord_blood_unrestricted_somatic_stem_cells.parquet"
57 |             ],
58 |             "do_aggregate_subnets": false  
59 |         },
60 |         "empty": { "do_aggregate_subnets": true }, 
61 |         "dense": { "do_aggregate_subnets": true }
62 |     }
63 | }
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/make_figures/variance_decomposition.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import scanpy as sc
 4 | import anndata
 5 | import pereggrn_perturbations
 6 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
 7 | import sys 
 8 | import altair as alt
 9 | import os
10 | 
11 | sys.path.append("../../perturbation_data/setup/")
12 | import ingestion
13 | import global_effects
14 | 
def decompose_variance(adata, gene_perturbed, dataset):
    """Compare several variance estimates for the expression of one perturbed gene.

    Args:
        adata: AnnData-like object. The code reads `.X`, `.raw.X`, and the `.obs` columns
            "is_control" and "perturbation".
            NOTE(review): the expm1/log1p calls assume `.X` is log1p-transformed -- confirm.
        gene_perturbed: name of the gene; used both to select the column of `adata` and to
            match against `adata.obs["perturbation"]`.
        dataset: label copied verbatim into the output row.

    Returns:
        A one-row pandas DataFrame with columns:
            poisson: variance of `gene_perturbed` after simulating Poisson counts at 1000
                sequencing depths resampled (with replacement) from the row sums of
                `adata.raw.X`; NaN if the simulation cannot be run.
            control: variance among observations flagged `is_control`.
            others: variance among observations whose perturbation is not `gene_perturbed`.
            this: variance among observations that are controls or have `gene_perturbed` perturbed.
            gene_perturbed, dataset: the inputs, echoed back.
    """
    try:
        depth = np.random.choice( a = np.array(adata.raw.X.sum(1)).reshape(-1), size = 1000, replace = True )
        # Total of the un-logged first observation, used as the normalization target.
        # NOTE(review): presumably every observation is normalized to this same total -- confirm.
        pre_log_scale = np.expm1(adata[0,:].X).sum()
        fraction_of_rna_mapping_to_this_gene = np.expm1(adata[:,gene_perturbed].X).mean()/pre_log_scale
        poisson_raw_counts = np.array([np.random.poisson( lam = l ) for l in fraction_of_rna_mapping_to_this_gene*depth])
        # Put the simulated counts back on the same scale and transform as .X.
        resampled = np.log1p(pre_log_scale*(poisson_raw_counts / depth))
        poisson = np.var(resampled)
    except Exception:
        # Best effort: e.g. no usable .raw slot. Catching Exception (not a bare except)
        # keeps Ctrl-C and SystemExit working during long loops over genes.
        poisson = np.nan
    control = np.var(ingestion.try_toarray(adata[adata.obs["is_control"],gene_perturbed].X))
    others  = np.var(ingestion.try_toarray(adata[adata.obs["perturbation"]!=gene_perturbed,gene_perturbed].X))
    this    = np.var(ingestion.try_toarray(adata[adata.obs["is_control"] | (adata.obs["perturbation"]==gene_perturbed),gene_perturbed].X))
    return pd.DataFrame({
        "poisson": poisson, 
        "control": control, 
        "others": others, 
        "this": this,
        "gene_perturbed": gene_perturbed, 
        "dataset": dataset,
    }, index = [0])
36 | 
# Compute (or load from the on-disk cache) the per-gene variance decomposition of every
# dataset, then plot the mean of each variance source per dataset.
os.makedirs("variance_decomposition", exist_ok = True)
all_variance_decomposition = []
for dataset in [
    'nakatake',
    'freimer',
    'replogle',
    'replogle2',
    'replogle3',
    'replogle4',
    'frangieh_IFNg_v1',
    'frangieh_IFNg_v2',
    'frangieh_IFNg_v3',
    'dixit',
    'adamson',
    'norman',
]:
    print(dataset)
    try:
        # Results are cached per dataset; delete the CSV to force a recomputation.
        variance_decomposition = pd.read_csv(f"variance_decomposition/{dataset}.csv")
    except FileNotFoundError:
        adata = pereggrn_perturbations.load_perturbation(dataset)
        variance_decomposition = pd.concat([
            decompose_variance(adata, gene_perturbed, dataset)
            for gene_perturbed in adata.uns["perturbed_and_measured_genes"]
        ])
        variance_decomposition.to_csv(f"variance_decomposition/{dataset}.csv")
    all_variance_decomposition.append(variance_decomposition)
all_variance_decomposition = pd.concat(all_variance_decomposition)
all_variance_decomposition

# Long format: one row per (gene, dataset, source of variance).
all_variance_decomposition = pd.melt(
    all_variance_decomposition, 
    id_vars=['gene_perturbed', 'dataset'], 
    value_vars=['poisson', 'control', 'others', 'this'], 
    var_name='source_of_variance', 
    value_name='variance'
)

# groupby(...).mean() returns a Series whose group keys live in the index. reset_index()
# turns it into a DataFrame with 'dataset', 'source_of_variance' and 'variance' columns,
# which are the fields the chart encodes.
mean_variance = all_variance_decomposition.groupby(["dataset", "source_of_variance"])["variance"].mean().reset_index()
chart = alt.Chart(mean_variance).mark_point().encode(
    x='dataset:N',  # :N denotes a nominal (discrete) variable
    y='variance:Q',  # :Q denotes a quantitative (continuous) variable
    color='source_of_variance:N'  # Color by source_of_variance, also discrete
).properties(
    title='Scatter plot of Variance by Dataset and Source of Variance'
)

chart.display()


--------------------------------------------------------------------------------
/make_figures/cross_dataset_correlations.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import scipy as sp
  4 | import itertools as it
  5 | import pereggrn_perturbations
  6 | import altair as alt
  7 | 
  8 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
  9 | 
 10 | nakatake = pereggrn_perturbations.load_perturbation('nakatake')
 11 | joung = pereggrn_perturbations.load_perturbation('joung')
 12 | replogle2 = pereggrn_perturbations.load_perturbation('replogle2')
 13 | replogle3 = pereggrn_perturbations.load_perturbation('replogle3')
 14 | dixit = pereggrn_perturbations.load_perturbation('dixit')
 15 | adamson = pereggrn_perturbations.load_perturbation('adamson')
 16 | 
 17 | def cross_correlate_expression(data1, data2):
 18 | 
 19 |     # Subset data to genes common in both datasets
 20 |     common_genes = list(set(data1.var.index) & set(data2.var.index))
 21 |     data1 = data1[:, common_genes].copy()
 22 |     data2 = data2[:, common_genes].copy()
 23 | 
 24 |     # Compute baseline shared gene expressions
 25 |     ctrl1 = np.asarray(data1[data1.obs.is_control].X.mean(axis=0)).squeeze()
 26 |     ctrl2 = np.asarray(data2[data2.obs.is_control].X.mean(axis=0)).squeeze()
 27 |     ctrl1n = data1[data1.obs.is_control].obs.perturbation.unique()
 28 |     ctrl2n = data2[data2.obs.is_control].obs.perturbation.unique()
 29 |     
 30 |     # Focus on the shared genetic perturbations
 31 |     correlations = list()
 32 |     common_perts = list(set(data1.obs.perturbation) & set(data2.obs.perturbation) - set(ctrl1n) - set(ctrl2n))
 33 |     for p in common_perts:
 34 |         trt1 = np.asarray(data1[data1.obs.perturbation==p].X.mean(axis=0)).squeeze()
 35 |         trt2 = np.asarray(data2[data2.obs.perturbation==p].X.mean(axis=0)).squeeze()        
 36 |         lfc1 = trt1 - ctrl1 
 37 |         lfc2 = trt2 - ctrl2                # Log Fold Change - X is log-transformed
 38 |         correlations.append([
 39 |             sp.stats.pearsonr(lfc1, lfc2).statistic,
 40 |             sp.stats.spearmanr(lfc1, lfc2).statistic,
 41 |             p
 42 |         ])
 43 |     correlations = pd.DataFrame(correlations, columns=['Pearson', 'Spearman', 'Perturbation'])
 44 |     return correlations
 45 | 
 46 | 
 47 | 
 48 | 
 49 | CRISPRi = ['replogle2', 'replogle3', 'dixit', 'adamson']
 50 | CRISPRiCorrelations = list()
 51 | for d1, d2 in it.combinations(CRISPRi, r=2):
 52 |     corrs = cross_correlate_expression(eval(d1), eval(d2))
 53 |     corrs['Dataset 1'] = d1
 54 |     corrs['Dataset 2'] = d2
 55 |     CRISPRiCorrelations.append(corrs)
 56 |     print(d1, d2)
 57 | CRISPRiCorrelations = pd.concat(CRISPRiCorrelations)
 58 | 
 59 | 
 60 | 
 61 | 
 62 | OE = ['nakatake', 'joung']
 63 | OECorrelations = list()
 64 | for d1, d2 in it.combinations(OE, r=2):
 65 |     corrs = cross_correlate_expression(eval(d1), eval(d2))
 66 |     corrs['Dataset 1'] = d1
 67 |     corrs['Dataset 2'] = d2
 68 |     OECorrelations.append(corrs)
 69 |     print(d1, d2)
 70 | OECorrelations = pd.concat(OECorrelations)
 71 | 
 72 | 
 73 | 
 74 | 
 75 | CRISPRiCorrelationsLong = CRISPRiCorrelations.melt(id_vars=['Dataset 1', 'Dataset 2', 'Perturbation'],
 76 |                                                    value_vars=['Pearson', 'Spearman'],
 77 |                                                    var_name='Correlation Type',
 78 |                                                    value_name='Value')
 79 | OECorrelationsLong = OECorrelations.melt(id_vars=['Dataset 1', 'Dataset 2', 'Perturbation'],
 80 |                                          value_vars=['Pearson', 'Spearman'],
 81 |                                          var_name='Correlation Type',
 82 |                                          value_name='Value')
 83 | AllCorrelations = pd.concat([CRISPRiCorrelationsLong, OECorrelationsLong])
 84 | AllCorrelations["Datasets"] = AllCorrelations["Dataset 1"] + " vs " + AllCorrelations["Dataset 2"]
 85 | AllCorrelations = AllCorrelations.query("Datasets!='dixit vs adamson'")
 86 | alt.data_transformers.disable_max_rows()
 87 | chart = alt.Chart(
 88 |         AllCorrelations
 89 |     ).mark_boxplot(
 90 |     ).encode(
 91 |         x = "Datasets:N", 
 92 |         y = "Value:Q", 
 93 |         color = "Correlation Type:N",
 94 |         xOffset = "Correlation Type:N",
 95 |     ).properties(
 96 |         width=400, 
 97 |         height=200,
 98 |         title = "Cross-dataset correlations"
 99 |     ) 
100 | chart.save("plots/cross_dataset_correlations.svg")
101 | 


--------------------------------------------------------------------------------
/guiding_questions.txt:
--------------------------------------------------------------------------------
  1 | We wanted experiment identifiers to be concise, stable, and informative. To help compromise among these properties, we maintain this numbered list of guiding questions. These have been numbered the same way since the start of the project; we allow new sub-questions but we seldom change the number of a question after it is added. Each experiment refers explicitly to one of these questions, and the experiment IDs use the numbers below as prefixes. 
 2 | 
 3 | 1. What is the best computational/statistical framework for predicting unseen perturbations of the transcriptome, and what characteristics of that framework are important to its performance?
 4 |     1.0. How important is the specific choice of ML method (e.g. ridge regression, LASSO, kernel regression, neural nets, boosted trees/random forests)?
 5 |     1.1. How dense are the network structures that best predict expression following new perturbations?
 6 |         1.1.1. How harshly should we prune features?
 7 |         1.1.2. Should we allow non-TF regulators?
 8 |     1.2. How does handling of time affect performance? 
 9 |          1.2.1. For dynamic models, is RNA velocity better or worse than modeling based on sample collection time?
10 |          1.2.2. Is it better to match each treatment to the nearest control (estimating total effects), or match each treatment to itself and assume steady state (estimating direct effects)? Is it better to predict results after a single iteration of the model, or a few, or many (steady state)? How do these decisions interact?
11 |     1.3. How much are causal effects or causal structures shared across different cell types? 
12 |         1.3.1. Do estimators treating cell types as "separate", "shared", or "similar" work best?
13 |         1.3.2. See 1.4.1
14 |         1.3.3. Can transfer learning or pre-training approaches such as GeneFormer improve causal effect predictions?
15 |     1.4. About existing drafts of causal networks affecting transcription:
16 |         1.4.0. Do most regulators have similar effects across all their targets?
17 |         1.4.1. Do cell-type-specific draft networks work better on the corresponding cell types?
18 |         1.4.2. What’s the best way to use a given network? Does GEARS beat causal inference approaches?
19 |         1.4.3. Do some sources of network structures work better than others?
20 |         1.4.4. Even if we can't get quantitative fold estimates, do networks predict which genes will change and which stay the same?
21 |         1.4.5. Given gene expression, do existing networks predict which genes were perturbed (DoRothEA copycat)?
22 |     1.5. How do existing methods compare on common tasks?
23 |         1.5.1. How do CellOracle, scKinetics, Dictys, PRESCIENT, RNAForecaster, and simple baselines compare?
24 |         1.5.2. How does OneSC perform in systematic tests?
25 |     1.6. What method of imposing low-rank structure works best, if any?
26 |         1.6.1. Does DCD-FG work?
27 |         1.6.2. Leaving aside causal inference or held-out perturbations, does low-rank structure also help learn fold changes for perturbations, as in FR-Perturb?
28 |     1.7. What method of measuring TF activity works best?
29 |     1.8. What types of data contain more useful signal? How do mundane details (e.g. data splitting) affect apparent performance? 
30 |         1.8.1. Which is more useful: lots of perturbations, or wild-type time-series data? 
31 |         1.8.2. Does pseudobulk aggregation or metacell aggregation or averaging of replicates hurt performance?
32 |         1.8.3. How does variable gene selection affect apparent performance? 
33 |         1.8.4. Is the main problem statistical generalization, or causal identification? Specifically, is it harder when the perturbations in the test set do not appear in the training set, or is it just as hard with a simple random split?
34 |         1.8.5. How do different data splits affect performance (50-50 vs 90-10, different seeds)?
35 |         1.8.6. Some evaluations require revealing all the test data to the predictor -- for instance, any evaluation of heldout data log likelihood. Does this make the task substantially easier?
36 |     1.9. Why does everything fail? Would similar evaluations work if cascading effects were much larger than noise, or if models were correctly specified?
37 | 2. Different model assumptions imply different amounts of perturbations are needed to identify network structure. What do our results imply about identifiability?
38 | 3. Is it possible to obtain calibrated predictive intervals for expression profiles after unseen perturbations? 
39 |     3.1. What are the biggest drivers of uncertainty?
40 |         3.1.1. Measurement noise? 
41 |         3.1.2. Network structure? 
42 |         3.1.3. Causal effect size & direction?
43 |         3.1.4. Systematic errors such as samples failing sequencing or off-target CRISPR effects?
44 | 4. What makes some genes easier to predict and others harder?
45 | 5. How do different methods scale in practice? (CPU time and RAM.)
46 | 
47 | 


--------------------------------------------------------------------------------
/make_figures/figure_s1_effects.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import scanpy as sc
  4 | import anndata
  5 | import pereggrn_perturbations
  6 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
  7 | import sys 
  8 | import altair as alt
  9 | from scipy.stats import rankdata as rank
 10 | 
 11 | sys.path.append("../../perturbation_data/setup/") # access our data ingestion module, which is not currently pip-installable
 12 | import ingestion
 13 | import global_effects
 14 | effects = []
 15 | 
 16 | 
 17 | DATASET_ORDER = [
 18 |     "nakatake",
 19 |     "joung",
 20 |     "norman",
 21 |     "replogle1",
 22 |     "replogle3",
 23 |     "replogle4",
 24 |     "adamson",
 25 |     "replogle2",
 26 |     "freimer",
 27 |     "dixit", 
 28 |     "frangieh_IFNg_v2",
 29 | ]
 30 | 
 31 | # Top-n genes consistency across replicates
 32 | for dataset in ["nakatake", "freimer", "frangieh_IFNg_v3", "replogle1"]:    
 33 |     print(dataset)
 34 |     adata = pereggrn_perturbations.load_perturbation(dataset)
 35 |     baseline = adata.X[adata.obs["is_control"], :].mean(axis = 0)
 36 |     intersection = {}
 37 |     union = {}
 38 |     jaccard = {}
 39 |     for n in [20, 100, 200]:
 40 |         for perturbation in adata.obs["perturbation"].unique():
 41 |             intersection[perturbation] = set(adata.var_names)
 42 |             union[perturbation] = set()
 43 |             for i in adata.obs.query("perturbation == @perturbation").index:
 44 |                 if adata.obs["is_control"][i]:
 45 |                     continue
 46 |                 logfc = adata[i, :].X - baseline
 47 |                 top_n_genes = set(adata.var_names[rank(-np.abs(logfc)) <= n]).copy()
 48 |                 intersection[perturbation] = intersection[perturbation].intersection(top_n_genes)
 49 |                 union[perturbation] = union[perturbation].union(top_n_genes)
 50 |                 jaccard[perturbation] = len(intersection[perturbation]) / len(union[perturbation])
 51 |         print(n)
 52 |         print(np.array([x for x in jaccard.values()]).mean())
 53 | 
 54 | # Effect size and direction
 55 | for dataset in DATASET_ORDER:    
 56 |     print(dataset)
 57 |     adata = pereggrn_perturbations.load_perturbation(dataset)
 58 |     pt = adata.obs["perturbation_type"][0]
 59 |     uns = adata.uns.copy()
 60 |     try:
 61 |         if adata.X.sum() == adata.raw.X.sum(): # We filled in log1p normalized data into the .raw slot for datasets obtained from GEARS. 
 62 |             adata.raw = anndata.AnnData(X = np.exp(adata.raw.X.toarray()) - 1)
 63 |         adata = ingestion.aggregate_by_perturbation(adata, group_by = ["perturbation"], use_raw = True)
 64 |         sc.pp.normalize_total(adata)
 65 |     except:
 66 |         pass
 67 |     adata.uns = uns
 68 |     adata = ingestion.describe_perturbation_effect(adata, perturbation_type = pt)
 69 |     consistency = ingestion.checkConsistency(adata, pt)
 70 |     adata.obs["logFC"] = consistency[1]
 71 |     print("Consistency:")
 72 |     print(pd.Series(consistency[0]).value_counts()) 
 73 |     fname = "global_effects/" + dataset + ".txt"
 74 |     global_effects.quantifyEffect(adata, fname = fname, withDEG = False, withMI = False, pseudocount = 1)
 75 |     obs = adata.obs[['perturbation', 'is_control', 'logFC', 'logFCNorm2', 'logFCMean', 'expression_level_after_perturbation', 'perturbation_type']].copy()
 76 |     
 77 |     obs.loc[:, "dataset"] = dataset
 78 |     effects.append(obs)
 79 | 
 80 | effects = pd.concat(effects)
 81 | effects = effects.query("~is_control")
 82 | effects = effects.query("logFC != -999")
 83 | effects["guide"] = 0
 84 | alt.data_transformers.disable_max_rows()
 85 | chart = alt.Chart(effects).transform_density(
 86 |     density='logFC',  
 87 |     groupby=['dataset', 'perturbation_type'], 
 88 |     as_=['logFC', 'density'],  
 89 |     extent=[effects['logFC'].min(), effects['logFC'].max()], 
 90 |     counts=False  
 91 | ).mark_area(opacity=0.75).encode(
 92 |     x=alt.X('logFC:Q', title='logFC'), 
 93 |     y=alt.Y('density:Q', title='Density'),
 94 |     color=alt.Color('dataset:N', scale=alt.Scale(domain=DATASET_ORDER, scheme = "dark2"))  # replace with your desired order
 95 | ).properties(
 96 |     width=200,
 97 |     height=60
 98 | )
 99 | vline = alt.Chart(effects).mark_rule(color='black').encode(
100 |     x='guide:Q'
101 | )
102 | chart = (chart + vline).facet(
103 |     row="perturbation_type:N",
104 | )
105 | chart.save('plots/fig_effects.svg')
106 | 
107 | 
108 | chart = alt.Chart(effects).mark_circle(size=10).encode(
109 |     x=alt.X('logFC:Q', title='logFC of perturbed gene\'s RNA'), 
110 |     y=alt.Y('logFCMean:Q', title='Mean absolute logFC, all genes'),
111 |     color=alt.Color('dataset:N', scale=alt.Scale(domain=DATASET_ORDER, scheme = "dark2"))  # replace with your desired order
112 | ).properties(
113 |     width=200,
114 |     height=100
115 | )
116 | chart = chart.facet(
117 |     row="perturbation_type:N",
118 | ).resolve_scale(y='independent') 
119 | chart.save('plots/fig_effects2.svg')
120 | 
121 | 


--------------------------------------------------------------------------------
/make_figures/global_effects/replogle.txt:
--------------------------------------------------------------------------------
 1 | deg,mi,mean,norm2,median
 2 | 0.0,2.53220244336788,0.0851376320306763,18.561378309978537,0.03242773106026124
 3 | 0.0,2.3354363553775945,0.08141238311717322,17.655476720300825,0.03249113166813662
 4 | 1.0,2.490995770437818,0.08857901955376693,18.288630103143028,0.035282791202508094
 5 | 0.0,2.449289610634067,0.08893685945566977,18.972449795487556,0.03597049034973923
 6 | 0.0,2.7384663944533503,0.0617633592873864,13.833885910833132,0.02341548321210485
 7 | 6566.0,0.5736099616684778,0.16344021925456315,10.381373834681122,0.11158609351059984
 8 | 9.0,1.5741214482461752,0.12188076589780454,22.337624304148004,0.06149157101199716
 9 | 23.0,1.5125992866163334,0.11711288306207096,21.37518184006788,0.058457629192020646
10 | 0.0,2.3874082913724743,0.12432136792407478,24.761340394866632,0.05315622178952548
11 | 0.0,2.6151125199638443,0.08006957468736565,17.552997344902433,0.030616784868386607
12 | 1.0,2.176322591807747,0.10706080320792109,22.92449815832578,0.04440849678227909
13 | 0.0,2.6655085847419473,0.07177084460139224,16.114358973094127,0.025979662309170398
14 | 0.0,2.622250569057808,0.06794705949446159,14.710025011737287,0.02628586120148789
15 | 0.0,2.4093025288723746,0.07356037164173564,16.193252426744046,0.028669752282965148
16 | 0.0,2.5532375238639036,0.07083735444318189,15.3273041966959,0.028080493564683277
17 | 0.0,2.6929299135049423,0.07005117096980484,16.217615159266217,0.02632713837306155
18 | 4.0,2.4316239224900285,0.12049612542234536,25.333510947706223,0.04772982519871703
19 | 0.0,2.2114906224160804,0.10901283522241911,21.254330364743346,0.04965254326157386
20 | 0.0,2.7540662723576963,0.0758180737306825,17.456677748981953,0.027243412285100597
21 | 0.0,2.4142981922921405,0.07749321709100794,16.587726989664553,0.031969599075122285
22 | 1.0,2.503459253406378,0.08899747768076602,19.399294681009472,0.03396729560137224
23 | 0.0,2.2869240140038567,0.0931245679974095,19.451142179248002,0.039343541404328594
24 | 0.0,2.540360125521646,0.09077471971848658,19.39568265930997,0.03631894476984457
25 | 0.0,2.6700745619904196,0.06637567537442006,15.062996944039243,0.024810393462022223
26 | 0.0,2.52498257077897,0.0777778239105776,16.436681535373335,0.03454566934990724
27 | 0.0,2.378222112648026,0.07358529732288872,15.25238353229441,0.030891321883759114
28 | 0.0,2.7250309522420455,0.06040585854428375,13.604706537920304,0.022381497134956244
29 | 0.0,1.500060540764355,0.15786745086780943,28.42930995382543,0.08512573705969635
30 | 7.0,1.8647697256385587,0.11059686425731534,21.68691535448334,0.05119773652271695
31 | 0.0,2.425943948510006,0.08011129440892355,17.67664559012215,0.031156963927202713
32 | 1.0,2.2550772581169563,0.11998042053370019,24.86143297958747,0.05152616434880946
33 | 54.0,1.8939861666516866,0.1318370395217529,23.82429140264389,0.06519928801964855
34 | 0.0,2.422898732389288,0.07932533938208546,16.60126406040305,0.0355124462001576
35 | 0.0,2.622363746838139,0.0753744298133296,16.849940503019837,0.028329388078202518
36 | 0.0,1.9524384274211086,0.1160466781533375,22.371578128082675,0.05325435950769616
37 | 0.0,2.2861635973115515,0.0979858584359881,19.616881954554533,0.04490560643830245
38 | 0.0,2.7176246369039534,0.057826564849600075,13.201582738556043,0.02148734152363047
39 | 0.0,2.300267766714472,0.09937908237044964,20.803438442217914,0.042829227048800295
40 | 3.0,2.3908362969831214,0.11192811376779843,22.783656608232356,0.0471706120296902
41 | 0.0,2.620373288439854,0.07786323598412682,17.818366699631007,0.028500890621481725
42 | 0.0,2.6345536288636024,0.06681063482790979,14.405803824984352,0.026443984176409054
43 | 0.0,2.782683527054495,0.06956382325805772,15.748230761025967,0.02569063260740482
44 | 2.0,2.6399668948008452,0.08289351032798649,18.791098363854722,0.02927904338580615
45 | 1.0,2.2533229235641747,0.09717749542996486,19.24037245231768,0.0431825915750651
46 | 38.0,2.1241585995274197,0.1293421259325743,23.565835319518047,0.062369558478899204
47 | 0.0,2.6103993735116644,0.0838894380212173,18.451538890948708,0.03193665866933697
48 | 0.0,2.644998649442768,0.07416858606639103,16.968667339537053,0.028507650754719012
49 | 0.0,2.76630450699615,0.05712219823283664,13.090239329751714,0.02083094261160019
50 | 0.0,2.731051453749715,0.07562789805828966,16.836151719705583,0.02852407301772497
51 | 0.0,2.240996350495207,0.1140636073145056,22.89587659448213,0.04990984568367461
52 | 35.0,1.852761638394906,0.15870331397918336,27.635287564807932,0.08327207977346303
53 | 0.0,2.6210457475559297,0.07115245919931588,16.112115807116844,0.028170699945515486
54 | 15.0,2.2787776058116176,0.10455931069427943,19.960056707886725,0.04696374762042114
55 | 0.0,2.573763781743705,0.08197435178401422,17.917952572128662,0.03136883747518937
56 | 0.0,2.587972611847775,0.07220116457801849,16.233346320347362,0.0274326827262376
57 | 0.0,2.5812635513205446,0.06833862970091643,14.577992283911522,0.027544027881836845
58 | 0.0,2.653495107185987,0.11190221021212549,24.25920414237376,0.04192543787242102
59 | 0.0,2.7215462189138004,0.0786076340661232,17.317861907221353,0.02873018454633397
60 | 0.0,2.5293173955825203,0.0730886386363198,15.26428710173191,0.031940094417080164
61 | 0.0,2.5876814020787835,0.06862864493382613,15.277351496735193,0.024767700354738464
62 | 0.0,2.6048379142212115,0.06711990132212776,15.097690420801136,0.025109119358669968
63 | 3.0,1.8464455469278533,0.13677045030443505,25.91788697629064,0.0660582637417314
64 | 0.0,2.666669405370624,0.07515900872460457,17.151767292167396,0.027366483388955844
65 | 


--------------------------------------------------------------------------------
/environment/install.md:
--------------------------------------------------------------------------------
 1 | ### Installation
 2 | 
 3 | We use Conda + Mamba + Docker to manage most dependencies. We offer either a minimal install, which may work cross-platform but lacks access to many GRN methods, or an exact install, which can reproduce all our results but only works on Linux (tested on Rocky Linux 8.8, Ubuntu 20.04, and Ubuntu 22.04). 
 4 | 
 5 | ### Hardware
 6 | 
 7 | Certain models nominally require GPUs, but we have been able to run most experiments using a CPU, sometimes by making minimal changes to PyTorch code. See the [GGRN repo](https://github.com/ekernf01/ggrn) for details on GPU requirements of specific methods. To install with GPU available, we recommend you use the exact or minimal install below; activate the environment; and then install a GPU version of PyTorch `2.x.x`.
 8 | 
 9 | 50GB of disk space and 64GB of RAM are enough to run most experiments. Certain tree-based models or large datasets (Norman especially) may require more RAM. The more benchmarks you run, the more predictions are saved and the more disk space is occupied. To re-run all experiments, we recommend 250GB of disk space. 
10 | 
11 | ### Minimal install
12 | 
13 | On other operating systems or in other environments, exactly reproducing our results is infeasible. However, you should be able to carry out many of our experiments, or your own new experiments, even without all dependencies. Use the commands in [`install_minimal.sh`](https://github.com/ekernf01/perturbation_benchmarking/blob/main/environment/install_minimal.sh). They will install python packages in a new conda environment, and they will download about 1GB of example data from Zenodo.
14 | 
15 | Some notes:
16 | 
17 | - If you want the development version of our packages, you can remove the tag `@v2` from the end of the pip install commands.
18 | - For this minimal installation, we do not download all the data or networks. We only include a couple of examples. You can download the full network and data collections from Zenodo [DOI: 10.5281/zenodo.8071808](https://doi.org/10.5281/zenodo.8071808).
19 | - We require wget, git, and conda to be installed already. If the data download doesn't work with `wget`, you can easily rephrase it to use `curl` instead, or you can download the data manually using a web browser.
20 | - This doesn't try to install all dependencies, so some backends may be unavailable.
21 | - This doesn't try to install docker or singularity. If you want access to containerized methods via ggrn, you need to install Docker yourself.
22 | - This install code is written in bash and tested on Ubuntu Linux. On a Mac, the default shell is zsh, and you may need to run a bash shell (just type `bash`) for everything to work. We are not able to support Windows.
23 | 
24 | ### Exact install
25 | 
26 | To reproduce our computing environment exactly, you can start with a bash shell on a clean Linux box (we have tested Rocky Linux release 8.8, Ubuntu 20.04, and Ubuntu 22.04). Use the commands in [install.sh](https://github.com/ekernf01/perturbation_benchmarking/blob/main/environment/install.sh). This will install mamba and many python packages, and it will download all our data (20GB) from Zenodo. 
27 | 
28 | ```bash
29 | git clone https://github.com/ekernf01/perturbation_benchmarking
30 | source perturbation_benchmarking/environment/install.sh
31 | ```
32 | 
33 | Our install script doesn't try to install docker or singularity. If you want access to containerized methods via ggrn, you need to install Docker yourself. We cannot support you in this step; if it doesn't work, you will need to go to the official Docker instructions or another source. That said, the following commands worked for us on an Amazon EC2 instance running Ubuntu 22.04.
34 | 
35 | ```bash
36 | sudo apt-get update
37 | sudo apt-get install ca-certificates curl
38 | sudo install -m 0755 -d /etc/apt/keyrings
39 | sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
40 | sudo chmod a+r /etc/apt/keyrings/docker.asc
41 | echo \
42 |   "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
43 |   $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
44 |   sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
45 | sudo apt-get update
46 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
47 | # Now configure it so you don't need sudo for every docker command.  
48 | # https://askubuntu.com/questions/477551/how-can-i-use-docker-without-sudo
49 | sudo usermod -aG docker $USER
50 | sg docker -c "bash" 
51 | ```
52 | 
53 | ### How to check the installation
54 | 
55 | **Warning**: data download and unzip will still be running in the background after the installer finishes. It is a ~20GB download. This means **the experiments may not work immediately.** If you see no `network_collection` or `perturbation_data` folders, then you need to wait for the download+unzip to finish.
56 | 
57 | The installation should create a Conda environment called 'ggrn' and several folders in your working directory. At minimum, there will be three data collections and the benchmark experiments.
58 | 
59 | ```
60 | ├── accessory_data
61 | ├── network_collection
62 | ├── perturbation_data
63 | ├── perturbation_benchmarking 
64 | ```
65 | 
66 | You can test your installation by running this.
67 | 
68 | ```bash
69 | cd perturbation_benchmarking
70 | conda activate ggrn
71 | pereggrn -h # see the help page
72 | pereggrn --experiment_name "1.0_0" --amount_to_do models --no_skip_bad_runs # Run a simple benchmark
73 | ```
74 | 
75 | **Warning**: data download and unzip will still be running in the background after the installer finishes. It is a ~20GB download. This means **the experiments may not work immediately.** If you see no `network_collection` or `perturbation_data` folders, then you need to wait for the download+unzip to finish.
76 | 


--------------------------------------------------------------------------------
/make_figures/plotting_script_unused.R:
--------------------------------------------------------------------------------
  1 | library(ggplot2)
  2 | library(dplyr)
  3 | library(stringr)
  4 | library(arrow)
  5 | library(magrittr)
  6 | library(rjson)
  7 | setwd("/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_benchmarking/make_figures/")
  8 | source("plotting_functions.R")
  9 | 
 10 | # networks-only
 11 | {
 12 |   cell_type_matching = rjson::fromJSON(file = "../../accessory_data/cell_type_matching.json")
 13 |   # Does `subnetwork` (a "<network> <subnetwork>" string) appear among the networks that cell_type_matching.json
 14 |   # associates with the cell type of `perturbation_dataset`? Returns TRUE/FALSE; any lookup error yields FALSE.
 15 |   do_subnetworks_match = function(perturbation_dataset, subnetwork){
 16 |     x = cell_type_matching$celltypes[perturbation_dataset]
 17 |     x %<>% unlist
 18 |     # If x is from a network that does not include any relevant subnetwork, the key might be missing from cell_type_matching$networks.
 19 |     # If x is from a network that does include a relevant subnetwork, this code runs as expected: check if this subnetwork is the relevant one.
 20 |     try(
 21 |       { 
 22 |         x = cell_type_matching$networks[x][[1]]
 23 |         # This incomprehensible one-liner converts a nested list of vectors to a tidy dataframe: {a:[1,2], b:2} becomes [[a,a,b], [1,2,2]].
 24 |         x = data.frame(network = Reduce(c, mapply(rep, names(x), sapply(x, length))), 
 25 |                        subnetwork = Reduce(c, x))
 26 |         x = paste(x$network, x$subnetwork)
 27 |         return(subnetwork %in% x)
 28 |       }, 
 29 |       silent = TRUE
 30 |     )
 31 |     return(FALSE)
 32 |   }
 33 |   X = collect_experiments(c("1.4.4_" %>% paste0(c(1:8)) ))
 34 |   X$cell_types_match = mapply(do_subnetworks_match, 
 35 |                               X$perturbation_dataset, 
 36 |                               X$network_datasets)
 37 |   X <- X %>% mutate(chart_x = paste(regression_method, starting_expression, sep = "_"))
 38 |   X$perturbation_dataset %<>% gsub("γ", "g", .)
 39 |   X %<>% make_the_usual_labels_nice()
 40 |   my_levels = unique(c("frangieh\nIFNg v1", "frangieh\nIFNg v2", "frangieh\nIFNg v3", "nakatake", "nakatake\nscrna\nsimulated", X$perturbation_dataset))
 41 |   X$perturbation_dataset %<>% factor(levels = my_levels)
 42 |   # Strip file-format suffixes. fixed = TRUE makes "." a literal dot instead of a regex wildcard.
 43 |   X$network_datasets %<>% gsub(".parquet", "", ., fixed = TRUE)
 44 |   X$network_datasets %<>% gsub(".csv_converted", "", ., fixed = TRUE)
 45 |   X$network_datasets %<>% gsub("_top_filtered", "", ., fixed = TRUE)
 46 |   # The first space-separated word of network_datasets is used as the network source.
 47 |   X$network_source = X$network_datasets %>% 
 48 |     strsplit(" ") %>% 
 49 |     sapply(extract2, 1) 
 50 |   # The second word is used as the tissue; pasting "all" guarantees a second word exists when only one is given.
 51 |   X$network_tissue = X$network_datasets %>% 
 52 |     paste("all") %>%
 53 |     strsplit(" ") %>%
 54 |     sapply(extract2, 2) %>% 
 55 |     tolower %>% 
 56 |     gsub("_", " ", .) %>%
 57 |     gsub("b lymphocyte", "bcell", .) %>%
 58 |     gsub(" memory", "", .) %>%
 59 |     gsub(" regulatory", "", .) %>%
 60 |     gsub(" conventional", "", .) %>%
 61 |     gsub(" naive", "", .) %>%
 62 |     gsub("retinal pigment epithelial", "rpe", .) %>%
 63 |     gsub("chronic lymphocytic leukemia", "", .) %>%
 64 |     gsub("chronic myelogenous leukemia", "", .) %>%
 65 |     gsub("suprapubic", "", .) %>%
 66 |     gsub("lower leg", "", .) %>%
 67 |     gsub("brain .*", "brain", .) %>%
 68 |     gsub("cell line", "", .) %>%
 69 |     gsub("muscleskel", "muscle", .) %>%
 70 |     gsub("pancreatic", "pancreas", .) 
 71 |   # Sources listed here are grouped under the source "other", with the source name used as the tissue label.
 72 |   single_networks = c("celloracle_human",      "magnum_compendium_ppi" , "MARA_FANTOM4"     ,     "STRING"           ,     "magnum_compendium_32" )
 73 |   X$network_tissue[X$network_source %in% single_networks] = X$network_source[X$network_source %in% single_networks] 
 74 |   X$network_source[X$network_source %in% single_networks] = "other"
 75 |   X$network_pretty = paste(
 76 |     as.integer(as.factor(X$network_source)),
 77 |     X$network_tissue
 78 |   )
 79 |   for(dataset in X$perturbation_dataset %>% unique){
 80 |     current_X = subset(X, perturbation_dataset == dataset)
 81 |     # Plot 1: fold-change enrichment (clipped to [-0.5, 0.5]), networks ordered by their median value.
 82 |     networks_by_fc = current_X %>% 
 83 |       dplyr::group_by(network_pretty) %>%
 84 |       dplyr::summarise(fc_targets_vs_non_targets = median(fc_targets_vs_non_targets, na.rm = TRUE)) %>%
 85 |       dplyr::arrange(fc_targets_vs_non_targets)
 86 |     current_X$network_pretty %<>% factor(levels = networks_by_fc$network_pretty)
 87 |     fc_plot = ggplot(current_X) + 
 88 |       geom_boxplot(outlier.shape = NA, 
 89 |                    aes(color = cell_types_match,
 90 |                        x = network_pretty, y = pmax(pmin(fc_targets_vs_non_targets, 0.5), -0.5))) + 
 91 |       theme(axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5)) + 
 92 |       facet_wrap(~network_source, scales = "free", nrow = 1) + 
 93 |       # NOTE(review): x is discrete here; a horizontal reference at y = 0 (geom_hline) may have been intended -- confirm.
 94 |       geom_vline(xintercept = 0) + 
 95 |       # Title and subtitle are set in one call; previously a second ggtitle() silently replaced the first.
 96 |       ggtitle(dataset, subtitle = "Perturbation response enrichment of known regulons") + 
 97 |       # These two layers used to be detached from the plot by a missing "+", so they were never applied.
 98 |       ylab("Log fold change in target genes minus\nlog fold change in other genes") + 
 99 |       theme(axis.text.x = element_text(family = "mono", face = "bold"))
100 |     # Pass the plot explicitly rather than relying on last_plot().
101 |     ggsave(plot = fc_plot, filename = paste0("plots/fig_network_only_", dataset, ".pdf"), width = 14, height = 8)
102 |     # Plot 2: -log10 p-values, networks ordered by their median value (small offset avoids log10(0) in the ordering).
103 |     networks_by_pvalue = current_X %>% 
104 |       dplyr::group_by(network_pretty) %>%
105 |       dplyr::summarise(neg_log10_pvalue = median(-log10(pvalue_targets_vs_non_targets + 0.00001), na.rm = TRUE)) %>%
106 |       dplyr::arrange(neg_log10_pvalue)
107 |     current_X$network_pretty %<>% factor(levels = networks_by_pvalue$network_pretty)
108 |     pvalue_plot = ggplot(current_X) + 
109 |       geom_boxplot(outlier.shape = NA, 
110 |                    aes(color = cell_types_match,
111 |                        x = network_pretty, y = -log10(pvalue_targets_vs_non_targets))) + 
112 |       theme(axis.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5)) + 
113 |       facet_wrap(~network_source, scales = "free", nrow = 1) + 
114 |       geom_vline(xintercept = 0) + 
115 |       ylab("-Log10 p-value of H0: \ntarget genes have same fc as non-targets") + 
116 |       theme(axis.text.x = element_text(family = "mono", face = "bold")) + 
117 |       ggtitle(dataset, subtitle = "Perturbation response enrichment of known regulons")
118 |     # Saved under its own name: both plots used to share one filename, so this one overwrote the fold-change plot.
119 |     ggsave(plot = pvalue_plot, filename = paste0("plots/fig_network_only_pvalue_", dataset, ".pdf"), width = 14, height = 8)
120 |   }
121 | }
122 | 


--------------------------------------------------------------------------------
/make_figures/global_effects/adamson.txt:
--------------------------------------------------------------------------------
 1 | deg,mi,mean,norm2,median
 2 | -999.0,1.5914746752989506,0.9402980431783272,80.66489463473708,0.7102679796061188
 3 | -999.0,1.9318087899804128,0.6758282083383268,61.781840029780795,0.4835094955650109
 4 | -999.0,1.9249115949410478,0.6724494950126417,59.94124558731231,0.5033497653594416
 5 | -999.0,2.0219711260704627,0.5840349139085499,52.94411945427133,0.4241061451970056
 6 | -999.0,1.8922613661078485,0.7161489710420372,65.50590101093931,0.4715083503829949
 7 | -999.0,1.726837413579345,0.8062926097937672,73.08206046493554,0.5267304598077865
 8 | -999.0,1.447305372992366,1.1141202513062227,95.94651855792124,0.8376613266043513
 9 | -999.0,1.8507037124417225,0.6824758899611743,61.77216050661735,0.47652286028414365
10 | -999.0,1.7344687447707292,0.7080150067682857,64.34389099371036,0.4829694518703429
11 | -999.0,1.8417205475676082,0.7953456856397185,72.80230856302951,0.5119966479358862
12 | -999.0,1.7791033847131843,0.8708487170974377,79.26238705614833,0.5673135036231329
13 | -999.0,1.9589046428696335,0.7092332669238077,64.25641029381931,0.4728460125737106
14 | -999.0,2.0239483293011618,0.6610630176157726,60.07961506021345,0.49264387641819385
15 | -999.0,1.9927488041133377,0.6680741053147355,61.353261239606795,0.48381930671564566
16 | -999.0,1.9702931917985653,0.6798582035878346,60.41057181469964,0.4588274136423368
17 | -999.0,1.949990384254539,0.6851844616866452,61.859357483675794,0.48092650456738073
18 | -999.0,2.0091737561777085,0.6152866812185548,55.58070705065005,0.46150747235591183
19 | -999.0,1.9952716613485404,0.7517209348508542,67.95510829544779,0.5488176996145346
20 | -999.0,1.8350210793114705,0.7220683781231743,64.30537670566792,0.5117125758313197
21 | -999.0,1.934779431043772,0.6828190171396469,61.41311215649594,0.4906493981049376
22 | -999.0,1.7382237625562553,0.817524375024505,71.85912146725195,0.59915400242542
23 | -999.0,1.6948172069470762,0.8726550971649565,76.71886205280724,0.6271573285286236
24 | -999.0,1.8496267502299086,0.7531653868964031,67.73920612348142,0.5432100768221629
25 | -999.0,1.4829375753295577,1.0760398323870752,92.69790857348158,0.813174067012588
26 | -999.0,1.8290744463017794,0.7820287049066247,69.9348735353462,0.5689592408257427
27 | -999.0,2.0229068283250085,0.6217493260455793,54.81808795575437,0.4761566215333669
28 | -999.0,2.0727350941126788,0.6484616342485798,58.40361354564193,0.4984418819141664
29 | -999.0,2.038279364228695,0.6005967555295818,54.756859194032366,0.448899013773742
30 | -999.0,1.976331308541767,0.6436719076466788,57.32857310541398,0.4913584000433753
31 | -999.0,1.5980561439624257,0.8792795831511191,76.44724345620084,0.6366285728161758
32 | -999.0,1.827423668933483,0.7639845420687541,68.86716178932963,0.5270872540014772
33 | -999.0,1.856882599249974,0.7462203360439567,70.54588043374459,0.5061468722330975
34 | -999.0,1.5843348053958297,0.8877348034055554,79.12083691468126,0.658159985896507
35 | -999.0,2.0056607398726873,0.6320341827213461,56.0288615867708,0.4701156590765587
36 | -999.0,1.680331746299358,0.8079042196258024,72.19668736925499,0.6067361030675495
37 | -999.0,1.9868161965263593,0.6493278656441207,59.510334833379105,0.4727059859736925
38 | -999.0,2.022586790853879,0.6297152474727035,58.59943625631716,0.46548651347819087
39 | -999.0,2.017948905415038,0.6436480398721556,58.16848055875927,0.47816793037761274
40 | -999.0,1.7783122662071587,0.7655476052788972,68.38704440064329,0.5549992420906138
41 | -999.0,1.9002998575478416,0.666107005728057,62.24487008664547,0.44963131773855436
42 | -999.0,2.051551411144596,0.5755306665788776,53.17461578160182,0.4153700345349749
43 | -999.0,1.8522913786486102,0.6867574195109631,62.6082692425542,0.4758279142934275
44 | -999.0,1.674924463408369,0.887557565763358,79.0910983036986,0.653446133013394
45 | -999.0,2.0352477897881167,0.6402697900974206,57.64754269091291,0.47100440739524785
46 | -999.0,1.9453672397720758,0.7074309896057229,63.05250134908849,0.5280533258235164
47 | -999.0,2.0195727708802185,0.6275857507788403,57.10860402522137,0.4610243650677387
48 | -999.0,1.7444121505098404,0.9041010686822314,82.74469599947018,0.579391314398791
49 | -999.0,1.5561107953590811,0.8529119794266775,77.51840953551712,0.6056435501482237
50 | -999.0,2.0067665577522655,0.6428413359359157,57.60144862493335,0.48208258837511014
51 | -999.0,1.5473442483709072,1.0158661501964203,87.14837394524783,0.7582703428681146
52 | -999.0,1.972796266435563,0.6153464663609847,55.23828315058759,0.4601298124006118
53 | -999.0,1.6075974200251246,0.9013498694128812,80.2621626752748,0.6423521064253392
54 | -999.0,2.016025772148362,0.6412250043558921,58.70573002668311,0.4773238446705782
55 | -999.0,1.8285098315480641,0.7733560050414434,69.2577640119751,0.5752548469312637
56 | -999.0,2.007001709023173,0.6313707566518987,56.77168360665907,0.46491770105035685
57 | -999.0,1.9969532596836115,0.6766822743927187,61.25948444713688,0.49986743084497454
58 | -999.0,2.0196044351223956,0.645648783573736,58.20585426394427,0.48336565771978957
59 | -999.0,1.9890964547206846,0.6361012024105648,56.92477259004084,0.46606493500972623
60 | -999.0,2.0383875774184,0.6200481666915277,55.52307621947284,0.46735982803242004
61 | -999.0,1.9703385440297472,0.6904410142836638,61.621854049130796,0.528849962846023
62 | -999.0,2.0027914312900488,0.6636116608143904,59.38580827597671,0.480498868946735
63 | -999.0,1.9688634014749224,0.6344534694829904,55.7638353318526,0.4884345387235139
64 | -999.0,1.9869045205321627,0.6847391675076052,62.30583572727342,0.49393583539597313
65 | -999.0,1.999007973625627,0.6722282655335571,60.067934797969436,0.5071774554965496
66 | -999.0,1.9466837111137507,0.6456105131845772,57.86628688328466,0.4813283336181945
67 | -999.0,1.933105008214174,0.6839355388447359,61.52780909189652,0.5179922604357907
68 | -999.0,2.0194061624692834,0.6604749536308715,59.64462100249662,0.4890410367999095
69 | -999.0,2.0309355530605693,0.6477285046351372,59.17648118237441,0.47539778418427436
70 | -999.0,1.978658091244812,0.7072666568225668,63.64067759590031,0.5298945111661153
71 | -999.0,1.9528116584688289,0.6755024065877345,61.606664494996615,0.4882026854558382
72 | -999.0,1.723341402851696,0.8377561720884014,75.47088304026562,0.589329456794277
73 | -999.0,1.9840901542586111,0.6608379058386858,58.96199200965332,0.49967958222853615
74 | -999.0,1.9603468468118306,0.6649667557057085,59.19913690481098,0.4919745295139954
75 | -999.0,1.9995247413225532,0.665538116851975,59.22881605126685,0.48987785236759285
76 | -999.0,1.728397438071219,0.8869082108702844,78.15361303971969,0.6574513331952943
77 | -999.0,1.9843334345369592,0.6314043497278519,56.872528860048064,0.47442205321815445
78 | -999.0,2.008455219064121,0.6201323316378644,56.508523837755774,0.45662131491016256
79 | -999.0,2.0236181289879474,0.6070675056113858,55.18154319004647,0.45450842804290015
80 | -999.0,1.9548984852536293,0.6977743954006845,61.24813191979027,0.5515170960491625
81 | -999.0,2.0390745765898006,0.6079028022474319,55.11654738751056,0.47661355764047303
82 | -999.0,1.9529017760890959,0.7358623005727601,64.9717899833591,0.5193232885049748
83 | 


--------------------------------------------------------------------------------
/make_figures/.load_perturbation(dataset):
--------------------------------------------------------------------------------
  1 | Help on module ingestion:
  2 | 
  3 | NNAAMMEE
  4 |     ingestion
  5 | 
  6 | FFUUNNCCTTIIOONNSS
  7 |     aaggggrreeggaattee__bbyy__ppeerrttuurrbbaattiioonn(adata: anndata._core.anndata.AnnData, group_by: list, use_raw=True)
  8 |         Compute pseudo-bulk expression by adding raw counts.
  9 |         
 10 |         Args:
 11 |             adata (anndata.AnnData): Object with raw counts in adata.raw.X
 12 |             group_by (list of str): names of categorical columns in adata.obs to group by. Typically starts with "perturbation".
 13 |         
 14 |         Returns:
 15 |             anndata.AnnData: Pseudo-bulk expression
 16 |     
 17 |     cchheecckkCCoonnssiisstteennccyy(adata: anndata._core.anndata.AnnData, perturbationType: str = 'overexpression', group: str = None, verbose: bool = False, do_return_pval=False, show_plots=False)
 18 |         Check whether the gene that was perturbed is actually 
 19 |         measured to be higher (if overexpressed) or lower (if knocked
 20 |         down) or nearly zero (if knocked out).
 21 |         If an observation is a control or if the perturbed gene is not measured, 'N/A' is labeled. 
 22 |         If a perturbagen's expression is higher or lower than the median control (matching 
 23 |         the direction of intended perturbation), it is labeled True. Otherwise, False. 
 24 |         
 25 |         Args:
 26 |             adata (anndata.AnnData): the object to operate on. adata.X is expected to be normalized but not log-transformed. 
 27 |                 It is expected to be a dense array, not a sparse e.g. scipy CSR. 
 28 |             perturbationType (str): one of {"overexpression", "knockout", "knockdown"}
 29 |             group (str, default None): a column in adata.obs to indicate sub-group of
 30 |                                        the treatment and the control.
 31 |             verbose (bool): show a swarmplot noting the difference between the control
 32 |                             and the treatment, if the perturbation direction and expression
 33 |                             level are discordant.
 34 |     
 35 |     cchheecckkPPeerrttuurrbbaattiioonnEEffffeeccttMMeettrriiccCCoorrrreellaattiioonn(adata: anndata._core.anndata.AnnData, metrics)
 36 |         Compute correlation between different measures of global effect size
 37 |     
 38 |     ccoommppuutteeCCoorrrreellaattiioonn(adata: anndata._core.anndata.AnnData, verbose: bool = False, group: str = None)
 39 |         Compute the correlation between biological replicates on scale of log fold change. For each 
 40 |         set of perturbation, the final correlation score is the median of 
 41 |         correlation between all pair-wise combinations of perturbation expression
 42 |         and control expression. Both Spearman and Pearson correlation are
 43 |         computed.
 44 |         
 45 |         This assumes the existence of "is_control" in adata.obs.
 46 |     
 47 |     ccoonnvveerrtt__eennss__ttoo__ssyymmbbooll(ensembl_ids, gtf, strip_version=False)
 48 |         Convert ensembl gene id's (incomprehensible) into Entrez gene symbols (e.g. GAPDH)
 49 |         
 50 |         Args:
 51 |         
 52 |         - gtf: path to a GTF file with transcript annotations, e.g. Gencode V35.
 53 |         - ensembl_ids: iterable with inputs.
 54 |         - strip_version: ensembl ID's can be like 'ENSG01234.5' or like 'ENSG01234'. The '.5' is the version, i.e. the number of times this id has changed. Sometimes you want to strip this off (strip_version = True). More on ensembl ID's:
 55 |             
 56 |             https://useast.ensembl.org/Help/Faq?id=488#:~:text=An%20Ensembl%20stable%20ID%20consists,(version).&text=The%20second%20part%20is%20a,(object%20type)(identifier).
 57 |     
 58 |     ddeessccrriibbee__ppeerrttuurrbbaattiioonn__eeffffeecctt(adata: anndata._core.anndata.AnnData, perturbation_type, multiple_genes_hit: bool = None) -> anndata._core.anndata.AnnData
 59 |         Add details about perturbation's effect on the targeted genes
 60 |         
 61 |         Args:
 62 |             adata (anndata.AnnData): A perturbation dataset
 63 |             perturbation_type (typing.Union): one of {"overexpression", "knockout", "knockdown"}, or if mixed, an iterable of length equal to n_samples.
 64 |             multiple_genes_hit: Set to True if there are observations with multiple genes perturbed.
 65 |         Raises:
 66 |             ValueError: Triggered by invalid perturbation types.
 67 |         
 68 |         Returns:
 69 |             anndata.AnnData: adata with columns filled in for 'expression_level_after_perturbation' and 'perturbation_type'
 70 |     
 71 |     ddeesseeqq22NNoorrmmaalliizzaattiioonn(counts_df)
 72 |         Equivalent to DESeq2:::counts.DESeqDataSet; counts(x, normalized=T)
 73 |     
 74 |     ddeesseeqq22__ssiizzee__ffaaccttoorrss(counts_df)
 75 |         Calculate DESeq size factors
 76 |         median of ratio to reference sample (geometric mean of all samples)
 77 |         
 78 |         https://github.com/broadinstitute/pyqtl/blob/master/qtl/norm.py
 79 |         References:
 80 |          [1] Anders & Huber, 2010
 81 |          [2] R functions:
 82 |               DESeq::estimateSizeFactorsForMatrix
 83 |     
 84 |     rreeaadd__ccmmaapp(expression_file, gene_metadata, instance_metadata)
 85 |         Read a dataset in CMAP's HDF-based gctx format, returning an AnnData object.
 86 |     
 87 |     ssiimmpplliiffyy__ccaatteeggoorriiccaall(x: pandas.core.frame.DataFrame, column: str, max_categories: int = 20, filler: str = 'other', new_column: str = None)
 88 |         Mark less frequent categories as other. Accepts and returns a dataframe.
 89 |     
 90 |     ssiimmuullaattee__ssiinnggllee__cceellllss(adata: anndata._core.anndata.AnnData, num_cells: int, counts_per_cell: int)
 91 |         Split bulk RNA samples into a simulated homogeneous population of cells.
 92 |         
 93 |         Args:
 94 |             adata (anndata.AnnData): bulk RNA perturbation data
 95 |             num_cells (int): how many cells to split each sample into
 96 |             counts_per_cell (int): how many reads or UMIs to simulate per cell
 97 |         
 98 |         Returns:
 99 |             anndata.AnnData: perturbation data with the same variables and num_cells times as many samples
100 |     
101 |     ttrryy__ttooaarrrraayy(x)
102 |     
103 |     vviissuuaalliizzeeLLooggFFCC(fc, pval=None, show_plots=False)
104 |     
105 |     vviissuuaalliizzeePPeerrttuurrbbaattiioonnEEffffeecctt(adata, metrics, TFDict, EpiDict)
106 |         Visualize effect size versus type of perturbation, e.g. TF versus non-TF
107 |     
108 |     vviissuuaalliizzeePPeerrttuurrbbaattiioonnMMeettaaddaattaa(adata: anndata._core.anndata.AnnData, x: str, y: str, style=None, hue=None, markers=None, xlim=None, s=30)
109 |         Plot characteristics of each perturbation, e.g. correlation between replicates or global effect size.
110 | 
111 | FFIILLEE
112 |     /home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/perturbation_data/setup/ingestion.py
113 | 
114 | 


--------------------------------------------------------------------------------
/make_figures/replicate_correlations_and_stereotypical_effects.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import pereggrn_perturbations
  4 | from pereggrn import experimenter
  5 | import altair as alt
  6 | import re
  7 | import os
  8 | pereggrn_perturbations.set_data_path('../../perturbation_data/perturbations')
  9 | DATASET_ORDER = [
 10 |   "nakatake",
 11 |   "nakatake\nscrna\nsimulated",
 12 |   "joung",
 13 |   "norman",
 14 |   "replogle1",
 15 |   "replogle3",
 16 |   "replogle4",
 17 |   "adamson",
 18 |   "replogle2",
 19 |   "replogle2 large effect",
 20 |   "replogle2 tf only",
 21 |   "replogle2_large_effect",
 22 |   "replogle2_tf_only",
 23 |   "replogle2\nlarge effect",
 24 |   "replogle2\nlarge\neffect",
 25 |   "replogle2\ntf only",
 26 |   "freimer",
 27 |   "dixit", 
 28 |   "frangieh_IFNg_v1",
 29 |   "frangieh\nIFNg v1",
 30 |   "frangieh\nIFNg\nv1",
 31 |   "frangieh IFNg v1",
 32 |   "frangieh_IFNg_v2",
 33 |   "frangieh\nIFNg v2",
 34 |   "frangieh\nIFNg\nv2",
 35 |   "frangieh IFNg v2",
 36 |   "frangieh_IFNg_v3",
 37 |   "frangieh\nIFNg v3",
 38 |   "frangieh\nIFNg\nv3",
 39 |   "frangieh IFNg v3"
 40 | ]
 41 | 
 42 | # this eccentric code keeps the datasets in a consistent order across figures.
 43 | datasets_used = [d for d in DATASET_ORDER if d in [ 
 44 |     "nakatake",
 45 |     "replogle1",
 46 |     "replogle2",
 47 |     "replogle2\ntf only",
 48 |     "replogle2\nlarge effect",
 49 |     "replogle3",
 50 |     "replogle4",
 51 |     "joung",
 52 |     "freimer",
 53 |     "frangieh IFNg v3",
 54 |     "norman",
 55 |     "adamson",
 56 |     "dixit",
 57 | ]]
 58 | 
 59 | # See GEARS paper fig. S11
 60 | # I used this interactively, but it is not used in the final version of the paper.
 61 | norman_blacklist = {
 62 |     "IKZF3", 
 63 |     "PRDM1",
 64 |     "PTPN1",
 65 |     "C3orf72",
 66 |     "NIT1",
 67 |     "RREB1",
 68 |     "CDKN1C",
 69 |     "CNN1",
 70 |     "PTPN13",
 71 |     "JUN",
 72 |     "ZBTB1",
 73 | }
 74 | 
 75 | # This is for enrichment analysis of genes with high logFC between controls and the average of the rest of the data.
 76 | os.makedirs("control_vs_all_perts", exist_ok=True)
 77 | for dataset in datasets_used:    
 78 |     print(f"Processing dataset: {dataset}")
 79 |     adata = pereggrn_perturbations.load_perturbation(re.sub("\n| ", "_", dataset))
 80 |     controls = adata.obs_names[adata.obs["is_control"]]
 81 |     treateds = adata.obs_names[~adata.obs["is_control"]]
 82 |     lfc = adata[treateds, :].X.mean(axis=0) - adata[controls, :].X.mean(axis=0)
 83 |     lfc = np.array(lfc).flatten()
 84 |     lfc = pd.DataFrame({"lfc": lfc, "gene": adata.var_names}).sort_values("lfc")
 85 |     lfc.head(100).to_csv(f"control_vs_all_perts/bottom_genes_{dataset}.csv")
 86 |     lfc.tail(100).to_csv(f"control_vs_all_perts/top_genes_{dataset}.csv")
 87 | 
 88 | # This is for a figure testing if independent control-treatment pairs have consistent log fold changes when the treatment is the same.
 89 | correlations = dict()
 90 | for dataset in datasets_used:    
 91 |     adata = pereggrn_perturbations.load_perturbation(re.sub("\n| ", "_", dataset))
 92 |     correlations[dataset] = pd.DataFrame(index = range(len(adata.obs["perturbation"].unique())), columns = ["correlation", "dataset"])
 93 |     i=0
 94 |     controls = adata.obs_names[adata.obs["is_control"]]
 95 |     control_expression = adata[controls, :].X.mean(axis=0)
 96 |     for p in adata.obs["perturbation"].unique():
 97 |         try:
 98 | 
 99 |             treatment1, treatment2 = np.random.choice(adata.obs.loc[adata.obs["perturbation"]==p, :].index, size=2, replace=False)
100 |         except ValueError:
101 |             # no replicates :[
102 |             correlations[dataset].loc[i, "correlation"] = np.nan
103 |             continue
104 |         if adata.obs.loc[treatment1, "is_control"]:
105 |             # Don't include controls as treatments, even if they have a weird name like 'CAG-rtTA35-IH'
106 |             continue
107 |         lfc1 = adata[treatment1, :].X.mean(axis=0) - control_expression
108 |         lfc2 = adata[treatment2, :].X.mean(axis=0) - control_expression
109 |         correlations[dataset].loc[i, "correlation"] = np.corrcoef(lfc1, lfc2)[0, 1]
110 |         correlations[dataset].loc[i, "perturbation"] = p
111 |         correlations[dataset].loc[i, "dataset"] = dataset
112 |         i = i + 1
113 | 
114 | correlations = pd.concat(correlations.values())
115 | correlations = correlations.loc[~correlations["correlation"].isna(), :]
116 | print("Lowest correlations:")
117 | print(correlations.sort_values("correlation").head(10))
118 | print("Numbers of replicates:")
119 | print(correlations["dataset"].value_counts())
120 | box_plot = alt.Chart(correlations).mark_boxplot(color = "black").encode(
121 |     x=alt.X('dataset:N', title='', sort = datasets_used),
122 |     y=alt.Y('correlation:Q', title='Pearson correlation between log FC from independent treatments'),
123 | ).properties(
124 |     title='Replicate correlations', 
125 | )
126 | red_line = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='red').encode(
127 |     y='y:Q'
128 | )
129 | 
130 | # Combine the box plot and the red line
131 | final_chart = box_plot + red_line
132 | 
133 | # Save the chart
134 | final_chart.save("plots/genome_wide_logfc_correlation.svg")
135 | 
136 | # This is for a figure testing if independent control-treatment pairs have consistent log fold changes even across different treatments.
137 | 
138 | datasets_used = [d for d in DATASET_ORDER if d in [ # this eccentric code keeps the datasets in a consistent order across figures.
139 |     "nakatake", 
140 |     "replogle1",
141 |     "replogle2",
142 |     "replogle2\ntf only",
143 |     "replogle2\nlarge effect",
144 |     "replogle3",
145 |     "replogle4",
146 |     "joung",
147 |     "freimer"
148 | ]]
149 | num_pairs = 100
150 | correlations = dict()
151 | for dataset in datasets_used:    
152 |     adata = pereggrn_perturbations.load_perturbation(re.sub("\n| ", "_", dataset))
153 |     correlations[dataset] = pd.DataFrame(index = range(num_pairs), columns = ["correlation", "dataset"])
154 |     i=0
155 |     while i < num_pairs:
156 |         control1 = np.random.choice(adata.obs_names[adata.obs["is_control"]])
157 |         control2 = np.random.choice(adata.obs_names[adata.obs["is_control"]])
158 |         treatment1 = np.random.choice(adata.obs.loc[~adata.obs["is_control"], "perturbation"])
159 |         treatment2 = np.random.choice(adata.obs.loc[~adata.obs["is_control"], "perturbation"])
160 |         if treatment1==treatment2:
161 |             continue
162 |         if control1==control2:
163 |             continue  
164 |         lfc1 = adata[control1].X.mean(axis=0) - adata[adata.obs["perturbation"]==treatment1].X.mean(axis=0)
165 |         lfc2 = adata[control2].X.mean(axis=0) - adata[adata.obs["perturbation"]==treatment2].X.mean(axis=0)
166 |         correlations[dataset].loc[i, "correlation"] = np.corrcoef(lfc1, lfc2)[0, 1]
167 |         correlations[dataset].loc[i, "dataset"] = dataset
168 |         i = i + 1
169 | correlations = pd.concat(correlations.values())
170 | box_plot = alt.Chart(correlations).mark_boxplot(color = "black").encode(
171 |     x=alt.X('dataset:N', title='', sort = datasets_used),
172 |     y=alt.Y('correlation:Q', title='Pearson correlation between log FC within 100 independent control-treatment pairs'),
173 | ).properties(
174 |     title='Correlations by dataset', 
175 | )
176 | red_line = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='red').encode(
177 |     y='y:Q'
178 | )
179 | 
180 | # Combine the box plot and the red line
181 | final_chart = box_plot + red_line
182 | 
183 | # Save the chart
184 | final_chart.save("plots/stereotypical_responses.svg")
185 | 


--------------------------------------------------------------------------------
/make_figures/global_effects/frangieh_IFNg_v3.txt:
--------------------------------------------------------------------------------
  1 | deg,mi,mean,norm2,median
  2 | 0.0,1.8640777755412477,1.5609329866998158,199.4889866614173,1.5781911104919941
  3 | 0.0,2.2381782310639156,1.5564469075932919,195.37887534668164,1.5753619003456205
  4 | 0.0,2.2146129900918607,1.55387206964863,195.12431583019907,1.572317066612451
  5 | 0.0,2.17864857726856,1.5502577618885491,195.15675427216254,1.571027818437378
  6 | 0.0,2.143805661124758,1.5415083491298502,194.26777568511886,1.565942632867314
  7 | 0.0,2.1616418787607525,1.5424024594928842,194.14200699678977,1.569884079539798
  8 | 0.0,2.091400328116583,1.5469775280898237,195.39082977570428,1.5747049875440255
  9 | 0.0,2.1926744656597017,1.5608596898537956,196.17649290039483,1.5758254662918882
 10 | 0.0,2.1567343549140965,1.5650105652992246,196.9810360237488,1.5827025167134265
 11 | 0.0,2.2268631647167627,1.5519783931717825,194.9262772999478,1.5729230242628418
 12 | 0.0,1.9083184207749087,1.5222697612760214,194.4373118943775,1.550593464708645
 13 | 0.0,2.149422410667196,1.5392542935981004,193.97333882326117,1.5608210821100927
 14 | 0.0,2.21148604629312,1.5666434999331502,196.70284397013413,1.5795388880897012
 15 | 0.0,2.0063769936579625,1.525228861670856,193.53789348173277,1.5477483913971124
 16 | 0.0,2.142495553126543,1.534509983342969,193.41342573926613,1.5554688938353964
 17 | 0.0,1.8485792334911988,1.4899423138441248,190.35177059499426,1.5155990721321806
 18 | 0.0,2.1231270032310547,1.541349667999332,194.30697260052753,1.561486847479887
 19 | 0.0,2.1340725720831424,1.546294606032218,194.97147511138792,1.5683646486024583
 20 | 0.0,2.2446588325246086,1.5623723518062338,196.06174733634003,1.5770216589154689
 21 | 0.0,2.177385304838936,1.5478919762354075,194.77235644615556,1.5671498260045515
 22 | 0.0,2.137426054458756,1.5503454918715107,195.3734166350972,1.5663646175213937
 23 | 0.0,2.1076097974365418,1.558895246909559,196.6611550837507,1.5796028618030715
 24 | 0.0,2.265773660982213,1.5463120420780834,193.95632845087187,1.5658130529843945
 25 | 0.0,2.1139825111613684,1.5412720163156162,194.56389567959533,1.564791555845289
 26 | 0.0,1.415685342313327,1.405580766200538,194.31861736424278,1.3602309221984261
 27 | 0.0,2.087598892522954,1.5402147821081675,194.52740382550317,1.5604597994048544
 28 | 0.0,1.8510104523288886,1.5062488164002255,192.48318965271915,1.5177931629164139
 29 | 0.0,2.1234203490603583,1.5543498809044658,195.73130687365915,1.5718041451355966
 30 | 0.0,2.1215835910210528,1.5397446675282571,194.30678975491705,1.5629532834016175
 31 | 0.0,2.1108168980619126,1.565319161411174,197.13144848394558,1.582658132552802
 32 | 0.0,2.1357522726466915,1.5414106242670618,194.1715346343759,1.5645937044023777
 33 | 0.0,2.182388591495276,1.5609345887006152,196.23827570258575,1.5750510522307217
 34 | 0.0,2.184889890754682,1.5638902387877258,196.7063106246717,1.5829483811202238
 35 | 0.0,2.1581864657997376,1.5569068285744685,196.03154361251327,1.5687567884944296
 36 | 0.0,2.1364393112605864,1.562613548728406,196.83498177962963,1.5820432375634383
 37 | 0.0,2.148673690985009,1.546425620499626,194.85703701720652,1.5648129689736225
 38 | 0.0,1.9415834740694804,1.5459645754313236,196.75077582805162,1.5610250928779343
 39 | 0.0,1.8361447747468795,1.560322620699423,199.66746227032564,1.5662067089104021
 40 | 0.0,2.13869411989653,1.570424760448402,197.7835706771045,1.5889761549055694
 41 | 0.0,2.193614661642634,1.544623271715702,194.19198847334093,1.5677133876934737
 42 | 0.0,2.245068641740921,1.5658308249739197,196.45173938548572,1.5829112716002596
 43 | 0.0,2.154032028855592,1.56165374630263,196.75305058460683,1.5792645561811411
 44 | 0.0,2.1886111324660185,1.5493526551414794,194.82306283818133,1.5693485477334077
 45 | 0.0,2.157687531835807,1.5579216189638376,196.22474598266695,1.5756410131151717
 46 | 0.0,2.10261525186135,1.5408640983874744,194.61664219306303,1.5629619786836157
 47 | 0.0,2.1984340576171832,1.5377352652151677,193.36428496681577,1.5576610492034721
 48 | 0.0,2.026423254465132,1.5448108191356402,195.66685920458838,1.5670222107141463
 49 | 0.0,1.9740747933618819,1.5391376944403437,195.71288151301516,1.568027871944885
 50 | 0.0,2.155652742303728,1.5559045198438703,195.89936409991617,1.5699470176881383
 51 | 0.0,1.646400479754516,1.515089296619962,198.02852272428197,1.5120570619167317
 52 | 0.0,1.9912384923285922,1.525964098125269,193.71475766218472,1.5474136252160302
 53 | 0.0,2.013124700838948,1.5452507794095025,195.86909924769154,1.5705184527122131
 54 | 0.0,1.5185687940215273,1.4831225745256398,197.18336414414497,1.4791928631633757
 55 | 0.0,2.0126105646808057,1.5695954000737264,198.50693076797475,1.5787671685704154
 56 | 0.0,2.1508967320830856,1.55688314547353,196.08418092978462,1.577116390813067
 57 | 0.0,2.142483910617741,1.5625690040209503,196.8885207558313,1.574741117521875
 58 | 0.0,2.0703794142750196,1.5482914072608331,195.73410952279326,1.5675275634646697
 59 | 0.0,1.4384651376526867,1.4281337091413215,195.29146796121395,1.3561108694172535
 60 | 0.0,2.162085793333776,1.5529603138247763,195.49730860400592,1.5732664440822757
 61 | 0.0,2.2625088917034875,1.570442028611116,196.9230360203464,1.585241176356665
 62 | 0.0,2.078323240553724,1.547451412344719,195.68090736721518,1.5763017567141577
 63 | 0.0,2.2806125505915307,1.5540507887338513,194.85596970158656,1.568189682507506
 64 | 0.0,2.0370231231124682,1.547338301703512,195.94346507709105,1.5712998474610274
 65 | 0.0,1.645333124590206,1.4998445768487476,196.18719896733361,1.519768098521916
 66 | 0.0,2.083480856444875,1.5373446908167252,194.29045989983356,1.5620324044482161
 67 | 0.0,2.2168321913794373,1.5487188161243786,194.56348086444126,1.5691249594744308
 68 | 0.0,2.121720310089325,1.5376791203052502,193.81073381107933,1.5601841707997217
 69 | 0.0,2.0263778677198268,1.536791967307301,194.67200725431542,1.5609471400162183
 70 | 0.0,2.12281088384689,1.5438380193375987,194.68726469072567,1.5613545813159133
 71 | 0.0,2.1655282295791833,1.5514898751311164,195.27435375304844,1.5701866057852747
 72 | 0.0,1.9698977448149457,1.5483071400933763,196.6593573948986,1.5660242540257614
 73 | 0.0,2.211111560470838,1.552942821633374,195.11211160343956,1.5692227649907298
 74 | 0.0,1.8925633364572512,1.5099657221344673,193.2098644070017,1.5378067848811237
 75 | 0.0,2.2329806404979715,1.561646227176743,196.07998011291474,1.577544728765365
 76 | 0.0,1.932581241189833,1.5637396892245976,199.2857959672798,1.5785682209965772
 77 | 0.0,1.9884012416036378,1.54257371472264,195.90619144575348,1.5626696596500922
 78 | 0.0,2.0854399207017966,1.560007839941076,196.91610245539837,1.574321313707657
 79 | 0.0,2.051918099759816,1.547937310125488,195.805379708888,1.569007553057012
 80 | 0.0,2.180962217080367,1.5542753368941402,195.5032006529314,1.5737854607689523
 81 | 0.0,2.0008385104951243,1.5594144893351818,197.60987128564545,1.5740475424871678
 82 | 0.0,1.444989872195319,1.411348588538996,192.47441633541158,1.406311168786681
 83 | 0.0,2.1229153302538792,1.5387922632032018,194.11973625164177,1.5627768393160022
 84 | 0.0,2.2097709127458875,1.5450657393731981,194.19935625463415,1.5674390930166782
 85 | 0.0,1.8539703772374319,1.6061603662587003,204.58572212209245,1.637949543765706
 86 | 0.0,1.983342528007456,1.6027114324898186,202.62395956223804,1.625357931746318
 87 | 0.0,1.5307977256206562,1.5089235045521172,200.19753595708247,1.5122452229075352
 88 | 0.0,2.1522226470115298,1.5620301414972428,196.73107449446087,1.5805386268525883
 89 | 0.0,2.0047772048870023,1.540411638202022,195.40913242619797,1.567571576916326
 90 | 0.0,2.1328941484679826,1.5490660615669747,195.22898615963723,1.5734127614645266
 91 | 0.0,1.9068323333274704,1.5854450292101352,201.20729765411272,1.6148566609105695
 92 | 0.0,1.9408611731592433,1.6154995145390716,204.63659432390375,1.6390101940336206
 93 | 0.0,1.883510473067413,1.5324767698541615,195.88453761119865,1.5505516563263808
 94 | 0.0,2.228971397448362,1.5616555102125103,196.03457521235217,1.5749355465309498
 95 | 0.0,2.1967887189198265,1.54873531551021,194.66415182438772,1.5690105597328485
 96 | 0.0,2.0798270863101327,1.5481242184122967,195.45975613828253,1.56627418816735
 97 | 0.0,2.160358117621133,1.5448997808198106,194.35859412496424,1.5636346555472325
 98 | 0.0,2.23395005336625,1.5684884461288386,196.86755039384283,1.5878436667337712
 99 | 0.0,2.0878533060432334,1.5463923111524376,195.372820848338,1.5699116915471563
100 | 0.0,2.060805118562739,1.5529261098483382,196.45974974040328,1.575149763186573
101 | 0.0,2.1852108711151663,1.5537795520530573,195.33739994157963,1.5673692056664343
102 | 0.0,2.1723956840688214,1.5410405201946737,193.89711378657466,1.5629236160801026
103 | 0.0,2.1624195251909413,1.5475789514410003,194.82016725886558,1.568032621774313
104 | 0.0,2.0354585234984484,1.5188927705879876,192.36077571022926,1.5389618795041367
105 | 0.0,2.314670027953225,1.572605157225235,196.9004402955716,1.5858425661991622
106 | 0.0,2.1883470281653365,1.5612987149448965,196.27897552026596,1.579306925991565
107 | 0.0,1.9051920193767289,1.4914015202328919,190.60867099558462,1.5026761521341778
108 | 0.0,2.0246838792004067,1.5411527660237252,195.27137876489357,1.5644692725655591
109 | 0.0,1.5099178911014048,1.47995025382225,196.4747953615395,1.4806169283389723
110 | 0.0,1.4771256138313278,1.4631864825076955,196.08452507242382,1.4429037548897312
111 | 0.0,1.8760693326819093,1.5209786657496862,194.7605046706265,1.5470790679371103
112 | 0.0,2.125850944619379,1.5420893253847805,194.43031476821017,1.5628876526637296
113 | 0.0,2.102146433738809,1.5585068390944619,196.6185261672021,1.5798999214577591
114 | 0.0,2.1598454095443245,1.5526100991586054,195.46109768232776,1.5720211673940145
115 | 0.0,1.8042287046922387,1.5171228197889328,195.54491993849456,1.536775072138354
116 | 0.0,2.0763786450300525,1.5481833853488909,195.55984905330055,1.5690829674366582
117 | 0.0,2.1186284207655506,1.5473479668061056,194.8013755920857,1.5629967636456727
118 | 0.0,2.1198114886154613,1.5429854734515656,194.55923629587397,1.565548364006912
119 | 0.0,2.169343041302846,1.551425505489008,195.1944610738628,1.569942451505478
120 | 0.0,2.274963584737022,1.5656366749692452,196.26420432759627,1.5820869172247243
121 | 0.0,1.8446331337189035,1.5279904847437138,196.20108351194324,1.5518236001678176
122 | 0.0,1.5439144357588774,1.5535761858648764,204.6463168705661,1.5510111903439598
123 | 0.0,1.6794377222856218,1.517095329213177,197.45913845634124,1.537604208660047
124 | 0.0,1.8956130468865373,1.5454190279012916,197.45924105261773,1.560811639495884
125 | 0.0,2.084316984259716,1.5403456665761686,194.722976383967,1.5531142135872757
126 | 0.0,1.7838343091296807,1.5347511987171807,197.71940894337837,1.5574776801357277
127 | 0.0,1.5791435901431778,1.4669246652338017,193.9073958107502,1.479681230430519
128 | 0.0,1.934234259454923,1.5403941939989703,196.1069055267106,1.5501171023858566
129 | 0.0,2.0365987777890777,1.545911034678182,195.80842417558142,1.5656023454305925
130 | 0.0,2.231803397549866,1.5671170997572619,196.7197092480343,1.5829647298896372
131 | 0.0,2.0894025097241067,1.5483521400399898,195.5572400231854,1.5670255742976025
132 | 0.0,2.0818271393018324,1.5279270572794672,193.02902788318465,1.5533094128647558
133 | 0.0,1.5277103925377589,1.44095498812414,193.4678854514472,1.3858780585375379
134 | 0.0,2.0569673937760005,1.5402442261374258,194.90389865369,1.5638912846276043
135 | 0.0,2.1196826740155754,1.5510783908474288,195.57200072476016,1.5719270009568445
136 | 0.0,2.147735312075022,1.5562321072355592,195.68866512797405,1.5733503568007277
137 | 0.0,2.1970969346970284,1.5452486941452033,194.15781106348393,1.5627433267211983
138 | 0.0,1.90784636046063,1.5335951103181935,195.4021003601159,1.552574322971833
139 | 0.0,2.089718383433232,1.5579241573402456,196.76210783706733,1.5745865252198137
140 | 0.0,1.5338245175505505,1.4801049835269258,196.09806846140276,1.4777718468140926
141 | 0.0,1.5308564546094194,1.3525228356574912,193.27090950490745,1.3193996730822846
142 | 0.0,1.5740577681329069,1.519394116265145,200.22614521142762,1.5109289163727921
143 | 0.0,2.2081093772223452,1.5418842645016535,193.84098315418208,1.5648391281572702
144 | 0.0,2.158177853294038,1.5653843071799556,197.090758091642,1.5862503173350966
145 | 0.0,2.0803415089071353,1.5408799990974118,194.57653704456138,1.5620267655820013
146 | 0.0,2.1404350468012394,1.5533705921847842,195.7559663197285,1.5725786221332942
147 | 0.0,2.26432898137135,1.5522891819303313,194.76016761530136,1.567694073438641
148 | 0.0,2.1380670718146373,1.5457834156194816,194.7552176609661,1.5698965331240486
149 | 0.0,2.148562792196919,1.5501633152454923,195.28620977123848,1.5680156606521105
150 | 0.0,2.0640027592937678,1.5595367461321166,197.1046049370052,1.5773436281839779
151 | 0.0,1.9932499822399365,1.5352175756273032,194.60104408996276,1.5525199849708773
152 | 0.0,2.1526367527154306,1.550473562593641,195.2744890381642,1.575663595946704
153 | 0.0,2.055216784492464,1.5409194924980316,195.0036174412795,1.563945518194417
154 | 0.0,1.8091797714249829,1.5268951675031466,196.22278438034814,1.5466601451185265
155 | 0.0,2.040230128855401,1.540350868108229,195.0585128544913,1.5626756665942685
156 | 0.0,2.2871549400801134,1.567064822886356,196.38189758161954,1.578833953285653
157 | 0.0,2.1972096438085535,1.551962229765701,195.0511175356856,1.568072582965678
158 | 0.0,2.2134806169440115,1.5407181638311975,193.66522447500836,1.5635672171368549
159 | 0.0,2.19983356493271,1.5574078966211726,195.85337542383704,1.5772312587748345
160 | 0.0,1.893275252973158,1.5480731863258566,197.921888322951,1.5736067258312068
161 | 0.0,2.206737681843113,1.5553667273548109,195.40959842211802,1.5736292234807383
162 | 0.0,2.184137916042091,1.5666278500826556,196.9174009458554,1.581896104794101
163 | 0.0,2.176280143009812,1.5528729668552068,195.26156914783107,1.5729586392853119
164 | 0.0,2.1379025140711483,1.560572213202117,196.63125329266384,1.5761261153704718
165 | 0.0,2.196993960309918,1.55282370090618,195.27822173922985,1.5736670096851357
166 | 0.0,2.1319891530903425,1.5429302892424888,194.55391823892364,1.5677271376576394
167 | 0.0,2.097896118991125,1.5552981188022972,196.21999803583267,1.5761179859402965
168 | 0.0,2.046249138843732,1.5443778328025561,195.157552635341,1.5582929677659882
169 | 0.0,1.8850594527676086,1.6040034659609628,203.80393468967938,1.632870602719037
170 | 0.0,2.0284295881276098,1.5455540634437015,195.51017503299556,1.5671990385138497
171 | 0.0,2.2179642919768616,1.5508818085102298,194.88435403268508,1.5691804406525827
172 | 0.0,2.1684226449454256,1.553239808917264,195.5320778305028,1.5741744189817077
173 | 0.0,2.1582494316044363,1.5436382196705518,194.4885708865621,1.5692245883109626
174 | 0.0,2.1294479453574633,1.5455015726838326,194.89203792700417,1.570972237834125
175 | 0.0,2.176103195754049,1.5551445796728127,195.60683472967204,1.5786037948247857
176 | 0.0,2.1943146792388486,1.561915744534123,196.36636401906836,1.5785805990560702
177 | 0.0,1.822766444361539,1.5057347897988325,193.85929823740096,1.5292511955784214
178 | 0.0,2.168893277275868,1.543970541663893,194.32813142687357,1.5657897243740715
179 | 0.0,1.9831645986439677,1.5416768152123117,195.9416089280911,1.5615482473431592
180 | 0.0,2.109807952977918,1.55108106082843,195.80492140493843,1.5721647966489303
181 | 0.0,2.1264764288467894,1.550727616247975,195.43408241786932,1.5691919140016939
182 | 0.0,1.5523206302514265,1.508428003572635,198.1678179429505,1.5120452450619024
183 | 0.0,2.1448644580884344,1.5555607805576206,195.9932274995643,1.577986050468815
184 | 0.0,2.1005359651462374,1.5548282866601129,196.35332306469067,1.5760821883909935
185 | 0.0,2.15470270549886,1.5574981376851567,196.11394128318008,1.5724905987839823
186 | 0.0,2.175010482738152,1.5432376877535128,194.28607121259859,1.5605623043932368
187 | 0.0,1.5224836407887739,1.472653882651759,195.36682904709403,1.4718125579522716
188 | 0.0,2.0801032596431868,1.5397058742447813,194.71982518235208,1.5618416510577042
189 | 0.0,2.1972788586802516,1.5437038722319774,194.07166205738181,1.56697246476462
190 | 0.0,2.1932624723251157,1.5544561018926852,195.4312600624217,1.5738567292170003
191 | 0.0,2.045184730299986,1.5628419483484843,197.2900739488058,1.579576675594899
192 | 0.0,1.8880620927252965,0.9635258228901327,128.05342258383,0.9789828926428183
193 | 0.0,2.0846155294096533,1.5291712339438326,193.35462153105976,1.55459497254967
194 | 0.0,1.853229487638607,1.5340635997555556,196.4838058902042,1.5478681489558024
195 | 0.0,1.7401475392447134,1.5215835638751685,197.10886143383803,1.5424434097034552
196 | 0.0,2.0437962682133666,1.5484623658914356,196.01709980964904,1.5709140057625275
197 | 0.0,2.158723268827176,1.5517773640361345,195.4835719000345,1.5674839750329848
198 | 0.0,2.202084184989831,1.5593504415085588,195.93439439434738,1.5757596981647635
199 | 


--------------------------------------------------------------------------------