├── JAX_color_code.R ├── JAX_help_code.R ├── README.md ├── Section_1_basic_analysis ├── .DS_Store ├── .Rhistory ├── Detecting_doublets_by_subclustering.py ├── Dimension_reduction_scanpy.py ├── Integrating_adjacent_timepoints.py ├── Run_Scrublet.py ├── step1_Removing_doublets.R ├── step2_Plot_UMAP.R ├── step3_Pseudobulk.R ├── step4_Estimate_cell_num.R └── step5_Check_batch_effect.R ├── Section_2_posterior_embryo ├── Embedding_scanpy_posterior_embryo.py ├── Run_geosketch.py ├── step1_posterior_embryo.R ├── step2_NMP_mesoderm.R ├── step3_Notochord.R ├── step4_Gut.R ├── step5_Genes_correlated_with_PC.R ├── step6_somites_validation.R └── step7_Npm1_signature.R ├── Section_3_kidney_mesenchyme ├── Embedding_Renal.py ├── Embedding_scanpy_kidney.py ├── Embedding_scanpy_lateral_plate_mesoderm.py ├── Embedding_scanpy_patterned_mesoderm_somites_26_34.py ├── Embedding_scanpy_patterned_mesoderm_somites_5_20.py ├── Spatial_mapping.py ├── step1_kidney.R ├── step2_Lateral_plate_mesoderm.R ├── step3_Spatial_mapping.R ├── step4_Patterned_mesoderm.R └── step5_Renal_two_subpopulations.R ├── Section_4_eye ├── Embedding_scanpy_eye.py └── Eye.R ├── Section_5_neuroectoderm ├── Embedding_early_neurons.py ├── Embedding_neuroectoderm_derivatives.py ├── Embedding_patterned_neuroectoderm.py ├── step1_Patterned_neuroectoderm.R ├── step2_Early_neurons.R ├── step3_Early_neurons_PCA.R ├── step4_Mapping_neuroectoderm_derivatives.R ├── step5_Astrocytes.R └── step6_Key_TFs.R ├── Section_6_development_tree ├── .DS_Store ├── Dimension_reduction_subsystem.py ├── Graph_robust.py ├── Two_examples.py ├── step1_Early_stage_graph.R ├── step2_Late_stage_graph.R ├── step3_Create_graph.R ├── step4_Two_examples.R └── step5_MNN_robustness.R ├── Section_7_key_TFs ├── .Rhistory ├── HSCs_progenitors.py ├── step1_Key_TFs.R ├── step2_Key_genes.R ├── step3_Summarize_results.R ├── step4_Pseudotime_endoderm.R └── step5_HSCs_progenitors.R ├── Section_8_birth_series ├── Embedding_birth_series.py ├── 
Embedding_individual_celltype.py ├── step1_Celltypes_shift_after_birth.R ├── step2_Embedding_birth_series.R ├── step3_Celltypes_changing_over_Csection.R ├── step4_DEGs_birth.R └── step5_Comparing_NatBirth.R └── spatial_mapping.tar.gz /README.md: -------------------------------------------------------------------------------- 1 | # JAX_code 2 | Scripts used to analyze the dataset from the paper: A single-cell transcriptional timelapse of mouse embryonic development, from gastrula to pup (https://www.biorxiv.org/content/10.1101/2023.04.05.535726v1.abstract) 3 | 4 | The house mouse, Mus musculus, is an exceptional model system, combining genetic tractability with close homology to human biology. Gestation in mouse development lasts just under three weeks, a period during which its genome orchestrates the astonishing transformation of a single cell zygote into a free-living pup composed of >500 million cells. Towards a global framework for exploring mammalian development, we applied single cell combinatorial indexing to profile the transcriptional states of 12.4 million nuclei from 83 precisely staged embryos spanning late gastrulation (embryonic day 8 or E8) to birth (postnatal day 0 or P0), with 2-hr temporal resolution during somitogenesis, 6-hr resolution through to birth, and 20-min resolution during the immediately postpartum period. From these data (E8 to P0), we annotate dozens of major cell clusters and hundreds of cell types and perform deeper analyses of the unfolding of the posterior embryo during somitogenesis as well as the ontogenesis of the kidney, mesenchyme, retina, and early neurons. Finally, we leverage the depth and temporal resolution of these whole embryo snapshots, together with other published data, to construct and curate a rooted tree of cell type relationships that spans mouse development from zygote to pup. 
Throughout this tree, we systematically nominate sets of transcription factors (TFs) and other genes as candidate drivers of the in vivo differentiation of hundreds of mammalian cell types. Remarkably, the most dramatic shifts in transcriptional state are observed in a restricted set of cell types in the hours immediately following birth, and presumably underlie the massive changes in physiology that must accompany the successful transition of a placental mammal to extrauterine life. 5 | 6 | The data used in these scripts can be found at https://shendure-web.gs.washington.edu/content/members/cxqiu/public/backup/jax/download/other/ 7 | -------------------------------------------------------------------------------- /Section_1_basic_analysis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/Section_1_basic_analysis/.DS_Store -------------------------------------------------------------------------------- /Section_1_basic_analysis/.Rhistory: -------------------------------------------------------------------------------- 1 | ?monocle3:::normalize_expr_data 2 | monocle3:::normalize_expr_data 3 | -------------------------------------------------------------------------------- /Section_1_basic_analysis/Detecting_doublets_by_subclustering.py: -------------------------------------------------------------------------------- 1 | 2 | ### We performed two rounds of clustering and used the doublet annotations to identify subclusters that are enriched in doublets 3 | 4 | import scanpy as sc 5 | import pandas as pd 6 | import numpy as np 7 | import scrublet as scr 8 | import os, sys 9 | 10 | WORK_PATH = "./" 11 | 12 | os.mkdir(os.path.join(WORK_PATH, "doublet_cluster")) 13 | 14 | fdata = pd.read_csv("df_gene.csv", index_col = 0) 15 | 16 | adata1 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_1.mtx')) 17 | pdata1 = 
### Finish annotating batch 1 (its count matrix `adata1` and the gene table
### `fdata` are read just above), then load batches 2-6 the same way.
pdata1 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_1.csv'), index_col=0)
adata1.obs = pdata1
adata1.var = fdata


def _load_batch(batch_id):
    """Read one batch's count matrix and attach its cell and gene annotations.

    Note that `index_col` is an argument of `pd.read_csv`; handing it to
    `os.path.join` raises a TypeError before any data is read.
    """
    batch = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_%d.mtx' % batch_id))
    batch.obs = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_%d.csv' % batch_id), index_col=0)
    batch.var = fdata
    return batch


def _cluster_on_hvgs(raw, n_neighbors, resolution=None):
    """Select highly variable genes from `raw` counts, then embed and cluster.

    Two passes over the same raw counts:
      1. drop chrX/chrY genes, normalize, log-transform and pick the top 3,000
         highly variable genes;
      2. restart from the raw counts restricted to those genes, normalize,
         log-transform, scale, then run PCA (30 PCs), kNN graph, Louvain and UMAP.

    `resolution=None` leaves the Louvain resolution at scanpy's default.
    Returns the processed AnnData with `umap_1`/`umap_2` added to `.obs`;
    `raw` itself is left untouched because both passes work on explicit copies.
    """
    ### remove sex genes
    autosomal = raw[:, ~raw.var['chr'].isin(['chrX', 'chrY'])].copy()
    ### high variable genes
    sc.pp.filter_genes(autosomal, min_cells=1)
    sc.pp.normalize_total(autosomal, target_sum=1e5)
    sc.pp.log1p(autosomal)
    sc.pp.highly_variable_genes(autosomal, n_top_genes=3000)
    hvg_ids = list(autosomal.var.loc[autosomal.var['highly_variable'], 'gene_id'])

    embedded = raw[:, raw.var['gene_id'].isin(hvg_ids)].copy()
    sc.pp.normalize_total(embedded, target_sum=1e5)
    sc.pp.log1p(embedded)
    sc.pp.scale(embedded)
    sc.tl.pca(embedded, svd_solver='arpack', n_comps=30)
    sc.pp.neighbors(embedded, n_neighbors=n_neighbors, n_pcs=30)
    sc.tl.louvain(embedded, resolution=resolution)
    sc.tl.umap(embedded, min_dist=0.1)

    embedded.obs['umap_1'] = list(embedded.obsm['X_umap'][:, 0])
    embedded.obs['umap_2'] = list(embedded.obsm['X_umap'][:, 1])
    return embedded


adata_orig = adata1.concatenate(*[_load_batch(batch_id) for batch_id in range(2, 7)])

### Round 1: cluster all cells together
adata = _cluster_on_hvgs(adata_orig, n_neighbors=50)
adata.obs.to_csv(os.path.join(WORK_PATH, 'doublet_cluster', "global.csv"))

obs_all = adata.obs
print(obs_all['louvain'].value_counts())
### sorted so that the processing order is reproducible between runs
cluster_list = sorted(set(obs_all['louvain']))

### Round 2: re-cluster every global cluster on its own, at a higher resolution
for xx, cluster in enumerate(cluster_list, start=1):
    print('Processing: ' + str(xx) + '/' + str(len(cluster_list)))
    include_cell = list(obs_all.loc[obs_all['louvain'] == cluster, 'sample'])
    adata_cluster = adata_orig[adata_orig.obs['sample'].isin(include_cell)]
    adata = _cluster_on_hvgs(adata_cluster, n_neighbors=30, resolution=3)
    name = 'adata.obs.louvain_' + cluster + '.csv'
    adata.obs.to_csv(os.path.join(WORK_PATH, 'doublet_cluster', name))
### File: Section_1_basic_analysis/Dimension_reduction_scanpy.py

###############################################################################################################
### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on 11.4M dataset ###
###############################################################################################################

### Of note, this data should include 24,552 genes and 11,441,407 cells
### Hint: I suggest to request >500GB for the following analysis.

import gc
import os
import sys
import time

import numpy as np
import pandas as pd
import scanpy as sc

start_time = time.time()

WORK_PATH = './'


def _report(message):
    """Print a progress message followed by the hours elapsed since `start_time`."""
    print(message)
    print(f"{(time.time() - start_time) / 3600:.4f}hours")


### Read the four parts of the dataset and stack them into one object
parts = [sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_%d.h5ad' % part_id))
         for part_id in range(1, 5)]
adata = parts[0].concatenate(*parts[1:])
del parts
gc.collect()
_report("Done reading data ...")

sc.pp.normalize_total(adata, target_sum=1e4)
_report("Done normalization by total counts ...")

sc.pp.log1p(adata)
_report("Done log transformation ...")

sc.pp.highly_variable_genes(adata, n_top_genes=2500)
_report("Done finding highly variable genes ...")

adata = adata[:, adata.var.highly_variable]
_report("Done filtering in highly variable genes ...")

sc.pp.scale(adata, max_value=10)
_report("Done scaling data ...")

sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
_report("Done performing PCA ...")

sc.pp.neighbors(adata, n_neighbors=50, n_pcs=30)
_report("Done computing neighborhood graph ...")

### 3D UMAP first; its coordinates are copied into .obs because the 2D run
### below overwrites .obsm['X_umap']
sc.tl.umap(adata, min_dist=0.1, n_components=3)
_report("Done UMAP ...")
for axis, column in enumerate(('UMAP_1', 'UMAP_2', 'UMAP_3')):
    adata.obs[column] = list(adata.obsm['X_umap'][:, axis])

sc.tl.umap(adata, min_dist=0.1, n_components=2)
_report("Done UMAP ...")
for axis, column in enumerate(('UMAP_2d_1', 'UMAP_2d_2')):
    adata.obs[column] = list(adata.obsm['X_umap'][:, axis])

### Leiden clustering at two resolutions; each run rewrites .obs['leiden'],
### so the labels are saved under a resolution-specific name right away
for resolution in (1, 2):
    sc.tl.leiden(adata, resolution=resolution, n_iterations=2)
    adata.obs['leiden_res_%d' % resolution] = adata.obs['leiden']
    _report("Done clustering using res = %d ..." % resolution)

adata.obs.to_csv(os.path.join(WORK_PATH, 'adata_scale.obs.csv'))
pd.DataFrame(adata.obsm['X_pca']).to_csv(os.path.join(WORK_PATH, 'adata_scale.pca.csv'))

adata.write(os.path.join(WORK_PATH, 'adata_scale.h5ad'), compression="gzip")
_report("Done writing data ...")
#################################################################################################
### For each major trajectories, we further performed sub-clustering to get higher resolution ###
#################################################################################################

import gc  # needed by gc.collect() below when this section is run on its own
import os
import sys
import time

import numpy as np
import pandas as pd
import scanpy as sc

start_time = time.time()

WORK_PATH = './'


def _report(message):
    """Print a progress message followed by the hours elapsed since `start_time`."""
    print(message)
    print(f"{(time.time() - start_time) / 3600:.4f}hours")


adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))

adata_orig = adata_1.concatenate(adata_2, adata_3, adata_4)
del adata_1, adata_2, adata_3, adata_4
gc.collect()

### Of note, please read df_cell.rds and then write it to df_cell.csv in R
pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col=0)
adata_orig.obs = pdata

trajectory_list = ["Neuroectoderm_and_glia",
                   "Intermediate_neuronal_progenitors",
                   "Eye_and_other",
                   "Ependymal_cells",
                   "CNS_neurons",
                   "Mesoderm",
                   "Definitive_erythroid",
                   "Epithelial_cells",
                   "Endothelium",
                   "Muscle_cells",
                   "Hepatocytes",
                   "White_blood_cells",
                   "Neural_crest_PNS_glia",
                   "Adipocytes",
                   "Primitive_erythroid",
                   "Neural_crest_PNS_neurons",
                   "T_cells",
                   "Lung_and_airway",
                   "Intestine",
                   "B_cells",
                   "Olfactory_sensory_neurons",
                   "Cardiomyocytes",
                   "Oligodendrocytes",
                   "Mast_cells",
                   "Megakaryocytes",
                   "Testis_and_adrenal"]

for trajectory_id in trajectory_list:
    print("Processing: %s" % trajectory_id)

    in_trajectory = (adata_orig.obs["major_trajectory"] == trajectory_id).values
    if not in_trajectory.any():
        ### nothing to embed; the steps below cannot run on an empty matrix
        print("No cells found for %s, skipping ..." % trajectory_id)
        continue

    ### explicit copy: the steps below modify the data in place and must not
    ### touch (or depend on a view of) adata_orig
    adata = adata_orig[in_trajectory].copy()
    _report("Done reading data ...")

    sc.pp.normalize_total(adata, target_sum=1e4)
    _report("Done normalization by total counts ...")

    sc.pp.log1p(adata)
    _report("Done log transformation ...")

    sc.pp.highly_variable_genes(adata, n_top_genes=2500)
    _report("Done finding highly variable genes ...")

    adata = adata[:, adata.var.highly_variable].copy()
    _report("Done filtering in highly variable genes ...")

    sc.pp.scale(adata, max_value=10)
    _report("Done scaling data ...")

    sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
    _report("Done performing PCA ...")

    sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
    _report("Done computing neighborhood graph ...")

    ### 3D UMAP (min_dist = 0.3); coordinates are copied into .obs because the
    ### 2D run below overwrites .obsm['X_umap']
    sc.tl.umap(adata, min_dist=0.3, n_components=3)
    _report("Done UMAP ...")
    adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:, 0])
    adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:, 1])
    adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:, 2])

    ### 2D UMAP (min_dist = 0.1)
    sc.tl.umap(adata, min_dist=0.1, n_components=2)
    _report("Done UMAP ...")
    adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:, 0])
    adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:, 1])

    sc.tl.leiden(adata, resolution=1, n_iterations=2)
    adata.obs['leiden_res_1'] = adata.obs['leiden']
    _report("Done clustering using res = 1 ...")

    sc.tl.leiden(adata, resolution=5, n_iterations=2)
    adata.obs['leiden_res_5'] = adata.obs['leiden']
    _report("Done clustering using res = 5 ...")

    adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv' % trajectory_id))
    pd.DataFrame(adata.obsm['X_pca']).to_csv(os.path.join(WORK_PATH, '%s_adata_scale.pca.csv' % trajectory_id))

    adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad' % trajectory_id), compression="gzip")
    _report("Done writing data ...")

    ### release this trajectory before the next subset is copied
    del adata
    gc.collect()
### File: Section_1_basic_analysis/Integrating_adjacent_timepoints.py

### For every batch listed in batch_list.txt: normalize, select highly variable
### genes, run PCA and UMAP (3D and 2D), and write the cell annotations, the
### processed object and the PC coordinates to WORK_PATH.

import os
import sys

import numpy as np
import pandas as pd
import scanpy as sc
from annoy import AnnoyIndex

WORK_PATH = './'

### One batch name per line; blank lines are ignored so that a trailing newline
### does not turn into an empty batch name
with open(os.path.join(WORK_PATH, "batch_list.txt")) as batch_file:
    example_list = [name for name in (line.rstrip() for line in batch_file) if name]

for example_i in example_list:
    print(example_i)

    adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx' % example_i))
    fdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_gene.csv' % example_i), index_col=0)
    pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv' % example_i), index_col=0)
    adata.obs = pdata
    adata.var = fdata

    print(adata.shape)

    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=2500)
    ### explicit copy: scaling below modifies the matrix in place
    adata = adata[:, adata.var.highly_variable].copy()
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
    sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)

    ### 3D UMAP; coordinates are copied into .obs because the 2D run below
    ### overwrites .obsm['X_umap']
    sc.tl.umap(adata, min_dist=0.3, n_components=3)
    adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:, 0])
    adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:, 1])
    adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:, 2])

    sc.tl.umap(adata, min_dist=0.3, n_components=2)
    adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:, 0])
    adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:, 1])

    adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv' % example_i))

    adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad' % example_i), compression="gzip")

    X = adata.obsm['X_pca']
    print(X.shape)
    np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv' % example_i), X, delimiter=",", fmt='%1.3f')
### File: Section_1_basic_analysis/Run_Scrublet.py

### We performed Scrublet to detect doublets

import os
import sys

import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr

WORK_PATH = './'

### Scrublet settings, identical for every batch
min_counts = 3
min_cells = 3
vscore_percentile = 85
n_pc = 30
expected_doublet_rate = 0.06
sim_doublet_ratio = 2
n_neighbors = 30
scaling_method = 'log'

### The gene table is shared by all batches, so it is read once.
### `index_col` is an argument of pd.read_csv, not of os.path.join.
fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col=0)

for cnt in range(6):

    batch_id = str(cnt + 1)
    print(batch_id)

    adata = sc.read_mtx(os.path.join(WORK_PATH, "gene_count_%s.mtx" % batch_id))
    pdata = pd.read_csv(os.path.join(WORK_PATH, "df_cell_%s.csv" % batch_id), index_col=0)

    adata.obs_names = list(pdata['sample'])
    adata.var_names = list(fdata['gene_id'])

    scrublet_results = scr.compute_doublet_scores(
        adata.X,
        min_counts=min_counts,
        min_cells=min_cells,
        vscore_percentile=vscore_percentile,
        n_prin_comps=n_pc,
        scaling_method=scaling_method,
        expected_doublet_rate=expected_doublet_rate,
        sim_doublet_ratio=sim_doublet_ratio,
        n_neighbors=n_neighbors,
        use_approx_neighbors=True,
        get_doublet_neighbor_parents=False,
    )

    ### One score per line, without index or header, for observed cells and
    ### for the simulated doublets
    pd.DataFrame(scrublet_results['doublet_scores_observed_cells']).to_csv(
        os.path.join(WORK_PATH, "doublet_scores_observed_cells_%s.csv" % batch_id),
        index=False, header=None)
    pd.DataFrame(scrublet_results['doublet_scores_simulated_doublets']).to_csv(
        os.path.join(WORK_PATH, "doublet_scores_simulated_doublets_%s.csv" % batch_id),
        index=False, header=None)
### File: Section_1_basic_analysis/step2_Plot_UMAP.R

###################################
### Section - 1, Basic analysis ###
###################################

source("JAX_help_code.R")
source("JAX_color_code.R")

work_path = "./"

### Per-cell annotations, loaded once and shared by every section below
pd = readRDS(paste0(work_path, "df_cell.rds"))
### n = 11,441,407 cells

### Cells labelled "E8.0-E8.5" are plotted together with "E8.5"
x = as.vector(pd$day)
x[pd$day == "E8.0-E8.5"] = "E8.5"
pd$day = as.vector(x)


###################
### Cell number ###
###################

### Horizontal bar plot of cell_num (log2 axis) per level of `group_col`,
### coloured with the named vector `color_plate`
plot_cell_number = function(df_num, group_col, color_plate){
    ggplot(df_num, aes(x = .data[[group_col]], y = cell_num, fill = .data[[group_col]])) +
        geom_bar(stat = "identity") +
        coord_flip() +
        scale_fill_manual(values = color_plate) +
        scale_y_continuous(trans = log2_trans(),
                           breaks = trans_breaks("log2", function(x) 2^x),
                           labels = trans_format("log2", math_format(2^.x))) +
        geom_text(aes(label = scales::comma(cell_num)),
                  hjust = -0.1,
                  position = position_dodge(width = 1),
                  inherit.aes = TRUE,
                  size = 3) +
        labs(x = "", y = "Cell number") +
        theme_classic(base_size = 15) +
        theme(legend.position = "none") +
        theme(axis.text.x = element_text(color = "black"), axis.text.y = element_text(color = "black"))
}

### Making bar plot for cell number from individual timepoints (Fig. 1a)

pd_cell_num_1 = pd %>% group_by(day) %>% tally() %>% rename(cell_num = n) %>% as.data.frame()
pd_cell_num_1$day = factor(pd_cell_num_1$day, levels = rev(names(day_color_plate)))
pd_cell_num_1$log2_cell_num = log2(pd_cell_num_1$cell_num)

p1 = plot_cell_number(pd_cell_num_1, "day", day_color_plate)
pdf(paste0(work_path, "Cell_number_timepoints.pdf"), 3.5, 8)
print(p1)
dev.off()

### Making bar plot for cell number from individual somite counts (Fig. 1a)

pd_cell_num_2 = pd[!is.na(pd$somite_count),] %>% group_by(somite_count) %>% tally() %>% rename(cell_num = n) %>% as.data.frame()
pd_cell_num_2$somite_count = factor(pd_cell_num_2$somite_count, levels = rev(names(somite_color_plate)))

p2 = plot_cell_number(pd_cell_num_2, "somite_count", somite_color_plate)
pdf(paste0(work_path, "Cell_number_somite_counts.pdf"), 3.5, 8)
print(p2)
dev.off()


####################################
### Making 2D UMAP visualization ###
####################################

### Making 2D UMAP visualization for the global embedding

### Highlight cells with their major trajectories (Fig. 1c)
### A slightly larger default-coloured point is drawn under each coloured point.

p = pd %>%
    ggplot() +
    geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
    geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = major_trajectory), size=0.3) +
    scale_color_manual(values = major_trajectory_color_plate) +
    theme_void() +
    theme(legend.position="none")
### ggsave() is a plain function call; it is not a layer that can be added with "+"
ggsave(paste0(work_path, "Global_embedding_2D_UMAP_major_trajectory.png"), plot = p, width = 10, height = 10, dpi = 300)

### Highlight cells with their day timepoints (Fig. 1c)
### Of note, we need to downsample cells from each timepoint to a similar number (i.e. 100,000)
### E8.75 and E17.25 are kept in full rather than downsampled.

pd_1 = pd %>% filter(day %in% c("E8.75", "E17.25")) %>% as.data.frame()
pd_2 = pd %>% filter(!day %in% c("E8.75", "E17.25")) %>% group_by(day) %>% sample_n(100000) %>% as.data.frame()
pd_sub = rbind(pd_1, pd_2)
### shuffle the rows (note the trailing comma) so that no timepoint is systematically drawn on top
p = pd_sub[sample(nrow(pd_sub)), ] %>%
    ggplot() +
    geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
    geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.3) +
    scale_color_manual(values=day_color_plate) +
    theme_void() +
    theme(legend.position="none")
ggsave(paste0(work_path, "Global_embedding_2D_UMAP_day.png"), plot = p, width = 10, height = 10, dpi = 300)


###############################################################################
### Making 3D UMAP for individual major_trajectories (Extended Data Fig. 3) ###
###############################################################################

### NOTE(review): the loop iterates over the names of major_trajectory_color_plate
### but filters on the global_celltype column - confirm that both use the same labels.
### t1 and t2 (axis fonts) are not defined in this script; presumably they come
### from the sourced helper files.

major_trajectory_list = names(major_trajectory_color_plate)

for(i in major_trajectory_list){
    print(i)

    pd_i = pd %>% filter(global_celltype == i)

    ### at most 300,000 cells are drawn per figure
    if(nrow(pd_i) > 300000){
        pd_i = pd_i %>% sample_n(300000)
    }

    fig = pd_i %>%
        plot_ly(x = ~sub_UMAP_3d_1, y = ~sub_UMAP_3d_2, z = ~sub_UMAP_3d_3, size=I(30), color = ~sub_celltype) %>%
        layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2),
                            yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2),
                            zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2)))

    saveWidget(fig, paste0(work_path, i, "_celltype_update.html"), selfcontained = FALSE, libdir = "tmp")

}
### File: Section_1_basic_analysis/step3_Pseudobulk.R

###################################
### Section - 1, Basic analysis ###
###################################

#########################################
### Pseudobulk analysis using Monocle ###
#########################################

source("JAX_help_code.R")
source("JAX_color_code.R")

work_path = "./"

cds = readRDS(paste0(work_path, "embryo_cds.rds"))

### identifying the highly variable genes (top 3,000, Seurat "vst")
gene_use = doObjectTransform(cds, transform_to = "seurat") %>%
    NormalizeData(normalization.method = "LogNormalize", scale.factor = 10000) %>%
    FindVariableFeatures(selection.method = "vst", nfeatures = 3000) %>%
    VariableFeatures()

### performing PCA analysis on the log-normalized expression of those genes
set.seed(2016)
FM = monocle3:::normalize_expr_data(cds,
                                    norm_method = "log",
                                    pseudo_count = 1)
FM = FM[gene_use,]

num_dim = 10
scaling = TRUE
set.seed(2016)
irlba_res = my_sparse_prcomp_irlba(Matrix::t(FM),
                                   n = min(num_dim, min(dim(FM)) - 1),
                                   center = scaling,
                                   scale. = scaling)
preproc_res = irlba_res$x
row.names(preproc_res) = colnames(cds)

### share of variance per PC, relative to the PCs that were computed
pc_variance = irlba_res$sdev^2
prop_var_expl = pc_variance/sum(pc_variance)
print(prop_var_expl)

df = data.frame(embryo_id = rownames(preproc_res),
                PC_1 = preproc_res[,1],
                PC_2 = preproc_res[,2],
                PC_3 = preproc_res[,3],
                day = as.vector(cds$day),
                embryo_sex = as.vector(cds$embryo_sex))
df$day = factor(df$day, levels = names(day_color_plate))

### One 3D scatter of the first three PCs, coloured by `color_by` (a formula
### such as ~day) with palette `color_plate`, saved as an html widget.
### The axis percentages are fixed label text, not computed from prop_var_expl.
save_embryo_pca = function(color_by, color_plate, file_name){
    axis_title = function(label) list(title = list(text = label, font = t1), tickfont = t2)
    fig = plot_ly(df, x = ~PC_1, y = ~PC_2, z = ~PC_3, color = color_by, colors = color_plate) %>%
        layout(scene = list(xaxis = axis_title('PC_1 (77.3%)'),
                            yaxis = axis_title('PC_2 (9.9%)'),
                            zaxis = axis_title('PC_3 (4.2%)'),
                            camera = list(eye = list(x = -0.8, y = 2, z = 1.5))),
               showlegend = FALSE)
    saveWidget(fig, paste0(work_path, file_name), selfcontained = FALSE, libdir = "tmp")
    fig
}

fig = save_embryo_pca(~day, day_color_plate, "embryo_pca_day.html")

sex_color_plate = c("F" = "#ff0000",
                    "M" = "#0000FF")
fig = save_embryo_pca(~embryo_sex, sex_color_plate, "embryo_pca_sex.html")
### File: Section_1_basic_analysis/step4_Estimate_cell_num.R

###################################
### Section - 1, Basic analysis ###
###################################

######################################################################
### Estimating absolute number of cells from individual timepoints ###
######################################################################

work_path = "./"

### Cell number estimated by qPCR experiment (million)
cell_num = c("E8.5" = 0.21,
             "E9.5" = 0.94,
             "E10.5" = 10.10,
             "E11.5" = 22.98,
             "E12.5" = 45.03,
             "E13.5" = 60.59,
             "E14.5" = 131.00,
             "E15.5" = 216.79,
             "E16.5" = 353.17,
             "E17.5" = 515.85,
             "E18.5" = 584.78,
             "E19.5" = 671.50)

df = data.frame(x = as.numeric(gsub("E", "", as.vector(names(cell_num)))),
                log2_y = log2(cell_num * 1000000))

### fit polynomial regression with degree 3 (raw, i.e. non-orthogonal, terms)
fit3 = lm(log2_y~poly(x,3,raw=TRUE), data=df)
print(summary(fit3)$adj.r.squared) ### 0.9858479

day_list = c("E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25",
             "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0",
             "E12.25", "E12.5", "E12.75", "E13.0", "E13.25", "E13.5", "E13.75",
             "E14.0", "E14.25", "E14.333", "E14.75", "E15.0", "E15.25", "E15.5",
             "E15.75", "E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25",
             "E17.5", "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "E19.5")

### timepoint (numeric days)
x_axis = as.numeric(gsub("E","",day_list))

cell_num_pred_log2 = predict(fit3, data.frame(x=x_axis))

plot(df$x, df$log2_y, pch=19, xlab='x', ylab='log2_y')
lines(x_axis, cell_num_pred_log2, col='purple')

cell_num_pred = round(2^cell_num_pred_log2)

### day labels are those of day_list, with the last timepoint (E19.5) reported as P0
day = day_list
day[day == "E19.5"] = "P0"

df_x = data.frame(day = day,
                  cell_num_pred_log2 = cell_num_pred_log2,
                  cell_num_pred = cell_num_pred,
                  stringsAsFactors = FALSE)

### coefficients of the fitted curve, taken from the model itself so that they
### stay in sync with the data above
### (previously hard-coded: a3 = 0.011369, a2 = -0.583861, a1 = 10.397036, a4 = -35.469755)
fit_coef = unname(coef(fit3))
a4 = fit_coef[1]   ### intercept
a1 = fit_coef[2]   ### x
a2 = fit_coef[3]   ### x^2
a3 = fit_coef[4]   ### x^3

### the function of curve
p_function = function(x){
    y = a3*x^3 + a2*x^2 + a1*x^1 + a4
    return(y)
}

### derivative of the curve, i.e. the growth rate (in log2 units per day) at a given timepoint
d_function = function(x){
    y = 3*a3*x^2 + 2*a2*x + a1
    return(y)
}

### the doubling time (hours)
### NOTE(review): this is 48 / (fold-change per day). It equals the exact doubling
### time 24 / d_function(x) only when the slope is 1 or 2 log2 units per day;
### kept unchanged here - confirm that this is the intended definition.
doubling_time = 24*2/(2^d_function(x_axis))

df_x$doubling_time = doubling_time
df_x$x_axis = x_axis

write.csv(df_x, paste0(work_path, "cell_num_prediction.csv"))


################################################################################################
### Plotting the cell composition of each major 
trajectories as a function of time (Fig. 1e) ### 97 | ################################################################################################ 98 | 99 | source("JAX_help_code.R") 100 | source("JAX_color_code.R") 101 | 102 | work_path = "./" 103 | 104 | pd = readRDS(paste0(work_path, "df_cell.rds")) 105 | ### n = 11,441,407 cells 106 | 107 | x = as.vector(pd$day) 108 | x[pd$day == "E8.0-E8.5"] = "E8.5" 109 | pd$day = as.vector(x) 110 | 111 | cell_num = read.csv(paste0(work_path, "cell_num_prediction.csv")) 112 | 113 | df = pd %>% 114 | group_by(day, major_trajectory) %>% 115 | tally() %>% 116 | dplyr::rename(cell_num = n) 117 | df_sub = pd %>% 118 | group_by(day) %>% 119 | tally() %>% 120 | dplyr::rename(cell_num_total = n) 121 | df = df %>% 122 | left_join(df_sub, by = "day") %>% 123 | mutate(percentage = cell_num/cell_num_total) %>% 124 | left_join(cell_num, by = "day") %>% 125 | mutate(cell_num_pred_log2 = cell_num_pred_log2 * percentage) 126 | df$day = factor(df$day, levels = names(day_color_plate)) 127 | df$major_trajectory = factor(df$major_trajectory, levels = names(major_trajectory_color_plate)) 128 | 129 | cell_num_x = c("E8.5" = 0.21, 130 | "E9.5" = 0.94, 131 | "E10.5" = 10.10, 132 | "E11.5" = 22.98, 133 | "E12.5" = 45.03, 134 | "E13.5" = 60.59, 135 | "E14.333" = 131.00, 136 | "E15.5" = 216.79, 137 | "E16.5" = 353.17, 138 | "E17.5" = 515.85, 139 | "E18.5" = 584.78, 140 | "P0" = 671.50) 141 | 142 | df_y = data.frame(day = as.vector(names(cell_num_x)), 143 | log2_y = log2(cell_num_x * 1000000)) 144 | 145 | df_y$day = factor(df_y$day, levels = names(day_color_plate)) 146 | 147 | cell_num$day = factor(cell_num$day, levels = names(day_color_plate)) 148 | 149 | p = ggplot() + 150 | geom_bar(data = df, aes(x = day, y = cell_num_pred_log2, group = major_trajectory, fill = major_trajectory), stat="identity", width = 1) + 151 | geom_point(data=df_y, aes(x=day, y=log2_y), shape = 21, colour = "black", fill = "white", size = 2, stroke = 1.5, alpha = 0.8) + 152 
| labs(x = "", y = "") + 153 | theme_classic(base_size = 12) + 154 | scale_fill_manual(values = major_trajectory_color_plate) + 155 | theme(legend.position="none") + 156 | theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1), axis.text.y = element_text(color="black")) 157 | 158 | pdf(paste0(work_path, "Cell_composition_over_time.pdf"), 7, 5) 159 | print(p) 160 | dev.off() 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/Embedding_scanpy_posterior_embryo.py: -------------------------------------------------------------------------------- 1 | 2 | ################################################################################################################################################### 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on NMPs, gut, and notochord during early somitogenesis ### 4 | ################################################################################################################################################### 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del adata_1, adata_2, adata_3, adata_4 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata 28 | 29 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0"] 30 | celltype_include = 
["Notochord", "Nodal cilia", "NMPs and spinal cord progenitors", "Gut", "Mesodermal progenitors (Tbx6+)"] 31 | 32 | example_id = "posterior_embryo" 33 | print(example_id) 34 | 35 | adata = adata[adata.obs["day"].isin(day_include)] 36 | adata = adata[adata.obs["celltype"].isin(celltype_include)] 37 | 38 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 39 | 40 | print("Done reading data ...") 41 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 42 | 43 | sc.pp.normalize_total(adata, target_sum=1e4) 44 | print("Done normalization by total counts ...") 45 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 46 | 47 | sc.pp.log1p(adata) 48 | print("Done log transformation ...") 49 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 50 | 51 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 52 | print("Done finding highly variable genes ...") 53 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 54 | 55 | adata = adata[:, adata.var.highly_variable] 56 | print("Done filtering in highly variable genes ...") 57 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 58 | 59 | sc.pp.scale(adata, max_value=10) 60 | print("Done scaling data ...") 61 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 62 | ### done with regress_out and scale ### 63 | 64 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 65 | print("Done performing PCA ...") 66 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 67 | 68 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 69 | print("Done computing neighborhood graph ...") 70 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 71 | 72 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 73 | print("Done UMAP ...") 74 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 75 | 76 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 77 | 
adata.obs['leiden_res_1'] = adata.obs['leiden'] 78 | print("Done clustering using res = 1 ...") 79 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 80 | 81 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 82 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 83 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 84 | 85 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 86 | print("Done UMAP ...") 87 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 88 | 89 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 90 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 91 | 92 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 93 | 94 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 95 | print("Done writing data ...") 96 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 97 | 98 | emb = adata.obsm['X_pca'] 99 | print(emb.shape) 100 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 101 | 102 | 103 | ######################################################### 104 | ### Perform subclustering on three major trajectories ### 105 | ######################################################### 106 | 107 | import scanpy as sc 108 | import pandas as pd 109 | import numpy as np 110 | import os, sys 111 | 112 | work_path = './' 113 | example_id = "posterior_embryo" 114 | 115 | adata_all = sc.read_h5ad(os.path.join(work_path, '%s_adata_scale.h5ad'%example_id)) 116 | pdata = pd.read_csv(os.path.join(work_path, '%s_adata_scale.obs.csv'%example_id), index_col = 0) 117 | adata_all.obs = pdata 118 | 119 | subcluster_list = ["NMP_Mesoderm", "Notochord", "Gut"] 120 | 121 | for i in subcluster_list: 122 | 123 | adata = adata_all[adata_all.obs["cluster_tmp"] == i] 124 | print(adata.shape) 125 | 126 | sc.pp.normalize_total(adata, target_sum=1e4) 127 | sc.pp.log1p(adata) 
128 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 129 | adata = adata[:, adata.var.highly_variable] 130 | sc.pp.scale(adata, max_value=10) 131 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 132 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 133 | 134 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 135 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 136 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 137 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 138 | 139 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 140 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 141 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 142 | 143 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 144 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 145 | 146 | adata.obs.to_csv(os.path.join(work_path, '%s_adata_scale.%s.obs.csv'%(example_id, i))) 147 | 148 | emb = adata.obsm['X_pca'] 149 | np.savetxt(os.path.join(work_path, '%s_adata_scale.%s.PCs.csv'%(example_id, i)), emb, delimiter=",", fmt='%1.3f') 150 | 151 | 152 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/Run_geosketch.py: -------------------------------------------------------------------------------- 1 | 2 | ################################################ 3 | ### run geosketch to downsample to 10% cells ### 4 | ################################################ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import scanpy as sc 9 | import os, sys 10 | from time import time 11 | from geosketch import gs 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | ### Of note, this is the adata object after running Dimension_reduction_scanpy.py in Section-1 18 | ### We need the PCA features to perform geosketch 19 | 20 | adata = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_scale.h5ad')) 21 | 22 | X_dimred = adata.obsm['X_pca'] 23 | 24 | start = time() 25 | 26 | N = 1144141 # Number of 
samples to obtain from the data set. 27 | sketch_index = gs(X_dimred, N, replace=False) 28 | 29 | np.savetxt(os.path.join(WORK_PATH, "adata_scale_geosketch_downsample.csv"), np.array(sketch_index)+1, delimiter=",", fmt='%s') 30 | adata.obs.to_csv(os.path.join(WORK_PATH, 'adata_scale.obs.csv')) 31 | 32 | end = time() 33 | print(end - start) 34 | 35 | 36 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/step1_posterior_embryo.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 2, Posterior embryo ### 4 | ##################################### 5 | 6 | #################################### 7 | ### Making 3D UMAP visualization ### 8 | #################################### 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "posterior_embryo" 15 | 16 | pd = read.csv(paste0(work_path, example_i, "_adata_scale.obs.csv"), header=T, row.names=1, as.is=T) 17 | rownames(pd) = as.vector(pd$cell_id) 18 | pd$somite_count = factor(pd$somite_count, levels = names(somite_color_plate)) 19 | 20 | ### making 3D UMAP, with cells are colored by their initial cell type annotations (Fig. 2a) 21 | fig = plot_ly(pd, x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~celltype_update, colors = posterior_embryo_color_plate) %>% 22 | layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2), 23 | yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2), 24 | zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2), 25 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 26 | saveWidget(fig, paste0(work_path, example_i, "_celltype_update.html"), selfcontained = FALSE, libdir = "tmp") 27 | 28 | ### making 3D UMAP, with cells are colored by somite counts (Fig. 
2b) 29 | fig = plot_ly(pd, x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~somite_count, colors = somite_color_plate) %>% 30 | layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2), 31 | yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2), 32 | zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2), 33 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 34 | saveWidget(fig, paste0(work_path, example_i, "_somite_count.html"), selfcontained = FALSE, libdir = "tmp") 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/step2_NMP_mesoderm.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 2, Posterior embryo ### 4 | ##################################### 5 | 6 | ######################## 7 | ### Analysis on NMPs ### 8 | ######################## 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "posterior_embryo" 15 | 16 | i = "NMP_Mesoderm" 17 | 18 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.", i, ".obs.csv"), header=T, row.names=1, as.is=T) 19 | 20 | ### 2D UMAP of NMPs, with cells colored by their initial cell types (Fig. 2c); the plot is saved by a separate ggsave(plot = p) call because ggsave() cannot be added to a plot with `+` in current ggplot2 21 | p = pd_x %>% 22 | ggplot() + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 24 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.35) + 25 | theme_void() + 26 | scale_color_manual(values=posterior_embryo_color_plate) + 27 | theme(legend.position="none") 28 | ggsave(paste0(work_path, "NMPs_celltype.png"), plot = p, width = 4, height = 3, dpi = 300) 29 | 30 | ### 2D UMAP of NMPs, with cells colored by their somite counts (Fig. 
2c) 31 | p = pd_x %>% 32 | ggplot() + 33 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 34 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.35) + 35 | theme_void() + 36 | scale_color_manual(values=somite_color_plate) + 37 | theme(legend.position="none") 38 | ggsave(paste0(work_path, "NMPs_somite_count.png"), plot = p, width = 4, height = 3, dpi = 300) 39 | 40 | ############################### 41 | ### Performing PCA analysis ### 42 | ############################### 43 | 44 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),] 45 | gene_count_x = doExtractData(pd_x, mouse_gene_sub) 46 | obj_x = CreateSeuratObject(gene_count_x, meta.data = pd_x) 47 | 48 | npcs = 30 49 | reduction.key = "PC_" 50 | seed.use = 42 51 | 52 | obj_x = NormalizeData(obj_x, normalization.method = "LogNormalize", scale.factor = 10000) 53 | obj_x = FindVariableFeatures(obj_x, selection.method = "vst", nfeatures = 2500) 54 | genes_include = VariableFeatures(obj_x) 55 | obj_x = ScaleData(obj_x, verbose = FALSE, features = rownames(obj_x)) 56 | scale_dat = GetAssayData(obj_x, slot = "scale.data") 57 | print(dim(scale_dat)) 58 | 59 | set.seed(seed = seed.use) 60 | pca.results <- irlba::irlba(A = t(x = scale_dat[genes_include,]), nv = npcs) 61 | feature.loadings <- pca.results$v 62 | set.seed(seed = seed.use) 63 | cell.embeddings <- pca.results$u %*% diag(pca.results$d) 64 | 65 | rownames(x = feature.loadings) <- genes_include 66 | colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs) 67 | rownames(x = cell.embeddings) <- colnames(obj_x) 68 | colnames(x = cell.embeddings) <- colnames(x = feature.loadings) 69 | 70 | stdev <- pca.results$d/sqrt(max(1, ncol(scale_dat) - 1)) 71 | eigValues = (stdev)^2 ## EigenValues 72 | varExplained = eigValues / sum(eigValues) 73 | 74 | res = list(cell.embeddings = cell.embeddings, 75 | feature.loadings = feature.loadings, 76 
| varExplained = varExplained) 77 | 78 | emb = res[["cell.embeddings"]] 79 | emb = emb[rownames(pd_x),] 80 | pd_x = cbind(pd_x, emb[,c(1:3)]) 81 | print(res[["varExplained"]]) 82 | pd_x$somite_count = factor(pd_x$somite_count, levels = names(somite_color_plate)) 83 | 84 | ### making 3D PCA plot (Fig. 2e) 85 | fig = plot_ly(pd_x, x=~PC_1, y=~PC_2, z=~PC_3, size = I(30), color = ~celltype_update, colors = posterior_embryo_color_plate) %>% 86 | layout(scene = list(xaxis=list(title = list(text ='PC_1 (21.0%)', font = t1), tickfont = t2), 87 | yaxis=list(title = list(text ='PC_2 (13.9%)', font = t1), tickfont = t2), 88 | zaxis=list(title = list(text ='PC_3 (11.1%)', font = t1), tickfont = t2), 89 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 90 | saveWidget(fig, paste0(work_path, example_i, "_NMP_Mesoderm_PCA_celltype_update.html"), selfcontained = FALSE, libdir = "tmp") 91 | 92 | ################################################################################# 93 | ### making scatter plot between each PCs and gene expression or somite counts ### 94 | ################################################################################# 95 | 96 | gene_count_x = gene_count[,rownames(pd_x)] 97 | emb = emb[rownames(pd_x),] 98 | 99 | gene_count_x = t(t(gene_count_x) / colSums(gene_count_x)) * 100000 100 | gene_count_x = gene_count_x[c("ENSMUSG00000074637","ENSMUSG00000030699","ENSMUSG00000020160","ENSMUSG00000062327","ENSMUSG00000009900","ENSMUSG00000024987"),] 101 | rownames(gene_count_x) = c("Sox2","Tbx6","Meis1","T","Wnt3a","Cyp26a1") ### gene symbols for the Ensembl IDs above, in the same order 102 | gene_count_x@x = log(gene_count_x@x + 1) 103 | 104 | df = data.frame(exp = c(as.vector(gene_count_x[1,]), as.vector(gene_count_x[2,])), 105 | gene = c(rep("Sox2", nrow(emb)), rep("Tbx6", nrow(emb))), 106 | PC_1 = c(as.vector(emb[,1]), as.vector(emb[,1]))) 107 | 108 | ### Fig. 
2e 109 | df %>% 110 | ggplot(aes(PC_1, exp, color = gene)) + geom_smooth(method = loess, se = FALSE) + 111 | labs(x="", y="", title="") + 112 | theme_classic(base_size = 12) + 113 | theme(plot.title = element_text(hjust = 0.5)) + 114 | theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) + 115 | scale_color_brewer(palette = "Set1") 116 | 117 | 118 | df = data.frame(exp = c(as.vector(gene_count_x[3,]), as.vector(gene_count_x[4,]), as.vector(gene_count_x[5,]), as.vector(gene_count_x[6,])), 119 | gene = c(rep("Meis1", nrow(emb)), rep("T", nrow(emb)), rep("Wnt3a", nrow(emb)), rep("Cyp26a1", nrow(emb))), 120 | PC_3 = c(as.vector(emb[,3]), as.vector(emb[,3]), as.vector(emb[,3]), as.vector(emb[,3]))) 121 | 122 | ### Fig. 2e 123 | df %>% 124 | ggplot(aes(PC_3, exp, color = gene)) + geom_smooth(method = loess, se = FALSE) + 125 | labs(x="", y="", title="") + 126 | theme_classic(base_size = 12) + 127 | theme(plot.title = element_text(hjust = 0.5)) + 128 | theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) + 129 | scale_color_brewer(palette = "Set2") 130 | 131 | df = data.frame(somite_count = pd_x$somite_count, 132 | PC_2 = as.vector(emb[,2])) 133 | df$somite_count = factor(df$somite_count, levels = names(somite_color_plate)) 134 | df$somite = as.vector(gsub(" somites", "", df$somite_count)) 135 | df$somite = factor(df$somite, levels = c(0, 2:12, 14:18, 20:34)) 136 | 137 | ### Fig. 
2e 138 | df %>% 139 | ggplot( aes(somite, PC_2, fill = somite_count)) + 140 | geom_boxplot(outlier.shape = NA) + 141 | labs(x="", y="", title="") + 142 | theme_classic(base_size = 5) + 143 | scale_fill_manual(values=somite_color_plate) + 144 | theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black")) + 145 | NoLegend() 146 | 147 | 148 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/step3_Notochord.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 2, Posterior embryo ### 4 | ##################################### 5 | 6 | ############################# 7 | ### Analysis on Notochord ### 8 | ############################# 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "posterior_embryo" 15 | 16 | i = "Notochord" 17 | 18 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.", i, ".obs.csv"), header=T, row.names=1, as.is=T) 19 | 20 | p = pd_x %>% 21 | ggplot() + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.8) + 24 | theme_void() + 25 | scale_color_manual(values=celltype_color_plate) + 26 | theme(legend.position="none") 27 | ggsave(paste0(work_path, "Notochord_celltype.png"), plot = p, width = 4, height = 3, dpi = 300) ### separate call: ggsave() cannot be added to a plot with `+` in current ggplot2 28 | 29 | p = pd_x %>% 30 | ggplot() + 31 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) + 32 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.8) + 33 | theme_void() + 34 | scale_color_manual(values=somite_color_plate) + 35 | theme(legend.position="none") 36 | ggsave(paste0(work_path, "Notochord_day.png"), plot = p, width = 4, height = 3, dpi = 300) 37 | 38 | 39 | ############################### 40 | ### Performing PCA analysis ### 41 | 
############################### 42 | 43 | ### excluding nodal cilia before performing PCA 44 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),] 45 | gene_count_x = doExtractData(pd_x, mouse_gene_sub) 46 | pd_x = pd_x[pd_x$celltype_update != "Nodal cilia",] 47 | gene_count_x = gene_count[,rownames(pd_x)] 48 | obj_x = CreateSeuratObject(gene_count_x, meta.data = pd_x) 49 | 50 | npcs = 30 51 | reduction.key = "PC_" 52 | seed.use = 42 53 | 54 | obj_x = NormalizeData(obj_x, normalization.method = "LogNormalize", scale.factor = 10000) 55 | obj_x = FindVariableFeatures(obj_x, selection.method = "vst", nfeatures = 2500) 56 | genes_include = VariableFeatures(obj_x) 57 | obj_x = ScaleData(obj_x, verbose = FALSE, features = rownames(obj_x)) 58 | scale_dat = GetAssayData(obj_x, slot = "scale.data") 59 | print(dim(scale_dat)) 60 | 61 | set.seed(seed = seed.use) 62 | pca.results <- irlba::irlba(A = t(x = scale_dat[genes_include,]), nv = npcs) 63 | feature.loadings <- pca.results$v 64 | set.seed(seed = seed.use) 65 | cell.embeddings <- pca.results$u %*% diag(pca.results$d) 66 | 67 | rownames(x = feature.loadings) <- genes_include 68 | colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs) 69 | rownames(x = cell.embeddings) <- colnames(obj_x) 70 | colnames(x = cell.embeddings) <- colnames(x = feature.loadings) 71 | 72 | stdev <- pca.results$d/sqrt(max(1, ncol(scale_dat) - 1)) 73 | eigValues = (stdev)^2 ## EigenValues 74 | varExplained = eigValues / sum(eigValues) 75 | 76 | res = list(cell.embeddings = cell.embeddings, 77 | feature.loadings = feature.loadings, 78 | varExplained = varExplained) 79 | 80 | emb = res[["cell.embeddings"]] 81 | emb = emb[rownames(pd_x),] 82 | pd_x = cbind(pd_x, emb[,c(1:3)]) 83 | print(res[["varExplained"]]) 84 | pd_x$somite_count = factor(pd_x$somite_count, levels = names(somite_color_plate)) 85 | 86 | fig = plot_ly(pd_x, x=~PC_1, y=~PC_2, 
z=~PC_3, size = I(30), color = ~somite_count, colors = somite_color_plate) %>% 87 | layout(scene = list(xaxis=list(title = list(text ='PC_1 (28.7%)', font = t1), tickfont = t2), 88 | yaxis=list(title = list(text ='PC_2 (11.4%)', font = t1), tickfont = t2), 89 | zaxis=list(title = list(text ='PC_3 (6.7%)', font = t1), tickfont = t2), 90 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 91 | saveWidget(fig, paste0(work_path, example_i, "_Notochord_PCA_celltype_update.html"), selfcontained = FALSE, libdir = "tmp") 92 | 93 | ########################################################################## 94 | ### counting cell number of nodal cilia as a function of somite counts ### 95 | ########################################################################## 96 | 97 | ### Extended Data Fig. 4h 98 | ### data that we are using 99 | 100 | ### somite_count a n frac 101 | ### 0 somites 4 9296 0.0004302926 102 | ### 2 somites 20 7329 0.0027288853 103 | ### 3 somites 7 4564 0.0015337423 104 | ### 4 somites 7 8362 0.0008371203 105 | ### 5 somites 4 6872 0.0005820722 106 | ### 7 somites 7 17182 0.0004074031 107 | ### 8 somites 3 19415 0.0001545197 108 | ### 9 somites 5 13703 0.0003648836 109 | ### 11 somites 3 20150 0.0001488834 110 | 111 | x %>% ggplot(aes(x=somite_count, y=frac, fill = somite_count)) + 112 | scale_fill_viridis(discrete=TRUE) + 113 | geom_bar(stat="identity") + 114 | theme_classic(base_size = 10) + 115 | theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black")) + 116 | theme(legend.position="none") 117 | 118 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/step4_Gut.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 2, Posterior embryo ### 4 | ##################################### 5 | 6 | ####################### 7 | ### Analysis on 
Gut ### 8 | ####################### 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "posterior_embryo" 15 | 16 | i = "Gut" 17 | 18 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.", i, ".obs.csv"), header=T, row.names=1, as.is=T) 19 | 20 | p = pd_x %>% 21 | ggplot() + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.8) + 24 | theme_void() + 25 | scale_color_manual(values=celltype_color_plate) + 26 | theme(legend.position="none") 27 | ggsave(paste0(work_path, "Gut_celltype.png"), plot = p, width = 4, height = 3, dpi = 300) ### separate call: ggsave() cannot be added to a plot with `+` in current ggplot2 28 | 29 | p = pd_x %>% 30 | ggplot() + 31 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) + 32 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.8) + 33 | theme_void() + 34 | scale_color_manual(values=somite_color_plate) + 35 | theme(legend.position="none") 36 | ggsave(paste0(work_path, "Gut_day.png"), plot = p, width = 4, height = 3, dpi = 300) 37 | 38 | 39 | ############################### 40 | ### Performing PCA analysis ### 41 | ############################### 42 | 43 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),] 44 | gene_count_x = doExtractData(pd_x, mouse_gene_sub) 45 | obj_x = CreateSeuratObject(gene_count_x, meta.data = pd_x) 46 | 47 | npcs = 30 48 | reduction.key = "PC_" 49 | seed.use = 42 50 | 51 | obj_x = NormalizeData(obj_x, normalization.method = "LogNormalize", scale.factor = 10000) 52 | obj_x = FindVariableFeatures(obj_x, selection.method = "vst", nfeatures = 2500) 53 | genes_include = VariableFeatures(obj_x) 54 | obj_x = ScaleData(obj_x, verbose = FALSE, features = rownames(obj_x)) 55 | scale_dat = GetAssayData(obj_x, slot = "scale.data") 56 | print(dim(scale_dat)) 57 | 58 | set.seed(seed = seed.use) 59 | pca.results <- irlba::irlba(A = 
t(x = scale_dat[genes_include,]), nv = npcs) 60 | feature.loadings <- pca.results$v 61 | set.seed(seed = seed.use) 62 | cell.embeddings <- pca.results$u %*% diag(pca.results$d) 63 | 64 | rownames(x = feature.loadings) <- genes_include 65 | colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs) 66 | rownames(x = cell.embeddings) <- colnames(obj_x) 67 | colnames(x = cell.embeddings) <- colnames(x = feature.loadings) 68 | 69 | stdev <- pca.results$d/sqrt(max(1, ncol(scale_dat) - 1)) 70 | eigValues = (stdev)^2 ## EigenValues 71 | varExplained = eigValues / sum(eigValues) 72 | 73 | res = list(cell.embeddings = cell.embeddings, 74 | feature.loadings = feature.loadings, 75 | varExplained = varExplained) 76 | 77 | emb = res[["cell.embeddings"]] 78 | emb = emb[rownames(pd_x),] 79 | pd_x = cbind(pd_x, emb[,c(1:3)]) 80 | print(res[["varExplained"]]) 81 | pd_x$somite_count = factor(pd_x$somite_count, levels = names(somite_color_plate)) 82 | 83 | fig = plot_ly(pd_x, x=~PC_1, y=~PC_2, z=~PC_3, size = I(30), color = ~somite_count, colors = somite_color_plate) %>% 84 | layout(scene = list(xaxis=list(title = list(text ='PC_1 (28.7%)', font = t1), tickfont = t2), 85 | yaxis=list(title = list(text ='PC_2 (11.4%)', font = t1), tickfont = t2), 86 | zaxis=list(title = list(text ='PC_3 (6.7%)', font = t1), tickfont = t2), 87 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 88 | saveWidget(fig, paste0(work_path, example_i, "_Gut_PCA_celltype_update.html"), selfcontained = FALSE, libdir = "tmp") 89 | 90 | -------------------------------------------------------------------------------- /Section_2_posterior_embryo/step6_somites_validation.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 2, Posterior embryo ### 4 | ##################################### 5 | 6 | source("JAX_help_code.R") 7 | source("JAX_color_code.R") 8 | work_path = "./" 9 | 10 | pd = readRDS(paste0(work_path, 
"pd_somites.rds")) 11 | ### n = 104,671 nuclei 12 | 13 | celltype_color_plate = c("#54c15f", "#c34fb7", "#91b737", "#7b63d0", "#c9a63c", 14 | "#7081ca", "#478734", "#da3f78", "#56c09e", "#cf4a35", 15 | "#999999", "#4eacd7", "#e18f4f", "#be75b4", "#a0b46c", 16 | "#a0445d", "#37845f", "#df7c82", "#72732b", "#a06432") 17 | names(celltype_color_plate) = x 18 | 19 | ### Extended Data Fig. 4b 20 | p = pd %>% 21 | ggplot() + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.8) + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno), size=0.6) + 24 | theme_void() + 25 | scale_color_manual(values=color_plate) + 26 | theme(legend.position="none") + 27 | ggsave(paste0(work_path, "somites.anno.png"), width = 6, height = 6, dpi = 300) 28 | 29 | somite_color_plate = c("#440154", "#482475", "#414487", "#355f8d", 30 | "#2a788e", "#21918c", "#22a884", "#44bf70", 31 | "#7ad151", "#bddf26", "#fde725") 32 | names(somite_color_plate) = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites") 33 | 34 | ### Extended Data Fig. 4c 35 | p = pd %>% 36 | ggplot() + 37 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.8) + 38 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.6) + 39 | theme_void() + 40 | scale_color_manual(values=somite_color_plate) + 41 | theme(legend.position="none") + 42 | ggsave(paste0(work_path, "somites.somite_count.png"), width = 6, height = 6, dpi = 300) 43 | 44 | ### Extended Data Fig. 
4a 45 | pd$embryo_id = factor(pd$embryo_id, levels = rev(names(table(pd$embryo_id)))) 46 | p1 = pd %>% 47 | group_by(embryo_id, somite_count) %>% tally() %>% rename(cell_num = n) %>% 48 | ggplot(aes(embryo_id, cell_num, fill = somite_count)) + 49 | geom_bar(stat="identity") + 50 | coord_flip() + 51 | scale_fill_manual(values = somite_color_plate) + 52 | geom_text(aes(label = scales::comma(cell_num)), 53 | hjust = -0.1, 54 | position = position_dodge(width = 1), 55 | inherit.aes = TRUE, 56 | size = 5) + 57 | labs(x = "", y = "Cell number") + 58 | theme_classic(base_size = 15) + 59 | theme(legend.position="none") + 60 | theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) 61 | pdf(paste0(work_path, "cell_num.pdf"), 5, 8) 62 | print(p1) 63 | dev.off() 64 | 65 | 66 | ####################### 67 | ### Focusing on NMP ### 68 | ####################### 69 | 70 | 71 | pd_NMP = read.csv(paste0(work_path, "adata_somites_NMP.obs.csv"), row.names=1, as.is=TRUE) 72 | 73 | somite_color_plate = c("#440154", "#482475", "#414487", "#355f8d", 74 | "#2a788e", "#21918c", "#22a884", "#44bf70", 75 | "#7ad151", "#bddf26", "#fde725") 76 | names(somite_color_plate) = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites") 77 | 78 | pd_NMP$somite_count = factor(pd_NMP$somite_count, levels = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites")) 79 | p = pd_NMP %>% 80 | ggplot() + 81 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.5) + 82 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=1) + 83 | theme_void() + 84 | scale_color_manual(values=somite_color_plate) + 85 | theme(legend.position="none") 86 | ggsave(paste0(work_path, "NMP.somite_count.png"), plot = p, width = 6, height = 6, dpi = 300) ### saved as a separate call: `plot + ggsave()` is an error in ggplot2 >= 3.3.4 87 | 88 | 89 | p = pd_NMP %>% 90 | ggplot() + 91 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.5) + 92 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno), size=1) + 93 | theme_void() + 94 | 
scale_color_manual(values=color_plate) + 95 | theme(legend.position="none") 96 | ggsave(paste0(work_path, "NMP.anno.png"), plot = p, width = 6, height = 6, dpi = 300) ### saved as a separate call: `plot + ggsave()` is an error in ggplot2 >= 3.3.4 97 | 98 | pd = data.frame(pData(cds)) 99 | pd$somite_count = factor(pd$somite_count, levels = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites")) 100 | pd$Cdx1 = as.vector(exprs(cds)["ENSMUSG00000024619",]) 101 | pd$Hoxa10 = as.vector(exprs(cds)["ENSMUSG00000000938",]) 102 | pd$T = as.vector(exprs(cds)["ENSMUSG00000062327",]) 103 | pd$Meis1 = as.vector(exprs(cds)["ENSMUSG00000020160",]) 104 | 105 | df = pd %>% filter(Cdx1 != 0) %>% group_by(somite_count, .drop = FALSE) %>% tally() %>% 106 | left_join(pd %>% group_by(somite_count) %>% tally() %>% rename(total_n = n), by = "somite_count") %>% 107 | mutate(percent = 100*n/total_n) 108 | 109 | p1 <-ggplot(data=df, aes(x=somite_count, y=percent, fill = somite_count)) + 110 | geom_bar(stat="identity") + labs(x="",y="% of cells expressed Cdx1") + 111 | scale_fill_manual(values=somite_color_plate) + theme_classic(base_size = 10) + theme(legend.position="none") + 112 | theme(axis.text.x = element_text(color="black", angle = 90), axis.text.y = element_text(color="black")) 113 | 114 | df = pd %>% filter(Hoxa10 != 0) %>% group_by(somite_count, .drop = FALSE) %>% tally() %>% 115 | left_join(pd %>% group_by(somite_count) %>% tally() %>% rename(total_n = n), by = "somite_count") %>% 116 | mutate(percent = 100*n/total_n) 117 | 118 | p2 <-ggplot(data=df, aes(x=somite_count, y=percent, fill = somite_count)) + 119 | geom_bar(stat="identity") + labs(x="",y="% of cells expressed Hoxa10") + 120 | scale_fill_manual(values=somite_color_plate) + theme_classic(base_size = 10) + theme(legend.position="none") + 121 | theme(axis.text.x = element_text(color="black", angle = 90), axis.text.y = element_text(color="black")) 122 | 123 | # Extended Data Fig. 
4f 124 | 125 | library(gridExtra) 126 | pdf(paste0(work_path, "NMP_Cdx1_Hoxa10.pdf"), 4, 6) 127 | grid.arrange(p1, p2, nrow=2, ncol=1) 128 | dev.off() 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/Embedding_Renal.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################################################################## 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on renal subset ### 4 | ############################################################################################################## 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | example_id = "Renal_big" 18 | print(example_id) 19 | 20 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_id)) 21 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_id), index_col = 0) 22 | fdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_gene.csv'%example_id), index_col = 0) 23 | adata.obs = pdata 24 | adata.var = fdata 25 | 26 | print("Done reading data ...") 27 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 28 | 29 | sc.pp.normalize_total(adata, target_sum=1e4) 30 | print("Done normalization by total counts ...") 31 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 32 | 33 | sc.pp.log1p(adata) 34 | print("Done log transformation ...") 35 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 36 | 37 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 38 | print("Done finding highly variable genes ...") 39 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 40 | 41 | adata = adata[:, adata.var.highly_variable] 42 | print("Done
filtering in highly variable genes ...") 43 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 44 | 45 | sc.pp.scale(adata, max_value=10) 46 | print("Done scaling data ...") 47 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 48 | ### done with regress_out and scale ### 49 | 50 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 51 | print("Done performing PCA ...") 52 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 53 | 54 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 55 | print("Done computing neighborhood graph ...") 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 57 | 58 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 59 | print("Done UMAP ...") 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 61 | 62 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 63 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 64 | print("Done clustering using res = 1 ...") 65 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 66 | 67 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 68 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 69 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 70 | 71 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 72 | print("Done UMAP ...") 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 74 | 75 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 76 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 77 | 78 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s.obs.csv'%example_id)) 79 | 80 | adata.write(os.path.join(WORK_PATH, '%s.h5ad'%example_id), compression="gzip") 81 | print("Done writing data ...") 82 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 83 | 84 | 85 | 86 | ### Only co-embedding renal pericytes and stromal cells 87 | 88 | 89 | example_id = "Renal_pericytes_stromal" 90 | print(example_id) 91 | 92 | adata = 
sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_id)) 93 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_id), index_col = 0) 94 | fdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_gene.csv'%example_id), index_col = 0) 95 | adata.obs = pdata 96 | adata.var = fdata 97 | 98 | print("Done reading data ...") 99 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 100 | 101 | sc.pp.normalize_total(adata, target_sum=1e4) 102 | print("Done normalization by total counts ...") 103 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 104 | 105 | sc.pp.log1p(adata) 106 | print("Done log transformation ...") 107 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 108 | 109 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 110 | print("Done finding highly variable genes ...") 111 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 112 | 113 | adata = adata[:, adata.var.highly_variable] 114 | print("Done filtering in highly variable genes ...") 115 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 116 | 117 | sc.pp.scale(adata, max_value=10) 118 | print("Done scaling data ...") 119 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 120 | ### done with regress_out and scale ### 121 | 122 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 123 | print("Done performing PCA ...") 124 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 125 | 126 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 127 | print("Done computing neighborhood graph ...") 128 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 129 | 130 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 131 | print("Done UMAP ...") 132 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 133 | 134 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 135 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 136 | 
print("Done clustering using res = 1 ...") 137 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 138 | 139 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 140 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 141 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 142 | 143 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 144 | print("Done UMAP ...") 145 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 146 | 147 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 148 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 149 | 150 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s.obs.csv'%example_id)) 151 | 152 | adata.write(os.path.join(WORK_PATH, '%s.h5ad'%example_id), compression="gzip") 153 | print("Done writing data ...") 154 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 155 | 156 | 157 | -------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/Embedding_scanpy_kidney.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################################################################## 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on renal subset ### 4 | ############################################################################################################## 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = 
adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del adata_1, adata_2, adata_3, adata_4 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata 28 | 29 | celltype_include = ["Anterior intermediate mesoderm", 30 | "Collecting duct intercalated cells", 31 | "Connecting tubule", 32 | "Metanephric mesenchyme", 33 | "Podocytes", 34 | "Proximal tubule cells", 35 | "Ascending loop of Henle", 36 | "Collecting duct principal cells", 37 | "Distal convoluted tubule", 38 | "Nephron progenitors", 39 | "Posterior intermediate mesoderm", 40 | "Ureteric bud"] 41 | 42 | example_id = "renal" 43 | print(example_id) 44 | 45 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)] 46 | 47 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 48 | 49 | print("Done reading data ...") 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 51 | 52 | sc.pp.normalize_total(adata, target_sum=1e4) 53 | print("Done normalization by total counts ...") 54 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 55 | 56 | sc.pp.log1p(adata) 57 | print("Done log transformation ...") 58 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 59 | 60 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 61 | print("Done finding highly variable genes ...") 62 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 63 | 64 | adata = adata[:, adata.var.highly_variable] 65 | print("Done filtering in highly variable genes ...") 66 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 67 | 68 | sc.pp.scale(adata, max_value=10) 69 | print("Done scaling data ...") 70 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 71 | ### done with regress_out and scale ### 72 | 73 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 74 | print("Done performing PCA ...") 75 | 
print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 76 | 77 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 78 | print("Done computing neighborhood graph ...") 79 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 80 | 81 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 82 | print("Done UMAP ...") 83 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 84 | 85 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 86 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 87 | print("Done clustering using res = 1 ...") 88 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 89 | 90 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 91 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 92 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 93 | 94 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 95 | print("Done UMAP ...") 96 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 97 | 98 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 99 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 100 | 101 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 102 | 103 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 104 | print("Done writing data ...") 105 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 106 | 107 | emb = adata.obsm['X_pca'] 108 | print(emb.shape) 109 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 110 | 111 | -------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/Embedding_scanpy_lateral_plate_mesoderm.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################################################################ 3 | ### Here, we peformed 
basic analysis (normalization, dimension reduction, and clustering) on LPM subset ### 4 | ############################################################################################################ 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del adata_1, adata_2, adata_3, adata_4 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata 28 | 29 | celltype_include = ["Lateral plate and intermediate mesoderm"] 30 | 31 | example_id = "LPM" 32 | print(example_id) 33 | 34 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)] 35 | 36 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 37 | 38 | print("Done reading data ...") 39 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 40 | 41 | sc.pp.normalize_total(adata, target_sum=1e4) 42 | print("Done normalization by total counts ...") 43 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 44 | 45 | sc.pp.log1p(adata) 46 | print("Done log transformation ...") 47 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 48 | 49 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 50 | print("Done finding highly variable genes ...") 51 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 52 | 53 | adata = adata[:, adata.var.highly_variable] 54 | print("Done filtering in highly variable genes ...") 
55 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 56 | 57 | sc.pp.scale(adata, max_value=10) 58 | print("Done scaling data ...") 59 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 60 | ### done with regress_out and scale ### 61 | 62 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 63 | print("Done performing PCA ...") 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 65 | 66 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 67 | print("Done computing neighborhood graph ...") 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 69 | 70 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 71 | print("Done UMAP ...") 72 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 73 | 74 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 75 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 76 | print("Done clustering using res = 1 ...") 77 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 78 | 79 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 80 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 81 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 82 | 83 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 84 | print("Done UMAP ...") 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 86 | 87 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 88 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 89 | 90 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 91 | 92 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 93 | print("Done writing data ...") 94 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 95 | 96 | emb = adata.obsm['X_pca'] 97 | print(emb.shape) 98 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 99 | 100 | 
-------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/Embedding_scanpy_patterned_mesoderm_somites_26_34.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################################################################ 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on LPM subset ### 4 | ############################################################################################################ 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | example_id = "LPM_somite_26_34" 18 | print(example_id) 19 | 20 | 21 | ### First, only including backbone cells 22 | 23 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_backbone.gene_count.mtx'%example_id)) 24 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_backbone.df_cell.csv'%example_id), index_col = 0) 25 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0) 26 | adata.obs = pdata 27 | adata.var = fdata 28 | 29 | print("Done reading data ...") 30 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 31 | 32 | sc.pp.normalize_total(adata, target_sum=1e4) 33 | print("Done normalization by total counts ...") 34 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 35 | 36 | sc.pp.log1p(adata) 37 | print("Done log transformation ...") 38 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 39 | 40 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 41 | print("Done finding highly variable genes ...") 42 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 43 | 44 | adata = adata[:, adata.var.highly_variable] 45 | print("Done filtering in highly variable genes ...") 46 | 
print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 47 | 48 | sc.pp.scale(adata, max_value=10) 49 | print("Done scaling data ...") 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 51 | ### done with regress_out and scale ### 52 | 53 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 54 | print("Done performing PCA ...") 55 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 56 | 57 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 58 | print("Done computing neighborhood graph ...") 59 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 60 | 61 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 62 | print("Done UMAP ...") 63 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 64 | 65 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 66 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 67 | print("Done clustering using res = 1 ...") 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 69 | 70 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 71 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 72 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 73 | 74 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 75 | print("Done UMAP ...") 76 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 77 | 78 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 79 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 80 | 81 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_backbone_adata_scale.obs.csv'%example_id)) 82 | 83 | adata.write(os.path.join(WORK_PATH, '%s_backbone_adata_scale_processed.h5ad'%example_id), compression="gzip") 84 | print("Done writing data ...") 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 86 | 87 | emb = adata.obsm['X_pca'] 88 | print(emb.shape) 89 | np.savetxt(os.path.join(WORK_PATH, '%s_backbone_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 90 
| 91 | 92 | 93 | ### Next, including both backbone cells and derivatives 94 | 95 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_all.gene_count.mtx'%example_id)) 96 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_all.df_cell.csv'%example_id), index_col = 0) 97 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0) 98 | adata.obs = pdata 99 | adata.var = fdata 100 | 101 | print("Done reading data ...") 102 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 103 | 104 | sc.pp.normalize_total(adata, target_sum=1e4) 105 | print("Done normalization by total counts ...") 106 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 107 | 108 | sc.pp.log1p(adata) 109 | print("Done log transformation ...") 110 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 111 | 112 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 113 | print("Done finding highly variable genes ...") 114 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 115 | 116 | adata = adata[:, adata.var.highly_variable] 117 | print("Done filtering in highly variable genes ...") 118 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 119 | 120 | sc.pp.scale(adata, max_value=10) 121 | print("Done scaling data ...") 122 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 123 | ### done with regress_out and scale ### 124 | 125 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 126 | print("Done performing PCA ...") 127 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 128 | 129 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 130 | print("Done computing neighborhood graph ...") 131 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 132 | 133 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 134 | print("Done UMAP ...") 135 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 136 | 137 | sc.tl.leiden(adata, 
resolution=1, n_iterations=2) 138 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 139 | print("Done clustering using res = 1 ...") 140 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 141 | 142 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 143 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 144 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 145 | 146 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 147 | print("Done UMAP ...") 148 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 149 | 150 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 151 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 152 | 153 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_all_adata_scale.obs.csv'%example_id)) 154 | 155 | adata.write(os.path.join(WORK_PATH, '%s_all_adata_scale_processed.h5ad'%example_id), compression="gzip") 156 | print("Done writing data ...") 157 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 158 | 159 | emb = adata.obsm['X_pca'] 160 | print(emb.shape) 161 | np.savetxt(os.path.join(WORK_PATH, '%s_all_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 162 | 163 | -------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/Embedding_scanpy_patterned_mesoderm_somites_5_20.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################################################################ 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on LPM subset ### 4 | ############################################################################################################ 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | example_id 
= "LPM_somite_5_20" 18 | print(example_id) 19 | 20 | 21 | ### First, only including backbone cells 22 | 23 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_backbone.gene_count.mtx'%example_id)) 24 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_backbone.df_cell.csv'%example_id), index_col = 0) 25 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0) 26 | adata.obs = pdata 27 | adata.var = fdata 28 | 29 | print("Done reading data ...") 30 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 31 | 32 | sc.pp.normalize_total(adata, target_sum=1e4) 33 | print("Done normalization by total counts ...") 34 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 35 | 36 | sc.pp.log1p(adata) 37 | print("Done log transformation ...") 38 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 39 | 40 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 41 | print("Done finding highly variable genes ...") 42 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 43 | 44 | adata = adata[:, adata.var.highly_variable] 45 | print("Done filtering in highly variable genes ...") 46 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 47 | 48 | sc.pp.scale(adata, max_value=10) 49 | print("Done scaling data ...") 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 51 | ### done with regress_out and scale ### 52 | 53 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 54 | print("Done performing PCA ...") 55 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 56 | 57 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 58 | print("Done computing neighborhood graph ...") 59 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 60 | 61 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 62 | print("Done UMAP ...") 63 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 64 | 65 | sc.tl.leiden(adata, 
resolution=1, n_iterations=2) 66 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 67 | print("Done clustering using res = 1 ...") 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 69 | 70 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 71 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 72 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 73 | 74 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 75 | print("Done UMAP ...") 76 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 77 | 78 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 79 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 80 | 81 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_backbone_adata_scale.obs.csv'%example_id)) 82 | 83 | adata.write(os.path.join(WORK_PATH, '%s_backbone_adata_scale_processed.h5ad'%example_id), compression="gzip") 84 | print("Done writing data ...") 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 86 | 87 | emb = adata.obsm['X_pca'] 88 | print(emb.shape) 89 | np.savetxt(os.path.join(WORK_PATH, '%s_backbone_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 90 | 91 | 92 | 93 | ### Next, including both backbone cells and derivatives 94 | 95 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_all.gene_count.mtx'%example_id)) 96 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_all.df_cell.csv'%example_id), index_col = 0) 97 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0) 98 | adata.obs = pdata 99 | adata.var = fdata 100 | 101 | print("Done reading data ...") 102 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 103 | 104 | sc.pp.normalize_total(adata, target_sum=1e4) 105 | print("Done normalization by total counts ...") 106 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 107 | 108 | sc.pp.log1p(adata) 109 | print("Done log transformation ...") 110 | print(str(format((time.time() - 
start_time)/3600, '.4f')) + 'hours') 111 | 112 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 113 | print("Done finding highly variable genes ...") 114 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 115 | 116 | adata = adata[:, adata.var.highly_variable] 117 | print("Done filtering in highly variable genes ...") 118 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 119 | 120 | sc.pp.scale(adata, max_value=10) 121 | print("Done scaling data ...") 122 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 123 | ### done with regress_out and scale ### 124 | 125 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 126 | print("Done performing PCA ...") 127 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 128 | 129 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 130 | print("Done computing neighborhood graph ...") 131 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 132 | 133 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 134 | print("Done UMAP ...") 135 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 136 | 137 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 138 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 139 | print("Done clustering using res = 1 ...") 140 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 141 | 142 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 143 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 144 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 145 | 146 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 147 | print("Done UMAP ...") 148 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 149 | 150 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 151 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 152 | 153 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_all_adata_scale.obs.csv'%example_id)) 154 | 155 | 
adata.write(os.path.join(WORK_PATH, '%s_all_adata_scale_processed.h5ad'%example_id), compression="gzip") 156 | print("Done writing data ...") 157 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 158 | 159 | emb = adata.obsm['X_pca'] 160 | print(emb.shape) 161 | np.savetxt(os.path.join(WORK_PATH, '%s_all_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 162 | 163 | -------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/Spatial_mapping.py: -------------------------------------------------------------------------------- 1 | 2 | ####################################################################################### 3 | ### Part-1: extracting spatial coordinates and annotations from individual sections ### 4 | ####################################################################################### 5 | 6 | import os, sys 7 | import numpy as np 8 | import pandas as pd 9 | import scanpy as sc 10 | 11 | WORK_PATH = "./" 12 | 13 | newpath = os.path.join(WORK_PATH, 'annotation') 14 | if not os.path.exists(newpath): 15 | os.makedirs(newpath) 16 | 17 | file = open(os.path.join(WORK_PATH, "Mosta_file_list.txt")) 18 | file_list = [line.rstrip().replace(".MOSTA.h5ad", "") for line in file] 19 | file.close() 20 | 21 | for file_id in file_list: 22 | 23 | mosta = sc.read_h5ad(os.path.join(WORK_PATH, '%s.MOSTA.h5ad'%file_id)) 24 | mosta_meta = mosta.obs 25 | 26 | np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_color.csv'%file_id), mosta.uns['annotation_colors'], delimiter=",", fmt='%s') 27 | np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_color_id.csv'%file_id), mosta.obs['annotation'].cat.categories, delimiter=",", fmt='%s') 28 | np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_coor.csv'%file_id), mosta.obsm['spatial'], delimiter=",") 29 | np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_anno.csv'%file_id), mosta.obs['annotation'], 
delimiter=",", fmt='%s') 30 | 31 | 32 | ################################################################## 33 | ### Part-2: generating h5ad format profile for sc-RNA-seq data ### 34 | ################################################################## 35 | 36 | import os, sys 37 | import numpy as np 38 | import pandas as pd 39 | import scanpy as sc 40 | 41 | WORK_PATH = "./" 42 | 43 | day_list = ["E95","E105","E115","E125","E135","E145","E155","E165"] 44 | 45 | for day_id in day_list: 46 | 47 | adata = sc.read_mtx(os.path.join(WORK_PATH, 'sc_data', '%s.gene_count.mtx'%day_id)) 48 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'sc_data', '%s.df_cell.csv'%day_id), index_col = 0) 49 | fdata = pd.read_csv(os.path.join(WORK_PATH, 'sc_data', '%s.df_gene.csv'%day_id), index_col = 0) 50 | adata.obs = pdata 51 | adata.var = fdata 52 | 53 | adata.write(os.path.join(WORK_PATH, 'sc_data', '%s.sc_data.h5ad'%day_id)) 54 | 55 | 56 | ######################################################## 57 | ### Part-3: performing spatial mapping using Tangram ### 58 | ######################################################## 59 | 60 | import os, sys 61 | import numpy as np 62 | import pandas as pd 63 | import matplotlib.pyplot as plt 64 | import seaborn as sns 65 | import scanpy as sc 66 | import torch 67 | import tangram as tg 68 | 69 | WORK_PATH = "./" 70 | 71 | newpath = os.path.join(WORK_PATH, 'result') 72 | if not os.path.exists(newpath): 73 | os.makedirs(newpath) 74 | 75 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 76 | print('Using device:', device) 77 | print(torch.cuda) 78 | print(torch.version.cuda) 79 | print(torch.cuda.is_available()) 80 | 81 | file = open(os.path.join(work_path, "MOSTA_file_list.txt")) 82 | file_list = [line.rstrip().replace(".MOSTA.h5ad", "") for line in file] 83 | file.close() 84 | 85 | 86 | for spatial_id in file_list: 87 | 88 | print(spatial_id) 89 | 90 | mosta = sc.read(os.path.join(WORK_PATH, spatial_id + '.MOSTA.h5ad')) 91 | 92 | 
day_id = spatial_id.split('_')[0].replace('.','') # text before the first '_' with '.' removed; used as the sc_data file stem on the next line 93 | adata = sc.read(os.path.join(WORK_PATH, 'sc_data', '%s.sc_data.h5ad'%day_id)) 94 | 95 | sc.pp.normalize_total(adata, inplace=True) 96 | sc.pp.log1p(adata) 97 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 98 | 99 | if mosta.shape[0] > 90000: 100 | sc.pp.subsample(mosta, n_obs = 90000) 101 | 102 | var_genes = adata.var.index[adata.var.highly_variable] 103 | tg.pp_adatas(adata, mosta, genes=var_genes) 104 | 105 | ad_map = tg.map_cells_to_space( 106 | adata_sc=adata, 107 | adata_sp=mosta, 108 | device=device # torch.device computed above: cuda:0 when available, else cpu 109 | ) 110 | 111 | tg.project_cell_annotations(ad_map, mosta, annotation='celltype') 112 | annotation_list = list(pd.unique(adata.obs['celltype'])) 113 | 114 | colnames = ','.join(list(mosta.obsm['tangram_ct_pred'].columns)) 115 | 116 | np.savetxt(os.path.join(WORK_PATH, 'result', '%s.result.csv'%spatial_id), mosta.obsm['tangram_ct_pred'], delimiter=",", fmt='%.8e', header=colnames) 117 | np.savetxt(os.path.join(WORK_PATH, 'result', '%s.result.coor.csv'%spatial_id), mosta.obsm['spatial'], delimiter=",") 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /Section_3_kidney_mesenchyme/step2_Lateral_plate_mesoderm.R: -------------------------------------------------------------------------------- 1 | 2 | ###################################### 3 | ### Section - 3, Kidney_mesenchyme ### 4 | ###################################### 5 | 6 | ############################################################################## 7 | ### Making 2D UMAP visualization for lateral plate & intermediate mesoderm ### 8 | ############################################################################## 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "LPM"; print(example_i) 15 | 16 | pd = readRDS(paste0(work_path, 
example_i, "_adata_scale.obs.rds")) 17 | 18 | ### Fig. 3e 19 | p = pd %>% 20 | ggplot() + 21 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.35) + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = lateral_plate_mesoderm_sub_clustering), size=0.2) + 23 | theme_void() + 24 | scale_color_manual(values=LPM_color_plate) + 25 | theme(legend.position="none") 26 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 8, height = 6, dpi = 300) # saved as its own call: recent ggplot2 releases refuse `plot + ggsave(...)` 27 | 28 | 29 | x_table = table(pd$day) 30 | pd_1 = pd %>% filter(day %in% names(x_table)[x_table > 10000]) %>% group_by(day) %>% sample_n(10000) %>% as.data.frame() 31 | pd_2 = pd %>% filter(day %in% names(x_table)[x_table <= 10000]) %>% as.data.frame() 32 | pd_sub = rbind(pd_1, pd_2) 33 | pd_sub$day = factor(pd_sub$day, levels = names(day_color_plate)) 34 | 35 | ### Fig. 3e (sub panel on the top left) 36 | p = ggplot() + 37 | geom_point(data = pd_sub, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.35) + 38 | geom_point(data = pd_sub[sample(1:nrow(pd_sub)),], aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.2) + 39 | scale_color_manual(values=day_color_plate) + 40 | theme_void() + 41 | theme(legend.position="none") 42 | ggsave(paste0(work_path, example_i, ".day.2D_UMAP.png"), plot = p, width = 8, height = 6, dpi = 300) # explicit plot = p so the file never depends on last_plot() 43 | 44 | 45 | -------------------------------------------------------------------------------- /Section_4_eye/Embedding_scanpy_eye.py: -------------------------------------------------------------------------------- 1 | 2 | ############################################################################################################ 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on eye subset ### 4 | ############################################################################################################ 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = 
time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del adata_1, adata_2, adata_3, adata_4 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata 28 | 29 | celltype_include = ["Amacrine cells", 30 | "Amacrine/Horizontal precursor cells", 31 | "Bipolar precursor cells", 32 | "Cholinergic amacrine cells", 33 | "Ciliary margin cells", 34 | "Cone precursor cells", 35 | "Horizontal cells", 36 | "Naive retinal progenitor cells", 37 | "Photoreceptor precursor cells", 38 | "PV-containing retinal ganglion cells", 39 | "Retinal ganglion cells", 40 | "Retinal progenitor cells", 41 | "Rod precursor cells", 42 | "Eye field", 43 | "Retinal pigment cells"] 44 | 45 | example_id = "eye" 46 | print(example_id) 47 | 48 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)] 49 | 50 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 51 | 52 | print("Done reading data ...") 53 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 54 | 55 | sc.pp.normalize_total(adata, target_sum=1e4) 56 | print("Done normalization by total counts ...") 57 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 58 | 59 | sc.pp.log1p(adata) 60 | print("Done log transformation ...") 61 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 62 | 63 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 64 | print("Done finding highly variable genes ...") 65 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 66 | 67 | adata = adata[:, 
adata.var.highly_variable] 68 | print("Done filtering in highly variable genes ...") 69 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 70 | 71 | sc.pp.scale(adata, max_value=10) 72 | print("Done scaling data ...") 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 74 | ### done with regress_out and scale ### 75 | 76 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 77 | print("Done performing PCA ...") 78 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 79 | 80 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 81 | print("Done computing neighborhood graph ...") 82 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 83 | 84 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 85 | print("Done UMAP ...") 86 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 87 | 88 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 89 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 90 | print("Done clustering using res = 1 ...") 91 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 92 | 93 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 94 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 95 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 96 | 97 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 98 | print("Done UMAP ...") 99 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 100 | 101 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 102 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 103 | 104 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 105 | 106 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 107 | print("Done writing data ...") 108 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 109 | 110 | emb = adata.obsm['X_pca'] 111 | print(emb.shape) 112 | 
np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 113 | 114 | -------------------------------------------------------------------------------- /Section_5_neuroectoderm/Embedding_early_neurons.py: -------------------------------------------------------------------------------- 1 | 2 | ###################################################################################################################### 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on early neurons subset ### 4 | ###################################################################################################################### 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del adata_1, adata_2, adata_3, adata_4 # drop the four per-file objects before the next allocations 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata # NOTE(review): assumes df_cell.csv rows are in the same order as the concatenated cells -- confirm 28 | 29 | celltype_include = ["GABAergic neurons", "Glutamatergic neurons", "Spinal cord dorsal progenitors", "Spinal cord ventral progenitors"] 30 | 31 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 32 | "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 33 | "E12.25", "E12.5", "E12.75"] 34 | 35 | example_id = "Neurons" 36 | print(example_id) 37 | 38 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)] # keep only the cell types listed above 39 | adata = adata[adata.obs["day"].isin(day_include)] # then keep only the listed time points 40 | 41 | 
adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 42 | 43 | print("Done reading data ...") 44 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 45 | 46 | sc.pp.normalize_total(adata, target_sum=1e4) 47 | print("Done normalization by total counts ...") 48 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 49 | 50 | sc.pp.log1p(adata) 51 | print("Done log transformation ...") 52 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 53 | 54 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 55 | print("Done finding highly variable genes ...") 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 57 | 58 | adata = adata[:, adata.var.highly_variable] 59 | print("Done filtering in highly variable genes ...") 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 61 | 62 | sc.pp.scale(adata, max_value=10) 63 | print("Done scaling data ...") 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 65 | ### done with regress_out and scale ### 66 | 67 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 68 | print("Done performing PCA ...") 69 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 70 | 71 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 72 | print("Done computing neighborhood graph ...") 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 74 | 75 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 76 | print("Done UMAP ...") 77 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 78 | 79 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 80 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 81 | print("Done clustering using res = 1 ...") 82 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 83 | 84 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 85 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 86 | 
adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 87 | 88 | sc.tl.umap(adata, min_dist=0.3, n_components=2) # second run replaces obsm['X_umap'] with a 2-D layout; the 3-D coordinates were already copied into obs 89 | print("Done UMAP ...") 90 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 91 | 92 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 93 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 94 | 95 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 96 | 97 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 98 | print("Done writing data ...") 99 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 100 | 101 | emb = adata.obsm['X_pca'] 102 | print(emb.shape) 103 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') # PCA coordinates written with three decimals 104 | 105 | -------------------------------------------------------------------------------- /Section_5_neuroectoderm/Embedding_neuroectoderm_derivatives.py: -------------------------------------------------------------------------------- 1 | 2 | ################################################################################################################################## 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on neuroectoderm derivatives subset ### 4 | ################################################################################################################################## 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del 
adata_1, adata_2, adata_3, adata_4 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata 28 | 29 | celltype_include = ["CNS_neurons", "Ependymal_cells", "Intermediate_neuronal_progenitors", 30 | "Neuroectoderm_and_glia", "Oligodendrocytes"] 31 | 32 | celltype_list_exclude = ["Amacrine cells", "Amacrine/Horizontal precursor cells", "Cholinergic amacrine cells", "Horizontal cells", "PV-containing retinal ganglion cells", "Retinal ganglion cells", "Ciliated nodal cells"] 33 | 34 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 35 | "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 36 | "E12.25", "E12.5", "E12.75"] 37 | 38 | example_id = "Neuroectoderm_derivative" 39 | print(example_id) 40 | 41 | adata = adata[adata.obs["major_trajectory"].isin(celltype_include)] 42 | adata = adata[adata.obs["day"].isin(day_include)] 43 | adata = adata[~adata.obs["celltype_update"].isin(celltype_list_exclude)] 44 | 45 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 46 | 47 | print("Done reading data ...") 48 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 49 | 50 | sc.pp.normalize_total(adata, target_sum=1e4) 51 | print("Done normalization by total counts ...") 52 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 53 | 54 | sc.pp.log1p(adata) 55 | print("Done log transformation ...") 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 57 | 58 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 59 | print("Done finding highly variable genes ...") 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 61 | 62 | adata = adata[:, adata.var.highly_variable] 63 | print("Done filtering in highly variable genes ...") 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 65 | 66 | sc.pp.scale(adata, max_value=10) 67 | 
print("Done scaling data ...") 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 69 | ### done with regress_out and scale ### 70 | 71 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 72 | print("Done performing PCA ...") 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 74 | 75 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 76 | print("Done computing neighborhood graph ...") 77 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 78 | 79 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 80 | print("Done UMAP ...") 81 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 82 | 83 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 84 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 85 | print("Done clustering using res = 1 ...") 86 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 87 | 88 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 89 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 90 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 91 | 92 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 93 | print("Done UMAP ...") 94 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 95 | 96 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 97 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 98 | 99 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 100 | 101 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 102 | print("Done writing data ...") 103 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 104 | 105 | emb = adata.obsm['X_pca'] 106 | print(emb.shape) 107 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 108 | 109 | -------------------------------------------------------------------------------- 
/Section_5_neuroectoderm/Embedding_patterned_neuroectoderm.py: -------------------------------------------------------------------------------- 1 | 2 | ################################################################################################################################ 3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on patterned neuroectoderm subset ### 4 | ################################################################################################################################ 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | 13 | start_time = time.time() 14 | 15 | WORK_PATH = './' 16 | 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 21 | 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 23 | del adata_1, adata_2, adata_3, adata_4 # drop the four per-file objects before the next allocations 24 | gc.collect() 25 | 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0) 27 | adata.obs = pdata # NOTE(review): assumes df_cell.csv rows are in the same order as the concatenated cells -- confirm 28 | 29 | celltype_include = ["Telencephalon", 30 | "Dorsal telencephalon", 31 | "Hypothalamus", 32 | "Diencephalon", 33 | "Midbrain", 34 | "Hypothalamus (Sim1+)", 35 | "Anterior floor plate", 36 | "Midbrain-hindbrain boundary", 37 | "Anterior roof plate", 38 | "Hindbrain", 39 | "Floorplate and p3 domain", 40 | "Spinal cord/r7/r8", 41 | "Posterior roof plate"] 42 | 43 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 44 | "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 45 | "E12.25", "E12.5", "E12.75"] 46 | 47 | example_id = "Neuroectoderm_backbone" 48 | print(example_id) 49 | 50 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)] # keep only the cell types listed above 51 | adata = adata[adata.obs["day"].isin(day_include)] # then keep only the listed time points 
52 | 53 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip") 54 | 55 | print("Done reading data ...") 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 57 | 58 | sc.pp.normalize_total(adata, target_sum=1e4) 59 | print("Done normalization by total counts ...") 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 61 | 62 | sc.pp.log1p(adata) 63 | print("Done log transformation ...") 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 65 | 66 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 67 | print("Done finding highly variable genes ...") 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 69 | 70 | adata = adata[:, adata.var.highly_variable] 71 | print("Done filtering in highly variable genes ...") 72 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 73 | 74 | sc.pp.scale(adata, max_value=10) 75 | print("Done scaling data ...") 76 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 77 | ### done with regress_out and scale ### 78 | 79 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 80 | print("Done performing PCA ...") 81 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 82 | 83 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 84 | print("Done computing neighborhood graph ...") 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 86 | 87 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 88 | print("Done UMAP ...") 89 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 90 | 91 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 92 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 93 | print("Done clustering using res = 1 ...") 94 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 95 | 96 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 97 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 98 | 
adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 99 | 100 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 101 | print("Done UMAP ...") 102 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 103 | 104 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 105 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 106 | 107 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id)) 108 | 109 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip") 110 | print("Done writing data ...") 111 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours') 112 | 113 | emb = adata.obsm['X_pca'] 114 | print(emb.shape) 115 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f') 116 | 117 | -------------------------------------------------------------------------------- /Section_5_neuroectoderm/step1_Patterned_neuroectoderm.R: -------------------------------------------------------------------------------- 1 | 2 | ################################## 3 | ### Section - 5, Neuroectoderm ### 4 | ################################## 5 | 6 | ############################################################### 7 | ### Making 2D UMAP visualization of patterned neuroectoderm ### 8 | ############################################################### 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "Neuroectoderm_backbone"; print(example_i) 15 | 16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds")) 17 | 18 | ### Fig. 
4a 19 | 20 | p = pd %>% 21 | ggplot() + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.3) + 24 | theme_void() + 25 | scale_color_manual(values=neuroectoderm_color_plate) + 26 | theme(legend.position="none") 27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300) # saved as its own call: recent ggplot2 releases refuse `plot + ggsave(...)` 28 | 29 | 30 | ############################################################################# 31 | ### Making 3D UMAP visualization of patterned neuroectoderm + derivatives ### 32 | ############################################################################# 33 | 34 | source("JAX_help_code.R") 35 | source("JAX_color_code.R") 36 | work_path = "./" 37 | 38 | example_i = "Neuroectoderm_derivative"; print(example_i) 39 | 40 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds")) 41 | 42 | ### Fig. 4b 43 | 44 | fig = plot_ly(pd[sample(1:nrow(pd), 250000),], x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~major_trajectory, colors = major_trajectory_color_plate) %>% 45 | layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2), 46 | yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2), 47 | zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2), 48 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 49 | saveWidget(fig, paste0(work_path, example_i, "_major_trajectory.html"), selfcontained = FALSE, libdir = "tmp") 50 | 51 | ### Fig. 
4c 52 | 53 | fig = plot_ly(pd[sample(1:nrow(pd), 250000),], x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~day, colors = day_color_plate) %>% 54 | layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2), 55 | yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2), 56 | zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2), 57 | camera = list(eye = list(x = -0.8, y = 2, z = 1.5)))) 58 | saveWidget(fig, paste0(work_path, example_i, "_day.html"), selfcontained = FALSE, libdir = "tmp") 59 | 60 | ### BACKUP ### 61 | 62 | celltype_list = c("Telencephalon", 63 | "Dorsal telencephalon", 64 | "Hypothalamus", 65 | "Diencephalon", 66 | "Midbrain", 67 | "Hypothalamus (Sim1+)", 68 | "Anterior floor plate", 69 | "Midbrain-hindbrain boundary", 70 | "Anterior roof plate", 71 | "Hindbrain", 72 | "Floorplate and p3 domain", 73 | "Spinal cord/r7/r8", 74 | "Posterior roof plate") 75 | 76 | day_list = c("E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 77 | "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 78 | "E12.25", "E12.5", "E12.75") 79 | 80 | emb = as.matrix(pd[,c("UMAP_1","UMAP_2","UMAP_3")]) 81 | 82 | dist_1 = list() 83 | for(day_i in day_list){ 84 | print(day_i) 85 | emb_x = emb[pd$day == day_i & pd$celltype_update %in% celltype_list,] 86 | if(nrow(emb_x) > 10000){ 87 | emb_x = emb_x[sample(1:nrow(emb_x), 10000),] 88 | } 89 | dist_1[[day_i]] = c(rdist(emb_x)) 90 | 91 | } 92 | 93 | dist_2 = list() 94 | for(day_i in day_list){ 95 | print(day_i) 96 | emb_x = emb[pd$day == day_i & !pd$celltype_update %in% celltype_list,] 97 | if(nrow(emb_x) > 10000){ 98 | emb_x = emb_x[sample(1:nrow(emb_x), 10000),] 99 | } 100 | dist_2[[day_i]] = c(rdist(emb_x)) 101 | } 102 | 103 | df = NULL 104 | for(day_i in day_list){ 105 | df = rbind(df, 106 | data.frame(day = day_i, 107 | dist = mean(dist_1[[day_i]]), 108 | group = "patterned_neuroectoderm", stringsAsFactors = FALSE)) 109 | df = rbind(df, 110 
| data.frame(day = day_i, 111 | dist = mean(dist_2[[day_i]]), 112 | group = "derived_cell_types", stringsAsFactors = FALSE)) 113 | } 114 | df$day = factor(df$day, levels = day_list) 115 | 116 | df$day = factor(df$day, levels = rev(day_list)) 117 | p = df %>% 118 | ggplot(aes(x=day, y=dist, color=group, group=group)) + 119 | geom_line() + 120 | geom_point() + 121 | scale_color_brewer(palette = "Set1") + 122 | theme_classic(base_size = 10) + 123 | theme(legend.position="none") + 124 | coord_flip() 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /Section_5_neuroectoderm/step2_Early_neurons.R: -------------------------------------------------------------------------------- 1 | 2 | ################################## 3 | ### Section - 5, Neuroectoderm ### 4 | ################################## 5 | 6 | ################################################# 7 | ### 2D UMAP of subclustering on early neurons ### 8 | ################################################# 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "Neurons"; print(example_i) 15 | 16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds")) 17 | 18 | ### Fig. 5e 19 | 20 | p = pd %>% 21 | ggplot() + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = neurons_sub_clustering), size=0.3) + 24 | theme_void() + 25 | scale_color_manual(values=neuroectoderm_color_plate) + 26 | theme(legend.position="none") + 27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), width = 6, height = 6, dpi = 300) 28 | 29 | ### Extended Data Fig. 
10c 30 | ### plots below are saved with an explicit plot = p (adding ggsave() to a plot with `+` fails in ggplot2 >= 3.3.4) 31 | day_list = names(day_color_plate) 32 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day]) 33 | p = pd %>% 34 | ggplot() + 35 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 36 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.3) + 37 | theme_void() + 38 | scale_color_manual(values=neuron_day_color_plate) + 39 | theme(legend.position="none") 40 | ggsave(paste0(work_path, example_i, ".day.2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300) 41 | 42 | 43 | #################################################### 44 | ### 2D UMAP of Intermediate neuronal progenitors ### 45 | #################################################### 46 | 47 | source("JAX_help_code.R") 48 | source("JAX_color_code.R") 49 | work_path = "./" 50 | 51 | example_i = "INP"; print(example_i) 52 | 53 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds")) 54 | 55 | ### Extended Data Fig. 10a 56 | 57 | p = pd %>% 58 | ggplot() + 59 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 60 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.3) + 61 | theme_void() + 62 | scale_color_brewer(palette = "Set2") + 63 | theme(legend.position="none") 64 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300) 65 | 66 | day_list = names(day_color_plate) 67 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day]) 68 | p = pd %>% 69 | ggplot() + 70 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 71 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.3) + 72 | theme_void() + 73 | scale_color_manual(values=neuron_day_color_plate) + 74 | theme(legend.position="none") 75 | ggsave(paste0(work_path, example_i, ".day.2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300) 76 | 77 | 78 | #################################################################################################### 79 | ### Composition of embryos from each 6-hr bin by intermediate 
neuronal progenitor and CNS neuron ### 80 | #################################################################################################### 81 | 82 | source("JAX_help_code.R") 83 | source("JAX_color_code.R") 84 | work_path = "./" 85 | 86 | pd = readRDS(paste0(work_path, "df_cell.rds")) 87 | day_list = names(day_color_plate) 88 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day]) 89 | 90 | pd_1 = pd[pd$major_trajectory == "CNS_neurons",] 91 | pd_2 = pd[pd$major_trajectory == "Intermediate_neuronal_progenitors",] 92 | 93 | x1 = pd_1 %>% group_by(day) %>% tally() %>% 94 | left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>% 95 | mutate(frac = 100*n/total_n) 96 | 97 | x2 = pd_2 %>% group_by(day) %>% tally() %>% 98 | left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>% 99 | mutate(frac = 100*n/total_n) 100 | 101 | x = x1 %>% select(day, frac) %>% rename(direct_frac = frac) %>% left_join(x2 %>% select(day, frac) %>% rename(indirect_frac = frac), by = "day") 102 | x$indirect_frac[is.na(x$indirect_frac)] = 0 103 | x = data.frame(day = rep(x$day, 2), 104 | frac = c(x$direct_frac, x$indirect_frac), 105 | major_trajectory = rep(c("CNS_neurons","Intermediate_neuronal_progenitors"), each = nrow(x))) 106 | 107 | ### Fig. 4d 108 | 109 | p = x %>% 110 | ggplot(aes(x=day, y=frac, fill = day)) + 111 | geom_bar(stat='identity') + facet_grid(rows = vars(major_trajectory)) + 112 | labs(x='',y='% of cells') + 113 | scale_fill_manual(values=day_color_plate) + 114 | theme_classic(base_size = 10) + 115 | theme(legend.position="none") + 116 | theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black")) 117 | 118 | 119 | ############################################################# 120 | ### For each interneuron, what are the top expressed TFs? 
### 121 | ############################################################# 122 | 123 | source("JAX_help_code.R") 124 | source("JAX_color_code.R") 125 | work_path = "./" 126 | 127 | dat = readRDS(paste0(work_path, "Neurons_heatmap_dat.rds")) 128 | 129 | Colors=rev(brewer.pal(11,"Spectral")) 130 | Colors=colorRampPalette(Colors)(120) 131 | pdf(paste0(work_path, "Neurons_heatmap.pdf"), 8, 5) 132 | heatmap.2(as.matrix(t(dat)), 133 | col=Colors, 134 | scale="col", 135 | Rowv = F, 136 | Colv = F, 137 | key=T, 138 | density.info="none", 139 | trace="none", 140 | cexRow = 1, 141 | cexCol = 1, 142 | margins = c(5,5)) 143 | dev.off() 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /Section_5_neuroectoderm/step5_Astrocytes.R: -------------------------------------------------------------------------------- 1 | 2 | ################################## 3 | ### Section - 5, Neuroectoderm ### 4 | ################################## 5 | 6 | ############################################## 7 | ### Analyzing astrocytes from stages < E13 ### 8 | ############################################## 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | example_i = "Astrocytes"; print(example_i) 15 | 16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds")) 17 | 18 | ### Extended Data Fig. 
10g 19 | ### the plot is saved with an explicit plot = p (adding ggsave() to a plot with `+` fails in ggplot2 >= 3.3.4) 20 | p = pd %>% 21 | ggplot() + 22 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) + 23 | geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_sub_clustering), size=0.3) + 24 | theme_void() + 25 | scale_color_manual(values=astrocytes_color_plate) + 26 | theme(legend.position="none") 27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300) 28 | 29 | 30 | 31 | ############################################################## 32 | ### Compositions changing over time for different subtypes ### 33 | ############################################################## 34 | 35 | source("JAX_help_code.R") 36 | source("JAX_color_code.R") 37 | work_path = "./" 38 | 39 | example_i = "Astrocytes"; print(example_i) 40 | 41 | pd = readRDS(paste0(work_path, "df_cell.rds")) 42 | pd_sub = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds")) 43 | 44 | day_list = names(day_color_plate) 45 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day]) 46 | pd_1 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "VA1 astrocytes"],] 47 | pd_2 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "VA2 astrocytes"],] 48 | pd_3 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "VA3 astrocytes"],] 49 | pd_4 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "Anterior astrocytes"],] 50 | 51 | x1 = pd_1 %>% group_by(day) %>% tally() %>% 52 | left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>% 53 | mutate(frac = 100*n/total_n) 54 | 55 | x2 = pd_2 %>% group_by(day) %>% tally() %>% 56 | left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>% 57 | mutate(frac = 100*n/total_n) 58 | 59 | x3 = pd_3 %>% group_by(day) %>% tally() %>% 60 | left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>% 61 | mutate(frac = 100*n/total_n) 62 | 63 | x4 = pd_4 %>% group_by(day) %>% 
tally() %>% 64 | left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>% 65 | mutate(frac = 100*n/total_n) 66 | 67 | x = x2 %>% select(day, frac) %>% rename(VA2 = frac) %>% 68 | left_join(x1 %>% select(day, frac) %>% rename(VA1 = frac), by = "day") %>% 69 | left_join(x3 %>% select(day, frac) %>% rename(VA3 = frac), by = "day") %>% 70 | left_join(x4 %>% select(day, frac) %>% rename(AA = frac), by = "day") 71 | x$VA1[is.na(x$VA1)] = 0 72 | x$VA2[is.na(x$VA2)] = 0 73 | x$VA3[is.na(x$VA3)] = 0 74 | x$AA[is.na(x$AA)] = 0 75 | x = data.frame(day = rep(x$day, 4), 76 | frac = c(x$VA1, x$VA2, x$VA3, x$AA), 77 | major_trajectory = rep(c("VA1","VA2","VA3","AA"), each = nrow(x))) 78 | 79 | x$major_trajectory = factor(x$major_trajectory, levels = c("VA1","VA2","VA3","AA")) 80 | 81 | ### Extended Data Fig. 10h 82 | 83 | p = x %>% 84 | ggplot(aes(x=day, y=frac, fill = day)) + 85 | geom_bar(stat='identity') + facet_grid(rows = vars(major_trajectory)) + 86 | labs(x='',y='% of cells') + 87 | scale_fill_manual(values=day_color_plate_2) + 88 | theme_classic(base_size = 10) + 89 | theme(legend.position="none") + 90 | theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black")) 91 | 92 | 93 | 94 | ############################################################################### 95 | ### Mapping different subtypes of astrocytes to their potential progenitors ### 96 | ############################################################################### 97 | 98 | source("JAX_help_code.R") 99 | source("JAX_color_code.R") 100 | work_path = "./" 101 | 102 | example_i = "Neuroectoderm_derivative" 103 | name = "Astrocytes" 104 | 105 | pd_sub = readRDS(paste0(work_path, name, "_adata_scale.obs.rds")) 106 | 107 | ### this result was calculated by step4_Mapping_neuroectoderm_derivatives.R 108 | dat = readRDS(paste0(work_path, example_i, ".MNN_pairs.rds")) 109 | 110 | pd_back = readRDS(paste0(work_path, 
"Neuroectoderm_backbone_adata_scale.obs.rds")) 111 | rownames(pd_back) = as.vector(pd_back$cell_id) 112 | 113 | celltype_sub_clustering_list = paste0("VA", c(1:3), " astrocytes") 114 | 115 | ### Extended Data Fig. 10j 116 | 117 | for(i in celltype_sub_clustering_list){ 118 | print(i) 119 | pd_sub_i = pd_sub %>% filter(celltype_sub_clustering == i) %>% pull(cell_id) 120 | dat_sub = dat %>% filter(A %in% pd_sub_i) %>% group_by(B) %>% tally() %>% rename(cell_id = B, freq = n) 121 | df = pd_back %>% select(UMAP_1 = UMAP_2d_1, UMAP_2 = UMAP_2d_2, cell_id, day) %>% left_join(dat_sub, by = "cell_id") 122 | df$freq[is.na(df$freq)] = 0 123 | 124 | name_i = gsub("/", "_", i) 125 | name_i = gsub(" ", "_", name_i) 126 | ### build the plot first, then save it explicitly; the grey background sample is capped at nrow(df) so small inputs still plot 127 | try({p = ggplot() + 128 | geom_point(data = df[sample(1:nrow(df), min(nrow(df), 100000)),], aes(x = UMAP_1, y = UMAP_2), size=0.5, color = "grey80") + 129 | geom_point(data = df[df$freq != 0,], aes(x = UMAP_1, y = UMAP_2, color = freq), size=0.5) + 130 | theme_void() + 131 | scale_color_viridis() + 132 | theme(legend.position="none") 133 | ggsave(paste0(work_path, name_i, ".png"), plot = p, width = 8, height = 6, dpi = 300)}, silent = T) 134 | 135 | } 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /Section_6_development_tree/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/Section_6_development_tree/.DS_Store -------------------------------------------------------------------------------- /Section_6_development_tree/Dimension_reduction_subsystem.py: -------------------------------------------------------------------------------- 1 | 2 | ########################################################################################## 3 | ### First, we manually split all the cell types, from the organogenesis & fetal development, into 12 systems, 4 | ### to perform dimension reduction using Scanpy, 
followed by identifying the kNNs across cells using annoy in Python 5 | ########################################################################################## 6 | 7 | 8 | import scanpy as sc 9 | import pandas as pd 10 | import numpy as np 11 | import os, sys 12 | import time 13 | import gc 14 | from annoy import AnnoyIndex 15 | 16 | start_time = time.time() 17 | 18 | WORK_PATH = './' 19 | 20 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 21 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 22 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 23 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 24 | 25 | adata_orig = adata_1.concatenate(adata_2, adata_3, adata_4) 26 | del adata_1, adata_2, adata_3, adata_4 27 | gc.collect() 28 | 29 | ############################################## 30 | ### Of note, please read df_cell_graph.rds and then write it to df_cell_graph.csv in R 31 | 32 | ### >>> dat = readRDS("df_cell_graph.rds") 33 | ### >>> rownames(dat) = as.vector(dat$cell_id) 34 | ### >>> write.csv(dat, "df_cell_graph.csv") 35 | 36 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_graph.csv'), index_col = 0) 37 | adata_orig.obs = pdata 38 | 39 | system_list = ["Endothelium", 40 | "Epithelial_cells", 41 | "Eye", 42 | "Gut", 43 | "Notochord", 44 | "PNS_glia", 45 | "PNS_neurons", 46 | "Renal", 47 | "Lateral_plate_mesoderm", 48 | "Blood", 49 | "Brain_spinal_cord", 50 | "Mesoderm"] 51 | 52 | for system_i in trajectory_list: 53 | 54 | print("Processing: %s"%system_i) 55 | 56 | adata = adata_orig[adata_orig.obs["system"] == system_i] 57 | print(adata.shape) 58 | 59 | sc.pp.normalize_total(adata, target_sum=1e4) 60 | sc.pp.log1p(adata) 61 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 62 | adata = adata[:, adata.var.highly_variable] 63 | sc.pp.scale(adata, max_value=10) 64 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 65 | 
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 66 | 67 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 68 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 69 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 70 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 71 | 72 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 73 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 74 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 75 | 76 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 77 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 78 | 79 | sc.tl.leiden(adata, resolution=2, n_iterations=2) 80 | adata.obs['leiden_res_2'] = adata.obs['leiden'] 81 | 82 | sc.tl.leiden(adata, resolution=5, n_iterations=2) 83 | adata.obs['leiden_res_5'] = adata.obs['leiden'] 84 | 85 | sc.tl.leiden(adata, resolution=10, n_iterations=2) 86 | adata.obs['leiden_res_10'] = adata.obs['leiden'] 87 | 88 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%system_i)) 89 | 90 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%system_i), compression="gzip") 91 | 92 | X = adata.obsm['X_pca'] 93 | print(X.shape) 94 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), X, delimiter=",", fmt='%1.3f') 95 | 96 | ### calculating kNN using annoy, this is much faster than using R 97 | 98 | dist_metric = 'euclidean' 99 | k = 15 100 | ### Here, why we use 15? because the log2(mean cell number across cell types) is around 15. 
101 | 102 | npc = X.shape[1] 103 | ncell = X.shape[0] 104 | annoy_index = AnnoyIndex(npc, metric=dist_metric) 105 | 106 | for i in range(ncell): 107 | annoy_index.add_item(i, list(X[i,:])) 108 | annoy_index.build(15) ### bigger number will make the result more accurate 109 | 110 | knn = [] 111 | for iCell in range(ncell): 112 | knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) 113 | knn = np.array(knn, dtype=int) 114 | 115 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%system_i), knn, delimiter=",", fmt='%s') 116 | 117 | 118 | 119 | ###################################################### 120 | ### Second, we found the neuroectoderm is too complex, so we subset the patterned neuroectoderm cells to perform embedding 121 | 122 | 123 | import scanpy as sc 124 | import pandas as pd 125 | import numpy as np 126 | import os, sys 127 | import time 128 | import gc 129 | from annoy import AnnoyIndex 130 | 131 | start_time = time.time() 132 | 133 | WORK_PATH = './' 134 | 135 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 136 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 137 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 138 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 139 | 140 | adata_orig = adata_1.concatenate(adata_2, adata_3, adata_4) 141 | del adata_1, adata_2, adata_3, adata_4 142 | gc.collect() 143 | 144 | ############################################## 145 | ### Of note, please read df_cell_graph.rds and then write it to df_cell_graph.csv in R 146 | 147 | ### >>> dat = readRDS("df_cell_graph.rds") 148 | ### >>> rownames(dat) = as.vector(dat$cell_id) 149 | ### >>> write.csv(dat, "df_cell_graph.csv") 150 | 151 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_graph.csv'), index_col = 0) 152 | adata_orig.obs = pdata 153 | 154 | patterned_neuroectoderm = ["Anterior floor plate", 155 | "Diencephalon", 156 | "Floorplate 
and p3 domain", 157 | "Hypothalamus", 158 | "Midbrain", 159 | "Posterior roof plate", 160 | "Telencephalon", 161 | "Anterior roof plate", 162 | "Dorsal telencephalon", 163 | "Hindbrain", 164 | "Hypothalamus (Sim1+)", 165 | "Midbrain-hindbrain boundary", 166 | "Spinal cord/r7/r8"] 167 | 168 | system_i = "Neuroectoderm" 169 | print("Processing: %s"%system_i) 170 | 171 | adata = adata_orig[adata_orig.obs["celltype_update"].isin(patterned_neuroectoderm)] 172 | print(adata.shape) 173 | 174 | sc.pp.normalize_total(adata, target_sum=1e4) 175 | sc.pp.log1p(adata) 176 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 177 | adata = adata[:, adata.var.highly_variable] 178 | sc.pp.scale(adata, max_value=10) 179 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 180 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 181 | 182 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 183 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 184 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 185 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 186 | 187 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 188 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 189 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 190 | 191 | sc.tl.leiden(adata, resolution=1, n_iterations=2) 192 | adata.obs['leiden_res_1'] = adata.obs['leiden'] 193 | 194 | sc.tl.leiden(adata, resolution=2, n_iterations=2) 195 | adata.obs['leiden_res_2'] = adata.obs['leiden'] 196 | 197 | sc.tl.leiden(adata, resolution=5, n_iterations=2) 198 | adata.obs['leiden_res_5'] = adata.obs['leiden'] 199 | 200 | sc.tl.leiden(adata, resolution=10, n_iterations=2) 201 | adata.obs['leiden_res_10'] = adata.obs['leiden'] 202 | 203 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%system_i)) 204 | 205 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%system_i), compression="gzip") 206 | 207 | X = adata.obsm['X_pca'] 208 | print(X.shape) 209 | 
np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), X, delimiter=",", fmt='%1.3f') 210 | 211 | ### calculating kNN using annoy, this is much faster than using R 212 | 213 | dist_metric = 'euclidean' 214 | k = 15 215 | ### Here, why we use 15? because the log2(mean cell number across cell types) is around 15. 216 | 217 | npc = X.shape[1] 218 | ncell = X.shape[0] 219 | annoy_index = AnnoyIndex(npc, metric=dist_metric) 220 | 221 | for i in range(ncell): 222 | annoy_index.add_item(i, list(X[i,:])) 223 | annoy_index.build(15) ### bigger number will make the result more accurate 224 | 225 | knn = [] 226 | for iCell in range(ncell): 227 | knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) 228 | knn = np.array(knn, dtype=int) 229 | 230 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%system_i), knn, delimiter=",", fmt='%s') 231 | 232 | 233 | -------------------------------------------------------------------------------- /Section_6_development_tree/Graph_robust.py: -------------------------------------------------------------------------------- 1 | 2 | ################################################################################ 3 | ### Second, to assess the robustness of MNNs to cell sampling, we randomly subsampled 80% 4 | ### of cells from each developmental system during organogenesis & fetal development 5 | 6 | import scanpy as sc 7 | import pandas as pd 8 | import numpy as np 9 | import os, sys 10 | import time 11 | import gc 12 | from annoy import AnnoyIndex 13 | 14 | system_list = ["Endothelium", 15 | "Epithelial_cells", 16 | "Eye", 17 | "Gut", 18 | "Notochord", 19 | "PNS_glia", 20 | "PNS_neurons", 21 | "Renal", 22 | "Lateral_plate_mesoderm", 23 | "Blood", 24 | "Brain_spinal_cord", 25 | "Mesoderm", 26 | "Neuroectoderm"] 27 | 28 | WORK_PATH = "./" 29 | 30 | for system_i in system_list: 31 | 32 | ### PC features were calculated by Dimension_reduction_subsystem.py 33 | 34 | X = pd.read_csv(os.path.join(WORK_PATH, 
'%s_adata_scale.PCs.csv'%system_i), index_col = False, header=None) 35 | X = pd.DataFrame.to_numpy(X) 36 | 37 | original_size = X.shape[0] 38 | subset_size = int(original_size * 0.8) 39 | npc = X.shape[1] 40 | 41 | dist_metric = 'euclidean' 42 | k = 15 43 | ### Here, why we use 15? because the log2(mean cell number across cell types) is around 15. 44 | 45 | for cnt in range(100): 46 | 47 | idx = np.random.choice(original_size, size=subset_size, replace=False) 48 | X_sub = X[idx,:] 49 | 50 | ncell = X_sub.shape[0] 51 | annoy_index = AnnoyIndex(npc, metric=dist_metric) 52 | 53 | for i in range(ncell): 54 | annoy_index.add_item(i, list(X_sub[i,:])) 55 | annoy_index.build(15) ### bigger number will make the result more accurate 56 | 57 | knn = [] 58 | for iCell in range(ncell): 59 | knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) 60 | knn = np.array(knn, dtype=int) 61 | 62 | np.savetxt(os.path.join(WORK_PATH, '%s_knn_%s.csv'%(system_i, str(cnt+1))), knn, delimiter=",", fmt='%s') 63 | np.savetxt(os.path.join(WORK_PATH, '%s_idx_%s.csv'%(system_i, str(cnt+1))), idx, delimiter=",", fmt='%s') 64 | 65 | 66 | 67 | 68 | ################################################################################## 69 | ### Third, to determine the effect of k parameter choice on the MNNs identified 70 | ### between cell types, we examined different k values (k = 5, 10, 20, 30, 40, 50) 71 | ################################################################################## 72 | 73 | 74 | import scanpy as sc 75 | import pandas as pd 76 | import numpy as np 77 | import os, sys 78 | import time 79 | import gc 80 | from annoy import AnnoyIndex 81 | 82 | system_list = ["Endothelium", 83 | "Epithelial_cells", 84 | "Eye", 85 | "Gut", 86 | "Notochord", 87 | "PNS_glia", 88 | "PNS_neurons", 89 | "Renal", 90 | "Lateral_plate_mesoderm", 91 | "Blood", 92 | "Brain_spinal_cord", 93 | "Mesoderm", 94 | "Neuroectoderm"] 95 | 96 | WORK_PATH = "./" 97 | 98 | for system_i in system_list: 99 | 100 | ### 
PC features were calculated by Dimension_reduction_subsystem.py 101 | 102 | X = pd.read_csv(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), index_col = False, header=None) 103 | X = pd.DataFrame.to_numpy(X) 104 | 105 | ncell = X.shape[0] 106 | npc = X.shape[1] 107 | dist_metric = 'euclidean' 108 | 109 | annoy_index = AnnoyIndex(npc, metric=dist_metric) 110 | 111 | for i in range(ncell): 112 | annoy_index.add_item(i, list(X[i,:])) 113 | annoy_index.build(15) ### bigger number will make the result more accurate 114 | 115 | for k in [5,10,20,30,40,50]: 116 | 117 | knn = [] 118 | for iCell in range(ncell): 119 | knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) 120 | knn = np.array(knn, dtype=int) 121 | ### NOTE(review): the subsampling loop earlier in this script writes '%s_knn_%s.csv' with indices 1..100, so k = 5, 10, 20, 30, 40, 50 overwrite those files when both parts share WORK_PATH - confirm, or run them in separate directories 122 | np.savetxt(os.path.join(WORK_PATH, '%s_knn_%s.csv'%(system_i, str(k))), knn, delimiter=",", fmt='%s') 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /Section_6_development_tree/Two_examples.py: -------------------------------------------------------------------------------- 1 | 2 | import scanpy as sc 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | import sys 7 | from annoy import AnnoyIndex 8 | 9 | WORK_PATH = './' 10 | 11 | example_list = ["suppressor_cells", "lung"] 12 | 13 | for example_i in example_list: 14 | print(example_i) 15 | 16 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_i)) 17 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0) 18 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_i), index_col = 0) 19 | adata.obs = pdata 20 | adata.var = fdata 21 | 22 | print(adata.shape) 23 | 24 | sc.pp.normalize_total(adata, target_sum=1e4) 25 | sc.pp.log1p(adata) 26 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 27 | adata = adata[:, adata.var.highly_variable] 28 | sc.pp.scale(adata, 
max_value=10) 29 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 30 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 31 | 32 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 33 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 34 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 35 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 36 | 37 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 38 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 39 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 40 | 41 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_i)) 42 | 43 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_i), compression="gzip") 44 | 45 | X = adata.obsm['X_pca'] 46 | print(X.shape) 47 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_i), X, delimiter=",", fmt='%1.3f') 48 | 49 | dist_metric = 'euclidean' 50 | k = 15 51 | ### Here, why we use 15? because the log2(mean cell number across cell types) is around 15. 
52 | 53 | npc = X.shape[1] 54 | ncell = X.shape[0] 55 | annoy_index = AnnoyIndex(npc, metric=dist_metric) 56 | 57 | for i in range(ncell): 58 | annoy_index.add_item(i, list(X[i,:])) 59 | annoy_index.build(15) ### bigger number will make the result more accurate 60 | 61 | knn = [] 62 | for iCell in range(ncell): 63 | knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) 64 | knn = np.array(knn, dtype=int) 65 | 66 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%example_i), knn, delimiter=",", fmt='%s') 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Section_6_development_tree/step2_Late_stage_graph.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 6, Development tree ### 4 | ##################################### 5 | 6 | ######################################################################################################### 7 | ### First, we manually split all the cell types, from the organogenesis & fetal development, into 12 systems, 8 | ### to perform dimension reducting using Scanpy, followed by identifying the kNNs across cells using annoy in Python 9 | 10 | ### For Brain_spinal_cord, we further split patterned neuroectoderm ("Neuroectoderm") to perform embedding. 
11 | 12 | ### Python Dimension_reduction_subsystem.py 13 | 14 | source("JAX_help_code.R") 15 | source("JAX_color_code.R") 16 | work_path = "./" 17 | 18 | pd_all = readRDS(paste0(work_path, "df_cell_graph.rds")) 19 | rownames(pd_all) = as.vector(pd_all$cell_id) 20 | 21 | nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t") 22 | 23 | system_list = c("Endothelium", 24 | "Epithelial_cells", 25 | "Eye", 26 | "Gut", 27 | "Notochord", 28 | "PNS_glia", 29 | "PNS_neurons", 30 | "Renal", 31 | "Lateral_plate_mesoderm", 32 | "Blood", 33 | "Neuroectoderm", 34 | "Brain_spinal_cord", 35 | "Mesoderm") 36 | 37 | patterned_neuroectoderm = c("Anterior floor plate", 38 | "Diencephalon", 39 | "Floorplate and p3 domain", 40 | "Hypothalamus", 41 | "Midbrain", 42 | "Posterior roof plate", 43 | "Telencephalon", 44 | "Anterior roof plate", 45 | "Dorsal telencephalon", 46 | "Hindbrain", 47 | "Hypothalamus (Sim1+)", 48 | "Midbrain-hindbrain boundary", 49 | "Spinal cord/r7/r8") 50 | 51 | for(kk in 1:length(system_list)){ 52 | 53 | system_i = system_list[kk] 54 | print(system_i) 55 | 56 | ### After you running the "Python Dimension_reduction_subsystem.py", you will get this files 57 | pd = read.csv(paste0(work_path, system_i, "_adata_scale.obs.csv"), as.is=T, row.names = 1) 58 | rownames(pd) = as.vector(pd$cell_id) 59 | pd = pd %>% left_join(pd_all[,c("cell_id","celltype_new","system","meta_group")], by = "cell_id") %>% as.data.frame() 60 | 61 | if(system_i == "Neuroectoderm"){ 62 | pd$system = "Neuroectoderm" 63 | } 64 | 65 | ### create or read MNN pairs between individual cells 66 | 67 | nn_matrix = read.csv(paste0(work_path, system_i, "_adata_scale.kNN_15.csv"), as.is=T, header=F) 68 | nn_matrix = as.matrix(nn_matrix) 69 | nn_matrix = nn_matrix + 1 ### python and R using different start index 70 | 71 | ### extracting MNN pairs 72 | ### only retaining those edges which are considered twice (A -> B, B -> A) 73 | 74 | x = data.frame(i = rep(1:nrow(nn_matrix), 
ncol(nn_matrix)), 75 | j = c(nn_matrix), stringsAsFactors = FALSE) 76 | 77 | dat = Matrix::sparseMatrix(i = as.numeric(as.vector(x$i)), 78 | j = as.numeric(as.vector(x$j)), 79 | x = 1) 80 | 81 | dat_t = t(dat) + dat 82 | x = data.frame(summary(dat_t)) 83 | x = x[x$x == 2 & x$i > x$j,] 84 | x$x = NULL 85 | 86 | ### x saves the MNN pairs 87 | saveRDS(x, paste0(work_path, system_i, ".MNN.rds")) 88 | 89 | y = data.frame(i = 1:nrow(pd), 90 | j = 1:nrow(pd), 91 | meta_group = as.vector(pd$meta_group), stringsAsFactors = FALSE) 92 | 93 | dat = x %>% left_join(y %>% select(i, meta_group), by = "i") %>% 94 | left_join(y %>% select(j, meta_group), by = "j") %>% 95 | group_by(meta_group.x, meta_group.y) %>% tally() 96 | 97 | obs = dcast(dat, meta_group.x~meta_group.y) 98 | rownames(obs) = as.vector(obs[,1]) 99 | obs = obs[,-1] 100 | obs[is.na(obs)] = 0 101 | obs = as.matrix(obs) 102 | 103 | diag(obs) = 0 104 | 105 | obs_x = obs + t(obs) 106 | obs_y = as.vector(obs_x[upper.tri(obs_x)]) 107 | 108 | group = NULL 109 | for(i in 2:nrow(obs_x)){ 110 | print(i) 111 | for(j in 1:(i-1)){ 112 | group = rbind(group, data.frame(system = system_i, 113 | x = colnames(obs_x)[j], 114 | y = rownames(obs_x)[i], stringsAsFactors = F)) 115 | } 116 | } 117 | 118 | group$edge_num = obs_y 119 | 120 | group = group %>% 121 | left_join(nodes %>% rename(x = meta_group) %>% select(x, celltype_new, celltype_num), by = "x") %>% rename(x_name = celltype_new, x_size = celltype_num) %>% 122 | left_join(nodes %>% rename(y = meta_group) %>% select(y, celltype_new, celltype_num), by = "y") %>% rename(y_name = celltype_new, y_size = celltype_num) 123 | 124 | group$min_size = if_else(group$x_size < group$y_size, group$x_size, group$y_size) 125 | group$edge_num_norm = group$edge_num/log2(15*group$min_size) 126 | group$min_size = NULL 127 | 128 | saveRDS(group, paste0(work_path, system_i, ".edges_new.rds")) 129 | 130 | ### output MNN pairs for manually reviewing 131 | 132 | edges = group 133 | 134 | edges_2 = 
edges 135 | edges_2$x = as.vector(edges$y); edges_2$y = as.vector(edges$x) 136 | edges_2$x_size = as.vector(edges$y_size); edges_2$y_size = as.vector(edges$x_size) 137 | edges_2$x_name = as.vector(edges$y_name); edges_2$y_name = as.vector(edges$x_name) 138 | 139 | dat = rbind(edges, edges_2) %>% as.data.frame() %>% 140 | filter(edge_num != 0) %>% rename(MNN_pairs = edge_num, MNN_pairs_normalized = edge_num_norm) %>% 141 | group_by(x) %>% arrange(desc(MNN_pairs), .by_group = T) %>% 142 | as.data.frame() 143 | 144 | if(system_i == "Brain_spinal_cord"){ 145 | tmp = read.table(paste0(work_path, "Neuroectoderm", ".MNN_pairs.txt"),header=T,as.is=T,sep="\t") 146 | dat = dat[!dat$x %in% c(tmp$x, tmp$y) | !dat$y %in% c(tmp$x, tmp$y), ] 147 | } 148 | 149 | write.table(dat, paste0(work_path, system_i, ".MNN_pairs.txt"), row.names=F, sep="\t", quote=F) 150 | 151 | } 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /Section_6_development_tree/step3_Create_graph.R: -------------------------------------------------------------------------------- 1 | 2 | ##################################### 3 | ### Section - 6, Development tree ### 4 | ##################################### 5 | 6 | ################################### 7 | ### Summary the edges and nodes ### 8 | ################################### 9 | 10 | source("JAX_help_code.R") 11 | source("JAX_color_code.R") 12 | work_path = "./" 13 | 14 | nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t") 15 | 16 | ### now we merged edges which have been manually reviewed. 

### The three manually reviewed edge lists share one tab-separated, header-less layout:
###   edges_1: edges from pre-gastrulation and gastrulation stages
###   edges_2: edges from organogenesis & fetal development
###   edges_3: edges which are manually added to connect blood and PNS-neuron
read_edge_file = function(file_name){
    read.table(paste0(work_path, file_name), header=F, as.is=T, sep="\t")
}

edges_1 = read_edge_file("edges_1.txt")
edges_2 = read_edge_file("edges_2.txt")
edges_3 = read_edge_file("edges_3.txt")

edges = rbind(edges_1, edges_2, edges_3)
colnames(edges) = c("system", "x", "y", "x_name", "y_name", "edge_type")

### how many distinct nodes are touched by at least one edge
length(unique(c(edges$x, edges$y)))

write.table(edges, paste0(work_path, "edges.txt"), row.names=F, sep="\t", quote=F)

### To better visualize the result, we took out the spatial continuity edges,
### and also collapse redundant nodes.
### Every edge touching BS_M37 / BS_M39 is appended back regardless of its type.
nodes_rescued = c("BS_M37", "BS_M39")

is_spatial = edges$edge_type == "Spatial continuity"
edges_sub = edges[!is_spatial,]
length(unique(c(edges_sub$x, edges_sub$y)))

touches_rescued = edges$x %in% nodes_rescued | edges$y %in% nodes_rescued
edges_sub = rbind(edges_sub, edges[touches_rescued,])
length(unique(c(edges_sub$x, edges_sub$y)))

write.table(edges_sub, paste0(work_path, "edges_sub.txt"), row.names=F, sep="\t", quote=F)

### removing redundant nodes
### x_y keeps the ORIGINAL node pair of each edge; it is used below as the edge key
### even after x has been re-pointed.
edges_sub$x_y = paste0(edges_sub$x, ":", edges_sub$y)

### edges linking two nodes that carry the same cell type name, per early system
same_name = edges_sub$x_name == edges_sub$y_name
edges_x_1 = edges_sub[same_name & edges_sub$system == "Pre_gastrulation",]
edges_x_2 = edges_sub[same_name & edges_sub$system == "Gastrulation_E8.5b",]

### edges that start from the downstream copy of such a duplicated node ...
edges_x_3 = edges_sub[edges_sub$x %in% as.vector(edges_x_1$y),]
edges_x_4 = edges_sub[edges_sub$x %in% as.vector(edges_x_2$y),]

### ... are re-pointed so that they start from the upstream copy instead
upstream_1 = edges_x_1 %>% select(x,y) %>% rename(new_x = x, x=y)
edges_x_3_ = left_join(edges_x_3, upstream_1, by = "x")
edges_x_3$x = as.vector(edges_x_3_$new_x)

upstream_2 = edges_x_2 %>% select(x,y) %>% rename(new_x = x, x=y)
edges_x_4_ = left_join(edges_x_4, upstream_2, by = "x")
edges_x_4$x = as.vector(edges_x_4_$new_x)

### everything that was neither a same-name edge nor re-pointed is kept as it is
edges_handled = c(edges_x_1$x_y, edges_x_2$x_y, edges_x_3$x_y, edges_x_4$x_y)
edges_x_5 = edges_sub[!edges_sub$x_y %in% edges_handled,]

edges_x = rbind(edges_x_3, edges_x_4, edges_x_5)

### report, then drop, the remaining edges whose two ends share one name
still_same_name = edges_x$x_name == edges_x$y_name
print(edges_x[still_same_name,])
edges_x = edges_x[!still_same_name,]

### report the name pairs that are represented by more than one edge
edges_x$x_y_name = paste0(edges_x$x_name, ":", edges_x$y_name)
x_table = table(edges_x$x_y_name)
tmp = edges_x[edges_x$x_y_name %in% names(x_table)[x_table != 1],]
print(tmp[order(tmp$x_name),])

### hard-coded list of edge keys (original x:y) that are removed from the collapsed graph
redundant_edges = c("En_M5:En_M1", "Ga_M5:Ga_M6", "L_M7:L_M3", "En_M7:En_M5", "Ga_M23:En_M5", "BS_M20:BS_M2", "Ga_M17:En_M7")
edges_x = edges_x[!edges_x$x_y %in% redundant_edges,]
print(length(unique(c(edges_x$x, edges_x$y))))
print(length(unique(c(edges_x$x_name, edges_x$y_name))))

### NOTE: this overwrites the edges_sub.txt written above with the collapsed version
write.table(edges_x, paste0(work_path, "edges_sub.txt"), row.names=F, sep="\t", quote=F)

nodes_sub = nodes[nodes$meta_group %in% c(edges_x$x, edges_x$y),]
write.table(nodes_sub, paste0(work_path, "nodes_sub.txt"), row.names=F, sep="\t", quote=F)


##############################################################
### making Histogram for accepted edges and rejected edges ###
##############################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

dat = read.table(paste0(work_path, "edges_MNNs.txt"), header=T, sep="\t", as.is=T)
dat = dat[dat$MNN_pairs_normalized > 1,]

### accepted edges carry one of the two edge types in Comments; rejected ones are marked x / X
dat_1 = dat[dat$Comments %in% c("Developmental progression", "Spatial continuity"),]
dat_2 = dat[dat$Comments %in% c("x","X"),]

### each rejected pair may be listed in both directions (x:y and y:x); keep only the first one seen
dat_uniq = NULL
x_uniq = NULL
for(i in 1:nrow(dat_2)){
    tmp = paste0(dat_2$x[i], ":", dat_2$y[i])
    tmp_r = paste0(dat_2$y[i], ":", dat_2$x[i])
    if(tmp %in% x_uniq | tmp_r %in% x_uniq){
        next
    } else {
dat_uniq = rbind(dat_uniq, dat_2[i,]) 101 | x_uniq = c(x_uniq, tmp) 102 | } 103 | } 104 | 105 | dat_1$group = "Accepted" 106 | dat_uniq$group = "Rejected" 107 | df = rbind(dat_1, dat_uniq) 108 | df$log2_MNN_pairs_normalized = log2(df$MNN_pairs_normalized) 109 | 110 | ### Extended Data Fig. 11d 111 | 112 | p <- df %>% 113 | ggplot( aes(x=log2_MNN_pairs_normalized, fill=group)) + 114 | geom_histogram( color="#e9ecef", alpha=0.5, position = 'identity') + 115 | scale_fill_manual(values=c("#f85633", "#0058d6")) + 116 | theme_ipsum() + 117 | labs(fill="") 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /Section_7_key_TFs/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/Section_7_key_TFs/.Rhistory -------------------------------------------------------------------------------- /Section_7_key_TFs/HSCs_progenitors.py: -------------------------------------------------------------------------------- 1 | 2 | import scanpy as sc 3 | import pandas as pd 4 | import numpy as np 5 | import os 6 | import sys 7 | from annoy import AnnoyIndex 8 | 9 | WORK_PATH = './' 10 | 11 | example_list = ["HSC"] 12 | 13 | for example_i in example_list: 14 | print(example_i) 15 | 16 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_i)) 17 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0) 18 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_i), index_col = 0) 19 | adata.obs = pdata 20 | adata.var = fdata 21 | 22 | print(adata.shape) 23 | 24 | sc.pp.normalize_total(adata, target_sum=1e4) 25 | sc.pp.log1p(adata) 26 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 27 | adata = adata[:, adata.var.highly_variable] 28 | sc.pp.scale(adata, max_value=10) 29 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 30 | sc.pp.neighbors(adata, 
n_neighbors=30, n_pcs=30) 31 | 32 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 33 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 34 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 35 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 36 | 37 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 38 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 39 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 40 | 41 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_i)) 42 | 43 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_i), compression="gzip") 44 | 45 | X = adata.obsm['X_pca'] 46 | print(X.shape) 47 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_i), X, delimiter=",", fmt='%1.3f') 48 | 49 | dist_metric = 'euclidean' 50 | k = 15 51 | ### Here, why we use 15? because the log2(mean cell number across cell types) is around 15. 52 | 53 | npc = X.shape[1] 54 | ncell = X.shape[0] 55 | annoy_index = AnnoyIndex(npc, metric=dist_metric) 56 | 57 | for i in range(ncell): 58 | annoy_index.add_item(i, list(X[i,:])) 59 | annoy_index.build(15) ### bigger number will make the result more accurate 60 | 61 | knn = [] 62 | for iCell in range(ncell): 63 | knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) 64 | knn = np.array(knn, dtype=int) 65 | 66 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%example_i), knn, delimiter=",", fmt='%s') 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Section_7_key_TFs/step3_Summarize_results.R: -------------------------------------------------------------------------------- 1 | 2 | #################################### 3 | ### Section - 7, Key TFs & genes ### 4 | #################################### 5 | 6 | ### making plot to show which TF or gene are appeared for different edges 7 | 8 | ################ 9 | ### key TFs #### 10 | ################ 

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

dat = read.csv(paste0(work_path, "All.keyTF.csv"), header=T, as.is=T)

### number of distinct key TFs nominated per edge (node_A, node_B)
df = dat %>% select(node_A, node_B, gene_short_name) %>% unique() %>%
    group_by(node_A, node_B) %>% tally()
print(paste0(mean(df$n), "+/-", sd(df$n)))
### 39.64 +/- 43.44

print(paste(quantile(df$n, 0.25), quantile(df$n, 0.5), quantile(df$n, 0.75)))
### 12 28 51

### fraction of all key TFs that appear in the flanking comparisons
### (group_1 vs. group_2, group_3 vs. group_4) but never in group_2 vs. group_3
df_1 = dat %>% filter(comparing %in% c("group_1 vs. group_2","group_3 vs. group_4")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
df_2 = dat %>% filter(comparing %in% c("group_2 vs. group_3")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
df_3 = dat %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
sum(!df_1 %in% df_2)/length(df_3)
### 5%

### for each TF: number of "Developmental progression" edges in which it is nominated
### by the group_1 vs. group_2 comparison
df_1 = dat %>% filter(edge_type == "Developmental progression", comparing == "group_1 vs. group_2") %>%
    select(node_A, node_B, gene_short_name) %>% unique() %>%
    group_by(gene_short_name) %>% tally()
print(head(df_1[order(df_1$n, decreasing = T),], 20))
p1 = df_1 %>%
    ggplot(aes(n)) + geom_histogram(binwidth = 0.5) +
    labs(x="# of edges that have been involved", y="# of key TFs", title="") +
    theme_classic(base_size = 10) +
    theme(legend.position="none") +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))

### same count over all edges and all comparisons
df_3 = dat %>%
    select(node_A, node_B, gene_short_name) %>% unique() %>%
    group_by(gene_short_name) %>% tally()
print(head(df_3[order(df_3$n, decreasing = T),], 20))
p3 = df_3 %>%
    ggplot(aes(n)) + geom_histogram(binwidth = 0.6) +
    labs(x="# of edges that have been involved", y="# of key TFs", title="") +
    theme_classic(base_size = 10) +
    theme(legend.position="none") +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))

### Extended Data Fig. 11k

pdf(paste0(work_path, "Hist_TF.pdf"), 6, 3)
grid.arrange(p1, p3, nrow=1, ncol=2)
dev.off()

##################
### key Genes ####
##################


dat = read.csv(paste0(work_path, "All.keyGene.csv"), header=T, as.is=T)

### number of distinct key genes nominated per edge (node_A, node_B)
df = dat %>% select(node_A, node_B, gene_short_name) %>% unique() %>%
    group_by(node_A, node_B) %>% tally()
print(paste0(mean(df$n), "+/-", sd(df$n)))
### 293.24 +/- 358.04

print(paste(quantile(df$n, 0.25), quantile(df$n, 0.5), quantile(df$n, 0.75)))
### 76 171 389


df_1 = dat %>% filter(comparing %in% c("group_1 vs. group_2","group_3 vs. group_4")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
df_2 = dat %>% filter(comparing %in% c("group_2 vs. group_3")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
df_3 = dat %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
sum(!df_1 %in% df_2)/length(df_3)
### 7%


df_1 = dat %>% filter(edge_type == "Developmental progression", comparing == "group_1 vs. group_2") %>%
    select(node_A, node_B, gene_short_name) %>% unique() %>%
    group_by(gene_short_name) %>% tally()
print(head(df_1[order(df_1$n, decreasing = T),], 20))
### genes involved in more than 10 edges are exported next to the other outputs
### (previously written to a user-specific ~/Dropbox/tmp/ folder)
df_1[order(df_1$n, decreasing = T),] %>% filter(n > 10) %>% write.csv(paste0(work_path, "Fig.S16.d_1.csv"))
p1 = df_1 %>%
    ggplot(aes(n)) + geom_histogram(binwidth = 0.5) +
    labs(x="# of edges that have been involved", y="# of key Genes", title="") +
    theme_classic(base_size = 10) +
    theme(legend.position="none") +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))

df_3 = dat %>%
    select(node_A, node_B, gene_short_name) %>% unique() %>%
    group_by(gene_short_name) %>% tally()
print(head(df_3[order(df_3$n, decreasing = T),], 20))
df_3[order(df_3$n, decreasing = T),] %>% filter(n > 10) %>% write.csv(paste0(work_path, "Fig.S16.d_3.csv"))
p3 = df_3 %>%
    ggplot(aes(n)) + geom_histogram(binwidth = 0.6) +
    labs(x="# of edges that have been involved", y="# of key Genes", title="") +
    theme_classic(base_size = 10) +
    theme(legend.position="none") +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))

### Extended Data Fig. 11l

pdf(paste0(work_path, "Hist_Gene.pdf"), 6, 3)
grid.arrange(p1, p3, nrow=1, ncol=2)
dev.off()


# --------------------------------------------------------------------------------
# /Section_7_key_TFs/step4_Pseudotime_endoderm.R:
# --------------------------------------------------------------------------------


####################################
### Section - 7, Key TFs & genes ###
####################################

##################################################
### Anterior primitive streak -> Def. endoderm ###
##################################################

### Can we estimate pseudotime of cells corresponding to Def.endoderm development
### during transition and plot key TF/genes expression as a function of pseudotime?

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t")

system_i = "Gastrulation"

### edges.txt is read without a header and the column names are assigned by hand;
### any header line present in the file would not survive the system filter below
edges = read.table(paste0(work_path, "edges.txt"), header=F, as.is=T, sep="\t")
names(edges) = c("system","x","y","x_name","y_name","edge_type")
edges = edges[edges$system == "Gastrulation",]

obj = readRDS(paste0(work_path, "obj_Early_PS.rds"))
### the per-cell table is a CSV, so it has to be read with read.csv();
### readRDS() has no header / row.names / as.is arguments and fails on this call
pd = read.csv(paste0(work_path, system_i, "_adata_scale.obs.csv"), header=T, row.names=1, as.is=T)
pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
### replace the cell type labels with the ones stored in df_cell_graph.rds
pd$celltype_new = NULL
pd = pd %>% left_join(pd_all[,c("cell_id", "celltype_new")], by = "cell_id")

### map every cell to the node (meta_group) of its cell type within this system
if ("meta_group" %in% names(pd)) {pd$meta_group = NULL}
pd_x = pd %>% left_join(nodes %>% filter(system == system_i) %>% select(celltype_new, meta_group), by = "celltype_new")
pd$meta_group = as.vector(pd_x$meta_group)
gene_count = GetAssayData(obj, slot = "counts")

### lookup from row position in pd (used as both i and j) to node
y = data.frame(i = 1:nrow(pd),
               j = 1:nrow(pd),
               meta_group = as.vector(pd$meta_group), stringsAsFactors = FALSE)

### MNN pairs among the cells of this system (see Section_6_development_tree).
### The mirrored pairs are appended so that every pair can be looked up from either end.
x = readRDS(paste0(work_path, system_i, ".MNN.rds"))
x_rev = rbind(x, transform(x, i = j, j = i))
dat = x_rev %>%
    left_join(y %>% select(i, meta_group), by = "i") %>%
    left_join(y %>% select(j, meta_group), by = "j")

### the edge to analyse: the first edge of this system
cnt = 1
print(edges[cnt,])

xx = as.vector(edges$x)[cnt]
yy = as.vector(edges$y)[cnt]

### cells forming MNN pairs across the two nodes of the edge
dat_cnt = dat[dat$meta_group.x == xx & dat$meta_group.y == yy,]
group_1_MNN = as.vector(dat_cnt$i)
group_2_MNN = as.vector(dat_cnt$j)

coor = c(1:nrow(pd))
coor_xx = coor[pd$meta_group == xx]
coor_yy = coor[pd$meta_group == yy]

### MNN partners of `cells`, restricted to the positions listed in `pool`
partners_within = function(cells, pool){
    intersect(as.vector(dat$j[dat$i %in% cells]), pool)
}

### Grow `cells` by repeatedly adding their MNN partners inside `pool`, never keeping
### anything listed in `exclude`; stop once `target` distinct cells are reached or a
### round adds nothing new.
grow_group = function(cells, pool, exclude = c(), target = 200){
    repeat {
        size_before = length(unique(cells))
        if(size_before >= target) {break}
        cells = c(cells, partners_within(cells, pool))
        cells = cells[!cells %in% exclude]
        if(length(unique(cells)) == size_before) {break}
    }
    cells
}

### node xx: cells at the interface (MNN), then the next shell of cells around them (close)
group_1_MNN = grow_group(group_1_MNN, coor_xx)
group_1_close = partners_within(group_1_MNN, coor_xx)
group_1_close = group_1_close[!group_1_close %in% group_1_MNN]
group_1_close = grow_group(group_1_close, coor_xx, exclude = group_1_MNN)

### node yy: same procedure
group_2_MNN = grow_group(group_2_MNN, coor_yy)
group_2_close = partners_within(group_2_MNN, coor_yy)
group_2_close = group_2_close[!group_2_close %in% group_2_MNN]
group_2_close = grow_group(group_2_close, coor_yy, exclude = group_2_MNN)

### label the cells; later assignments overwrite earlier ones
group = rep("other", nrow(pd))
group[coor %in% group_1_close] = "group_1"
group[coor %in% group_1_MNN] = "group_2"
group[coor %in% group_2_MNN] = "group_3"
group[coor %in% group_2_close] = "group_4"
pd$group = as.vector(group)

pd_sub = pd[pd$group != "other",]
group_table = table(pd_sub$group)

### re-normalize the selected cells and pick the variable genes used for the embedding
gene_count_sub = gene_count[,as.vector(pd_sub$cell_id)]
obj_sub = CreateSeuratObject(gene_count_sub, meta.data = pd_sub)
obj_sub = NormalizeData(obj_sub, normalization.method = "LogNormalize", scale.factor = 10000)
obj_sub = FindVariableFeatures(obj_sub, selection.method = "vst", nfeatures = 2500)
genes_include = VariableFeatures(obj_sub)

cds = doObjectTransform(obj_sub, transform_to = "monocle")

### batch annotation of the same cells, taken from pijuan_obs.csv
pd_x = read.csv(paste0(work_path, "pijuan_obs.csv"), row.names=1, as.is=T)
pd_x = pd_x[colnames(cds),]

cds$batch_1 = as.vector(pd_x$batch)
cds$batch_2 = as.vector(pd_x$group)

### only batch_1 is used for alignment
cds = preprocess_cds(cds, use_genes = genes_include)
cds = align_cds(cds, alignment_group = "batch_1")
cds = reduce_dimension(cds)

saveRDS(cds, paste0(work_path, "cds_Def_endoderm.rds"))


#####################
### Making plots ####
#####################

plot_cells(cds, color_cells_by = "celltype_new", cell_size = 1)

cds = cluster_cells(cds)
cds = learn_graph(cds)
cds = order_cells(cds)

plot_cells(cds,
           color_cells_by = "pseudotime",
           label_cell_groups=FALSE,
           label_leaves=FALSE,
           label_branch_points=FALSE,
           graph_label_size=1.5,
           cell_size = 1)

cds$pseudotime = cds@principal_graph_aux[["UMAP"]]$pseudotime

df = data.frame(pData(cds))
boxplot(df$pseudotime~factor(df$group))

df$UMAP_1 = reducedDims(cds)$UMAP[,1]
df$UMAP_2 = reducedDims(cds)$UMAP[,2]

### Extended Data Fig. 11m

### Each point is drawn twice: a slightly larger black point underneath gives it an outline.
### ggsave() is called on its own with plot = p; chaining it with "+" fails on
### current ggplot2, where ggsave() returns the file name.
p = ggplot() +
    geom_point(data = df, aes(x = UMAP_1, y = UMAP_2), size=2, color = "black") +
    geom_point(data = df, aes(x = UMAP_1, y = UMAP_2, color = celltype_new), size=1.8) +
    theme_void() +
    scale_color_manual(values=gastrulation_color_plate) +
    theme(legend.position="none")
ggsave(paste0(work_path, "Def_endoderm_UMAP_celltype.png"), plot = p, width = 6, height = 4, dpi = 300)

p = ggplot() +
    geom_point(data = df, aes(x = UMAP_1, y = UMAP_2), size=2, color = "black") +
    geom_point(data = df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime), size=1.8) +
    theme_void() +
    scale_color_viridis(discrete=F) +
    theme(legend.position="none")
ggsave(paste0(work_path, "Def_endoderm_UMAP_pseudotime.png"), plot = p, width = 6, height = 4, dpi = 300)


###################################
### making gene expression plot ###
###################################

### scale every cell to a total of 100,000 counts
gene_count = exprs(cds)
gene_count = t(t(gene_count) / colSums(gene_count)) * 100000

target_genes = c("Sox17", "Elf3", "Sall4", "Hesx1", "Lin28a", "Ovol2",
                 "Cer1", "Slc25a4", "Cd24a", "Slc2a3", "Lrpap1", "Krt18")

mouse_gene_sub = mouse_gene[mouse_gene$gene_short_name %in% target_genes,]

gene_count_x = gene_count[rownames(mouse_gene_sub),]
rownames(gene_count_x) = as.vector(mouse_gene_sub$gene_short_name)
### log(x + 1) applied to the stored non-zero entries only (zeros stay zero)
gene_count_x@x = log(gene_count_x@x + 1)

### long format: one row per (gene, cell); c(t(.)) lists all cells of the first gene, then the next, ...
dat = data.frame(exp = c(t(as.matrix(gene_count_x))),
                 gene = rep(rownames(gene_count_x), each = ncol(gene_count_x)),
                 pseudotime = rep(as.vector(cds$pseudotime), nrow(gene_count_x)),
                 pseudotime_rank = rep(rank(as.vector(cds$pseudotime)), nrow(gene_count_x)), stringsAsFactors = F)
dat$gene = factor(dat$gene, levels = target_genes)

p = ggplot() +
    geom_smooth(data = dat, aes(pseudotime, exp, color = gene), method = loess, se = FALSE) +
    labs(x="", y="", title="") +
    theme_classic(base_size = 12) +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) +
    scale_color_brewer(palette = "Paired")


### Re-fit the loess curve gene by gene and shift each curve so that it starts at 0.
### The baseline is the fitted value at the smallest x of the curve; the former exact
### test x == 0 yields an empty baseline whenever the grid does not contain exactly 0.
dat_add_offset = do.call(rbind, lapply(target_genes, function(gene_i){
    dat_sub = dat %>% filter(gene == gene_i)
    ### layer 1 = points, layer 2 = loess fit (same layer order as the former qplot() call)
    p_tmp = ggplot(dat_sub, aes(pseudotime, exp)) + geom_point() + stat_smooth(method = loess, se = FALSE)
    dat_tmp = ggplot_build(p_tmp)$data[[2]][,c("x","y")]
    dat_tmp$y = dat_tmp$y - dat_tmp$y[which.min(dat_tmp$x)]
    dat_tmp$gene = gene_i
    dat_tmp
}))
dat_add_offset$gene = factor(dat_add_offset$gene, levels = target_genes)

p = ggplot() +
    geom_line(data = dat_add_offset, aes(x, y, color = gene), size = 1) +
    labs(x="", y="", title="") +
    theme_classic(base_size = 12) +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) +
    scale_color_brewer(palette = "Paired")

### Extended Data Fig. 11n

pdf(paste0(work_path, "Def_endoderm_gene_expression_pseudotime_add_offset.pdf"), 5, 3)
print(p)
dev.off()


# --------------------------------------------------------------------------------
# /Section_7_key_TFs/step5_HSCs_progenitors.R:
# --------------------------------------------------------------------------------


####################################
### Section - 7, Key TFs & genes ###
####################################

### Here, we present an example - HSCs, showing much heterogeneity even at the progenitor state
### How it connects to multiple different derivatives?

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
rownames(pd_all) = as.vector(pd_all$cell_id)

nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t")

celltype_include = c("Hematopoietic stem cells (Cd34+)")
example_i = "HSC"

### cells of the selected cell type; genes restricted by type and to chr1-19 + chrM
pd_sub = pd_all[pd_all$celltype_new %in% celltype_include,]
mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),]
gene_count = doExtractData(pd_sub, mouse_gene_sub)

### Export for Scanpy. The cell table written here must describe the exported cells,
### i.e. pd_sub (the former write.csv(pd, ...) referred to an object that does not
### exist at this point of the script).
writeMM(t(gene_count), paste0(work_path, example_i, ".gene_count.mtx"))
write.csv(pd_sub, paste0(work_path, example_i, ".df_cell.csv"))


### Using Scanpy to perform cell embedding
### python HSCs_progenitors.py

pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.obs.csv"), header=T, row.names=1, as.is=T)

### for plotting, down-sample every day with more than 1000 cells to 1000 random cells
x_table = table(pd_x$day)
pd_1 = pd_x[pd_x$day %in% names(x_table)[x_table > 1000],]
pd_1_ = pd_1 %>% group_by(day) %>% sample_n(1000)
pd_1 = pd_1[pd_1$cell_id %in% pd_1_$cell_id,]
pd_2 = pd_x[pd_x$day %in% names(x_table)[x_table <= 1000],]
pd_plot = rbind(pd_1, pd_2)
pd_plot$day = factor(pd_plot$day, levels = names(day_color_plate))

### Extended Data Fig. 11p

### Black points underneath give every cell an outline; the colored layer is drawn in
### random order. ggsave() is called on its own with plot = p; chaining it with "+"
### fails on current ggplot2, where ggsave() returns the file name.
p = ggplot() +
    geom_point(data = pd_plot, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) +
    geom_point(data = pd_plot[sample(1:nrow(pd_plot)),], aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.8) +
    theme_void() +
    scale_color_manual(values=day_color_plate) +
    theme(legend.position="none")
ggsave(paste0(work_path, example_i, ".day.png"), plot = p, width = 6, height = 6, dpi = 300)




##################################################################
### what are the MNNs between HSCs and its multiple derivatives ##
##################################################################

example_i = "HSC"
system_i = "Blood"

pd_target = read.csv(paste0(work_path, example_i, "_adata_scale.obs.csv"), header=T, row.names=1, as.is=T)

### Of note, this profile is generated by co-embedding all the cell types of Blood system
### Please see the scripts from Section_6_development_tree for details
### (the file is a CSV, so it is read with read.csv(); readRDS() has no
### header / row.names / as.is arguments and fails on this call)
pd = read.csv(paste0(work_path, system_i, "_adata_scale.obs.csv"), header=T, row.names=1, as.is=T)
pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
pd$celltype_new = NULL
pd = pd %>% left_join(pd_all[,c("cell_id", "celltype_new")], by = "cell_id")
nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t")

### map every cell to the node (meta_group) of its cell type within the Blood system
if ("meta_group" %in% names(pd)) {pd$meta_group = NULL}
pd_x = pd %>% left_join(nodes %>% filter(system == system_i) %>% select(celltype_new, meta_group), by = "celltype_new")
pd$meta_group = as.vector(pd_x$meta_group)

### lookup from row position in pd (used as both i and j) to node and cell id
y = data.frame(i = 1:nrow(pd),
               j = 1:nrow(pd),
               meta_group = as.vector(pd$meta_group),
               cell_id = as.vector(pd$cell_id), stringsAsFactors = FALSE)

### Of note, this profile is generated by identifying MNN pairs of cells within Blood system
### Please see the scripts from Section_6_development_tree for details
x = readRDS(paste0(work_path, system_i, ".MNN.rds"))
x_rev = x
### append the mirrored pairs so that every MNN pair can be looked up from either end
x_rev$i = x$j
x_rev$j = x$i
x_rev = rbind(x, x_rev)
dat = x_rev %>% left_join(y %>% select(i, meta_group, cell_id), by = "i") %>%
    left_join(y %>% select(j, meta_group, cell_id), by = "j")

### edges.txt is read without a header: V2 / V3 hold the two nodes of each edge
edges = read.table(paste0(work_path, "edges.txt"), as.is=T, sep="\t")
nodes_include = as.vector(edges$V3[edges$V2 == "B_M11"])

### For every B_M11 cell: count its MNN partners per connected node and keep the node
### with the most partners (one row per cell, ties are not kept), then attach that
### node's cell type name as MNN.
dat_uniq = dat %>% filter(meta_group.x == "B_M11", meta_group.y %in% nodes_include) %>%
    group_by(cell_id.x, meta_group.y) %>% tally() %>%
    group_by(cell_id.x) %>% slice_max(order_by = n, n = 1, with_ties = F) %>%
    rename(cell_id = cell_id.x, meta_group = meta_group.y) %>% select(cell_id, meta_group) %>%
    left_join(nodes[,c("meta_group", "celltype_new")], by = "meta_group") %>% rename(MNN = celltype_new)

pd_target = pd_target %>% left_join(dat_uniq[,c("cell_id","MNN")], by = "cell_id")

### Extended Data Fig. 11q

### Cells without any MNN partner are drawn in grey; the others get a dark outline plus
### the color of their best-connected derivative. ggsave() is called on its own with
### plot = p; chaining it with "+" fails on current ggplot2.
p = ggplot() +
    geom_point(data = pd_target[is.na(pd_target$MNN),], aes(x = UMAP_2d_1, y = UMAP_2d_2), color = "grey80", size=0.6) +
    geom_point(data = pd_target[!is.na(pd_target$MNN),], aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.65) +
    geom_point(data = pd_target[!is.na(pd_target$MNN),], aes(x = UMAP_2d_1, y = UMAP_2d_2, color = MNN), size=0.6) +
    theme_void() +
    scale_color_manual(values=blood_system_color_plate) +
    theme(legend.position="none")
ggsave(paste0(work_path, example_i, ".MNN.png"), plot = p, width = 6, height = 6, dpi = 300)


# --------------------------------------------------------------------------------
# /Section_8_birth_series/Embedding_birth_series.py:
# --------------------------------------------------------------------------------

#################################
### Section - 8, Birth series ###
#################################

######################################
### Embedding birth series dataset ###
######################################

### The gene_count, df_cell, df_gene data
### could be downloaded from
### /net/shendure/vol10/www/content/members/cxqiu/public/backup/jax/download/mtx
### cell_annotation.run_28.csv.gz
### gene_count.run_28.mtx.gz
### gene_annotation.csv.gz

import scanpy as sc
import pandas as pd
import numpy as np
import os, sys
import time

WORK_PATH = './'
example_i = "birth"

### wall-clock reference for the elapsed-time report printed after the UMAP step
start_time = time.time()

adata = sc.read_mtx(os.path.join(WORK_PATH, "gene_count.run_28.mtx.gz"))
pdata = pd.read_csv(os.path.join(WORK_PATH, "cell_annotation.run_28.csv.gz"), index_col = 0)
fdata = pd.read_csv(os.path.join(WORK_PATH, "gene_annotation.csv.gz"), index_col = 0)
adata.obs = pdata
adata.var = fdata

### keep protein-coding / pseudogene / lincRNA genes located on chr1-19 or chrM
adata = adata[:, adata.var["gene_type"].isin(["protein_coding","pseudogene","lincRNA"])]
chr_include = ["chr" + str(i) for i in range(1,20)]
chr_include.append("chrM")
### .copy() turns the view into a real object before it is modified in place below
adata = adata[:, adata.var["chr"].isin(chr_include)].copy()

### the gene-filtered counts, saved before any normalization, are reused by the
### subclustering sections further down
adata.write(os.path.join(WORK_PATH, "adata_birth.h5ad"), compression="gzip")

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2500)
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)

### 3D UMAP first; its coordinates are copied into obs because the 2D run below
### overwrites adata.obsm['X_umap']
sc.tl.umap(adata, min_dist=0.3, n_components=3)
adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])

### each leiden run overwrites adata.obs['leiden'], so the result is copied to its own column
sc.tl.leiden(adata, resolution=1, n_iterations=2)
adata.obs['leiden_res_1'] = adata.obs['leiden']

sc.tl.leiden(adata, resolution=2, n_iterations=2)
adata.obs['leiden_res_2'] = adata.obs['leiden']

sc.tl.umap(adata, min_dist=0.3, n_components=2)
print("Done UMAP ...")
print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')

adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])

adata.obs.to_csv(os.path.join(WORK_PATH, "adata_birth.obs.csv"))
adata.write(os.path.join(WORK_PATH, "adata_birth_processed.h5ad"), compression="gzip")

emb = adata.obsm['X_pca']
print(emb.shape)
np.savetxt(os.path.join(WORK_PATH, "adata_birth.PCs.csv"), emb, delimiter=",", fmt='%1.3f')



##############################################################################################
### perform subclustering on three major trajectories, by only including C-section samples ###
##############################################################################################


import scanpy as sc
import pandas as pd
import numpy as np
import os, sys

WORK_PATH = './'


def subcluster_trajectory(adata_all, trajectory, day_include, out_prefix, save_pcs=False):
    """Embed and cluster the cells of one major trajectory.

    The cells of `adata_all` whose obs["major_trajectory"] equals `trajectory`
    and whose obs["day"] is listed in `day_include` are normalized, reduced to
    2,500 highly variable genes, scaled, and embedded (30 PCs, 3D and 2D UMAP);
    leiden clusters are computed at resolutions 1, 2 and 5.

    Writes `<out_prefix>.obs.csv` and `<out_prefix>_processed.h5ad` into
    WORK_PATH, plus `<out_prefix>.PCs.csv` when `save_pcs` is true.
    `adata_all` itself is left unmodified. Returns the processed AnnData.
    """
    keep = (adata_all.obs["major_trajectory"] == trajectory) & adata_all.obs["day"].isin(day_include)
    ### copy, so that the in-place preprocessing below never operates on a view
    adata = adata_all[keep.to_numpy()].copy()
    print("processing: " + trajectory)
    print(adata.shape)

    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=2500)
    adata = adata[:, adata.var.highly_variable].copy()
    sc.pp.scale(adata, max_value=10)
    sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
    sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)

    ### the 3D coordinates are copied into obs before the 2D run overwrites obsm['X_umap']
    sc.tl.umap(adata, min_dist=0.3, n_components=3)
    adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
    adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
    adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])

    sc.tl.umap(adata, min_dist=0.3, n_components=2)
    adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
    adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])

    ### each leiden run overwrites obs['leiden'], so every result gets its own column
    for res in (1, 2, 5):
        sc.tl.leiden(adata, resolution=res, n_iterations=2)
        adata.obs['subcluster_leiden_res_%d'%res] = adata.obs['leiden']

    adata.obs.to_csv(os.path.join(WORK_PATH, "%s.obs.csv"%out_prefix))
    adata.write(os.path.join(WORK_PATH, "%s_processed.h5ad"%out_prefix), compression="gzip")

    if save_pcs:
        emb = adata.obsm['X_pca']
        np.savetxt(os.path.join(WORK_PATH, "%s.PCs.csv"%out_prefix), emb, delimiter=",", fmt='%1.3f')

    return adata


adata_all = sc.read_h5ad(os.path.join(WORK_PATH, "adata_birth.h5ad"))

### pd_birth.csv is generated by write.csv("pd_birth.rds") in R
### NOTE(review): the table replaces obs as a whole, so its rows presumably have to be
### in the same cell order as adata_birth.h5ad -- confirm against the R export

pdata = pd.read_csv(os.path.join(WORK_PATH, "pd_birth.csv"), index_col = 0)
adata_all.obs = pdata

###
targ_list = ["Adipocytes","Hepatocytes","Lung_and_airway"]
day_include = ["Csection_0m","Csection_20m","Csection_40m","Csection_60m","Csection_80m"]

for targ_i in targ_list:
    subcluster_trajectory(adata_all, targ_i, day_include, "adata_%s"%targ_i)





#####################################################################################################################
### perform subclustering on three major trajectories, by including C-section samples + 3 natural birthed samples ###
#####################################################################################################################

### Same pipeline as above (subcluster_trajectory), on the same adata_all / pd_birth.csv
### annotation; the only differences are the extra "NatBirth" samples, the output
### file names, and that the PCs are exported as well.

targ_list = ["Adipocytes","Hepatocytes","Lung_and_airway"]
day_include = ["NatBirth","Csection_0m","Csection_20m","Csection_40m","Csection_60m","Csection_80m"]

for targ_i in targ_list:
    subcluster_trajectory(adata_all, targ_i, day_include, "adata_%s_NatBirth"%targ_i, save_pcs=True)





# --------------------------------------------------------------------------------
# /Section_8_birth_series/Embedding_individual_celltype.py:
# --------------------------------------------------------------------------------

#################################
### Section - 8, Birth series ###
#################################

################################################################################################################# 7 | ### To systematically identify which cell types exhibit abrupt transcriptional changes before vs. after birth ### 8 | ################################################################################################################# 9 | 10 | import scanpy as sc 11 | import pandas as pd 12 | import numpy as np 13 | import os, sys 14 | import gc 15 | import time 16 | 17 | WORK_PATH = "./" 18 | 19 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad')) 20 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad')) 21 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad')) 22 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad')) 23 | 24 | adata = adata_1.concatenate(adata_2, adata_3, adata_4) 25 | del adata_1, adata_2, adata_3, adata_4 26 | gc.collect() 27 | 28 | day_include = ["E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25", "E17.5", 29 | "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "P0"] 30 | 31 | celltype_list = {} 32 | file = open(os.path.join(WORK_PATH, "celltype_include.txt")) 33 | for line in file: 34 | l = line.rstrip().split('\t') 35 | celltype_list[l[0]] = l[1] 36 | file.close() 37 | 38 | for celltype_i in celltype_list: 39 | 40 | adata = adata_i[adata_i.obs["celltype_update"] == celltype_i] 41 | adata = adata_i[adata_i.obs["day"].isin(day_include)] 42 | 43 | sc.pp.normalize_total(adata, target_sum=1e4) 44 | sc.pp.log1p(adata) 45 | sc.pp.highly_variable_genes(adata, n_top_genes=2500) 46 | adata = adata[:, adata.var.highly_variable] 47 | sc.pp.scale(adata, max_value=10) 48 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30) 49 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30) 50 | 51 | sc.tl.umap(adata, min_dist=0.3, n_components=3) 52 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0]) 53 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1]) 
54 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2]) 55 | 56 | sc.tl.umap(adata, min_dist=0.3, n_components=2) 57 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0]) 58 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1]) 59 | 60 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%celltype_list[celltype_i])) 61 | 62 | X = adata.obsm['X_pca'] 63 | print(X.shape) 64 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%celltype_list[celltype_i]), X, delimiter=",", fmt='%1.3f') 65 | 66 | -------------------------------------------------------------------------------- /Section_8_birth_series/step1_Celltypes_shift_after_birth.R: -------------------------------------------------------------------------------- 1 | 2 | ################################# 3 | ### Section - 8, Birth series ### 4 | ################################# 5 | 6 | ################################################################################################ 7 | ### Re-embedded 2D UMAP of cells from three major cell clusters before E18.75, E18.75, or P0 ### 8 | ################################################################################################ 9 | 10 | ### Fig. 
### Fig. 6a

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

day_group_color_plate = c("Early" = "#a46cb7",
                          "E18.75" = "#7aa457",
                          "P0" = "#cb6a49",
                          "Other" = "grey90")

for(example_i in c("Hepatocytes", "Adipocytes", "Lung_and_airway")){

    ### (a leftover `example_i = "Renal"` used to override the loop variable here,
    ###  so every iteration re-plotted the same input)
    print(example_i)

    pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))

    ### every timepoint other than E18.75 and P0 is pooled as "Early"
    day_group = rep("Early", nrow(pd))
    day_group[pd$day == "E18.75"] = "E18.75"
    day_group[pd$day == "P0"] = "P0"
    pd$day_group = as.vector(day_group)

    ### one panel per group: <example>_1.png = Early, _2.png = E18.75, _3.png = P0.
    ### The highlighted group is drawn a second time so that it sits on top of "Other".
    panel_list = c("Early", "E18.75", "P0")
    for(panel_i in seq_along(panel_list)){
        group_i = panel_list[panel_i]
        pd$tmp = if_else(pd$day_group == group_i, group_i, "Other")
        p = pd %>%
            ggplot() +
            geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
            geom_point(data = subset(pd, tmp == group_i),
                       aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
            theme_void() +
            scale_color_manual(values=day_group_color_plate) +
            theme(legend.position="none")
        ### ggsave() is called on its own: adding it to the plot with `+` is an error in ggplot2 >= 3.3.4
        ggsave(paste0(work_path, example_i, "_", panel_i, ".png"), plot = p, width = 6, height = 6, dpi = 300)
    }

}


#################################################################################################################
### To systematically identify which cell types exhibit abrupt transcriptional changes before vs. after birth ###
#################################################################################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

pd_all = readRDS(paste0(work_path, "df_cell.rds"))
x = as.vector(pd_all$day)
x[pd_all$day == "E8.0-E8.5"] = "E8.5"
pd_all$day = as.vector(x)

pd_sub = pd_all[pd_all$day %in% c("E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25",
                                  "E17.5", "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "P0"),]

### keep cell types with >= 200 cells at P0 and >= 200 cells in at least 5 of the earlier timepoints
dat = pd_sub %>% group_by(celltype_update, day) %>% tally() %>% filter(n >= 200)
celltype_1 = dat %>% filter(day == "P0") %>% pull(celltype_update)
celltype_2 = dat %>% filter(day != "P0") %>% group_by(celltype_update) %>% tally() %>% filter(n >= 5) %>% pull(celltype_update)
celltype_x = intersect(celltype_1, celltype_2)

celltype_include = data.frame(celltype_update = as.vector(celltype_x),
                              celltype_name = doSimpleName(as.vector(celltype_x)),
                              stringsAsFactors = F)

write.table(celltype_include, paste0(work_path, "celltype_include.txt"), row.names=F, col.names=F, sep="\t", quote=F)

### Running embedding on individual cell types
### python Embedding_individual_celltype.py

for(kk in 1:nrow(celltype_include)){

    celltype_update_i = celltype_include$celltype_update[kk]
    celltype_name_i = celltype_include$celltype_name[kk]
    print(celltype_update_i)

    pd = read.csv(paste0(work_path, celltype_name_i, "_adata_scale.obs.csv"), header=T, row.names=1, as.is=T)

    ### the PCs file has no header or row names; rows are labelled with pd$cell_id,
    ### i.e. they are taken to be in the same order as the obs table
    emb = read.csv(paste0(work_path, celltype_name_i, "_adata_scale.PCs.csv"), header=F, as.is=T)
    colnames(emb) = paste0("PC_", 1:30)
    rownames(emb) = rownames(pd) = as.vector(pd$cell_id)
    emb = as.matrix(emb)

    ### balance timepoints: days with < 200 cells are dropped, days above the median
    ### cell number are downsampled to the median
    x_table = pd %>% group_by(day) %>% tally() %>% filter(n >= 200)
    x_table_median = median(x_table$n)
    pd_1 = pd %>% filter(day %in% as.vector(x_table$day[x_table$n <= x_table_median])) %>% as.data.frame()
    pd_2 = pd %>% filter(day %in% as.vector(x_table$day[x_table$n > x_table_median])) %>% group_by(day) %>% sample_n(x_table_median) %>% as.data.frame()
    pd_sub = rbind(pd_1, pd_2)
    rownames(pd_sub) = as.vector(pd_sub$cell_id)
    emb_sub = emb[as.vector(pd_sub$cell_id),]

    k.param = floor(log2(x_table_median)) + 1 + 1; nn.method = "rann"; nn.eps = 0; annoy.metric = "euclidean"
    nn.ranked = Seurat:::NNHelper(
        data = emb_sub,
        k = k.param,
        method = nn.method,
        searchtype = "standard",
        eps = nn.eps,
        metric = annoy.metric)
    nn_matrix = Indices(object = nn.ranked)

    ### resultA[r, c] = timepoint of the c-th ranked neighbour of cell r. The search is run
    ### against the data itself and column 1 is used below as the cell's own timepoint.
    resultA = matrix(as.vector(pd_sub$day)[as.vector(nn_matrix)], nrow = nrow(nn_matrix))

    ### TRUE where a neighbour comes from a different timepoint than the cell itself
    resultB = resultA[, -1, drop = FALSE] != resultA[, 1]

    res = data.frame(day = resultA[,1],
                     pct = rowSums(resultB)/ncol(resultB))

    print(res %>% group_by(day) %>% summarise(mean_pct = mean(pct)) %>% as.data.frame())

    saveRDS(res, paste0(work_path, celltype_name_i, "_res.rds"))

}


###################################
### Summarizing the kNN results ###
###################################

df = NULL
for(kk in 1:nrow(celltype_include)){
    celltype_update_i = celltype_include$celltype_update[kk]
    celltype_name_i = celltype_include$celltype_name[kk]
    print(celltype_update_i)

    res_i = readRDS(paste0(work_path, celltype_name_i, "_res.rds"))
    res_i = res_i %>% group_by(day) %>% summarise(mean_pct = mean(pct)) %>% mutate(celltype_update = celltype_update_i) %>% as.data.frame()
    df = rbind(df, res_i)
}

### cell types are ordered on the y axis by their mean % at P0
df_order = df %>% filter(day == "P0") %>% arrange(mean_pct)
df$celltype_update = factor(df$celltype_update, levels = rev(as.vector(df_order$celltype_update)))
df$mean_pct = 100*df$mean_pct

day_list = c("E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25",
             "E17.5", "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "P0")
day_color = c("#5dae46",
              "#855ecd",
              "#b2b044",
              "#c94ca4",
              "#55a574",
              "#d53f63",
              "#4cbad2",
              "#c95534",
              "#617fc6",
              "#d89248",
              "#bd80c4",
              "#7f702f",
              "#bc6476")
names(day_color) = day_list

### Fig. 6b
### size 10 X 6

p = ggplot() +
    geom_point(data = df %>% filter(day == "P0"), aes(x = mean_pct, y = celltype_update), color = "black", size = 3) +
    geom_point(data = df, aes(x = mean_pct, y = celltype_update, color = day), size = 2) +
    scale_color_manual(values=day_color) +
    labs(x = "Mean % of the nearest neighboring cells from different timepoints", y = "") +
    theme_minimal() +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))







--------------------------------------------------------------------------------
/Section_8_birth_series/step2_Embedding_birth_series.R:
--------------------------------------------------------------------------------

#################################
### Section - 8, Birth series ###
#################################

###########################################
### Plotting cell number across samples ###
###########################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

df = readRDS(paste0(work_path, "pd_birth.rds"))

### position (1-12) of each sample in the bar plot; RT_groups that are not listed stay NA.
### Kept in its own lookup vector so that it cannot be confused with the `day` column
### of pd_birth, which is what group_by() and fill below refer to.
sample_order = c("E18.75_L1-01" = "10",
                 "E18.75_L2-01" = "1",
                 "E18.75_L2-02" = "2",
                 "E18.75_L2-03" = "3",
                 "E18.75_L2-04" = "6",
                 "E18.75_L2-05" = "7",
                 "E18.75_L2-06" = "8",
                 "E18.75_L2-07" = "4",
                 "E18.75_L2-08" = "5",
                 "E18.75_L2-09" = "9",
                 "P0_L1-01" = "11",
                 "P0_L1-04" = "12")
df$tmp = unname(sample_order[as.character(df$RT_group)])

pd_cell_num_1 = df %>% group_by(tmp, day) %>% tally() %>% rename(cell_num = n) %>% as.data.frame()
pd_cell_num_1$tmp = factor(pd_cell_num_1$tmp, levels = rev(1:12))

### Extended Data Fig. 12a

p = pd_cell_num_1 %>%
    ggplot(aes(tmp, cell_num, fill = day)) +
    geom_bar(stat="identity") +
    coord_flip() +
    scale_fill_manual(values = birth_color_plate) +
    geom_text(aes(label = scales::comma(cell_num)),
              hjust = -0.1,
              position = position_dodge(width = 1),
              inherit.aes = TRUE,
              size = 5) +
    labs(x = "", y = "Cells profiled") +
    theme_classic(base_size = 15) +
    theme(legend.position="none") +
    theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))


##################################################
### 2D UMAP of cells from birth series dataset ###
##################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

pd = readRDS(paste0(work_path, "pd_birth.rds"))

### Extended Data Fig. 12b

### slightly larger uncoloured points underneath give every coloured point an outline
p = pd %>%
    ggplot() +
    geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
    geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = major_trajectory), size=0.3) +
    scale_color_manual(values=major_trajectory_color_plate) +
    theme_void() +
    theme(legend.position="none")
### ggsave() is called on its own: adding it to the plot with `+` is an error in ggplot2 >= 3.3.4
ggsave(paste0(work_path, "birth_anno.png"), plot = p, width = 10, height = 10, dpi = 300)


#########################################################################
### 2D UMAP of subclustering results for adipocytes and lung & airway ###
#########################################################################

### Writes one PNG per C-section timepoint (<trajectory_i>_Csection_<t>m.png) in which the
### cells of that timepoint are highlighted over all other cells of the trajectory.
###   df           : obs table with UMAP_2d_1, UMAP_2d_2 and a `day` column holding "Csection_<t>m"
###   trajectory_i : trajectory name, used as the file-name prefix
### Reads work_path and birth_color_plate (which must contain an "other" entry) from the
### calling environment.
plot_csection_timepoints = function(df, trajectory_i){
    for(i in paste0("Csection_", c(0,20,40,60,80), "m")){
        df$tmp = if_else(df$day == i, i, "other")
        p = df %>%
            ggplot() +
            geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5, alpha = 0.3) +
            geom_point(data = subset(df, tmp == i),
                       aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5) +
            theme_void() +
            scale_color_manual(values=birth_color_plate) +
            theme(legend.position="none")
        ### try() is kept so that a failure on one panel does not stop the remaining ones
        try(ggsave(paste0(work_path, trajectory_i, "_", i, ".png"), plot = p, width = 4, height = 4, dpi = 300))
    }
}

##########################
### 1 - Hepatocytes

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

trajectory_i = "Hepatocytes"
df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections.obs.rds"))
df$day = paste0("Csection_", as.vector(df$day), "m")

### Fig. 6e (1st row)

birth_color_plate = c(birth_color_plate, "other" = "grey80")

plot_csection_timepoints(df, trajectory_i)


##########################
### 2 - Adipocytes

trajectory_i = "Adipocytes"
df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections.obs.rds"))
df$day = paste0("Csection_", as.vector(df$day), "m")

### Extended Data Fig. 12c

color_plate = c("Brown adipocyte cells" = "#cb6751",
                "Adipocyte progenitor cells" = "#7aa457",
                "Adipocyte cells (Cyp2e1+)" = "#9e6ebd")

p = ggplot() +
    geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.2, color="black") +
    geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno_subclustering), size=1) +
    theme_void() +
    scale_color_manual(values=color_plate) +
    theme(legend.position="none")
ggsave(paste0(work_path, trajectory_i, "_anno.png"), plot = p, width = 4, height = 4, dpi = 300)

### Fig. 6e (2nd row)

plot_csection_timepoints(df, trajectory_i)


##########################
### 3 - Lung & airway

trajectory_i = "Lung_and_airway"
df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections.obs.rds"))
df$day = paste0("Csection_", as.vector(df$day), "m")

### Extended Data Fig. 12d

color_plate = c("Airway club cells" = "#6dd9b4",
                "Alveolar Type 1 cells" = "#008cff",
                "Alveolar Type 2 cells" = "#dab300",
                "Lung cells (Eln+)" = "#185e3e",
                "Airway goblet cells" = "#663fc6")

p = ggplot() +
    geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.2, color="black") +
    geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno_subclustering), size=1) +
    theme_void() +
    scale_color_manual(values=color_plate) +
    theme(legend.position="none")
ggsave(paste0(work_path, trajectory_i, "_anno.png"), plot = p, width = 4, height = 4, dpi = 300)

### Fig. 6e (3rd row)

plot_csection_timepoints(df, trajectory_i)



--------------------------------------------------------------------------------
/Section_8_birth_series/step3_Celltypes_changing_over_Csection.R:
--------------------------------------------------------------------------------

#################################
### Section - 8, Birth series ###
#################################

###########################################################
### which cell type is changing across C-section series ###
###########################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

### cell metadata and PC embedding of the birth-series dataset
### NOTE(review): rows of `emb` are subset with the same logical index as `pd`, so the
### two objects are presumably stored in the same cell order -- confirm.
pd = readRDS(paste0(work_path, "pd_birth.rds"))
pd$anno = as.vector(pd$major_trajectory)
emb = as.matrix(readRDS(paste0(work_path, "Birth_series.PCs.rds")))

### restrict to the five C-section timepoints
index = pd$day %in% paste0("Csection_", c(0,20,40,60,80), "m")
pd_sub = pd[index,]
emb_sub = emb[index,]

### 11 neighbours per cell in PC space; column 1 is used below as the cell itself
k.param = 11; nn.method = "rann"; nn.eps = 0; annoy.metric = "euclidean"
nn.ranked = Seurat:::NNHelper(
    data = emb_sub,
    k = k.param,
    method = nn.method,
    searchtype = "standard",
    eps = nn.eps,
    metric = annoy.metric)
nn_matrix = Indices(object = nn.ranked)

saveRDS(nn_matrix, paste0(work_path, "nn_matrix.rds"))

### "Csection_20m" -> 20
day_value = as.numeric(gsub("m", "", gsub("Csection_", "", as.vector(pd_sub$day))))

### nn_res[r, c] = minutes after C-section of the c-th ranked neighbour of cell r
nn_res = matrix(day_value[as.vector(nn_matrix)], nrow = nrow(nn_matrix))

dat = data.frame(org_day = nn_res[,1],
                 nn_day = apply(nn_res[,-1], 1, mean),
                 anno = as.vector(pd_sub$anno))

### per major trajectory: correlation between a cell's own timepoint and the mean
### timepoint of its neighbours
celltype_list = names(table(dat$anno))
res = do.call(rbind, lapply(celltype_list, function(celltype_i){
    dat_i = dat[dat$anno == celltype_i,]
    data.frame(anno = celltype_i,
               cor = cor.test(dat_i$org_day, dat_i$nn_day)$estimate,
               stringsAsFactors = FALSE)
}))
res = res[order(res$cor),]
res$anno = factor(res$anno, levels = as.vector(res$anno))

p = ggplot(res, aes(anno, cor, fill = anno)) +
    geom_bar(stat="identity") +
    coord_flip() +
    scale_fill_manual(values = major_trajectory_color_plate) +
    labs(x = "", y = "") +
    theme_classic(base_size = 15) +
    theme(legend.position="none",
          axis.text.x = element_text(color="black"),
          axis.text.y = element_text(color="black"))

--------------------------------------------------------------------------------
/Section_8_birth_series/step5_Comparing_NatBirth.R:
--------------------------------------------------------------------------------

#################################
### Section - 8, Birth series ###
#################################

##########################################################################################################
### 2D UMAP of subclustering results for adipocytes and lung & airway (adding Natural birthed samples) ###
##########################################################################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

trajectory_list = c("Hepatocytes", "Adipocytes", "Lung_and_airway")

### Extended Data Fig. 12f

for(trajectory_i in trajectory_list){

    print(trajectory_i)

    df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections_NatBirth.obs.rds"))

    ### rep_id = day, except that embryos 76-78 are split into NatBirth_rep1-3
    rep_id = as.vector(df$day)
    rep_id[df$embryo_id == "embryo_76"] = "NatBirth_rep1"
    rep_id[df$embryo_id == "embryo_77"] = "NatBirth_rep2"
    rep_id[df$embryo_id == "embryo_78"] = "NatBirth_rep3"
    df$rep_id = as.vector(rep_id)

    ### one panel per sample: all cells in grey, the sample's own cells coloured on top
    for(i in names(table(df$rep_id))){

        p = ggplot() +
            geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2), color = "grey80", size=1.5, alpha = 0.3) +
            geom_point(data = df %>% filter(rep_id == i),
                       aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=1.5) +
            theme_void() +
            scale_color_manual(values=birth_color_plate) +
            theme(legend.position="none")
        ### ggsave() is called on its own: adding it to the plot with `+` is an error in
        ### ggplot2 >= 3.3.4; try() keeps the remaining panels going if one save fails
        try(ggsave(paste0(work_path, "birth_", trajectory_i, "_NatBirth_", i, ".png"), plot = p, width = 4, height = 4, dpi = 300))
    }

}


###############################################################################
### Identifying the neighbors for each NatBirth samples in the co-embedding ###
###############################################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

trajectory_list = c("Hepatocytes", "Adipocytes", "Lung_and_airway")

### Extended Data Fig. 12e

for(trajectory_i in trajectory_list){

    print(trajectory_i)

    df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections_NatBirth.obs.rds"))

    rep_id = as.vector(df$day)
    rep_id[df$embryo_id == "embryo_76"] = "NatBirth_rep1"
    rep_id[df$embryo_id == "embryo_77"] = "NatBirth_rep2"
    rep_id[df$embryo_id == "embryo_78"] = "NatBirth_rep3"
    df$rep_id = as.vector(rep_id)

    ### the PCs file has no header or row names; rows are labelled with df$cell_id,
    ### i.e. they are taken to be in the same order as the obs table
    emb = read.csv(paste0(work_path, "adata_", trajectory_i, "_NatBirth.PCs.csv"), header=F, as.is=T)
    colnames(emb) = paste0("PC_", 1:30)
    rownames(emb) = rownames(df) = as.vector(df$cell_id)
    emb = as.matrix(emb)

    result = list()
    for(kk in 1:3){
        rep_i = paste0("NatBirth_rep", kk); print(rep_i)

        ### query = cells of this NatBirth replicate
        df_1 = df[df$rep_id == rep_i,]
        emb_1 = emb[df$rep_id == rep_i,]

        ### reference = every other sample, capped at 5000 cells each. Samples with fewer
        ### cells are kept whole (sample_n(5000) would stop with an error on them).
        df_2 = df[df$rep_id != rep_i,]
        keep_id = unlist(lapply(split(as.vector(df_2$cell_id), as.vector(df_2$rep_id)), function(id){
            if(length(id) > 5000) sample(id, 5000) else id
        }))
        df_2 = df_2[df_2$cell_id %in% keep_id,]
        emb_2 = emb[as.vector(df_2$cell_id),]

        k.param = 10; nn.method = "rann"; nn.eps = 0; annoy.metric = "euclidean"
        nn.ranked = Seurat:::NNHelper(
            data = emb_2,
            query = emb_1,
            k = k.param,
            method = nn.method,
            searchtype = "standard",
            eps = nn.eps,
            metric = annoy.metric)
        nn_matrix = Indices(object = nn.ranked)

        ### sample of origin of every neighbour, tallied over all query cells
        resultA = as.vector(df_2$rep_id)[as.vector(nn_matrix)]

        result[[rep_i]] = table(resultA)
    }

    ### one row per (query replicate, reference sample); the replicate itself is added
    ### with 0 neighbours because it was excluded from its own reference set
    dat = do.call(rbind, lapply(paste0("NatBirth_rep", 1:3), function(rep_i){
        data.frame(target_id = c(names(result[[rep_i]]), rep_i),
                   num = c(as.vector(result[[rep_i]]), 0),
                   rep_id = rep_i)
    }))

    dat$target_id = factor(dat$target_id, levels = c(paste0("NatBirth_rep",1:3), paste0("Csection_",c(0,20,40,60,80),"m")))

    ### NOTE(review): `p` is neither printed nor saved inside this loop, so only the last
    ### trajectory's plot remains available afterwards.
    p = dat %>%
        ggplot(aes(x=target_id, y = num, fill = target_id))+
        geom_bar(stat="identity") + facet_grid(rep_id ~ .) +
        theme_classic(base_size = 10) +
        theme(legend.position="none") +
        labs(x="",y="# of kNNs") +
        scale_fill_manual(values=birth_color_plate) +
        theme(axis.text.x = element_text(color="black", angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(color="black"))

}


##################################################################
### DEGs between NatBirth and C-section samples (20m and 40m) ###
##################################################################

source("JAX_help_code.R")
source("JAX_color_code.R")
work_path = "./"

pd = readRDS(paste0(work_path, "pd_birth.rds"))
pd$anno = as.vector(pd$major_trajectory)

### cap every (major trajectory x embryo_group) combination at 10000 cells
pd_sub = pd %>% filter(day %in% c("NatBirth","Csection_20m","Csection_40m"))
pd_sub$embryo_group = paste0(pd_sub$anno, "_", pd_sub$embryo_group)
x_table = table(pd_sub$embryo_group)
pd_sub_1 = pd_sub[pd_sub$embryo_group %in% names(x_table)[x_table > 10000],]
pd_sub_2 = pd_sub[pd_sub$embryo_group %in% names(x_table)[x_table <= 10000],]
pd_sub_1_x = pd_sub_1 %>% group_by(embryo_group) %>% sample_n(10000)
pd_sub_1 = pd_sub_1[pd_sub_1$cell_id %in% pd_sub_1_x$cell_id,]
df = rbind(pd_sub_1, pd_sub_2)

mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M", "X", "Y")),]
gene_count = doExtractData(df, mouse_gene_sub)
obj = CreateSeuratObject(gene_count, meta.data = df)
obj = NormalizeData(obj, normalization.method = "LogNormalize", scale.factor = 10000)

anno_list = names(table(pd$anno))
res_all = NULL

for(i in 1:length(anno_list)){
    anno_i = anno_list[i]; print(anno_i)
    obj_sub = subset(obj, subset = anno == anno_i)
    Idents(obj_sub) = as.vector(obj_sub$day)
    obj_sub = FindVariableFeatures(obj_sub, selection.method = "vst", nfeatures = 5000)
    genes_include = VariableFeatures(obj_sub)

    ### NatBirth vs. all remaining cells of this trajectory (Csection_20m + Csection_40m)
    res = FindMarkers(obj_sub, ident.1 = "NatBirth", features = genes_include)
    res = res %>% mutate(gene_ID = rownames(res)) %>%
        left_join(mouse_gene[,c("gene_ID","gene_short_name")], by = "gene_ID") %>% as.data.frame() %>% filter(p_val_adj < 0.05)

    ### the fold-change column is called avg_log2FC from Seurat v4 on and avg_logFC before
    fc_col = intersect(c("avg_log2FC", "avg_logFC"), colnames(res))[1]
    res$high_in_which = if_else(res[[fc_col]] > 0, "Up_in_NatBirth", "Down_in_NatBirth")
    res$p_val = NULL
    res$major_cell_cluster = anno_i

    res_all = rbind(res_all, res)
}

write.csv(res_all, paste0(work_path, "adata_major_cell_cluster_NatBirth_DEGs.csv"))






--------------------------------------------------------------------------------
/spatial_mapping.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/spatial_mapping.tar.gz
--------------------------------------------------------------------------------