├── JAX_color_code.R
├── JAX_help_code.R
├── README.md
├── Section_1_basic_analysis
    ├── .DS_Store
    ├── .Rhistory
    ├── Detecting_doublets_by_subclustering.py
    ├── Dimension_reduction_scanpy.py
    ├── Integrating_adjacent_timepoints.py
    ├── Run_Scrublet.py
    ├── step1_Removing_doublets.R
    ├── step2_Plot_UMAP.R
    ├── step3_Pseudobulk.R
    ├── step4_Estimate_cell_num.R
    └── step5_Check_batch_effect.R
├── Section_2_posterior_embryo
    ├── Embedding_scanpy_posterior_embryo.py
    ├── Run_geosketch.py
    ├── step1_posterior_embryo.R
    ├── step2_NMP_mesoderm.R
    ├── step3_Notochord.R
    ├── step4_Gut.R
    ├── step5_Genes_correlated_with_PC.R
    ├── step6_somites_validation.R
    └── step7_Npm1_signature.R
├── Section_3_kidney_mesenchyme
    ├── Embedding_Renal.py
    ├── Embedding_scanpy_kidney.py
    ├── Embedding_scanpy_lateral_plate_mesoderm.py
    ├── Embedding_scanpy_patterned_mesoderm_somites_26_34.py
    ├── Embedding_scanpy_patterned_mesoderm_somites_5_20.py
    ├── Spatial_mapping.py
    ├── step1_kidney.R
    ├── step2_Lateral_plate_mesoderm.R
    ├── step3_Spatial_mapping.R
    ├── step4_Patterned_mesoderm.R
    └── step5_Renal_two_subpopulations.R
├── Section_4_eye
    ├── Embedding_scanpy_eye.py
    └── Eye.R
├── Section_5_neuroectoderm
    ├── Embedding_early_neurons.py
    ├── Embedding_neuroectoderm_derivatives.py
    ├── Embedding_patterned_neuroectoderm.py
    ├── step1_Patterned_neuroectoderm.R
    ├── step2_Early_neurons.R
    ├── step3_Early_neurons_PCA.R
    ├── step4_Mapping_neuroectoderm_derivatives.R
    ├── step5_Astrocytes.R
    └── step6_Key_TFs.R
├── Section_6_development_tree
    ├── .DS_Store
    ├── Dimension_reduction_subsystem.py
    ├── Graph_robust.py
    ├── Two_examples.py
    ├── step1_Early_stage_graph.R
    ├── step2_Late_stage_graph.R
    ├── step3_Create_graph.R
    ├── step4_Two_examples.R
    └── step5_MNN_robustness.R
├── Section_7_key_TFs
    ├── .Rhistory
    ├── HSCs_progenitors.py
    ├── step1_Key_TFs.R
    ├── step2_Key_genes.R
    ├── step3_Summarize_results.R
    ├── step4_Pseudotime_endoderm.R
    └── step5_HSCs_progenitors.R
├── Section_8_birth_series
    ├── Embedding_birth_series.py
    ├── Embedding_individual_celltype.py
    ├── step1_Celltypes_shift_after_birth.R
    ├── step2_Embedding_birth_series.R
    ├── step3_Celltypes_changing_over_Csection.R
    ├── step4_DEGs_birth.R
    └── step5_Comparing_NatBirth.R
└── spatial_mapping.tar.gz


/README.md:
--------------------------------------------------------------------------------
1 | # JAX_code
2 | The scripts used to analyze the dataset in the paper: A single-cell transcriptional timelapse of mouse embryonic development, from gastrula to pup (https://www.biorxiv.org/content/10.1101/2023.04.05.535726v1.abstract)
3 | 
4 | The house mouse, Mus musculus, is an exceptional model system, combining genetic tractability with close homology to human biology. Gestation in mouse development lasts just under three weeks, a period during which its genome orchestrates the astonishing transformation of a single cell zygote into a free-living pup composed of >500 million cells. Towards a global framework for exploring mammalian development, we applied single cell combinatorial indexing to profile the transcriptional states of 12.4 million nuclei from 83 precisely staged embryos spanning late gastrulation (embryonic day 8 or E8) to birth (postnatal day 0 or P0), with 2-hr temporal resolution during somitogenesis, 6-hr resolution through to birth, and 20-min resolution during the immediately postpartum period. From these data (E8 to P0), we annotate dozens of major cell clusters and hundreds of cell types and perform deeper analyses of the unfolding of the posterior embryo during somitogenesis as well as the ontogenesis of the kidney, mesenchyme, retina, and early neurons. Finally, we leverage the depth and temporal resolution of these whole embryo snapshots, together with other published data, to construct and curate a rooted tree of cell type relationships that spans mouse development from zygote to pup. Throughout this tree, we systematically nominate sets of transcription factors (TFs) and other genes as candidate drivers of the in vivo differentiation of hundreds of mammalian cell types. Remarkably, the most dramatic shifts in transcriptional state are observed in a restricted set of cell types in the hours immediately following birth, and presumably underlie the massive changes in physiology that must accompany the successful transition of a placental mammal to extrauterine life.
5 | 
6 | The data used in these scripts can be found at https://shendure-web.gs.washington.edu/content/members/cxqiu/public/backup/jax/download/other/
7 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/Section_1_basic_analysis/.DS_Store


--------------------------------------------------------------------------------
/Section_1_basic_analysis/.Rhistory:
--------------------------------------------------------------------------------
1 | ?monocle3:::normalize_expr_data
2 | monocle3:::normalize_expr_data
3 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/Detecting_doublets_by_subclustering.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ### We performed two rounds of clustering and used the doublet annotations to identify subclusters that are enriched in doublets
  3 | 
  4 | import scanpy as sc
  5 | import pandas as pd
  6 | import numpy as np
  7 | import scrublet as scr
  8 | import os, sys
  9 | 
 10 | WORK_PATH = "./"
 11 | 
 12 | os.mkdir(os.path.join(WORK_PATH, "doublet_cluster"))
 13 | 
 14 | fdata = pd.read_csv("df_gene.csv", index_col = 0)
 15 | 
 16 | adata1 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_1.mtx'))
 17 | pdata1 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_1.csv'), index_col = 0)
 18 | adata1.obs = pdata1
 19 | adata1.var = fdata
 20 | 
 21 | adata2 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_2.mtx'))
 22 | pdata2 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_2.csv'), index_col = 0)
 23 | adata2.obs = pdata2
 24 | adata2.var = fdata
 25 | 
 26 | adata3 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_3.mtx'))
 27 | pdata3 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_3.csv', index_col = 0))
 28 | adata3.obs = pdata3
 29 | adata3.var = fdata
 30 | 
 31 | adata4 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_4.mtx'))
 32 | pdata4 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_4.csv', index_col = 0))
 33 | adata4.obs = pdata4
 34 | adata4.var = fdata
 35 | 
 36 | adata5 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_5.mtx'))
 37 | pdata5 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_5.csv', index_col = 0))
 38 | adata5.obs = pdata5
 39 | adata5.var = fdata
 40 | 
 41 | adata6 = sc.read_mtx(os.path.join(WORK_PATH, 'gene_count_6.mtx'))
 42 | pdata6 = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_6.csv', index_col = 0))
 43 | adata6.obs = pdata6
 44 | adata6.var = fdata
 45 | 
 46 | adata = adata1.concatenate(adata2, adata3, adata4, adata5, adata6)
 47 | 
 48 | adata_orig = adata
 49 | 
 50 | ### remove sex genes (drop every gene whose 'chr' annotation is chrX or chrY)
 51 | adata = adata_orig[:, ~adata_orig.var['chr'].isin(['chrX', 'chrY'])]
 52 | ### highly variable genes: filter genes with min_cells=1, normalize (target_sum=1e5), log1p, take the top 3,000
 53 | sc.pp.filter_genes(adata, min_cells=1)
 54 | sc.pp.normalize_total(adata, target_sum=1e5)
 55 | sc.pp.log1p(adata)
 56 | sc.pp.highly_variable_genes(adata, n_top_genes=3000)
 57 | filter_genes = list(adata.var.loc[adata.var['highly_variable'] == True, 'gene_id'])
 58 | 
 59 | ### restart from adata_orig restricted to the selected gene_ids, then normalize, log-transform and scale
 60 | adata = adata_orig[:, adata_orig.var['gene_id'].isin(filter_genes)]
 61 | sc.pp.normalize_total(adata, target_sum=1e5)
 62 | sc.pp.log1p(adata)
 63 | sc.pp.scale(adata)
 64 | ### first round: PCA (30 components) -> neighbor graph (50 neighbors) -> Louvain clustering -> UMAP
 65 | sc.tl.pca(adata, svd_solver='arpack', n_comps = 30)
 66 | sc.pp.neighbors(adata, n_neighbors=50, n_pcs=30)
 67 | sc.tl.louvain(adata)
 68 | sc.tl.umap(adata, min_dist=0.1)
 69 | ### store the two UMAP coordinates in .obs and write the per-cell table of the first round
 70 | adata.obs['umap_1'] = list(adata.obsm['X_umap'][:,0])
 71 | adata.obs['umap_2'] = list(adata.obsm['X_umap'][:,1])
 72 | name = "global.csv"
 73 | adata.obs.to_csv(os.path.join(WORK_PATH, 'doublet_cluster', name))
 74 | 
 75 | obs_all = adata.obs
 76 | obs_all['louvain'].value_counts()
 77 | cluster_list = list(set(list(obs_all['louvain'])))
 78 | 
 79 | xx = 0
 80 | for cnt in range(len(cluster_list)):
 81 |     xx += 1
 82 |     print('Processing: ' + str(xx) + '/' + str(len(cluster_list)))
 83 |     cluster = cluster_list[cnt]
 84 |     include_cell = list(obs_all.loc[obs_all['louvain'] == cluster, 'sample'])
 85 |     adata = adata_orig[adata_orig.obs['sample'].isin(include_cell)]
 86 |     adata = adata[:, ~adata.var['chr'].isin(['chrX', 'chrY'])]
 87 |     sc.pp.filter_genes(adata, min_cells=1)
 88 |     sc.pp.normalize_total(adata, target_sum=1e5)
 89 |     sc.pp.log1p(adata)
 90 |     sc.pp.highly_variable_genes(adata, n_top_genes=3000)
 91 |     filter_genes = list(adata.var.loc[adata.var['highly_variable'] == True, 'gene_id'])
 92 |     adata = adata_orig[adata_orig.obs['sample'].isin(include_cell)]
 93 |     adata = adata[:, adata.var['gene_id'].isin(filter_genes)]
 94 |     sc.pp.normalize_total(adata, target_sum=1e5)
 95 |     sc.pp.log1p(adata)
 96 |     sc.pp.scale(adata)
 97 |     sc.tl.pca(adata, svd_solver='arpack', n_comps = 30)
 98 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 99 |     sc.tl.louvain(adata, resolution = 3)
100 |     sc.tl.umap(adata, min_dist=0.1)
101 |     adata.obs['umap_1'] = list(adata.obsm['X_umap'][:,0])
102 |     adata.obs['umap_2'] = list(adata.obsm['X_umap'][:,1])
103 |     name = 'adata.obs.louvain_' + cluster + '.csv'
104 |     adata.obs.to_csv(os.path.join(WORK_PATH, 'doublet_cluster', name))
105 | 
106 | 
107 | 
108 | 
109 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/Dimension_reduction_scanpy.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ###############################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on 11.4M dataset ###
  4 | ###############################################################################################################
  5 | 
  6 | ### Of note, this data should include 24,552 genes and 11,441,407 cells
  7 | ### Hint: I suggest to request >500GB for the following analysis.
  8 | 
  9 | import scanpy as sc
 10 | import pandas as pd
 11 | import numpy as np
 12 | import os, sys
 13 | import time
 14 | import gc
 15 | 
 16 | start_time = time.time()
 17 | 
 18 | WORK_PATH = './'
 19 | 
 20 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 21 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 22 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 23 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 24 | 
 25 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 26 | del adata_1, adata_2, adata_3, adata_4
 27 | gc.collect()
 28 | 
 29 | print("Done reading data ...")
 30 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 31 | 
 32 | sc.pp.normalize_total(adata, target_sum=1e4)
 33 | print("Done normalization by total counts ...")
 34 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 35 | 
 36 | sc.pp.log1p(adata)
 37 | print("Done log transformation ...")
 38 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 39 | 
 40 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 41 | print("Done finding highly variable genes ...")
 42 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 43 | 
 44 | adata = adata[:, adata.var.highly_variable]
 45 | print("Done filtering in highly variable genes ...")
 46 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 47 | 
 48 | sc.pp.scale(adata, max_value=10)
 49 | print("Done scaling data ...")
 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 51 | 
 52 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 53 | print("Done performing PCA ...")
 54 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 55 | 
 56 | sc.pp.neighbors(adata, n_neighbors=50, n_pcs=30)
 57 | print("Done computing neighborhood graph ...")
 58 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 59 | 
 60 | sc.tl.umap(adata, min_dist=0.1, n_components=3)
 61 | print("Done UMAP ...")
 62 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 63 | 
 64 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 65 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 66 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 67 | 
 68 | sc.tl.umap(adata, min_dist=0.1, n_components=2)
 69 | print("Done UMAP ...")
 70 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 71 | 
 72 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 73 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 74 | 
 75 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 76 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 77 | print("Done clustering using res = 1 ...")
 78 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 79 | 
 80 | sc.tl.leiden(adata, resolution=2, n_iterations=2)
 81 | adata.obs['leiden_res_2'] = adata.obs['leiden']
 82 | print("Done clustering using res = 2 ...")
 83 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 84 | 
 85 | adata.obs.to_csv(os.path.join(WORK_PATH, 'adata_scale.obs.csv'))
 86 | pd.DataFrame(adata.obsm['X_pca']).to_csv(os.path.join(WORK_PATH, 'adata_scale.pca.csv'))
 87 | 
 88 | adata.write(os.path.join(WORK_PATH, 'adata_scale.h5ad'), compression="gzip")
 89 | print("Done writing data ...")
 90 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 91 | 
 92 | 
 93 | #################################################################################################
 94 | ### For each major trajectories, we further performed sub-clustering to get higher resolution ###
 95 | #################################################################################################
 96 | 
 97 | import scanpy as sc
 98 | import pandas as pd
 99 | import numpy as np
100 | import os
101 | import time
102 | import sys
103 | 
104 | start_time = time.time()
105 | 
106 | WORK_PATH = './'
107 | 
108 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
109 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
110 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
111 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
112 | 
113 | adata_orig = adata_1.concatenate(adata_2, adata_3, adata_4)
114 | del adata_1, adata_2, adata_3, adata_4
115 | gc.collect()
116 | 
117 | ### Of note, please read df_cell.rds and then write it to df_cell.csv in R
118 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
119 | adata_orig.obs = pdata
120 | 
121 | trajectory_list = ["Neuroectoderm_and_glia",
122 | "Intermediate_neuronal_progenitors",
123 | "Eye_and_other",
124 | "Ependymal_cells",
125 | "CNS_neurons",
126 | "Mesoderm",
127 | "Definitive_erythroid",
128 | "Epithelial_cells",
129 | "Endothelium",
130 | "Muscle_cells",
131 | "Hepatocytes",
132 | "White_blood_cells",
133 | "Neural_crest_PNS_glia",
134 | "Adipocytes",
135 | "Primitive_erythroid",
136 | "Neural_crest_PNS_neurons",
137 | "T_cells",
138 | "Lung_and_airway",
139 | "Intestine",
140 | "B_cells",
141 | "Olfactory_sensory_neurons",
142 | "Cardiomyocytes",
143 | "Oligodendrocytes",
144 | "Mast_cells",
145 | "Megakaryocytes",
146 | "Testis_and_adrenal"]
147 | 
148 | for trajectory_id in trajectory_list:
149 |     print("Processing: %s"%trajectory_id)
150 | 
151 |     adata = adata_orig[adata_orig.obs["major_trajectory"] == trajectory_id]
152 | 
153 |     print("Done reading data ...")
154 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
155 | 
156 |     sc.pp.normalize_total(adata, target_sum=1e4)
157 |     print("Done normalization by total counts ...")
158 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
159 | 
160 |     sc.pp.log1p(adata)
161 |     print("Done log transformation ...")
162 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
163 | 
164 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
165 |     print("Done finding highly variable genes ...")
166 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
167 | 
168 |     adata = adata[:, adata.var.highly_variable]
169 |     print("Done filtering in highly variable genes ...")
170 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
171 | 
172 |     sc.pp.scale(adata, max_value=10)
173 |     print("Done scaling data ...")
174 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
175 |     ### done with regress_out and scale ###
176 | 
177 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
178 |     print("Done performing PCA ...")
179 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
180 | 
181 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
182 |     print("Done computing neighborhood graph ...")
183 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
184 | 
185 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
186 |     print("Done UMAP ...")
187 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
188 | 
189 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
190 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
191 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
192 | 
193 |     sc.tl.umap(adata, min_dist=0.1, n_components=2)
194 |     print("Done UMAP ...")
195 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
196 | 
197 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
198 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
199 | 
200 |     sc.tl.leiden(adata, resolution=1, n_iterations=2)
201 |     adata.obs['leiden_res_1'] = adata.obs['leiden']
202 |     print("Done clustering using res = 1 ...")
203 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
204 | 
205 |     sc.tl.leiden(adata, resolution=5, n_iterations=2)
206 |     adata.obs['leiden_res_5'] = adata.obs['leiden']
207 |     print("Done clustering using res = 5 ...")
208 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
209 | 
210 |     adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%trajectory_id))
211 |     pd.DataFrame(adata.obsm['X_pca']).to_csv(os.path.join(WORK_PATH, '%s_adata_scale.pca.csv'%trajectory_id))
212 | 
213 |     adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%trajectory_id), compression="gzip")
214 |     print("Done writing data ...")
215 |     print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
216 | 
217 | 
218 | 
219 | 
220 | 
221 | 
222 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/Integrating_adjacent_timepoints.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import scanpy as sc
 3 | import pandas as pd
 4 | import numpy as np
 5 | import os
 6 | import sys
 7 | from annoy import AnnoyIndex
 8 | 
 9 | WORK_PATH = './'
10 | 
11 | file = open(os.path.join(WORK_PATH, "batch_list.txt"))
12 | example_list = [line.rstrip() for line in file]
13 | file.close()
14 | 
15 | for example_i in example_list:
16 |     print(example_i)
17 | 
18 |     adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_i))
19 |     fdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_gene.csv'%example_i), index_col = 0)
20 |     pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_i), index_col = 0)
21 |     adata.obs = pdata
22 |     adata.var = fdata
23 | 
24 |     print(adata.shape)
25 | 
26 |     sc.pp.normalize_total(adata, target_sum=1e4)
27 |     sc.pp.log1p(adata)
28 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
29 |     adata = adata[:, adata.var.highly_variable]
30 |     sc.pp.scale(adata, max_value=10)
31 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
32 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
33 | 
34 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
35 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
36 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
37 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
38 | 
39 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
40 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
41 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
42 | 
43 |     adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_i))
44 | 
45 |     adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_i), compression="gzip")
46 | 
47 |     X = adata.obsm['X_pca']
48 |     print(X.shape)
49 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_i), X, delimiter=",", fmt='%1.3f')
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/Run_Scrublet.py:
--------------------------------------------------------------------------------
 1 | 
 2 | ### We performed Scrublet to detect doublets
 3 | 
 4 | import scanpy as sc
 5 | import pandas as pd
 6 | import numpy as np
 7 | import scrublet as scr
 8 | import os, sys
 9 | 
10 | WORK_PATH = './'
11 | 
12 | for cnt in range(6):
13 |     
14 |     batch_id = str(cnt+1)
15 |     print(batch_id)
16 | 
17 |     adata = sc.read_mtx(os.path.join(WORK_PATH, "gene_count_%s.mtx"%batch_id))
18 |     pdata = pd.read.csv(os.path.join(WORK_PATH, "df_cell_%s.csv"%batch_id, index_col = 0))
19 |     fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv", index_col = 0))
20 | 
21 |     adata.obs_names = list(pdata['sample'])
22 |     adata.var_names = list(fdata['gene_id'])
23 | 
24 |     min_counts = 3
25 |     min_cells = 3
26 |     vscore_percentile = 85
27 |     n_pc = 30
28 |     expected_doublet_rate = 0.06
29 |     sim_doublet_ratio = 2
30 |     n_neighbors = 30
31 |     scaling_method = 'log'
32 |     scrublet_results = scr.compute_doublet_scores(
33 |         adata.X, 
34 |         min_counts = min_counts, 
35 |         min_cells = min_cells, 
36 |         vscore_percentile = vscore_percentile, 
37 |         n_prin_comps = n_pc,
38 |         scaling_method = scaling_method,
39 |         expected_doublet_rate = expected_doublet_rate,
40 |         sim_doublet_ratio = sim_doublet_ratio,
41 |         n_neighbors = n_neighbors, 
42 |         use_approx_neighbors = True, 
43 |         get_doublet_neighbor_parents = False
44 |     )
45 | 
46 |     pd.DataFrame(scrublet_results['doublet_scores_observed_cells']).to_csv(os.path.join(WORK_PATH, "doublet_scores_observed_cells_%s.csv"%batch_id), index = False, header = None)
47 |     pd.DataFrame(scrublet_results['doublet_scores_simulated_doublets']).to_csv(os.path.join(WORK_PATH, "doublet_scores_simulated_doublets_%s.csv"%batch_id), index = False, header = None)
48 | 
49 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/step2_Plot_UMAP.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ###################################
  3 | ### Section - 1, Basic analysis ###
  4 | ###################################
  5 | 
  6 | 
  7 | ###################
  8 | ### Cell number ###
  9 | ###################
 10 | 
 11 | source("JAX_help_code.R")
 12 | source("JAX_color_code.R")
 13 | 
 14 | work_path = "./"
 15 | 
 16 | pd = readRDS(paste0(work_path, "df_cell.rds"))
 17 | ### n = 11,441,407 cells
 18 | 
 19 | ### cells labelled "E8.0-E8.5" are counted under "E8.5"
 20 | pd$day = as.vector(pd$day)
 21 | pd$day[pd$day == "E8.0-E8.5"] = "E8.5"
 22 | 
 23 | ### Making bar plot for cell number from individual timepoints (Fig. 1a)
 24 | 
 25 | day_levels = rev(names(day_color_plate))
 26 | pd_cell_num_1 = pd %>% group_by(day) %>% tally() %>% rename(cell_num = n) %>% as.data.frame() %>%
 27 |     mutate(day = factor(day, levels = day_levels), log2_cell_num = log2(cell_num))
 28 | 
 29 | p1 = ggplot(pd_cell_num_1, aes(x = day, y = cell_num, fill = day)) +
 30 |     geom_col() +
 31 |     coord_flip() +
 32 |     scale_fill_manual(values = day_color_plate) +
 33 |     scale_y_continuous(trans = log2_trans(),
 34 |                        breaks = trans_breaks("log2", function(x) 2^x),
 35 |                        labels = trans_format("log2", math_format(2^.x))) +
 36 |     geom_text(aes(label = scales::comma(cell_num)), hjust = -0.1, size = 3,
 37 |               position = position_dodge(width = 1), inherit.aes = TRUE) +
 38 |     labs(x = "", y = "Cell number") +
 39 |     theme_classic(base_size = 15) +
 40 |     theme(legend.position = "none",
 41 |           axis.text.x = element_text(color = "black"),
 42 |           axis.text.y = element_text(color = "black"))
 43 | ### written as a 3.5 x 8 inch PDF
 44 | pdf(paste0(work_path, "Cell_number_timepoints.pdf"), width = 3.5, height = 8)
 45 | print(p1)
 46 | dev.off()
 47 | 
 48 | ### Making bar plot for cell number from individual somite counts (Fig. 1a)
 49 | ### (cells without a somite count are left out)
 50 | pd_cell_num_2 = pd %>% filter(!is.na(somite_count)) %>% group_by(somite_count) %>% tally() %>% rename(cell_num = n) %>% as.data.frame()
 51 | pd_cell_num_2$somite_count = factor(pd_cell_num_2$somite_count, levels = rev(names(somite_color_plate)))
 52 | 
 53 | p2 = ggplot(pd_cell_num_2, aes(x = somite_count, y = cell_num, fill = somite_count)) +
 54 |     geom_col() +
 55 |     coord_flip() +
 56 |     scale_fill_manual(values = somite_color_plate) +
 57 |     scale_y_continuous(trans = log2_trans(),
 58 |                        breaks = trans_breaks("log2", function(x) 2^x),
 59 |                        labels = trans_format("log2", math_format(2^.x))) +
 60 |     geom_text(aes(label = scales::comma(cell_num)), hjust = -0.1, size = 3,
 61 |               position = position_dodge(width = 1), inherit.aes = TRUE) +
 62 |     labs(x = "", y = "Cell number") +
 63 |     theme_classic(base_size = 15) +
 64 |     theme(legend.position = "none",
 65 |           axis.text.x = element_text(color = "black"),
 66 |           axis.text.y = element_text(color = "black"))
 67 | ### written as a 3.5 x 8 inch PDF
 68 | pdf(paste0(work_path, "Cell_number_somite_counts.pdf"), width = 3.5, height = 8)
 69 | print(p2)
 70 | dev.off()
 71 | 
 72 | 
 73 | ####################################
 74 | ### Making 2D UMAP visualization ###
 75 | ####################################
 76 | 
 77 | source("JAX_help_code.R")
 78 | source("JAX_color_code.R")
 79 | 
 80 | work_path = "./"
 81 | 
 82 | pd = readRDS(paste0(work_path, "df_cell.rds"))
 83 | ### n = 11,441,407 cells
 84 | 
 85 | x = as.vector(pd$day)
 86 | x[pd$day == "E8.0-E8.5"] = "E8.5"
 87 | pd$day = as.vector(x)
 88 | 
 89 | ### Making 2D UMAP visualization for the global embedding
 90 | 
 91 | ### Highlight cells with their major trajectories (Fig. 1c)
 92 | ### The first point layer draws every cell in the default color; the second, smaller layer colors it by trajectory
 93 | p = pd %>%
 94 |     ggplot() +
 95 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
 96 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = major_trajectory), size=0.3) +
 97 |     scale_color_manual(values = major_trajectory_color_plate) +
 98 |     theme_void() +
 99 |     theme(legend.position="none")
100 | ggsave(paste0(work_path, "Global_embedding_2D_UMAP_major_trajectory.png"), plot = p, width = 10, height = 10, dpi = 300) ### saved separately: `+ ggsave()` fails in recent ggplot2
101 | 
102 | ### Highlight cells with their day timepoints (Fig. 1c)
103 | ### Of note, we need to downsample cells from each timepoint to a similar number (i.e. 100,000)
104 | ### E8.75 and E17.25 are kept in full; every other timepoint is downsampled to 100,000 cells
105 | pd_1 = pd %>% filter(day %in% c("E8.75", "E17.25")) %>% as.data.frame()
106 | pd_2 = pd %>% filter(!day %in% c("E8.75", "E17.25")) %>% group_by(day) %>% sample_n(100000) %>% as.data.frame()
107 | pd_sub = rbind(pd_1, pd_2)
108 | p = pd_sub[sample(nrow(pd_sub)),] %>% ### shuffle the ROWS (note the comma) so no timepoint is always drawn on top
109 |     ggplot() +
110 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
111 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.3) +
112 |     scale_color_manual(values=day_color_plate) +
113 |     theme_void() +
114 |     theme(legend.position="none")
115 | ggsave(paste0(work_path, "Global_embedding_2D_UMAP_day.png"), plot = p, width = 10, height = 10, dpi = 300) ### saved separately: `+ ggsave()` fails in recent ggplot2
116 | 
117 | 
118 | 
119 | ###############################################################################
120 | ### Making 3D UMAP for individual major_trajectories (Extended Data Fig. 3) ###
121 | ###############################################################################
122 | 
123 | source("JAX_help_code.R")
124 | source("JAX_color_code.R")
125 | 
126 | work_path = "./"
127 | 
128 | pd = readRDS(paste0(work_path, "df_cell.rds"))
129 | ### n = 11,441,407 cells
130 | 
131 | major_trajectory_list = names(major_trajectory_color_plate)
132 | 
133 | ### at most this many cells are drawn per 3D plot; larger groups are randomly downsampled
134 | max_cell_num = 300000
135 | 
136 | for(i in major_trajectory_list){
137 |     print(i)
138 |     
139 |     ### NOTE(review): the loop runs over major-trajectory names but filters on `global_celltype` - confirm the column
140 |     pd_i = pd %>% filter(global_celltype == i)
141 |     if(nrow(pd_i) > max_cell_num){
142 |         pd_i = pd_i %>% sample_n(max_cell_num)
143 |     }
144 |     
145 |     ### one interactive 3D scatter per group, colored by sub_celltype (t1 / t2 come from the sourced helper code)
146 |     fig = pd_i %>%
147 |         plot_ly(x = ~sub_UMAP_3d_1, y = ~sub_UMAP_3d_2, z = ~sub_UMAP_3d_3, size=I(30), color = ~sub_celltype) %>% 
148 |         layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2),
149 |                             yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2),
150 |                             zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2)))
151 |     
152 |     saveWidget(fig, paste0(work_path, i, "_celltype_update.html"), selfcontained = FALSE, libdir = "tmp")
153 |     
154 | }
155 | 
156 | 
157 | 
158 | 
159 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/step3_Pseudobulk.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ###################################
 3 | ### Section - 1, Basic analysis ###
 4 | ###################################
 5 | 
 6 | #########################################
 7 | ### Pseudobulk analysis using Monocle ###
 8 | #########################################
 9 | 
10 | source("JAX_help_code.R")
11 | source("JAX_color_code.R")
12 | 
13 | work_path = "./"
14 | ### the column names of cds are used as embryo ids further down
15 | cds = readRDS(paste0(work_path, "embryo_cds.rds"))
16 | 
17 | ### identifying the highly variable genes (log-normalize, then top 3,000 features by the "vst" method)
18 | obj = doObjectTransform(cds, transform_to = "seurat")
19 | obj = NormalizeData(obj, normalization.method = "LogNormalize", scale.factor = 10000)
20 | obj = FindVariableFeatures(obj, selection.method = "vst", nfeatures = 3000)
21 | gene_use = VariableFeatures(obj)
22 | 
23 | ### performing PCA analysis on the log-normalized expression, restricted to the genes selected above
24 | set.seed(2016)
25 | FM = monocle3:::normalize_expr_data(cds, 
26 |                                     norm_method = "log", 
27 |                                     pseudo_count = 1)
28 | FM = FM[gene_use,]
29 | ### up to 10 components, with centering and scaling switched on; my_sparse_prcomp_irlba is not defined in this file
30 | num_dim = 10
31 | scaling = TRUE
32 | set.seed(2016)
33 | irlba_res = my_sparse_prcomp_irlba(Matrix::t(FM), 
34 |                                    n = min(num_dim, min(dim(FM)) - 1), 
35 |                                    center = scaling, 
36 |                                    scale. = scaling)
37 | preproc_res = irlba_res$x
38 | row.names(preproc_res) = colnames(cds)
39 | ### share of variance per component, relative to the sum over the entries of irlba_res$sdev
40 | prop_var_expl = irlba_res$sdev^2/sum(irlba_res$sdev^2)
41 | print(prop_var_expl)
42 | ### one row per embryo: first three PCs plus day and sex taken from cds
43 | df = data.frame(embryo_id = rownames(preproc_res),
44 |                 PC_1 = preproc_res[,1],
45 |                 PC_2 = preproc_res[,2],
46 |                 PC_3 = preproc_res[,3],
47 |                 day = as.vector(cds$day),
48 |                 embryo_sex = as.vector(cds$embryo_sex))
49 | df$day = factor(df$day, levels = names(day_color_plate))
50 | ### NOTE(review): the percentages in the axis titles are hard-coded, not read from prop_var_expl - update if the input changes
51 | fig = plot_ly(df, x = ~PC_1, y = ~PC_2, z = ~PC_3, color = ~day, colors = day_color_plate) %>% 
52 |     layout(scene = list(xaxis=list(title = list(text ='PC_1 (77.3%)', font = t1), tickfont = t2),
53 |                         yaxis=list(title = list(text ='PC_2 (9.9%)', font = t1), tickfont = t2),
54 |                         zaxis=list(title = list(text ='PC_3 (4.2%)', font = t1), tickfont = t2),
55 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))),
56 |            showlegend = FALSE)
57 | saveWidget(fig, paste0(work_path, "embryo_pca_day.html"), selfcontained = FALSE, libdir = "tmp")
58 | 
59 | sex_color_plate = c("F" = "#ff0000",
60 |                     "M" = "#0000FF")
61 | fig = plot_ly(df, x = ~PC_1, y = ~PC_2, z = ~PC_3, color = ~embryo_sex, colors = sex_color_plate) %>% 
62 |     layout(scene = list(xaxis=list(title = list(text ='PC_1 (77.3%)', font = t1), tickfont = t2),
63 |                         yaxis=list(title = list(text ='PC_2 (9.9%)', font = t1), tickfont = t2),
64 |                         zaxis=list(title = list(text ='PC_3 (4.2%)', font = t1), tickfont = t2),
65 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))),
66 |            showlegend = FALSE)
67 | saveWidget(fig, paste0(work_path, "embryo_pca_sex.html"), selfcontained = FALSE, libdir = "tmp")
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/Section_1_basic_analysis/step4_Estimate_cell_num.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ###################################
  3 | ### Section - 1, Basic analysis ###
  4 | ###################################
  5 | 
  6 | ######################################################################
  7 | ### Estimating absolute number of cells from individual timepoints ###
  8 | ######################################################################
  9 | 
 10 | work_path = "./"
 11 | 
 12 | ### Cell number estimated by qPCR experiment (million)
 13 | cell_num = c("E8.5" = 0.21,
 14 |              "E9.5" = 0.94,
 15 |              "E10.5" = 10.10,
 16 |              "E11.5" = 22.98,
 17 |              "E12.5" = 45.03,
 18 |              "E13.5" = 60.59,
 19 |              "E14.5" = 131.00,
 20 |              "E15.5" = 216.79,
 21 |              "E16.5" = 353.17,
 22 |              "E17.5" = 515.85,
 23 |              "E18.5" = 584.78,
 24 |              "E19.5" = 671.50)
 25 | 
 26 | df = data.frame(x = as.numeric(gsub("E", "", as.vector(names(cell_num)))),
 27 |                 log2_y = log2(cell_num * 1000000))
 28 | 
 29 | ### fit polynomial regression with degree 3
 30 | fit3 = lm(log2_y~poly(x,3,raw=TRUE), data=df)
 31 | print(summary(fit3)$adj.r.squared) ###  0.9858479
 32 | 
 33 | day_list = c("E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 
 34 |              "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 
 35 |              "E12.25", "E12.5", "E12.75", "E13.0", "E13.25", "E13.5", "E13.75", 
 36 |              "E14.0", "E14.25", "E14.333", "E14.75", "E15.0", "E15.25", "E15.5", 
 37 |              "E15.75", "E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25", 
 38 |              "E17.5", "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "E19.5")
 39 | 
 40 | x_axis = as.numeric(gsub("E","",day_list))
 41 | 
 42 | plot(df$x, df$log2_y, pch=19, xlab='x', ylab='log2_y')
 43 | lines(x_axis, predict(fit3, data.frame(x=x_axis)), col='purple')
 44 | 
 45 | cell_num_pred = round(2^predict(fit3, data.frame(x=x_axis)))
 46 | 
 47 | df_x = data.frame(day = x_axis,
 48 |                   cell_num_pred_log2 = predict(fit3, data.frame(x=x_axis)),
 49 |                   cell_num_pred = round(2^predict(fit3, data.frame(x=x_axis))))
 50 | 
 51 | day_x = day = paste0("E", df_x$day)
 52 | day[day_x == "E9"] = "E9.0"
 53 | day[day_x == "E10"] = "E10.0"
 54 | day[day_x == "E11"] = "E11.0"
 55 | day[day_x == "E12"] = "E12.0"
 56 | day[day_x == "E13"] = "E13.0"
 57 | day[day_x == "E14"] = "E14.0"
 58 | day[day_x == "E15"] = "E15.0"
 59 | day[day_x == "E16"] = "E16.0"
 60 | day[day_x == "E17"] = "E17.0"
 61 | day[day_x == "E18"] = "E18.0"
 62 | day[day_x == "E19.5"] = "P0"
 63 | df_x$day = as.vector(day)
 64 | 
 65 | ### summary of fit3 curve
 66 | a3 = 0.011369
 67 | a2 = -0.583861
 68 | a1 = 10.397036
 69 | a4 = -35.469755
 70 | 
 71 | ### the function of curve
 72 | p_function = function(x){
 73 |     ### cubic polynomial in x built from the coefficients a3, a2, a1 and the intercept a4
 74 |     a3*x^3 + a2*x^2 + a1*x^1 + a4
 75 | }
 76 | 
 77 | ### derivative of the curve, i.e. the growth rate (change in log2 cell number per day) at a given timepoint
 78 | d_function = function(x){
 79 |     ### first derivative of the cubic a3*x^3 + a2*x^2 + a1*x + a4 with respect to x
 80 |     3*a3*x^2 + 2*a2*x + a1
 81 | }
 82 | 
 83 | ### timepoint
 84 | x_axis = as.numeric(gsub("E","",day_list))
 85 | 
 86 | ### the doubling time
 87 | doubling_time = 24*2/(2^d_function(x_axis))
 88 | 
 89 | df_x$doubling_time = doubling_time
 90 | df_x$x_axis = x_axis
 91 | 
 92 | write.csv(df_x, paste0(work_path, "cell_num_prediction.csv"))
 93 | 
 94 | 
 95 | ################################################################################################
 96 | ### Plotting the cell composition of each major trajectories as a function of time (Fig. 1e) ###
 97 | ################################################################################################
 98 | 
 99 | source("JAX_help_code.R")
100 | source("JAX_color_code.R")
101 | 
102 | work_path = "./"
103 | 
104 | pd = readRDS(paste0(work_path, "df_cell.rds"))
105 | ### n = 11,441,407 cells
106 | 
107 | x = as.vector(pd$day)
108 | x[pd$day == "E8.0-E8.5"] = "E8.5"
109 | pd$day = as.vector(x)
110 | 
111 | cell_num = read.csv(paste0(work_path, "cell_num_prediction.csv"))
112 | 
113 | df = pd %>%
114 |     group_by(day, major_trajectory) %>%
115 |     tally() %>%
116 |     dplyr::rename(cell_num = n)
117 | df_sub = pd %>%
118 |     group_by(day) %>%
119 |     tally() %>%
120 |     dplyr::rename(cell_num_total = n)
121 | df = df %>%
122 |     left_join(df_sub, by = "day") %>%
123 |     mutate(percentage = cell_num/cell_num_total) %>%
124 |     left_join(cell_num, by = "day") %>%
125 |     mutate(cell_num_pred_log2 = cell_num_pred_log2 * percentage)
126 | df$day = factor(df$day, levels = names(day_color_plate))
127 | df$major_trajectory = factor(df$major_trajectory, levels = names(major_trajectory_color_plate))
128 | 
129 | cell_num_x = c("E8.5" = 0.21,
130 |                "E9.5" = 0.94,
131 |                "E10.5" = 10.10,
132 |                "E11.5" = 22.98,
133 |                "E12.5" = 45.03,
134 |                "E13.5" = 60.59,
135 |                "E14.333" = 131.00,
136 |                "E15.5" = 216.79,
137 |                "E16.5" = 353.17,
138 |                "E17.5" = 515.85,
139 |                "E18.5" = 584.78,
140 |                "P0" = 671.50)
141 | 
142 | df_y = data.frame(day = as.vector(names(cell_num_x)),
143 |                   log2_y = log2(cell_num_x * 1000000))
144 | 
145 | df_y$day = factor(df_y$day, levels = names(day_color_plate))
146 | 
147 | cell_num$day = factor(cell_num$day, levels = names(day_color_plate))
148 | 
149 | p = ggplot() +
150 |     geom_bar(data = df, aes(x = day, y = cell_num_pred_log2, group = major_trajectory, fill = major_trajectory), stat="identity", width = 1) +
151 |     geom_point(data=df_y, aes(x=day, y=log2_y), shape = 21, colour = "black", fill = "white", size = 2, stroke = 1.5, alpha = 0.8) +
152 |     labs(x = "", y = "") +
153 |     theme_classic(base_size = 12) +
154 |     scale_fill_manual(values = major_trajectory_color_plate) +
155 |     theme(legend.position="none") +
156 |     theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1), axis.text.y = element_text(color="black"))
157 | 
158 | pdf(paste0(work_path, "Cell_composition_over_time.pdf"), 7, 5)
159 | print(p)
160 | dev.off()
161 | 
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/Embedding_scanpy_posterior_embryo.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ###################################################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on NMPs, gut, and notochord during early somitogenesis ###
  4 | ###################################################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | 
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | 
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0"]
 30 | celltype_include = ["Notochord", "Nodal cilia", "NMPs and spinal cord progenitors", "Gut", "Mesodermal progenitors (Tbx6+)"]
 31 | 
 32 | example_id = "posterior_embryo"
 33 | print(example_id)
 34 | 
 35 | adata = adata[adata.obs["day"].isin(day_include)]
 36 | adata = adata[adata.obs["celltype"].isin(celltype_include)]
 37 | 
 38 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 39 | 
 40 | print("Done reading data ...")
 41 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 42 | 
 43 | sc.pp.normalize_total(adata, target_sum=1e4)
 44 | print("Done normalization by total counts ...")
 45 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 46 | 
 47 | sc.pp.log1p(adata)
 48 | print("Done log transformation ...")
 49 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 50 | 
 51 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 52 | print("Done finding highly variable genes ...")
 53 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 54 | 
 55 | adata = adata[:, adata.var.highly_variable]
 56 | print("Done filtering in highly variable genes ...")
 57 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 58 | 
 59 | sc.pp.scale(adata, max_value=10)
 60 | print("Done scaling data ...")
 61 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 62 | ### done with scaling (no regress_out step is run in this script) ###
 63 | 
 64 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 65 | print("Done performing PCA ...")
 66 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 67 | 
 68 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 69 | print("Done computing neighborhood graph ...")
 70 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 71 | 
 72 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 73 | print("Done UMAP ...")
 74 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 75 | 
 76 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 77 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 78 | print("Done clustering using res = 1 ...")
 79 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 80 | 
 81 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 82 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 83 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 84 | 
 85 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 86 | print("Done UMAP ...")
 87 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 88 | 
 89 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 90 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 91 | 
 92 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
 93 | 
 94 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
 95 | print("Done writing data ...")
 96 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 97 | 
 98 | emb = adata.obsm['X_pca']
 99 | print(emb.shape)
100 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
101 | 
102 | 
103 | #########################################################
104 | ### Perform subclustering on three major trajectories ###
105 | #########################################################
106 | 
107 | import scanpy as sc
108 | import pandas as pd
109 | import numpy as np
110 | import os, sys
111 | 
112 | work_path = './'
113 | example_id = "posterior_embryo"
114 | 
115 | adata_all = sc.read_h5ad(os.path.join(work_path, '%s_adata_scale.h5ad'%example_id))
116 | pdata = pd.read_csv(os.path.join(work_path, '%s_adata_scale.obs.csv'%example_id), index_col = 0)
117 | adata_all.obs = pdata
118 | 
119 | subcluster_list = ["NMP_Mesoderm", "Notochord", "Gut"]
120 | 
121 | for i in subcluster_list:
122 | 
123 |     adata = adata_all[adata_all.obs["cluster_tmp"] == i]
124 |     print(adata.shape)
125 | 
126 |     sc.pp.normalize_total(adata, target_sum=1e4)
127 |     sc.pp.log1p(adata)
128 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
129 |     adata = adata[:, adata.var.highly_variable]
130 |     sc.pp.scale(adata, max_value=10)
131 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
132 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
133 | 
134 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
135 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
136 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
137 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
138 | 
139 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
140 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
141 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
142 | 
143 |     sc.tl.leiden(adata, resolution=1, n_iterations=2)
144 |     adata.obs['leiden_res_1'] = adata.obs['leiden']
145 | 
146 |     adata.obs.to_csv(os.path.join(work_path, '%s_adata_scale.%s.obs.csv'%(example_id, i)))
147 |     
148 |     emb = adata.obsm['X_pca']
149 |     np.savetxt(os.path.join(work_path, '%s_adata_scale.%s.PCs.csv'%(example_id, i)), emb, delimiter=",", fmt='%1.3f')
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/Run_geosketch.py:
--------------------------------------------------------------------------------
 1 | 
 2 | ################################################
 3 | ### run geosketch to downsample to 10% cells ###
 4 | ################################################
 5 | 
 6 | import numpy as np
 7 | import pandas as pd
 8 | import scanpy as sc
 9 | import os, sys
10 | from time import time
11 | from geosketch import gs
12 | 
13 | start_time = time.time()
14 | 
15 | WORK_PATH = './'
16 | 
17 | ### Of note, this is the adata object after running Dimension_reduction_scanpy.py in Section-1
18 | ### We need the PCA features to perform geosketch
19 | 
20 | adata = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_scale.h5ad'))
21 | 
22 | X_dimred = adata.obsm['X_pca']
23 | 
24 | start = time()
25 | 
26 | N = 1144141 # Number of samples to obtain from the data set.
27 | sketch_index = gs(X_dimred, N, replace=False)
28 | 
29 | np.savetxt(os.path.join(WORK_PATH, "adata_scale_geosketch_downsample.csv"), np.array(sketch_index)+1, delimiter=",", fmt='%s')
30 | adata.obs.to_csv(os.path.join(WORK_PATH, 'adata_scale.obs.csv'))
31 | 
32 | end = time()
33 | print(end - start)
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/step1_posterior_embryo.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #####################################
 3 | ### Section - 2, Posterior embryo ###
 4 | #####################################
 5 | 
 6 | ####################################
 7 | ### Making 3D UMAP visualization ###
 8 | ####################################
 9 | 
10 | source("JAX_help_code.R")
11 | source("JAX_color_code.R")
12 | work_path = "./"
13 | 
14 | example_i = "posterior_embryo"
15 | ### read the per-cell table <example_i>_adata_scale.obs.csv and re-key its rows by cell_id (TRUE/FALSE spelled out: T and F are reassignable)
16 | pd = read.csv(paste0(work_path, example_i, "_adata_scale.obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
17 | rownames(pd) = as.vector(pd$cell_id)
18 | pd$somite_count = factor(pd$somite_count, levels = names(somite_color_plate))
19 | 
20 | ### making 3D UMAP, with cells are colored by their initial cell type annotations (Fig. 2a)
21 | fig = plot_ly(pd, x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~celltype_update, colors = posterior_embryo_color_plate) %>% 
22 |     layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2),
23 |                         yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2),
24 |                         zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2),
25 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
26 | saveWidget(fig, paste0(work_path, example_i, "_celltype_update.html"), selfcontained = FALSE, libdir = "tmp")
27 | 
28 | ### making 3D UMAP, with cells are colored by somite counts (Fig. 2b)
29 | fig = plot_ly(pd, x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~somite_count, colors = somite_color_plate) %>% 
30 |     layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2),
31 |                         yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2),
32 |                         zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2),
33 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
34 | saveWidget(fig, paste0(work_path, example_i, "_somite_count.html"), selfcontained = FALSE, libdir = "tmp")
35 | 
36 | 
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/step2_NMP_mesoderm.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #####################################
  3 | ### Section - 2, Posterior embryo ###
  4 | #####################################
  5 | 
  6 | ########################
  7 | ### Analysis on NMPs ###
  8 | ########################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | example_i = "posterior_embryo"
 15 | 
 16 | i = "NMP_Mesoderm"
 17 | 
 18 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.", i, ".obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
 19 | ### NOTE: ggsave() is called as its own statement with plot = p; chaining it onto the plot with `+` errors in recent ggplot2 releases
 20 | ### 2D UMAP of NMPs, with cells colored by their initial cell types (Fig. 2c)
 21 | p = pd_x %>%
 22 |     ggplot() +
 23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
 24 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.35) +
 25 |     theme_void() +
 26 |     scale_color_manual(values=posterior_embryo_color_plate) +
 27 |     theme(legend.position="none")
 28 | ggsave(paste0(work_path, "NMPs_celltype.png"), plot = p, width = 4, height = 3, dpi = 300)
 29 | 
 30 | ### 2D UMAP of NMPs, with cells colored by their somite counts (Fig. 2c)
 31 | p = pd_x %>%
 32 |     ggplot() +
 33 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
 34 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.35) +
 35 |     theme_void() +
 36 |     scale_color_manual(values=somite_color_plate) +
 37 |     theme(legend.position="none")
 38 | ggsave(paste0(work_path, "NMPs_somite_count.png"), plot = p, width = 4, height = 3, dpi = 300)
 39 | 
 40 | ###############################
 41 | ### Performing PCA analysis ###
 42 | ###############################
 43 | 
 44 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),]
 45 | gene_count_x = doExtractData(pd_x, mouse_gene_sub)
 46 | obj_x = CreateSeuratObject(gene_count_x, meta.data = pd_x)
 47 | 
 48 | npcs = 30
 49 | reduction.key = "PC_"
 50 | seed.use = 42
 51 | 
 52 | obj_x = NormalizeData(obj_x, normalization.method = "LogNormalize", scale.factor = 10000)
 53 | obj_x = FindVariableFeatures(obj_x, selection.method = "vst", nfeatures = 2500)
 54 | genes_include = VariableFeatures(obj_x)
 55 | obj_x = ScaleData(obj_x, verbose = FALSE, features = rownames(obj_x))
 56 | scale_dat = GetAssayData(obj_x, slot = "scale.data")
 57 | print(dim(scale_dat))
 58 | 
 59 | set.seed(seed = seed.use)
 60 | pca.results <- irlba::irlba(A = t(x = scale_dat[genes_include,]), nv = npcs)
 61 | feature.loadings <- pca.results$v
 62 | set.seed(seed = seed.use)
 63 | cell.embeddings <- pca.results$u %*% diag(pca.results$d)
 64 | 
 65 | rownames(x = feature.loadings) <- genes_include
 66 | colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs)
 67 | rownames(x = cell.embeddings) <- colnames(obj_x)
 68 | colnames(x = cell.embeddings) <- colnames(x = feature.loadings)
 69 | 
 70 | stdev <- pca.results$d/sqrt(max(1, ncol(scale_dat) - 1))
 71 | eigValues = (stdev)^2  ## EigenValues
 72 | varExplained = eigValues / sum(eigValues)
 73 | 
 74 | res = list(cell.embeddings = cell.embeddings,
 75 |            feature.loadings = feature.loadings,
 76 |            varExplained = varExplained)
 77 | 
 78 | emb = res[["cell.embeddings"]]
 79 | emb = emb[rownames(pd_x),]
 80 | pd_x = cbind(pd_x, emb[,c(1:3)])
 81 | print(res[["varExplained"]])
 82 | pd_x$somite_count = factor(pd_x$somite_count, levels = names(somite_color_plate))
 83 | 
 84 | ### making 3D PCA plot (Fig. 2e)
 85 | fig = plot_ly(pd_x, x=~PC_1, y=~PC_2, z=~PC_3, size = I(30), color = ~celltype_update, colors = posterior_embryo_color_plate) %>% 
 86 |     layout(scene = list(xaxis=list(title = list(text ='PC_1 (21.0%)', font = t1), tickfont = t2),
 87 |                         yaxis=list(title = list(text ='PC_2 (13.9%)', font = t1), tickfont = t2),
 88 |                         zaxis=list(title = list(text ='PC_3 (11.1%)', font = t1), tickfont = t2),
 89 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
 90 | saveWidget(fig, paste0(work_path, example_i, "_NMP_Mesoderm_PCA_celltype_update.html"), selfcontained = FALSE, libdir = "tmp")
 91 | 
 92 | #################################################################################
 93 | ### making scatter plot between each PCs and gene expression or somite counts ###
 94 | #################################################################################
 95 | 
 96 | gene_count_x = gene_count_x[,rownames(pd_x)]  ### reuse the matrix from doExtractData(); `gene_count` is not defined in this script (NOTE(review): confirm it is not supplied by the sourced helper code)
 97 | emb = emb[rownames(pd_x),]
 98 | 
 99 | gene_count_x = t(t(gene_count_x) / colSums(gene_count_x)) * 100000  ### scale each cell (column) to 100,000 total counts
100 | gene_count_x = gene_count_x[c("ENSMUSG00000074637","ENSMUSG00000030699","ENSMUSG00000020160","ENSMUSG00000062327","ENSMUSG00000009900","ENSMUSG00000024987"),]
101 | rownames(gene_count_x) = c("Sox2","Tbx6","Meis1","T","Wnt3a","Cyp26a1")  ### symbols in the same order as the Ensembl IDs above
102 | gene_count_x@x = log(gene_count_x@x + 1)
103 | ### long-format table: expression of Sox2 (row 1) and Tbx6 (row 2) paired with each cell's PC_1 coordinate
104 | df = data.frame(exp = c(as.vector(gene_count_x[1,]), as.vector(gene_count_x[2,])),
105 |                 gene = c(rep("Sox2", nrow(emb)), rep("Tbx6", nrow(emb))),
106 |                 PC_1 = c(as.vector(emb[,1]), as.vector(emb[,1])))
107 | 
108 | ### Fig. 2e
109 | df %>%
110 |     ggplot(aes(PC_1, exp, color = gene)) + geom_smooth(method = loess, se = FALSE) +
111 |     labs(x="", y="", title="") +
112 |     theme_classic(base_size = 12) +
113 |     theme(plot.title = element_text(hjust = 0.5)) +
114 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) +
115 |     scale_color_brewer(palette = "Set1") 
116 | 
117 | ### long-format table: expression of Meis1, T, Wnt3a and Cyp26a1 (rows 3-6) paired with each cell's PC_3 coordinate
118 | df = data.frame(exp = c(as.vector(gene_count_x[3,]), as.vector(gene_count_x[4,]), as.vector(gene_count_x[5,]), as.vector(gene_count_x[6,])),
119 |                 gene = c(rep("Meis1", nrow(emb)), rep("T", nrow(emb)), rep("Wnt3a", nrow(emb)), rep("Cyp26a1", nrow(emb))),
120 |                 PC_3 = c(as.vector(emb[,3]), as.vector(emb[,3]), as.vector(emb[,3]), as.vector(emb[,3])))
121 | 
122 | ### Fig. 2e
123 | df %>%
124 |     ggplot(aes(PC_3, exp, color = gene)) + geom_smooth(method = loess, se = FALSE) +
125 |     labs(x="", y="", title="") +
126 |     theme_classic(base_size = 12) +
127 |     theme(plot.title = element_text(hjust = 0.5)) +
128 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) +
129 |     scale_color_brewer(palette = "Set2")
130 | 
131 | df  = data.frame(somite_count = pd_x$somite_count,
132 |                  PC_2 = as.vector(emb[,2]))
133 | df$somite_count = factor(df$somite_count, levels = names(somite_color_plate))
134 | df$somite = as.vector(gsub(" somites", "", df$somite_count))
135 | df$somite = factor(df$somite, levels = c(0, 2:12, 14:18, 20:34))
136 | 
137 | ### Fig. 2e
138 | df %>%
139 |     ggplot( aes(somite, PC_2, fill = somite_count)) + 
140 |     geom_boxplot(outlier.shape = NA) + 
141 |     labs(x="", y="", title="") +
142 |     theme_classic(base_size = 5) +
143 |     scale_fill_manual(values=somite_color_plate) +
144 |     theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black")) +
145 |     NoLegend()
146 | 
147 | 
148 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/step3_Notochord.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #####################################
  3 | ### Section - 2, Posterior embryo ###
  4 | #####################################
  5 | 
  6 | #############################
  7 | ### Analysis on Notochord ###
  8 | #############################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | example_i = "posterior_embryo"
 15 | 
 16 | i = "Notochord"
 17 | 
 18 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.", i, ".obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
 19 | ### 2D UMAP of notochord cells colored by cell type; ggsave() is its own statement (plot = p) because `+ ggsave()` errors in recent ggplot2 releases
 20 | p = pd_x %>%
 21 |     ggplot() +
 22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) +
 23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.8) +
 24 |     theme_void() +
 25 |     scale_color_manual(values=celltype_color_plate) +
 26 |     theme(legend.position="none")
 27 | ggsave(paste0(work_path, "Notochord_celltype.png"), plot = p, width = 4, height = 3, dpi = 300)
 28 | ### same embedding, colored by somite count
 29 | p = pd_x %>%
 30 |     ggplot() +
 31 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) +
 32 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.8) +
 33 |     theme_void() +
 34 |     scale_color_manual(values=somite_color_plate) +
 35 |     theme(legend.position="none")
 36 | ggsave(paste0(work_path, "Notochord_day.png"), plot = p, width = 4, height = 3, dpi = 300)
 37 | 
 38 | 
 39 | ###############################
 40 | ### Performing PCA analysis ###
 41 | ###############################
 42 | 
 43 | ### excluding nodal cilia before performing PCA (the extracted matrix is subset to the remaining cells; `gene_count` was never defined here)
 44 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),]
 45 | gene_count_x = doExtractData(pd_x, mouse_gene_sub)
 46 | pd_x = pd_x[pd_x$celltype_update != "Nodal cilia",]
 47 | gene_count_x = gene_count_x[,rownames(pd_x)]
 48 | obj_x = CreateSeuratObject(gene_count_x, meta.data = pd_x)
 49 | 
 50 | npcs = 30
 51 | reduction.key = "PC_"
 52 | seed.use = 42
 53 | 
 54 | obj_x = NormalizeData(obj_x, normalization.method = "LogNormalize", scale.factor = 10000)
 55 | obj_x = FindVariableFeatures(obj_x, selection.method = "vst", nfeatures = 2500)
 56 | genes_include = VariableFeatures(obj_x)
 57 | obj_x = ScaleData(obj_x, verbose = FALSE, features = rownames(obj_x))
 58 | scale_dat = GetAssayData(obj_x, slot = "scale.data")
 59 | print(dim(scale_dat))
 60 | 
 61 | set.seed(seed = seed.use)
 62 | pca.results <- irlba::irlba(A = t(x = scale_dat[genes_include,]), nv = npcs)
 63 | feature.loadings <- pca.results$v
 64 | set.seed(seed = seed.use)
 65 | cell.embeddings <- pca.results$u %*% diag(pca.results$d)
 66 | 
 67 | rownames(x = feature.loadings) <- genes_include
 68 | colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs)
 69 | rownames(x = cell.embeddings) <- colnames(obj_x)
 70 | colnames(x = cell.embeddings) <- colnames(x = feature.loadings)
 71 | 
 72 | stdev <- pca.results$d/sqrt(max(1, ncol(scale_dat) - 1))
 73 | eigValues = (stdev)^2  ## EigenValues
 74 | varExplained = eigValues / sum(eigValues)
 75 | 
 76 | res = list(cell.embeddings = cell.embeddings,
 77 |            feature.loadings = feature.loadings,
 78 |            varExplained = varExplained)
 79 | 
 80 | emb = res[["cell.embeddings"]]
 81 | emb = emb[rownames(pd_x),]
 82 | pd_x = cbind(pd_x, emb[,c(1:3)])
 83 | print(res[["varExplained"]])
 84 | pd_x$somite_count = factor(pd_x$somite_count, levels = names(somite_color_plate))
 85 | 
 86 | fig = plot_ly(pd_x, x=~PC_1, y=~PC_2, z=~PC_3, size = I(30), color = ~somite_count, colors = somite_color_plate) %>% 
 87 |     layout(scene = list(xaxis=list(title = list(text ='PC_1 (28.7%)', font = t1), tickfont = t2),
 88 |                         yaxis=list(title = list(text ='PC_2 (11.4%)', font = t1), tickfont = t2),
 89 |                         zaxis=list(title = list(text ='PC_3 (6.7%)', font = t1), tickfont = t2),
 90 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
 91 | saveWidget(fig, paste0(work_path, example_i, "_Notochord_PCA_celltype_update.html"), selfcontained = FALSE, libdir = "tmp")
 92 | 
 93 | ##########################################################################
 94 | ### counting cell number of nodal cilia as a function of somite counts ###
 95 | ##########################################################################
 96 | 
 97 | ### Extended Data Fig. 4h
 98 | ### data that we are using
 99 | 
100 | ### somite_count  a     n       frac
101 | ### 0 somites  4  9296 0.0004302926
102 | ### 2 somites 20  7329 0.0027288853
103 | ### 3 somites  7  4564 0.0015337423
104 | ### 4 somites  7  8362 0.0008371203
105 | ### 5 somites  4  6872 0.0005820722
106 | ### 7 somites  7 17182 0.0004074031
107 | ### 8 somites  3 19415 0.0001545197
108 | ### 9 somites  5 13703 0.0003648836
109 | ### 11 somites  3 20150 0.0001488834
110 | ### NOTE(review): `x` is not assigned in the visible part of this script; presumably it holds the table above (columns somite_count, a, n, frac) -- confirm
111 | x %>% ggplot(aes(x=somite_count, y=frac, fill = somite_count)) +   # one bar per somite_count, bar height = frac
112 |     scale_fill_viridis(discrete=TRUE)  +
113 |     geom_bar(stat="identity") +   # bars show the frac values as given (no counting)
114 |     theme_classic(base_size = 10) +
115 |     theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black")) +   # x labels rotated 90 degrees
116 |     theme(legend.position="none")   # legend hidden: fill maps to the same variable as the x axis
117 | 
118 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/step4_Gut.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #####################################
 3 | ### Section - 2, Posterior embryo ###
 4 | #####################################
 5 | 
 6 | #######################
 7 | ### Analysis on Gut ###
 8 | #######################
 9 | 
10 | source("JAX_help_code.R")    # project helpers (not shown in this file)
11 | source("JAX_color_code.R")   # presumably defines the *_color_plate palettes used below -- confirm
12 | work_path = "./"   # fixed typo: was `work_pat`, while every path below is built from `work_path`
13 | 
14 | example_i = "posterior_embryo"   # prefix of the input/output file names
15 | 
16 | i = "Gut"   # subset label used in the input file name
17 | ### per-cell metadata table; row names are taken from the first CSV column
18 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.", i, ".obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
19 | 
20 | p = pd_x %>%   # 2D UMAP coloured by cell type; the larger default-coloured points underneath give each dot an outline
21 |     ggplot() +
22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) +
23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.8) +
24 |     theme_void() +
25 |     scale_color_manual(values=celltype_color_plate) +
26 |     theme(legend.position="none")
27 | ggsave(paste0(work_path, "Gut_celltype.png"), plot = p, width = 4, height = 3, dpi = 300)   # saved as a separate call: adding ggsave() with `+` is rejected by current ggplot2
28 | 
29 | p = pd_x %>%   # same layout, coloured by somite count
30 |     ggplot() +
31 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) +
32 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.8) +
33 |     theme_void() +
34 |     scale_color_manual(values=somite_color_plate) +
35 |     theme(legend.position="none")
36 | ggsave(paste0(work_path, "Gut_day.png"), plot = p, width = 4, height = 3, dpi = 300)   # explicit plot = p instead of relying on last_plot()
37 | 
38 | 
39 | ###############################
40 | ### Performing PCA analysis ###
41 | ###############################
42 | ### keep protein_coding / pseudogene / lincRNA genes on chr1-19 and chrM; `mouse_gene` and doExtractData() are not defined in this file (presumably from JAX_help_code.R -- confirm)
43 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),]
44 | gene_count_x = doExtractData(pd_x, mouse_gene_sub)
45 | obj_x = CreateSeuratObject(gene_count_x, meta.data = pd_x)
46 | ### PCA settings: number of components, column-name prefix for the components, RNG seed
47 | npcs = 30
48 | reduction.key = "PC_"
49 | seed.use = 42
50 | ### log-normalize, select 2,500 variable features (vst), then scale every gene in the object
51 | obj_x = NormalizeData(obj_x, normalization.method = "LogNormalize", scale.factor = 10000)
52 | obj_x = FindVariableFeatures(obj_x, selection.method = "vst", nfeatures = 2500)
53 | genes_include = VariableFeatures(obj_x)
54 | obj_x = ScaleData(obj_x, verbose = FALSE, features = rownames(obj_x))
55 | scale_dat = GetAssayData(obj_x, slot = "scale.data")
56 | print(dim(scale_dat))
57 | ### truncated SVD (irlba) of the transposed scaled matrix restricted to the variable genes
58 | set.seed(seed = seed.use)
59 | pca.results <- irlba::irlba(A = t(x = scale_dat[genes_include,]), nv = npcs)
60 | feature.loadings <- pca.results$v
61 | set.seed(seed = seed.use)   # NOTE(review): looks redundant -- the next line is a plain matrix product
62 | cell.embeddings <- pca.results$u %*% diag(pca.results$d)
63 | ### name loading rows by gene, embedding rows by the object's column names, and columns PC_1..PC_<npcs>
64 | rownames(x = feature.loadings) <- genes_include
65 | colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs)
66 | rownames(x = cell.embeddings) <- colnames(obj_x)
67 | colnames(x = cell.embeddings) <- colnames(x = feature.loadings)
68 | ### varExplained is each component's share of the summed eigenvalues of the npcs retained components only
69 | stdev <- pca.results$d/sqrt(max(1, ncol(scale_dat) - 1))
70 | eigValues = (stdev)^2  ## EigenValues
71 | varExplained = eigValues / sum(eigValues)
72 | ### bundle the outputs used by the plotting code below
73 | res = list(cell.embeddings = cell.embeddings,
74 |            feature.loadings = feature.loadings,
75 |            varExplained = varExplained)
76 | 
77 | emb = res[["cell.embeddings"]]
78 | emb = emb[rownames(pd_x),]   # reorder embedding rows to match pd_x
79 | pd_x = cbind(pd_x, emb[,c(1:3)])   # append the first three components (PC_1..PC_3)
80 | print(res[["varExplained"]])
81 | pd_x$somite_count = factor(pd_x$somite_count, levels = names(somite_color_plate))
82 | pc_lab = sprintf("PC_%d (%.1f%%)", 1:3, 100 * res[["varExplained"]][1:3])   # axis titles from the computed fractions; they were hard-coded to the same numbers as the Notochord script
83 | fig = plot_ly(pd_x, x=~PC_1, y=~PC_2, z=~PC_3, size = I(30), color = ~somite_count, colors = somite_color_plate) %>% 
84 |     layout(scene = list(xaxis=list(title = list(text = pc_lab[1], font = t1), tickfont = t2),
85 |                         yaxis=list(title = list(text = pc_lab[2], font = t1), tickfont = t2),
86 |                         zaxis=list(title = list(text = pc_lab[3], font = t1), tickfont = t2),
87 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
88 | saveWidget(fig, paste0(work_path, example_i, "_Gut_PCA_celltype_update.html"), selfcontained = FALSE, libdir = "tmp")   # `t1` / `t2` font lists are not defined in this file (presumably from the sourced helpers)
89 | 
90 | 


--------------------------------------------------------------------------------
/Section_2_posterior_embryo/step6_somites_validation.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #####################################
  3 | ### Section - 2, Posterior embryo ###
  4 | #####################################
  5 | 
  6 | source("JAX_help_code.R")
  7 | source("JAX_color_code.R")
  8 | work_path = "./"
  9 | ### per-cell metadata; the plots below read UMAP_2d_1, UMAP_2d_2, anno, somite_count and embryo_id from it
 10 | pd = readRDS(paste0(work_path, "pd_somites.rds"))
 11 | ### n = 104,671 nuclei
 12 | ### 20-colour palette. NOTE(review): `x` (the names) is not assigned in the visible part of this script, and the plots below pass `color_plate`, not `celltype_color_plate`, to scale_color_manual -- confirm both
 13 | celltype_color_plate = c("#54c15f", "#c34fb7", "#91b737", "#7b63d0", "#c9a63c",
 14 |                          "#7081ca", "#478734", "#da3f78", "#56c09e", "#cf4a35",
 15 |                          "#999999", "#4eacd7", "#e18f4f", "#be75b4", "#a0b46c",
 16 |                          "#a0445d", "#37845f", "#df7c82", "#72732b", "#a06432")
 17 | names(celltype_color_plate) = x
 18 | 
 19 | ### Extended Data Fig. 4b -- 2D UMAP coloured by annotation
 20 | p = pd %>%
 21 |     ggplot() +
 22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.8) +   # larger default-coloured points underneath act as an outline
 23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno), size=0.6) +
 24 |     theme_void() +
 25 |     scale_color_manual(values=color_plate) +   # NOTE(review): `color_plate` is not defined in this file; possibly `celltype_color_plate` was intended -- confirm
 26 |     theme(legend.position="none")
 27 | ggsave(paste0(work_path, "somites.anno.png"), plot = p, width = 6, height = 6, dpi = 300)   # saved as a separate call: adding ggsave() with `+` is rejected by current ggplot2
 28 | 
 29 | somite_color_plate = c("#440154", "#482475", "#414487", "#355f8d",
 30 |                        "#2a788e", "#21918c", "#22a884", "#44bf70",
 31 |                        "#7ad151", "#bddf26", "#fde725")
 32 | names(somite_color_plate) = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites")   # one colour per somite-count label, e.g. "8 somites"
 33 | 
 34 | ### Extended Data Fig. 4c -- 2D UMAP coloured by somite count
 35 | p = pd %>%
 36 |     ggplot() +
 37 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.8) +
 38 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=0.6) +
 39 |     theme_void() +
 40 |     scale_color_manual(values=somite_color_plate) +
 41 |     theme(legend.position="none")
 42 | ggsave(paste0(work_path, "somites.somite_count.png"), plot = p, width = 6, height = 6, dpi = 300)   # saved as a separate call: adding ggsave() with `+` is rejected by current ggplot2
 43 | 
 44 | ### Extended Data Fig. 4a
 45 | pd$embryo_id = factor(pd$embryo_id, levels = rev(names(table(pd$embryo_id))))   # reverse the level order before drawing the flipped bar chart
 46 | p1 = pd %>%
 47 |     group_by(embryo_id, somite_count) %>% tally() %>% rename(cell_num = n) %>%   # number of rows per embryo / somite-count pair
 48 |     ggplot(aes(embryo_id, cell_num, fill = somite_count)) + 
 49 |     geom_bar(stat="identity") +
 50 |     coord_flip() +   # horizontal bars, one per embryo_id
 51 |     scale_fill_manual(values = somite_color_plate) + 
 52 |     geom_text(aes(label = scales::comma(cell_num)),   # count printed next to each bar with thousands separators
 53 |               hjust = -0.1,
 54 |               position = position_dodge(width = 1),
 55 |               inherit.aes = TRUE,
 56 |               size = 5) +
 57 |     labs(x = "", y = "Cell number") +
 58 |     theme_classic(base_size = 15) +
 59 |     theme(legend.position="none") +
 60 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))
 61 | pdf(paste0(work_path, "cell_num.pdf"), 5, 8)   # 5 x 8 inch PDF device
 62 | print(p1)
 63 | dev.off()
 64 | 
 65 | 
 66 | #######################
 67 | ### Focusing on NMP ###
 68 | #######################
 69 | 
 70 | 
 71 | pd_NMP = read.csv(paste0(work_path, "adata_somites_NMP.obs.csv"), row.names=1, as.is=TRUE)
 72 | ### same palette as defined earlier in this script, repeated so this section can run on its own
 73 | somite_color_plate = c("#440154", "#482475", "#414487", "#355f8d",
 74 |                        "#2a788e", "#21918c", "#22a884", "#44bf70",
 75 |                        "#7ad151", "#bddf26", "#fde725")
 76 | names(somite_color_plate) = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites")
 77 | ### fix the factor level order to increasing somite count
 78 | pd_NMP$somite_count = factor(pd_NMP$somite_count, levels = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites"))
 79 | p = pd_NMP %>%
 80 |     ggplot() +
 81 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.5) +
 82 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = somite_count), size=1) +
 83 |     theme_void() +
 84 |     scale_color_manual(values=somite_color_plate) +
 85 |     theme(legend.position="none")
 86 | ggsave(paste0(work_path, "NMP.somite_count.png"), plot = p, width = 6, height = 6, dpi = 300)   # saved as a separate call: adding ggsave() with `+` is rejected by current ggplot2
 87 | 
 88 | 
 89 | p = pd_NMP %>%
 90 |     ggplot() +
 91 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.5) +
 92 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno), size=1) +
 93 |     theme_void() +
 94 |     scale_color_manual(values=color_plate) +   # NOTE(review): `color_plate` is not defined in this file -- confirm it comes from the sourced helpers
 95 |     theme(legend.position="none")
 96 | ggsave(paste0(work_path, "NMP.anno.png"), plot = p, width = 6, height = 6, dpi = 300)   # explicit plot = p instead of relying on last_plot()
 97 | 
 98 | pd = data.frame(pData(cds))   # NOTE(review): `cds` is not created in the visible part of this script -- confirm where it is loaded; this also overwrites the `pd` read above
 99 | pd$somite_count = factor(pd$somite_count, levels = paste0(c(8,9,10,11,12,13,14,16,17,20,21), " somites"))
100 | pd$Cdx1 = as.vector(exprs(cds)["ENSMUSG00000024619",])   # per-cell expression values, looked up by Ensembl gene id
101 | pd$Hoxa10 = as.vector(exprs(cds)["ENSMUSG00000000938",])
102 | pd$T = as.vector(exprs(cds)["ENSMUSG00000062327",])
103 | pd$Meis1 = as.vector(exprs(cds)["ENSMUSG00000020160",])
104 | ### percent of cells with non-zero Cdx1 per somite count (.drop = FALSE keeps levels with zero expressing cells)
105 | df = pd %>% filter(Cdx1 != 0) %>% group_by(somite_count, .drop = FALSE) %>% tally() %>%
106 |     left_join(pd %>% group_by(somite_count) %>% tally() %>% rename(total_n = n), by = "somite_count") %>%
107 |     mutate(percent = 100*n/total_n)
108 | 
109 | p1 <-ggplot(data=df, aes(x=somite_count, y=percent, fill = somite_count)) +
110 |     geom_bar(stat="identity") + labs(x="",y="% of cells expressed Cdx1") +
111 |     scale_fill_manual(values=somite_color_plate) + theme_classic(base_size = 10) + theme(legend.position="none") +
112 |     theme(axis.text.x = element_text(color="black", angle = 90), axis.text.y = element_text(color="black")) 
113 | ### same computation for Hoxa10; the join key is named explicitly instead of being guessed from shared columns
114 | df = pd %>% filter(Hoxa10 != 0) %>% group_by(somite_count, .drop = FALSE) %>% tally() %>%
115 |     left_join(pd %>% group_by(somite_count) %>% tally() %>% rename(total_n = n), by = "somite_count") %>%
116 |     mutate(percent = 100*n/total_n)
117 | 
118 | p2 <-ggplot(data=df, aes(x=somite_count, y=percent, fill = somite_count)) +
119 |     geom_bar(stat="identity") + labs(x="",y="% of cells expressed Hoxa10") +
120 |     scale_fill_manual(values=somite_color_plate) + theme_classic(base_size = 10) + theme(legend.position="none") +
121 |     theme(axis.text.x = element_text(color="black", angle = 90), axis.text.y = element_text(color="black")) 
122 | 
123 | # Extended Data Fig. 4f -- the two bar charts stacked vertically in one PDF
124 | 
125 | library(gridExtra) 
126 | pdf(paste0(work_path, "NMP_Cdx1_Hoxa10.pdf"), 4, 6)
127 | grid.arrange(p1, p2, nrow=2, ncol=1) 
128 | dev.off()
129 | 
130 | 
131 | 
132 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/Embedding_Renal.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ##############################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on renal subset ###
  4 | ##############################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | example_id = "Renal_big"
 18 | print(example_id)
 19 | 
 20 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_id))
 21 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_id), index_col = 0)
 22 | fdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_gene.csv'%example_id), index_col = 0)
 23 | adata.obs = pdata
 24 | adata.var = fdata
 25 | 
 26 | print("Done reading data ...")
 27 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 28 | 
 29 | sc.pp.normalize_total(adata, target_sum=1e4)
 30 | print("Done normalization by total counts ...")
 31 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 32 | 
 33 | sc.pp.log1p(adata)
 34 | print("Done log transformation ...")
 35 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 36 | 
 37 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 38 | print("Done finding highly variable genes ...")
 39 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 40 | 
 41 | adata = adata[:, adata.var.highly_variable]
 42 | print("Done filtering in highly variable genes ...")
 43 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 44 | 
 45 | sc.pp.scale(adata, max_value=10)
 46 | print("Done scaling data ...")
 47 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 48 | ### done with regress_out and scale ###
 49 | 
 50 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 51 | print("Done performing PCA ...")
 52 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 53 | 
 54 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 55 | print("Done computing neighborhood graph ...")
 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 57 | 
 58 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 59 | print("Done UMAP ...")
 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 61 | 
 62 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 63 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 64 | print("Done clustering using res = 1 ...")
 65 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 66 | 
 67 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 68 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 69 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 70 | 
 71 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 72 | print("Done UMAP ...")
 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 74 | 
 75 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 76 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 77 | 
 78 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s.obs.csv'%example_id))
 79 | 
 80 | adata.write(os.path.join(WORK_PATH, '%s.h5ad'%example_id), compression="gzip")
 81 | print("Done writing data ...")
 82 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 83 | 
 84 | 
 85 | 
 86 | ### Only co-embedding renal pericytes and stromal cells
 87 | 
 88 | 
 89 | example_id = "Renal_pericytes_stromal"
 90 | print(example_id)
 91 | 
 92 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_id))
 93 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_id), index_col = 0)
 94 | fdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_gene.csv'%example_id), index_col = 0)
 95 | adata.obs = pdata
 96 | adata.var = fdata
 97 | 
 98 | print("Done reading data ...")
 99 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
100 | 
101 | sc.pp.normalize_total(adata, target_sum=1e4)
102 | print("Done normalization by total counts ...")
103 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
104 | 
105 | sc.pp.log1p(adata)
106 | print("Done log transformation ...")
107 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
108 | 
109 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
110 | print("Done finding highly variable genes ...")
111 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
112 | 
113 | adata = adata[:, adata.var.highly_variable]
114 | print("Done filtering in highly variable genes ...")
115 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
116 | 
117 | sc.pp.scale(adata, max_value=10)
118 | print("Done scaling data ...")
119 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
120 | ### done with regress_out and scale ###
121 | 
122 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
123 | print("Done performing PCA ...")
124 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
125 | 
126 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
127 | print("Done computing neighborhood graph ...")
128 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
129 | 
130 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
131 | print("Done UMAP ...")
132 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
133 | 
134 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
135 | adata.obs['leiden_res_1'] = adata.obs['leiden']
136 | print("Done clustering using res = 1 ...")
137 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
138 | 
139 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
140 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
141 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
142 | 
143 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
144 | print("Done UMAP ...")
145 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
146 | 
147 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
148 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
149 | 
150 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s.obs.csv'%example_id))
151 | 
152 | adata.write(os.path.join(WORK_PATH, '%s.h5ad'%example_id), compression="gzip")
153 | print("Done writing data ...")
154 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
155 | 
156 | 
157 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/Embedding_scanpy_kidney.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ##############################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on renal subset ###
  4 | ##############################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | 
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | 
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | celltype_include = ["Anterior intermediate mesoderm",
 30 | "Collecting duct intercalated cells",
 31 | "Connecting tubule",
 32 | "Metanephric mesenchyme",
 33 | "Podocytes",
 34 | "Proximal tubule cells",
 35 | "Ascending loop of Henle",
 36 | "Collecting duct principal cells",
 37 | "Distal convoluted tubule",
 38 | "Nephron progenitors",
 39 | "Posterior intermediate mesoderm",
 40 | "Ureteric bud"]
 41 | 
 42 | example_id = "renal"
 43 | print(example_id)
 44 | 
 45 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)]
 46 | 
 47 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 48 | 
 49 | print("Done reading data ...")
 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 51 | 
 52 | sc.pp.normalize_total(adata, target_sum=1e4)
 53 | print("Done normalization by total counts ...")
 54 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 55 | 
 56 | sc.pp.log1p(adata)
 57 | print("Done log transformation ...")
 58 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 59 | 
 60 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 61 | print("Done finding highly variable genes ...")
 62 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 63 | 
 64 | adata = adata[:, adata.var.highly_variable]
 65 | print("Done filtering in highly variable genes ...")
 66 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 67 | 
 68 | sc.pp.scale(adata, max_value=10)
 69 | print("Done scaling data ...")
 70 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 71 | ### done with regress_out and scale ###
 72 | 
 73 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 74 | print("Done performing PCA ...")
 75 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 76 | 
 77 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 78 | print("Done computing neighborhood graph ...")
 79 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 80 | 
 81 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 82 | print("Done UMAP ...")
 83 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 84 | 
 85 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 86 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 87 | print("Done clustering using res = 1 ...")
 88 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 89 | 
 90 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 91 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 92 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 93 | 
 94 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 95 | print("Done UMAP ...")
 96 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 97 | 
 98 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 99 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
100 | 
101 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
102 | 
103 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
104 | print("Done writing data ...")
105 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
106 | 
107 | emb = adata.obsm['X_pca']
108 | print(emb.shape)
109 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
110 | 
111 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/Embedding_scanpy_lateral_plate_mesoderm.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ###################################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the lateral plate mesoderm subset ###
  4 | ###################################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | 
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | 
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | celltype_include = ["Lateral plate and intermediate mesoderm"]
 30 | 
 31 | example_id = "LPM"
 32 | print(example_id)
 33 | 
 34 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)]
 35 | 
 36 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 37 | 
 38 | print("Done reading data ...")
 39 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 40 | 
 41 | sc.pp.normalize_total(adata, target_sum=1e4)
 42 | print("Done normalization by total counts ...")
 43 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 44 | 
 45 | sc.pp.log1p(adata)
 46 | print("Done log transformation ...")
 47 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 48 | 
 49 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 50 | print("Done finding highly variable genes ...")
 51 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 52 | 
 53 | adata = adata[:, adata.var.highly_variable]
 54 | print("Done filtering in highly variable genes ...")
 55 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 56 | 
 57 | sc.pp.scale(adata, max_value=10)
 58 | print("Done scaling data ...")
 59 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 60 | ### done with regress_out and scale ###
 61 | 
 62 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 63 | print("Done performing PCA ...")
 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 65 | 
 66 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 67 | print("Done computing neighborhood graph ...")
 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 69 | 
 70 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 71 | print("Done UMAP ...")
 72 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 73 | 
 74 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 75 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 76 | print("Done clustering using res = 1 ...")
 77 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 78 | 
 79 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 80 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 81 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 82 | 
 83 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 84 | print("Done UMAP ...")
 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 86 | 
 87 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 88 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 89 | 
 90 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
 91 | 
 92 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
 93 | print("Done writing data ...")
 94 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 95 | 
 96 | emb = adata.obsm['X_pca']
 97 | print(emb.shape)
 98 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
 99 | 
100 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/Embedding_scanpy_patterned_mesoderm_somites_26_34.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ###############################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the patterned mesoderm subset ###
  4 | ###############################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | example_id = "LPM_somite_26_34"
 18 | print(example_id)
 19 | 
 20 | 
 21 | ### First, only including backbone cells
 22 | 
 23 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_backbone.gene_count.mtx'%example_id))
 24 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_backbone.df_cell.csv'%example_id), index_col = 0)
 25 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0)
 26 | adata.obs = pdata
 27 | adata.var = fdata
 28 | 
 29 | print("Done reading data ...")
 30 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 31 | 
 32 | sc.pp.normalize_total(adata, target_sum=1e4)
 33 | print("Done normalization by total counts ...")
 34 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 35 | 
 36 | sc.pp.log1p(adata)
 37 | print("Done log transformation ...")
 38 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 39 | 
 40 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 41 | print("Done finding highly variable genes ...")
 42 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 43 | 
 44 | adata = adata[:, adata.var.highly_variable]
 45 | print("Done filtering in highly variable genes ...")
 46 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 47 | 
 48 | sc.pp.scale(adata, max_value=10)
 49 | print("Done scaling data ...")
 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 51 | ### done with regress_out and scale ###
 52 | 
 53 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 54 | print("Done performing PCA ...")
 55 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 56 | 
 57 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 58 | print("Done computing neighborhood graph ...")
 59 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 60 | 
 61 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 62 | print("Done UMAP ...")
 63 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 64 | 
 65 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 66 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 67 | print("Done clustering using res = 1 ...")
 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 69 | 
 70 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 71 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 72 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 73 | 
 74 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 75 | print("Done UMAP ...")
 76 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 77 | 
 78 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 79 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 80 | 
 81 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_backbone_adata_scale.obs.csv'%example_id))
 82 | 
 83 | adata.write(os.path.join(WORK_PATH, '%s_backbone_adata_scale_processed.h5ad'%example_id), compression="gzip")
 84 | print("Done writing data ...")
 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 86 | 
 87 | emb = adata.obsm['X_pca']
 88 | print(emb.shape)
 89 | np.savetxt(os.path.join(WORK_PATH, '%s_backbone_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
 90 | 
 91 | 
 92 | 
 93 | ### Next, including both backbone cells and derivatives
 94 | 
 95 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_all.gene_count.mtx'%example_id))
 96 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_all.df_cell.csv'%example_id), index_col = 0)
 97 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0)
 98 | adata.obs = pdata
 99 | adata.var = fdata
100 | 
101 | print("Done reading data ...")
102 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
103 | 
104 | sc.pp.normalize_total(adata, target_sum=1e4)
105 | print("Done normalization by total counts ...")
106 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
107 | 
108 | sc.pp.log1p(adata)
109 | print("Done log transformation ...")
110 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
111 | 
112 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
113 | print("Done finding highly variable genes ...")
114 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
115 | 
116 | adata = adata[:, adata.var.highly_variable]
117 | print("Done filtering in highly variable genes ...")
118 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
119 | 
120 | sc.pp.scale(adata, max_value=10)
121 | print("Done scaling data ...")
122 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
123 | ### done with scale (no regress_out step is run here) ###
124 | 
125 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
126 | print("Done performing PCA ...")
127 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
128 | 
129 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
130 | print("Done computing neighborhood graph ...")
131 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
132 | 
133 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
134 | print("Done UMAP ...")
135 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
136 | 
137 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
138 | adata.obs['leiden_res_1'] = adata.obs['leiden']
139 | print("Done clustering using res = 1 ...")
140 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
141 | 
142 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
143 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
144 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
145 | 
146 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
147 | print("Done UMAP ...")
148 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
149 | 
150 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
151 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
152 | 
153 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_all_adata_scale.obs.csv'%example_id))
154 | 
155 | adata.write(os.path.join(WORK_PATH, '%s_all_adata_scale_processed.h5ad'%example_id), compression="gzip")
156 | print("Done writing data ...")
157 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
158 | 
159 | emb = adata.obsm['X_pca']
160 | print(emb.shape)
161 | np.savetxt(os.path.join(WORK_PATH, '%s_all_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
162 | 
163 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/Embedding_scanpy_patterned_mesoderm_somites_5_20.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ############################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the LPM_somite_5_20 subset ###
  4 | ############################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | example_id = "LPM_somite_5_20"
 18 | print(example_id)
 19 | 
 20 | 
 21 | ### First, only including backbone cells
 22 | 
 23 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_backbone.gene_count.mtx'%example_id))
 24 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_backbone.df_cell.csv'%example_id), index_col = 0)
 25 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0)
 26 | adata.obs = pdata
 27 | adata.var = fdata
 28 | 
 29 | print("Done reading data ...")
 30 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 31 | 
 32 | sc.pp.normalize_total(adata, target_sum=1e4)
 33 | print("Done normalization by total counts ...")
 34 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 35 | 
 36 | sc.pp.log1p(adata)
 37 | print("Done log transformation ...")
 38 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 39 | 
 40 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 41 | print("Done finding highly variable genes ...")
 42 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 43 | 
 44 | adata = adata[:, adata.var.highly_variable]
 45 | print("Done filtering in highly variable genes ...")
 46 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 47 | 
 48 | sc.pp.scale(adata, max_value=10)
 49 | print("Done scaling data ...")
 50 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 51 | ### done with regress_out and scale ###
 52 | 
 53 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 54 | print("Done performing PCA ...")
 55 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 56 | 
 57 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 58 | print("Done computing neighborhood graph ...")
 59 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 60 | 
 61 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 62 | print("Done UMAP ...")
 63 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 64 | 
 65 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 66 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 67 | print("Done clustering using res = 1 ...")
 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 69 | 
 70 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 71 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 72 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 73 | 
 74 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 75 | print("Done UMAP ...")
 76 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 77 | 
 78 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 79 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 80 | 
 81 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_backbone_adata_scale.obs.csv'%example_id))
 82 | 
 83 | adata.write(os.path.join(WORK_PATH, '%s_backbone_adata_scale_processed.h5ad'%example_id), compression="gzip")
 84 | print("Done writing data ...")
 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 86 | 
 87 | emb = adata.obsm['X_pca']
 88 | print(emb.shape)
 89 | np.savetxt(os.path.join(WORK_PATH, '%s_backbone_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
 90 | 
 91 | 
 92 | 
 93 | ### Next, including both backbone cells and derivatives
 94 | 
 95 | adata = sc.read_mtx(os.path.join(WORK_PATH, '%s_all.gene_count.mtx'%example_id))
 96 | pdata = pd.read_csv(os.path.join(WORK_PATH, '%s_all.df_cell.csv'%example_id), index_col = 0)
 97 | fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0)
 98 | adata.obs = pdata
 99 | adata.var = fdata
100 | 
101 | print("Done reading data ...")
102 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
103 | 
104 | sc.pp.normalize_total(adata, target_sum=1e4)
105 | print("Done normalization by total counts ...")
106 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
107 | 
108 | sc.pp.log1p(adata)
109 | print("Done log transformation ...")
110 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
111 | 
112 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
113 | print("Done finding highly variable genes ...")
114 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
115 | 
116 | adata = adata[:, adata.var.highly_variable]
117 | print("Done filtering in highly variable genes ...")
118 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
119 | 
120 | sc.pp.scale(adata, max_value=10)
121 | print("Done scaling data ...")
122 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
123 | ### done with regress_out and scale ###
124 | 
125 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
126 | print("Done performing PCA ...")
127 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
128 | 
129 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
130 | print("Done computing neighborhood graph ...")
131 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
132 | 
133 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
134 | print("Done UMAP ...")
135 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
136 | 
137 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
138 | adata.obs['leiden_res_1'] = adata.obs['leiden']
139 | print("Done clustering using res = 1 ...")
140 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
141 | 
142 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
143 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
144 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
145 | 
146 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
147 | print("Done UMAP ...")
148 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
149 | 
150 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
151 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
152 | 
153 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_all_adata_scale.obs.csv'%example_id))
154 | 
155 | adata.write(os.path.join(WORK_PATH, '%s_all_adata_scale_processed.h5ad'%example_id), compression="gzip")
156 | print("Done writing data ...")
157 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
158 | 
159 | emb = adata.obsm['X_pca']
160 | print(emb.shape)
161 | np.savetxt(os.path.join(WORK_PATH, '%s_all_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
162 | 
163 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/Spatial_mapping.py:
--------------------------------------------------------------------------------
  1 | 
  2 | #######################################################################################
  3 | ### Part-1: extracting spatial coordinates and annotations from individual sections ###
  4 | #######################################################################################
  5 | 
  6 | import os, sys
  7 | import numpy as np
  8 | import pandas as pd
  9 | import scanpy as sc
 10 | 
 11 | WORK_PATH = "./"
 12 | 
 13 | newpath = os.path.join(WORK_PATH, 'annotation')
 14 | if not os.path.exists(newpath):
 15 |     os.makedirs(newpath)
 16 | 
 17 | file = open(os.path.join(WORK_PATH, "Mosta_file_list.txt"))
 18 | file_list = [line.rstrip().replace(".MOSTA.h5ad", "") for line in file]
 19 | file.close()
 20 | 
 21 | for file_id in file_list:
 22 | 
 23 |     mosta = sc.read_h5ad(os.path.join(WORK_PATH, '%s.MOSTA.h5ad'%file_id))
 24 |     mosta_meta = mosta.obs
 25 | 
 26 |     np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_color.csv'%file_id), mosta.uns['annotation_colors'], delimiter=",", fmt='%s')
 27 |     np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_color_id.csv'%file_id), mosta.obs['annotation'].cat.categories, delimiter=",", fmt='%s')
 28 |     np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_coor.csv'%file_id), mosta.obsm['spatial'], delimiter=",")
 29 |     np.savetxt(os.path.join(WORK_PATH, 'annotation', '%s.spatial_anno.csv'%file_id), mosta.obs['annotation'], delimiter=",", fmt='%s')
 30 | 
 31 | 
 32 | ##################################################################
 33 | ### Part-2: generating h5ad format profile for sc-RNA-seq data ###
 34 | ##################################################################
 35 | 
 36 | import os, sys
 37 | import numpy as np
 38 | import pandas as pd
 39 | import scanpy as sc
 40 | 
 41 | WORK_PATH = "./"
 42 | 
 43 | day_list = ["E95","E105","E115","E125","E135","E145","E155","E165"]
 44 | 
 45 | for day_id in day_list:
 46 | 
 47 |     adata = sc.read_mtx(os.path.join(WORK_PATH, 'sc_data', '%s.gene_count.mtx'%day_id))
 48 |     pdata = pd.read_csv(os.path.join(WORK_PATH, 'sc_data', '%s.df_cell.csv'%day_id), index_col = 0)
 49 |     fdata = pd.read_csv(os.path.join(WORK_PATH, 'sc_data', '%s.df_gene.csv'%day_id), index_col = 0)
 50 |     adata.obs = pdata
 51 |     adata.var = fdata
 52 | 
 53 |     adata.write(os.path.join(WORK_PATH, 'sc_data', '%s.sc_data.h5ad'%day_id)) 
 54 | 
 55 | 
 56 | ########################################################
 57 | ### Part-3: performing spatial mapping using Tangram ###
 58 | ########################################################
 59 | 
 60 | import os, sys
 61 | import numpy as np
 62 | import pandas as pd
 63 | import matplotlib.pyplot as plt
 64 | import seaborn as sns
 65 | import scanpy as sc
 66 | import torch
 67 | import tangram as tg
 68 | 
 69 | WORK_PATH = "./"
 70 | 
 71 | newpath = os.path.join(WORK_PATH, 'result')
 72 | if not os.path.exists(newpath):
 73 |     os.makedirs(newpath)
 74 | 
 75 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 76 | print('Using device:', device)
 77 | print(torch.cuda)
 78 | print(torch.version.cuda)
 79 | print(torch.cuda.is_available())
 80 | 
 81 | file = open(os.path.join(work_path, "MOSTA_file_list.txt"))
 82 | file_list = [line.rstrip().replace(".MOSTA.h5ad", "") for line in file]
 83 | file.close()
 84 | 
 85 | 
 86 | for spatial_id in file_list:
 87 | 
 88 |     print(spatial_id)
 89 | 
 90 |     mosta = sc.read(os.path.join(WORK_PATH, spatial_id + '.MOSTA.h5ad'))
 91 | 
 92 |     day_id = spatial_id.split('_')[0].replace('.','')
 93 |     adata = sc.read(os.path.join(WORK_PATH, 'sc_data', '%s.sc_data.h5ad'%day_id))
 94 | 
 95 |     sc.pp.normalize_total(adata, inplace=True)
 96 |     sc.pp.log1p(adata)
 97 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 98 | 
 99 |     if mosta.shape[0] > 90000:
100 |         sc.pp.subsample(mosta, n_obs = 90000)
101 | 
102 |     var_genes = adata.var.index[adata.var.highly_variable]
103 |     tg.pp_adatas(adata, mosta, genes=var_genes)
104 | 
105 |     ad_map = tg.map_cells_to_space(
106 |         adata_sc=adata,
107 |         adata_sp=mosta,
108 |         device='cuda:0'
109 |     )
110 | 
111 |     tg.project_cell_annotations(ad_map, mosta, annotation='celltype')
112 |     annotation_list = list(pd.unique(adata.obs['celltype']))
113 | 
114 |     colnames = ','.join(list(mosta.obsm['tangram_ct_pred'].columns))
115 | 
116 |     np.savetxt(os.path.join(WORK_PATH, 'result', '%s.result.csv'%spatial_id), mosta.obsm['tangram_ct_pred'], delimiter=",", fmt='%.8e', header=colnames)
117 |     np.savetxt(os.path.join(WORK_PATH, 'result', '%s.result.coor.csv'%spatial_id), mosta.obsm['spatial'], delimiter=",")
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 


--------------------------------------------------------------------------------
/Section_3_kidney_mesenchyme/step2_Lateral_plate_mesoderm.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ######################################
 3 | ### Section - 3, Kidney_mesenchyme ###
 4 | ######################################
 5 | 
 6 | ##############################################################################
 7 | ### Making 2D UMAP visualization for lateral plate & intermediate mesoderm ###
 8 | ##############################################################################
 9 | 
10 | source("JAX_help_code.R")
11 | source("JAX_color_code.R")
12 | work_path = "./"
13 | 
14 | example_i = "LPM"; print(example_i)
15 | 
16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
17 | 
18 | ### Fig. 3e
19 | ### ggsave() is called on its own with plot = p; newer ggplot2 releases reject adding it to a plot with "+"
20 | p = pd %>%
21 |     ggplot() +
22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.35) +
23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = lateral_plate_mesoderm_sub_clustering), size=0.2) +
24 |     theme_void() +
25 |     scale_color_manual(values=LPM_color_plate) +
26 |     theme(legend.position="none")
27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 8, height = 6, dpi = 300)
28 | ### Days with more than 10,000 rows are down-sampled to 10,000 rows each; all other days are kept whole
29 | x_table = table(pd$day)
30 | pd_1 = pd %>% filter(day %in% names(x_table)[x_table > 10000]) %>% group_by(day) %>% sample_n(10000) %>% as.data.frame()
31 | pd_2 = pd %>% filter(day %in% names(x_table)[x_table <= 10000]) %>% as.data.frame()
32 | pd_sub = rbind(pd_1, pd_2)
33 | pd_sub$day = factor(pd_sub$day, levels = names(day_color_plate))
34 | 
35 | ### Fig. 3e (sub panel on the top left); the colored layer is drawn from a random row order of pd_sub
36 | p = ggplot() +
37 |     geom_point(data = pd_sub, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.35) +
38 |     geom_point(data = pd_sub[sample.int(nrow(pd_sub)),], aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.2) +
39 |     scale_color_manual(values=day_color_plate) +
40 |     theme_void() +
41 |     theme(legend.position="none")
42 | ggsave(paste0(work_path, example_i, ".day.2D_UMAP.png"), plot = p, width = 8, height = 6, dpi = 300)
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/Section_4_eye/Embedding_scanpy_eye.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the eye subset ###
  4 | ################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | 
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | 
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | celltype_include = ["Amacrine cells",
 30 |                      "Amacrine/Horizontal precursor cells",
 31 |                      "Bipolar precursor cells",
 32 |                      "Cholinergic amacrine cells",
 33 |                      "Ciliary margin cells",
 34 |                      "Cone precursor cells",
 35 |                      "Horizontal cells",
 36 |                      "Naive retinal progenitor cells",
 37 |                      "Photoreceptor precursor cells",
 38 |                      "PV-containing retinal ganglion cells",
 39 |                      "Retinal ganglion cells",
 40 |                      "Retinal progenitor cells",
 41 |                      "Rod precursor cells",
 42 |                      "Eye field",
 43 |                      "Retinal pigment cells"]
 44 | 
 45 | example_id = "eye"
 46 | print(example_id)
 47 | 
 48 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)]
 49 | 
 50 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 51 | 
 52 | print("Done reading data ...")
 53 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 54 | 
 55 | sc.pp.normalize_total(adata, target_sum=1e4)
 56 | print("Done normalization by total counts ...")
 57 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 58 | 
 59 | sc.pp.log1p(adata)
 60 | print("Done log transformation ...")
 61 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 62 | 
 63 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 64 | print("Done finding highly variable genes ...")
 65 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 66 | 
 67 | adata = adata[:, adata.var.highly_variable]
 68 | print("Done filtering in highly variable genes ...")
 69 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 70 | 
 71 | sc.pp.scale(adata, max_value=10)
 72 | print("Done scaling data ...")
 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 74 | ### done with regress_out and scale ###
 75 | 
 76 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 77 | print("Done performing PCA ...")
 78 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 79 | 
 80 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 81 | print("Done computing neighborhood graph ...")
 82 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 83 | 
 84 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 85 | print("Done UMAP ...")
 86 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 87 | 
 88 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 89 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 90 | print("Done clustering using res = 1 ...")
 91 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 92 | 
 93 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 94 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 95 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 96 | 
 97 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 98 | print("Done UMAP ...")
 99 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
100 | 
101 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
102 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
103 | 
104 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
105 | 
106 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
107 | print("Done writing data ...")
108 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
109 | 
110 | emb = adata.obsm['X_pca']
111 | print(emb.shape)
112 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
113 | 
114 | 


--------------------------------------------------------------------------------
/Section_5_neuroectoderm/Embedding_early_neurons.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ##########################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the early neurons subset ###
  4 | ##########################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | 
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | 
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | celltype_include = ["GABAergic neurons", "Glutamatergic neurons", "Spinal cord dorsal progenitors", "Spinal cord ventral progenitors"]
 30 | 
 31 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 
 32 |              "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 
 33 |              "E12.25", "E12.5", "E12.75"]
 34 | 
 35 | example_id = "Neurons"
 36 | print(example_id)
 37 | 
 38 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)]
 39 | adata = adata[adata.obs["day"].isin(day_include)]
 40 | 
 41 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 42 | 
 43 | print("Done reading data ...")
 44 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 45 | 
 46 | sc.pp.normalize_total(adata, target_sum=1e4)
 47 | print("Done normalization by total counts ...")
 48 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 49 | 
 50 | sc.pp.log1p(adata)
 51 | print("Done log transformation ...")
 52 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 53 | 
 54 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 55 | print("Done finding highly variable genes ...")
 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 57 | 
 58 | adata = adata[:, adata.var.highly_variable]
 59 | print("Done filtering in highly variable genes ...")
 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 61 | 
 62 | sc.pp.scale(adata, max_value=10)
 63 | print("Done scaling data ...")
 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 65 | ### done with regress_out and scale ###
 66 | 
 67 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 68 | print("Done performing PCA ...")
 69 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 70 | 
 71 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 72 | print("Done computing neighborhood graph ...")
 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 74 | 
 75 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 76 | print("Done UMAP ...")
 77 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 78 | 
 79 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 80 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 81 | print("Done clustering using res = 1 ...")
 82 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 83 | 
 84 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 85 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 86 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 87 | 
 88 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 89 | print("Done UMAP ...")
 90 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 91 | 
 92 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 93 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 94 | 
 95 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
 96 | 
 97 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
 98 | print("Done writing data ...")
 99 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
100 | 
101 | emb = adata.obsm['X_pca']
102 | print(emb.shape)
103 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
104 | 
105 | 


--------------------------------------------------------------------------------
/Section_5_neuroectoderm/Embedding_neuroectoderm_derivatives.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ######################################################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the neuroectoderm derivatives subset ###
  4 | ######################################################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | 
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | 
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | celltype_include = ["CNS_neurons", "Ependymal_cells", "Intermediate_neuronal_progenitors", 
 30 |                     "Neuroectoderm_and_glia", "Oligodendrocytes"]
 31 | 
 32 | celltype_list_exclude = ["Amacrine cells", "Amacrine/Horizontal precursor cells", "Cholinergic amacrine cells", "Horizontal cells", "PV-containing retinal ganglion cells", "Retinal ganglion cells", "Ciliated nodal cells"]
 33 | 
 34 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 
 35 |                "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 
 36 |                "E12.25", "E12.5", "E12.75"]
 37 | 
 38 | example_id = "Neuroectoderm_derivative"
 39 | print(example_id)
 40 | 
 41 | adata = adata[adata.obs["major_trajectory"].isin(celltype_include)]
 42 | adata = adata[adata.obs["day"].isin(day_include)]
 43 | adata = adata[~adata.obs["celltype_update"].isin(celltype_list_exclude)]
 44 | 
 45 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 46 | 
 47 | print("Done reading data ...")
 48 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 49 | 
 50 | sc.pp.normalize_total(adata, target_sum=1e4)
 51 | print("Done normalization by total counts ...")
 52 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 53 | 
 54 | sc.pp.log1p(adata)
 55 | print("Done log transformation ...")
 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 57 | 
 58 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 59 | print("Done finding highly variable genes ...")
 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 61 | 
 62 | adata = adata[:, adata.var.highly_variable]
 63 | print("Done filtering in highly variable genes ...")
 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 65 | 
 66 | sc.pp.scale(adata, max_value=10)
 67 | print("Done scaling data ...")
 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 69 | ### done with regress_out and scale ###
 70 | 
 71 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 72 | print("Done performing PCA ...")
 73 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 74 | 
 75 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 76 | print("Done computing neighborhood graph ...")
 77 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 78 | 
 79 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 80 | print("Done UMAP ...")
 81 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 82 | 
 83 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 84 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 85 | print("Done clustering using res = 1 ...")
 86 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 87 | 
 88 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 89 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 90 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 91 | 
 92 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 93 | print("Done UMAP ...")
 94 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 95 | 
 96 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 97 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 98 | 
 99 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
100 | 
101 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
102 | print("Done writing data ...")
103 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
104 | 
105 | emb = adata.obsm['X_pca']
106 | print(emb.shape)
107 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
108 | 
109 | 


--------------------------------------------------------------------------------
/Section_5_neuroectoderm/Embedding_patterned_neuroectoderm.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ##########################################################################################################
  3 | ### Here, we performed basic analysis (normalization, dimension reduction, and clustering) on the patterned neuroectoderm subset ###
  4 | ##########################################################################################################
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | 
 13 | start_time = time.time()
 14 | 
 15 | WORK_PATH = './'
 16 | 
 17 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))  # load the four dataset files found in WORK_PATH
 18 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 19 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 20 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 21 | # Stack the four datasets into a single AnnData, then drop the per-dataset objects to release memory.
 22 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
 23 | del adata_1, adata_2, adata_3, adata_4
 24 | gc.collect()
 25 | # Replace the per-cell table wholesale; NOTE(review): presumably df_cell.csv rows are in the same order as the concatenated cells - confirm.
 26 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell.csv'), index_col = 0)
 27 | adata.obs = pdata
 28 | 
 29 | celltype_include = ["Telencephalon",
 30 |                   "Dorsal telencephalon",
 31 |                   "Hypothalamus",
 32 |                   "Diencephalon",
 33 |                   "Midbrain",
 34 |                   "Hypothalamus (Sim1+)",
 35 |                   "Anterior floor plate",
 36 |                   "Midbrain-hindbrain boundary",
 37 |                   "Anterior roof plate",
 38 |                   "Hindbrain",
 39 |                   "Floorplate and p3 domain",
 40 |                   "Spinal cord/r7/r8",
 41 |                   "Posterior roof plate"]
 42 | 
 43 | day_include = ["E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 
 44 |              "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 
 45 |              "E12.25", "E12.5", "E12.75"]
 46 | 
 47 | example_id = "Neuroectoderm_backbone"  # prefix of every output file written by this script
 48 | print(example_id)
 49 | # Keep only cells annotated with one of celltype_include and collected on one of day_include.
 50 | adata = adata[adata.obs["celltype_update"].isin(celltype_include)]
 51 | adata = adata[adata.obs["day"].isin(day_include)]
 52 | # Save the subset before any processing: despite the "_scale" in the file name, nothing has been normalized or scaled yet.
 53 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_id), compression="gzip")
 54 | 
 55 | print("Done reading data ...")
 56 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 57 | 
 58 | sc.pp.normalize_total(adata, target_sum=1e4)  # normalize every cell to a total of 1e4 counts
 59 | print("Done normalization by total counts ...")
 60 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')  # elapsed time since start_time, in hours (repeated after each step)
 61 | 
 62 | sc.pp.log1p(adata)
 63 | print("Done log transformation ...")
 64 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 65 | 
 66 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 67 | print("Done finding highly variable genes ...")
 68 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 69 | # Restrict the matrix to the genes flagged as highly variable by the previous step.
 70 | adata = adata[:, adata.var.highly_variable]
 71 | print("Done filtering in highly variable genes ...")
 72 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 73 | # Scale each gene and clip the scaled values at 10.
 74 | sc.pp.scale(adata, max_value=10)
 75 | print("Done scaling data ...")
 76 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 77 | ### done with scaling (no regress_out step is run in this script) ###
 78 | 
 79 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 80 | print("Done performing PCA ...")
 81 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 82 | # Neighborhood graph built from the 30 principal components computed above.
 83 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 84 | print("Done computing neighborhood graph ...")
 85 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 86 | # First embedding: 3-component UMAP (its coordinates are copied into obs further down).
 87 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 88 | print("Done UMAP ...")
 89 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 90 | # Leiden clustering at resolution 1; the labels are duplicated under a resolution-specific column name.
 91 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 92 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 93 | print("Done clustering using res = 1 ...")
 94 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 95 | # Copy the 3D coordinates into obs now, because the 2D run below writes to adata.obsm['X_umap'] again.
 96 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 97 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 98 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 99 | # Second embedding: 2-component UMAP with the same min_dist.
100 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
101 | print("Done UMAP ...")
102 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
103 | 
104 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
105 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
106 | # Export the per-cell table (metadata, Leiden labels and both sets of UMAP coordinates) as CSV.
107 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_id))
108 | 
109 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale_processed.h5ad'%example_id), compression="gzip")
110 | print("Done writing data ...")
111 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
112 | # Export the PCA coordinates as CSV, formatted with three decimals.
113 | emb = adata.obsm['X_pca']
114 | print(emb.shape)
115 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_id), emb, delimiter=",", fmt='%1.3f')
116 | 
117 | 


--------------------------------------------------------------------------------
/Section_5_neuroectoderm/step1_Patterned_neuroectoderm.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##################################
  3 | ### Section - 5, Neuroectoderm ###
  4 | ##################################
  5 | 
  6 | ###############################################################
  7 | ### Making 2D UMAP visualization of patterned neuroectoderm ###
  8 | ###############################################################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | example_i = "Neuroectoderm_backbone"; print(example_i)
 15 | 
 16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
 17 | 
 18 | ### Fig. 4a
 19 | ### Build the plot first, then save it explicitly: chaining `+ ggsave()` errors with ggplot2 >= 3.3.4 and leaves `p` unassigned.
 20 | p = pd %>%
 21 |     ggplot() +
 22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +   # larger uncolored points drawn underneath
 23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.3) +
 24 |     theme_void() +
 25 |     scale_color_manual(values=neuroectoderm_color_plate) +
 26 |     theme(legend.position="none")
 27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300)
 28 | 
 29 | 
 30 | #############################################################################
 31 | ### Making 3D UMAP visualization of patterned neuroectoderm + derivatives ###
 32 | #############################################################################
 33 | 
 34 | source("JAX_help_code.R")
 35 | source("JAX_color_code.R")
 36 | work_path = "./"
 37 | 
 38 | example_i = "Neuroectoderm_derivative"; print(example_i)
 39 | 
 40 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
 41 | 
 42 | ### Fig. 4b
 43 | ### 3D UMAP of a random subset of at most 250,000 cells; capping at nrow(pd) avoids the sample() error on smaller tables.
 44 | fig = plot_ly(pd[sample(seq_len(nrow(pd)), min(nrow(pd), 250000)),], x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~major_trajectory, colors = major_trajectory_color_plate) %>% 
 45 |     layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2),
 46 |                         yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2),
 47 |                         zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2),
 48 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
 49 | saveWidget(fig, paste0(work_path, example_i, "_major_trajectory.html"), selfcontained = FALSE, libdir = "tmp")
 50 | 
 51 | ### Fig. 4c
 52 | ### Same view colored by day; the subset is drawn again, so it is not the same set of cells as in Fig. 4b.
 53 | fig = plot_ly(pd[sample(seq_len(nrow(pd)), min(nrow(pd), 250000)),], x=~UMAP_1, y=~UMAP_2, z=~UMAP_3, size = I(30), color = ~day, colors = day_color_plate) %>% 
 54 |     layout(scene = list(xaxis=list(title = list(text ='UMAP_1', font = t1), tickfont = t2),
 55 |                         yaxis=list(title = list(text ='UMAP_2', font = t1), tickfont = t2),
 56 |                         zaxis=list(title = list(text ='UMAP_3', font = t1), tickfont = t2),
 57 |                         camera = list(eye = list(x = -0.8, y = 2, z = 1.5))))
 58 | saveWidget(fig, paste0(work_path, example_i, "_day.html"), selfcontained = FALSE, libdir = "tmp")
 59 | 
 60 | ### BACKUP ###
 61 | 
 62 | celltype_list = c("Telencephalon",
 63 |                   "Dorsal telencephalon",
 64 |                   "Hypothalamus",
 65 |                   "Diencephalon",
 66 |                   "Midbrain",
 67 |                   "Hypothalamus (Sim1+)",
 68 |                   "Anterior floor plate",
 69 |                   "Midbrain-hindbrain boundary",
 70 |                   "Anterior roof plate",
 71 |                   "Hindbrain",
 72 |                   "Floorplate and p3 domain",
 73 |                   "Spinal cord/r7/r8",
 74 |                   "Posterior roof plate")
 75 | 
 76 | day_list = c("E8.5", "E8.75", "E9.0", "E9.25", "E9.5", "E9.75", "E10.0", "E10.25", 
 77 |              "E10.5", "E10.75", "E11.0", "E11.25", "E11.5", "E11.75", "E12.0", 
 78 |              "E12.25", "E12.5", "E12.75")
 79 | 
 80 | emb = as.matrix(pd[,c("UMAP_1","UMAP_2","UMAP_3")])
 81 | 
 82 | ### Mean pairwise distance, in 3D UMAP space, among the cells of day `day_i` selected by the logical vector `keep`.
 83 | ### At most 10,000 cells are used. Only the mean is returned: the full distance matrix of 10,000 cells
 84 | ### holds 1e8 values, and nothing below needs more than its mean.
 85 | mean_pairwise_dist = function(day_i, keep){
 86 |     emb_x = emb[pd$day == day_i & keep, , drop = FALSE]   # drop = FALSE: stays a matrix even when a single cell matches
 87 |     if(nrow(emb_x) > 10000){
 88 |         emb_x = emb_x[sample(seq_len(nrow(emb_x)), 10000),]
 89 |     }
 90 |     mean(rdist(emb_x))
 91 | }
 92 | 
 93 | is_patterned = pd$celltype_update %in% celltype_list
 94 | dist_1 = list()   # per day: cells whose cell type is in celltype_list
 95 | for(day_i in day_list){
 96 |     print(day_i)
 97 |     dist_1[[day_i]] = mean_pairwise_dist(day_i, is_patterned)
 98 | }
 99 | 
100 | dist_2 = list()   # per day: all other cells in pd
101 | for(day_i in day_list){
102 |     print(day_i)
103 |     dist_2[[day_i]] = mean_pairwise_dist(day_i, !is_patterned)
104 | }
105 | 
106 | df = data.frame(day = rep(day_list, each = 2),   # two rows per day: patterned first, then derived
107 |                 dist = c(rbind(unlist(dist_1[day_list]), unlist(dist_2[day_list]))),
108 |                 group = rep(c("patterned_neuroectoderm", "derived_cell_types"), times = length(day_list)),
109 |                 stringsAsFactors = FALSE)
110 | 
111 | ### Levels are reversed so that, after coord_flip(), the first day of day_list is at the top of the axis.
112 | df$day = factor(df$day, levels = rev(day_list))
113 | p = df %>%
114 |     ggplot(aes(x=day, y=dist, color=group, group=group)) +
115 |     geom_line() +
116 |     geom_point() +
117 |     scale_color_brewer(palette = "Set1") +
118 |     theme_classic(base_size = 10) +
119 |     theme(legend.position="none") +
120 |     coord_flip()
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 
154 | 


--------------------------------------------------------------------------------
/Section_5_neuroectoderm/step2_Early_neurons.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##################################
  3 | ### Section - 5, Neuroectoderm ###
  4 | ##################################
  5 | 
  6 | #################################################
  7 | ### 2D UMAP of subclustering on early neurons ###
  8 | #################################################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | example_i = "Neurons"; print(example_i)
 15 | 
 16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
 17 | 
 18 | ### Fig. 5e
 19 | ### Build the plot first, then save it explicitly: chaining `+ ggsave()` errors with ggplot2 >= 3.3.4 and leaves `p` unassigned.
 20 | p = pd %>%
 21 |     ggplot() +
 22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +   # larger uncolored points drawn underneath
 23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = neurons_sub_clustering), size=0.3) +
 24 |     theme_void() +
 25 |     scale_color_manual(values=neuroectoderm_color_plate) +
 26 |     theme(legend.position="none")
 27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300)
 28 | 
 29 | ### Extended Data Fig. 10c
 30 | ### Same embedding colored by day (levels ordered as in day_color_plate, limited to days present in pd); saved explicitly because `+ ggsave()` errors with ggplot2 >= 3.3.4.
 31 | day_list = names(day_color_plate)
 32 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day])
 33 | p = pd %>%
 34 |     ggplot() +
 35 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
 36 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.3) +
 37 |     theme_void() +
 38 |     scale_color_manual(values=neuron_day_color_plate) +
 39 |     theme(legend.position="none")
 40 | ggsave(paste0(work_path, example_i, ".day.2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300)
 41 | 
 42 | 
 43 | ####################################################
 44 | ### 2D UMAP of Intermediate neuronal progenitors ###
 45 | ####################################################
 46 | 
 47 | source("JAX_help_code.R")
 48 | source("JAX_color_code.R")
 49 | work_path = "./"
 50 | 
 51 | example_i = "INP"; print(example_i)
 52 | 
 53 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
 54 | 
 55 | ### Extended Data Fig. 10a
 56 | ### Each plot is built first and then saved explicitly: chaining `+ ggsave()` errors with ggplot2 >= 3.3.4 and leaves `p` unassigned.
 57 | p = pd %>%
 58 |     ggplot() +
 59 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +   # larger uncolored points drawn underneath
 60 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_update), size=0.3) +
 61 |     theme_void() +
 62 |     scale_color_brewer(palette = "Set2") +
 63 |     theme(legend.position="none")
 64 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300)
 65 | ### Same embedding colored by day; levels are ordered as in day_color_plate and limited to days present in pd.
 66 | day_list = names(day_color_plate)
 67 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day])
 68 | p = pd %>%
 69 |     ggplot() +
 70 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
 71 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.3) +
 72 |     theme_void() +
 73 |     scale_color_manual(values=neuron_day_color_plate) +
 74 |     theme(legend.position="none")
 75 | ggsave(paste0(work_path, example_i, ".day.2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300)
 76 | 
 77 | 
 78 | ####################################################################################################
 79 | ### Composition of embryos from each 6-hr bin by intermediate neuronal progenitor and CNS neuron ###
 80 | ####################################################################################################
 81 | 
 82 | source("JAX_help_code.R")
 83 | source("JAX_color_code.R")
 84 | work_path = "./"
 85 | 
 86 | pd = readRDS(paste0(work_path, "df_cell.rds"))
 87 | day_list = names(day_color_plate)
 88 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day])
 89 | ### cells per day over the whole table: the denominator of both percentages below
 90 | cells_per_day = pd %>% group_by(day) %>% tally() %>% rename(total_n = n)
 91 | 
 92 | pd_1 = pd[pd$major_trajectory == "CNS_neurons",]
 93 | pd_2 = pd[pd$major_trajectory == "Intermediate_neuronal_progenitors",]
 94 | 
 95 | x1 = pd_1 %>% group_by(day) %>% tally() %>% left_join(cells_per_day, by = "day")
 96 | x1$frac = 100*x1$n/x1$total_n
 97 | x2 = pd_2 %>% group_by(day) %>% tally() %>% left_join(cells_per_day, by = "day")
 98 | x2$frac = 100*x2$n/x2$total_n
 99 | 
100 | ### one row per day observed among CNS neurons; a day without progenitors gets 0 for them
101 | x = left_join(x1 %>% select(day, direct_frac = frac), x2 %>% select(day, indirect_frac = frac), by = "day")
102 | x$indirect_frac[is.na(x$indirect_frac)] = 0
103 | x = data.frame(day = rep(x$day, times = 2),   # long format: CNS-neuron rows first, progenitor rows second
104 |                frac = c(x$direct_frac, x$indirect_frac),
105 |                major_trajectory = rep(c("CNS_neurons","Intermediate_neuronal_progenitors"), each = nrow(x)))
106 | 
107 | ### Fig. 4d
108 | 
109 | p = x %>% 
110 |     ggplot(aes(x=day, y=frac, fill = day)) + 
111 |     geom_bar(stat='identity') + facet_grid(rows = vars(major_trajectory)) + 
112 |     labs(x='',y='% of cells') +
113 |     scale_fill_manual(values=day_color_plate) +
114 |     theme_classic(base_size = 10) +
115 |     theme(legend.position="none") +
116 |     theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black"))
117 | 
118 | 
119 | #############################################################
120 | ### For each interneuron, what are the top expressed TFs? ### 
121 | #############################################################
122 | 
123 | source("JAX_help_code.R")
124 | source("JAX_color_code.R")
125 | work_path = "./"
126 | 
127 | dat = readRDS(paste0(work_path, "Neurons_heatmap_dat.rds"))
128 | ### 120-step palette interpolated from the reversed 11-color "Spectral" ramp; the heatmap goes to an 8 x 5 PDF.
129 | Colors=rev(brewer.pal(11,"Spectral"))
130 | Colors=colorRampPalette(Colors)(120)
131 | pdf(paste0(work_path, "Neurons_heatmap.pdf"), 8, 5)
132 | heatmap.2(as.matrix(t(dat)), 
133 |           col=Colors, 
134 |           scale="column",      # spelled out instead of relying on partial matching of "col"
135 |           Rowv = FALSE,        # TRUE/FALSE instead of T/F, which are ordinary variables that can be reassigned
136 |           Colv = FALSE, 
137 |           key=TRUE, 
138 |           density.info="none", 
139 |           trace="none", 
140 |           cexRow = 1, 
141 |           cexCol = 1,
142 |           margins = c(5,5))
143 | dev.off()
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/Section_5_neuroectoderm/step5_Astrocytes.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##################################
  3 | ### Section - 5, Neuroectoderm ###
  4 | ##################################
  5 | 
  6 | ##############################################
  7 | ### Analyzing astrocytes from stages < E13 ###
  8 | ##############################################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | example_i = "Astrocytes"; print(example_i)
 15 | 
 16 | pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
 17 | 
 18 | ### Extended Data Fig. 10g
 19 | ### Build the plot first, then save it explicitly: chaining `+ ggsave()` errors with ggplot2 >= 3.3.4 and leaves `p` unassigned.
 20 | p = pd %>%
 21 |     ggplot() +
 22 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +   # larger uncolored points drawn underneath
 23 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = celltype_sub_clustering), size=0.3) +
 24 |     theme_void() +
 25 |     scale_color_manual(values=astrocytes_color_plate) +
 26 |     theme(legend.position="none")
 27 | ggsave(paste0(work_path, example_i, ".2D_UMAP.png"), plot = p, width = 6, height = 6, dpi = 300)
 28 | 
 29 | 
 30 | 
 31 | ##############################################################
 32 | ### Compositions changing over time for different subtypes ###
 33 | ##############################################################
 34 | 
 35 | source("JAX_help_code.R")
 36 | source("JAX_color_code.R")
 37 | work_path = "./"
 38 | 
 39 | example_i = "Astrocytes"; print(example_i)
 40 | 
 41 | pd = readRDS(paste0(work_path, "df_cell.rds"))   # table of all cells: supplies the per-day totals
 42 | pd_sub = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))   # astrocyte table carrying celltype_sub_clustering
 43 | ### Order the days as in day_color_plate, keeping only the days present in pd.
 44 | day_list = names(day_color_plate)
 45 | pd$day = factor(pd$day, levels = day_list[day_list %in% pd$day])
 46 | pd_1 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "VA1 astrocytes"],]
 47 | pd_2 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "VA2 astrocytes"],]
 48 | pd_3 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "VA3 astrocytes"],]
 49 | pd_4 = pd[pd$cell_id %in% pd_sub$cell_id[pd_sub$celltype_sub_clustering == "Anterior astrocytes"],]
 50 | ### For each subtype: cells per day as a percentage of ALL cells in pd from that day.
 51 | x1 = pd_1 %>% group_by(day) %>% tally() %>%
 52 |     left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>%
 53 |     mutate(frac = 100*n/total_n)
 54 | 
 55 | x2 = pd_2 %>% group_by(day) %>% tally() %>%
 56 |     left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>%
 57 |     mutate(frac = 100*n/total_n)
 58 | 
 59 | x3 = pd_3 %>% group_by(day) %>% tally() %>%
 60 |     left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>%
 61 |     mutate(frac = 100*n/total_n)
 62 | 
 63 | x4 = pd_4 %>% group_by(day) %>% tally() %>%
 64 |     left_join(pd %>% group_by(day) %>% tally() %>% rename(total_n = n), by = "day") %>%
 65 |     mutate(frac = 100*n/total_n)
 66 | ### The joins start from the VA2 table. NOTE(review): a day with VA1/VA3/AA cells but no VA2 cells is dropped by these left joins - confirm this is intended.
 67 | x = x2 %>% select(day, frac) %>% rename(VA2 = frac) %>% 
 68 |     left_join(x1 %>% select(day, frac) %>% rename(VA1 = frac), by = "day") %>% 
 69 |     left_join(x3 %>% select(day, frac) %>% rename(VA3 = frac), by = "day") %>% 
 70 |     left_join(x4 %>% select(day, frac) %>% rename(AA = frac), by = "day")
 71 | x$VA1[is.na(x$VA1)] = 0   # a subtype absent on a day counts as 0 %
 72 | x$VA2[is.na(x$VA2)] = 0   # VA2 comes from the left table of the joins, so this is presumably a no-op
 73 | x$VA3[is.na(x$VA3)] = 0
 74 | x$AA[is.na(x$AA)] = 0
 75 | x = data.frame(day = rep(x$day, 4),   # long format: one block of rows per subtype, in the order VA1, VA2, VA3, AA
 76 |                frac = c(x$VA1, x$VA2, x$VA3, x$AA),
 77 |                major_trajectory = rep(c("VA1","VA2","VA3","AA"), each = nrow(x)))
 78 | 
 79 | x$major_trajectory = factor(x$major_trajectory, levels = c("VA1","VA2","VA3","AA"))
 80 | 
 81 | ### Extended Data Fig. 10h
 82 | 
 83 | p = x %>% 
 84 |     ggplot(aes(x=day, y=frac, fill = day)) + 
 85 |     geom_bar(stat='identity') + facet_grid(rows = vars(major_trajectory)) + 
 86 |     labs(x='',y='% of cells') +
 87 |     scale_fill_manual(values=day_color_plate_2) +
 88 |     theme_classic(base_size = 10) +
 89 |     theme(legend.position="none") +
 90 |     theme(axis.text.x = element_text(color="black", angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(color="black"))
 91 | 
 92 | 
 93 | 
 94 | ###############################################################################
 95 | ### Mapping different subtypes of astrocytes to their potential progenitors ###
 96 | ###############################################################################
 97 | 
 98 | source("JAX_help_code.R")
 99 | source("JAX_color_code.R")
100 | work_path = "./"
101 | 
102 | example_i = "Neuroectoderm_derivative"
103 | name = "Astrocytes"
104 | 
105 | pd_sub = readRDS(paste0(work_path, name, "_adata_scale.obs.rds"))
106 | 
107 | ### this result was calculated by step4_Mapping_neuroectoderm_derivatives.R
108 | dat = readRDS(paste0(work_path, example_i, ".MNN_pairs.rds"))
109 | 
110 | pd_back = readRDS(paste0(work_path, "Neuroectoderm_backbone_adata_scale.obs.rds"))
111 | rownames(pd_back) = as.vector(pd_back$cell_id)
112 | 
113 | celltype_sub_clustering_list = paste0("VA", c(1:3), " astrocytes")
114 | 
115 | ### Extended Data Fig. 10j
116 | ### For each subtype: count, per backbone cell (B), its MNN pairs with cells (A) of that subtype, and color the backbone 2D UMAP by that count.
117 | for(i in celltype_sub_clustering_list){
118 |     print(i)
119 |     pd_sub_i = pd_sub %>% filter(celltype_sub_clustering == i) %>% pull(cell_id)
120 |     dat_sub = dat %>% filter(A %in% pd_sub_i) %>% group_by(B) %>% tally() %>% rename(cell_id = B, freq = n)
121 |     df = pd_back %>% select(UMAP_1 = UMAP_2d_1, UMAP_2 = UMAP_2d_2, cell_id, day) %>% left_join(dat_sub, by = "cell_id") 
122 |     df$freq[is.na(df$freq)] = 0
123 |     name_i = gsub(" ", "_", gsub("/", "_", i))   # file-name-safe version of the subtype label
124 |     ### grey background: random subset of at most 100,000 backbone cells (all of them if the table is smaller)
125 |     df_bg = df[sample(seq_len(nrow(df)), min(nrow(df), 100000)),]
126 |     ### Still best-effort per subtype, but a failure is now reported instead of hidden; `+ ggsave()` is replaced because it errors with ggplot2 >= 3.3.4.
127 |     tryCatch({
128 |         p = ggplot() +
129 |             geom_point(data = df_bg, aes(x = UMAP_1, y = UMAP_2), size=0.5, color = "grey80") +
130 |             geom_point(data = df[df$freq != 0,], aes(x = UMAP_1, y = UMAP_2, color = freq), size=0.5) +
131 |             theme_void() +
132 |             scale_color_viridis() +
133 |             theme(legend.position="none")
134 |         ggsave(paste0(work_path, name_i, ".png"), plot = p, width = 8, height = 6, dpi = 300)
135 |     }, error = function(e) warning("Plot for ", i, " failed: ", conditionMessage(e), call. = FALSE))
136 | }
137 | 
138 | 
139 | 


--------------------------------------------------------------------------------
/Section_6_development_tree/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/Section_6_development_tree/.DS_Store


--------------------------------------------------------------------------------
/Section_6_development_tree/Dimension_reduction_subsystem.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ##########################################################################################
  3 | ### First, we manually split all the cell types from organogenesis & fetal development into 12 systems,
  4 | ### to perform dimension reduction using Scanpy, followed by identifying the kNNs across cells using annoy in Python
  5 | ##########################################################################################
  6 | 
  7 | 
  8 | import scanpy as sc
  9 | import pandas as pd
 10 | import numpy as np
 11 | import os, sys
 12 | import time
 13 | import gc
 14 | from annoy import AnnoyIndex
 15 | 
 16 | start_time = time.time()
 17 | 
 18 | WORK_PATH = './'
 19 | 
 20 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))  # load the four dataset files found in WORK_PATH
 21 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
 22 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
 23 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
 24 | # Stack the four datasets into a single AnnData, then drop the per-dataset objects to release memory.
 25 | adata_orig = adata_1.concatenate(adata_2, adata_3, adata_4)
 26 | del adata_1, adata_2, adata_3, adata_4
 27 | gc.collect()
 28 | 
 29 | ##############################################
 30 | ### Of note, please read df_cell_graph.rds and then write it to df_cell_graph.csv in R
 31 | 
 32 | ### >>> dat = readRDS("df_cell_graph.rds")
 33 | ### >>> rownames(dat) = as.vector(dat$cell_id)
 34 | ### >>> write.csv(dat, "df_cell_graph.csv")
 35 | # Replace the per-cell table wholesale; NOTE(review): presumably the CSV rows are in the same order as the concatenated cells - confirm.
 36 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_graph.csv'), index_col = 0)
 37 | adata_orig.obs = pdata
 38 | 
 39 | system_list = ["Endothelium",
 40 | "Epithelial_cells",
 41 | "Eye",
 42 | "Gut",
 43 | "Notochord",
 44 | "PNS_glia",
 45 | "PNS_neurons",
 46 | "Renal",
 47 | "Lateral_plate_mesoderm",
 48 | "Blood",
 49 | "Brain_spinal_cord",
 50 | "Mesoderm"]
 51 | 
 52 | for system_i in trajectory_list:
 53 | 
 54 |     print("Processing: %s"%system_i)
 55 | 
 56 |     adata = adata_orig[adata_orig.obs["system"] == system_i]
 57 |     print(adata.shape)
 58 | 
 59 |     sc.pp.normalize_total(adata, target_sum=1e4)
 60 |     sc.pp.log1p(adata)
 61 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 62 |     adata = adata[:, adata.var.highly_variable]
 63 |     sc.pp.scale(adata, max_value=10)
 64 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 65 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 66 | 
 67 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
 68 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 69 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 70 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 71 | 
 72 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
 73 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 74 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 75 | 
 76 |     sc.tl.leiden(adata, resolution=1, n_iterations=2)
 77 |     adata.obs['leiden_res_1'] = adata.obs['leiden']
 78 | 
 79 |     sc.tl.leiden(adata, resolution=2, n_iterations=2)
 80 |     adata.obs['leiden_res_2'] = adata.obs['leiden']
 81 | 
 82 |     sc.tl.leiden(adata, resolution=5, n_iterations=2)
 83 |     adata.obs['leiden_res_5'] = adata.obs['leiden']
 84 | 
 85 |     sc.tl.leiden(adata, resolution=10, n_iterations=2)
 86 |     adata.obs['leiden_res_10'] = adata.obs['leiden']
 87 | 
 88 |     adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%system_i))
 89 | 
 90 |     adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%system_i), compression="gzip")
 91 | 
 92 |     X = adata.obsm['X_pca']
 93 |     print(X.shape)
 94 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), X, delimiter=",", fmt='%1.3f')
 95 | 
 96 |     ### calculating kNN using annoy, this is much faster than using R
 97 | 
 98 |     dist_metric = 'euclidean'
 99 |     k = 15
100 |     ### Here, why we use 15? because the log2(mean cell number across cell types) is around 15.
101 | 
102 |     npc = X.shape[1]
103 |     ncell = X.shape[0]
104 |     annoy_index = AnnoyIndex(npc, metric=dist_metric)
105 | 
106 |     for i in range(ncell):
107 |         annoy_index.add_item(i, list(X[i,:]))
108 |     annoy_index.build(15) ### bigger number will make the result more accurate
109 | 
110 |     knn = []
111 |     for iCell in range(ncell):
112 |         knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
113 |     knn = np.array(knn, dtype=int)
114 | 
115 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%system_i), knn, delimiter=",", fmt='%s')
116 | 
117 | 
118 | 
119 | ######################################################
120 | ### Second, we found the neuroectoderm is too complex, so we subset the patterned neuroectoderm cells to perform embedding
121 | 
122 | 
123 | import scanpy as sc
124 | import pandas as pd
125 | import numpy as np
126 | import os, sys
127 | import time
128 | import gc
129 | from annoy import AnnoyIndex
130 | 
131 | start_time = time.time()
132 | 
133 | WORK_PATH = './'
134 | 
135 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))  # load the four dataset files found in WORK_PATH
136 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
137 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
138 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
139 | # Stack the four datasets into a single AnnData, then drop the per-dataset objects to release memory.
140 | adata_orig = adata_1.concatenate(adata_2, adata_3, adata_4)
141 | del adata_1, adata_2, adata_3, adata_4
142 | gc.collect()
143 | 
144 | ##############################################
145 | ### Of note, please read df_cell_graph.rds and then write it to df_cell_graph.csv in R
146 | 
147 | ### >>> dat = readRDS("df_cell_graph.rds")
148 | ### >>> rownames(dat) = as.vector(dat$cell_id)
149 | ### >>> write.csv(dat, "df_cell_graph.csv")
150 | # Replace the per-cell table wholesale; NOTE(review): presumably the CSV rows are in the same order as the concatenated cells - confirm.
151 | pdata = pd.read_csv(os.path.join(WORK_PATH, 'df_cell_graph.csv'), index_col = 0)
152 | adata_orig.obs = pdata
153 | 
154 | patterned_neuroectoderm = ["Anterior floor plate",
155 | "Diencephalon",
156 | "Floorplate and p3 domain",
157 | "Hypothalamus",
158 | "Midbrain",
159 | "Posterior roof plate",
160 | "Telencephalon",
161 | "Anterior roof plate",
162 | "Dorsal telencephalon",
163 | "Hindbrain",
164 | "Hypothalamus (Sim1+)",
165 | "Midbrain-hindbrain boundary",
166 | "Spinal cord/r7/r8"]
167 | 
168 | system_i = "Neuroectoderm"
169 | print("Processing: %s"%system_i)
170 | 
171 | adata = adata_orig[adata_orig.obs["celltype_update"].isin(patterned_neuroectoderm)]
172 | print(adata.shape)
173 | 
174 | sc.pp.normalize_total(adata, target_sum=1e4)
175 | sc.pp.log1p(adata)
176 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
177 | adata = adata[:, adata.var.highly_variable]
178 | sc.pp.scale(adata, max_value=10)
179 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
180 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
181 | 
182 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
183 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
184 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
185 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
186 | 
187 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
188 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
189 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
190 | 
191 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
192 | adata.obs['leiden_res_1'] = adata.obs['leiden']
193 | 
194 | sc.tl.leiden(adata, resolution=2, n_iterations=2)
195 | adata.obs['leiden_res_2'] = adata.obs['leiden']
196 | 
197 | sc.tl.leiden(adata, resolution=5, n_iterations=2)
198 | adata.obs['leiden_res_5'] = adata.obs['leiden']
199 | 
200 | sc.tl.leiden(adata, resolution=10, n_iterations=2)
201 | adata.obs['leiden_res_10'] = adata.obs['leiden']
202 | 
203 | adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%system_i))
204 | 
205 | adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%system_i), compression="gzip")
206 | 
207 | X = adata.obsm['X_pca']
208 | print(X.shape)
209 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), X, delimiter=",", fmt='%1.3f')
210 | 
211 | ### calculating kNN using annoy, this is much faster than using R
212 | 
213 | dist_metric = 'euclidean'
214 | k = 15
215 | ### Why k = 15? Because log2(mean cell number across cell types) is around 15.
216 | 
217 | npc = X.shape[1]
218 | ncell = X.shape[0]
219 | annoy_index = AnnoyIndex(npc, metric=dist_metric)
220 | 
221 | for i in range(ncell):
222 |     annoy_index.add_item(i, list(X[i,:]))
223 | annoy_index.build(15) ### bigger number will make the result more accurate
224 | 
225 | knn = []
226 | for iCell in range(ncell):
227 |     knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
228 | knn = np.array(knn, dtype=int)
229 | 
230 | np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%system_i), knn, delimiter=",", fmt='%s')
231 | 
232 | 
233 | 


--------------------------------------------------------------------------------
/Section_6_development_tree/Graph_robust.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ################################################################################
  3 | ### Second, to assess the robustness of MNNs to cell sampling, we randomly subsampled 80% 
  4 | ### of cells from each developmental system during organogenesis & fetal development
  5 | 
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | import numpy as np
  9 | import os, sys
 10 | import time
 11 | import gc
 12 | from annoy import AnnoyIndex
 13 | 
 14 | system_list = ["Endothelium",
 15 | "Epithelial_cells",
 16 | "Eye",
 17 | "Gut",
 18 | "Notochord",
 19 | "PNS_glia",
 20 | "PNS_neurons",
 21 | "Renal",
 22 | "Lateral_plate_mesoderm",
 23 | "Blood",
 24 | "Brain_spinal_cord",
 25 | "Mesoderm",
 26 | "Neuroectoderm"]
 27 | 
 28 | WORK_PATH = "./"
 29 | 
 30 | for system_i in system_list:
 31 | 
 32 |     ### PC features were calculated by Dimension_reduction_subsystem.py
 33 | 
 34 |     X = pd.read_csv(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), index_col = False, header=None)
 35 |     X = pd.DataFrame.to_numpy(X)
 36 | 
 37 |     original_size = X.shape[0]
 38 |     subset_size = int(original_size * 0.8)
 39 |     npc = X.shape[1]
 40 | 
 41 |     dist_metric = 'euclidean'
 42 |     k = 15
 43 |     ### Why k = 15? Because log2(mean cell number across cell types) is around 15.
 44 | 
 45 |     for cnt in range(100):
 46 |         
 47 |         idx = np.random.choice(original_size, size=subset_size, replace=False)
 48 |         X_sub = X[idx,:]
 49 | 
 50 |         ncell = X_sub.shape[0]
 51 |         annoy_index = AnnoyIndex(npc, metric=dist_metric)
 52 | 
 53 |         for i in range(ncell):
 54 |             annoy_index.add_item(i, list(X_sub[i,:]))
 55 |         annoy_index.build(15) ### bigger number will make the result more accurate
 56 | 
 57 |         knn = []
 58 |         for iCell in range(ncell):
 59 |             knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
 60 |         knn = np.array(knn, dtype=int)
 61 | 
 62 |         np.savetxt(os.path.join(WORK_PATH, '%s_knn_%s.csv'%(system_i, str(cnt+1))), knn, delimiter=",", fmt='%s')
 63 |         np.savetxt(os.path.join(WORK_PATH, '%s_idx_%s.csv'%(system_i, str(cnt+1))), idx, delimiter=",", fmt='%s')
 64 | 
 65 | 
 66 | 
 67 | 
 68 | ##################################################################################
 69 | ### Third, to determine the effect of k parameter choice on the MNNs identified 
 70 | ### between cell types, we examined different k values (k = 5, 10, 20, 30, 40, 50)
 71 | ##################################################################################
 72 | 
 73 | 
 74 | import scanpy as sc
 75 | import pandas as pd
 76 | import numpy as np
 77 | import os, sys
 78 | import time
 79 | import gc
 80 | from annoy import AnnoyIndex
 81 | 
 82 | system_list = ["Endothelium",
 83 | "Epithelial_cells",
 84 | "Eye",
 85 | "Gut",
 86 | "Notochord",
 87 | "PNS_glia",
 88 | "PNS_neurons",
 89 | "Renal",
 90 | "Lateral_plate_mesoderm",
 91 | "Blood",
 92 | "Brain_spinal_cord",
 93 | "Mesoderm",
 94 | "Neuroectoderm"]
 95 | 
 96 | WORK_PATH = "./"
 97 | 
 98 | for system_i in system_list:
 99 | 
100 |     ### PC features were calculated by Dimension_reduction_subsystem.py
101 | 
102 |     X = pd.read_csv(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%system_i), index_col = False, header=None)
103 |     X = pd.DataFrame.to_numpy(X)
104 | 
105 |     ncell = X.shape[0]
106 |     npc = X.shape[1]
107 |     dist_metric = 'euclidean'
108 | 
109 |     annoy_index = AnnoyIndex(npc, metric=dist_metric)
110 | 
111 |     for i in range(ncell):
112 |         annoy_index.add_item(i, list(X[i,:]))
113 |     annoy_index.build(15) ### bigger number will make the result more accurate
114 | 
115 |     for k in [5,10,20,30,40,50]: ### every k queries the single annoy index built above
116 |         ### NOTE(review): '%s_knn_%s.csv' is also the file pattern written by the subsampling loop above (suffix cnt+1 = 1..100); with the same WORK_PATH the files for 5,10,20,30,40,50 are overwritten here - confirm against the downstream R script
117 |         knn = []
118 |         for iCell in range(ncell):
119 |             knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:]) ### request k+1 hits and drop the first one, presumably the query cell itself
120 |         knn = np.array(knn, dtype=int)
121 | 
122 |         np.savetxt(os.path.join(WORK_PATH, '%s_knn_%s.csv'%(system_i, str(k))), knn, delimiter=",", fmt='%s')
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 


--------------------------------------------------------------------------------
/Section_6_development_tree/Two_examples.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import scanpy as sc
 3 | import pandas as pd
 4 | import numpy as np
 5 | import os
 6 | import sys
 7 | from annoy import AnnoyIndex
 8 | 
 9 | WORK_PATH = './'
10 | 
11 | example_list = ["suppressor_cells", "lung"]
12 | 
13 | for example_i in example_list:
14 |     print(example_i)
15 | 
16 |     adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_i))
17 |     fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0)
18 |     pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_i), index_col = 0)
19 |     adata.obs = pdata
20 |     adata.var = fdata
21 | 
22 |     print(adata.shape)
23 | 
24 |     sc.pp.normalize_total(adata, target_sum=1e4)
25 |     sc.pp.log1p(adata)
26 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
27 |     adata = adata[:, adata.var.highly_variable]
28 |     sc.pp.scale(adata, max_value=10)
29 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
30 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
31 | 
32 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
33 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
34 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
35 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
36 | 
37 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
38 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
39 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
40 | 
41 |     adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_i))
42 | 
43 |     adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_i), compression="gzip")
44 | 
45 |     X = adata.obsm['X_pca']
46 |     print(X.shape)
47 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_i), X, delimiter=",", fmt='%1.3f')
48 | 
49 |     dist_metric = 'euclidean'
50 |     k = 15
51 |     ### Why k = 15? Because log2(mean cell number across cell types) is around 15.
52 | 
53 |     npc = X.shape[1]
54 |     ncell = X.shape[0]
55 |     annoy_index = AnnoyIndex(npc, metric=dist_metric)
56 | 
57 |     for i in range(ncell):
58 |         annoy_index.add_item(i, list(X[i,:]))
59 |     annoy_index.build(15) ### bigger number will make the result more accurate
60 | 
61 |     knn = []
62 |     for iCell in range(ncell):
63 |         knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
64 |     knn = np.array(knn, dtype=int)
65 | 
66 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%example_i), knn, delimiter=",", fmt='%s')
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 


--------------------------------------------------------------------------------
/Section_6_development_tree/step2_Late_stage_graph.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #####################################
  3 | ### Section - 6, Development tree ###
  4 | #####################################
  5 | 
  6 | #########################################################################################################
  7 | ### First, we manually split all the cell types, from the organogenesis & fetal development, into 12 systems, 
  8 | ### to perform dimension reduction using Scanpy, followed by identifying the kNNs across cells using annoy in Python
  9 | 
 10 | ### For Brain_spinal_cord, we further split patterned neuroectoderm ("Neuroectoderm") to perform embedding.
 11 | 
 12 | ### Python Dimension_reduction_subsystem.py
 13 | 
 14 | source("JAX_help_code.R")
 15 | source("JAX_color_code.R")
 16 | work_path = "./"
 17 | 
 18 | pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
 19 | rownames(pd_all) = as.vector(pd_all$cell_id)
 20 | 
 21 | nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t")
 22 | 
 23 | system_list = c("Endothelium",                    
 24 |                 "Epithelial_cells",               
 25 |                 "Eye",                            
 26 |                 "Gut",                            
 27 |                 "Notochord",                      
 28 |                 "PNS_glia",                       
 29 |                 "PNS_neurons",                    
 30 |                 "Renal",                          
 31 |                 "Lateral_plate_mesoderm",         
 32 |                 "Blood",     
 33 |                 "Neuroectoderm",
 34 |                 "Brain_spinal_cord",              
 35 |                 "Mesoderm")      
 36 | 
 37 | patterned_neuroectoderm = c("Anterior floor plate",
 38 |                             "Diencephalon",
 39 |                             "Floorplate and p3 domain",
 40 |                             "Hypothalamus",
 41 |                             "Midbrain",
 42 |                             "Posterior roof plate",
 43 |                             "Telencephalon",
 44 |                             "Anterior roof plate",
 45 |                             "Dorsal telencephalon",
 46 |                             "Hindbrain",
 47 |                             "Hypothalamus (Sim1+)",
 48 |                             "Midbrain-hindbrain boundary",
 49 |                             "Spinal cord/r7/r8")
 50 | 
 51 | for(kk in 1:length(system_list)){
 52 |     
 53 |     system_i = system_list[kk]
 54 |     print(system_i)
 55 |     
 56 |     ### After running "Python Dimension_reduction_subsystem.py", you will get these files
 57 |     pd = read.csv(paste0(work_path, system_i, "_adata_scale.obs.csv"), as.is=T, row.names = 1)
 58 |     rownames(pd) = as.vector(pd$cell_id)
 59 |     pd = pd %>% left_join(pd_all[,c("cell_id","celltype_new","system","meta_group")], by = "cell_id") %>% as.data.frame()
 60 |     
 61 |     if(system_i == "Neuroectoderm"){
 62 |         pd$system = "Neuroectoderm"
 63 |     }
 64 |     
 65 |     ### create or read MNN pairs between individual cells
 66 |     
 67 |     nn_matrix = read.csv(paste0(work_path, system_i, "_adata_scale.kNN_15.csv"), as.is=T, header=F)
 68 |     nn_matrix = as.matrix(nn_matrix)
 69 |     nn_matrix = nn_matrix + 1 ### python and R using different start index
 70 |     
 71 |     ### extracting MNN pairs
 72 |     ### only retaining those edges which are considered twice (A -> B, B -> A)
 73 |     
 74 |     x = data.frame(i = rep(1:nrow(nn_matrix), ncol(nn_matrix)),
 75 |                    j = c(nn_matrix), stringsAsFactors = FALSE)
 76 |     
 77 |     dat = Matrix::sparseMatrix(i = as.numeric(as.vector(x$i)),
 78 |                                j = as.numeric(as.vector(x$j)),
 79 |                                x = 1)
 80 |     
 81 |     dat_t = t(dat) + dat
 82 |     x = data.frame(summary(dat_t))
 83 |     x = x[x$x == 2 & x$i > x$j,]
 84 |     x$x = NULL   
 85 |     
 86 |     ### x saves the MNN pairs
 87 |     saveRDS(x, paste0(work_path, system_i, ".MNN.rds"))
 88 |     
 89 |     y = data.frame(i = 1:nrow(pd),
 90 |                    j = 1:nrow(pd),
 91 |                    meta_group = as.vector(pd$meta_group), stringsAsFactors = FALSE)
 92 |     
 93 |     dat = x %>% left_join(y %>% select(i, meta_group), by = "i") %>%
 94 |         left_join(y %>% select(j, meta_group), by = "j") %>%
 95 |         group_by(meta_group.x, meta_group.y) %>% tally() 
 96 |     
 97 |     obs = dcast(dat, meta_group.x~meta_group.y)
 98 |     rownames(obs) = as.vector(obs[,1])
 99 |     obs = obs[,-1]
100 |     obs[is.na(obs)] = 0
101 |     obs = as.matrix(obs)
102 |     
103 |     diag(obs) = 0
104 |     
105 |     obs_x = obs + t(obs)
106 |     obs_y = as.vector(obs_x[upper.tri(obs_x)])
107 |     
108 |     ### list every unordered pair of meta groups in the column-major order of
109 |     ### upper.tri(), i.e. the order of obs_y, without rbind() in a nested loop
110 |     ### (2:nrow(obs_x) would also count backwards when there is a single group)
111 |     pair_idx = which(upper.tri(obs_x), arr.ind = TRUE)
112 |     group = data.frame(system = rep(system_i, nrow(pair_idx)),
113 |                        x = colnames(obs_x)[pair_idx[, "row"]],
114 |                        y = rownames(obs_x)[pair_idx[, "col"]],
115 |                        stringsAsFactors = FALSE)
116 |     
117 |     ### one edge count per pair, aligned with the rows of group
118 |     group$edge_num = obs_y
119 |     
120 |     group = group %>% 
121 |         left_join(nodes %>% rename(x = meta_group) %>% select(x, celltype_new, celltype_num), by = "x") %>% rename(x_name = celltype_new, x_size = celltype_num) %>%
122 |         left_join(nodes %>% rename(y = meta_group) %>% select(y, celltype_new, celltype_num), by = "y") %>% rename(y_name = celltype_new, y_size = celltype_num)
123 |     
124 |     group$min_size = if_else(group$x_size < group$y_size, group$x_size, group$y_size)
125 |     group$edge_num_norm = group$edge_num/log2(15*group$min_size)
126 |     group$min_size = NULL
127 |     
128 |     saveRDS(group, paste0(work_path, system_i, ".edges_new.rds"))
129 |     
130 |     ### output MNN pairs for manually reviewing
131 |     
132 |     edges = group
133 |     
134 |     edges_2 = edges
135 |     edges_2$x = as.vector(edges$y); edges_2$y = as.vector(edges$x)
136 |     edges_2$x_size = as.vector(edges$y_size); edges_2$y_size = as.vector(edges$x_size)
137 |     edges_2$x_name = as.vector(edges$y_name); edges_2$y_name = as.vector(edges$x_name)
138 |     
139 |     dat = rbind(edges, edges_2) %>% as.data.frame() %>%
140 |         filter(edge_num != 0) %>% rename(MNN_pairs = edge_num, MNN_pairs_normalized = edge_num_norm) %>%
141 |         group_by(x) %>% arrange(desc(MNN_pairs), .by_group = T) %>%
142 |         as.data.frame()
143 |     
144 |     if(system_i == "Brain_spinal_cord"){
145 |         tmp = read.table(paste0(work_path, "Neuroectoderm", ".MNN_pairs.txt"),header=T,as.is=T,sep="\t")
146 |         dat = dat[!dat$x %in% c(tmp$x, tmp$y) | !dat$y %in% c(tmp$x, tmp$y), ]
147 |     }
148 |     
149 |     write.table(dat, paste0(work_path, system_i, ".MNN_pairs.txt"), row.names=F, sep="\t", quote=F)
150 | 
151 | }
152 | 
153 | 
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------
/Section_6_development_tree/step3_Create_graph.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #####################################
  3 | ### Section - 6, Development tree ###
  4 | #####################################
  5 | 
  6 | ###################################
  7 | ### Summary the edges and nodes ###
  8 | ###################################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t")
 15 | 
 16 | ### now we merged edges which have been manually reviewed.
 17 | 
 18 | ### edges_1 includes edges from pre-gastrulation and gastrulation stages
 19 | edges_1 = read.table(paste0(work_path, "edges_1.txt"), header=F, as.is=T, sep="\t")
 20 | 
 21 | ### edges_2 includes edges from organogenesis & fetal development
 22 | edges_2 = read.table(paste0(work_path, "edges_2.txt"), header=F, as.is=T, sep="\t")
 23 | 
 24 | ### edges_3 includes edges which are manually added to connect blood and PNS-neuron
 25 | edges_3 = read.table(paste0(work_path, "edges_3.txt"), header=F, as.is=T, sep="\t")
 26 | 
 27 | edges = rbind(edges_1, edges_2, edges_3)
 28 | names(edges) = c("system", "x", "y", "x_name", "y_name", "edge_type")
 29 | 
 30 | length((unique(c(edges$x, edges$y))))
 31 | 
 32 | write.table(edges, paste0(work_path, "edges.txt"), row.names=F, sep="\t", quote=F)
 33 | 
 34 | ### To better visualize the result, we took out the spatial continuity edges, and also collapsed redundant nodes
 35 | edges_sub = edges[edges$edge_type != "Spatial continuity",]
 36 | length((unique(c(edges_sub$x, edges_sub$y))))
 37 | 
 38 | edges_sub = rbind(edges_sub, edges[edges$x %in% c("BS_M37", "BS_M39") | edges$y %in% c("BS_M37", "BS_M39"),])
 39 | length((unique(c(edges_sub$x, edges_sub$y))))
 40 | 
 41 | write.table(edges_sub, paste0(work_path, "edges_sub.txt"), row.names=F, sep="\t", quote=F)
 42 | 
 43 | ### removing redundant nodes
 44 | edges_sub$x_y = paste0(edges_sub$x, ":", edges_sub$y)
 45 | edges_x_1 = edges_sub[edges_sub$x_name == edges_sub$y_name & edges_sub$system == "Pre_gastrulation",]
 46 | edges_x_2 = edges_sub[edges_sub$x_name == edges_sub$y_name & edges_sub$system == "Gastrulation_E8.5b",]
 47 | 
 48 | edges_x_3 = edges_sub[edges_sub$x %in% as.vector(edges_x_1$y),]
 49 | edges_x_4 = edges_sub[edges_sub$x %in% as.vector(edges_x_2$y),]
 50 | 
 51 | edges_x_3_ = edges_x_3 %>% left_join(edges_x_1 %>% select(x,y) %>% rename(new_x = x, x=y), by = "x")
 52 | edges_x_3$x = as.vector(edges_x_3_$new_x)
 53 | 
 54 | edges_x_4_ = edges_x_4 %>% left_join(edges_x_2 %>% select(x,y) %>% rename(new_x = x, x=y), by = "x")
 55 | edges_x_4$x = as.vector(edges_x_4_$new_x)
 56 | 
 57 | edges_x_5 = edges_sub[!edges_sub$x_y %in% c(edges_x_1$x_y, edges_x_2$x_y, edges_x_3$x_y, edges_x_4$x_y),]
 58 | 
 59 | edges_x = rbind(edges_x_3, edges_x_4, edges_x_5)
 60 | print(edges_x[edges_x$x_name == edges_x$y_name,])
 61 | edges_x = edges_x[edges_x$x_name != edges_x$y_name,]
 62 | edges_x$x_y_name = paste0(edges_x$x_name, ":", edges_x$y_name)
 63 | x_table = table(edges_x$x_y_name)
 64 | tmp = edges_x[edges_x$x_y_name %in% names(x_table)[x_table != 1],]
 65 | print(tmp[order(tmp$x_name),])
 66 | 
 67 | redundant_edges = c("En_M5:En_M1", "Ga_M5:Ga_M6", "L_M7:L_M3", "En_M7:En_M5", "Ga_M23:En_M5", "BS_M20:BS_M2", "Ga_M17:En_M7")
 68 | edges_x = edges_x[!edges_x$x_y %in% redundant_edges,]
 69 | print(length(unique(c(edges_x$x, edges_x$y))))
 70 | print(length(unique(c(edges_x$x_name, edges_x$y_name))))
 71 | 
 72 | write.table(edges_x, paste0(work_path, "edges_sub.txt"), row.names=F, sep="\t", quote=F)
 73 | 
 74 | nodes_sub = nodes[nodes$meta_group %in% c(edges_x$x, edges_x$y),]
 75 | write.table(nodes_sub, paste0(work_path, "nodes_sub.txt"), row.names=F, sep="\t", quote=F)
 76 | 
 77 | 
 78 | ##############################################################
 79 | ### making Histogram for accepted edges and rejected edges ###
 80 | ##############################################################
 81 | 
 82 | source("JAX_help_code.R")
 83 | source("JAX_color_code.R")
 84 | work_path = "./"
 85 | 
 86 | dat = read.table(paste0(work_path, "edges_MNNs.txt"), header=T, sep="\t", as.is=T)
 87 | dat = dat[dat$MNN_pairs_normalized > 1,]
 88 | 
 89 | dat_1 = dat[dat$Comments %in% c("Developmental progression", "Spatial continuity"),]
 90 | dat_2 = dat[dat$Comments %in% c("x","X"),]
 91 | 
 92 | ### collapse the rejected edges to one row per unordered (x, y) pair, keeping the
 93 | ### first occurrence; vectorized instead of growing dat_uniq with rbind() per row
 94 | n_rejected = nrow(dat_2)
 95 | ### paste0() yields ":" for zero-length input, so trim back to the real row count
 96 | pair_fwd = paste0(dat_2$x, ":", dat_2$y)[seq_len(n_rejected)]
 97 | pair_rev = paste0(dat_2$y, ":", dat_2$x)[seq_len(n_rejected)]
 98 | ### "A:B" and "B:A" map to the same canonical key
 99 | keep = !duplicated(pmin(pair_fwd, pair_rev))
100 | dat_uniq = dat_2[keep, , drop = FALSE]
101 | x_uniq = pair_fwd[keep]
102 | 
103 | ### rep() keeps the labelling valid even when a group has no rows
104 | dat_1$group = rep("Accepted", nrow(dat_1))
105 | dat_uniq$group = rep("Rejected", nrow(dat_uniq))
106 | df = rbind(dat_1, dat_uniq)
107 | 
108 | df$log2_MNN_pairs_normalized = log2(df$MNN_pairs_normalized)
109 | 
110 | ### Extended Data Fig. 11d
111 | 
112 | p <- df %>%
113 |     ggplot( aes(x=log2_MNN_pairs_normalized, fill=group)) +
114 |     geom_histogram( color="#e9ecef", alpha=0.5, position = 'identity') +
115 |     scale_fill_manual(values=c("#f85633", "#0058d6")) +
116 |     theme_ipsum() +
117 |     labs(fill="")
118 | 
119 | 
120 | 
121 | 


--------------------------------------------------------------------------------
/Section_7_key_TFs/.Rhistory:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/Section_7_key_TFs/.Rhistory


--------------------------------------------------------------------------------
/Section_7_key_TFs/HSCs_progenitors.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import scanpy as sc
 3 | import pandas as pd
 4 | import numpy as np
 5 | import os
 6 | import sys
 7 | from annoy import AnnoyIndex
 8 | 
 9 | WORK_PATH = './'
10 | 
11 | example_list = ["HSC"]
12 | 
13 | for example_i in example_list:
14 |     print(example_i)
15 | 
16 |     adata = sc.read_mtx(os.path.join(WORK_PATH, '%s.gene_count.mtx'%example_i))
17 |     fdata = pd.read_csv(os.path.join(WORK_PATH, "df_gene.csv"), index_col = 0)
18 |     pdata = pd.read_csv(os.path.join(WORK_PATH, '%s.df_cell.csv'%example_i), index_col = 0)
19 |     adata.obs = pdata
20 |     adata.var = fdata
21 | 
22 |     print(adata.shape)
23 | 
24 |     sc.pp.normalize_total(adata, target_sum=1e4)
25 |     sc.pp.log1p(adata)
26 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
27 |     adata = adata[:, adata.var.highly_variable]
28 |     sc.pp.scale(adata, max_value=10)
29 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
30 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
31 | 
32 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
33 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
34 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
35 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
36 | 
37 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
38 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
39 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
40 | 
41 |     adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%example_i))
42 | 
43 |     adata.write(os.path.join(WORK_PATH, '%s_adata_scale.h5ad'%example_i), compression="gzip")
44 | 
45 |     X = adata.obsm['X_pca']
46 |     print(X.shape)
47 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%example_i), X, delimiter=",", fmt='%1.3f')
48 | 
49 |     dist_metric = 'euclidean'
50 |     k = 15
51 |     ### Why k = 15? Because log2(mean cell number across cell types) is around 15.
52 | 
53 |     npc = X.shape[1]
54 |     ncell = X.shape[0]
55 |     annoy_index = AnnoyIndex(npc, metric=dist_metric)
56 | 
57 |     for i in range(ncell):
58 |         annoy_index.add_item(i, list(X[i,:]))
59 |     annoy_index.build(15) ### bigger number will make the result more accurate
60 | 
61 |     knn = []
62 |     for iCell in range(ncell):
63 |         knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
64 |     knn = np.array(knn, dtype=int)
65 | 
66 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.kNN_15.csv'%example_i), knn, delimiter=",", fmt='%s')
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 


--------------------------------------------------------------------------------
/Section_7_key_TFs/step3_Summarize_results.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ####################################
  3 | ### Section - 7, Key TFs & genes ###
  4 | ####################################
  5 | 
  6 | ### making plots to show which TFs or genes appear across different edges
  7 | 
  8 | ################
  9 | ### key TFs ####
 10 | ################
 11 | 
 12 | source("JAX_help_code.R")
 13 | source("JAX_color_code.R")
 14 | work_path = "./"
 15 | 
 16 | dat = read.csv(paste0(work_path, "All.keyTF.csv"), header=T, as.is=T)
 17 | 
 18 | df = dat %>% select(node_A, node_B, gene_short_name) %>% unique() %>%
 19 |     group_by(node_A, node_B) %>% tally()
 20 | print(paste0(mean(df$n), "+/-", sd(df$n)))
 21 | ### 39.64 +/- 43.44
 22 | 
 23 | print(paste(quantile(df$n, 0.25), quantile(df$n, 0.5), quantile(df$n, 0.75)))
 24 | ### 12 28 51
 25 | 
 26 | df_1 = dat %>% filter(comparing %in% c("group_1 vs. group_2","group_3 vs. group_4")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
 27 | df_2 = dat %>% filter(comparing %in% c("group_2 vs. group_3")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
 28 | df_3 = dat %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
 29 | sum(!df_1 %in% df_2)/length(df_3)
 30 | ### 5%
 31 | 
 32 | df_1 = dat %>% filter(edge_type == "Developmental progression", comparing == "group_1 vs. group_2") %>% 
 33 |     select(node_A, node_B, gene_short_name) %>% unique() %>%
 34 |     group_by(gene_short_name) %>% tally() 
 35 | print(head(df_1[order(df_1$n, decreasing = T),], 20))
 36 | p1 = df_1 %>% 
 37 |     ggplot(aes(n)) + geom_histogram(binwidth = 0.5) +
 38 |     labs(x="# of edges that have been involved", y="# of key TFs", title="") +
 39 |     theme_classic(base_size = 10) +
 40 |     theme(legend.position="none") +
 41 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) 
 42 | 
 43 | df_3 = dat %>% 
 44 |     select(node_A, node_B, gene_short_name) %>% unique() %>%
 45 |     group_by(gene_short_name) %>% tally()
 46 | print(head(df_3[order(df_3$n, decreasing = T),], 20))
 47 | p3 = df_3 %>%
 48 |     ggplot(aes(n)) + geom_histogram(binwidth = 0.6) +
 49 |     labs(x="# of edges that have been involved", y="# of key TFs", title="") +
 50 |     theme_classic(base_size = 10) +
 51 |     theme(legend.position="none") +
 52 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) 
 53 | 
 54 | ### Extended Data Fig. 11k
 55 | 
 56 | pdf(paste0(work_path, "Hist_TF.pdf"), 6, 3)
 57 | grid.arrange(p1, p3, nrow=1, ncol=2) 
 58 | dev.off()
 59 | 
 60 | ##################
 61 | ### key Genes ####
 62 | ##################
 63 | 
 64 | 
 65 | dat = read.csv(paste0(work_path, "All.keyGene.csv"), header=T, as.is=T)
 66 | 
 67 | df = dat %>% select(node_A, node_B, gene_short_name) %>% unique() %>%
 68 |     group_by(node_A, node_B) %>% tally()
 69 | print(paste0(mean(df$n), "+/-", sd(df$n)))
 70 | ### 293.24 +/- 358.04
 71 | 
 72 | print(paste(quantile(df$n, 0.25), quantile(df$n, 0.5), quantile(df$n, 0.75)))
 73 | ### 76 171 389
 74 | 
 75 | 
 76 | df_1 = dat %>% filter(comparing %in% c("group_1 vs. group_2","group_3 vs. group_4")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
 77 | df_2 = dat %>% filter(comparing %in% c("group_2 vs. group_3")) %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
 78 | df_3 = dat %>% group_by(gene_short_name) %>% tally() %>% pull(gene_short_name)
 79 | sum(!df_1 %in% df_2)/length(df_3)
 80 | ### 7%
 81 | 
 82 | 
 83 | df_1 = dat %>% filter(edge_type == "Developmental progression", comparing == "group_1 vs. group_2") %>% 
 84 |     select(node_A, node_B, gene_short_name) %>% unique() %>%
 85 |     group_by(gene_short_name) %>% tally() 
 86 | print(head(df_1[order(df_1$n, decreasing = TRUE),], 20)) ### top 20 genes by number of edges
 87 | df_1[order(df_1$n, decreasing = TRUE),] %>% filter(n > 10) %>% write.csv(paste0(work_path, "Fig.S16.d_1.csv")) ### written under work_path like the other outputs, not a user-specific Dropbox folder
 88 | p1 = df_1 %>% 
 89 |     ggplot(aes(n)) + geom_histogram(binwidth = 0.5) +
 90 |     labs(x="# of edges that have been involved", y="# of key Genes", title="") +
 91 |     theme_classic(base_size = 10) +
 92 |     theme(legend.position="none") +
 93 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) 
 94 | 
 95 | df_3 = dat %>% 
 96 |     select(node_A, node_B, gene_short_name) %>% unique() %>%
 97 |     group_by(gene_short_name) %>% tally()
 98 | print(head(df_3[order(df_3$n, decreasing = TRUE),], 20)) ### top 20 genes by number of edges
 99 | df_3[order(df_3$n, decreasing = TRUE),] %>% filter(n > 10) %>% write.csv(paste0(work_path, "Fig.S16.d_3.csv")) ### written under work_path like the other outputs, not a user-specific Dropbox folder
100 | p3 = df_3 %>%
101 |     ggplot(aes(n)) + geom_histogram(binwidth = 0.6) +
102 |     labs(x="# of edges that have been involved", y="# of key Genes", title="") +
103 |     theme_classic(base_size = 10) +
104 |     theme(legend.position="none") +
105 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) 
106 | 
107 | ### Extended Data Fig. 11l
108 | 
109 | pdf(paste0(work_path, "Hist_Gene.pdf"), 6, 3)
110 | grid.arrange(p1, p3, nrow=1, ncol=2) 
111 | dev.off()
112 | 
113 | 
114 | 


--------------------------------------------------------------------------------
/Section_7_key_TFs/step4_Pseudotime_endoderm.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | ####################################
  4 | ### Section - 7, Key TFs & genes ###
  5 | ####################################
  6 | 
  7 | ##################################################
  8 | ### Anterior primitive streak -> Def. endoderm ###
  9 | ##################################################
 10 | 
 11 | ### Can we estimate pseudotime of cells corresponding to Def. endoderm development 
 12 | ### during transition and plot key TF/genes expression as a function of pseudotime?
 13 | 
 14 | source("JAX_help_code.R")
 15 | source("JAX_color_code.R")
 16 | work_path = "./"
 17 | 
 18 | nodes = read.table(paste0(work_path, "nodes.txt"), header=T, as.is=T, sep="\t")
 19 | 
 20 | system_i = "Gastrulation"
 21 | 
 22 | edges = read.table(paste0(work_path, "edges.txt"), header=F, as.is=T, sep="\t")
 23 | names(edges) = c("system","x","y","x_name","y_name","edge_type")
 24 | edges = edges[edges$system == "Gastrulation",]
 25 | 
 26 | obj = readRDS(paste0(work_path, "obj_Early_PS.rds"))
 27 | pd = read.csv(paste0(work_path, system_i, "_adata_scale.obs.csv"), header=TRUE, row.names=1, as.is=TRUE) ### the obs table is plain CSV; readRDS() cannot parse it and rejects these arguments
 28 | pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
 29 | pd$celltype_new = NULL ### drop any stale annotation before re-attaching it from pd_all
 30 | pd = pd %>% left_join(pd_all[,c("cell_id", "celltype_new")], by = "cell_id")
 31 | 
 32 | if ("meta_group" %in% names(pd)) {pd$meta_group = NULL}
 33 | pd_x = pd %>% left_join(nodes %>% filter(system == system_i) %>% select(celltype_new, meta_group), by = "celltype_new") ### explicit key; celltype_new is the only shared column here
 34 | pd$meta_group = as.vector(pd_x$meta_group)
 35 | gene_count = GetAssayData(obj, slot = "counts")
 36 | 
 37 | y = data.frame(i = 1:nrow(pd),
 38 |                j = 1:nrow(pd),
 39 |                meta_group = as.vector(pd$meta_group), stringsAsFactors = FALSE)
 40 | 
 41 | x = readRDS(paste0(work_path, system_i, ".MNN.rds"))
 42 | x_rev = x
 43 | x_rev$i = x$j
 44 | x_rev$j = x$i
 45 | x_rev = rbind(x, x_rev)
 46 | dat = x_rev %>% left_join(y %>% select(i, meta_group), by = "i") %>%
 47 |     left_join(y %>% select(j, meta_group), by = "j")
 48 | 
 49 | cnt = 1
 50 | print(edges[cnt,])
 51 | ### xx / yy: the x and y entries of the selected edge; they are matched against meta_group below
 52 | xx = as.vector(edges$x)[cnt]
 53 | yy = as.vector(edges$y)[cnt]
 54 | ### Pairs whose i-side cell is in meta_group xx and whose j-side cell is in meta_group yy seed the two groups
 55 | dat_cnt = dat[dat$meta_group.x == xx & dat$meta_group.y == yy,]
 56 | group_1_MNN = as.vector(dat_cnt$i)
 57 | group_2_MNN = as.vector(dat_cnt$j)
 58 | ### coor: row indices of pd, used to restrict partner cells to a single meta_group
 59 | coor = c(1:nrow(pd))
 60 | ### Grow group_1_MNN with its partners inside xx until it holds >= 200 unique cells or stops growing
 61 | while(length(unique(group_1_MNN)) < 200){
 62 |     num_1 = length(unique(group_1_MNN))
 63 |     tmp = intersect(as.vector(dat$j[dat$i %in% group_1_MNN]), coor[pd$meta_group == xx])
 64 |     group_1_MNN = c(group_1_MNN, tmp)
 65 |     num_2 = length(unique(group_1_MNN))
 66 |     if(num_1 == num_2) {break}
 67 | }
 68 | ### group_1_close: cells of xx that are partners of group_1_MNN but not members of it, grown the same way
 69 | group_1_close = intersect(as.vector(dat$j[dat$i %in% group_1_MNN]), coor[pd$meta_group == xx])
 70 | group_1_close = group_1_close[!group_1_close %in% group_1_MNN]
 71 | while(length(unique(group_1_close)) < 200){
 72 |     num_1 = length(unique(group_1_close))
 73 |     tmp = intersect(as.vector(dat$j[dat$i %in% group_1_close]), coor[pd$meta_group == xx])
 74 |     group_1_close = c(group_1_close, tmp)
 75 |     group_1_close = group_1_close[!group_1_close %in% group_1_MNN]
 76 |     num_2 = length(unique(group_1_close))
 77 |     if(num_1 == num_2) {break}
 78 | }
 79 | ### Same expansion on the yy side, seeded with the j-side cells of the selected pairs
 80 | while(length(unique(group_2_MNN)) < 200){
 81 |     num_1 = length(unique(group_2_MNN))
 82 |     tmp = intersect(as.vector(dat$j[dat$i %in% group_2_MNN]), coor[pd$meta_group == yy])
 83 |     group_2_MNN = c(group_2_MNN, tmp)
 84 |     num_2 = length(unique(group_2_MNN))
 85 |     if(num_1 == num_2) {break}
 86 | }
 87 | ### group_2_close: cells of yy that are partners of group_2_MNN but not members of it
 88 | group_2_close = intersect(as.vector(dat$j[dat$i %in% group_2_MNN]), coor[pd$meta_group == yy])
 89 | group_2_close = group_2_close[!group_2_close %in% group_2_MNN]
 90 | while(length(unique(group_2_close)) < 200){
 91 |     num_1 = length(unique(group_2_close))
 92 |     tmp = intersect(as.vector(dat$j[dat$i %in% group_2_close]), coor[pd$meta_group == yy])
 93 |     group_2_close = c(group_2_close, tmp)
 94 |     group_2_close = group_2_close[!group_2_close %in% group_2_MNN]
 95 |     num_2 = length(unique(group_2_close))
 96 |     if(num_1 == num_2) {break}
 97 | }
 98 | ### Label the cells group_1 .. group_4; a later assignment overwrites an earlier one for cells found in several sets
 99 | group = rep("other", nrow(pd))
100 | group[coor %in% group_1_close] = "group_1"
101 | group[coor %in% group_1_MNN] = "group_2"
102 | group[coor %in% group_2_MNN] = "group_3"
103 | group[coor %in% group_2_close] = "group_4"
104 | pd$group = as.vector(group)
105 | 
106 | pd_sub = pd[pd$group != "other",]
107 | group_table = table(pd_sub$group)
108 | rownames(pd_sub) = as.vector(pd_sub$cell_id)  ### the dplyr joins dropped the row names; meta.data presumably needs cell ids as row names
109 | gene_count_sub = gene_count[,as.vector(pd_sub$cell_id)]
110 | obj_sub = CreateSeuratObject(gene_count_sub, meta.data = pd_sub)
111 | obj_sub = NormalizeData(obj_sub, normalization.method = "LogNormalize", scale.factor = 10000)
112 | obj_sub = FindVariableFeatures(obj_sub, selection.method = "vst", nfeatures = 2500)
113 | genes_include = VariableFeatures(obj_sub)
114 | ### Convert the Seurat object with the project helper (defined outside this script)
115 | cds = doObjectTransform(obj_sub, transform_to = "monocle")
116 | ### Add the batch / group columns of pijuan_obs.csv, with rows put in the order of the cells in cds
117 | pd_x = read.csv(paste0(work_path, "pijuan_obs.csv"), row.names=1, as.is=TRUE)
118 | pd_x = pd_x[colnames(cds),]
119 | 
120 | cds$batch_1 = as.vector(pd_x$batch)
121 | cds$batch_2 = as.vector(pd_x$group)
122 | ### Preprocess on the variable genes only, align on batch_1, then reduce dimensions
123 | cds = preprocess_cds(cds, use_genes = genes_include)
124 | cds = align_cds(cds, alignment_group = "batch_1")
125 | cds = reduce_dimension(cds)
126 | 
127 | saveRDS(cds, paste0(work_path, "cds_Def_endoderm.rds"))
128 | 
129 | 
130 | #####################
131 | ### Making plots ####
132 | #####################
133 | 
134 | plot_cells(cds, color_cells_by = "celltype_new", cell_size = 1)
135 | ### NOTE(review): no root is passed to order_cells(), so it presumably has to be chosen in an interactive session
136 | cds = cluster_cells(cds)
137 | cds = learn_graph(cds)
138 | cds = order_cells(cds)
139 | 
140 | plot_cells(cds,
141 |            color_cells_by = "pseudotime",
142 |            label_cell_groups=FALSE,
143 |            label_leaves=FALSE,
144 |            label_branch_points=FALSE,
145 |            graph_label_size=1.5,
146 |            cell_size = 1)
147 | ### Copy the pseudotime stored in the principal graph slot into the cell metadata
148 | cds$pseudotime = cds@principal_graph_aux[["UMAP"]]$pseudotime
149 | 
150 | df = data.frame(pData(cds)) 
151 | boxplot(df$pseudotime~factor(df$group))
152 | 
153 | df$UMAP_1 = reducedDims(cds)$UMAP[,1]
154 | df$UMAP_2 = reducedDims(cds)$UMAP[,2]
155 | 
156 | ### Extended Data Fig. 11m
157 | ### Each plot is built first and then written with ggsave(plot = p); ggsave() cannot be added to a plot with "+"
158 | p = ggplot() +
159 |     geom_point(data = df, aes(x = UMAP_1, y = UMAP_2), size=2, color = "black") +
160 |     geom_point(data = df, aes(x = UMAP_1, y = UMAP_2, color = celltype_new), size=1.8) +
161 |     theme_void() +
162 |     scale_color_manual(values=gastrulation_color_plate) +
163 |     theme(legend.position="none")
164 | ggsave(paste0(work_path, "Def_endoderm_UMAP_celltype.png"), plot = p, width = 6, height = 4, dpi = 300)
165 | 
166 | p = ggplot() +
167 |     geom_point(data = df, aes(x = UMAP_1, y = UMAP_2), size=2, color = "black") +
168 |     geom_point(data = df, aes(x = UMAP_1, y = UMAP_2, color = pseudotime), size=1.8) +
169 |     theme_void() +
170 |     scale_color_viridis(discrete=FALSE) +
171 |     theme(legend.position="none")
172 | ggsave(paste0(work_path, "Def_endoderm_UMAP_pseudotime.png"), plot = p, width = 6, height = 4, dpi = 300)
173 | 
174 | 
175 | ###################################
176 | ### making gene expression plot ###
177 | ###################################
178 | 
179 | gene_count = exprs(cds)
180 | gene_count = t(t(gene_count) / colSums(gene_count)) * 100000
181 | ### Genes plotted along pseudotime
182 | target_genes = c("Sox17", "Elf3", "Sall4", "Hesx1", "Lin28a", "Ovol2",
183 |                  "Cer1", "Slc25a4", "Cd24a",  "Slc2a3", "Lrpap1", "Krt18")
184 | 
185 | mouse_gene_sub = mouse_gene[mouse_gene$gene_short_name %in% target_genes,]
186 | ### Subset to the target genes, label rows by gene symbol and log-transform the stored entries (the @x slot)
187 | gene_count_x = gene_count[rownames(mouse_gene_sub),]
188 | rownames(gene_count_x) = as.vector(mouse_gene_sub$gene_short_name)
189 | gene_count_x@x = log(gene_count_x@x + 1)
190 | ### Long format, one row per gene x cell: all cells of the first gene, then all cells of the second gene, ...
191 | dat = data.frame(exp = c(t(as.matrix(gene_count_x))),
192 |                  gene = rep(rownames(gene_count_x), each = ncol(gene_count_x)),
193 |                  pseudotime = rep(as.vector(cds$pseudotime), nrow(gene_count_x)),
194 |                  pseudotime_rank = rep(rank(as.vector(cds$pseudotime)), nrow(gene_count_x)), stringsAsFactors = FALSE)
195 | dat$gene = factor(dat$gene, levels = target_genes)
196 | ### Loess curves without offset; this plot is not saved and p is overwritten further down
197 | p = ggplot() +
198 |     geom_smooth(data = dat, aes(pseudotime, exp, color = gene), method = loess, se = FALSE) +
199 |     labs(x="", y="", title="") +
200 |     theme_classic(base_size = 12) +
201 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) +
202 |     scale_color_brewer(palette = "Paired")
203 | 
204 | ### Fit each gene separately and shift its curve so that it is zero at the smallest pseudotime of the fitted grid
205 | dat_add_offset = NULL
206 | for(gene_i in target_genes){
207 |     dat_sub = dat %>% filter(gene == gene_i)
208 |     p_tmp = ggplot(dat_sub, aes(pseudotime, exp)) + geom_point() + stat_smooth(method = loess, se = FALSE)
209 |     dat_tmp = ggplot_build(p_tmp)$data[[2]][,c("x","y")]
210 |     dat_tmp$y = dat_tmp$y - dat_tmp$y[which.min(dat_tmp$x)]  ### which.min() instead of x == 0: no exact float match needed
211 |     dat_tmp$gene = gene_i
212 |     
213 |     dat_add_offset = rbind(dat_add_offset, dat_tmp)
214 | }
215 | dat_add_offset$gene = factor(dat_add_offset$gene, levels = target_genes)
216 | 
217 | p = ggplot() +
218 |     geom_line(data = dat_add_offset, aes(x, y, color = gene), size = 1) +
219 |     labs(x="", y="", title="") +
220 |     theme_classic(base_size = 12) +
221 |     theme(axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black")) +
222 |     scale_color_brewer(palette = "Paired")
223 | 
224 | ### Extended Data Fig. 11n
225 | 
226 | pdf(paste0(work_path, "Def_endoderm_gene_expression_pseudotime_add_offset.pdf"), 5, 3)
227 | print(p)
228 | dev.off()
229 | 
230 | 
231 | 


--------------------------------------------------------------------------------
/Section_7_key_TFs/step5_HSCs_progenitors.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | ####################################
  4 | ### Section - 7, Key TFs & genes ###
  5 | ####################################
  6 | 
  7 | ### Here, we present an example - HSCs, which show much heterogeneity even at the progenitor state.
  8 | ### How do they connect to multiple different derivatives?
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
 15 | rownames(pd_all) = as.vector(pd_all$cell_id)
 16 | 
 17 | nodes = read.table(paste0(work_path, "nodes.txt"), header=TRUE, as.is=TRUE, sep="\t")
 18 | 
 19 | celltype_include = c("Hematopoietic stem cells (Cd34+)")
 20 | example_i = "HSC"
 21 | ### Cells of the selected cell type, and the genes (by gene type and chromosome) kept for the embedding
 22 | pd_sub = pd_all[pd_all$celltype_new %in% celltype_include,]
 23 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M")),]
 24 | gene_count = doExtractData(pd_sub, mouse_gene_sub)
 25 | ### Export the counts together with the annotation of the same cells (pd_sub); no object named pd exists at this point
 26 | writeMM(t(gene_count), paste0(work_path, example_i, ".gene_count.mtx"))
 27 | write.csv(pd_sub, paste0(work_path, example_i, ".df_cell.csv"))
 28 | 
 29 | 
 30 | ### Using Scanpy to perform cell embedding
 31 | ### python HSCs_progenitors.py
 32 | 
 33 | pd_x = read.csv(paste0(work_path, example_i, "_adata_scale.obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
 34 | ### Days with more than 1,000 cells are down-sampled to 1,000 cells; days with fewer cells are kept in full
 35 | x_table = table(pd_x$day)
 36 | pd_1 = pd_x[pd_x$day %in% names(x_table)[x_table > 1000],]
 37 | pd_1_ = pd_1 %>% group_by(day) %>% sample_n(1000)
 38 | pd_1 = pd_1[pd_1$cell_id %in% pd_1_$cell_id,]
 39 | pd_2 = pd_x[pd_x$day %in% names(x_table)[x_table <= 1000],]
 40 | pd_plot = rbind(pd_1, pd_2)
 41 | pd_plot$day = factor(pd_plot$day, levels = names(day_color_plate))
 42 | 
 43 | ### Extended Data Fig. 11p
 44 | ### Uncoloured points underneath, day-coloured points in shuffled order on top; written with ggsave(plot = p)
 45 | p = ggplot() +
 46 |     geom_point(data = pd_plot, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1) +
 47 |     geom_point(data = pd_plot[sample(1:nrow(pd_plot)),], aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=0.8) +
 48 |     theme_void() +
 49 |     scale_color_manual(values=day_color_plate) +
 50 |     theme(legend.position="none")
 51 | ggsave(paste0(work_path, example_i, ".day.png"), plot = p, width = 6, height = 6, dpi = 300)
 52 | 
 53 | 
 54 | 
 55 | 
 56 | ##################################################################
 57 | ### what are the MNNs between HSCs and its multiple derivatives ##
 58 | ##################################################################
 59 | 
 60 | example_i = "HSC"
 61 | system_i = "Blood"
 62 | 
 63 | pd_target = read.csv(paste0(work_path, example_i, "_adata_scale.obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
 64 | ### The system-wide table below is a CSV file, hence read.csv(); readRDS() does not accept header/row.names/as.is
 65 | ### Of note, this profile is generated by co-embedding all the cell types of Blood system
 66 | ### Please see the scripts from Section_6_development_tree for details
 67 | pd = read.csv(paste0(work_path, system_i, "_adata_scale.obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
 68 | pd_all = readRDS(paste0(work_path, "df_cell_graph.rds"))
 69 | pd$celltype_new = NULL
 70 | pd = pd %>% left_join(pd_all[,c("cell_id", "celltype_new")], by = "cell_id")
 71 | nodes = read.table(paste0(work_path, "nodes.txt"), header=TRUE, as.is=TRUE, sep="\t")
 72 | ### Attach the meta_group of each cell through its celltype_new label (nodes of this system only)
 73 | if ("meta_group" %in% names(pd)) {pd$meta_group = NULL}
 74 | pd_x = pd %>% left_join(nodes %>% filter(system == system_i) %>% select(celltype_new, meta_group), by = "celltype_new")
 75 | pd$meta_group = as.vector(pd_x$meta_group)
 76 | ### Lookup table from the row index of pd (used both as i and as j) to meta_group and cell_id
 77 | y = data.frame(i = seq_len(nrow(pd)),
 78 |                j = seq_len(nrow(pd)),
 79 |                meta_group = as.vector(pd$meta_group), 
 80 |                cell_id = as.vector(pd$cell_id), stringsAsFactors = FALSE)
 81 | 
 82 | ### Of note, this profile is generated by identifying MNN pairs of cells within Blood system
 83 | ### Please see the scripts from Section_6_development_tree for details
 84 | x = readRDS(paste0(work_path, system_i, ".MNN.rds"))
 85 | x_rev = x
 86 | x_rev$i = x$j
 87 | x_rev$j = x$i
 88 | x_rev = rbind(x, x_rev)
 89 | dat = x_rev %>% left_join(y %>% select(i, meta_group, cell_id), by = "i") %>%
 90 |     left_join(y %>% select(j, meta_group, cell_id), by = "j")
 91 | ### nodes_include: the third column (V3) of every edge whose second column (V2) is B_M11
 92 | edges = read.table(paste0(work_path, "edges.txt"), as.is=TRUE, sep="\t")
 93 | nodes_include = as.vector(edges$V3[edges$V2 == "B_M11"])
 94 | ### For every B_M11 cell keep the single meta_group in nodes_include it shares the most pairs with (no ties kept)
 95 | dat_uniq = dat %>% filter(meta_group.x == "B_M11", meta_group.y %in% nodes_include) %>%
 96 |     group_by(cell_id.x, meta_group.y) %>% tally() %>% 
 97 |     group_by(cell_id.x) %>% slice_max(order_by = n, n = 1, with_ties = FALSE) %>% 
 98 |     rename(cell_id = cell_id.x, meta_group = meta_group.y) %>% select(cell_id, meta_group) %>%
 99 |     left_join(nodes[,c("meta_group", "celltype_new")], by = "meta_group") %>% rename(MNN = celltype_new)
100 | 
101 | pd_target = pd_target %>% left_join(dat_uniq[,c("cell_id","MNN")], by = "cell_id")
102 | 
103 | ### Extended Data Fig. 11q
104 | ### Cells without an assigned MNN label are drawn in grey; the plot is written with ggsave(plot = p)
105 | p = ggplot() +
106 |     geom_point(data = pd_target[is.na(pd_target$MNN),], aes(x = UMAP_2d_1, y = UMAP_2d_2), color = "grey80", size=0.6) +
107 |     geom_point(data = pd_target[!is.na(pd_target$MNN),], aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.65) +
108 |     geom_point(data = pd_target[!is.na(pd_target$MNN),], aes(x = UMAP_2d_1, y = UMAP_2d_2, color = MNN), size=0.6) +
109 |     theme_void() +
110 |     scale_color_manual(values=blood_system_color_plate) +
111 |     theme(legend.position="none")
112 | ggsave(paste0(work_path, example_i, ".MNN.png"), plot = p, width = 6, height = 6, dpi = 300)
113 | 
114 | 


--------------------------------------------------------------------------------
/Section_8_birth_series/Embedding_birth_series.py:
--------------------------------------------------------------------------------
  1 | 
  2 | #################################
  3 | ### Section - 8, Birth series ###
  4 | #################################
  5 | 
  6 | ######################################
  7 | ### Embedding birth series dataset ###
  8 | ######################################
  9 | 
 10 | ### The gene_count, df_cell, df_gene data could be downloaded from
 11 | ### /net/shendure/vol10/www/content/members/cxqiu/public/backup/jax/download/mtx
 12 | ### cell_annotation.run_28.csv.gz
 13 | ### gene_count.run_28.mtx.gz
 14 | ### gene_annotation.csv.gz
 15 | 
 16 | import scanpy as sc
 17 | import pandas as pd
 18 | import numpy as np
 19 | import os, sys
 20 | 
 21 | WORK_PATH = './'
 22 | example_i = "birth"
 23 | 
 24 | adata = sc.read_mtx(os.path.join(WORK_PATH, "gene_count.run_28.mtx.gz"))
 25 | pdata = pd.read_csv(os.path.join(WORK_PATH, "cell_annotation.run_28.csv.gz"), index_col = 0)
 26 | fdata = pd.read_csv(os.path.join(WORK_PATH, "gene_annotation.csv.gz"), index_col = 0)
 27 | adata.obs = pdata
 28 | adata.var = fdata
 29 | 
 30 | adata = adata[:, adata.var["gene_type"].isin(["protein_coding","pseudogene","lincRNA"])]
 31 | chr_include = ["chr" + str(i) for i in range(1,20)]
 32 | chr_include.append("chrM")
 33 | adata = adata[:, adata.var["chr"].isin(chr_include)]
 34 | 
 35 | adata.write(os.path.join(WORK_PATH, "adata_birth.h5ad"), compression="gzip")
 36 | 
 37 | sc.pp.normalize_total(adata, target_sum=1e4)
 38 | sc.pp.log1p(adata)
 39 | sc.pp.highly_variable_genes(adata, n_top_genes=2500)
 40 | adata = adata[:, adata.var.highly_variable]
 41 | sc.pp.scale(adata, max_value=10)
 42 | sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
 43 | sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
 44 | 
 45 | sc.tl.umap(adata, min_dist=0.3, n_components=3)
 46 | adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
 47 | adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
 48 | adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
 49 | 
 50 | sc.tl.leiden(adata, resolution=1, n_iterations=2)
 51 | adata.obs['leiden_res_1'] = adata.obs['leiden']
 52 | 
 53 | sc.tl.leiden(adata, resolution=2, n_iterations=2)
 54 | adata.obs['leiden_res_2'] = adata.obs['leiden']
 55 | 
 56 | sc.tl.umap(adata, min_dist=0.3, n_components=2)
 57 | print("Done UMAP ...")
 58 | print(str(format((time.time() - start_time)/3600, '.4f')) + 'hours')
 59 | 
 60 | adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
 61 | adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
 62 | 
 63 | adata.obs.to_csv(os.path.join(WORK_PATH, "adata_birth.obs.csv"))
 64 | adata.write(os.path.join(WORK_PATH, "adata_birth_processed.h5ad"), compression="gzip")
 65 | 
 66 | emb = adata.obsm['X_pca']
 67 | print(emb.shape)
 68 | np.savetxt(os.path.join(WORK_PATH, "adata_birth.PCs.csv"), emb, delimiter=",", fmt='%1.3f')
 69 | 
 70 | 
 71 | 
 72 | ##############################################################################################
 73 | ### perform subclustering on three major trajectories, by only including C-section samples ###
 74 | ##############################################################################################
 75 | 
 76 | 
 77 | import scanpy as sc
 78 | import pandas as pd
 79 | import numpy as np
 80 | import os, sys
 81 | 
 82 | WORK_PATH = './'
 83 | 
 84 | adata_all = sc.read_h5ad(os.path.join(WORK_PATH, "adata_birth.h5ad"))
 85 | 
 86 | ### pd_birth.csv is generated by write.csv("pd_birth.rds") in R
 87 | 
 88 | pdata = pd.read_csv(os.path.join(WORK_PATH, "pd_birth.csv"), index_col = 0)
 89 | adata_all.obs = pdata
 90 | 
 91 | ### 
 92 | targ_list = ["Adipocytes","Hepatocytes","Lung_and_airway"]
 93 | day_include = ["Csection_0m","Csection_20m","Csection_40m","Csection_60m","Csection_80m"]
 94 | 
 95 | for i in targ_list:
 96 | 
 97 |     adata = adata_all[adata_all.obs["major_trajectory"] == i]
 98 |     adata = adata[adata.obs["day"].isin(day_include)]
 99 |     print("processing: " + i)
100 |     print(adata.shape)
101 | 
102 |     sc.pp.normalize_total(adata, target_sum=1e4)
103 |     sc.pp.log1p(adata)
104 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
105 |     adata = adata[:, adata.var.highly_variable]
106 |     sc.pp.scale(adata, max_value=10)
107 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
108 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
109 | 
110 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
111 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
112 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
113 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
114 | 
115 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
116 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
117 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
118 |     
119 |     sc.tl.leiden(adata, resolution=1, n_iterations=2)
120 |     adata.obs['subcluster_leiden_res_1'] = adata.obs['leiden']
121 | 
122 |     sc.tl.leiden(adata, resolution=2, n_iterations=2)
123 |     adata.obs['subcluster_leiden_res_2'] = adata.obs['leiden']
124 | 
125 |     sc.tl.leiden(adata, resolution=5, n_iterations=2)
126 |     adata.obs['subcluster_leiden_res_5'] = adata.obs['leiden']
127 | 
128 |     adata.obs.to_csv(os.path.join(WORK_PATH, "adata_%s.obs.csv"%i))
129 |     adata.write(os.path.join(WORK_PATH, "adata_%s_processed.h5ad"%i), compression="gzip")
130 | 
131 | 
132 | 
133 | 
134 | 
135 | #####################################################################################################################
136 | ### perform subclustering on three major trajectories, by including C-section samples + 3 natural birthed samples ###
137 | #####################################################################################################################
138 | 
139 | 
140 | import scanpy as sc
141 | import pandas as pd
142 | import numpy as np
143 | import os, sys
144 | 
145 | WORK_PATH = './'
146 | 
147 | adata_all = sc.read_h5ad(os.path.join(WORK_PATH, "adata_birth.h5ad"))
148 | 
149 | ### pd_birth.csv is generated by write.csv("pd_birth.rds") in R
150 | 
151 | pdata = pd.read_csv(os.path.join(WORK_PATH, "pd_birth.csv"), index_col = 0)
152 | adata_all.obs = pdata
153 | 
154 | ### 
155 | targ_list = ["Adipocytes","Hepatocytes","Lung_and_airway"]
156 | day_include = ["NatBirth","Csection_0m","Csection_20m","Csection_40m","Csection_60m","Csection_80m"]
157 | 
158 | for i in targ_list:
159 | 
160 |     adata = adata_all[adata_all.obs["major_trajectory"] == i]
161 |     adata = adata[adata.obs["day"].isin(day_include)]
162 |     print("processing: " + i)
163 |     print(adata.shape)
164 | 
165 |     sc.pp.normalize_total(adata, target_sum=1e4)
166 |     sc.pp.log1p(adata)
167 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
168 |     adata = adata[:, adata.var.highly_variable]
169 |     sc.pp.scale(adata, max_value=10)
170 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
171 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
172 | 
173 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
174 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
175 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
176 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
177 | 
178 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
179 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
180 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
181 |     
182 |     sc.tl.leiden(adata, resolution=1, n_iterations=2)
183 |     adata.obs['subcluster_leiden_res_1'] = adata.obs['leiden']
184 | 
185 |     sc.tl.leiden(adata, resolution=2, n_iterations=2)
186 |     adata.obs['subcluster_leiden_res_2'] = adata.obs['leiden']
187 | 
188 |     sc.tl.leiden(adata, resolution=5, n_iterations=2)
189 |     adata.obs['subcluster_leiden_res_5'] = adata.obs['leiden']
190 | 
191 |     adata.obs.to_csv(os.path.join(WORK_PATH, "adata_%s_NatBirth.obs.csv"%i))
192 |     adata.write(os.path.join(WORK_PATH, "adata_%s_NatBirth_processed.h5ad"%i), compression="gzip")
193 | 
194 |     emb = adata.obsm['X_pca']
195 |     np.savetxt(os.path.join(WORK_PATH, "adata_%s_NatBirth.PCs.csv"%i), emb, delimiter=",", fmt='%1.3f')
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 


--------------------------------------------------------------------------------
/Section_8_birth_series/Embedding_individual_celltype.py:
--------------------------------------------------------------------------------
 1 | 
 2 | #################################
 3 | ### Section - 8, Birth series ###
 4 | #################################
 5 | 
 6 | #################################################################################################################
 7 | ### To systematically identify which cell types exhibit abrupt transcriptional changes before vs. after birth ###
 8 | #################################################################################################################
 9 | 
10 | import scanpy as sc
11 | import pandas as pd
12 | import numpy as np
13 | import os, sys
14 | import gc
15 | import time
16 | 
17 | WORK_PATH = "./"
18 | 
19 | adata_1 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_1.h5ad'))
20 | adata_2 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_2.h5ad'))
21 | adata_3 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_3.h5ad'))
22 | adata_4 = sc.read_h5ad(os.path.join(WORK_PATH, 'adata_JAX_dataset_4.h5ad'))
23 | 
24 | adata = adata_1.concatenate(adata_2, adata_3, adata_4)
25 | del adata_1, adata_2, adata_3, adata_4
26 | gc.collect()
27 | 
28 | day_include = ["E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25", "E17.5", 
29 |                "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "P0"]
30 | 
31 | celltype_list = {}
32 | file = open(os.path.join(WORK_PATH, "celltype_include.txt"))
33 | for line in file:
34 |     l = line.rstrip().split('\t')
35 |     celltype_list[l[0]] = l[1]
36 | file.close()
37 | 
38 | for celltype_i in celltype_list:
39 | 
40 |     adata = adata_i[adata_i.obs["celltype_update"] == celltype_i]
41 |     adata = adata_i[adata_i.obs["day"].isin(day_include)]
42 | 
43 |     sc.pp.normalize_total(adata, target_sum=1e4)
44 |     sc.pp.log1p(adata)
45 |     sc.pp.highly_variable_genes(adata, n_top_genes=2500)
46 |     adata = adata[:, adata.var.highly_variable]
47 |     sc.pp.scale(adata, max_value=10)
48 |     sc.tl.pca(adata, svd_solver='arpack', n_comps=30)
49 |     sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
50 | 
51 |     sc.tl.umap(adata, min_dist=0.3, n_components=3)
52 |     adata.obs['UMAP_1'] = list(adata.obsm['X_umap'][:,0])
53 |     adata.obs['UMAP_2'] = list(adata.obsm['X_umap'][:,1])
54 |     adata.obs['UMAP_3'] = list(adata.obsm['X_umap'][:,2])
55 | 
56 |     sc.tl.umap(adata, min_dist=0.3, n_components=2)
57 |     adata.obs['UMAP_2d_1'] = list(adata.obsm['X_umap'][:,0])
58 |     adata.obs['UMAP_2d_2'] = list(adata.obsm['X_umap'][:,1])
59 | 
60 |     adata.obs.to_csv(os.path.join(WORK_PATH, '%s_adata_scale.obs.csv'%celltype_list[celltype_i]))
61 | 
62 |     X = adata.obsm['X_pca']
63 |     print(X.shape)
64 |     np.savetxt(os.path.join(WORK_PATH, '%s_adata_scale.PCs.csv'%celltype_list[celltype_i]), X, delimiter=",", fmt='%1.3f')
65 | 
66 | 


--------------------------------------------------------------------------------
/Section_8_birth_series/step1_Celltypes_shift_after_birth.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #################################
  3 | ### Section - 8, Birth series ###
  4 | #################################
  5 | 
  6 | ################################################################################################
  7 | ### Re-embedded 2D UMAP of cells from three major cell clusters before E18.75, E18.75, or P0 ###
  8 | ################################################################################################
  9 | 
 10 | ### Fig. 6a
 11 | 
 12 | source("JAX_help_code.R")
 13 | source("JAX_color_code.R")
 14 | work_path = "./"
 15 | 
 16 | day_group_color_plate = c("Early" = "#a46cb7",
 17 |                           "E18.75" = "#7aa457",
 18 |                           "P0" = "#cb6a49",
 19 |                           "Other" = "grey90")
 20 | 
 21 | for(example_i in c("Hepatocytes", "Adipocytes", "Lung_and_airway")){
 22 |     ### The loop variable is used as is; it used to be overwritten with "Renal" on every iteration
 23 |     print(example_i)
 24 |     
 25 |     pd = readRDS(paste0(work_path, example_i, "_adata_scale.obs.rds"))
 26 |     ### day_group: "E18.75", "P0", or "Early" for every other day
 27 |     day_group = rep("Early", nrow(pd))
 28 |     day_group[pd$day == "E18.75"] = "E18.75"
 29 |     day_group[pd$day == "P0"] = "P0"
 30 |     pd$day_group = as.vector(day_group)
 31 |     ### One panel per day group: the highlighted group is drawn again on top of all cells; saved with ggsave(plot = p)
 32 |     pd$tmp = if_else(pd$day_group == "Early", "Early", "Other")
 33 |     p = pd %>%
 34 |         ggplot() +
 35 |         geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
 36 |         geom_point(data = subset(pd, tmp == 'Early'),
 37 |                    aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
 38 |         theme_void() +
 39 |         scale_color_manual(values=day_group_color_plate) +
 40 |         theme(legend.position="none")
 41 |     ggsave(paste0(work_path, example_i, "_1.png"), plot = p, width = 6, height = 6, dpi = 300)
 42 |     
 43 |     pd$tmp = if_else(pd$day_group == "E18.75", "E18.75", "Other")
 44 |     p = pd %>%
 45 |         ggplot() +
 46 |         geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
 47 |         geom_point(data = subset(pd, tmp == 'E18.75'),
 48 |                    aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
 49 |         theme_void() +
 50 |         scale_color_manual(values=day_group_color_plate) +
 51 |         theme(legend.position="none")
 52 |     ggsave(paste0(work_path, example_i, "_2.png"), plot = p, width = 6, height = 6, dpi = 300)
 53 |     
 54 |     pd$tmp = if_else(pd$day_group == "P0", "P0", "Other")
 55 |     p = pd %>%
 56 |         ggplot() +
 57 |         geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
 58 |         geom_point(data = subset(pd, tmp == 'P0'),
 59 |                    aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=0.5) +
 60 |         theme_void() +
 61 |         scale_color_manual(values=day_group_color_plate) +
 62 |         theme(legend.position="none")
 63 |     ggsave(paste0(work_path, example_i, "_3.png"), plot = p, width = 6, height = 6, dpi = 300)
 64 |     
 65 | }
 66 | 
 67 | 
 68 | #################################################################################################################
 69 | ### To systematically identify which cell types exhibit abrupt transcriptional changes before vs. after birth ###
 70 | #################################################################################################################
 71 | 
 72 | source("JAX_help_code.R")
 73 | source("JAX_color_code.R")
 74 | work_path = "./"
 75 | ### Cells labelled "E8.0-E8.5" are relabelled "E8.5"
 76 | pd_all = readRDS(paste0(work_path, "df_cell.rds"))
 77 | x = as.vector(pd_all$day)
 78 | x[pd_all$day == "E8.0-E8.5"] = "E8.5"
 79 | pd_all$day = as.vector(x)
 80 | ### dat: cell type x day combinations with >= 200 cells; kept cell types pass at P0 and at >= 5 earlier time points
 81 | pd_sub = pd_all[pd_all$day %in% c("E16.0", "E16.25", "E16.5", "E16.75", "E17.0", "E17.25", 
 82 |                                   "E17.5", "E17.75", "E18.0", "E18.25", "E18.5", "E18.75", "P0"),]
 83 | dat = pd_sub %>% group_by(celltype_update, day) %>% tally() %>% filter(n >= 200)
 84 | celltype_1 = dat %>% filter(day == "P0") %>% pull(celltype_update)
 85 | celltype_2 = dat %>% filter(day != "P0") %>% group_by(celltype_update) %>% summarise(n_day = n()) %>% filter(n_day >= 5) %>% pull(celltype_update)
 86 | celltype_x = intersect(celltype_1, celltype_2)
 87 | ### n_day above counts rows (time points) explicitly; tally() on a table that already has a column n depends on the dplyr version
 88 | celltype_include = data.frame(celltype_update = as.vector(celltype_x),
 89 |                               celltype_name = doSimpleName(as.vector(celltype_x)), 
 90 |                               stringsAsFactors = FALSE)
 91 | 
 92 | write.table(celltype_include, paste0(work_path, "celltype_include.txt"), row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)
 93 | 
 94 | ### Running embedding on individual cell types
 95 | ### python Embedding_individual_celltype.py
 96 | 
 97 | for(kk in seq_len(nrow(celltype_include))){
 98 |     
 99 |     celltype_update_i = celltype_include$celltype_update[kk]
100 |     celltype_name_i = celltype_include$celltype_name[kk]
101 |     print(celltype_update_i)
102 |     ### Per-cell annotation and PCA coordinates written by Embedding_individual_celltype.py
103 |     pd = read.csv(paste0(work_path, celltype_name_i, "_adata_scale.obs.csv"), header=TRUE, row.names=1, as.is=TRUE)
104 |     
105 |     emb = read.csv(paste0(work_path, celltype_name_i, "_adata_scale.PCs.csv"), header=FALSE, as.is=TRUE)
106 |     colnames(emb) = paste0("PC_", 1:30)
107 |     rownames(emb) = rownames(pd) = as.vector(pd$cell_id)
108 |     emb = as.matrix(emb)
109 |     ### Keep days with >= 200 cells; days above the median cell number are down-sampled to the median
110 |     x_table = pd %>% group_by(day) %>% tally() %>% filter(n >= 200)
111 |     x_table_median = median(x_table$n)
112 |     pd_1 = pd %>% filter(day %in% as.vector(x_table$day[x_table$n <= x_table_median])) %>% as.data.frame()
113 |     pd_2 = pd %>% filter(day %in% as.vector(x_table$day[x_table$n > x_table_median])) %>% group_by(day) %>% sample_n(x_table_median) %>% as.data.frame()
114 |     pd_sub = rbind(pd_1, pd_2)
115 |     rownames(pd_sub) = as.vector(pd_sub$cell_id)
116 |     emb_sub = emb[as.vector(pd_sub$cell_id),]
117 |     ### Nearest neighbours in PC space; k.param counts column 1 of the index matrix as well
118 |     k.param = floor(log2(x_table_median)) + 1 + 1; nn.method = "rann"; nn.eps = 0; annoy.metric = "euclidean"
119 |     nn.ranked = Seurat:::NNHelper(
120 |         data = emb_sub,
121 |         k = k.param,
122 |         method = nn.method,
123 |         searchtype = "standard",
124 |         eps = nn.eps,
125 |         metric = annoy.metric)
126 |     nn.ranked = Indices(object = nn.ranked)
127 |     nn_matrix = nn.ranked
128 |     
129 |     ### resultA[r, c]: day of the cell stored in column c of the neighbour index matrix for cell r
130 |     ### (column 1 is presumably the cell itself); built in one step instead of column by column
131 |     resultA = matrix(as.vector(pd_sub$day)[as.vector(nn_matrix)], nrow = nrow(nn_matrix))
132 |     
133 |     ### resultB[r, c]: TRUE when the day in column c + 1 of resultA differs from the day in column 1
134 |     ### (the vector resultA[, 1] is recycled down every column)
135 |     resultB = resultA[, -1, drop = FALSE] != resultA[, 1]
136 |     
137 |     ### pct: per cell, the fraction of the remaining k.param - 1 columns that come from a different day
138 |     ### than column 1; day is the day in column 1
139 |     res = data.frame(day = resultA[,1],
140 |                      pct = rowSums(resultB)/ncol(resultB))
141 |     
142 |     ### Mean fraction per day, printed for inspection
143 |     
144 |     print(res %>% group_by(day) %>% summarise(mean_pct = mean(pct)) %>% as.data.frame())
145 |     
146 |     saveRDS(res, paste0(work_path, celltype_name_i, "_res.rds"))
147 |     
148 | }
149 | 
150 | 
151 | ###################################
152 | ### Summarizing the kNN results ###
153 | ###################################
154 | 
155 | # Per cell type: average, within each day, the fraction of neighbours that come from another day.
156 | summarise_one = function(kk){
157 |     print(celltype_include$celltype_update[kk])
158 |     readRDS(paste0(work_path, celltype_include$celltype_name[kk], "_res.rds")) %>%
159 |         group_by(day) %>% summarise(mean_pct = mean(pct)) %>%
160 |         mutate(celltype_update = celltype_include$celltype_update[kk]) %>% as.data.frame()
161 | }
162 | # seq_len() avoids the 1:0 trap of 1:nrow(), and binding once avoids regrowing df on every pass.
163 | df = do.call(rbind, lapply(seq_len(nrow(celltype_include)), summarise_one))
164 | 
165 | # Rank cell types by their P0 value (reversed for the y axis) and rescale the fraction to percent.
166 | df_order = df %>% filter(day == "P0") %>% arrange(mean_pct)
167 | df$celltype_update = factor(df$celltype_update, levels = rev(as.vector(df_order$celltype_update)))
168 | df$mean_pct = 100*df$mean_pct
169 | 
170 | # Colour assigned to each timepoint, E16.0 through P0.
171 | day_color = c("E16.0"  = "#5dae46",
172 |               "E16.25" = "#855ecd",
173 |               "E16.5"  = "#b2b044",
174 |               "E16.75" = "#c94ca4",
175 |               "E17.0"  = "#55a574",
176 |               "E17.25" = "#d53f63",
177 |               "E17.5"  = "#4cbad2",
178 |               "E17.75" = "#c95534",
179 |               "E18.0"  = "#617fc6",
180 |               "E18.25" = "#d89248",
181 |               "E18.5"  = "#bd80c4",
182 |               "E18.75" = "#7f702f",
183 |               "P0"     = "#bc6476")
184 | # Timepoint labels in chronological order.
185 | day_list = names(day_color)
186 | 
187 | ### Fig. 6b
188 | ### size 10 X 6
189 | 
190 | black_text = element_text(color="black")
191 | p = ggplot(mapping = aes(x = mean_pct, y = celltype_update)) +
192 |     geom_point(data = filter(df, day == "P0"), color = "black", size = 3) +   # larger black dot drawn under the P0 point
193 |     geom_point(data = df, aes(color = day), size = 2) +
194 |     scale_color_manual(values=day_color) +
195 |     labs(x = "Mean % of the nearest neighboring cells from different timepoints", y = "") +
196 |     theme_minimal() + theme(axis.text.x = black_text, axis.text.y = black_text)
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 


--------------------------------------------------------------------------------
/Section_8_birth_series/step2_Embedding_birth_series.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #################################
  3 | ### Section - 8, Birth series ###
  4 | #################################
  5 | 
  6 | ###########################################
  7 | ### Plotting cell number across samples ###
  8 | ###########################################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | df = readRDS(paste0(work_path, "pd_birth.rds"))
 15 | # Position (1-12) given to each RT group on the sample axis of the bar plot.
 16 | rt_rank = c("E18.75_L2-01" = "1",
 17 |             "E18.75_L2-02" = "2",
 18 |             "E18.75_L2-03" = "3",
 19 |             "E18.75_L2-07" = "4",
 20 |             "E18.75_L2-08" = "5",
 21 |             "E18.75_L2-04" = "6",
 22 |             "E18.75_L2-05" = "7",
 23 |             "E18.75_L2-06" = "8",
 24 |             "E18.75_L2-09" = "9",
 25 |             "E18.75_L1-01" = "10",
 26 |             "P0_L1-01"     = "11",
 27 |             "P0_L1-04"     = "12")
 28 | df$tmp = unname(rt_rank[as.character(df$RT_group)])   # RT groups not listed above stay NA
 29 | 
 30 | # Cells per sample; `day` is expected to be a column of df (it drives the fill colour) -- TODO confirm.
 31 | pd_cell_num_1 = as.data.frame(rename(tally(group_by(df, tmp, day)), cell_num = n))
 32 | pd_cell_num_1$tmp = factor(pd_cell_num_1$tmp, levels = rev(1:12))
 33 | 
 34 | ### Extended Data Fig. 12a
 35 | # Horizontal bars of cells per sample, each labelled with its comma-formatted count.
 36 | count_label = geom_text(aes(label = scales::comma(cell_num)),
 37 |                         hjust = -0.1,
 38 |                         position = position_dodge(width = 1),
 39 |                         inherit.aes = TRUE,
 40 |                         size = 5)
 41 | p = ggplot(pd_cell_num_1, aes(x = tmp, y = cell_num, fill = day)) +
 42 |     geom_bar(stat="identity") +
 43 |     coord_flip() +
 44 |     scale_fill_manual(values = birth_color_plate) +
 45 |     count_label +
 46 |     labs(x = "", y = "Cells profiled") +
 47 |     theme_classic(base_size = 15) +
 48 |     theme(legend.position="none", axis.text.x = element_text(color="black"), axis.text.y = element_text(color="black"))
 49 | 
 50 | 
 51 | ##################################################
 52 | ### 2D UMAP of cells from birth series dataset ###
 53 | ##################################################
 54 | 
 55 | source("JAX_help_code.R")
 56 | source("JAX_color_code.R")
 57 | work_path = "./"
 58 | 
 59 | pd = readRDS(paste0(work_path, "pd_birth.rds"))
 60 | 
 61 | ### Extended Data Fig. 12b
 62 | # Default-coloured points (size 0.5) are drawn first, the trajectory-coloured ones (size 0.3) on top.
 63 | p = pd %>%
 64 |     ggplot() +
 65 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2), size=0.5) +
 66 |     geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = major_trajectory), size=0.3) +
 67 |     scale_color_manual(values=major_trajectory_color_plate) +
 68 |     theme_void() + theme(legend.position="none")
 69 | # Save explicitly: ggsave() is not a plot component, and `plot + ggsave()` errors in ggplot2 >= 3.3.4.
 70 | ggsave(paste0(work_path, "birth_anno.png"), plot = p, width = 10, height = 10, dpi = 300)
 71 | 
 72 | 
 73 | #############################################################################################
 74 | ### 2D UMAP of hepatocytes, and of subclustering results for adipocytes and lung & airway ###
 75 | #############################################################################################
 76 | 
 77 | ##########################
 78 | ###  1 - Hepatocytes
 79 | 
 80 | source("JAX_help_code.R")
 81 | source("JAX_color_code.R")
 82 | work_path = "./"
 83 | 
 84 | trajectory_i = "Hepatocytes"
 85 | df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections.obs.rds"))
 86 | df$day = paste0("Csection_", as.vector(df$day), "m")
 87 | 
 88 | ### Fig. 6e (1st row)
 89 | # One panel per timepoint: cells of that timepoint at full opacity over all cells at alpha 0.3.
 90 | birth_color_plate["other"] = "grey80"   # assigning by name does not duplicate the entry when re-run
 91 | 
 92 | for(i in paste0("Csection_", c(0,20,40,60,80), "m")){
 93 |     df$tmp = if_else(df$day == i, i, "other")
 94 |     p_i = ggplot(df) +
 95 |         geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5, alpha = 0.3) +
 96 |         geom_point(data = subset(df, tmp == i),
 97 |                    aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5) +
 98 |         theme_void() +
 99 |         scale_color_manual(values=birth_color_plate) +
100 |         theme(legend.position="none")
101 |     # ggsave() is called on the finished plot; `plot + ggsave()` errors in ggplot2 >= 3.3.4.
102 |     try(ggsave(paste0(work_path, trajectory_i, "_", i, ".png"), plot = p_i, width = 4, height = 4, dpi = 300))
103 |     df$tmp = NULL
104 | }
105 | 
106 | 
107 | ##########################
108 | ###  2 - Adipocytes
109 | 
110 | trajectory_i = "Adipocytes"
111 | df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections.obs.rds"))
112 | df$day = paste0("Csection_", as.vector(df$day), "m")
113 | 
114 | ### Extended Data Fig. 12c
115 | # Subclusters in colour; slightly larger black points underneath give each cell an outline.
116 | color_plate = c("Brown adipocyte cells" = "#cb6751",
117 |                 "Adipocyte progenitor cells" = "#7aa457",
118 |                 "Adipocyte cells (Cyp2e1+)" = "#9e6ebd")
119 | 
120 | p = ggplot() +
121 |     geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.2, color="black") +
122 |     geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno_subclustering), size=1) +
123 |     theme_void() +
124 |     scale_color_manual(values=color_plate) +
125 |     theme(legend.position="none")
126 | ggsave(paste0(work_path, trajectory_i, "_anno.png"), plot = p, width = 4, height = 4, dpi = 300)
127 | 
128 | ### Fig. 6e (2nd row)
129 | # One panel per timepoint; birth_color_plate must already hold an "other" entry for the background cells.
130 | for(i in paste0("Csection_", c(0,20,40,60,80), "m")){
131 |     df$tmp = if_else(df$day == i, i, "other")
132 |     p_i = ggplot(df) +
133 |         geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5, alpha = 0.3) +
134 |         geom_point(data = subset(df, tmp == i),
135 |                    aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5) +
136 |         theme_void() +
137 |         scale_color_manual(values=birth_color_plate) +
138 |         theme(legend.position="none")
139 |     # ggsave() is called on the finished plot; `plot + ggsave()` errors in ggplot2 >= 3.3.4.
140 |     try(ggsave(paste0(work_path, trajectory_i, "_", i, ".png"), plot = p_i, width = 4, height = 4, dpi = 300))
141 |     df$tmp = NULL
142 | }
143 | 
144 | 
145 | ##########################
146 | ###  3 - Lung & airway
147 | 
148 | trajectory_i = "Lung_and_airway"
149 | df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections.obs.rds"))
150 | df$day = paste0("Csection_", as.vector(df$day), "m")
151 | 
152 | ### Extended Data Fig. 12d
153 | # Subclusters in colour; slightly larger black points underneath give each cell an outline.
154 | color_plate = c("Airway club cells" = "#6dd9b4",
155 |                 "Alveolar Type 1 cells" = "#008cff",
156 |                 "Alveolar Type 2 cells" = "#dab300",
157 |                 "Lung cells (Eln+)" = "#185e3e",
158 |                 "Airway goblet cells" = "#663fc6")
159 | 
160 | p = ggplot() +
161 |     geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2), size=1.2, color="black") +
162 |     geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2, color = anno_subclustering), size=1) +
163 |     theme_void() +
164 |     scale_color_manual(values=color_plate) +
165 |     theme(legend.position="none")
166 | ggsave(paste0(work_path, trajectory_i, "_anno.png"), plot = p, width = 4, height = 4, dpi = 300)
167 | 
168 | ### Fig. 6e (3rd row)
169 | # One panel per timepoint; birth_color_plate must already hold an "other" entry for the background cells.
170 | for(i in paste0("Csection_", c(0,20,40,60,80), "m")){
171 |     df$tmp = if_else(df$day == i, i, "other")
172 |     p_i = ggplot(df) +
173 |         geom_point(aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5, alpha = 0.3) +
174 |         geom_point(data = subset(df, tmp == i),
175 |                    aes(x = UMAP_2d_1, y = UMAP_2d_2, color = tmp), size=1.5) +
176 |         theme_void() +
177 |         scale_color_manual(values=birth_color_plate) +
178 |         theme(legend.position="none")
179 |     # ggsave() is called on the finished plot; `plot + ggsave()` errors in ggplot2 >= 3.3.4.
180 |     try(ggsave(paste0(work_path, trajectory_i, "_", i, ".png"), plot = p_i, width = 4, height = 4, dpi = 300))
181 |     df$tmp = NULL
182 | }
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 


--------------------------------------------------------------------------------
/Section_8_birth_series/step3_Celltypes_changing_over_Csection.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #################################
 3 | ### Section - 8, Birth series ###
 4 | #################################
 5 | 
 6 | ###########################################################
 7 | ### which cell type is changing across C-section series ###
 8 | ###########################################################
 9 | 
10 | source("JAX_help_code.R")
11 | source("JAX_color_code.R")
12 | work_path = "./"
13 | 
14 | # Cell metadata and PC embedding; embedding rows are assumed to follow the order of pd -- TODO confirm.
15 | pd = readRDS(paste0(work_path, "pd_birth.rds"))
16 | pd$anno = as.vector(pd$major_trajectory)
17 | emb = as.matrix(readRDS(paste0(work_path, "Birth_series.PCs.rds")))
18 | 
19 | # Restrict both tables to cells from the five C-section timepoints.
20 | csection_days = paste0("Csection_", c(0,20,40,60,80), "m")
21 | index = pd$day %in% csection_days
22 | pd_sub = pd[index,]
23 | emb_sub = emb[index,]
24 | 
25 | # Neighbour search of the subset against itself; 11 hits are kept per cell.
26 | k.param = 11; nn.method = "rann"; nn.eps = 0; annoy.metric = "euclidean"
27 | nn.ranked = Indices(object = Seurat:::NNHelper(
28 |     data = emb_sub, k = k.param,
29 |     method = nn.method, searchtype = "standard",
30 |     eps = nn.eps, metric = annoy.metric))
31 | nn_matrix = nn.ranked
32 | 
33 | # Write the neighbour index matrix to disk.
34 | saveRDS(nn_matrix, paste0(work_path, "nn_matrix.rds"))
35 | 
36 | # Numeric timepoint of each retained cell, parsed from labels such as "Csection_20m".
37 | day_value = as.numeric(gsub("m", "", gsub("Csection_", "", as.vector(pd_sub$day))))
38 | 
39 | # Timepoint of every hit in one indexing step; column 1 is used as the cell's own timepoint.
40 | nn_res = matrix(day_value[as.vector(nn_matrix)], nrow = nrow(nn_matrix), ncol = ncol(nn_matrix))
41 | dat = data.frame(org_day = nn_res[,1],
42 |                  nn_day = rowMeans(nn_res[,-1,drop=FALSE]),
43 |                  anno = as.vector(pd_sub$anno))
44 | 
45 | # Pearson correlation, per cell type, between a cell's timepoint and its neighbours' mean timepoint.
46 | # Cell types with fewer than 3 cells are skipped; cor.test() used to stop the script on them.
47 | celltype_list = names(table(dat$anno))
48 | res = do.call(rbind, lapply(celltype_list, function(celltype_i){
49 |     dat_i = dat[which(dat$anno == celltype_i),]
50 |     if(nrow(dat_i) < 3) return(NULL)
51 |     data.frame(anno = celltype_i,
52 |                cor = cor(dat_i$org_day, dat_i$nn_day), stringsAsFactors = FALSE)
53 | }))
54 | # Sort ascending and freeze that order as the factor levels used for plotting.
55 | res = res[order(res$cor),]
56 | res$anno = factor(res$anno, levels = as.vector(res$anno))
57 | 
58 | # Horizontal bar per cell type showing its correlation, filled with the trajectory colour.
59 | axis_text = element_text(color="black")
60 | p = ggplot(res, aes(x = anno, y = cor, fill = anno)) +
61 |     geom_bar(stat="identity") +
62 |     coord_flip() +
63 |     scale_fill_manual(values = major_trajectory_color_plate) +
64 |     labs(x = "", y = "") +
65 |     theme_classic(base_size = 15) +
66 |     theme(legend.position="none", axis.text.x = axis_text, axis.text.y = axis_text)
67 | 


--------------------------------------------------------------------------------
/Section_8_birth_series/step5_Comparing_NatBirth.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #################################
  3 | ### Section - 8, Birth series ###
  4 | #################################
  5 | 
  6 | ##########################################################################################################
  7 | ### 2D UMAP of subclustering results for adipocytes and lung & airway (adding Natural birthed samples) ###
  8 | ##########################################################################################################
  9 | 
 10 | source("JAX_help_code.R")
 11 | source("JAX_color_code.R")
 12 | work_path = "./"
 13 | 
 14 | trajectory_list = c("Hepatocytes", "Adipocytes", "Lung_and_airway")
 15 | 
 16 | ### Extended Data Fig. 12f
 17 | # One panel per replicate/timepoint: its cells coloured by `day`, drawn over all cells in grey.
 18 | for(trajectory_i in trajectory_list){
 19 |     
 20 |     print(trajectory_i)
 21 |     
 22 |     df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections_NatBirth.obs.rds"))
 23 | 
 24 |     # Label cells by `day`, except embryos 76-78, which are labelled as NatBirth replicates 1-3.
 25 |     rep_id = as.vector(df$day)
 26 |     rep_id[df$embryo_id == "embryo_76"] = "NatBirth_rep1"
 27 |     rep_id[df$embryo_id == "embryo_77"] = "NatBirth_rep2"
 28 |     rep_id[df$embryo_id == "embryo_78"] = "NatBirth_rep3"
 29 |     df$rep_id = as.vector(rep_id)
 30 |     
 31 |     for(i in names(table(df$rep_id))){
 32 |         p_i = ggplot() +
 33 |             geom_point(data = df, aes(x = UMAP_2d_1, y = UMAP_2d_2), color = "grey80", size=1.5, alpha = 0.3) +
 34 |             geom_point(data = df %>% filter(rep_id == i),
 35 |                        aes(x = UMAP_2d_1, y = UMAP_2d_2, color = day), size=1.5) +
 36 |             theme_void() +
 37 |             scale_color_manual(values=birth_color_plate) +
 38 |             theme(legend.position="none")
 39 |         # ggsave() is called on the finished plot; `plot + ggsave()` errors in ggplot2 >= 3.3.4.
 40 |         try(ggsave(paste0(work_path, "birth_", trajectory_i, "_NatBirth_", i, ".png"), plot = p_i, width = 4, height = 4, dpi = 300))
 41 |     }
 42 | }
 43 | 
 44 | 
 45 | ###############################################################################
 46 | ### Identifying the neighbors for each NatBirth samples in the co-embedding ###
 47 | ###############################################################################
 48 | 
 49 | source("JAX_help_code.R")
 50 | source("JAX_color_code.R")
 51 | work_path = "./"
 52 | 
 53 | trajectory_list = c("Hepatocytes", "Adipocytes", "Lung_and_airway")
 54 | 
 55 | ### Extended Data Fig. 12e
 56 | # For every NatBirth replicate, count which samples its cells' nearest neighbours come from.
 57 | # Plots are also collected in p_list (keyed by trajectory) because `p` only keeps the last one.
 58 | p_list = list()
 59 | 
 60 | for(trajectory_i in trajectory_list){
 61 |     
 62 |     print(trajectory_i)
 63 |     
 64 |     df = readRDS(paste0(work_path, "Birth_series_", trajectory_i, "_Csections_NatBirth.obs.rds"))
 65 |     
 66 |     # Label cells by `day`, except embryos 76-78, which are labelled as NatBirth replicates 1-3.
 67 |     rep_id = as.vector(df$day)
 68 |     rep_id[df$embryo_id == "embryo_76"] = "NatBirth_rep1"
 69 |     rep_id[df$embryo_id == "embryo_77"] = "NatBirth_rep2"
 70 |     rep_id[df$embryo_id == "embryo_78"] = "NatBirth_rep3"
 71 |     df$rep_id = as.vector(rep_id)
 72 |     
 73 |     # 30-column PC table; rows are named with df$cell_id, i.e. assumed to follow df's row order -- TODO confirm.
 74 |     emb = read.csv(paste0(work_path, "adata_", trajectory_i, "_NatBirth.PCs.csv"), header=FALSE, as.is=TRUE)
 75 |     colnames(emb) = paste0("PC_", 1:30)
 76 |     rownames(emb) = rownames(df) = as.vector(df$cell_id)
 77 |     emb = as.matrix(emb)
 78 |     
 79 |     result = list()
 80 |     for(kk in 1:3){
 81 |         rep_i = paste0("NatBirth_rep", kk); print(rep_i)
 82 |         
 83 |         # Query set: the cells of this replicate.
 84 |         df_1 = df[df$rep_id == rep_i,]
 85 |         emb_1 = emb[df$rep_id == rep_i,]
 86 |         
 87 |         # Reference set: 5,000 randomly drawn cells from every other sample.
 88 |         # NOTE(review): sample_n() stops if a sample has fewer than 5,000 cells -- confirm all are large enough.
 89 |         df_2 = df[df$rep_id != rep_i,]
 90 |         df_2_x = df_2 %>% group_by(rep_id) %>% sample_n(5000)
 91 |         df_2 = df_2[df_2$cell_id %in% df_2_x$cell_id,]
 92 |         emb_2 = emb[as.vector(df_2$cell_id),]
 93 |         
 94 |         # Ten reference hits are requested for each query cell.
 95 |         k.param = 10; nn.method = "rann"; nn.eps = 0; annoy.metric = "euclidean"
 96 |         nn.ranked = Seurat:::NNHelper(
 97 |             data = emb_2,
 98 |             query = emb_1,
 99 |             k = k.param,
100 |             method = nn.method,
101 |             searchtype = "standard",
102 |             eps = nn.eps,
103 |             metric = annoy.metric)
104 |         nn_matrix = Indices(object = nn.ranked)
105 |         
106 |         # Sample label of every hit (one indexing step), then the number of hits per sample.
107 |         resultA = matrix(as.vector(df_2$rep_id)[as.vector(nn_matrix[,1:k.param])], nrow = nrow(nn_matrix))
108 |         result[[rep_i]] = table(c(resultA))
109 |     }
110 |     
111 |     # One row per (replicate, source sample); the replicate itself is absent from its reference set, so it gets an explicit 0.
112 |     dat = do.call(rbind, lapply(names(result), function(rep_i){
113 |         counts_i = result[[rep_i]]
114 |         data.frame(target_id = c(names(counts_i), rep_i),
115 |                    num = c(as.vector(counts_i), 0),
116 |                    rep_id = rep_i, stringsAsFactors = FALSE)
117 |     }))
118 |     
119 |     dat$target_id = factor(dat$target_id, levels = c(paste0("NatBirth_rep",1:3), paste0("Csection_",c(0,20,40,60,80),"m")))
120 |     
121 |     p = dat %>%
122 |         ggplot(aes(x=target_id, y = num, fill = target_id))+
123 |         geom_bar(stat="identity") + facet_grid(rep_id ~ .) +
124 |         theme_classic(base_size = 10) +
125 |         theme(legend.position="none") +
126 |         labs(x="",y="# of kNNs") +
127 |         scale_fill_manual(values=birth_color_plate) +
128 |         theme(axis.text.x = element_text(color="black", angle = 90, vjust = 0.5, hjust=1), axis.text.y = element_text(color="black"))
129 |     # Keep this trajectory's plot; `p` is overwritten on the next iteration.
130 |     p_list[[trajectory_i]] = p
131 |     
132 | }
133 | 
134 | 
135 | ##################################################################
136 | ###  DEGs between NatBirth and C-section samples (20m and 40m) ###
137 | ##################################################################
138 | 
139 | source("JAX_help_code.R")
140 | source("JAX_color_code.R")
141 | work_path = "./"
142 | 
143 | pd = readRDS(paste0(work_path, "pd_birth.rds"))
144 | pd$anno = as.vector(pd$major_trajectory)
145 | 
146 | # Keep NatBirth plus the 20- and 40-minute C-section cells; regroup them by cell type and embryo group.
147 | pd_sub = filter(pd, day %in% c("NatBirth","Csection_20m","Csection_40m"))
148 | pd_sub$embryo_group = paste0(pd_sub$anno, "_", pd_sub$embryo_group)
149 | # Cap every group at 10,000 randomly drawn cells; groups at or below the cap are kept whole.
150 | x_table = table(pd_sub$embryo_group)
151 | is_large = pd_sub$embryo_group %in% names(x_table)[x_table > 10000]
152 | pd_sub_1_x = sample_n(group_by(pd_sub[is_large,], embryo_group), 10000)
153 | df = rbind(pd_sub[is_large & pd_sub$cell_id %in% pd_sub_1_x$cell_id,], pd_sub[!is_large,])
154 | 
155 | # Counts for protein-coding / pseudogene / lincRNA genes on chr1-19, M, X and Y, then LogNormalize.
156 | mouse_gene_sub = mouse_gene[(mouse_gene$gene_type %in% c('protein_coding', 'pseudogene', 'lincRNA')) & mouse_gene$chr %in% paste0("chr", c(1:19, "M", "X", "Y")),]
157 | gene_count = doExtractData(df, mouse_gene_sub)
158 | obj = NormalizeData(CreateSeuratObject(gene_count, meta.data = df), normalization.method = "LogNormalize", scale.factor = 10000)
159 | 
160 | # Cell types come from the downsampled table so every type has cells in obj; each is tested on its 5,000 variable genes.
161 | anno_list = names(table(df$anno))
162 | res_all = NULL
163 | 
164 | for(i in seq_along(anno_list)){
165 |     anno_i = anno_list[i]; print(anno_i)
166 |     obj_sub = subset(obj, subset = anno == anno_i)
167 |     Idents(obj_sub) = as.vector(obj_sub$day)
168 |     obj_sub = FindVariableFeatures(obj_sub, selection.method = "vst", nfeatures = 5000)
169 |     res = FindMarkers(obj_sub, ident.1 = "NatBirth", features = VariableFeatures(obj_sub))
170 |     res = res %>% mutate(gene_ID = rownames(res)) %>%
171 |         left_join(mouse_gene[,c("gene_ID","gene_short_name")], by = "gene_ID") %>% as.data.frame() %>% filter(p_val_adj < 0.05)
172 |     # Seurat >= 4.0 names the fold-change column avg_log2FC, older releases avg_logFC; accept either.
173 |     fc_col = intersect(c("avg_log2FC", "avg_logFC"), colnames(res))[1]
174 |     res$high_in_which = if_else(res[[fc_col]] > 0, "Up_in_NatBirth", "Down_in_NatBirth")
175 |     res$p_val = NULL
176 |     res$major_cell_cluster = anno_i
177 |     res_all = rbind(res_all, res)
178 | }
179 | 
180 | write.csv(res_all, paste0(work_path, "adata_major_cell_cluster_NatBirth_DEGs.csv"))
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 


--------------------------------------------------------------------------------
/spatial_mapping.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChengxiangQiu/JAX_code/07c2dcec7b222bfbcd5666e5d70a642f0bd0bcb2/spatial_mapping.tar.gz


--------------------------------------------------------------------------------