├── DOUblet_Cluster_Labeling.ipynb
├── DeepTree_algorithm_demo.ipynb
├── Human_fetal_lung_cell_atlas_2022
    ├── MarkerGenes.md
    └── README.md
├── LICENSE
├── Pythonplus.py
├── README.md
├── Scanpyplus.py
├── Soft_integration_limb.ipynb
├── Soft_integration_pancreas.ipynb
└── pandasPlus.py


/Human_fetal_lung_cell_atlas_2022/MarkerGenes.md:
--------------------------------------------------------------------------------
  1 | |name|index|big_cluster|pan-group marker|In-group marker|Known markers|Literature justification|cell_number|Proximal/Distal|P/D significance|
  2 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
  3 | |Mid fibro|1|C0|COL1A2|GNAO1, PCLAF|-| |10505|1.07734901|0.04857607|
  4 | |Alveolar fibro|2|C0|COL1A2|CFD, VEGFD, AFDN, FAM155A, GABRD|FGFR4, CFD|Travaglini et al. 2020; Madissoon et al. 2021|8289|1.07161335|0.00015867|
  5 | |Early fibro|3|C0|COL1A2|HMGA1, SNCG, HIST3H2A|-| |4928|NA|NA|
  6 | |Interm fibro|4|C0|COL1A2|CNTNAP2, ALDH1A1, FMO1, CA3, GGT5|-| |3156|0.73322326|1.77E-18|
  7 | |Adventitial fibro|5|C0|COL1A2|SERPINF1, ENTPD2, RCN3, COL12A1, PDGFRL, IGFBP6|SERPINF1|Travaglini et al. 2020; Madissoon et al. 2021|2403|0.83861085|0.000036|
  8 | |Pericyte|6|C0|COL1A2|NDUFA4L2, TRPC6, RFTN1|PDGFRB, COX4I2|Crapo et al. 1982; Travaglini et al. 2020|2161|1.44470911|1.49E-13|
  9 | |Late airway SMC|7|C0|COL1A2|HHIP, MYH11, ACTG2, CNN1, PLP1, CD9, RAMP1|LGR6, ACTG2|Travaglini et al. 2020|1855|1.38415687|5.9E-12|
 10 | |Mid airway SMC 2|8|C0|COL1A2|HHIP, MYH11, ACTG2, MYL4, ARL4A, RAMP1, CD9|ACTG2|Travaglini et al. 2020|871|1.75181483|0.13338706|
 11 | |MYL4+ SMC|9|C0|COL1A2|MYL4, IL17B, CHCHD7|-| |465|NA|NA|
 12 | |Myofibro 1|10|C0|COL1A2|CT45A1, CT45A2, CT45A3, KCNK17|-| |415|1.62240515|0.00050008|
 13 | |Vascular SMC 1|11|C0|COL1A2|OLFML2B, NTRK3, NTN4|-| |371|1.02246753|0.07369706|
 14 | |Myofibro 2|12|C0|COL1A2|KCNK17, THBD, NOTUM, STC1, TPST2|-| |335|1.32952127|0.00705873|
 15 | |Airway fibro|13|C0|COL1A2|SERPINF1, S100A4, FGF7, TNC, CRYM|-| |309|3.3874159|1.23E-20|
 16 | |Vascular SMC 2|14|C0|COL1A2|NET1, PLN, PDLIM5, FRZB, KCNA5|-| |276|1.06452317|0.08653233|
 17 | |Early mesothelial|15|C0|COL1A2|WT1, TNNT1, UPK3B, PODXL|WT1, PDPN|Buechler et al. 2019|211|NA|NA|
 18 | |Mid airway SMC 1|16|C0|COL1A2|MYH11, ACTG2, HTRA1, RHCG, ADAMTSL2|ACTG2|Travaglini et al. 2020|177|0.50427617|0.38307309|
 19 | |Mesenchymal 3|17|C0|COL1A2|PITX2, GATA4, TECRL, RPRM, CA10, RELN|-| |158|NA|NA|
 20 | |Late mesothelial|18|C0|COL1A2|WT1, TNNT1, UPK3B, PODXL, TM4SF1|WT1, PDPN|Buechler et al. 2019|120|0.36313702|0.00000155|
 21 | |Myofibro 3|19|C0|COL1A2|KCNK17, STC1, SYT1, ETV5|-| |98|1.39484778|0.05453932|
 22 | |Mesenchymal 2|20|C0|COL1A2|PITX2, LINC01082, PITX1, C8orf34, CHST9|-| |97|NA|NA|
 23 | |ACTC+ SMC|21|C0|COL1A2|ACTC1, TES|-| |84|NA|NA|
 24 | |Mid mesothelial|22|C0|COL1A2|WT1, TNNT1, UPK3B, PODXL, TM4SF1|WT1, PDPN|Buechler et al. 2019|73|0.25828779|0.17459369|
 25 | |Mesenchymal 1|23|C0|COL1A2|PITX2, EDN3, PDE1A, KIT, TLL1, ALDH1A3|-| |69|NA|NA|
 26 | |Interm chondrocyte|24|C0|COL1A2|COL9A2, HAPLN1, WFDC2, EPYC, CNMD|SOX9, COL9A2, HAPLN1|Gan et al. 2021; Madissoon et al. 2021|51|2.71308915|0.27972473|
 27 | |ASPN+ chondrocyte|25|C0|COL1A2|COL9A2, HAPLN1, NPPC, RBP4, CLIP2, MIA|SOX9, COL9A2, HAPLN1|Gan et al. 2021; Madissoon et al. 2021|33|49.0981617|0.06404166|
 28 | |Resting chondrocyte|26|C0|COL1A2|COL9A2, HAPLN1, EPYC, CNMD, UCMA, S100A1|SOX9, COL9A2, HAPLN1|Gan et al. 2021; Madissoon et al. 2021|19|20.2168901|0.27972473|
 29 | |Late tip|27|C1|KRT19|SFTPA1, RASGRF1, TNC, PPP1R1B|SOX9, ETV5, ETV4|Nikolić et al. 2017; Miller et al. 2018|2127|0.89795088|0.00350518|
 30 | |Mid tip|28|C1|KRT19|TTYH1, SHISAL2B, LEF1, TTN|SOX9, ETV5, ETV4|Nikolić et al. 2017; Miller et al. 2018|1393|0.96270905|0.50386152|
 31 | |Mid stalk|29|C1|KRT19|LRRC3B|-| |1380|1.8378991|0.39138911|
 32 | |Mid airway progenitor|30|C1|KRT19|CYTL1, COL6A2, CDH2, FAM129A, FRMD3, RP1, MTTP|-| |798|0.96270905|0.216647|
 33 | |Late stalk|31|C1|KRT19|FILIP1|-| |670|1.34903443|0.00013479|
 34 | |Early airway progenitor|32|C1|KRT19|CYTL1, ADM, CRH, MITF, ZMAT4|-| |432|NA|NA|
 35 | |AT2|33|C1|KRT19|LRRK2, ABCA3|SFTPC, CD36, NAPSA, ETV5, SLC34A2, HOPX, LAMP3, ABCA3|Travaglini et al. 2020; Barkauskas et al. 2013; Wang et al. 2020|382|0.89467308|0.05533309|
 36 | |AT1|34|C1|KRT19|ADIRF, MSLN, ICAM1, UPK3B, CAV2, MMP28|HOPX, AQP4, AQP5, CAV1, AGER, RAGE, SPOCK2|Travaglini et al. 2020; Wang et al. 2020|283|1.60693471|0.00015515|
 37 | |Early tip|35|C1|KRT19|TTYH1|SOX9, ETV5, ETV4, TPPP3, TESC, CA2, MFSD2A| |252|NA|NA|
 38 | |Club|36|C1|KRT19|AGR3, FMO2, STEAP4, KDR, SCGB1A1|SCGB1A1, SCGB3A2|Lambrechts et al. 2018; Travaglini et al. 2020; Wang et al. 2020; Carraro et al. 2021; Miller et al., 2020|208|1.01032078|0.08653233|
 39 | |Late airway progenitor|37|C1|KRT19|PCP4, CYTL1, RP1|-| |193|0.93326841|0.08313327|
 40 | |Mid basal|38|C1|KRT19|COL14A1, DLK1, ABI3BP|TP63, KRT5, F3|Miller et al. 2020; Wang et al. 2020; Deprez et al. 2020; Carraro et al. 2021; Travaglini et al. 2020; Rock et al. 2009|143|58.7252522|0.04857607|
 41 | |Early stalk|39|C1|KRT19|MAD2L1, UBE2T, BIRC5, CDC20, CCNB2, ORC6|-| |95|NA|NA|
 42 | |GHRL+ NE precursor|40|C1|KRT19|GHRL[+/low], CFC1[low], NEUROD1[low], ACSL1[low]|-| |94|1.27326036|0.29238481|
 43 | |GHRL+ neuroendocrine|41|C1|KRT19|GHRL, NEUROD1, CFC1, RFX6|GHRL|Cao et al., 2020; Santos et al. 2006; Volante et al. 2002|77|1.0494396|0.18594646|
 44 | |Late basal|42|C1|KRT19|IL33, TP63, KRT15, KRT5[low/-], KRT13[-]|TP63, IL33|Miller et al. 2020; Carraro et al. 2021; Rock et al. 2009|63|5.7084579|0.00000363|
 45 | |Pulmonary NE precursor|43|C1|KRT19|GRP[+/low], ASCL1[low], DPYSL3[low], DDC[low], DLL3[low]|-| |61|3.25487346|0.1048676|
 46 | |Proximal basal|44|C1|KRT19|TP63, KRT15, KRT5[high/+], F3, KRT13[+], IGFBP3, IL33|TP63, KRT5, F3|Miller et al. 2020; Carraro et al. 2021; Travaglini et al. 2020; Rock et al. 2009|57|549.706869|6.13E-16|
 47 | |Proximal secretory 1|45|C1|KRT19|SCGB1A1[low/-]SCGB3A2[+]SCGB3A1[+]|-| |55|39.4710711|0.096808|
 48 | |Intermediate neuroendocrine|46|C1|KRT19|NEUROG3|-| |50|1.8378991|0.39138911|
 49 | |Proximal secretory progenitors|47|C1|KRT19|SCGB3A2, MXPH4, KIAA1324, NDUFA4L2|SCGB3A2|Miller et al. 2020|39|10.5897996|0.50949987|
 50 | |Proximal secretory 2|48|C1|KRT19|SCGB1A1[+]SCGB3A2[+]SCGB3A1[+]|-| |26|58.7252522|0.04857607|
 51 | |MUC5AC+ ASCL1+ progenitor|49|C1|KRT19|MUC5AC, ASCL1|-| |19|NA|NA|
 52 | |SMG basal|50|C1|KRT19|TP63[low/+], KRT15, KRT5[+], KRT13[-/low], SOX9, IL33[-], ACTA2|TP63, ACTA2, SOX9, KRT5|Miller et al. 2020; Yu et al. 2021|15|4.63017211|0.04975534|
 53 | |Squamous|51|C1|KRT19|PSCA, KRT80, CDKN1A, RNF39, SPRR1B, SCEL|SPRR1B, SCEL|Deprez et al. 2020|11|1.66713031|0.19172599|
 54 | |Proximal secretory 3|52|C1|KRT19|SCGB1A1[high/+]SCGB3A2[low/-]SCGB3A1[+]|-| |8|68.3523427|0.03009688|
 55 | |SMG|53|C1|KRT19|LTF[+]SCGB3A2[-]SCGB3A1[+]|LTF|Miller et al. 2020; Yu et al. 2021; Deprez et al., 2020|7|5.33865929|0.09044083|
 56 | |SPP1+ MΦ|54|C2|AIF1/SPI1|RNASE1, SELENOP, LYVE1, LILRB5, CCL2|SPP1, C1QA|Popescu D et al. 2019|2345|0.74024643|9.89E-12|
 57 | |DC2|55|C2|AIF1/SPI1|CD1C, FCER1A, HLA-DOA, PKIB, S100B, CD1E|CD1C, FCER1A, CLEC10A|Villani et al. 2017|1048|0.76120794|0.0000146|
 58 | |S100A12-hi cla. mono.|56|C2|AIF1/SPI1|HLA-DPB1, S100A8, VCAN, FCN1, S100A12, RBP7, SLC11A1, CDA|-| |931|0.86787081|0.01094967|
 59 | |CX3CR1+ MΦ|57|C2|AIF1/SPI1|OLFML2B, METTL27, IGSF21, P2RY12|CXCR1, C1QA|Popescu D et al. 2019|480|0.6443674|0.0000121|
 60 | |Non-cla. mono.|58|C2|AIF1/SPI1|FCGR3A, CD300E, GCH1, GBP2|FCGR3A|Schyns et al. 2019|403|1.05434714|0.06439059|
 61 | |S100A12-lo cla. mono.|59|C2|AIF1/SPI1|FCGBP|-| |327|0.54084246|0.000000364|
 62 | |DC1|60|C2|AIF1/SPI1|C1orf54, CLEC9A, S100B, RGCC, CLNK, RAB7B, CADM1|CLEC9A, XCR1|Villani et al. 2017|323|0.93895312|0.06905945|
 63 | |DC3|61|C2|AIF1/SPI1|S100A8, RNASE2, UPK3A, CES1, MTMR11|CD88-, CD1C, CD163|Bourdely et al. 2020|198|0.84131994|0.06279591|
 64 | |Neutrophil|62|C2|AIF1/SPI1|S100A12, S100A8, CDA, RBP7, PADI4, SLC11A1, VNN2, S100P, FCGR3B, ORM1, G0S2, MCEMP1|ITGAM, ORM1|Borregard N et al. 2007|141|0.71929336|0.03496351|
 65 | |pDC|63|C2|AIF1/SPI1|JCHAIN, GZMB, FAM129C, SERPINF1, MZB1, TPM2, UGCG, LTB, IGKC, CLIC3|GZMB, JCHAIN, SERPINF1|Villani et al. 2017|125|0.70901319|0.03679162|
 66 | |Promonocyte-like|64|C2|AIF1/SPI1|RNASE2, MKI67, NKG7, SMC4, S100A12, MPO|MPO, VCAN, S100A8|Jardine et al. 2021|124|1.048512|0.09912555|
 67 | |Cycling DC|65|C2|AIF1/SPI1|TYMS, LMNB1, CD1C|-| |84|0.91469364|0.11210663|
 68 | |Megakaryocyte|66|C2|AIF1/SPI1|GP9, CAVIN2, GP1BA, CLEC1B, TUBB1, PF4, ITGA2B, PPBP, STOM, RAB27B, LAT, ESAM, GATA1, MYLK|PF4, MYL9|Popescu D et al|61|1.26954859|0.09798098|
 69 | |pre-pDC/DC5|67|C2|AIF1/SPI1|CLEC4C, LILRA4, SIGLEC6, SPI1,S100B, CLEC10A|SIGLEC6, IRF8|Villani et al. 2017, Collin et al. 2018|37|1.14209583|0.15555671|
 70 | |Promyelocyte-like|68|C2|AIF1/SPI1|S100A8, AZU1, MPO, PRTN3, RNASE2, CLEC11A, MS4A3, RNASE3|MPO, CTSG, PRTN3, AZU1|Borregard N et al. 2007|36|0.59770088|0.07682246|
 71 | |Myelocyte-like|69|C2|AIF1/SPI1|PGLYRP1, LCN2, CAMP, S100A12, LTF, CDA, CHI3L1, CD177, S100P, CRISP3, DEFA3, DEFA1B |LCN2, CAMP, LTF|Borregard N et al. 2007|36|0.4835326|0.04932522|
 72 | |aDC 1|70|C2|AIF1/SPI1|CCR7, LAMP3, IL32, BIRC3, TRAF1, ESPTI1, CCL19, ARHGAP22, TFPI2, NCCRP1, IL7R, IDO1|CCR7, CCL19, LAMP3|Zhang et al. 2019; Madissoon et al. 2021|29|0.50907651|0.06404166|
 73 | |GMP|71|C2|AIF1/SPI1|PRSS57, AZU1, PRTN3, MPO, MS4A3, CLEC11A, RNASE2, RNASE3, CTSG, SERPINB10, ELANE, DEFA4|PRSS57, CALR, AZU1,ELANE|Pellin D et al, Kwok I et al|29|1.03098629|0.17459369|
 74 | |HSC|72|C2|AIF1/SPI1|SMIM24, C1QTNF4, CD34, AC084033.3, PRSS57, SPINK2, ANGPT1|MLLT3, HLF, SPINK2, CD34, PRSS57|Popescu D et al. 2019, Calvanese V et al. 2019|29|1.8205686|0.0743501|
 75 | |MEP|73|C2|AIF1/SPI1|AC084033.3, GATA2, PRSS57, GATA1|GATA1, GATA2, KLF1|Popescu D et al. 2019|28|1.03619829|0.17712315|
 76 | |CXCL9+ MΦ|74|C2|AIF1/SPI1|CXCL10, GBP4, GBP1, STAT1, APOL3, CXCL11, GBP5|-| |23|0.88314632|0.18065129|
 77 | |Platelet|75|C2|AIF1/SPI1|HIST1H4H, SOCS2, KEL, GADD45A, HIST1H2AK, HIST1H4B|CLK1, CLCN7|Schwertz et al. 2006, Lewandrowski et al. 2009|21|1.20041499|0.19291972|
 78 | |HSC/ELP|76|C2|AIF1/SPI1|SPINK2, HOPX, PRSS57, PTPRCAP, CD34, SMIM24, MZB1, C1QTNF4, IGHM|CD34|Pellin et al. 2019|21|2.38309946|0.06404166|
 79 | |aDC 2|77|C2|AIF1/SPI1|LAMP3, CD1C, CD1E, TRAF1, CCL22, IL4I1, CXCL9|-| |14|2.37155157|0.09699879|
 80 | |APOE+ MΦ2|78|C2|AIF1/SPI1|ACP2, PLA2G7, CD82, APOE, GPNMB, LMNA, APOC1, KCNMA1, NR1H3|TREM2, APOE|Cochain et al. 2018, Williams et al. 2020, Evren et al. 2021|13|2.20491428|0.15555671|
 81 | |Eosinophil|79|C2|AIF1/SPI1|HDC, GATA2, SLC45A3[high], AC084033.3, CD82, PRSS57, CNRIP1, NFE2, CPA3, GFI1B, RHEX|CLC, IL5RA|Lee et al. 2012|13|0.60615014|0.17459369|
 82 | |APOE+ MΦ1|80|C2|AIF1/SPI1|APOE, APOC1, MMP9, TIMD4, CETP, PLA2G2D, PLA2G7, PTGDS, NR1H3|TREM2, APOE|Cochain et al. 2018, Williams et al. 2020, Evren et al. 2021|10|1.27326036|0.29238481|
 83 | |CMP|81|C2|AIF1/SPI1|DLK1, C1QTNF4, TRH, IGLL1, SMIM24, PRSS57, MPO, CPA3, CD34, SPINK2, ANGPT1, PTPRCAP|SPINK2, CD34, CTSG, PRTN3|Kwok I et al. 2020, Smith SL et al. 2020, Pellin D et al. 2019, Velten L et al. 2017|7|0.39640961|0.17913329|
 84 | |Basophil|82|C2|AIF1/SPI1|CLC, CPA3, HDC, MS4A3, CA8, S100P, HCAR3, IL5RA, GCSAML, GATA2, RHEX|CCR3, IL3RA|Santos et al. 2016|7|1.27326036|0.29238481|
 85 | |Mid cap|83|C3|CDH5|FAM84A, TIAM1, MYOC, DIXDC1, LIMD1|CA4|Schupp et al. 2021; Wang et al. 2020; Travaglini et al. 2020; Gillich et al. 2020|1020|1.04155278|0.05986836|
 86 | |Definitive erythrocyte|84|C3|CDH5|HBB, HBG2, HBG1, ALAS2|HBB| |897|0.75543211|0.0000561|
 87 | |Lymphatic endo|85|C3|CDH5|PTX3, PRSS23, STAB2, RELN, LOX, CYTL1, PDE2A|PDPN, PROX1|Lambrechts et al. 2018; Schupp et al. 2021|812|0.8063696|0.00243739|
 88 | |Late cap|86|C3|CDH5|IL7R|CA4|Schupp et al. 2021; Wang et al. 2020; Travaglini et al. 2020; Gillich et al. 2020|804|0.98480643|0.06279591|
 89 | |Venous endo|87|C3|CDH5|ACKR1, NUAK1, CDH11, PTGDS, SELP, FAM84B, HDAC9, DUSP23, CHODL|ACKR1|Travaglini et al. 2020; Schupp et al. 2021|324|0.95037454|0.07129931|
 90 | |Definitive reticulocyte|88|C3|CDH5|TSPO2, ACSL6, MOSPD1, RSAD2, CHST2, ARG1|UCP2|Flachs et al. 2007|300|0.81470347|0.03837112|
 91 | |Early cap|89|C3|CDH5|TUBB2B, TPM2, THY1, CD24, HAPLN1|CA4|Schupp et al. 2021; Wang et al. 2020; Travaglini et al. 2020; Gillich et al. 2020|265|NA|NA|
 92 | |Primitive erythrocyte|90|C3|CDH5|MT1G, MT1F, MT1E, CTSE, BEX1, SOD3, MAP1A|-| |223|NA|NA|
 93 | |GRIA2+ arterial endo|91|C3|CDH5|GRIA2, ATP13A3, APOA1, AIF1L, DUSP4|GJA5|Travaglini et al. 2020; Schupp et al. 2021|170|1.3381806|0.03985821|
 94 | |Intermediate lymphatic endo|92|C3|CDH5|HS3ST1, KLHL4, LYPD6, AL583785.1, MFAP4, GUCY1A1|-| |134|0.76782868|0.05453932|
 95 | |Aerocyte|93|C3|CDH5|TBX2, S100A3, SPON2, RGS9, GRID1|EDNRB, APLN|Gillich et al. 2020; Travaglini et al. 2020|128|1.9008478|0.00084655|
 96 | |SCG3+ lymphatic endothelial|94|C3|CDH5|SCG3, SBSPON, TCTN3, ABCA4, GJD3, CCDC3, NEO1|SCG3+|Takeda et al. 2019|93|0.70888746|0.05453932|
 97 | |HMOX1+ primitive erythroblast|95|C3|CDH5|HMOX1, UCA1, CDKN1A, FDXR, GDF15|-| |82|NA|NA|
 98 | |Cycling definitive erythroblast|96|C3|CDH5|HEMGN, GATA1, MKI67|-| |80|0.73457894|0.11253951|
 99 | |Definitive erythroblast|97|C3|CDH5|CPEB4, LPIN2, TRAK2, FHDC1, RIPOR3, ATG4D|XPO7, USP7|Hattangadi et al. 2014; Liang et al. 2019|43|1.01311267|0.15555671|
100 | |Arterial endo|98|C3|CDH5|SERPINE2, DKK2, NSG1, RNLS, LINC00440|GJA5|Travaglini et al. 2020; Schupp et al. 2021|42|1.50086939|0.08653233|
101 | |OMD+ endo|99|C3|CDH5|OMD, CXCL2, ITPR2, HEY2, SFRP4, FBN2, BMP4|-| |27|2.72541577|0.03006071|
102 | |Pulmonary venous endo|100|C3|CDH5|PTHLH, RSPO3, FBLN2, CDH23, CDH11, CYP1B1|PTGS1|Schupp et al. 2021|13|10.5897996|0.00840089|
103 | |Intermediate NK|101|C4|CD247|GZMK, GZMB, GNLY, S1PR5, CD52|-| |1080|0.9095305|0.02791797|
104 | |CD56bright NK|102|C4|CD247|GZMK, TOX2, CCL3L1, CXCR6, KRT86, ITM2C,|GZMK, XCL1, NKG7|Bratke et al. 2005, Smith SL et al. 2020|752|0.87409354|0.02070128|
105 | |CD16+ NK|103|C4|CD247|SPON2, FGFBP2, GZMB, GZMH, ADGRG1, CX3CR1, GNLY, S1PR5, LAIR2, MYOM2, PRSS23, KIR2DL3, FCRL6, KIR3DL1, KIR2DL1|PRF1, GZMH, FCGR3A, NKG7|Smith SL et al. 2020|687|1.00280999|0.06404166|
106 | |CD4 T|104|C4|CD247|CD4, CCR7, LRRN3, CD40LG, CHI3L2, ARMH1, BACH2, CSGALNACT1, PASK|CD40LG|Park JE et al. 2020|638|1.21524528|0.00699579|
107 | |CD8 T|105|C4|CD247|CD8B, CCR7, LINC02446, CPNE2, A1BG, FBLN2, S100B, NT5E|CD8A, CD8B|Park JE et al. 2020|405|1.33058729|0.00350518|
108 | |Treg|106|C4|CD247|CTLA4, FOXP3, NCF4, RTKN2, IL2RA, DUSP4, AC133644.2, RGPD2, RGPD1, CCR4, TNFRSF4|TIGIT, FOXP3|Park JE et al. 2020|253|1.08395704|0.07129931|
109 | |Th17|107|C4|CD247|LST1, CCR6, BLK, SLAMF1, GPR25, IL4I1, RORC, IL23R, TRDV2, SCART1, ADAM19, IL23A|IL4I1, CCR6|Santarlasci et al. 2014; Wang et al. 2009|167|1.30121434|0.04410283|
110 | |ILC2|108|C4|CD247|HPGDS, PTGDR2, KRT1, TMEM273, IL9R, BACE2, TNFRSF18, IL2RA, IL1RL1, CD84, PLAGL1|GATA3, MAF, PTGDR2, HPGDS|Mazzurana et al. 2021|149|0.7944453|0.05986836|
111 | |NKT2|109|C4|CD247|CXCR3, SIRPG, ZNF683, IFNG-AS1, TRDV2, DBN1, CPNE2, PDCD1, TRG-AS1, AC004585.1|KLRC2|Cohen et al. 2013|147|0.81665964|0.06404166|
112 | |ILC3|110|C4|CD247|CCR6, IL4I1, LST1, RORC, IL23R, KIT, CA2, SLC16A3, SCART1, GPR25, ADAM12, TNFSF13B, TTC39C-AS1, TOX2|IL17A, RORC, ZBTB16|Zhong et al. 2016, Cumano A et al 2019|135|1.45254145|0.0294878|
113 | |NKT1|111|C4|CD247|IL23R, SLC4A10, TRDV2, CXCR6, RORC, BLK, CCR6|KLRB1|Liao et al. 2013|123|0.51801199|0.00078683|
114 | |ILCP|112|C4|CD247|LINC00299, IL4I1, RORC, HPN, KIT, SCN1B, JAG1, IL1R1, PEG10, CXCR5, ADAM12, BCAS1|SCN1B, HPN|Elmentaite et al. 2021|80|0.92962627|0.13198559|
115 | |Cycling NK|113|C4|CD247|TYMS, MKI67, CDK1, TOP2A, NKG7|-| |64|1.42296836|0.07085423|
116 | |Cycling T|114|C4|CD247|TYMS, MKI67, CDK1, TOP2A, CD52|-| |46|1.14521314|0.13472932|
117 | |Activated NK|115|C4|CD247|TNFRSF9, CRTAM, TNFRSF18, RAMP1, CD82|TNFRSF9|Marvel et al. 2010|40|0.96270905|0.15555671|
118 | |Tαβ_Entry|116|C4|CD247|CD1E, ARPP21, CD1B, MZB1, RAG1, CCR9, RAG2, AL138899.1, AL365440.2, CD1A, CD1C, TSHR, GALNT7|CCR9, RAG1, RAG2|Park JE et al. 2020|12|2.82601689|0.09044083|
119 | |Late pre-B|117|C5|VPREB3|IGLV1-40, BACH2, IGLL5|CD19, IGLL5, VPREB1|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|594|0.87606214|0.03009688|
120 | |CD5- Mature B|118|C5|VPREB3|SLC2A3, MYC, LINC00926, CD83, SELL, GPR183, TRBC2, CD1C, DUSP2, NR4A2, CXCR5|CD19, MS4A1, IGHM, IGHD|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|555|0.99808976|0.06404166|
121 | |Large pre-B|119|C5|VPREB3|IGLL5, LMNB1, TYMS, TCL1A, PTPN6|CD19, MS4A1, IL7R, SPIB|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|546|1.03276448|0.06404166|
122 | |λ small pre-B|120|C5|VPREB3|ACSM3, AMN, DEPP1, BMP3, IGF2-1, CEBPD, LINC00472, AC108879.1, RAG1, VPREB1|CD19, RAG1, VPREB1|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|542|0.98066334|0.06404166|
123 | |Immature B|121|C5|VPREB3|LTB, MS4A1, IGHD, LTB, LIMS2, TNFRSF18|CD19, MS4A1, IGHM|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|526|0.981287|0.06404166|
124 | |κ small pre-B|122|C5|VPREB3|TNFRSF17, AL133467.1, CDHR3, RAG1, VPREB1|CD19, RAG1, IGLL5, VPREB1|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|453|1.05180753|0.06404166|
125 | |Late pro-B|123|C5|VPREB3|TYMS[-], CD9, NEIL1, FCMR, RUBCNL, TRGV9, H1F0, DEPP1, SMAD1, CD27|CD19, IL7R, CD34, DNTT|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|452|0.76481239|0.00350518|
126 | |CD5+ CCL22- mature B|124|C5|VPREB3|ISG20, BANK1, JCHAIN, FCRL1, CLECL1, TRAC, CCL22[-]|CD19, MS4A1, IGHM, IGHD, CD5,|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|218|1.16879356|0.06279591|
127 | |Pro-B|125|C5|VPREB3|DNTT, SOCS2, LMNB1, TYMS, EGFL7, ERG, LINC01013|CD19, IL7R, CD34, DNTT, MKI67|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|135|0.93443566|0.09379346|
128 | |CD5+ CCL22+ mature B|126|C5|VPREB3|CCND2, TYW3, CCL22, MIR155HG, CCR10|-| |61|1.29467769|0.09224432|
129 | |Pro-B/Pre-B transition|127|C5|VPREB3|LTB, LINC01013, LINC01781, CDC25B, CCDC191, IGLL5|CD19, MS4A1 low, RAG1, IGLL5, VPREB1|Burrows et al 2020, Clark et al. 2013, Matthias et al. 2005, Rickert et al. 2013|56|1.19283871|0.11297547|
130 | |Ciliated|128|C6|DNAH12|FOXJ1, ALOX15|FOXJ1, MYB|Deprez et al. 2020; A. Wang et al. 2020; Carraro et al. 2021|1339|1.11510041|0.01168299|
131 | |MUC16+ ciliated|129|C6|DNAH12|MUC16, FOXJ1[low/+]|-| |218|8.54240928|4.69E-33|
132 | |Deuterosomal|130|C6|DNAH12|MCIDAS, CDC20B|MCIDAS, HES6, CDC20B|Deprez et al. 2020|84|1.14938394|0.10192084|
133 | |Schwann precursor|131|C7|DLX1|POSTN, HEYL, TYMS, LINC01198, INSC, SLC15A3, CTH, NOTCH1|MPZ, PLP1, S100B (low)|Woodhoo et al. 2008|192|NA|NA|
134 | |MFNG+ DBH+ neuron|132|C7|DLX1|DBH, SRRM4, MFNG, DDC, IGFBPL1, ENO3, NFASC, VWDE, SEZ6, IL17B, THBS4, FNDC5, PIPOX|-| |70|NA|NA|
135 | |TM4SF4+ CHODL+ neuron|133|C7|DLX1|TAC3, NRSN1, TM4SF4, AKAP6, NDUFA4L2, MAB21L2, CHODL, NXPH4 |-| |58|NA|NA|
136 | |TM4SF4+ PENK+ neuron|134|C7|DLX1|PENK, TMOD1, ISL1, MAPT, PTGER3, TAC3, TTC9B, PLPPR5, SLC5A7, ENTPD3, PLPP4, MIR7-3HG, CELF4|-| |49|NA|NA|
137 | |Late Schwann|135|C7|DLX1|TMEM176A, TMEM176B, OLFML2A, EGFL8, AHNAK, ENTPD2, COL4A1, LAMB1, SCN7A, TSPAN11, GFRA3, MATN2, RXRG, FST, HSPG2, COL5A3, PLEKHA4, MIA, MAL|-| |47|7.06769328|0.000039|
138 | |Mid Schwann|136|C7|DLX1|APOA1, MBP, FAM198B, RELN, ITIH5, FAM19A5, SEMA3B, EGFL8, LAMA2, GFRA3, HSPG2, LAMB1, MATN2, COL14A1, OLFML2B, PLEKHA4, MAL, ABCA8|-| |46|49.0981617|0.06404166|
139 | |COL20A1+ Schwann|137|C7|DLX1|COL20A1, ITM2A, ACTC1, KLF4, CYTL1, P3H2, ANGPT1, FRAS1, ALDH1A3, TGFBI, TFAP2C, GRID2|COL20A1|Castro et al. 2020|39|39.4710711|0.096808|
140 | |SST+ neuron|138|C7|DLX1|DPYS,LY6H, SYT4, NRCAM, PPP2R2B, SST, ACHE, SNCG, VAT1L, ARHGDIG|-| |36|NA|NA|
141 | |Early Schwann|139|C7|DLX1|ASCL1, HAND2-AS1, ALDH1A1, OLFML2A|-| |33|NA|NA|
142 | |KCNIP4+ neuron|140|C7|DLX1|PTPRR, KCNIP4, MAP7, TSPAN7, AC092691.1, PCBP3, PLPPR2, CNGB1, DLX3, CHRM2, AK1, NWD2, PDE2A|KCNIP4|Pruunsild et al. 2005|30|20.2168901|0.27972473|
143 | |PCP4+ neuron|141|C7|DLX1|PCP4, ACHE, LY6H, TMEM130, SST, SV2C, SYT4, NTNG1, AC073050.1, FRY, HOXD1|-| |27|NA|NA|
144 | |FGFBP2+ Neural progenitor|142|C7|DLX1|ADM, FGFBP2, RAPSN, METTL7A, BCL11A, EPAS1, DOK4, DGKB, DDC, SYN2, CCSER1, VWDE, PLCD4|NFIX|Heng et al. 2014|15|0.96270905|0.50386152|
145 | |Proliferating Schwann|143|C7|DLX1|APOA1, MIA, MAL, PLEKHA4, CENPM, GFRA3, KIFC1, CDK1, COL5A3|-| |7|NA|NA|
146 | |Mast|144|C8|TPSD1|TPSB2, TPSAB1, AL157895.1, CPA3, HDC, TPSD1, SLC18A2, VWA5A, SLC45A3|KIT, TPSAB1, HPGDS, HDC|Popescu D et al. 2019|22|2.04745165|0.07751663|
147 | 


--------------------------------------------------------------------------------
/Human_fetal_lung_cell_atlas_2022/README.md:
--------------------------------------------------------------------------------
 1 | # ⚠️⚠️Repo MOVED to [HERE](https://github.com/Peng-He-Lab/Human_fetal_lung_cell_atlas_2022) ⚠️⚠️!
 2 | # ⚠️⚠️Updated Folder [HERE](https://github.com/Peng-He-Lab/Human_fetal_lung_cell_atlas_2022)⚠️⚠️!
 3 | # ⚠️⚠️Newer version [HERE](https://github.com/Peng-He-Lab/Human_fetal_lung_cell_atlas_2022) ⚠️⚠️!
 4 | 
 5 | ![image](https://github.com/brianpenghe/python-genomics/assets/4110443/26eecd70-732a-4d07-a23c-a7699f687919)
 6 | # Processed data
 7 | ## [Human lung cell atlases](https://www.lungcellatlas.org/)
 8 | ### [Human fetal lung cell atlas](https://fetal-lung.cellgeni.sanger.ac.uk/) ([Tutorial](https://youtu.be/3BZdofyr6us?feature=shared))
 9 | #### [scRNA-seq](https://fetal-lung.cellgeni.sanger.ac.uk/scRNA.html)
10 | #### [scATAC-seq](https://fetal-lung.cellgeni.sanger.ac.uk/atac) 
11 |     [cellranger-atac output files](https://urldefense.proofpoint.com/v2/url?u=https-3A__drive.google.com_drive_folders_1AOfYN7fl31XzxVqDjMA0OAE9PpVMtOa4-3Fusp-3Dsharing&d=DwMFaQ&c=D7ByGjS34AllFgecYw0iC6Zq7qlm8uclZFI0SqQnqBo&r=jnOclbqDO8gTjG2ALkLiP8QIqLRquEVwWdtJRbxgXwQ&m=CrCPk7IFAhbpTuzwLiJOGATAuwNr00RsDhlXZQYUwp0PZkwrkNp6IzSs3bGi_-WT&s=GQoOWAXqOaG0J66pOVnkjM0-TOuGj3Hr5bR6OLbRwkc&e=)
12 |     
13 | &nbsp;&nbsp;&nbsp;&nbsp;[ATAC-seq tracks](https://genome.ucsc.edu/s/brianpenghe/scATAC_fetal_lung20211206)
14 | #### [*in situ* staining Images](https://fetal-lung.cellgeni.sanger.ac.uk/figures.html)
15 | #### [Visium spatial transcriptomics](https://fetal-lung.cellgeni.sanger.ac.uk/visium.html)
16 | Alternatively, you can find our scRNA-seq data on [cellxgene.cziscience](https://cellxgene.cziscience.com/collections/2d2e2acd-dade-489f-a2da-6c11aa654028)
17 | 
18 | # Our marker gene table
19 | ### [Marker genes for 144 cell states](https://github.com/brianpenghe/python-genomics/blob/master/Human_fetal_lung_cell_atlas_2022/MarkerGenes.md)
20 | 
21 | # Raw data (fastq)
22 | ## [BioStudies/ArrayExpress](https://www.ebi.ac.uk/biostudies/arrayexpress/studies?query=high-resolution%2Bsingle-cell%2Bmultiomic%2Batlas%2Bof%2Bthe%2Bhuman%2Bfetal%2Blung)
23 | #### [Fetal lung scRNA-seq and scVDJ](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11278?accession=E-MTAB-11278)
24 | #### [Organoid scRNA-seq](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11267?accession=E-MTAB-11267)
25 | #### [Fetal lung Visium scRNA-seq](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11265?accession=E-MTAB-11265)
26 | #### [Fetal lung scATAC-seq](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11266?accession=E-MTAB-11266)
27 | ## [ENA](https://www.ebi.ac.uk/ena/browser/text-search?query=high-resolution%20single-cell%20multiomic%20atlas%20of%20the%20human%20fetal%20lung)
28 | 
29 | # Citation
30 | ![image](https://user-images.githubusercontent.com/4110443/210624020-e668e3e6-5df8-4afd-a8f2-748a50c7fee6.png)
31 | #### [link to the paper](https://www.cell.com/cell/fulltext/S0092-8674(22)01415-5)
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2021, brianpenghe
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/Pythonplus.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | 
def CheckSource(Function2Check):
    """Print the source code of ``Function2Check`` to standard output.

    The source lines are fetched with ``inspect.getsourcelines`` and joined
    into a single string before printing. Nothing is returned.
    """
    source_lines, _first_lineno = inspect.getsourcelines(Function2Check)
    source_text = "".join(source_lines)
    print(source_text)
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ⚠️⚠️Repo MOVED to [HERE](https://github.com/Peng-He-Lab/ScanpyPlus) ⚠️⚠️!
  2 | # ⚠️⚠️Updated Folder [HERE](https://github.com/Peng-He-Lab/ScanpyPlus)⚠️⚠️!
  3 | # ⚠️⚠️Newer version [HERE](https://github.com/Peng-He-Lab/ScanpyPlus) ⚠️⚠️!
  4 | 
  5 | # python-genomics
  6 | A set of files to do genomics analysis in python
  7 | 
  8 | To use any of these script collections, just run these two lines in your *python kernel / Jupyter notebook*:
  9 | ```
 10 | sys.path.append('/home/ubuntu/tools/python-genomics')
 11 | import Scanpyplus
 12 | ```
 13 | ## Citation:
 14 | [He, Lim and Sun et al.](https://www.cell.com/cell/fulltext/S0092-8674(22)01415-5)
 15 | A human fetal lung cell atlas uncovers proximal-distal gradients of differentiation and key regulators of epithelial fates
 16 | 
 17 | ## DeepTree feature selection
 18 | ![image](https://user-images.githubusercontent.com/4110443/146441826-a4079e4c-c9de-4d93-9ebe-3e1c07227eb1.png)
 19 | 
 20 | 
 21 | Among the functions in *Scanpyplus*, there's also a function to do feature gene selection (the *DeepTree* algorithm). It removes garbage among highly variable genes, mitigates batch effects if you remove garbage batch by batch, and increases the signal-to-noise ratio of the top PCs to promote rare cell type discovery.
 22 | 
 23 | [Here](https://nbviewer.jupyter.org/github/brianpenghe/python-genomics/blob/master/DeepTree_algorithm_demo.ipynb) is a [notebook](https://github.com/brianpenghe/python-genomics/blob/master/DeepTree_algorithm_demo.ipynb) to use DeepTree algorithm to "de-noise" highly-variable genes and improve initial clustering. 
 24 | 
 25 | A *MATLAB* implementation can be found [here](https://github.com/brianpenghe/Matlab-genomics).
 26 | 
 27 | This algorithm can potentially be used to reduce batch effects when overcorrection is a concern, especially when comparing conditions or time points. Two notebooks are provided showing "soft integration" of [fetal limb](https://nbviewer.jupyter.org/github/brianpenghe/python-genomics/blob/master/Soft_integration_limb.ipynb) and [pancreas](https://nbviewer.jupyter.org/github/brianpenghe/python-genomics/blob/master/Soft_integration_pancreas.ipynb) data.
 28 | 
 29 | ## Doublet Cluster Labeling (DouCLing)
 30 | ![unnamed](https://user-images.githubusercontent.com/4110443/146441371-e7b4bec2-9e87-4a9d-98ad-3f3401ce13ed.jpg)
 31 | 
 32 | There are 4 types of doublets:
 33 | 
 34 | ![image](https://user-images.githubusercontent.com/4110443/146040113-1c1b27e6-453e-48fa-a4e8-786ff8c759ec.png)
 35 | 
 36 | Cross-sample doublets can usually be identified by hashtags or genetic backgrounds. Theoretically, (n-1)/n of doublets can be identified as cross-sample doublets when n samples with different hashtags/genetics are pooled equally.
 37 | 
 38 | Heterotypic doublets can sometimes trick data scientists into thinking they are a new type of dual-feature cell type like NKT cells etc. 
 39 | Heterotypic doublets are usually identified by matching individual cells to synthetic doublets regardless of manually curated clusters. Algorithms like Scrublet can remove a substantial part of doublet cells but not all of them. The surviving doublets can still aggregate into tiny clusters picked up by the annotators when doing subclustering. Doublets of rarer cell types are also often missed, which obscures the discovery of new cell types and states.
 40 | To leverage the input from biologists' manual parsing and the increased sensitivity of cluster-average signatures, I introduce here an alternative approach to facilitate heterotypic doublet cluster identification. This approach scans through individual tiny clusters and, for each one, looks for the "Parent 2" that gives it a unique feature distinguishing it from its sibling subclusters sharing the same "Parent 1".
 41 | [A notebook using published PBMC data](https://nbviewer.jupyter.org/github/brianpenghe/python-genomics/blob/master/DOUblet_Cluster_Labeling.ipynb) is provided.
 42 | 
 43 | <details>
 44 |   <summary><b>Other functions in Scanpyplus:</b></summary>
 45 | 
 46 | ### An alternative way to call doublet subclusters based on *Scrublet* and [the gastrulation paper](https://www.nature.com/articles/s41586-019-0933-9)
 47 | `Bertie(adata,Resln=1,batch_key='batch')` was written with the help from [K. Polanski](https://github.com/ktpolanski). This script aggregates *Scrublet* scores from subclusters and makes threshold cuts based on subcluster p-values. And this is done batch by batch.
 48 | 
 49 | A variant version `Bertie_preclustered` allows users to use user-defined clusters to calculate p-values. This is also done batch by batch.
 50 | 
 51 | ### Manipulating colors:
 52 | You can extract the color dict of a variable from an anndata object using `ExtractColor(adata,obsKey='louvain',keytype=int)`, 
 53 | 
 54 | and manipulate the color dict using `UpdateUnsColor`. 
 55 | 
 56 | You can also cherry pick a value of a variable and make it white using `MakeWhite`.
 57 | 
 58 | ### Manipulating obs (observation) names and metadata:
 59 | You can plot sankey graph between two variables of an anndata object using `ScanpySankey`. 
 60 | 
 61 | Re-ordering the cluster IDs based on relationship rather than size can be done by `orderGroups`.
 62 | 
 63 | `remove_barcode_suffix` removes the suffix after the '-' in the cell (barcode) name.
 64 | 
 65 | `CopyMeta` copies the metadata (both obs and var) from one object to another.
 66 | 
 67 | `AddMeta` stores a dataframe of obs values per each cell into an object.
 68 | 
 69 | `AddMetaBatch` reads a dataframe of obs values per batch into an object. This format of metadata (rows are batch names, columns are obs categories) is more common, compact and human-readable, and is usually stored in *Excel* spreadsheets.
 70 | 
 71 | ### Manipulating var (variable) names metadata:
 72 | `OrthoTranslate` translates mouse genes to human orthologs and filters out poorly conserved genes, based on an ortholog table that can be derived from Biomart etc.
 73 | 
 74 | ### Converting file types:
 75 | `file2gz` creates .gz files which is useful for creating artificial 10X files.
 76 | 
 77 | `Scanpy2MM` saves an *anndata* into *MatrixMarket* form.
 78 | 
 79 | `mtx2df` reads *MatrixMarket* files into a dataframe.
 80 | 
 81 | ### Manipulating matrix:
 82 | Transfer the raw layer to the default layer by `GetRaw` and calculate integer raw counts based on `n_counts` 
 83 | 
 84 | and log-transformed counts using `CalculateRaw`.
 85 | 
 86 | For large matrices, cells can be `DownSample`d based on labels such as cell types.
 87 | 
 88 | Sometimes `PseudoBulk` profiles are also useful to generate, whether it's the mean, median or max.
 89 | 
 90 | ### Manipulating obsm embedding coordinates:
 91 | `ShiftEmbedding` creates a platter that juxtaposes subsets of the data (batches, stages etc.) to visualize side by side.
 92 | 
 93 | `CopyEmbedding` copies the embedding of one object to another.
 94 | 
 95 | ### Plotting stacked barplots of cell-type/condition proportions:
 96 | `celltype_per_stage_plot` and `stage_per_celltype_plot` plot horizontal and vertical bar plots respectively based on two metadata variables (cell type and stage, for example).
 97 | 
 98 | ### Plot 3D UMAP:
 99 | `Plot3DimUMAP` generates a 3D plot (by *plotly*) of the UMAP after sc.tl.umap produces the 3D coordinates.
100 | 
101 | 
102 | ### Gene-level calculation and plotting:
103 | `DEmarkers` calculates, filters and plots differentially expressed genes between two populations.
104 | 
105 | `GlobalMarkers` calculates marker genes for every cell cluster and filters them.
106 | 
107 | `ClusterGenes` transposes a log-transformed *adata* object and performs clustering and dimension reduction to classify genes.
108 | 
109 | `Dotplot2D` plots the expression levels of a gene across two metadata categories (e.g. samples and cell types). It can be used to trace maternal contamination by plotting XIST, and to check a key gene's expression patterns against cell types, age etc.
110 | 
111 | ### Plotting Seaborn plots:
112 | `snsSplitViolin` plots splitviolin plots for two populations.
113 | 
114 | `snsCluster` plots clustermaps using an *anndata object* as input. This has been helped by Bao Zhang from [Zhang lab](https://github.com/ZhangHongbo-Lab)
115 | 
116 | `markSeaborn` marks specific genes on a *Seaborn* plot.
117 | 
118 | `extractSeabornRows` extracts the rowlabels of a *Seaborn object* and saves into a *Series*.
119 | 
120 | ### Plotting Venn diagram:
121 | `Venn_Upset` can be used to directly plot upset plots (bar plots of each category of intersections).
122 | 
123 | ### Label transfer:
124 | `LogisticRegressionCellType` can learn the defining features of a variable (such as cell type) of the reference object and predict the corresponding labels of a query object. 
125 | 
126 | The saved model files can also be re-used to predict a new query object in the future by `LogisticPrediction`.
127 | </details>
128 | 
129 | <details>
130 |   <summary><b>Functions in pandasPlus:</b></summary>
131 | 
132 | `DF2Ann` converts a dataframe into an *anndata* object.
133 | 
134 | `UpSetFromLists` plots an upset plot (barplot of Venn diagram intersections) based on lists of lists.
135 | 
136 | `show_graph_with_labels` plots an interaction graph using edges to represent connection strength (max at 1, at least 0.9 to be shown).
137 | 
138 | Dataframe values can also be used to calculate `zscore` and `Ginni` coefficients.
139 | 
140 | `cellphonedb_n_interaction_Mat` and `cellphonedb_mat_per_interaction` are useful to reformat cellphonedb outputs.
141 | </details>
142 | 


--------------------------------------------------------------------------------
/Scanpyplus.py:
--------------------------------------------------------------------------------
  1 | import gc
  2 | #import scrublet as scr
  3 | import scipy.io
  4 | from scipy import sparse
  5 | import matplotlib.pyplot as plt
  6 | from matplotlib import rcParams
  7 | import seaborn as sns
  8 | import numpy as np
  9 | import random
 10 | import sys
 11 | from sklearn.manifold import TSNE
 12 | from sklearn.preprocessing import scale
 13 | from sklearn.decomposition import TruncatedSVD
 14 | from sklearn.cluster import SpectralClustering
 15 | from sklearn.model_selection import cross_val_score, StratifiedKFold
 16 | from sklearn.linear_model import LogisticRegression
 17 | from sklearn.feature_extraction.text import TfidfTransformer
 18 | import joblib
 19 | 
 20 | import numpy as np
 21 | import pandas as pd
 22 | import scanpy as sc
 23 | import anndata
 24 | import os
 25 | from scipy import sparse
 26 | from scipy import cluster
 27 | from glob import iglob
 28 | import gzip
 29 | 
 30 | def ExtractColor(adata,obsKey='louvain',keytype=int):
 31 | #   labels=sorted(adata.obs[obsKey].unique().to_list(),key=keytype)
 32 |     labels=adata.obs[obsKey].cat.categories
 33 |     colors=adata.uns[obsKey+'_colors']
 34 |     return dict(zip(labels,colors))
 35 | 
 36 | def UpdateUnsColor(adata,ColorDict,obsKey='louvain'):
 37 |     #ColorDict is like {'Secretory 3 C1': '#c0c0c5','Secretory 4 C1': '#87ceee'}
 38 |     ColorUns=ExtractColor(adata,obsKey,keytype=str)
 39 |     ColorUns.update(ColorDict)
 40 |     adata.uns[obsKey+'_colors']=list(ColorUns.values())
 41 |     return adata
 42 | 
 43 | def Plot3DimUMAP(adata,obsKey='leiden',obsmKey='X_umap'):
 44 |     #Make sure adata.obsm['X_umap'] contains three columns
 45 |     #The obsKey must points to a str/categorical variable
 46 |     import plotly.express as px
 47 |     ThreeDdata=pd.DataFrame(adata.obsm[obsmKey],index=adata.obs_names,columns=['x','y','z'])
 48 |     ThreeDdata[obsKey]=adata.obs[obsKey]
 49 |     fig=px.scatter_3d(ThreeDdata,x='x',y='y',z='z',color=obsKey,opacity=1,
 50 |                  color_discrete_map=ExtractColor(adata,obsKey,str))
 51 |     fig.update_traces(marker=dict(size=2, 
 52 |                               line=dict(width=0,
 53 |                                         color='black')))
 54 |     return fig
 55 | 
 56 | def ScanpySankey(adata,var1,var2,aspect=20,
 57 |                 fontsize=12, figureName="cell type", leftLabels=['Default'],
 58 |     rightLabels=['Default']):
 59 |     from pysankey import sankey
 60 |     colordict={**ExtractColor(adata,var1,str),
 61 |                **ExtractColor(adata,var2,str)}
 62 |     if 'Default' in leftLabels:
 63 |         leftLabels=sorted(adata.obs[var1].unique().tolist())
 64 |     if 'Default' in rightLabels:
 65 |         rightLabels=sorted(adata.obs[var2].unique().tolist())
 66 |     return sankey(adata.obs[var1],adata.obs[var2],aspect=aspect,colorDict=colordict,
 67 | fontsize=fontsize,figureName=figureName,leftLabels=leftLabels,rightLabels=rightLabels)
 68 | 
 69 | def iRODS_stats_starsolo(samples):
 70 |     #samples should be a list of library IDs
 71 |     qc = pd.DataFrame(0, index=samples, columns=['n_cells', 'median_n_counts'])
 72 |     for sample in samples:
 73 |         #download and import data
 74 |         os.system('iget -Kr /archive/HCA/10X/'+sample+'/starsolo/counts/Gene/cr3')
 75 |         adata = sc.read_10x_mtx('cr3')
 76 |         #this gets .obs['n_counts'] computed
 77 |         sc.pp.filter_cells(adata, min_counts=1)
 78 |         #compute the qc metrics and store them
 79 |         qc.loc[sample, 'n_cells'] = adata.shape[0]
 80 |         qc.loc[sample, 'median_n_counts'] = np.median(adata.obs['n_counts']).astype(float)
 81 |         #delete downloaded count matrix
 82 |         os.system('rm -r cr3')
 83 |     return qc
 84 | 
 85 | def orderGroups(adata,groupby='leiden'):
 86 |     #this returns a list of group names
 87 |     sc.tl.dendrogram(adata,groupby=groupby)
 88 |     return adata.uns[f'dendrogram_'+groupby]['dendrogram_info']['ivl']
 89 | 
 90 | def MakeWhite(adata,obsKey='louvain',whiteCat='nan',type=str):
 91 |     temp=ExtractColor(adata,obsKey,type)
 92 |     temp[whiteCat]='#FFFFFF'
 93 |     UpdateUnsColor(adata,temp,obsKey)
 94 |     return adata
 95 | 
 96 | def GetRaw(adata_all):
 97 |     adata=anndata.AnnData(X=adata_all.raw.X,obs=adata_all.obs,var=adata_all.raw.var,\
 98 | obsm=adata_all.obsm,uns=adata_all.uns,obsp=adata_all.obsp)
 99 |     adata.raw=adata
100 |     return adata
101 | 
def CalculateRaw(adata,scaling_factor=10000):
    """Reconstruct integer counts from a log-transformed, normalised matrix.

    The object must contain a log-transformed sparse matrix in ``adata.X`` and
    the per-cell totals in ``adata.obs['n_counts']``. ``scaling_factor`` is
    the normalisation constant that was used (10000 by default).

    Note that ``adata.X`` is converted to CSR in place. Returns a new AnnData
    with rounded counts and the original obs, var, obsm and varm.
    (Update by Polanski in Feb 2022.)
    """
    # make sure the data matrix is CSR, not CSC, so indptr delimits cells
    adata.X = adata.X.tocsr()
    counts = np.expm1(adata.X)
    per_cell_scale = adata.obs['n_counts'].values / scaling_factor
    # consecutive indptr entries give the .data slice holding one row (cell),
    # and each cell has exactly one entry in per_cell_scale
    row_bounds = zip(counts.indptr[:-1], counts.indptr[1:])
    for row, (start, stop) in enumerate(row_bounds):
        counts.data[start:stop] = counts.data[start:stop] * per_cell_scale[row]
    return anndata.AnnData(X=np.rint(counts), obs=adata.obs, var=adata.var,
                           obsm=adata.obsm, varm=adata.varm)
117 | 
118 | 
def CalculateRawAuto(adata):
    """Rescale a log-transformed matrix so each cell's smallest value is 1.

    For objects whose normalisation constant is unknown: the matrix is
    un-logged with ``expm1`` and every cell (row) is divided by its smallest
    stored value, which is treated as a count of 1. Values are deliberately
    not rounded so the result can be checked afterwards.

    The matrix is converted to CSR first (as ``CalculateRaw`` does), because
    the row slicing below is only valid for CSR. ``adata`` itself is left
    untouched. Cells without any stored value are skipped.

    Returns a new AnnData with the rescaled matrix and the original obs, var,
    obsm and varm.
    """
    X = np.expm1(sparse.csr_matrix(adata.X))
    # consecutive indptr entries give the .data slice holding one row (cell)
    for start, stop in zip(X.indptr[:-1], X.indptr[1:]):
        if start == stop:
            # empty cell: nothing to rescale, and np.min would raise
            continue
        row_values = X.data[start:stop]
        # locate the lowest count of the cell, treat it as 1
        X.data[start:stop] = row_values / np.min(row_values)
    # originally this had X=np.rint(X) but we actually want the full value space here
    return anndata.AnnData(X=X,obs=adata.obs,var=adata.var,obsm=adata.obsm,varm=adata.varm)
130 | 
def CheckGAPDH(adata,sparse=True,gene='GAPDH'):
    """Return the values of ``gene`` for the first five cells.

    With ``sparse=True`` the slice is densified via ``.todense()``; otherwise
    the slice of ``X`` is returned as is.
    """
    head = adata[:, gene].X[0:5]
    if sparse == True:
        return head.todense()
    return head
136 | 
def FindSimilarGenes(adata,genename='GAPDH'):
    """Rank all genes by their correlation with ``genename``.

    Computes the gene-by-gene correlation matrix of ``adata.to_df()`` and
    returns the column for ``genename`` as a Series indexed by gene, sorted
    from most to least correlated.
    """
    expression = adata.to_df()
    gene_corr = np.corrcoef(expression, rowvar=False)
    target_pos = expression.columns.get_loc(genename)
    ranking = pd.Series(gene_corr[:, target_pos], index=expression.columns)
    return ranking.sort_values(ascending=False)
144 | 
def OrthoTranslate(adata,\
oTable='~/refseq/Mouse-Human_orthologs_only.csv'):
    """Rename mouse genes to their human orthologs, keeping 1:1 pairs only.

    ``oTable`` is a CSV with the columns ``'Gene name'`` (mouse) and
    ``'Human gene name'``. Rows with missing values are dropped, and only
    pairs whose mouse and human names each occur exactly once are kept.
    ``adata.var_names`` are made unique in place first. Returns the subset
    of ``adata`` restricted to translatable genes, with human names as
    var_names.
    """
    mouse_col = 'Gene name'
    human_col = 'Human gene name'
    adata.var_names_make_unique(join='-')
    orthologs = pd.read_csv(oTable).dropna()
    # keep=False drops every name that appears more than once
    unique_mouse = orthologs.loc[:, mouse_col].drop_duplicates(keep=False)
    unique_human = orthologs.loc[:, human_col].drop_duplicates(keep=False)
    one_to_one = (orthologs.loc[:, mouse_col].isin(unique_mouse)
                  & orthologs.loc[:, human_col].isin(unique_human))
    lookup = orthologs.loc[one_to_one, :].set_index(mouse_col, drop=False)
    bdata = adata[:, adata.var_names.isin(lookup.loc[:, mouse_col])]
    bdata.var_names = lookup.loc[bdata.var_names, human_col]
    return bdata
157 | 
def remove_barcode_suffix(adata):
    """Return a copy of ``adata`` with everything after the first '-' removed
    from each cell (barcode) name.

    Names that contain no '-' are kept unchanged. The previous implementation
    used ``str.split(expand=True)``, which yields a flat Index when no name
    contains a '-', so each name was cut down to its first character.
    """
    bdata = adata.copy()
    bdata.obs_names = pd.Index([name.split('-')[0] for name in bdata.obs_names])
    return bdata
162 | 
def file2gz(file,delete_original=True):
    """Gzip ``file`` into ``file + '.gz'``.

    The original file is removed afterwards when ``delete_original`` is True.
    """
    compressed_path = file + '.gz'
    with open(file, 'rb') as raw_in:
        with gzip.open(compressed_path, 'wb') as gz_out:
            gz_out.writelines(raw_in)
    if delete_original == True:
        os.remove(file)
168 | 
def Scanpy2MM(adata,prefix='temp',write2Dobsm=['No']):
    """Write ``adata`` as gzipped MatrixMarket files plus a metadata table.

    Example: ``Scanpy2MM(adata, "./")``. Please make sure the object contains
    raw counts (for instance via ``CalculateRaw``), since the matrix is
    written with ``field='integer'``.

    Files written (all prefixed with ``prefix``): ``matrix.mtx.gz``,
    ``features.tsv.gz``, ``barcodes.tsv.gz`` and an uncompressed
    ``metadata.tsv``. Unless ``write2Dobsm`` contains ``'No'``, the first two
    columns of each listed obsm key are added to the metadata as
    ``<key>_x`` / ``<key>_y``.

    Side effects on ``adata``: ``var['feature_types']`` is set, and
    ``var['gene_ids']`` is created from var_names when absent.
    """
    adata.var['feature_types'] = 'Gene Expression'
    # MatrixMarket output is genes x cells, hence the transpose
    scipy.io.mmwrite(prefix + 'matrix.mtx', adata.X.transpose(), field='integer')
    if 'gene_ids' not in adata.var.columns.unique():
        adata.var['gene_ids'] = adata.var_names
    features = adata.var[['gene_ids', 'feature_types']].reset_index().set_index(keys='gene_ids')
    features.to_csv(prefix + "features.tsv", sep="\t", index=True, header=False)

    skip_embeddings = 'No' in write2Dobsm
    if skip_embeddings:
        print('No embeddings written')
    else:
        # temporarily store the embeddings in obs so they end up in metadata.tsv
        for basis in write2Dobsm:
            adata.obs[basis + '_x'] = adata.obsm[basis][:, 0]
            adata.obs[basis + '_y'] = adata.obsm[basis][:, 1]

    # columns=[] writes the index (barcodes) only
    adata.obs.to_csv(prefix + "barcodes.tsv", sep="\t", columns=[], header=False)
    adata.obs.to_csv(prefix + "metadata.tsv", sep="\t", index=True)

    if skip_embeddings:
        print('No embeddings written')
    else:
        # remove the temporary obs columns again
        for basis in write2Dobsm:
            for axis_suffix in ('_x', '_y'):
                del adata.obs[basis + axis_suffix]

    # metadata.tsv is intentionally left uncompressed
    for stem in ("matrix.mtx", "barcodes.tsv", "features.tsv"):
        file2gz(prefix + stem)
196 | 
def ShiftEmbedding(adata,domain_key='batch',embedding='X_umap',nrows=3,alpha=0.9):
    """Lay out each domain's embedding on its own tile of a grid.

    For every unique value of ``adata.obs[domain_key]`` the embedding of the
    matching cells is min-max scaled, multiplied by ``alpha`` and shifted to
    the grid cell ``(int(i / nrows), i % nrows)``, where ``i`` is the position
    of the value in ``unique()``. ``adata.obsm[embedding]`` is overwritten
    with the resulting two columns and ``adata`` is returned.

    Two temporary obs columns (``embedding + '0'`` / ``'1'``) are created and
    deleted again.
    """
    from sklearn import preprocessing
    scaler = preprocessing.MinMaxScaler()
    x_col = embedding + '0'
    y_col = embedding + '1'
    adata.obs[x_col] = adata.obsm[embedding][:, 0]
    adata.obs[y_col] = adata.obsm[embedding][:, 1]
    X = adata.obs[x_col]
    Y = adata.obs[y_col]
    for i, domain in enumerate(adata.obs[domain_key].unique()):
        in_domain = adata.obs[domain_key] == domain
        domain_coords = adata[in_domain].obsm[embedding]
        scaler.fit(domain_coords)
        scaled = scaler.transform(domain_coords) * alpha
        # only column 0 of the first sum and column 1 of the second are used
        X.loc[in_domain] = (scaled + [int(i/nrows), i%nrows])[:, 0]
        Y.loc[in_domain] = (scaled + [i/nrows, i%nrows])[:, 1]
    adata.obsm[embedding] = np.vstack((X.values, Y.values)).T
    for temp_col in (x_col, y_col):
        del adata.obs[temp_col]
    return adata
214 | 
def CopyEmbedding(aFrom,aTo,embedding='X_umap'):
    """Copy an embedding from ``aFrom`` to ``aTo``, matching cells by name.

    ``aTo.obsm[embedding]`` is replaced by a float array with one row per
    cell of ``aTo`` and all columns of ``aFrom.obsm[embedding]``. Cells of
    ``aTo`` that are absent from ``aFrom`` get NaN coordinates; cells that
    exist only in ``aFrom`` are ignored.

    Compared with the previous implementation this no longer writes the
    temporary ``temp0`` / ``temp1`` columns into either ``obs`` table (which
    clobbered same-named user columns), no longer yields an object-dtype
    array with empty strings for unmatched cells, and keeps embeddings with
    more than two dimensions intact.

    Returns ``aTo``.
    """
    source = pd.DataFrame(np.asarray(aFrom.obsm[embedding]), index=aFrom.obs_names)
    # reindex aligns rows on cell names and fills unmatched cells with NaN
    aTo.obsm[embedding] = source.reindex(aTo.obs_names).to_numpy(dtype=float)
    return aTo
228 | 
def CopyMeta(aFro,aTo,overwrite=False):
    """Copy obs and var metadata columns from ``aFro`` into ``aTo``.

    Only cells and genes present in both objects are transferred. With
    ``overwrite=True`` every column of the source is copied; otherwise only
    columns that ``aTo`` does not have yet. Target columns are first set to
    NaN, so entries without a counterpart in ``aFro`` stay NaN. ``aTo`` is
    modified in place and returned.
    """
    shared_cells = aFro.obs_names.isin(aTo.obs_names)
    shared_genes = aFro.var_names.isin(aTo.var_names)
    aFrom = aFro[shared_cells][:, shared_genes]

    obs_items = aFrom.obs.columns
    var_items = aFrom.var.columns
    if not (overwrite == True):
        # keep whatever aTo already has and add the missing columns only
        obs_items = obs_items[~obs_items.isin(aTo.obs.columns)]
        var_items = var_items[~var_items.isin(aTo.var.columns)]

    aTo.obs[obs_items] = np.nan
    aTo.var[var_items] = np.nan
    aTo.obs.loc[aFrom.obs_names, obs_items] = aFrom.obs.loc[:, obs_items]
    aTo.var.loc[aFrom.var_names, var_items] = aFrom.var.loc[:, var_items]
    return aTo
245 | 
def AddMeta(adata,meta):
    """Return a copy of ``adata`` with the columns of ``meta`` added to obs.

    ``meta`` is a DataFrame indexed by cell name. Rows for unknown cells are
    ignored and index labels are de-duplicated with ``keep=False``. Cells
    without a row in ``meta`` get NaN. Values are assigned by label, column
    by column (``combine_first`` had a barcode sliding problem).
    """
    known = meta.loc[meta.index.isin(adata.obs_names), :]
    known = known.loc[known.index.drop_duplicates(keep=False), :]
    annotated = adata.copy()
    for column in known.columns:
        print("copying "+column+"\n")
        annotated.obs[column] = np.nan
        annotated.obs.loc[known.index, column] = known.loc[:, column]
    return annotated
256 | 
def AddMetaBatch(adata,meta_compact,batch_key='batch'):
    """Return a copy of ``adata`` with per-batch metadata expanded to cells.

    ``meta_compact`` is a DataFrame (e.g. imported from a csv file) whose
    index holds batch IDs and whose columns are the obs variables to add.
    Each new obs column is obtained by replacing the values of
    ``obs[batch_key]`` through the corresponding batch-to-value mapping.
    """
    annotated = adata.copy()
    for column in meta_compact.columns:
        batch_to_value = meta_compact.loc[:, column].to_dict()
        annotated.obs[column] = annotated.obs[batch_key].replace(to_replace=batch_to_value)
    return annotated
263 | 
def ExtractMetaBatch(adata,batch_key='batch'):
    """Return the most frequent value of every obs variable per batch.

    The result is a DataFrame indexed by the values of ``batch_key``. This
    can be regarded as the reverse of ``AddMetaBatch``, except for numeric
    variables.
    """
    per_batch = adata.obs.groupby(batch_key)
    return per_batch.agg(pd.Series.mode)
268 | 
def celltype_per_stage_plot(adata,celltypekey='louvain',stagekey='batch',plotlabel=True,\
    celltypelist=['default'],stagelist=['default'],celltypekeytype=int,stagekeytype=str,
    fontsize='x-small',yfontsize='x-small',legend_pos=(1,0.5),savefig=None):
    """Horizontal stacked bar plot of cell-type proportions per stage.

    One bar is drawn per stage, split into the fractions contributed by each
    cell type. Colours are taken from the ``celltypekey`` palette in
    ``adata.uns``.

    celltypelist / stagelist: order of the categories; a list containing
        ``'default'`` means all observed values, sorted with
        ``celltypekeytype`` / ``stagekeytype`` as the sort key. Entries not
        observed in the data are dropped.
    plotlabel: draw the legend.
    savefig: when given, the figure is saved to ``savefig + '.pdf'``.
    """
    if 'default' in celltypelist:
        celltypelist = sorted(adata.obs[celltypekey].unique().tolist(),key=celltypekeytype)
    if 'default' in stagelist:
        stagelist = sorted(adata.obs[stagekey].unique().tolist(),key=stagekeytype)
    observed_celltypes = adata.obs[celltypekey].unique()
    observed_stages = adata.obs[stagekey].unique()
    celltypelist = [i for i in celltypelist if i in observed_celltypes]
    stagelist = [i for i in stagelist if i in observed_stages]
    colors = ExtractColor(adata,celltypekey,keytype=str)
    count_array = np.array(pd.crosstab(adata.obs[celltypekey],adata.obs[stagekey]).loc[celltypelist,stagelist])
    # normalise every stage (column) to sum to 1
    count_ratio_array = count_array / np.sum(count_array,axis=0)
    # running left edge of the next segment; stages are drawn in reverse so
    # the first stage ends up at the top
    left = np.zeros(len(stagelist))
    for i, celltype in enumerate(celltypelist):
        widths = count_ratio_array[i,::-1]
        plt.barh(stagelist[::-1],widths,left=left,color=colors[celltype],label=celltype)
        left = left + widths
    plt.yticks(fontsize=yfontsize)
    # positional argument: the old ``b=`` keyword was removed from Matplotlib
    plt.grid(False)
    if plotlabel:
        plt.legend(celltypelist,fontsize=fontsize,bbox_to_anchor=legend_pos)
    if savefig is not None:
        plt.savefig(savefig+'.pdf',bbox_inches='tight')
291 | 
def stage_per_celltype_plot(adata,celltypekey='louvain',stagekey='batch',plotlabel=True,\
    celltypelist=['default'],stagelist=['default'],celltypekeytype=int,stagekeytype=str,
    fontsize='x-small',xfontsize='x-small',legend_pos=(1,1),savefig=None):
    """Vertical stacked bar plot of stage proportions per cell type.

    One bar is drawn per cell type, split into the fractions contributed by
    each stage. Colours are taken from the ``stagekey`` palette in
    ``adata.uns``, so please remember to run ``pl.umap`` first to assign
    colours.

    celltypelist / stagelist: order of the categories; a list containing
        ``'default'`` means all observed values, sorted with
        ``celltypekeytype`` / ``stagekeytype`` as the sort key. Entries not
        observed in the data are dropped.
    plotlabel: draw the legend (previously accepted but ignored; now honoured
        like in ``celltype_per_stage_plot``).
    savefig: when given, the figure is saved to ``savefig + '.pdf'``.
    """
    if 'default' in celltypelist:
        celltypelist = sorted(adata.obs[celltypekey].unique().tolist(),key=celltypekeytype)
    if 'default' in stagelist:
        stagelist = sorted(adata.obs[stagekey].unique().tolist(),key=stagekeytype)
    observed_celltypes = adata.obs[celltypekey].unique()
    observed_stages = adata.obs[stagekey].unique()
    celltypelist = [i for i in celltypelist if i in observed_celltypes]
    stagelist = [i for i in stagelist if i in observed_stages]
    colors = ExtractColor(adata,stagekey,keytype=str)
    count_array = np.array(pd.crosstab(adata.obs[celltypekey],adata.obs[stagekey]).loc[celltypelist,stagelist])
    # rows are stages, columns are cell types; every cell type sums to 1
    count_ratio_array = count_array.transpose() / np.sum(count_array,axis=1)
    # bars are stacked downwards from 1, so the first stage sits on top
    stacked = np.zeros(len(celltypelist))
    for i, stage in enumerate(stagelist):
        heights = count_ratio_array[i,:]
        stacked = stacked + heights
        plt.bar(celltypelist,heights,bottom=1-stacked,color=colors[stage],label=stage)
    plt.xticks(fontsize=xfontsize)
    # positional argument: the old ``b=`` keyword was removed from Matplotlib
    plt.grid(False)
    if plotlabel:
        plt.legend(stagelist,fontsize=fontsize,bbox_to_anchor=legend_pos)
    plt.xticks(rotation=90)
    if savefig is not None:
        plt.savefig(savefig+'.pdf',bbox_inches='tight')
316 | 
def mtx2df(mtx,idx,col):
    """Read a MatrixMarket file into a dense DataFrame.

    mtx: name/location of the matrix.mtx file
    idx: text file with one row name per line
    col: text file with one column name per line

    Surrounding whitespace is stripped from every label. Both label files
    are closed after reading.
    """
    count = scipy.io.mmread(mtx)
    with open(idx) as idx_file:
        idxs = [line.strip() for line in idx_file]
    with open(col) as col_file:
        cols = [line.strip() for line in col_file]
    sc_count = pd.DataFrame(data=count.toarray(),
                        index=idxs,
                        columns=cols)
    return sc_count
328 | 
def returnDEres(adata, column = None, key= None, remove_mito_ribo = True):
    """Collect the differential-expression results of one group in a table.

    adata: object whose ``uns[key]`` holds a ``rank_genes_groups``-style
        result with the fields ``names``, ``scores``, ``logfoldchanges``,
        ``pvals`` and ``pvals_adj`` (and optionally ``pts``).
    column: group to extract; defaults to the first field of ``scores``.
    key: entry of ``adata.uns`` to read; defaults to ``'rank_genes_groups'``.
    remove_mito_ribo: drop genes whose names start with RPL/RPS/MRPS/MRPL/MT-
        (human) or Rpl/Rps/Mrps/Mrpl/mt- (mouse).

    Returns a DataFrame indexed by gene name with the columns ``scores``,
    ``logfoldchanges``, ``pvals``, ``pvals_adj`` and, when available,
    ``pts``. Rows with a missing gene name are dropped.
    """
    if key is None:
        key = 'rank_genes_groups'
    result = adata.uns[key]
    if column is None:
        column = list(result['scores'].dtype.fields.keys())[0]

    names = result['names'][column]
    values = {field: result[field][column]
              for field in ('scores', 'logfoldchanges', 'pvals', 'pvals_adj')}
    # 'pts' only exists when the ranking was computed with pts=True
    try:
        values['pts'] = result['pts'][column]
    except (KeyError, ValueError, IndexError):
        pass

    frames = []
    for field, data in values.items():
        # label-indexed data (e.g. pts) is aligned on the gene names here
        frame = pd.DataFrame(data=data, index=names)
        frame.columns = [field]
        frames.append(frame[frame.index.notna()])
    # the frames share one index, so they are joined side by side; the former
    # chain of pd.merge calls produced duplicate suffixed column names
    df_final = pd.concat(frames, axis=1)

    if remove_mito_ribo:
        pattern = '^RPL|^RPS|^MRPS|^MRPL|^MT-|^Rpl|^Rps|^Mrps|^Mrpl|^mt-'
        excluded = df_final.filter(regex=pattern, axis=0).index
        df_final = df_final[~df_final.index.isin(list(excluded))]
    return df_final
370 | 
def DEmarkers(adata,celltype,reference,obs,max_out_group_fraction=0.25,\
use_raw=False,length=100,obslist=['percent_mito','n_genes','batch'],\
min_fold_change=2,min_in_group_fraction=0.25,log=True,method='wilcoxon',
embedding='X_umap'):
    """Find, filter and plot marker genes of ``celltype`` versus ``reference``.

    Genes are ranked with ``sc.tl.rank_genes_groups`` on the obs variable
    ``obs`` and filtered with ``sc.tl.filter_rank_genes_groups`` using the
    given fraction / fold-change thresholds. The surviving genes (at most
    ``length``) are shown on the embedding together with ``obslist``, as a
    dot plot over all groups, and as a stacked violin plot restricted to the
    two compared groups.

    The temporary ranking results are removed from ``adata.uns`` again.
    Returns the list of marker gene names.
    """
    sc.tl.rank_genes_groups(
        adata, obs, groups=[celltype], n_genes=length,
        reference=reference, method=method, log=log, pts=True,
    )
    sc.tl.filter_rank_genes_groups(
        adata, groupby=obs,
        max_out_group_fraction=max_out_group_fraction,
        min_fold_change=min_fold_change, use_raw=use_raw,
        min_in_group_fraction=min_in_group_fraction,
    )
    filtered_names = pd.DataFrame(adata.uns['rank_genes_groups_filtered']['names'])
    # genes that were filtered out are NaN in the filtered table
    GeneList = filtered_names.loc[:, celltype].dropna().head(length).tolist()

    sc.pl.embedding(adata, basis=embedding, color=GeneList+obslist,
                    color_map='jet', use_raw=use_raw)
    sc.pl.dotplot(adata, var_names=GeneList,
                  groupby=obs, use_raw=use_raw, standard_scale='var')
    compared_only = adata[adata.obs[obs].isin([celltype,reference]),:]
    sc.pl.stacked_violin(compared_only, var_names=GeneList, groupby=obs,
                         swap_axes=True)

    for stale_key in ('rank_genes_groups', 'rank_genes_groups_filtered'):
        del adata.uns[stale_key]
    return GeneList
400 | 
def GlobalMarkers(adata,obs,max_out_group_fraction=0.25,min_fold_change=2,\
min_in_group_fraction=0.25,use_raw=False,method='wilcoxon'):
    """Compute filtered marker genes for every group of the obs variable ``obs``.

    All genes are ranked per group with ``sc.tl.rank_genes_groups`` and then
    filtered with ``sc.tl.filter_rank_genes_groups``. Returns a DataFrame
    with one column per group, in which the remaining gene names are packed
    at the top of each column (filtered-out entries removed).
    """
    n_all_genes = len(adata.var_names)
    sc.tl.rank_genes_groups(adata, groupby=obs, n_genes=n_all_genes, method=method)
    sc.tl.filter_rank_genes_groups(
        adata, groupby=obs,
        max_out_group_fraction=max_out_group_fraction,
        min_fold_change=min_fold_change, use_raw=use_raw,
        min_in_group_fraction=min_in_group_fraction,
    )
    Markers = pd.DataFrame(adata.uns['rank_genes_groups_filtered']['names'])

    def _compact(group_column):
        # drop the NaN gaps left by the filter and re-number from 0
        return pd.Series(group_column.dropna().values)

    return Markers.apply(_compact)
411 | 
def HVGbyBatch(adata,batch_key='batch',min_mean=0.0125, max_mean=3, min_disp=0.5,\
min_clustersize=100,genenames=['default']):
    """Call highly variable genes separately in every sufficiently large batch.

    For each value of ``adata.obs[batch_key]`` with more than
    ``min_clustersize`` cells, ``sc.pp.highly_variable_genes`` is run on that
    batch (restricted to ``genenames``; a list containing ``'default'`` means
    all genes). The result is stored as the boolean column
    ``adata.var['highly_variable' + str(batch)]``, and
    ``adata.var['highly_variable_n']`` counts in how many batches each gene
    was called highly variable.

    scanpy's verbosity is silenced during the loop and restored to its
    previous value afterwards, also when an error occurs. ``adata`` is
    modified in place and returned.
    """
    if 'default' in genenames:
        genenames = adata.var_names
    batchlist = adata.obs[batch_key].value_counts()
    large_batches = batchlist[batchlist>min_clustersize].index
    previous_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        for key in large_batches:
            # work on a copy so the HVG call does not write into a view
            adata_sample = adata[adata.obs[batch_key]==key,:][:,genenames].copy()
            print(key)
            sc.pp.highly_variable_genes(adata_sample, min_mean=min_mean, max_mean=max_mean, min_disp=min_disp)
            hvg_names = adata_sample.var_names[adata_sample.var['highly_variable']]
            # str() keeps non-string batch labels (e.g. integers) working
            adata.var['highly_variable'+str(key)] = adata.var_names.isin(hvg_names)
    finally:
        sc.settings.verbosity = previous_verbosity
    adata.var['highly_variable_n'] = 0
    temp = adata.var['highly_variable_n'].astype('int32')
    for key in large_batches:
        temp = temp+adata.var['highly_variable'+str(key)].astype('int32')
    adata.var['highly_variable_n'] = temp
    return adata
431 | 
def HVG_cutoff(adata,range_int=10,cutoff=5000,HVG_var='highly_variable_n',fig_size=(8,6)):
    """Plot gene counts per threshold on ``HVG_var`` and pick a threshold.

    For every ``i`` in ``range(range_int)`` (capped at the maximum of
    ``adata.var[HVG_var]``) the number of genes with ``HVG_var > i`` is
    counted and plotted, with ``cutoff`` drawn as a dashed red line.

    Returns the largest ``i`` that still yields at least ``cutoff`` genes, or
    0 when no threshold reaches ``cutoff``; the choice is also printed.
    """
    batch_counts = adata.var[HVG_var]
    range_int = min(range_int, max(batch_counts))
    thresholds = list(range(range_int))
    HVG_list = [(batch_counts > i).value_counts()[True] for i in thresholds]

    plt.figure(figsize=fig_size)
    plt.plot(thresholds, HVG_list)
    plt.axhline(y=cutoff, color='r', linestyle='--')

    # fall back to the first threshold when nothing reaches the cutoff
    HVG_i, HVG_n = 0, HVG_list[0]
    for position in range(len(HVG_list) - 1, -1, -1):
        if HVG_list[position] >= cutoff:
            HVG_i, HVG_n = position, HVG_list[position]
            break
    print("The smallest i for intersection to achieve more than "+str(cutoff)+" HVGs is "+str(HVG_i)+" , yielding "+str(HVG_n)+" genes")
    return HVG_i
445 | 
def Bertie(adata,Resln=1,batch_key='batch'):
    """Score doublets per batch from Scrublet scores of over-clustered cells.

    For every batch in ``adata.obs[batch_key]``:
      1. Scrublet is run on the batch's matrix.
      2. The batch is normalised, log-transformed, restricted to highly
         variable genes, scaled and clustered with louvain; each cluster is
         then sub-clustered at resolution ``Resln``.
      3. Every sub-cluster gets the median Scrublet score of its cells.
      4. A one-sided p-value is computed for each cell's cluster score from a
         normal distribution centred on the median cluster score, with scale
         1.4826 * MAD (MAD taken from the above-median values), followed by
         Benjamini-Hochberg correction.

    Results are written to ``adata.obs['doublet_scores']`` (per-cell Scrublet
    score) and ``adata.obs['bh_pval']``. ``adata`` is modified in place and
    returned.
    """
    import scrublet as scr
    # imported explicitly: the file only imports scipy.io and scipy.sparse
    from scipy import stats
    # float initial value so the Scrublet scores fit the column dtype
    adata.obs['doublet_scores'] = 0.0

    def bh(pvalues):
        '''
        Computes the Benjamini-Hochberg FDR correction.

        Input:
            * pvals - vector of p-values to correct
        '''
        n = int(pvalues.shape[0])
        new_pvalues = np.empty(n)
        values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
        values.sort()
        values.reverse()
        new_values = []
        for i, vals in enumerate(values):
            rank = n - i
            pvalue, index = vals
            new_values.append((n/rank) * pvalue)
        for i in range(0, int(n)-1):
            if new_values[i] < new_values[i+1]:
                new_values[i+1] = new_values[i]
        for i, vals in enumerate(values):
            pvalue, index = vals
            new_pvalues[index] = new_values[i]
        return new_pvalues

    for i in np.unique(adata.obs[batch_key]):
        print(i)
        in_batch = adata.obs[batch_key]==i
        # copy first so nothing below writes into a view of adata
        adata_sample = adata[in_batch,:].copy()
        scrub = scr.Scrublet(adata_sample.X)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
        adata_sample.obs['scrublet_score'] = doublet_scores
        sc.pp.filter_genes(adata_sample, min_cells=3)
        sc.pp.normalize_per_cell(adata_sample, counts_per_cell_after=1e4)
        sc.pp.log1p(adata_sample)
        sc.pp.highly_variable_genes(adata_sample, min_mean=0.0125, max_mean=3, min_disp=0.5)
        adata_sample = adata_sample[:, adata_sample.var['highly_variable']].copy()
        sc.pp.scale(adata_sample, max_value=10)
        sc.tl.pca(adata_sample, svd_solver='arpack')
        sc.pp.neighbors(adata_sample)
        #overclustering proper - do basic clustering first, then cluster each cluster
        sc.tl.louvain(adata_sample)
        for clus in np.unique(adata_sample.obs['louvain']):
            sc.tl.louvain(adata_sample, restrict_to=('louvain',[clus]),resolution=Resln)
            adata_sample.obs['louvain'] = adata_sample.obs['louvain_R']
        #compute the cluster scores - the median of Scrublet scores per overclustered cluster
        for clus in np.unique(adata_sample.obs['louvain']):
            in_cluster = adata_sample.obs['louvain']==clus
            adata_sample.obs.loc[in_cluster, 'scrublet_cluster_score'] = \
                np.median(adata_sample.obs.loc[in_cluster, 'scrublet_score'])
        #now compute doublet p-values. figure out the median and mad (from above-median values) for the distribution
        cluster_scores = adata_sample.obs['scrublet_cluster_score']
        med = np.median(cluster_scores)
        mask = cluster_scores>med
        mad = np.median(cluster_scores[mask]-med)
        #let's do a one-sided test. the Bertie write-up does not address this but it makes sense
        pvals = 1-stats.norm.cdf(cluster_scores, loc=med, scale=1.4826*mad)
        # the batch subset keeps the cell order of adata, so the per-cell
        # results can be written back positionally
        adata.obs.loc[in_batch,'doublet_scores'] = doublet_scores
        adata.obs.loc[in_batch,'bh_pval'] = bh(pvals)
        del adata_sample
    return adata
521 | 
def Bertie_preclustered(adata,batch_key='batch',cluster_key='louvain'):
    """Score doublets per batch with Scrublet using pre-computed clusters.

    For every distinct value of ``adata.obs[batch_key]`` Scrublet is run on
    the cells of that batch. Each cell then receives the median Scrublet
    score of its ``cluster_key`` cluster, and these cluster scores are
    converted to one-sided normal p-values (location = median of the cluster
    scores, scale = 1.4826 * MAD of the above-median scores) which are
    Benjamini-Hochberg adjusted within the batch.

    Parameters
    ----------
    adata : AnnData
        Object whose ``.obs`` contains ``batch_key`` and ``cluster_key``.
    batch_key : str
        ``.obs`` column that splits cells into independently scored batches.
    cluster_key : str
        ``.obs`` column holding the existing cluster assignment.

    Returns
    -------
    AnnData
        The same ``adata`` (modified in place) with ``obs['doublet_scores']``
        (per-cell Scrublet score) and ``obs['bh_pval']`` (adjusted p-value).
    """
    import scrublet as scr

    def bh(pvalues):
        '''
        Computes the Benjamini-Hochberg FDR correction.

        Input:
            * pvalues - vector of p-values to correct
        '''
        n = int(pvalues.shape[0])
        new_pvalues = np.empty(n)
        # largest p-value first, so position k corresponds to rank n - k
        values = sorted(((pvalue, i) for i, pvalue in enumerate(pvalues)), reverse=True)
        new_values = [(n / (n - k)) * pvalue for k, (pvalue, _) in enumerate(values)]
        # enforce monotonicity: an adjusted value may not exceed that of a larger raw p-value
        for k in range(n - 1):
            if new_values[k] < new_values[k + 1]:
                new_values[k + 1] = new_values[k]
        for k, (_, index) in enumerate(values):
            new_pvalues[index] = new_values[k]
        return new_pvalues

    # float from the start: Scrublet scores are floats, and writing them into an
    # integer column relies on silent upcasting
    adata.obs['doublet_scores'] = 0.0

    for i in np.unique(adata.obs[batch_key]):
        in_batch = adata.obs[batch_key] == i
        # copy before touching .obs so that we never write into a view of adata
        adata_sample = adata[in_batch, :].copy()
        scrub = scr.Scrublet(adata_sample.X)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
        adata_sample.obs['scrublet_score'] = doublet_scores

        # cluster score = median Scrublet score of the cells in the cluster
        for clus in np.unique(adata_sample.obs[cluster_key]):
            in_cluster = adata_sample.obs[cluster_key] == clus
            adata_sample.obs.loc[in_cluster, 'scrublet_cluster_score'] = \
                np.median(adata_sample.obs.loc[in_cluster, 'scrublet_score'])

        med = np.median(adata_sample.obs['scrublet_cluster_score'])
        mask = adata_sample.obs['scrublet_cluster_score'] > med
        # NOTE: when no cluster score exceeds the median, this is the median of an
        # empty selection (NaN) and every p-value of the batch becomes NaN as well
        mad = np.median(adata_sample.obs['scrublet_cluster_score'][mask] - med)
        #let's do a one-sided test. the Bertie write-up does not address this but it makes sense
        pvals = 1 - scipy.stats.norm.cdf(adata_sample.obs['scrublet_cluster_score'], loc=med, scale=1.4826 * mad)
        adjusted = bh(pvals)

        adata.obs.loc[in_batch, 'doublet_scores'] = doublet_scores
        adata.obs.loc[in_batch, 'bh_pval'] = adjusted
        del adata_sample
    return adata
581 | 
582 | 
def snsSplitViolin(adata,genelist,celltype='leiden',celltypelist=['0','1']):
    """Draw split violins of the genes in *genelist* for selected groups.

    Cells whose ``adata.obs[celltype]`` value is in *celltypelist* are kept,
    their expression is reshaped to long format (one row per cell and gene)
    and plotted with ``sns.violinplot`` using *celltype* as the hue.
    Nothing is returned; the plot is drawn on the current axes.
    """
    selected = adata.obs[celltype].isin(celltypelist)
    wide = sc.get.obs_df(adata[selected], genelist + [celltype])
    # one (group, gene, value) row per cell/gene combination
    long_form = wide.set_index(celltype).stack().reset_index()
    long_form.columns = [celltype, 'gene', 'value']
    sns.violinplot(x='gene', y='value', hue=celltype, data=long_form,
                   split=True, inner="quart", linewidth=1)
589 | 
def DownSample(MouseC1data,cell_type='leiden',downsampleTo=10):
    """Randomly keep at most *downsampleTo* cells per ``obs[cell_type]`` group.

    Groups are visited in sorted order and sampled without replacement with
    ``random.sample``; groups smaller than *downsampleTo* are kept whole.
    If *downsampleTo* is not a positive ``int`` no cell is selected and the
    object is subset with an empty name list.

    Returns the input object indexed by the selected cell names.
    """
    chosen = []
    if (downsampleTo > 0) and isinstance(downsampleTo, int):
        group_of_cell = MouseC1data.obs[cell_type]
        for group in group_of_cell.sort_values().unique():
            members = MouseC1data[group_of_cell == group].obs_names.tolist()
            quota = min(downsampleTo, len(members))
            chosen.extend(random.sample(population=members, k=quota))
    return MouseC1data[chosen]
599 | 
def snsCluster(MouseC1data,MouseC1ColorDict2={False:'#000000',True:'#00FFFF'},cell_type='louvain',gene_type='highly_variable',\
            cellnames=['default'],genenames=['default'],figsize=(10,7),row_cluster=False,col_cluster=False,\
            robust=True,xticklabels=False,yticklabels=False,method='complete',metric='correlation',cmap='jet',\
            downsampleTo=0):
    """Plot a genes-by-cells seaborn clustermap with annotation colour bars.

    Parameters
    ----------
    MouseC1data : AnnData-like object providing ``obs``, ``var`` and ``to_df``.
    MouseC1ColorDict2 : dict mapping values of the ``gene_type`` var column(s)
        to row colours.
    cell_type : str or list of str; obs key(s) used for the column colour bars
        (colours come from ``ExtractColor``).
    gene_type : str or list of str; var key(s) used for the row colour bars.
        If ``'null'`` is among them no row colour bar is drawn.
    cellnames, genenames : names to plot; a list containing ``'default'``
        means all cells / all genes. Unknown names are dropped silently.
    downsampleTo : when positive (and ``cellnames`` is the default) cells are
        first down-sampled per group with ``DownSample``.
    Remaining arguments are forwarded to ``sns.clustermap``; values are
    z-scored per row (``z_score=0``) and clipped to [-2.5, 2.5].

    Returns
    -------
    The object returned by ``sns.clustermap``.
    """
    if 'default' in cellnames:
        if (downsampleTo > 0) & (isinstance(downsampleTo, int)):
            cellnames = DownSample(MouseC1data, cell_type, downsampleTo).obs_names
        else:
            cellnames = MouseC1data.obs_names
    if 'default' in genenames:
        genenames = MouseC1data.var_names
    # keep only names that actually exist in the object
    genenames = [gene for gene in genenames if gene in MouseC1data.var_names]
    cellnames = [cell for cell in cellnames if cell in MouseC1data.obs_names]

    # a single key is accepted as a plain string
    cell_types = [cell_type] if type(cell_type) == str else cell_type
    gene_types = [gene_type] if type(gene_type) == str else gene_type

    col_color_tracks = []
    for key in cell_types:
        expression_df = MouseC1data[MouseC1data.obs_names][:, genenames].to_df()
        expression_df[key] = MouseC1data[MouseC1data.obs_names].obs[key]
        expression_df = expression_df.sort_values(by=key)
        in_requested_order = expression_df.loc[pd.Series(cellnames, index=cellnames).index, :]
        labels = in_requested_order.pop(key)
        palette = ExtractColor(MouseC1data, obsKey=key, keytype=str)
        col_color_tracks.append(labels.map(palette).astype(str))

    # the frame of the last obs key is the one that gets plotted
    is_gene_column = expression_df.columns.isin(genenames)
    heat_df = expression_df.loc[cellnames, is_gene_column].reindex(columns=genenames)

    if len(col_color_tracks) > 1:
        col_colors = pd.concat(col_color_tracks, axis=1)
    else:
        col_colors = col_color_tracks[0]

    plot_options = dict(metric=metric, cmap=cmap, figsize=figsize,
                        row_cluster=row_cluster, col_cluster=col_cluster,
                        robust=robust, xticklabels=xticklabels, yticklabels=yticklabels,
                        z_score=0, vmin=-2.5, vmax=2.5,
                        col_colors=col_colors, method=method)

    if 'null' not in gene_types:
        row_color_tracks = [
            MouseC1data[:, genenames].var[var_key].map(MouseC1ColorDict2).astype(str)
            for var_key in gene_types
        ]
        if len(row_color_tracks) > 1:
            plot_options['row_colors'] = pd.concat(row_color_tracks, axis=1)
        else:
            plot_options['row_colors'] = row_color_tracks[0]

    return sns.clustermap(heat_df.transpose(), **plot_options)
650 | 
def extractSeabornRows(snsObj):
    """Return the row labels of a clustermap in dendrogram (displayed) order.

    *snsObj* must expose ``data`` (the plotted DataFrame) and
    ``dendrogram_row.reordered_ind``. The result is a Series with a fresh
    0..n-1 index.
    """
    labels = snsObj.data.index
    display_order = snsObj.dendrogram_row.reordered_ind
    reordered = np.asarray([labels[position] for position in display_order])
    return pd.DataFrame(reordered).iloc[:, 0]
655 | 
def markSeaborn(snsObj,genes,clustermap=True):
    """Label only the rows listed in *genes* on the heatmap's y axis.

    With ``clustermap=True`` rows are taken in dendrogram order
    (``dendrogram_row.reordered_ind``); otherwise in the order of
    ``snsObj.data.index``. Tick positions are the integer row positions.

    Returns ``snsObj.fig``.
    """
    heatmap_axis = snsObj.ax_heatmap
    row_labels = snsObj.data.index
    if clustermap == True:
        display_order = snsObj.dendrogram_row.reordered_ind
        shown = pd.DataFrame(np.asarray([row_labels[pos] for pos in display_order])).iloc[:, 0]
        wanted = shown[shown.isin(genes)]
        heatmap_axis.set_yticks(wanted.index.values.tolist())
        heatmap_axis.set_yticklabels(wanted.values.tolist())
    else:
        shown = pd.DataFrame(np.asarray(row_labels))
        wanted = shown[row_labels.isin(genes)]
        heatmap_axis.set_yticks(wanted.index.values)
        heatmap_axis.set_yticklabels(wanted.values[:, 0])
    return snsObj.fig
669 | 
def PseudoBulk(adata, group_key, layer=None, gene_symbols=None):
    """Return the mean expression of every gene within each group of cells.

    Adapted from a snippet by ivirshup:
    https://github.com/scverse/scanpy/issues/181#issuecomment-534867254

    Parameters
    ----------
    adata : AnnData
    group_key : str
        ``adata.obs`` column used to group the cells.
    layer : str, optional
        Layer to average; ``adata.X`` is used when ``None``.
    gene_symbols : str, optional
        ``adata.var`` column whose values label the rows of the result;
        ``adata.var_names`` is used when ``None``.

    Returns
    -------
    pandas.DataFrame
        genes x groups table of float64 means.
    """
    if layer is not None:
        getX = lambda x: x.layers[layer]
    else:
        getX = lambda x: x.X
    # row labels of the result: either an alternative symbol column or var_names
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    # grouped.indices maps each group to the integer positions of its cells
    for group, idx in grouped.indices.items():
        X = getX(adata[idx])
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out
693 | 
def Dotplot2D(adata,obs1,obs2,gene,cmap='OrRd', min_count=1):
    """Dot plot of one gene across two obs annotations (e.g. XIST by sample and cell type).

    Modified from K Polanski's code. Rows are the values of ``obs1``, columns
    the values of ``obs2``; dot colour is the mean expression of *gene* in the
    intersection, dot size the fraction of cells with expression > 0.

    Parameters
    ----------
    adata : AnnData (dense or sparse ``.X``).
    obs1, obs2 : obs column names for rows and columns.
    gene : gene (var name) to plot.
    cmap : colour map passed to the dot plot style.
    min_count : minimum number of cells an intersection needs to be processed;
        smaller intersections are left at 0.
    """
    import anndata
    from scanpy.pl import DotPlot
    from scipy import sparse

    #extract a simpler form of all the needed data - the gene's expression and the two obs columns
    #this way things run way quicker downstream
    expression = adata[:, gene].X
    # np.array() on a sparse matrix yields a 0-d object array, so densify explicitly
    if sparse.issparse(expression):
        expression = expression.toarray()
    expression = np.array(expression)
    batches = adata.obs[obs1].values
    celltypes = adata.obs[obs2].values
    batch_levels = np.unique(batches)
    celltype_levels = np.unique(celltypes)

    dot_size_df = pd.DataFrame(0.0, index=batch_levels, columns=celltype_levels)
    dot_color_df = pd.DataFrame(0.0, index=batch_levels, columns=celltype_levels)

    for batch in batch_levels:
        mask_batch = (batches == batch)
        for celltype in celltype_levels:
            spot = mask_batch & (celltypes == celltype)
            #skip if there's not enough data for spot
            if np.sum(spot) >= min_count:
                sub = expression[spot]
                #color is mean expression
                dot_color_df.loc[batch, celltype] = np.mean(sub)
                #size is the fraction of cells with positive expression
                dot_size_df.loc[batch, celltype] = np.mean(sub > 0)

    #reduce dimensions - no need for all-zero rows/cols
    #one mask for both tables keeps their shapes identical for DotPlot
    keep_rows = dot_size_df.sum(axis=1) != 0
    keep_cols = dot_size_df.sum(axis=0) != 0
    dot_size_df = dot_size_df.loc[keep_rows, keep_cols]
    dot_color_df = dot_color_df.loc[keep_rows, keep_cols]

    # placeholder AnnData: DotPlot only needs the names, the values come from the two tables
    bdata = anndata.AnnData(np.zeros(dot_size_df.shape))
    bdata.var_names = dot_size_df.columns
    bdata.obs_names = list(dot_size_df.index)
    bdata.obs[obs1] = dot_size_df.index
    bdp = DotPlot(bdata, dot_size_df.columns, obs1, dot_size_df=dot_size_df, dot_color_df=dot_color_df)
    bdp = bdp.style(cmap=cmap)
    bdp.make_figure()
735 | 
736 | 
def DeepTree(adata,MouseC1ColorDict2,cell_type='louvain',gene_type='highly_variable',\
            cellnames=['default'],genenames=['default'],figsize=(10,7),row_cluster=True,col_cluster=True,\
            method='complete',metric='correlation',Cutoff=0.8,CladeSize=2):
    """Flag genes that sit in sufficiently large clades of the gene dendrogram.

    A first clustermap is drawn with ``snsCluster``; its row (gene) linkage is
    cut at height *Cutoff*, and genes belonging to clades with more than
    *CladeSize* members are marked ``True`` in ``var['Deep']`` of the subset
    object. Two more clustermaps are then drawn: all genes with the 'Deep'
    colour bar, and the 'Deep' genes only.

    Returns
    -------
    list
        ``[bdata, first_plot, annotated_plot, deep_only_plot]``.
    """
    if 'default' in cellnames:
        cellnames = adata.obs_names
    if 'default' in genenames:
        genenames = adata.var_names

    first_plot = snsCluster(adata,
                            MouseC1ColorDict2=MouseC1ColorDict2,
                            genenames=genenames, cellnames=cellnames,
                            gene_type=gene_type, cell_type=cell_type, method=method,
                            figsize=figsize, row_cluster=row_cluster, col_cluster=col_cluster,
                            metric=metric)

    # clade id of every gene, in the row order of the plotted data
    clade_of_gene = cluster.hierarchy.cut_tree(first_plot.dendrogram_row.linkage, height=Cutoff)
    clade_ids, clade_sizes = np.unique(clade_of_gene, return_counts=True)
    large_clades = clade_ids[clade_sizes > CladeSize]
    DeepIndex = np.isin(clade_of_gene.ravel(), large_clades).tolist()

    bdata = adata[:, first_plot.data.index][cellnames]
    bdata.var['Deep'] = DeepIndex

    shared_options = dict(MouseC1ColorDict2=MouseC1ColorDict2, cellnames=cellnames,
                          cell_type=cell_type, method=method, figsize=figsize,
                          row_cluster=True, col_cluster=True, metric=metric)
    annotated_plot = snsCluster(bdata, gene_type='Deep', **shared_options)
    deep_only_plot = snsCluster(bdata[:, DeepIndex], gene_type='null', **shared_options)
    return [bdata, first_plot, annotated_plot, deep_only_plot]
764 | 
def DeepTree_per_batch(adata,batch_key='batch',obslist=['batch'],min_clustersize=100,Cutoff=0.8,CladeSize=2):
    """Run ``DeepTree`` separately on every sufficiently large batch.

    For each batch with more than *min_clustersize* cells the genes flagged
    in ``adata.var['highly_variable' + batch]`` are used, a UMAP coloured by
    *obslist* is shown, and the resulting 'Deep' genes are stored as a boolean
    column ``adata.var['Deep_' + batch]``. ``adata.var['Deep_n']`` finally
    counts in how many batches each gene was flagged.

    Returns the modified ``adata``.
    """
    batch_sizes = adata.obs[batch_key].value_counts()
    large_batches = batch_sizes[batch_sizes > min_clustersize].index

    for key in large_batches:
        print(key)
        is_hvg = adata.var['highly_variable' + key]
        in_batch = adata.obs[batch_key] == key
        bdata = adata[:, is_hvg][in_batch, :]
        sc.pp.filter_genes(bdata, min_cells=3)
        sc.pl.umap(bdata, color=obslist)
        bdata, test, test1, test2 = DeepTree(
            bdata,
            MouseC1ColorDict2={False: '#000000', True: '#00FFFF'},
            cell_type=obslist,
            cellnames=adata[adata.obs[batch_key] == key, :].obs_names.tolist(),
            genenames=adata[:, adata.var['highly_variable' + key]].var_names.tolist(),
            row_cluster=True, col_cluster=True, Cutoff=Cutoff, CladeSize=CladeSize)
        deep_genes = bdata[:, bdata.var['Deep']].var_names
        adata.var['Deep_' + key] = pd.Series(adata.var_names, index=adata.var_names).isin(deep_genes)

    # number of batches in which each gene was flagged
    adata.var['Deep_n'] = 0
    running_total = adata.var['Deep_n'].astype('int32')
    for key in large_batches:
        running_total = running_total + adata.var['Deep_' + key].astype('int32')
    adata.var['Deep_n'] = running_total
    return adata
786 | 
def Venn_Upset(adata,genelists,size_height=3):
    """Draw an UpSet plot of boolean gene sets stored in ``adata.var``.

    *genelists* is a list of var column names, e.g. ``['Deep_1','Deep_2']``;
    the ``'highly_variable'`` column is carried along as data. *size_height*
    is passed on as ``intersection_plot_elements``.

    Returns the ``UpSet`` object after plotting it.
    """
    from upsetplot import UpSet, plot
    wanted_columns = genelists + ['highly_variable']
    membership = pd.DataFrame(adata.var[wanted_columns]).set_index(genelists)
    upset = UpSet(membership, subset_size='count',
                  intersection_plot_elements=size_height)
    upset.plot()
    return upset
796 | 
def Treemap(adata,output="temp",branchlist=['project','batch'],width=1000,height=700,title='title'):
    """Plot cell counts per combination of obs columns as a plotly treemap.

    Cells are counted per combination of the *branchlist* columns (empty
    combinations are dropped). The figure is written to ``output + '.pdf'``
    and the counts to ``output + '.csv'``.

    Returns the plotly figure.
    """
    import plotly.express as px
    cell_counts = adata.obs.groupby(by=branchlist).size()
    cell_counts = cell_counts[cell_counts > 0]
    # reset_index() names the count column 0, hence values=0
    fig = px.treemap(cell_counts.reset_index(), path=branchlist, values=0)
    fig.update_layout(title=title, width=width, height=height)
    fig.write_image(output + '.pdf')
    cell_counts.to_csv(output + '.csv')
    return fig
810 | 
def DeepTree2(adata,method='complete',metric='correlation',cellnames=['default'],genenames=['default'],\
               Cutoff=0.8,CladeSize=2):
    """Plot-free variant of the 'Deep' gene selection.

    Genes are clustered hierarchically on their expression across the chosen
    cells (``fclusterdata`` with a distance criterion at *Cutoff*); genes in
    flat clusters with more than *CladeSize* members are marked ``True`` in
    ``var['Deep']``.

    Returns the subset ``adata[cellnames, :][:, genenames]`` carrying the new
    ``var['Deep']`` column.
    """
    if 'default' in cellnames:
        cellnames = adata.obs_names
    if 'default' in genenames:
        genenames = adata.var_names

    expression = adata[cellnames, :][:, genenames].to_df()
    # one flat-cluster id per gene (rows of the transposed table)
    clade_of_gene = scipy.cluster.hierarchy.fclusterdata(
        expression.transpose(), metric=metric, method=method,
        t=Cutoff, criterion="distance")
    clade_ids, clade_sizes = np.unique(clade_of_gene, return_counts=True)
    large_clades = clade_ids[clade_sizes > CladeSize]
    DeepIndex = np.isin(clade_of_gene, large_clades).tolist()

    bdata = adata[cellnames, :][:, genenames]
    bdata.var['Deep'] = DeepIndex
    return bdata
826 | 
def LoadLogitModel(model_addr):
    """Load a model saved with joblib from the file *model_addr*.

    The file is opened in binary mode and closed again once loading has
    finished. Returns whatever object was stored in the file.
    """
    import joblib
    # NOTE: joblib.load unpickles arbitrary objects - only load files from a trusted source
    with open(model_addr, 'rb') as handle:
        return joblib.load(handle)
831 | 
def LoadLogitGenes(genecsv):
    """Read a header-less, single-column CSV of gene symbols.

    Returns the symbols as a numpy array in file order. A file with more
    than one column raises (the two column names no longer fit).
    """
    table = pd.read_csv(genecsv, header=None).reset_index()
    table.columns = ['idx', 'symbol']
    return np.array(table['symbol'])
838 | 
def ExtractLogitScores(adata,model,CT_genes):
    """Return per-cell class probabilities of *model* as a DataFrame.

    ``model.predict_proba`` is evaluated on ``adata[:, CT_genes].X``. The
    result has one ``<class>_prob`` column per entry of ``model.classes_``
    plus ``lr_score``, the row-wise maximum probability, and is indexed by
    ``adata.obs_names``.
    """
    probabilities = model.predict_proba(adata[:, CT_genes].X)
    column_names = model.classes_ + '_prob'
    scores = pd.DataFrame(probabilities, index=adata.obs_names, columns=column_names)
    scores['lr_score'] = scores.max(axis=1)
    return scores
844 | 
def Model2Coeff(model_pkl,genelistcsv):
    """Return the coefficients of a saved model as a classes x genes DataFrame.

    The model is loaded with ``LoadLogitModel`` and the gene symbols (column
    labels) with ``LoadLogitGenes``.
    """
    model = LoadLogitModel(model_pkl)
    gene_symbols = LoadLogitGenes(genelistcsv)
    return pd.DataFrame(data=model.coef_, index=model.classes_, columns=gene_symbols)
849 | 
850 | from datetime import date
def LogisticRegressionCellType(Reference, Query, Category = 'louvain', DoValidate = False,\
    multi_class='ovr',n_jobs=15,max_iter=1000,tol=1e-4,keyword=''):
    """Train a logistic regression on *Reference* and label the cells of *Query*.

    No normalisation or scaling is done here. Both objects get unique var
    names, the model is fitted on the genes they share with
    ``Reference.obs[Category]`` as target (missing labels become ``'None'``),
    and the prediction is stored in ``Query.obs['Predicted']``.

    Side effects: the fitted model is written to
    ``<today>Sklearn.result.<keyword>.pkl`` and the gene list to
    ``<today>Sklearn.result.<keyword>.csv`` in the working directory. With
    ``DoValidate=True`` 5-fold stratified cross-validation scores are printed.

    Returns the updated *Query*.
    """
    Reference.var_names_make_unique()
    Query.var_names_make_unique()
    shared_genes = np.intersect1d(Reference.var_names, Query.var_names)
    reference_shared = Reference[:, shared_genes]
    query_shared = Query[:, shared_genes]

    train_X = reference_shared.X
    train_y = reference_shared.obs[Category].replace(np.nan, 'None', regex=True)
    query_X = query_shared.X

    folds = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
    classifier = LogisticRegression(
        penalty='l2', random_state=42, C=0.2, tol=tol, n_jobs=n_jobs,
        solver='sag', multi_class=multi_class, max_iter=max_iter, verbose=10)
    fitted = classifier.fit(train_X, train_y)
    predicted_labels = fitted.predict(query_X)

    file_stem = str(date.today()) + 'Sklearn.result.' + keyword
    if DoValidate is True:
        print(cross_val_score(classifier, train_X, train_y, cv=folds))
    _ = joblib.dump(fitted, file_stem + '.pkl', compress=9)
    np.savetxt(file_stem + '.csv', shared_genes, fmt='%s', delimiter=',')
    Query.obs['Predicted'] = predicted_labels
    return Query
884 | 
def LogisticPrediction(adata,model_pkl,genelistcsv,scores=True):
    """Predict cell types for *adata* with a saved logistic model and gene list.

    The model's coefficients are restricted and re-ordered to the genes that
    are present in ``adata`` (in ``adata``'s gene order), so the model can be
    applied even when some training genes are missing. ``adata`` should be
    scaled if the model was trained on scaled data.

    Parameters
    ----------
    adata : AnnData to annotate; ``obs['Predicted']`` is written in place.
    model_pkl : path of the saved model (see ``LoadLogitModel``).
    genelistcsv : path of the training gene list (see ``LoadLogitGenes``).
    scores : when true, class probabilities are added through ``AddMeta``.

    Returns
    -------
    ``AddMeta(adata, probabilities)`` when *scores* is true, else ``adata``.
    """
    CT_genes = LoadLogitGenes(genelistcsv)
    lr = LoadLogitModel(model_pkl)
    lr.features = CT_genes
    features = adata.var_names
    k_x = features.isin(list(CT_genes))
    print(f'{k_x.sum()} features used for prediction', file=sys.stderr)
    k_x_idx = np.where(k_x)[0]
    X = adata.X[:, k_x_idx]
    features = features[k_x]
    # match each shared gene's position in adata (ad_idx) with its position in the model (lr_idx)
    ad_ft = pd.DataFrame(features.values, columns=['ad_features']).reset_index().rename(columns={'index': 'ad_idx'})
    lr_ft = pd.DataFrame(lr.features, columns=['lr_features']).reset_index().rename(columns={'index': 'lr_idx'})
    lr_idx = lr_ft.merge(ad_ft, left_on='lr_features', right_on='ad_features').sort_values(by='ad_idx').lr_idx.values

    # restrict the model to the shared genes, in adata's gene order
    lr.n_features_in_ = lr_idx.size
    lr.features = lr.features[lr_idx]
    lr.coef_ = lr.coef_[:, lr_idx]
    adata.obs['Predicted'] = lr.predict(X)
    if scores:
        # score on the re-ordered shared genes so the columns line up with lr.coef_
        return AddMeta(adata, ExtractLogitScores(adata, lr, lr.features))
    return adata
911 | 
def DouCLing(adata,hi_type,lo_type,rm_genes=[],print_marker_genes=False, fraction_threshold=0.6):
    """Dou(blet)C(luster)L(abe)ling: flag cross-compartment doublet clusters.

    This function aims to identify cross-compartment doublet clusters.
    Same-compartment doublet clusters look more like transitional cell types,
    closer to homotypic doublets, which are difficult to catch.

    Parameters
    ----------
    adata : AnnData. ``<cluster>_score`` columns are added to ``adata.obs``
        as a side effect of ``sc.tl.score_genes``.
    hi_type : obs key of the high-resolution clusters that may include
        doublet clusters (e.g. ``'leiden_R'``).
    lo_type : obs key of the low-resolution clusters that represent
        compartments (e.g. ``'leiden'``).
    rm_genes : genes to exclude, such as cell-cycle genes
        (e.g. ``['TYMS','MKI67']``).
    print_marker_genes : print the marker genes used for each cluster.
    fraction_threshold : a cluster is called a doublet cluster when
        ``Parent2_count / All_count`` exceeds this value and its two parents
        differ.

    Returns
    -------
    pandas.DataFrame indexed by high-resolution cluster with columns
    ``Parent1``, ``Parent2``, ``Parent1_count``, ``Parent2_count``,
    ``All_count``, ``p_value`` and ``Is_doublet_cluster``. Clusters that were
    skipped keep 0 in every column.
    """
    import time
    import scipy.stats as ss
    DoubletScores=pd.DataFrame(0,index=adata.obs[hi_type].unique(),
                          columns=['Parent1','Parent2','Parent1_count','Parent2_count','All_count','p_value'])
    # parents receive labels and p_value receives floats: give those columns a
    # matching dtype up front instead of relying on silent upcasting
    DoubletScores=DoubletScores.astype({'Parent1':object,'Parent2':object,'p_value':float})
    start_time = time.time()
    # compartment sizes over all cells
    alpha=adata.obs[lo_type].value_counts()
    for j in adata.obs[lo_type].sort_values().unique():
        temp=adata[adata.obs[lo_type]==j][:,~adata.var_names.isin(rm_genes)]
        # nothing to compare when the compartment holds a single cluster
        if len(temp.obs[hi_type].value_counts())==1:
            continue
        sc.tl.rank_genes_groups(temp,groupby=hi_type,n_genes=50)
        Markers=pd.DataFrame(temp.uns['rank_genes_groups']['names'])
        for i in pd.DataFrame(temp.uns['rank_genes_groups']['names']).columns:
            scoring_genes=Markers.loc[~(Markers.loc[:,i].isin(rm_genes)),i]
            if print_marker_genes:
                print(scoring_genes)
            if len(scoring_genes)<20:
                continue
            sc.tl.score_genes(adata,gene_list=scoring_genes,score_name=i+'_score')
            # Parent1: the compartment most cells of the cluster belong to
            DoubletScores.loc[i,'Parent1']=adata[adata.obs[hi_type]==i\
                        ].obs[lo_type].value_counts().index[0]
            cutoff=adata[adata.obs[hi_type]==i].obs[i+'_score'].quantile(q=0.75)

            # compartments of all cells scoring above the cluster's upper quartile
            beta=adata.obs.loc[adata.obs.loc[:,i+'_score'\
                    ]>cutoff,lo_type].value_counts()
            # Parent2: the most frequent compartment among those high-scoring cells
            DoubletScores.loc[i,'Parent2']=beta.index[0]
            # .iloc: the first entry by position, independent of the label type
            DoubletScores.loc[i,'Parent2_count']=beta.iloc[0]
            DoubletScores.loc[i,'Parent1_count']=beta.loc[DoubletScores.loc[i,'Parent1']]
            DoubletScores.loc[i,'All_count']=beta.sum()
            hpd=ss.hypergeom(alpha.sum()-alpha.loc[DoubletScores.loc[i,'Parent1']],
                         alpha.loc[DoubletScores.loc[i,'Parent2']],
                        beta.sum()-beta.loc[DoubletScores.loc[i,'Parent1']])
            # NOTE: this is the hypergeometric point probability (pmf) of the observed count
            DoubletScores.loc[i,'p_value']=hpd.pmf(DoubletScores.loc[i,'Parent2_count'])
    print("--- %s seconds ---" % (time.time() - start_time))
    DoubletScores.loc[:,'Is_doublet_cluster']=(DoubletScores.loc[:,'Parent2_count'] / DoubletScores.loc[:,'All_count'] > fraction_threshold) & \
~(DoubletScores.loc[:,'Parent1'] == DoubletScores.loc[:,'Parent2'])
    return DoubletScores
959 | 
def ClusterGenes(adata,num_pcs=50,embedding='tsne'):
    """Cluster genes instead of cells.

    *adata* is expected to be log-transformed already. A copy is scaled and
    transposed (genes become observations), then PCA, neighbours, an optional
    embedding (``'umap'`` or ``'tsne'``; anything else skips it) and Leiden
    clustering at resolution 0.5 are run. The PCA variance-ratio plot is shown
    along the way.

    Returns the transposed, annotated copy.
    """
    genes_as_obs = adata.copy()
    sc.pp.scale(genes_as_obs)
    genes_as_obs = genes_as_obs.T
    sc.tl.pca(genes_as_obs, n_comps=num_pcs)
    sc.pl.pca_variance_ratio(genes_as_obs, log=True, n_pcs=50)
    sc.pp.neighbors(genes_as_obs, n_pcs=num_pcs)
    if embedding == 'umap':
        sc.tl.umap(genes_as_obs)
    elif embedding == 'tsne':
        sc.tl.tsne(genes_as_obs)
    sc.tl.leiden(genes_as_obs, resolution=0.5)
    return genes_as_obs
976 | 


--------------------------------------------------------------------------------
/pandasPlus.py:
--------------------------------------------------------------------------------
  1 | import gc
  2 | #import scrublet as scr
  3 | import scipy.io
  4 | #import scvelo as scv
  5 | import matplotlib.pyplot as plt
  6 | import networkx as nx
  7 | from matplotlib import rcParams
  8 | import seaborn as sns
  9 | import numpy as np
 10 | import random
 11 | import sys
 12 | from sklearn.manifold import TSNE
 13 | from sklearn.preprocessing import scale
 14 | from sklearn.decomposition import TruncatedSVD
 15 | from sklearn.cluster import SpectralClustering
 16 | from sklearn.model_selection import cross_val_score, StratifiedKFold
 17 | from sklearn.linear_model import LogisticRegression
 18 | from sklearn.feature_extraction.text import TfidfTransformer
 19 | import joblib
#note that it used to be called pySankey with an uppercase "S"
 21 | import scipy.stats
 22 | import numpy as np
 23 | import pandas as pd
 24 | #import scanpy.api as sc
 25 | import anndata
 26 | #import bbknn
 27 | import os
 28 | from scipy import sparse
 29 | from scipy import cluster
 30 | from glob import iglob
 31 | 
 32 | def show_graph_with_labels(adjacency_matrix):
 33 |     rows, cols = np.where(adjacency_matrix.values >= 0.9)
 34 |     edges = zip(rows.tolist(), cols.tolist())
 35 |     gr = nx.Graph()
 36 |     gr.add_edges_from(edges)
 37 |     nx.draw(gr, node_size=100, 
 38 |             labels={i : adjacency_matrix.index.values.tolist()[i] for i in range(0, len(adjacency_matrix.index.values) ) }, 
 39 |             with_labels=True)
 40 |     plt.show()
 41 | 
 42 | def DF2Ann(DF):
 43 |     #This function converts a dataframe to AnnData
 44 |     #Make sure to transpose if needed
 45 |     return(anndata.AnnData(DF))
 46 | 
 47 | def UpSetFromLists(listOflist,labels,size_height=3,showplot=True):
 48 |     from upsetplot import UpSet
 49 |     listall=list(set([j for i in listOflist for j in i]))
 50 |     temp=pd.Series(listall,index=listall)
 51 |     temp2=pd.concat([temp.isin(i) for i in listOflist+[temp]],axis=1)
 52 |     temp2.columns=labels+['all']
 53 |     temp2=temp2.set_index(labels)
 54 |     upset = UpSet(temp2,subset_size='count', intersection_plot_elements=3)
 55 |     if showplot is True:
 56 |         upset.plot()
 57 |     return upset
 58 | 
 59 | def zscore(DF,dropna=True,axis=1):
 60 |     output=pd.DataFrame(scipy.stats.zscore(DF.values,axis=1),
 61 |              index=DF.index,
 62 |             columns=DF.columns)
 63 |     if dropna==True:
 64 |         return output.dropna(axis=1-axis)
 65 |     return output
 66 | 
 67 | def Ginni(beta):
 68 |     #beta is a DataFrame with columns of distributions
 69 |     beta.loc['Ginni']=[np.abs(np.subtract.outer(beta.loc[:,i].values,\
 70 |                beta.loc[:,i].values)).mean()/np.mean(beta.loc[:,i].values)*0.5 for i in beta.columns]
 71 |     return beta
 72 | 
 73 | def cellphonedb_p2adjMat(cpdb_p_loc='pvalues.txt',pval=0.05):
 74 |     #this function reads the p_value file from cellphonedb output and generates a matrix file 
 75 |     #with significant pairs only
 76 |     cpdb_p=pd.read_csv(cpdb_p_loc,sep='\t')
 77 |     cellpairs=pd.Series(cpdb_p.iloc[0,11:].index).str.split('|',expand=True)
 78 |     cell0=cellpairs.loc[:,0].unique()
 79 |     cell1=cellpairs.loc[:,1].unique()
 80 |     InterMat=pd.DataFrame('',index=cell0,columns=cell1)
 81 |     for i in cell0:
 82 |         for j in cell1:
 83 |             InterMat.loc[i,j]= '|'.join(cpdb_p.loc[cpdb_p.loc[:,i+'|'+j]<pval,
 84 |                                                'interacting_pair'].tolist())
 85 |     InterMat.to_csv(cpdb_p_loc+'.pairs'+str(pval)+'.csv')
 86 |     return InterMat
 87 | 
 88 | def cellphonedb_n_interaction_Mat(cpdb_p_loc='pvalues.txt',pval=0.05):
 89 |     #this function reads the p_value file from cellphonedb output and generates a matrix file 
 90 |     #with significant pairs only
 91 |     cpdb_p=pd.read_csv(cpdb_p_loc,sep='\t')
 92 |     cellpairs=pd.Series(cpdb_p.iloc[0,11:].index).str.split('|',expand=True)
 93 |     cell0=cellpairs.loc[:,0].unique()
 94 |     cell1=cellpairs.loc[:,1].unique()
 95 |     InterMat=pd.DataFrame(0,index=cell0,columns=cell1)
 96 |     for i in cell0:
 97 |         for j in cell1:
 98 |             InterMat.loc[i,j]= len(cpdb_p.loc[cpdb_p.loc[:,i+'|'+j]<pval,
 99 |                                                'interacting_pair'])
100 |     InterMat.to_csv(cpdb_p_loc+'.n_pairs'+str(pval)+'.csv')
101 |     return InterMat
102 | 
def cellphonedb_mat_per_interaction(interacting_pair,cpdb_p_loc='pvalues.txt'):
    """Return the cell-type x cell-type p-value matrix of one interaction.

    Parameters
    ----------
    interacting_pair : str
        Value of the ``interacting_pair`` column to look up.
    cpdb_p_loc : str
        Tab-separated p-value file of a cellphonedb run, in which the columns
        from position 11 onwards are named ``'<cellA>|<cellB>'``.

    Returns
    -------
    pandas.DataFrame
        Float matrix (first cell types as rows, second as columns); it is also
        written to ``cpdb_p_loc + '.pairs' + interacting_pair + '.csv'``.

    Raises
    ------
    ValueError
        If the interaction is not found exactly once in the file.
    """
    cpdb_p = pd.read_csv(cpdb_p_loc, sep='\t')
    cellpairs = pd.Series(cpdb_p.columns[11:]).str.split('|', expand=True)
    cell0 = cellpairs.loc[:, 0].unique()
    cell1 = cellpairs.loc[:, 1].unique()

    # look the interaction up once instead of once per matrix cell
    matches = cpdb_p.loc[cpdb_p.loc[:, 'interacting_pair'].isin([interacting_pair])]
    if len(matches) != 1:
        raise ValueError(
            f'expected exactly one row for interacting_pair {interacting_pair!r} '
            f'in {cpdb_p_loc}, found {len(matches)}')
    row = matches.iloc[0]

    # float matrix: the cells hold p-values
    InterMat = pd.DataFrame(0.0, index=cell0, columns=cell1)
    for i in cell0:
        for j in cell1:
            InterMat.loc[i, j] = row[i + '|' + j]
    InterMat.to_csv(cpdb_p_loc + '.pairs' + interacting_pair + '.csv')
    return InterMat
115 | 
def ListsOverlap(A,B):
    """Count the shared elements between each list in A and each list in B.

    *A* and *B* are both lists of lists. The result is a DataFrame with one
    row per list of *A* (``A1``, ``A2``, ...) and one column per list of *B*
    (``B1``, ``B2``, ...) holding the number of distinct common elements.
    """
    counts = [
        [len(a_items.intersection(b_list)) for b_list in B]
        for a_items in map(set, A)
    ]
    row_names = [f"A{position}" for position in range(1, len(A) + 1)]
    column_names = [f"B{position}" for position in range(1, len(B) + 1)]
    return pd.DataFrame(counts, columns=column_names, index=row_names)
128 | 


--------------------------------------------------------------------------------