├── Data ├── Gene.gtf ├── TE.bed ├── test.bam └── test.exclusive.idx ├── LICENSE ├── README.md ├── bin ├── scTE ├── scTEATAC ├── scTEATAC_build └── scTE_build ├── docs └── scTE.png ├── example ├── Figure3 │ ├── 0.cluster_scripts │ │ ├── scte │ │ │ ├── do_batch.sh │ │ │ └── scte.sh │ │ └── starsolo │ │ │ ├── do_batch.sh │ │ │ └── starsolo.sh │ ├── 1.pack.py │ ├── 2.norm_and_learn.py │ ├── 3.diffexp.py │ ├── 4.plots-allgenes.py │ ├── 4.plots-alltes.py │ ├── 4.plots-specific-tes.py │ ├── 5.marker_genes-leiden-0.2.py │ ├── 5.marker_genes-small-grp_cut.py │ ├── 5.marker_genes-small.py │ ├── 5.marker_genes.py │ └── TE_genes_id.mm10.txt.gz ├── Figure4.ipynb └── Figure6.ipynb ├── scTE ├── __init__.py ├── annotation.py ├── base.py ├── miniglbase │ ├── README.md │ ├── __init__.py │ ├── base_genelist.py │ ├── config.py │ ├── genelist.py │ ├── location.py │ └── utils.py └── scatacseq.py ├── setup.py └── test.sh /Data/Gene.gtf: -------------------------------------------------------------------------------- 1 | ##description: evidence-based annotation of the mouse genome (GRCm38), version M21 (Ensembl 96) 2 | ##provider: GENCODE 3 | ##contact: gencode-help@ebi.ac.uk 4 | ##format: gtf 5 | ##date: 2019-03-27 6 | chr1 HAVANA gene 3073253 3074322 . + . gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; havana_gene "OTTMUSG00000049935.1"; 7 | chr1 HAVANA transcript 3073253 3074322 . + . gene_id "ENSMUSG00000102693.1"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1"; 8 | chr1 HAVANA exon 3073253 3074322 . + . 
gene_id "ENSMUSG00000102693.1"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; exon_number 1; exon_id "ENSMUSE00001343744.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1"; 9 | chr1 ENSEMBL gene 3102016 3102125 . + . gene_id "ENSMUSG00000064842.1"; gene_type "snRNA"; gene_name "Gm26206"; level 3; 10 | chr1 ENSEMBL transcript 3102016 3102125 . + . gene_id "ENSMUSG00000064842.1"; transcript_id "ENSMUST00000082908.1"; gene_type "snRNA"; gene_name "Gm26206"; transcript_type "snRNA"; transcript_name "Gm26206-201"; level 3; transcript_support_level "NA"; tag "basic"; 11 | chr1 ENSEMBL exon 3102016 3102125 . + . gene_id "ENSMUSG00000064842.1"; transcript_id "ENSMUST00000082908.1"; gene_type "snRNA"; gene_name "Gm26206"; transcript_type "snRNA"; transcript_name "Gm26206-201"; exon_number 1; exon_id "ENSMUSE00000522066.1"; level 3; transcript_support_level "NA"; tag "basic"; 12 | chr1 HAVANA gene 3205901 3671498 . - . gene_id "ENSMUSG00000051951.5"; gene_type "protein_coding"; gene_name "Xkr4"; level 2; havana_gene "OTTMUSG00000026353.2"; 13 | chr1 HAVANA transcript 3205901 3216344 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1"; 14 | chr1 HAVANA exon 3213609 3216344 . - . 
gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; exon_number 1; exon_id "ENSMUSE00000858910.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1"; 15 | chr1 HAVANA exon 3205901 3207317 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; exon_number 2; exon_id "ENSMUSE00000866652.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1"; 16 | chr1 HAVANA transcript 3206523 3215632 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1"; 17 | chr1 HAVANA exon 3213439 3215632 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; exon_number 1; exon_id "ENSMUSE00000863980.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1"; 18 | chr1 HAVANA exon 3206523 3207317 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; exon_number 2; exon_id "ENSMUSE00000867897.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1"; 19 | chr1 HAVANA transcript 3214482 3671498 . - . 
gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 20 | chr1 HAVANA exon 3670552 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 21 | chr1 HAVANA CDS 3670552 3671348 . - 0 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 22 | chr1 HAVANA start_codon 3671346 3671348 . - 0 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 23 | chr1 HAVANA exon 3421702 3421901 . - . 
gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 2; exon_id "ENSMUSE00000449517.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 24 | chr1 HAVANA CDS 3421702 3421901 . - 1 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 2; exon_id "ENSMUSE00000449517.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 25 | chr1 HAVANA exon 3214482 3216968 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 26 | chr1 HAVANA CDS 3216025 3216968 . - 2 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 27 | chr1 HAVANA stop_codon 3216022 3216024 . 
- 0 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 28 | chr1 HAVANA UTR 3671349 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 29 | chr1 HAVANA UTR 3214482 3216024 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1"; 30 | chr1 HAVANA gene 3252757 3253236 . + . gene_id "ENSMUSG00000102851.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; level 1; tag "pseudo_consens"; havana_gene "OTTMUSG00000049958.1"; 31 | chr1 HAVANA transcript 3252757 3253236 . + . 
gene_id "ENSMUSG00000102851.1"; transcript_id "ENSMUST00000192857.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; transcript_type "processed_pseudogene"; transcript_name "Gm18956-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049958.1"; havana_transcript "OTTMUST00000127143.1"; 32 | chr1 HAVANA exon 3252757 3253236 . + . gene_id "ENSMUSG00000102851.1"; transcript_id "ENSMUST00000192857.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; transcript_type "processed_pseudogene"; transcript_name "Gm18956-201"; exon_number 1; exon_id "ENSMUSE00001339323.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049958.1"; havana_transcript "OTTMUST00000127143.1"; 33 | chr1 HAVANA gene 3365731 3368549 . - . gene_id "ENSMUSG00000103377.1"; gene_type "TEC"; gene_name "Gm37180"; level 2; havana_gene "OTTMUSG00000049960.1"; 34 | chr1 HAVANA transcript 3365731 3368549 . - . gene_id "ENSMUSG00000103377.1"; transcript_id "ENSMUST00000195335.1"; gene_type "TEC"; gene_name "Gm37180"; transcript_type "TEC"; transcript_name "Gm37180-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049960.1"; havana_transcript "OTTMUST00000127145.1"; 35 | chr1 HAVANA exon 3365731 3368549 . - . gene_id "ENSMUSG00000103377.1"; transcript_id "ENSMUST00000195335.1"; gene_type "TEC"; gene_name "Gm37180"; transcript_type "TEC"; transcript_name "Gm37180-201"; exon_number 1; exon_id "ENSMUSE00001343189.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049960.1"; havana_transcript "OTTMUST00000127145.1"; 36 | chr1 HAVANA gene 3375556 3377788 . - . gene_id "ENSMUSG00000104017.1"; gene_type "TEC"; gene_name "Gm37363"; level 2; havana_gene "OTTMUSG00000049961.1"; 37 | chr1 HAVANA transcript 3375556 3377788 . - . 
gene_id "ENSMUSG00000104017.1"; transcript_id "ENSMUST00000192336.1"; gene_type "TEC"; gene_name "Gm37363"; transcript_type "TEC"; transcript_name "Gm37363-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049961.1"; havana_transcript "OTTMUST00000127146.1"; 38 | chr1 HAVANA exon 3375556 3377788 . - . gene_id "ENSMUSG00000104017.1"; transcript_id "ENSMUST00000192336.1"; gene_type "TEC"; gene_name "Gm37363"; transcript_type "TEC"; transcript_name "Gm37363-201"; exon_number 1; exon_id "ENSMUSE00001343686.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049961.1"; havana_transcript "OTTMUST00000127146.1"; 39 | chr1 HAVANA gene 3464977 3467285 . - . gene_id "ENSMUSG00000103025.1"; gene_type "TEC"; gene_name "Gm37686"; level 2; havana_gene "OTTMUSG00000049930.1"; 40 | chr1 HAVANA transcript 3464977 3467285 . - . gene_id "ENSMUSG00000103025.1"; transcript_id "ENSMUST00000194099.1"; gene_type "TEC"; gene_name "Gm37686"; transcript_type "TEC"; transcript_name "Gm37686-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049930.1"; havana_transcript "OTTMUST00000127101.1"; 41 | chr1 HAVANA exon 3464977 3467285 . - . gene_id "ENSMUSG00000103025.1"; transcript_id "ENSMUST00000194099.1"; gene_type "TEC"; gene_name "Gm37686"; transcript_type "TEC"; transcript_name "Gm37686-201"; exon_number 1; exon_id "ENSMUSE00001337180.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049930.1"; havana_transcript "OTTMUST00000127101.1"; 42 | chr1 HAVANA gene 3466587 3513553 . + . gene_id "ENSMUSG00000089699.1"; gene_type "antisense"; gene_name "Gm1992"; level 2; havana_gene "OTTMUSG00000026352.1"; 43 | chr1 HAVANA transcript 3466587 3513553 . + . 
gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1"; 44 | chr1 HAVANA exon 3466587 3466687 . + . gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; exon_number 1; exon_id "ENSMUSE00000869502.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1"; 45 | chr1 HAVANA exon 3513405 3513553 . + . gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; exon_number 2; exon_id "ENSMUSE00000864479.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1"; 46 | chr1 HAVANA gene 3512451 3514507 . - . gene_id "ENSMUSG00000103201.1"; gene_type "TEC"; gene_name "Gm37329"; level 2; havana_gene "OTTMUSG00000049929.1"; 47 | chr1 HAVANA transcript 3512451 3514507 . - . gene_id "ENSMUSG00000103201.1"; transcript_id "ENSMUST00000192973.1"; gene_type "TEC"; gene_name "Gm37329"; transcript_type "TEC"; transcript_name "Gm37329-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049929.1"; havana_transcript "OTTMUST00000127100.1"; 48 | chr1 HAVANA exon 3512451 3514507 . - . 
gene_id "ENSMUSG00000103201.1"; transcript_id "ENSMUST00000192973.1"; gene_type "TEC"; gene_name "Gm37329"; transcript_type "TEC"; transcript_name "Gm37329-201"; exon_number 1; exon_id "ENSMUSE00001345667.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049929.1"; havana_transcript "OTTMUST00000127100.1"; 49 | chr1 HAVANA gene 3531795 3532720 . + . gene_id "ENSMUSG00000103147.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; level 1; tag "pseudo_consens"; havana_gene "OTTMUSG00000049921.1"; 50 | chr1 HAVANA transcript 3531795 3532720 . + . gene_id "ENSMUSG00000103147.1"; transcript_id "ENSMUST00000192183.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; transcript_type "processed_pseudogene"; transcript_name "Gm7341-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049921.1"; havana_transcript "OTTMUST00000127089.1"; 51 | chr1 HAVANA exon 3531795 3532720 . + . gene_id "ENSMUSG00000103147.1"; transcript_id "ENSMUST00000192183.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; transcript_type "processed_pseudogene"; transcript_name "Gm7341-201"; exon_number 1; exon_id "ENSMUSE00001343235.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049921.1"; havana_transcript "OTTMUST00000127089.1"; 52 | chr1 HAVANA gene 3592892 3595903 . - . gene_id "ENSMUSG00000103161.1"; gene_type "TEC"; gene_name "Gm38148"; level 2; havana_gene "OTTMUSG00000049927.1"; 53 | chr1 HAVANA transcript 3592892 3595903 . - . gene_id "ENSMUSG00000103161.1"; transcript_id "ENSMUST00000195166.1"; gene_type "TEC"; gene_name "Gm38148"; transcript_type "TEC"; transcript_name "Gm38148-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049927.1"; havana_transcript "OTTMUST00000127098.1"; 54 | chr1 HAVANA exon 3592892 3595903 . - . 
gene_id "ENSMUSG00000103161.1"; transcript_id "ENSMUST00000195166.1"; gene_type "TEC"; gene_name "Gm38148"; transcript_type "TEC"; transcript_name "Gm38148-201"; exon_number 1; exon_id "ENSMUSE00001343966.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049927.1"; havana_transcript "OTTMUST00000127098.1"; 55 | chr1 HAVANA gene 3647309 3658904 . - . gene_id "ENSMUSG00000102331.1"; gene_type "sense_intronic"; gene_name "Gm19938"; level 2; havana_gene "OTTMUSG00000049924.1"; 56 | chr1 HAVANA transcript 3647309 3658904 . - . gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1"; 57 | chr1 HAVANA exon 3658847 3658904 . - . gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; exon_number 1; exon_id "ENSMUSE00001337496.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1"; 58 | chr1 HAVANA exon 3647309 3650509 . - . gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; exon_number 2; exon_id "ENSMUSE00001339227.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1"; 59 | chr1 HAVANA gene 3680155 3681788 . + . gene_id "ENSMUSG00000102348.1"; gene_type "TEC"; gene_name "Gm10568"; level 2; havana_gene "OTTMUSG00000049922.1"; 60 | chr1 HAVANA transcript 3680155 3681788 . + . 
gene_id "ENSMUSG00000102348.1"; transcript_id "ENSMUST00000193244.1"; gene_type "TEC"; gene_name "Gm10568"; transcript_type "TEC"; transcript_name "Gm10568-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049922.1"; havana_transcript "OTTMUST00000127090.1"; 61 | chr1 HAVANA exon 3680155 3681788 . + . gene_id "ENSMUSG00000102348.1"; transcript_id "ENSMUST00000193244.1"; gene_type "TEC"; gene_name "Gm10568"; transcript_type "TEC"; transcript_name "Gm10568-201"; exon_number 1; exon_id "ENSMUSE00001341983.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049922.1"; havana_transcript "OTTMUST00000127090.1"; 62 | chr1 HAVANA gene 3752010 3754360 . + . gene_id "ENSMUSG00000102592.1"; gene_type "TEC"; gene_name "Gm38385"; level 2; havana_gene "OTTMUSG00000049923.1"; 63 | chr1 HAVANA transcript 3752010 3754360 . + . gene_id "ENSMUSG00000102592.1"; transcript_id "ENSMUST00000194454.1"; gene_type "TEC"; gene_name "Gm38385"; transcript_type "TEC"; transcript_name "Gm38385-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049923.1"; havana_transcript "OTTMUST00000127091.1"; 64 | chr1 HAVANA exon 3752010 3754360 . + . gene_id "ENSMUSG00000102592.1"; transcript_id "ENSMUST00000194454.1"; gene_type "TEC"; gene_name "Gm38385"; transcript_type "TEC"; transcript_name "Gm38385-201"; exon_number 1; exon_id "ENSMUSE00001342074.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049923.1"; havana_transcript "OTTMUST00000127091.1"; 65 | chr1 ENSEMBL gene 3783876 3783933 . - . gene_id "ENSMUSG00000088333.2"; gene_type "snRNA"; gene_name "Gm27396"; level 3; 66 | chr1 ENSEMBL transcript 3783876 3783933 . - . gene_id "ENSMUSG00000088333.2"; transcript_id "ENSMUST00000157708.2"; gene_type "snRNA"; gene_name "Gm27396"; transcript_type "snRNA"; transcript_name "Gm27396-201"; level 3; transcript_support_level "NA"; tag "basic"; 67 | chr1 ENSEMBL exon 3783876 3783933 . 
- . gene_id "ENSMUSG00000088333.2"; transcript_id "ENSMUST00000157708.2"; gene_type "snRNA"; gene_name "Gm27396"; transcript_type "snRNA"; transcript_name "Gm27396-201"; exon_number 1; exon_id "ENSMUSE00000846843.2"; level 3; transcript_support_level "NA"; tag "basic"; 68 | chr1 HAVANA gene 3905739 3986215 . - . gene_id "ENSMUSG00000102343.1"; gene_type "lincRNA"; gene_name "Gm37381"; level 2; havana_gene "OTTMUSG00000049934.1"; 69 | chr1 HAVANA transcript 3905739 3986215 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1"; 70 | chr1 HAVANA exon 3986147 3986215 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 1; exon_id "ENSMUSE00001344134.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1"; 71 | chr1 HAVANA exon 3985160 3985351 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 2; exon_id "ENSMUSE00001337703.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1"; 72 | chr1 HAVANA exon 3905739 3906134 . - . 
gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 3; exon_id "ENSMUSE00001345637.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1"; 73 | chr1 HAVANA transcript 3984225 3985984 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1"; 74 | chr1 HAVANA exon 3985160 3985984 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; exon_number 1; exon_id "ENSMUSE00001340315.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1"; 75 | chr1 HAVANA exon 3984225 3984298 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; exon_number 2; exon_id "ENSMUSE00001340468.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1"; 76 | chr1 HAVANA gene 3999557 4409241 . - . gene_id "ENSMUSG00000025900.12"; gene_type "protein_coding"; gene_name "Rp1"; level 2; tag "overlapping_locus"; havana_gene "OTTMUSG00000049985.3"; 77 | chr1 HAVANA transcript 3999557 4409241 . - . 
gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 78 | chr1 HAVANA exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 79 | chr1 HAVANA CDS 4409170 4409187 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 80 | chr1 HAVANA start_codon 4409185 4409187 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 81 | chr1 HAVANA exon 4352202 4352837 . - . 
gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 2; exon_id "ENSMUSE00001403780.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 82 | chr1 HAVANA CDS 4352202 4352837 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 2; exon_id "ENSMUSE00001403780.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 83 | chr1 HAVANA exon 4351910 4352081 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 3; exon_id "ENSMUSE00001396015.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 84 | chr1 HAVANA CDS 4351910 4352081 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 3; exon_id "ENSMUSE00001396015.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 85 | chr1 HAVANA exon 4311270 4311433 . - . 
gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 4; exon_id "ENSMUSE00001380053.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 86 | chr1 HAVANA CDS 4311270 4311433 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 4; exon_id "ENSMUSE00001380053.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 87 | chr1 HAVANA exon 4292926 4293012 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 5; exon_id "ENSMUSE00001377871.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 88 | chr1 HAVANA CDS 4292926 4293012 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 5; exon_id "ENSMUSE00001377871.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 89 | chr1 HAVANA exon 4284766 4284898 . - . 
gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 6; exon_id "ENSMUSE00001379434.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 90 | chr1 HAVANA CDS 4284766 4284898 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 6; exon_id "ENSMUSE00001379434.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 91 | chr1 HAVANA exon 4267469 4267620 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 7; exon_id "ENSMUSE00001379919.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 92 | chr1 HAVANA CDS 4267469 4267620 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 7; exon_id "ENSMUSE00001379919.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 93 | chr1 HAVANA exon 4261527 4261605 . - . 
gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 8; exon_id "ENSMUSE00001380048.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 94 | chr1 HAVANA CDS 4261527 4261605 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 8; exon_id "ENSMUSE00001380048.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 95 | chr1 HAVANA exon 4245031 4245106 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 9; exon_id "ENSMUSE00001382043.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 96 | chr1 HAVANA CDS 4245031 4245106 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 9; exon_id "ENSMUSE00001382043.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 97 | chr1 HAVANA exon 4243543 4243619 . - . 
gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 10; exon_id "ENSMUSE00001379965.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 98 | chr1 HAVANA CDS 4243543 4243619 . - 1 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 10; exon_id "ENSMUSE00001379965.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 99 | chr1 HAVANA exon 4243417 4243448 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 11; exon_id "ENSMUSE00001379150.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 100 | chr1 HAVANA CDS 4243417 4243448 . 
- 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 11; exon_id "ENSMUSE00001379150.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1"; 101 | -------------------------------------------------------------------------------- /Data/TE.bed: -------------------------------------------------------------------------------- 1 | chr3 144583200 144583342 B1_Mur4 0 - 2 | chr6 86389924 86389960 B2_Mm2 0 + 3 | chr7 5364171 5364232 PB1D10 0 + 4 | chr10 55902552 55902867 LTR80B 0 + 5 | chr12 56707313 56707382 B1F 0 + 6 | chr2 62000937 62001039 RMER15 0 + 7 | chr13 67837236 67837625 MTC 0 - 8 | chr13 97860467 97860597 ID_B1 0 - 9 | chr3 129323773 129323852 ID4_ 0 - 10 | chr15 53302093 53302237 B1_Mur4 0 - 11 | chr3 17544777 17545068 MTE2a 0 - 12 | chr14 114380245 114381362 Lx3A 0 - 13 | chr14 36135784 36136221 MLT1G1 0 - 14 | chr9 3382929 3383043 B2_Mm2 0 - 15 | chr2 23523042 23524033 L1Md_F2 0 + 16 | chr10 130416389 130416521 Lx7 0 + 17 | chr10 124812631 124812919 LTR16B 0 - 18 | chr8 121282143 121282358 ORR1G 0 - 19 | chrX 56261784 56261888 B4A 0 - 20 | chr12 19314026 19314159 L2a 0 - 21 | chr13 34470884 34476084 L1Md_A 0 + 22 | chr1 15430986 15431050 MLT1O 0 - 23 | chr11 97176772 97176823 B4 0 + 24 | chr6 120487970 120488131 B2_Mm2 0 - 25 | chr2 112370309 112370404 PB1D9 0 - 26 | chr14 11380848 11380988 L1MB7 0 + 27 | chr7 125706670 125706784 PB1D9 0 - 28 | chr1 119963513 119963866 Lx8 0 + 29 | chr14 121217593 121217684 RLTR20A4 0 - 30 | chr13 14527292 14527394 Lx8b 0 + 31 | chrX 113068169 113068313 B1_Mm 0 - 32 | chr7 21774699 21774922 RMER19B2 0 + 33 | chr3 104611578 104611728 B3A 0 - 34 | chr2 158183914 158183943 B1F1 0 + 35 | chrX 83091173 83091268 PB1D7 0 + 36 | 
chrY 18505375 18507434 L1_Mus3 0 - 37 | chrY 53460095 53460226 B1_Mus2 0 + 38 | chr18 56988834 56988941 L3 0 + 39 | chr15 46551396 46551807 MMERVK10C-int 0 - 40 | chr18 79506187 79506333 B1_Mm 0 - 41 | chr2 104648414 104648547 B1_Mur2 0 - 42 | chr7 109416903 109417032 Lx7 0 + 43 | chr1 33863431 33863563 ID_B1 0 - 44 | chr4 148585303 148585574 RLTR19-int 0 - 45 | chr2 164776167 164776283 B1_Mur2 0 + 46 | chr2 155889136 155889458 MLTR11B 0 + 47 | chr1 140608946 140609064 RMER13A2 0 - 48 | chr11 50474308 50474667 ORR1A2 0 + 49 | chr3 35549471 35549633 Lx7 0 - 50 | chr18 20885705 20885850 B1_Mus1 0 + 51 | chr9 98122822 98123031 URR1B 0 + 52 | chr5 145787688 145787824 RSINE1 0 + 53 | chr9 116910264 116910518 B4 0 + 54 | chr2 118982678 118982802 L1MB8 0 - 55 | chr1 74231577 74231701 ID_B1 0 - 56 | chr3 51388265 51388358 PB1D7 0 + 57 | chr1 78437903 78438016 ID_B1 0 + 58 | chr1 179450543 179450599 PB1D9 0 + 59 | chr11 106956412 106956506 B1F 0 - 60 | chr7 105070982 105071111 B1F 0 + 61 | chr14 55891766 55891869 B1F2 0 + 62 | chr3 95002315 95002463 B1_Mm 0 + 63 | chr14 123443243 123443788 L1_Mus1 0 + 64 | chr9 84553142 84553311 ID_B1 0 - 65 | chrX 74054421 74054609 B2_Mm2 0 - 66 | chr2 50599335 50599996 L1_Mur2 0 + 67 | chr11 10009054 10009447 RLTR47 0 + 68 | chr14 14575064 14575178 B2_Mm2 0 - 69 | chrX 66050795 66051345 L1Md_F2 0 + 70 | chr4 109302482 109302690 B3 0 + 71 | chr6 5823803 5823847 MLT1B 0 + 72 | chr9 94472366 94472513 B1_Mus1 0 - 73 | chr2 7172981 7173150 Tigger19a 0 + 74 | chr9 33581540 33581630 B3A 0 + 75 | chr1 60831307 60832014 L1_Mur3 0 - 76 | chr2 16821242 16821456 RMER15-int 0 - 77 | chr7 142943894 142944262 ORR1C2 0 + 78 | chr12 73440499 73440743 B4 0 - 79 | chrX 90113268 90113445 B3 0 + 80 | chr18 20618867 20619808 L1M3e 0 + 81 | chr9 114718823 114718968 B1_Mm 0 - 82 | chr11 12670894 12671016 MIR 0 - 83 | chr13 32387251 32387629 MLT1D 0 + 84 | chrX 97791970 97792192 URR1A 0 + 85 | chr13 76374166 76374333 ERVB4_1B-I_MM-int 0 - 86 | chr5 47907546 
47907672 Lx10 0 + 87 | chr16 8837567 8837715 B1_Mus1 0 - 88 | chr4 150767884 150768026 B1_Mur4 0 + 89 | chr10 99491068 99491203 B1_Mur3 0 - 90 | chr17 90847952 90849043 L1_Mus3 0 + 91 | chr2 99125206 99126690 L1_Mur3 0 - 92 | chr13 47581815 47582027 B4A 0 + 93 | chr6 99194219 99194361 MER117 0 + 94 | chr14 30096250 30096453 B3 0 + 95 | chr13 24443830 24443959 PB1D10 0 - 96 | chr13 28396991 28397036 L1Md_F2 0 + 97 | chr1 165293822 165294036 B3 0 + 98 | chr7 17889030 17889430 RMER6BA 0 + 99 | chr9 25684255 25684348 MLT2D 0 - 100 | chr6 119394571 119394668 PB1D7 0 + 101 | chr19 119394571 119394668 test 0 + 102 | chrM 1193971 1193968 test 0 + 103 | -------------------------------------------------------------------------------- /Data/test.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/Data/test.bam -------------------------------------------------------------------------------- /Data/test.exclusive.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/Data/test.exclusive.idx -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jiangping He, Andrew P. 
Hutchins & Jiekai Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scTE 2 | ============== 3 | 4 | Quantifying transposable element (TEs) expression from single-cell sequencing data 5 | ---------------------------------------------------------------------- 6 | [![DOI](https://zenodo.org/badge/190696033.svg)](https://zenodo.org/badge/190696033.svg) 7 | 8 | scTE takes as input: 9 | 10 | * Aligned sequence reads (BAM/SAM format) 11 | * The genomic location of TEs (BED format) 12 | * The genomic location of genes (GTF format) 13 | 14 | 15 | ![scTE workflow](./docs/scTE.png) 16 | 17 | 18 | Installation 19 | ------------ 20 | scTE works with python >=3.6. 
21 | 22 | ```bash 23 | $ git clone https://github.com/JiekaiLab/scTE.git 24 | $ cd scTE 25 | $ python setup.py install 26 | ``` 27 | 28 | Usage 29 | ----- 30 | 31 | **Building genome indices**
32 | scTE builds genome indices for the fast alignment of reads to genes and TEs. These indices can be automatically generated using the commands: 33 | 34 | ```bash 35 | $ scTE_build -g mm10 # Mouse 36 | $ scTE_build -g hg38 # Human 37 | $ scTE_build -g panTro6 # Chimpanzee 38 | $ scTE_build -g macFas5 # Macaca fascicularis 39 | $ scTE_build -g dm6 # Drosophila melanogaster 40 | $ scTE_build -g danRer11 # Zebrafish 41 | $ scTE_build -g xenTro9 # Xenopus tropicalis 42 | ``` 43 | 44 | These scripts will automatically download the genome annotations, for mouse: 45 | 46 | ```bash 47 | $ ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz 48 | $ http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz 49 | ``` 50 | 51 | Or for human: 52 | 53 | ```bash 54 | $ ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz 55 | $ http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz 56 | ``` 57 | 58 | Or for Chimpanzee: 59 | 60 | ```bash 61 | $ http://ftp.ensembl.org/pub/release-103/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.103.gtf.gz 62 | $ https://hgdownload.soe.ucsc.edu/goldenPath/panTro6/database/rmsk.txt.gz 63 | ``` 64 | 65 | Or for Macaca fascicularis: 66 | 67 | ```bash 68 | $ http://ftp.ensembl.org/pub/release-102/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_5.0.102.gtf.gz 69 | $ http://hgdownload.soe.ucsc.edu/goldenPath/macFas5/database/rmsk.txt.gz 70 | ``` 71 | 72 | Or for Drosophila melanogaster: 73 | 74 | ```bash 75 | $ http://ftp.ensembl.org/pub/release-103/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.103.gtf.gz 76 | $ http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz 77 | ``` 78 | 79 | Or for Zebrafish: 80 | 81 | ```bash 82 | $ http://ftp.ensembl.org/pub/release-103/gtf/danio_rerio/Danio_rerio.GRCz11.103.gtf.gz 83 | $ https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/database/rmsk.txt.gz 84 | ``` 
85 | 86 | Or for Xenopus tropicalis: 87 | 88 | ```bash 89 | $ http://ftp.ensembl.org/pub/release-103/gtf/xenopus_tropicalis/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.103.gtf.gz 90 | $ https://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/database/rmsk.txt.gz 91 | ``` 92 | 93 | `mm10, hg38, panTro6, macFas5, dm6, danRer11, xenTro9` are the genome assembly versions. 94 | If you want to use a custom reference, you can use the `-te` and `-gene` options: 95 | 96 | ``` 97 | scTE_build -te TEs.bed -gene Genes.gtf -o custome 98 | 99 | -te 100 | Six-column BED file with the transposable element annotation. 101 | -gene 102 | GTF file with the gene annotation. 103 | ``` 104 | For more information about the BED and GTF formats, see [UCSC](https://genome.ucsc.edu/FAQ/FAQformat). 105 | These annotations are then processed and converted into genome indices. The scTE algorithm will allocate 106 | reads first to gene exons, and then to TEs by default. Hence TEs inside exon/UTR regions of genes annotated 107 | in GENCODE will only contribute to the gene, and not to the TE score. This feature can be changed by 108 | setting `–mode/-m inclusive` in scTE, which will instruct scTE to assign the reads to both TEs and genes 109 | if a read comes from a TE inside exon/UTR regions of genes. If you want to remove the TEs inside the introns 110 | of genes, you can set `–mode/-m nointron` in scTE. 111 | 112 | **Analysis of 10x style scRNA-seq data** 113 | 114 | scTE takes a BAM/SAM file as input; we highly recommend using the unfiltered alignment file as input. 
115 | 116 | For a `bam` file generated by [STARsolo](https://github.com/alexdobin/STAR) etc., the cell barcodes and UMIs need to be stored in the 'CR:Z' and 'UR:Z' read tags, respectively, as below: 117 | 118 | ```bash 119 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx --hdf5 True -CB CR -UMI UR 120 | ``` 121 | ```bash 122 | $ samtools view test.bam 123 | A00269:12:H7YF2DMXX:2 0 chr10 55902580 255 50M * 0 0 GTTCTCTCCGTATGTGAGCATGGGAGATACATCCCAGAAAGGCAGAAGGG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:CTAGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF 124 | A00269:13:H7YF2DMXX:2 0 chr10 55902784 255 50M * 0 0 ATAATCTTTGAGATCTCTGGTGAAAATAAGTAGCATAAAGGACAGAATCA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:CTAGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF 125 | A00269:14:H7YF2DMXX:2 0 chr13 67837311 255 50M * 0 0 CTGTTCATTATTTGAGGAAATCAGGACAGGAAATCAAACATGGCAGAATC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:ATCGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF 126 | A00269:15:H7YF2DMXX:2 0 chr14 114380523 255 50M * 0 0 GATCCAGATTAATTGAGACTGTTGATCCTCCTACAGGGTCGCCCTTCTCC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:CTAGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF 127 | ``` 128 | 129 | For a `bam` file generated by [Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) etc., the cell barcodes and UMIs need to be stored in the 'CB:Z' and 'UB:Z' read tags, respectively, as below: 130 | 131 | ```bash 132 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx --hdf5 True -CB CB -UMI UB 133 | ``` 134 | ```bash 135 | $ samtools view test.bam 136 | A00519:758:HTCCHDSXY:3:2535:21296:19774 16 chr1 14021 0 90M * 0 0 TGGATTTCTATCTCCCTGGCTTGGTGCCAGTTCCTCCAAGTCGATGGCACCTCCCTCCCTCTCAACCACTTGAGCAAACTCCAAGACATC 
,FFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFF:FFFFF NH:i:5 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3 RE:A:I xf:i:0 CR:Z:CTCCCTCCACTGCGAC CY:Z:FFFFFFFFFFFFFFFF CB:Z:CTCCCTCCACTGCGAC-1 UR:Z:AAGGCGTAGTAG UY:Z:FFFFFFFFFFFF UB:Z:AAGGCGTAGTAG 137 | A00519:758:HTCCHDSXY:1:1355:17237:31720 0 chr1 14260 0 90M * 0 0 CTCCCTCTCATCCCAGAGAAACAGGTCAGCTGGGAGCTTCTGCCCCCACTGCCTAGGGACCAACAGGGGCAGGAGGCAGTCACTGACCCC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:5 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:1 RE:A:I xf:i:0 CR:Z:TCGTCCACAGTATGAA CY:Z:FFFFFFFFFFFFFFFF CB:Z:TCGTCCACAGTATGAA-1 UR:Z:GACTTATTTTTT UY:Z:FFFFFFFFFFFF UB:Z:GACTTATTTTTT 138 | A00519:758:HTCCHDSXY:3:2227:16703:32080 16 chr1 14411 1 90M * 0 0 TCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG FFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFF:FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:3 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3 RE:A:I xf:i:0 CR:Z:TTGAGTGGTTGTGGCC CY:Z:FFFFFFFFFFFFFFFF CB:Z:TTGAGTGGTTGTGGCC-1 UR:Z:TATAATGCTCAG UY:Z:FFFFFFFFFFFF UB:Z:TATAATGCTCAG 139 | A00519:758:HTCCHDSXY:3:2563:23665:33802 16 chr1 14411 1 90M * 0 0 TCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG FFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:3 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3 RE:A:I xf:i:0 CR:Z:TGTTGAGAGGCAATGC CY:Z:FFFFFFFFFFFFFFFF CB:Z:TGTTGAGAGGCAATGC-1 UR:Z:ACGGGTGTGGAG UY:Z:FFFFFFFFFFFF UB:Z:ACGGGTGTGGAG 140 | ``` 141 | ``` 142 | -i 143 | Input file: BAM/SAM file from CellRanger or STARsolo 144 | -o 145 | Output file prefix 146 | -x 147 | The filename of the index for the reference genome annotation generated by scTE_build 148 | -p 149 
| Number of threads to use, Default: 1. scTE needs ~10 GB of memory per thread for the human and mouse genomes. 150 | --hdf5 151 | Save the output as a .h5ad formatted file instead of a csv file. Default: False 152 | ``` 153 | 154 | scTE is tuned for the [STARsolo](https://github.com/alexdobin/STAR) or the [Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline outputs, 155 | and can accept BAM files produced by either of these two programs. 156 | For other aligners, the barcode should be stored in the `CR:Z` or `CB:Z` tag, and the UMI in the `UR:Z` or `UB:Z` tag in the BAM file. 157 | 158 | **Analysis of C1 style scRNA-seq data**
159 | If the UMI is missing or not used in the scRNA-seq technology (for example on the Fluidigm C1 platform), it can be disabled with the `-UMI False` 160 | switch in scTE (the default is `UR`). If the barcode is missing, it can be disabled with `-CB False` (the default is `CR`), 161 | and the cell barcodes will instead be taken from the names of the BAM files. 162 | 163 | ```bash 164 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx -CB False -UMI False 165 | ``` 166 | Multiple BAM files can be provided to scTE with the `-i` option: 167 | ``` 168 | $ scTE -i *.bam -o out -x mm10.exclusive.idx -CB False -UMI False 169 | ``` 170 | or 171 | ``` 172 | $ scTE -i input1.bam,input2.bam,... -o out -x mm10.exclusive.idx -CB False -UMI False 173 | ``` 174 | 175 | **Analysis of scATAC-seq data**
176 | The genome indices are built using: 177 | ``` 178 | $ wget -c http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz -O mm10.te.txt.gz 179 | $ zcat mm10.te.txt.gz | grep -E 'LINE|SINE|LTR|Retroposon' | cut -f6-8,11 >mm10.te.bed 180 | $ scTEATAC_build -g mm10.te.bed -o mm10.te.atac 181 | ``` 182 | Then the BAM file can be processed using scTEATAC with the command: 183 | ``` 184 | scTEATAC -i input.bam -o out -x mm10.te.atac.idx 185 | ``` 186 | 187 | **Citation**
188 | If scTE is useful for your research, consider citing [Nature Communications (2021)](https://www.nature.com/articles/s41467-021-21808-x) 189 | 190 | 191 | -------------------------------------------------------------------------------- /bin/scTE: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import pandas as pd 3 | import multiprocessing 4 | from functools import partial 5 | import logging 6 | import os, sys, glob, datetime, time, gzip 7 | import argparse 8 | import collections 9 | from math import log 10 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../')) 11 | from scTE.miniglbase import genelist, glload, location 12 | from scTE.annotation import annoGtf 13 | from scTE.base import * 14 | 15 | def prepare_parser(): 16 | desc = "hahaha..." 17 | 18 | exmp = "Example: scTE <-i scRNA.sorted.bam> <-o out> [--min_genes 200] [--min_counts 400] [-p 4] <-x mm10.exclusive.idx>" 19 | 20 | parser = argparse.ArgumentParser(prog='scTE',description=desc, epilog=exmp) 21 | 22 | optional = parser._action_groups.pop() 23 | 24 | optional.add_argument('--min_genes', dest='genenumber',metavar='INT', type=int,default=200, 25 | help='Minimum number of genes expressed required for a cell to pass filtering. Default: 200') 26 | 27 | optional.add_argument('--min_counts', dest='countnumber',metavar='INT', type=int, 28 | help='Minimum number of counts required for a cell to pass filtering. Default: 2*min_genes') 29 | 30 | optional.add_argument('--expect-cells', dest='cellnumber',metavar='INT', type=int, default=10000, 31 | help='Expected number of cells. Default: 10000') 32 | 33 | optional.add_argument('-f','--format', metavar='input file format', dest='format', type=str, nargs='?', default='BAM', choices=['BAM','SAM'], 34 | help='Input file format: BAM or SAM. 
DEFAULT: BAM') 35 | 36 | optional.add_argument('-CB', dest='CB', type=str, nargs='?', default='CR', choices=['CR','CB','False'], 37 | help='Set to false to ignore for cell barcodes, it is useful for SMART-seq. If you set CB=False, it also will set UMI=False by default, Default: CR') 38 | 39 | optional.add_argument('-UMI', dest='UMI', type=str, nargs='?', default='UR', choices=['UR','UB','False'], 40 | help='Set to false to ignore for UMI, it is useful for SMART-seq. Default: True') 41 | 42 | optional.add_argument('--keeptmp', dest='keeptmp', type=str, nargs='?', default='False', choices=['True','False'], 43 | help='Keep the _scTEtmp file, which is useful for debugging. Default: False') 44 | 45 | optional.add_argument('--hdf5', dest='hdf5', type=str, nargs='?', default='False', choices=['True','False'], 46 | help='Save the output as .h5ad formatted file instead of csv file. Default: False') 47 | 48 | optional.add_argument('-p','--thread', metavar='INT', dest='thread', type=int, default=1, 49 | help='Number of threads to use, Default: 1') 50 | 51 | optional.add_argument('-v','--version', action='version', version='%(prog)s 1.0') 52 | 53 | required = parser.add_argument_group('required arguments') 54 | 55 | required.add_argument('-i','--input', dest='input', type=str, nargs='+', required=True, 56 | help='Input file: BAM/SAM file from CellRanger or STARsolo, the file must be sorted by chromosome position') 57 | 58 | required.add_argument('-x', dest='annoglb',nargs='+', required=True, 59 | help='The filename of the index for the reference genome annotation.') 60 | 61 | # required.add_argument('-g','--genome', metavar='genome', dest='genome', type=str, nargs='?', default='mm10', choices=['hg38','mm10',], required=True, 62 | # help='"hg38" for human, "mm10" for mouse') 63 | 64 | required.add_argument('-o','--out', dest='out', nargs='?', required=True, help='Output file prefix') 65 | 66 | parser._action_groups.append(optional) 67 | optional = 
parser.add_argument_group('optional arguments') 68 | optional 69 | 70 | return parser 71 | 72 | def main(): 73 | """Start scTEs......parse options......""" 74 | 75 | timestart=datetime.datetime.now() 76 | args=read_opts(prepare_parser()) 77 | 78 | # Fix up the UMI/CB booleans: 79 | # if args.UMI == 'True': args.UMI = True 80 | # else: args.UMI = False 81 | # if args.CB == 'True': args.CB = True 82 | # else: args.CB = False 83 | if args.hdf5 == 'True': args.hdf5 = True 84 | else: args.hdf5 = False 85 | 86 | info = args.info 87 | error = args.error 88 | 89 | assert sys.version_info >= (3, 6), 'Python >=3.6 is required' 90 | 91 | info(args.argtxt + "\n") 92 | 93 | outname = args.out.split('/')[-1:][0] 94 | 95 | info("Loading the genome annotation index... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 96 | allelement, chr_list, all_annot, glannot = Readanno(filename=outname, annoglb=args.annoglb[0]) #genome=args.genome 97 | print(sorted(chr_list)) 98 | info("Finished loading the genome annotation index... 
%s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 99 | 100 | info("Processing BAM/SAM files ...%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 101 | 102 | if len(args.input) == 1 and ',' in args.input[0]: 103 | args.input=args.input[0].split(',') 104 | 105 | if not os.path.exists('%s_scTEtmp/o1'%outname): 106 | os.system('mkdir -p %s_scTEtmp/o1'%outname) 107 | 108 | for k in args.input: 109 | checkCBUMI(filename=k,out=outname,CB=args.CB,UMI=args.UMI) 110 | info("Input SAM/BAM file appears to be valid") 111 | 112 | if len(args.input) > 1: 113 | info('Using parabam2bed as more than 1 input BAM') 114 | pool=multiprocessing.Pool(processes=args.thread) 115 | partial_work = partial(Para_bam2bed, CB=args.CB, UMI=args.UMI,out=outname) 116 | pool.map(partial_work, args.input) 117 | os.system('gunzip -c -f %s_scTEtmp/o0/*.bed.gz | gzip > %s_scTEtmp/o1/%s.bed.gz' % (outname,outname,outname)) 118 | 119 | else: 120 | print(args.CB,args.UMI,'good\n') 121 | Bam2bed(args.input[0], args.CB, args.UMI, outname, args.thread) 122 | info("Done BAM/SAM files processing ...%s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 123 | 124 | info("Splitting ...%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 125 | if args.thread == 1: #Single thread path, mainly 126 | # This is useful for testing optimsations, as the multiprocessing path the profile 127 | # Just gets locked up in {method 'acquire' of '_thread.lock' objects} 128 | info('Executing single thread path') 129 | whitelist = splitAllChrs(chr_list, filename=outname, genenumber=args.genenumber, countnumber=args.countnumber, UMI=args.UMI) 130 | else: 131 | info('Executing multiple thread path with %s threads' % args.thread) 132 | pool=multiprocessing.Pool(processes=args.thread) 133 | partial_work = partial(splitChr, filename=outname, CB=args.CB, UMI=args.UMI) 134 | pool.map(partial_work, chr_list) 135 | whitelist = filterCRs(filename=outname, genenumber=args.genenumber, 
countnumber=args.countnumber) 136 | 137 | info("Finished processing sample files %s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 138 | 139 | info("Fetching from the annotation index... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 140 | if args.thread == 1: #Single thread path 141 | for chrom in chr_list: 142 | align(chr=chrom, filename=outname, all_annot=None, glannot=glannot, whitelist=whitelist) #CB=args.CB 143 | 144 | else: # Multiprocessing path: 145 | pool = multiprocessing.Pool(processes=args.thread) 146 | partial_work = partial(align, filename=outname, all_annot=all_annot, glannot=None, whitelist=whitelist) # send a copy of the index, CB=args.CB 147 | pool.map(partial_work, chr_list) 148 | 149 | if not os.path.exists('%s_scTEtmp/o4'%outname): 150 | os.system('mkdir -p %s_scTEtmp/o4'%outname) 151 | os.system('gunzip -c -f %s_scTEtmp/o3/%s.*.bed.gz | gzip > %s_scTEtmp/o4/%s.bed.gz' % (outname,outname,outname,outname)) 152 | info("Done fetching... %s \n"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 153 | 154 | info("Calculating expression... 
%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 155 | len_res, genenumber, filename = Countexpression(filename=args.out, allelement=allelement, genenumber=args.genenumber, cellnumber=args.cellnumber, hdf5=args.hdf5) 156 | if args.hdf5 == True: 157 | info('Detect {0} cells expressed at least {1} genes, results output to {2}.h5ad'.format(len_res, genenumber, filename)) 158 | else: 159 | info('Detect {0} cells expressed at least {1} genes, results output to {2}.csv'.format(len_res, genenumber, filename)) 160 | 161 | info("Finished calculating expression %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 162 | 163 | if args.keeptmp == 'True': 164 | pass 165 | else: 166 | os.system('rm -rf %s_scTEtmp'%outname) 167 | 168 | timeend = datetime.datetime.now() 169 | info("Done with %s\n" % timediff(timestart,timeend)) 170 | 171 | if __name__ == '__main__': 172 | try: 173 | main() 174 | except KeyboardInterrupt: 175 | sys.stderr.write("User interrupt !\n") 176 | sys.exit(0) 177 | 178 | 179 | -------------------------------------------------------------------------------- /bin/scTEATAC: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | desc = ''' 3 | 4 | The scATAC-seq data comes as three files, P1, P2 and the barcode, and there is no UMI 5 | 6 | You can just align P1 and P2 with your favourite aligner (we prefer STAR with these settings): 7 | 8 | **** 9 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic --outWigType wiggle --outWigNorm RPM' 10 | opts='--runRNGseed 42 --runThreadN 12 --readFilesCommand zcat ' 11 | 12 | genome_mm10='--genomeDir mm10_gencode_vM21_starsolo/SAindex' 13 | genome_hg38='--genomeDir hg38_gencode_v30_starsolo/SAindex' 14 | 15 | # p1 = read 16 | # p2 = barcode and UMI 17 | # Make sure you set the correct genome index; 18 | STAR $opts $teopts $genome_hg38 --outFileNamePrefix 
def _timestamp():
    """Return the current local time formatted for the progress log lines."""
    import datetime
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


# Command-line options;
def prepare_parser():
    """Build the argparse parser for the scTEATAC command line.

    Returns:
        argparse.ArgumentParser: the configured parser. A logger named
        'scTE_scatacseq' is attached to it as ``parser.log``.

    Notes:
        The boolean-like switches (-CB, -UMI, --ignoreDuplicates, --keeptmp,
        --hdf5) are parsed as the strings 'True'/'False'; main() converts them.
        Calling this function also configures the root logger (DEBUG level).
    """
    # The example must only use options this parser accepts (there is no --genome).
    exmp = 'scTEATAC -i input.bam -o out -x mm10.te.idx'

    description = 'Package the BAM and BARCODE for the scATAC-seq data to make it suitable for scTE main pipeline'

    parser = argparse.ArgumentParser(prog='scTE_scatacseq', description=description, epilog=exmp)

    # Pop argparse's default group so that the required group is listed first in --help;
    # it is appended again once the required arguments have been declared.
    optional = parser._action_groups.pop()

    optional.add_argument('--ondisk', action='store_true', required=False,
        help='Do everything in memory (faster, but you will need a lot!, or do it on disk (slower, but no memory requirement')

    optional.add_argument('--min_counts', dest='countnumber', metavar='INT', type=int, default=1000,
        help='Minimum number of counts required for a cell to pass filtering. Default: 1000')

    optional.add_argument('-CB', dest='CB', type=str, nargs='?', default='False', choices=['True','False'],
        help='Set to false to ignore for cell barcodes, Default: False')

    optional.add_argument('-UMI', dest='UMI', type=str, nargs='?', default='False', choices=['True','False'],
        help='Set to false to ignore for UMI. Default: False')

    optional.add_argument('--ignoreDuplicates', dest='noDup', type=str, nargs='?', default='True', choices=['True','False'],
        help='If set, reads that have the same orientation and start position will be considered only once. If reads are paired, the mate’s position also has to coincide to ignore a read. Default: True')

    optional.add_argument('--keeptmp', dest='keeptmp', type=str, nargs='?', default='False', choices=['True','False'],
        help='Keep the _scTEtmp file, which is useful for debugging. Default: False')

    optional.add_argument('-p','--thread', metavar='INT', dest='thread', type=int, default=1,
        help='Number of threads to use, Default: 1')

    optional.add_argument('--hdf5', dest='hdf5', type=str, nargs='?', default='False', choices=['True','False'],
        help='Save the output as .h5ad formatted file instead of csv file. Default: False')

    required = parser.add_argument_group('required arguments')

    required.add_argument('-i','--input', dest='input', type=str, nargs='+', required=True,
        help='Input file: BAM/SAM file')

    required.add_argument('-o','--out', dest='out', nargs='?', required=True, help='Output file prefix')

    required.add_argument('-x', dest='annoglb', nargs='+', required=True,
        help='The filename of the indexed genome')

    parser._action_groups.append(optional)

    logging.basicConfig(level=logging.DEBUG,
                        format='%(levelname)-8s: %(message)s',
                        datefmt='%m-%d %H:%M')

    parser.log = logging.getLogger('scTE_scatacseq')

    return parser


def main():
    """Run the scTEATAC pipeline: BAM -> bed -> per-chromosome split -> annotation -> counts.

    Intermediate files are written below '<prefix>_scTEtmp' in the current
    directory, where <prefix> is the last path component of -o/--out. The
    directory is removed at the end unless --keeptmp True was given.
    """
    # Standard-library names are imported locally so this function does not
    # depend on what `from scTE.base import *` happens to re-export.
    import datetime
    import multiprocessing
    import shlex
    import shutil
    from functools import partial

    if sys.version_info < (3, 6):
        sys.exit('Python >=3.6 is required')

    timestart = datetime.datetime.now()

    parser = prepare_parser()
    args = parser.parse_args()
    if not args.out:
        # '-o' is declared with nargs='?', so a bare '-o' parses to None.
        parser.error('-o/--out requires an output file prefix')

    info = logging.info
    logger = parser.log

    # The boolean-like switches arrive as the strings 'True' / 'False'.
    args.CB = args.CB == 'True'
    args.hdf5 = args.hdf5 == 'True'
    args.noDup = args.noDup == 'True'
    args.UMI = args.UMI == 'True'

    args.genenumber = 0
    args.cellnumber = 1e4

    logger.info('Arguments:')
    logger.info('out: %s' % args.out)
    logger.info('index: %s \n' % args.annoglb[0])
    logger.info("Minimum number of counts required = %s"% args.countnumber)
    logger.info("Number of threads = %s " % args.thread)

    outname = args.out.split('/')[-1:][0]
    tmpdir = '%s_scTEtmp' % outname
    # Quoted forms for the two shell pipelines below; the glob parts stay unquoted.
    tmpdir_q = shlex.quote(tmpdir)
    outname_q = shlex.quote(outname)

    info("Loading the genome annotation index... %s" % _timestamp())
    allelement, chr_list, all_annot, glannot = Readanno(filename=outname, annoglb=args.annoglb[0])
    chr_list = [k for k in chr_list if k not in ['chrM']]
    info("Finished loading the genome annotation index... %s \n" % _timestamp())

    info("Processing BAM/SAM files ...%s" % _timestamp())

    # A single comma-separated argument is treated as a list of inputs.
    if len(args.input) == 1 and ',' in args.input[0]:
        args.input = args.input[0].split(',')

    os.makedirs('%s/o1' % tmpdir, exist_ok=True)

    if len(args.input) > 1:
        info('Using para_atacBam2bed as more than 1 input BAM')
        partial_work = partial(para_atacBam2bed, CB=args.CB, out=outname, noDup=args.noDup)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, args.input)

        # Concatenate the per-input bed files into the single file the next step reads.
        os.system('gunzip -c -f %s/o0/*.bed.gz | gzip > %s/o1/%s.bed.gz' % (tmpdir_q, tmpdir_q, outname_q))
    else:
        atacBam2bed(args.input[0], outname, CB=args.CB, UMI=args.UMI, noDup=args.noDup, num_threads=args.thread)
    info("Done BAM/SAM files processing ...%s \n" % _timestamp())

    info("Splitting ...%s" % _timestamp())
    if args.thread == 1: #Single thread path, mainly
        # This is useful for testing optimisations, as in the multiprocessing path the profile
        # just gets locked up in {method 'acquire' of '_thread.lock' objects}
        info('Executing single thread path')
        whitelist = splitAllChrs(chr_list, filename=outname, genenumber=args.genenumber, countnumber=args.countnumber, UMI=args.UMI)
    else:
        info('Executing multiple thread path with %s threads' % args.thread)
        partial_work = partial(splitChr, filename=outname, CB=args.CB, UMI=args.UMI)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)
        whitelist = filterCRs(filename=outname, genenumber=args.genenumber, countnumber=args.countnumber)

    info("Finished processing sample files %s \n" % _timestamp())

    info("Fetching from the annotation index... %s" % _timestamp())
    if args.thread == 1: #Single thread path
        for chrom in chr_list:
            align(chr=chrom, filename=outname, all_annot=None, glannot=glannot, whitelist=whitelist)
    else: # Multiprocessing path: send a copy of the index to every worker
        partial_work = partial(align, filename=outname, all_annot=all_annot, glannot=None, whitelist=whitelist)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)

    os.makedirs('%s/o4' % tmpdir, exist_ok=True)
    os.system('gunzip -c -f %s/o3/%s.*.bed.gz | gzip > %s/o4/%s.bed.gz' % (tmpdir_q, outname_q, tmpdir_q, outname_q))
    info("Done fetching... %s \n" % _timestamp())

    info("Calculating expression... %s" % _timestamp())
    len_res, genenumber, filename = Countexpression(filename=args.out, allelement=allelement, genenumber=args.genenumber, cellnumber=args.cellnumber, hdf5=args.hdf5)
    # Report the extension that matches the requested output format.
    if args.hdf5:
        info('Detect {0} cells expressed at least {1} genes, results output to {2}.h5ad'.format(len_res, genenumber, filename))
    else:
        info('Detect {0} cells expressed at least {1} genes, results output to {2}.csv'.format(len_res, genenumber, filename))
    info("Finished calculating expression %s" % _timestamp())

    if args.keeptmp != 'True':
        shutil.rmtree(tmpdir, ignore_errors=True)

    timeend = datetime.datetime.now()
    info("Done with %s\n" % timediff(timestart, timeend))

    # --ondisk is still accepted for backward compatibility. Nothing in this
    # function creates an on-disk database, so there is nothing to remove here
    # (the previous cleanup referenced an undefined name and raised NameError).


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
| 31 | def genomeIndex(genome,outname): 32 | 33 | 34 | form={'force_tsv':True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3} 35 | if genome.endswith('.gz'): 36 | genome = genelist(genome, format=form, gzip=True) 37 | else: 38 | genome = genelist(genome, format=form) 39 | 40 | genome.save('%s.idx'%outname) 41 | 42 | def prepare_parser(): 43 | 44 | desc = "Build genome annotation index for scTE" 45 | 46 | exmp = "Example: scTEATAC_build -g Data/TE.bed -o mm10.te" 47 | 48 | parser = argparse.ArgumentParser(prog='scTE_build',description=desc, epilog=exmp) 49 | 50 | optional = parser._action_groups.pop() 51 | 52 | optional.add_argument('-g','--genome', metavar='genome', dest='genome',type=str, nargs='?', required=True, 53 | help='Bed file of the genome window') 54 | 55 | optional.add_argument('-o','--out', dest='out', nargs='?', help='Output file prefix, Default: the genome name') 56 | 57 | required = parser.add_argument_group('required arguments') 58 | 59 | parser._action_groups.append(optional) 60 | optional = parser.add_argument_group('optional arguments') 61 | optional 62 | 63 | return parser 64 | 65 | def main(): 66 | 67 | timestart=datetime.datetime.now() 68 | args=read_opts(prepare_parser()) 69 | 70 | assert sys.version_info >= (3, 6), 'Python >=3.6 is required' 71 | 72 | info = args.info 73 | 74 | info("Building the scTE genome annotation index... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 75 | 76 | genomefile=args.genome 77 | genomeIndex(args.genome,args.out) 78 | 79 | info("Done genome annotation index building... 
%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 80 | 81 | if __name__ == '__main__': 82 | try: 83 | main() 84 | except KeyboardInterrupt: 85 | sys.stderr.write("User interrupt !\n") 86 | sys.exit(0) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /bin/scTE_build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import multiprocessing 4 | from functools import partial 5 | import logging 6 | import os, sys, glob, datetime, time, gzip 7 | import argparse 8 | import collections 9 | from math import log 10 | import numpy as np 11 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../')) 12 | from scTE.miniglbase import genelist, glload, location 13 | 14 | chr_list = [ str(k) for k in list(range(1,50))] + ['X','Y', 'M'] 15 | 16 | def read_opts(parser): 17 | args = parser.parse_args() 18 | 19 | if args.mode not in ['inclusive', 'exclusive', 'nointron'] : 20 | logging.error("Counting mode %s not supported\n" % (args.mode)) 21 | parser.print_help() 22 | sys.exit(1) 23 | 24 | if args.genome not in ['mm10','hg38','panTro6','macFas5','dm6','danRer11','xenTro9','other'] : 25 | logging.error("Counting genome %s not supported\n" % (args.genome)) 26 | parser.print_help() 27 | sys.exit(1) 28 | 29 | args.info = logging.info 30 | return args 31 | 32 | def cleanexon(exons): 33 | tmp = [] 34 | for k in sorted(exons): 35 | E=[] 36 | for it in exons[k]: 37 | E+=list(range(it[1],it[2])) 38 | E=sorted(set(E)) 39 | 40 | s=0 41 | #tmp=[] 42 | for id in range(0,len(E)-1): 43 | if E[id+1]-E[id] >1: 44 | en=id 45 | tmp.append({'loc': location(chr=it[0], left=E[s], right=E[en]), 'annot': k}) 46 | s=en+1 47 | tmp.append({'loc': location(chr=it[0], left=E[s], right=E[id+1]), 'annot': k}) 48 | 49 | return tmp 50 | 51 | def readGtf(filename): 52 | raw = {} 53 | clean = {} 54 | if '.gz' in filename: 55 | o = gzip.open(filename,'rb') 56 | else: 57 | o = 
open(filename,'r') 58 | 59 | for idx, l in enumerate(o): 60 | if '.gz' in filename: 61 | l=l.decode('ascii') 62 | if l.startswith('#'): 63 | continue 64 | t=l.strip().split('\t') 65 | if t[2]=='exon' or t[2]=='UTR': 66 | if 'chr' not in t[0]: 67 | chr = 'chr' + t[0] 68 | chr = t[0] 69 | if chr.replace('chr','') not in chr_list: 70 | continue 71 | left = int(t[3]) 72 | riht = int(t[4]) 73 | 74 | if 'gene_name' not in t[8]: 75 | continue 76 | 77 | name=t[8].split('gene_name "')[1].split('";')[0] 78 | 79 | if name not in raw: 80 | raw[name] = [] 81 | raw[name].append([chr,left,riht]) 82 | 83 | if 'protein_coding' not in l and 'lincRNA' not in l: 84 | continue 85 | if name not in clean: 86 | clean[name] = [] 87 | clean[name].append([chr,left,riht]) 88 | o.close() 89 | 90 | return raw, clean 91 | 92 | 93 | def genomeIndex(genome, mode, tefile, genefile, outname, geneurls, teurls): 94 | 95 | if not genefile: #Download twice for double check, as sometines wget may stops on the way 96 | os.system('wget -c -t 0 -T 5 %s'%geneurls) 97 | os.system('wget -c -t 0 -T 5 %s'%geneurls) 98 | genefilename = geneurls.split('/')[-1:][0] 99 | else: 100 | genefilename = genefile 101 | 102 | a = readGtf(genefilename) 103 | 104 | raw = cleanexon(a[0]) 105 | clean = cleanexon(a[1]) 106 | 107 | 108 | # for costume chromsome 109 | if tefile: 110 | o = open(tefile,'rU') 111 | for line in o: 112 | chr = line.strip().split('\t')[0] 113 | if chr not in chr_list: 114 | chr_list.append(chr) 115 | o.close() 116 | #====================== 117 | 118 | if not tefile: 119 | os.system('wget -c -t 0 -T 5 %s'%teurls) 120 | os.system('wget -c -t 0 -T 5 %s'%teurls) 121 | tefilename = teurls.split('/')[-1:][0] 122 | teform ={'force_tsv': True, 'loc': 'location(chr=column[5], left=column[6], right=column[7])', 'annot': 10} 123 | else: 124 | tefilename = tefile 125 | 126 | gls = genelist() 127 | gls.load_list(clean) 128 | 129 | if mode == 'exclusive': 130 | gene = {} 131 | for l in clean: 132 | chr = 
l['loc'].loc['chr'] 133 | if chr not in chr_list: 134 | continue 135 | left = l['loc']['left'] 136 | rite = l['loc']['right'] 137 | 138 | left_buck = ((left-1)//10000) * 10000 139 | right_buck = (rite//10000) * 10000 140 | buckets_reqd = range(left_buck, right_buck+10000, 10000) 141 | 142 | if chr not in gene: 143 | gene[chr] = {} 144 | 145 | if buckets_reqd: 146 | for buck in buckets_reqd: 147 | if buck not in gene[chr]: 148 | gene[chr][buck] = [] 149 | gene[chr][buck].append([left, rite]) 150 | 151 | # Process the TEs: 152 | noverlap = [] 153 | if '.gz' in tefilename: 154 | o = gzip.open(tefilename,'rb') 155 | else: 156 | o = open(tefilename,'rU') 157 | 158 | for n, l in enumerate(o): 159 | if '.gz' in tefilename: 160 | l = l.decode('ascii') 161 | t = l.strip().split('\t') 162 | 163 | if not tefile: 164 | chr = t[5].replace('chr', '') 165 | left = int(t[6]) 166 | rite = int(t[7]) 167 | name = t[10] 168 | clas=t[11] 169 | if clas not in ['DNA','LINE','LTR','SINE','Satellite','Retroposon']: 170 | continue 171 | else: 172 | chr = t[0].replace('chr', '') 173 | left = int(t[1]) 174 | rite = int(t[2]) 175 | name = t[3] 176 | 177 | if chr not in chr_list: 178 | continue 179 | if chr not in gene: # Should be very rare 180 | noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name}) 181 | continue 182 | 183 | left_buck = ((left-1)//10000) * 10000 184 | right_buck = (rite//10000) * 10000 185 | buckets_reqd = range(left_buck, right_buck+10000, 10000) 186 | 187 | if buckets_reqd: 188 | i = 1 189 | for buck in buckets_reqd: 190 | if buck not in gene[chr]: 191 | pass 192 | else: 193 | for k in gene[chr][buck]: 194 | if left < k[1] and rite > k[0]: 195 | i = 0 196 | break 197 | if i == 0: # already found an overlap, so quit out; 198 | break 199 | if i == 1: 200 | noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name}) 201 | 202 | TEs = genelist() 203 | TEs.load_list(noverlap) 204 | 205 | genes = genelist() 206 | 
genes.load_list(raw) 207 | 208 | all_annot = genes + TEs 209 | 210 | if not outname: 211 | all_annot.save('%s.exclusive.idx'%genome) 212 | print('Done the index building, results output to %s.exclusive.idx \n'% genome) 213 | else: 214 | all_annot.save('%s.exclusive.idx'%outname) 215 | print('Done the index building, results output to %s.exclusive.idx \n'% outname) 216 | 217 | elif mode == 'inclusive': 218 | genes = genelist() 219 | genes.load_list(raw) 220 | 221 | 222 | if not tefile: 223 | teform ={'force_tsv': True, 'loc': 'location(chr=column[5], left=column[6], right=column[7])', 'annot': 10, 'clas':11} 224 | if tefilename.endswith('.gz'): 225 | TEs = genelist(tefilename, format=teform, gzip=True) 226 | else: 227 | TEs = genelist(tefilename, format=teform) 228 | 229 | keep=[] 230 | for id,item in enumerate(TEs): 231 | if item['clas'] not in ['DNA','LINE','LTR','SINE','Satellite','Retroposon']: 232 | continue 233 | if item['loc']['chr'] not in chr_list: 234 | continue 235 | tmp=item.copy() 236 | del tmp['clas'] 237 | keep.append(tmp) 238 | gls=genelist() 239 | gls.load_list(keep) 240 | 241 | else: 242 | TEs = genelist(tefilename, format={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot':3}) 243 | gls = TEs.deepcopy() 244 | 245 | 246 | all_annot = genes + gls 247 | 248 | if not outname: 249 | all_annot.save('%s.inclusive.idx'%genome) 250 | print('Done the index building, results output to %s.inclusive.idx \n'% genome) 251 | else: 252 | all_annot.save('%s.inclusive.idx'%outname) 253 | print('Done the index building, results output to %s.inclusive.idx \n'% outname) 254 | 255 | elif mode == 'nointron': 256 | raw_gene = a[0] 257 | clean_gene ={} 258 | for k in raw_gene: 259 | if len(raw_gene[k]) == 1: # the gene only have one exon 260 | clean_gene[k] = [raw_gene[k][0]] 261 | else: 262 | tmp = [] 263 | for it in raw_gene[k]: 264 | tmp += it 265 | chr = [ item for item in tmp if 'chr' in str(item) ][0] 266 | tmp = [ int(item) 
for item in tmp if 'chr' not in str(item) ] 267 | clean_gene[k] = [[ chr, np.min(tmp), np.max(tmp)]] 268 | clean = cleanexon(clean_gene) 269 | 270 | # adapted from 'exclusive' mode to remove the overlap reads 271 | gene = {} 272 | for l in clean: 273 | chr = l['loc'].loc['chr'] 274 | if chr not in chr_list: 275 | continue 276 | left = l['loc']['left'] 277 | rite = l['loc']['right'] 278 | 279 | left_buck = ((left-1)//10000) * 10000 280 | right_buck = (rite//10000) * 10000 281 | buckets_reqd = range(left_buck, right_buck+10000, 10000) 282 | 283 | if chr not in gene: 284 | gene[chr] = {} 285 | 286 | if buckets_reqd: 287 | for buck in buckets_reqd: 288 | if buck not in gene[chr]: 289 | gene[chr][buck] = [] 290 | gene[chr][buck].append([left, rite]) 291 | 292 | # Process the TEs: 293 | noverlap = [] 294 | if '.gz' in tefilename: 295 | o = gzip.open(tefilename,'rb') 296 | else: 297 | o = open(tefilename,'rU') 298 | 299 | for n, l in enumerate(o): 300 | if '.gz' in tefilename: 301 | l = l.decode('ascii') 302 | t = l.strip().split('\t') 303 | 304 | if not tefile: 305 | chr = t[5].replace('chr', '') 306 | left = int(t[6]) 307 | rite = int(t[7]) 308 | name = t[10] 309 | clas=t[11] 310 | if clas not in ['DNA','LINE','LTR','SINE','Satellite','Retroposon']: 311 | continue 312 | else: 313 | chr = t[0].replace('chr', '') 314 | left = int(t[1]) 315 | rite = int(t[2]) 316 | name = t[3] 317 | 318 | if chr not in chr_list: 319 | continue 320 | if chr not in gene: # Should be very rare 321 | noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name}) 322 | continue 323 | 324 | left_buck = ((left-1)//10000) * 10000 325 | right_buck = (rite//10000) * 10000 326 | buckets_reqd = range(left_buck, right_buck+10000, 10000) 327 | 328 | if buckets_reqd: 329 | i = 1 330 | for buck in buckets_reqd: 331 | if buck not in gene[chr]: 332 | pass 333 | else: 334 | for k in gene[chr][buck]: 335 | if left < k[1] and rite > k[0]: 336 | i = 0 337 | break 338 | if i == 0: # already 
found an overlap, so quit out; 339 | break 340 | if i == 1: 341 | noverlap.append({'loc': location(chr=chr, left=left, right=rite), 'annot': name}) 342 | 343 | TEs = genelist() 344 | TEs.load_list(noverlap) 345 | 346 | genes = genelist() 347 | genes.load_list(raw) 348 | 349 | all_annot = genes + TEs 350 | 351 | if not outname: 352 | all_annot.save('%s.nointron.idx'%genome) 353 | print('Done the index building, results output to %s.nointron.idx \n'% genome) 354 | else: 355 | all_annot.save('%s.nointron.idx'%outname) 356 | print('Done the index building, results output to %s.nointron.idx \n'% outname) 357 | 358 | if not tefile: 359 | os.system('rm %s '% tefilename) 360 | if not genefile: 361 | os.system('rm %s'%genefilename) 362 | 363 | def prepare_parser(): 364 | 365 | desc = "Build genome annotation index for scTE" 366 | 367 | exmp = "Example: scTE_build -te Data/TE.bed -gene Data/Gene.gtf" 368 | 369 | parser = argparse.ArgumentParser(prog='scTE_build',description=desc, epilog=exmp) 370 | 371 | optional = parser._action_groups.pop() 372 | 373 | optional.add_argument('-te', dest='tefile',nargs='+', 374 | help='Six columns bed file for transposable elements annotation. Need the -gene option.') 375 | 376 | optional.add_argument('-gene', dest='genefile',nargs='+', 377 | help='Gtf file for genes annotation. Need the -te option. 
Mutalluy exclusive to -x option') 378 | 379 | optional.add_argument('-m','--mode', dest='mode', type=str, nargs='?', default='exclusive', choices=['inclusive','exclusive','nointron'], 380 | help='How to count TEs expression: inclusive (inclued all reads that can map to TEs), or exclusive (exclued the reads that can map to the exon of protein coding genes and lncRNAs), or nointron (exclude the reads that can map to the exons and intron of genes).\ 381 | DEFAULT: exclusive') 382 | 383 | optional.add_argument('-o','--out', dest='out', nargs='?', help='Output file prefix, Default: the genome name') 384 | 385 | optional.add_argument('-g','--genome', dest='genome',type=str, nargs='?',default='other',choices=['other','mm10','hg38','panTro6','macFas5','dm6','danRer11','xenTro9'], 386 | help='Possible Genomes: mm10 (mouse), hg38 (human), panTro6 (Chimpanzee), macFas5 (Macaca fascicularis), dm6 (Drosophila melanogaster), danRer11 (Zebrafish), xenTro9 (Xenopus tropicalis)', ) 387 | 388 | # required = parser.add_argument_group('required arguments') 389 | # 390 | # required.add_argument('-g','--genome', dest='genome',type=str, nargs='?', choices=['hg38','mm10','macFas5','dm6','other'],required=True, 391 | # help='Possible Genomes: mm10 (mouse), hg38 (human)') 392 | 393 | parser._action_groups.append(optional) 394 | optional = parser.add_argument_group('optional arguments') 395 | optional 396 | 397 | return parser 398 | 399 | def main(): 400 | timestart=datetime.datetime.now() 401 | args=read_opts(prepare_parser()) 402 | 403 | print(args) 404 | # if not args.genome: 405 | # print('good') 406 | # 407 | # print(args.genome) 408 | 409 | assert sys.version_info >= (3, 6), 'Python >=3.6 is required' 410 | 411 | info = args.info 412 | 413 | info("Building the scTE genome annotation index... 
%s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 414 | 415 | if args.tefile: 416 | tefile = args.tefile[0] 417 | else: 418 | tefile = None 419 | 420 | if args.genefile: 421 | genefile = args.genefile[0] 422 | else: 423 | genefile = None 424 | 425 | if args.genome == 'mm10': 426 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 427 | 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz', 428 | 'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz') 429 | 430 | elif args.genome == 'hg38': 431 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 432 | 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz', 433 | 'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz') 434 | 435 | elif args.genome == 'panTro6': 436 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 437 | 'http://ftp.ensembl.org/pub/release-103/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.103.gtf.gz', 438 | 'https://hgdownload.soe.ucsc.edu/goldenPath/panTro6/database/rmsk.txt.gz') 439 | 440 | elif args.genome == 'macFas5': 441 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 442 | 'http://ftp.ensembl.org/pub/release-102/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_5.0.102.gtf.gz', 443 | 'http://hgdownload.soe.ucsc.edu/goldenPath/macFas5/database/rmsk.txt.gz') 444 | 445 | elif args.genome == 'dm6': 446 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 447 | 'http://ftp.ensembl.org/pub/release-103/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.103.gtf.gz', 448 | 'http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz') 449 | 450 | elif args.genome == 'danRer11': 451 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 452 | 'http://ftp.ensembl.org/pub/release-103/gtf/danio_rerio/Danio_rerio.GRCz11.103.gtf.gz', 453 | 
'https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/database/rmsk.txt.gz') 454 | 455 | elif args.genome == 'xenTro9': 456 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out, 457 | 'http://ftp.ensembl.org/pub/release-103/gtf/xenopus_tropicalis/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.103.gtf.gz', 458 | 'https://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/database/rmsk.txt.gz') 459 | 460 | elif args.genome == 'other': 461 | genomeIndex(args.genome,args.mode,tefile,genefile, args.out,'No path','No path') 462 | 463 | 464 | info("Done genome annotation index building... %s"%(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) 465 | 466 | if __name__ == '__main__': 467 | try: 468 | main() 469 | except KeyboardInterrupt: 470 | sys.stderr.write("User interrupt !\n") 471 | sys.exit(0) 472 | 473 | 474 | 475 | -------------------------------------------------------------------------------- /docs/scTE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/docs/scTE.png -------------------------------------------------------------------------------- /example/Figure3/0.cluster_scripts/scte/do_batch.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | for f in ../starsolo*/*.bam 4 | do 5 | root=`basename $f` 6 | path=`dirname $f` 7 | 8 | bf=`echo $root | sed -r 's#.Aligned.sortedByCoord.out.bam##g' | sed 's#.bam##g'` 9 | tt=`echo $bf.csv.gz ` # outfile 10 | if [ ! 
-f $tt ] # Check not already done 11 | then 12 | echo scTE $tt 13 | qsub -N scte.$bf -v in=$f,out=$bf scte.sh 14 | sleep 1 15 | fi 16 | done 17 | 18 | -------------------------------------------------------------------------------- /example/Figure3/0.cluster_scripts/scte/scte.sh: -------------------------------------------------------------------------------- 1 | #PBS -l nodes=1:ppn=2,mem=64gb 2 | #PBS -j oe 3 | #PBS -o ${out}.out 4 | #PBS -q batch 5 | #PBS -V 6 | cd $PBS_O_WORKDIR 7 | 8 | genome_mm10='/data3/lab-andrew/scTE/scte_indeces/mm10.exclusive.idx' 9 | genome_hg38='/data3/lab-andrew/scTE/scte_indeces/hg38.exclusive.idx' 10 | 11 | python3 /share/apps/genomics/unstable/scTE/bin/scTE -i ${in} -x $genome_mm10 -g mm10 -p 1 -o ${out} 12 | 13 | gzip ${out}.csv 14 | -------------------------------------------------------------------------------- /example/Figure3/0.cluster_scripts/starsolo/do_batch.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | for f in ../fqs/*.p1.fq.gz 4 | do 5 | root=`basename $f` 6 | path=`dirname $f` 7 | 8 | bf=`echo $root | sed -r 's#.p1.fq.gz##g'` 9 | p2=`echo $f | sed 's#.p1.fq.gz#.p2.fq.gz#g'` 10 | tt=`echo ss.$bf.Aligned.sortedByCoord.out.bam` # outfile 11 | if [ ! -f $tt ] # Check not already done 12 | then 13 | echo STARsolo $tt 14 | qsub -N solo.$bf -v p1=$f,p2=$p2,out=$bf. 
starsolo.sh 15 | sleep 2 16 | fi 17 | done 18 | 19 | -------------------------------------------------------------------------------- /example/Figure3/0.cluster_scripts/starsolo/starsolo.sh: -------------------------------------------------------------------------------- 1 | #PBS -N ss.${out}.starsolo 2 | #PBS -l nodes=1:ppn=32 3 | #PBS -l mem=32gb 4 | #PBS -j oe 5 | #PBS -o ss.${out}.out 6 | #PBS -q batch 7 | #PBS -V 8 | cd $PBS_O_WORKDIR 9 | 10 | ulimit -n 2000 11 | 12 | whitelist='--soloCBwhitelist /data3/lab-andrew/scTE/scrnaseq_barcodes/version1.txt' # Make sure you get the right bartcode version 13 | 14 | # Required arguments; 15 | mods='--soloType Droplet --soloFeatures Gene --soloBarcodeReadLength 1 --soloCBlen 14 --soloUMIstart 15 ' 16 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic' 17 | opts='--runRNGseed 42 --runThreadN 32 --readFilesCommand zcat ' 18 | 19 | # required for scTE: 20 | sam_att='--outSAMattributes NH HI AS nM CR CY UR UY' 21 | 22 | genome_mm10='--genomeDir /data3/lab-andrew/scTE/custom_indeces/mm10_gencode_vM21_starsolo/SAindex' 23 | genome_hg38='--genomeDir /data3/lab-andrew/scTE/custom_indeces/hg38_gencode_v30_starsolo/SAindex' 24 | 25 | # p1 = read 26 | # p2 = barcode and UMI 27 | # Make sure you set the correct genome index; 28 | STAR $opts $teopts $mods $whitelist $sam_att $genome_mm10 --outFileNamePrefix ss.${out} --readFilesIn ${p1} ${p2} 29 | 30 | rm -r ss.${out}_STARgenome 31 | rm -r ss.${out}_STARpass1 32 | rm -r ss.${out}_STARtmp 33 | -------------------------------------------------------------------------------- /example/Figure3/1.pack.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Pack the scRNA-seq data using scanpy, prep for scran normalisation 4 | 5 | """ 6 | 7 | import logging, matplotlib, os, sys 8 | import scanpy as sc 9 | import numpy as np 10 | import scipy as sp 11 | 
def sparsify(filename):
    """Load one scTE count table and return it as a sparse AnnData object.

    The CSV is read with its first column as the row index and its first
    row as the header; rows are used as observation (cell) names and
    columns as variable (gene/TE) names.

    Parameters
    ----------
    filename : str
        Path to the (optionally gzip-compressed) CSV count table.

    Returns
    -------
    AnnData
        Counts stored as a float32 CSR matrix.
    """
    table = pd.read_csv(filename, index_col=0, header=0)
    genes = table.columns
    cells = table.index

    # The original called ``data.astype('float32')`` and threw the result
    # away (``astype`` returns a new matrix), so the cast never happened.
    # Build the matrix directly with the intended dtype instead.
    data = sp.sparse.csr_matrix(table.to_numpy(), dtype='float32')
    del table  # release the dense frame before AnnData makes its own copies

    print('Loaded {0}'.format(filename))
    ad = AnnData(data, obs={'obs_names': cells}, var={'var_names': genes})
    del data
    return ad
sparsify("../scte_data/ss.gastrulation_E7.0_Sam31.csv.gz") ; sam8.obs['stage'] = "E7.0" ; sam8.obs['replicate'] = "E7.0-5" 52 | sam9 = sparsify("../scte_data/ss.gastrulation_E7.0_Sam32.csv.gz") ; sam9.obs['stage'] = "E7.0" ; sam9.obs['replicate'] = "E7.0-6" 53 | sam10 = sparsify("../scte_data/ss.gastrulation_E7.25_Sam23.csv.gz") ; sam10.obs['stage'] = "E7.25" ; sam10.obs['replicate'] = "E7.25-2" 54 | sam11 = sparsify("../scte_data/ss.gastrulation_E7.25_Sam26.csv.gz") ; sam11.obs['stage'] = "E7.25" ; sam11.obs['replicate'] = "E7.25-3" 55 | sam12 = sparsify("../scte_data/ss.gastrulation_E7.25_Sam27.csv.gz") ; sam12.obs['stage'] = "E7.25" ; sam12.obs['replicate'] = "E7.25-4" 56 | sam13 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam2.csv.gz") ; sam13.obs['stage'] = "E7.5" ; sam13.obs['replicate'] = "E7.5-1" 57 | sam14 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam3.csv.gz") ; sam14.obs['stage'] = "E7.5" ; sam14.obs['replicate'] = "E7.5-2" 58 | sam15 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam4.csv.gz") ; sam15.obs['stage'] = "E7.5" ; sam15.obs['replicate'] = "E7.5-3" 59 | sam16 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam6.csv.gz") ; sam16.obs['stage'] = "E7.5" ; sam16.obs['replicate'] = "E7.5-4" 60 | sam17 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam19.csv.gz") ; sam17.obs['stage'] = "E7.5" ; sam17.obs['replicate'] = "E7.5-5" 61 | sam18 = sparsify("../scte_data/ss.gastrulation_E7.5_Sam20.csv.gz") ; sam18.obs['stage'] = "E7.5" ; sam18.obs['replicate'] = "E7.5-6" 62 | sam19 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam8.csv.gz") ; sam19.obs['stage'] = "E7.75" ; sam19.obs['replicate'] = "E7.75-1" 63 | sam20 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam9.csv.gz") ; sam20.obs['stage'] = "E7.75" ; sam20.obs['replicate'] = "E7.75-2" 64 | sam21 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam12.csv.gz") ; sam21.obs['stage'] = "E7.75" ; sam21.obs['replicate'] = "E7.75-3" 65 | sam22 = sparsify("../scte_data/ss.gastrulation_E7.75_Sam13.csv.gz") 
; sam22.obs['stage'] = "E7.75" ; sam22.obs['replicate'] = "E7.75-4" 66 | sam23 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam16.csv.gz") ; sam23.obs['stage'] = "E8.0" ; sam23.obs['replicate'] = "E8.0-1" 67 | sam24 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam33.csv.gz") ; sam24.obs['stage'] = "E8.0" ; sam24.obs['replicate'] = "E8.0-2" 68 | sam25 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam34.csv.gz") ; sam25.obs['stage'] = "E8.0" ; sam25.obs['replicate'] = "E8.0-3" 69 | sam26 = sparsify("../scte_data/ss.gastrulation_E8.0_Sam35.csv.gz") ; sam26.obs['stage'] = "E8.0" ; sam26.obs['replicate'] = "E8.0-4" 70 | sam27 = sparsify("../scte_data/ss.gastrulation_E8.25_Sam24.csv.gz") ; sam27.obs['stage'] = "E8.25" ; sam27.obs['replicate'] = "E8.25-1" 71 | sam28 = sparsify("../scte_data/ss.gastrulation_E8.25_Sam25.csv.gz") ; sam28.obs['stage'] = "E8.25" ; sam28.obs['replicate'] = "E8.25-2" 72 | sam29 = sparsify("../scte_data/ss.gastrulation_E8.25_Sam28.csv.gz") ; sam29.obs['stage'] = "E8.25" ; sam29.obs['replicate'] = "E8.25-3" 73 | sam30 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam17.csv.gz") ; sam30.obs['stage'] = "E8.5" ; sam30.obs['replicate'] = "E8.5-1" 74 | sam31 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam29.csv.gz") ; sam31.obs['stage'] = "E8.5" ; sam31.obs['replicate'] = "E8.5-2" 75 | sam32 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam36.csv.gz") ; sam32.obs['stage'] = "E8.5" ; sam32.obs['replicate'] = "E8.5-3" 76 | sam33 = sparsify("../scte_data/ss.gastrulation_E8.5_Sam37.csv.gz") ; sam33.obs['stage'] = "E8.5" ; sam33.obs['replicate'] = "E8.5-4" 77 | sam34 = sparsify("../scte_data/ss.gastrulation_mixed_Sam21.csv.gz") ; sam34.obs['stage'] = "mixed" ; sam34.obs['replicate'] = "mixed-1" 78 | sam35 = sparsify("../scte_data/ss.gastrulation_mixed_Sam22.csv.gz") ; sam35.obs['stage'] = "mixed" ; sam35.obs['replicate'] = "mixed-2" 79 | 80 | print('Loaded Samples...') 81 | 82 | # Do very simple prefiltering: 83 | samples = [sam1, sam2, #sam3, sam4, 84 
| sam5, #sam6, 85 | sam7, sam8, sam9, sam10, 86 | sam11, sam12, sam13, sam14, sam15, 87 | sam16, sam17, sam18, sam19, sam20, 88 | sam21, sam22, sam23, sam24, sam25, 89 | sam26, sam27, sam28, sam29, sam30, 90 | sam31, sam32, sam33, sam34, sam35] 91 | 92 | # Quick pre-filtering, these should be low, otherwise it can mess up downstream analysis, but also can get rid of trivial uninteresting things 93 | [sc.pp.filter_cells(sam, min_genes=2000) for sam in samples] 94 | [sc.pp.filter_cells(sam, max_counts=100000) for sam in samples] 95 | [sc.pp.filter_cells(sam, min_counts=5000) for sam in samples] 96 | # Do not filter gene here; concatenate joins on the union, so if a gene fails in a single sample, it will also be deleted from all other samples; 97 | 98 | print('Concatenating') 99 | adata = sam1.concatenate(samples[1:]) 100 | 101 | del samples 102 | 103 | adata.X = adata.X.astype('float32') 104 | 105 | print(adata) 106 | 107 | sc.pl.violin(adata, ['n_genes', 'n_counts'], groupby='replicate', size=0, log=False, cut=0, show=False, save='qc1-pre-norm-replicates.pdf') 108 | 109 | # Base filtering for trivial QC failures: 110 | sc.pp.filter_cells(adata, min_genes=3000) 111 | sc.pp.filter_cells(adata, min_counts=8000) 112 | sc.pp.filter_cells(adata, max_counts=100000) 113 | sc.pp.filter_genes(adata, min_cells=50) # Only filter genes here; 114 | 115 | print('Number of cells after gene filter: {:d}'.format(adata.n_obs)) 116 | 117 | #sc.pl.violin(adata, ['n_genes','n_counts'], groupby='stage', size=0, log=False, cut=0, show=False, save='qc1.pdf') 118 | sc.pl.violin(adata, ['n_genes','n_counts'], groupby='replicate', size=0, log=False, cut=0, show=False, save='qc1-replicates.pdf') 119 | 120 | p = sb.distplot(adata.obs['n_counts'], kde=False) 121 | p.get_figure().savefig('figures/distplot_ncounts1.pdf') 122 | p = sb.distplot(adata.obs['n_counts'][adata.obs['n_counts']<4000], kde=False, bins=60) 123 | p.get_figure().savefig('figures/distplot_ncounts2.pdf') 124 | p = 
sb.distplot(adata.obs['n_counts'][adata.obs['n_counts']>10000], kde=False, bins=60) 125 | p.get_figure().savefig('figures/distplot_ncounts3.pdf') 126 | #Thresholding decision: genes 127 | p = sb.distplot(adata.obs['n_genes'], kde=False, bins=60) 128 | p.get_figure().savefig('figures/distplot_ngenes1.pdf') 129 | p = sb.distplot(adata.obs['n_genes'][adata.obs['n_genes']<2000], kde=False, bins=60) 130 | p.get_figure().savefig('figures/distplot_ngenes2.pdf') 131 | 132 | print('Total number of cells: {:d}'.format(adata.n_obs)) 133 | print('Total number of genes: {:d}'.format(adata.n_vars)) 134 | 135 | adata.write('./raw_data.h5ad') 136 | -------------------------------------------------------------------------------- /example/Figure3/2.norm_and_learn.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import anndata 3 | import scanpy as sc 4 | import numpy as np 5 | import scipy as sp 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from matplotlib import rcParams 9 | from matplotlib import colors 10 | import seaborn as sb 11 | plt.rcParams['figure.figsize']=(8,8) #rescale figures 12 | sc.settings.verbosity = 3 13 | sc.set_figure_params(dpi=200, dpi_save=300) 14 | 15 | adata = sc.read('raw_data.h5ad') 16 | sc.pp.normalize_total(adata) 17 | sc.pp.log1p(adata) 18 | print(adata) 19 | 20 | print('Number of cells: {:d}'.format(adata.n_obs)) 21 | 22 | sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=2000) 23 | sc.pl.highly_variable_genes(adata, show=False, save='highly_variable.pdf') 24 | 25 | # Calculate the visualizations 26 | sc.pp.pca(adata, n_comps=20, use_highly_variable=True, svd_solver='arpack') # PC=20 from Nature paper 27 | sc.pp.neighbors(adata) 28 | sc.tl.tsne(adata, n_jobs=3) 29 | sc.tl.umap(adata, min_dist=0.6) 30 | sc.tl.diffmap(adata) 31 | 32 | sc.pl.pca_variance_ratio(adata, log=True, show=False, save='pca_variance.pdf') 33 | 34 | # Perform 
clustering - using highly variable genes 35 | sc.tl.leiden(adata, resolution=1.0, key_added='leiden_r1') 36 | sc.tl.leiden(adata, resolution=0.5, key_added='leiden_r0.5') 37 | sc.tl.leiden(adata, resolution=0.4, key_added='leiden_r0.4') 38 | sc.tl.leiden(adata, resolution=0.35, key_added='leiden_r0.35') 39 | sc.tl.leiden(adata, resolution=0.3, key_added='leiden_r0.3') 40 | sc.tl.leiden(adata, resolution=0.25, key_added='leiden_r0.25') 41 | sc.tl.leiden(adata, resolution=0.2, key_added='leiden_r0.2') 42 | sc.tl.leiden(adata, resolution=0.1, key_added='leiden_r0.1') 43 | 44 | adata.write('./learned.h5ad') 45 | 46 | todraw = ['leiden_r1', 'leiden_r0.5', 'leiden_r0.4', 'leiden_r0.35', 'leiden_r0.3', 'leiden_r0.25', 'leiden_r0.2', 'leiden_r0.1', 'replicate'] 47 | 48 | #Visualize the clustering and how this is reflected by different technical covariates 49 | sc.pl.tsne(adata, color=todraw, size=10, legend_loc='on data', show=False, save='tsne.pdf') 50 | sc.pl.umap(adata, color=todraw, size=10, legend_loc='on data', show=False, save='umap.pdf') 51 | 52 | -------------------------------------------------------------------------------- /example/Figure3/3.diffexp.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import matplotlib.pyplot as plt 4 | from matplotlib import rcParams 5 | from matplotlib import colors 6 | import pandas as pd 7 | from glbase3 import genelist 8 | plt.rcParams['figure.figsize']=(8,8) 9 | sc.settings.verbosity = 3 10 | sc.set_figure_params(dpi=200, dpi_save=200) 11 | matplotlib.rcParams['pdf.fonttype']=42 12 | matplotlib.rcParams['font.size']=10 13 | 14 | sc.settings.figdir = 'diffexp' 15 | 16 | adata = sc.read('./learned.h5ad') 17 | 18 | sc.tl.rank_genes_groups(adata, 'leiden_r0.5', method='wilcoxon', n_genes=3000) 19 | adata.write('./de.h5ad') 20 | 21 | adata = sc.read('./de.h5ad') 22 | 23 | sc.pl.rank_genes_groups(adata, n_genes=25, sharey=True, 
show=False, save='genes-top25.pdf') 24 | sc.pl.rank_genes_groups(adata, key='rank_genes_groups', show=False, save='genes.pdf') 25 | sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups', show=False, save='genes-top25.pdf') 26 | 27 | #print(pd.DataFrame(adata.uns['rank_genes_groups'])) 28 | 29 | print(pd.DataFrame(adata.uns['rank_genes_groups']['names'])) 30 | 31 | print() 32 | topall = pd.DataFrame(adata.uns['rank_genes_groups']['names']) # get all; 33 | fcs = pd.DataFrame(adata.uns['rank_genes_groups']['logfoldchanges']) 34 | padj = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj']) 35 | 36 | topall.to_csv('top100.csv') 37 | 38 | # Go through and trim the TEs: 39 | 40 | TEs = set(genelist(filename='../../TE_genes_id.mm10.txt', format={'name': 0, 'force_tsv': True})['name']) 41 | 42 | newcols = {} 43 | 44 | groups = list(topall.columns.values) 45 | 46 | for group in groups: 47 | newcols[group] = [] 48 | 49 | t = zip([i[group] for i in adata.uns['rank_genes_groups']['names']], [i[group] for i in adata.uns['rank_genes_groups']['logfoldchanges']], [i[group] for i in adata.uns['rank_genes_groups']['pvals_adj']]) 50 | 51 | print('Group: {0}'.format(group)) 52 | print(t) 53 | 54 | for item in t: 55 | print(item) 56 | if abs(item[1]) < 1: # fold change 57 | continue 58 | if item[2] > 0.01: # just in case 59 | continue 60 | 61 | if item[0] in TEs: 62 | newcols[group].append(item[0]) 63 | 64 | 65 | # join all and draw a dotplot: 66 | joined = [] 67 | for group in newcols: 68 | joined += newcols[group] 69 | 70 | # Need to remove duplicates, but preserver order: 71 | newl = [] 72 | for i in joined: 73 | if i not in newl: 74 | newl.append(i) 75 | joined = newl 76 | 77 | print(joined) 78 | sc.pl.dotplot(adata, joined, groupby='leiden_r0.5', dot_max=0.7, dendrogram=True, standard_scale='var', show=False, save='de-tes.pdf') 79 | sc.pl.matrixplot(adata, joined, groupby='leiden_r0.5', dendrogram=True, standard_scale='var', show=False, save='de-tes.pdf') 80 | 81 | 
for k in joined: 82 | sc.pl.tsne(adata, color=[k,k], size=15, legend_loc='on data', vmax=2, show=False, save='markers-{0}.pdf'.format(k)) 83 | sc.pl.umap(adata, color=[k,k], size=15, legend_loc='on data', vmax=2, show=False, save='markers-{0}.pdf'.format(k)) 84 | -------------------------------------------------------------------------------- /example/Figure3/4.plots-allgenes.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import matplotlib.pyplot as plt 4 | from matplotlib import rcParams 5 | from matplotlib import colors 6 | 7 | from glbase3 import * 8 | 9 | plt.rcParams['figure.figsize']=(8,8) 10 | sc.settings.verbosity = 3 11 | sc.set_figure_params(dpi=200, dpi_save=200) 12 | matplotlib.rcParams['pdf.fonttype']=42 13 | matplotlib.rcParams['font.size']=10 14 | 15 | sc.settings.figdir = 'genes' 16 | 17 | adata = sc.read('./learned.h5ad') 18 | print(adata) 19 | all_genes = adata.var['n_cells'].index # gene names are stored in the index 20 | 21 | TEs = genelist(filename='../../TE_genes_id.mm10.txt', format={'name': 0, 'force_tsv': True})['name'] 22 | 23 | print(TEs) 24 | 25 | for g in all_genes: 26 | if g not in TEs and '(' not in g: 27 | print(g) 28 | sc.pl.umap(adata, color=[g], size=6, legend_loc='on data', color_map='plasma', show=False, save='-{0}.pdf'.format(g), vmin=0, vmax=3) 29 | 30 | 31 | -------------------------------------------------------------------------------- /example/Figure3/4.plots-alltes.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import matplotlib.pyplot as plt 4 | from matplotlib import rcParams 5 | from matplotlib import colors 6 | 7 | from glbase3 import * 8 | 9 | plt.rcParams['figure.figsize']=(8,8) 10 | sc.settings.verbosity = 3 11 | sc.set_figure_params(dpi=200, dpi_save=200) 12 | matplotlib.rcParams['pdf.fonttype']=42 13 | 
matplotlib.rcParams['font.size']=10 14 | 15 | sc.settings.figdir = 'tes' 16 | 17 | adata = sc.read('./learned.h5ad') 18 | print(adata) 19 | all_genes = adata.var['n_cells'].index # gene names are stored in the index 20 | 21 | TEs = genelist(filename='TE_genes_id.mm10.txt.gz', format={'name': 0, 'force_tsv': True}, gzip=True) 22 | 23 | #merker_tes = ['ID2', 'MER5C1', 'MER34B-int', 'MER63D', 'MT2A'] 24 | #sc.pl.stacked_violin(adata, var_names=merker_tes, groupby='leiden_r0.2', rotation=90, show=False, save='tes.pdf') 25 | 26 | for te in TEs: 27 | print(te['name']) 28 | if te['name'] in all_genes: 29 | sc.pl.umap(adata, color=[te['name'], te['name']], size=10, legend_loc='on data', show=False, save='TE-{0}.pdf'.format(te['name']), vmin=0, vmax=3) 30 | 31 | 32 | -------------------------------------------------------------------------------- /example/Figure3/4.plots-specific-tes.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import matplotlib.pyplot as plt 4 | from matplotlib import rcParams 5 | from matplotlib import colors 6 | 7 | from glbase3 import * 8 | 9 | plt.rcParams['figure.figsize']=(8,8) 10 | sc.settings.verbosity = 3 11 | sc.set_figure_params(dpi=200, dpi_save=200) 12 | matplotlib.rcParams['pdf.fonttype']=42 13 | matplotlib.rcParams['font.size']=6 14 | 15 | sc.settings.figdir = 'specific-tes' 16 | 17 | adata = sc.read('./learned.h5ad') 18 | 19 | # high, few: Expressed rarely, but very high in the cells that they are expressed in 20 | marker_genes_dictB = { 21 | #'Epiblast': ['MTEb-int',], 22 | 'Primitive streak': ['RLTR1D2_MM', ], 23 | #'Endothelium': ['ERVB7_2B-LTR_MM',], 24 | 25 | #'Ectoderms': ['MamRep137'], 26 | #'Endoderms': ['MLT1I'], 27 | 'Mesoendoderm': ['RLTR48A', 'IAPEY4_LTR', 'ORR1F-int'], 28 | 'Extraembryonic': ['LTR16A', ], 29 | 'Exe. endoderm': ['MER5C', 'RLTR6B_Mm',], 30 | #'Exe. 
ectoderm': ['ERVB4_2-LTR_MM', ], 31 | 'Cardiomyocyte': ['L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int', 'RLTR16'], 32 | } 33 | sc.pl.dotplot(adata, marker_genes_dictB, groupby='leiden_r0.5', dot_max=0.3, dendrogram=True, standard_scale='var', vmax=1, show=False, save='markersB.pdf') 34 | 35 | # Super-specific 36 | marker_genes_dictC = { 37 | #'Primitive streak': [ ], 38 | 'Mesoendoderm': ['ERVB4_1C-LTR_Mm', 'ETnERV3-int',], 39 | #'others':['MuRRS4-int'], 40 | 'Exe. endoderm': ['MER46C', 'MuRRS4-int', 'RLTR20B3', 'RLTR1B-int', 'LTRIS2',], 41 | 'Exe. ectoderm': ['RLTR45', 'RLTR45-int', 'IAPLTR1_Mm'], 42 | #'Cardiomyocyte': ['ETnERV3-int', 'L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int'], 43 | 'Erythroid': ['RLTR10F', 'L1_Mur1',], 44 | } 45 | sc.pl.dotplot(adata, marker_genes_dictC, groupby='leiden_r0.5', dot_max=0.7, dendrogram=True, standard_scale='var', vmax=1, show=False, save='markersC.pdf') 46 | -------------------------------------------------------------------------------- /example/Figure3/5.marker_genes-leiden-0.2.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import numpy as np 4 | import scipy as sp 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from matplotlib import rcParams 8 | from matplotlib import colors 9 | import seaborn as sb 10 | from rpy2.robjects.packages import importr 11 | #from gprofiler import gprofiler 12 | plt.rcParams['figure.figsize']=(8,8) #rescale figures 13 | sc.settings.verbosity = 1 14 | sc.set_figure_params(dpi=200, dpi_save=300) 15 | 16 | sc.settings.figdir = 'markers-leiden0.2' 17 | 18 | adata = sc.read('learned.h5ad') # 19 | #sc.pp.log1p(adata) 20 | 21 | print(adata.var_names) 22 | 23 | oh = open('gene_names.all.tsv', 'w') 24 | for g in adata.var_names: 25 | oh.write('%s\n' % g) 26 | oh.close() 27 | 28 | marker_genes_dict = { 29 | 'Epiblast': ["Pou5f1"], # Done 30 | 'Primitive Streak': ['Mixl1'], # Done 31 | 
'Meso/endoderm': ['Eomes', 'T'], # Done 32 | 'Endoderm': ['Sox17'], # Done 33 | 'Mesoderm': ['Tbx6'], # Done 34 | 'Ectoderm': ['Nr2f1', 'Pax6'], 35 | 'Exe. endoderm': ["Apoa2"], # Done 36 | 'Exe. ectoderm': ["Tfap2c"], # Done 37 | 'Mesenchyme': ['Pmp22'], # Done 38 | 'Blood progenitors': ['Runx1'], # Done 39 | 'Erythroid': ['Gata1'], # Done 40 | } 41 | 42 | sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.2', vmax=3, rotation=90, dendrogram=False, show=False, save='markers.pdf') 43 | sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.2', dot_max=0.5, dendrogram=False, standard_scale='var', show=False, save='markers.pdf') 44 | sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.2', vmax=3, show=False, save='markers.pdf') 45 | ''' 46 | for k in marker_genes_dict: 47 | sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k)) 48 | sc.pl.umap(adata, color=marker_genes_dict[k], color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save='markers-{0}.pdf'.format(k)) 49 | 50 | ''' 51 | -------------------------------------------------------------------------------- /example/Figure3/5.marker_genes-small-grp_cut.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import numpy as np 4 | import scipy as sp 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from matplotlib import rcParams 8 | from matplotlib import colors 9 | import seaborn as sb 10 | #from rpy2.robjects.packages import importr 11 | #from gprofiler import gprofiler 12 | plt.rcParams['figure.figsize']=(8,8) #rescale figures 13 | sc.settings.verbosity = 1 14 | sc.set_figure_params(dpi=200, dpi_save=300) 15 | 16 | #matplotlib.rcParams['pdf.fonttype']=42 17 | #matplotlib.rcParams['font.size']=6 18 | 19 | todo = 'leiden_r0.3' 20 | 21 | sc.settings.figdir = 'markers-{0}'.format(todo) 
22 | 23 | adata = sc.read('learned.h5ad') 24 | 25 | marker_genes_dict = { 26 | 'Epiblast': ["Pou5f1"], 27 | 'Primitive streak': ["Mixl1"], #Nanong?!?! 28 | 'Endoderms': ["Cer1", "Sox7"], 29 | 'Mesoderms': ["T", 'Cdx1'], 30 | 'Ectoderms': ['Six3'], # And Grhl2 31 | 32 | 'Exe endoderm': ["Apoa2"], 33 | 'Exe ectoderm': ["Tfap2c"], 34 | 35 | 'Cardiomyocytes': ["Tnnt2"], 36 | 'Blood prog.': ["Lmo2", ], 37 | 'Erythroid': ["Gypa"], 38 | } 39 | 40 | sc.pl.stacked_violin(adata, marker_genes_dict, groupby=todo, rotation=90, dendrogram=True, show=False, save='markers.pdf') 41 | sc.pl.dotplot(adata, marker_genes_dict, groupby=todo, color_map='Greens', dot_max=0.7, dendrogram=True, standard_scale='var', show=False, save='markers.pdf') 42 | sc.pl.heatmap(adata, marker_genes_dict, groupby=todo, vmax=3, show=False, save='markers.pdf') 43 | 44 | # high, few: Expressed rarely, but very high in the cells that they are expressed in 45 | marker_genes_dictB = { 46 | #'Epiblast': ['MTEb-int',], 47 | #'Primitive streak': ['RLTR1D2_MM', ], 48 | #'Endothelium': ['ERVB7_2B-LTR_MM',], 49 | 50 | #'Ectoderms': ['MamRep137'], 51 | #'Endoderms': ['MLT1I'], 52 | 'Mesoendoderm': ['RLTR48A', 'IAPEY4_LTR', 'ORR1F-int'], 53 | 'Extraembryonic': ['LTR16A', ], 54 | 'Exe. endoderm': ['MER5C', 'RLTR6B_Mm',], 55 | #'Exe. ectoderm': ['ERVB4_2-LTR_MM', ], 56 | 'Cardiomyocyte': ['L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int', 'RLTR16'], 57 | } 58 | sc.pl.dotplot(adata, marker_genes_dictB, groupby=todo, dot_max=0.3, dendrogram=True, standard_scale='var', vmax=1, show=False, save='markersB.pdf') 59 | 60 | # Super-specific 61 | marker_genes_dictC = { 62 | #'Primitive streak': [ ], 63 | 'Mesoendoderm': ['ERVB4_1C-LTR_Mm', 'ETnERV3-int',], 64 | #'others':['MuRRS4-int'], 65 | 'Exe. endoderm': ['MER46C', 'MuRRS4-int', 'RLTR20B3', 'RLTR1B-int', 'LTRIS2',], 66 | 'Exe. 
ectoderm': ['RLTR45', 'RLTR45-int', 'IAPLTR1_Mm'], 67 | #'Cardiomyocyte': ['ETnERV3-int', 'L1ME3D', 'RLTR13A2', 'ERVB2_1A-I_MM-int'], 68 | 'Erythroid': ['RLTR10F', 'L1_Mur1',], 69 | } 70 | sc.pl.dotplot(adata, marker_genes_dictC, groupby=todo, dot_max=0.7, dendrogram=True, standard_scale='var', vmax=1, show=False, save='markersC.pdf') 71 | -------------------------------------------------------------------------------- /example/Figure3/5.marker_genes-small.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import numpy as np 4 | import scipy as sp 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from matplotlib import rcParams 8 | from matplotlib import colors 9 | import seaborn as sb 10 | from rpy2.robjects.packages import importr 11 | #from gprofiler import gprofiler 12 | plt.rcParams['figure.figsize']=(8,8) #rescale figures 13 | sc.settings.verbosity = 1 14 | sc.set_figure_params(dpi=200, dpi_save=300) 15 | 16 | sc.settings.figdir = 'markers-small' 17 | 18 | adata = sc.read('learned.h5ad') 19 | 20 | marker_genes_dict = { 21 | 'Epiblast': ["Pou5f1"], 22 | 'Primitive streak': ["Eomes", "Mixl1"], #Nanong?!?! 
23 | 'Endoderms': ["Cer1", "Sox7"], 24 | 'Mesoderms': ["T", 'Cdx1'], 25 | 'Ectoderms': ['Grhl2', 'Six3'], 26 | 27 | 'Exe endoderm': ["Apoa2"], 28 | 'Exe ectoderm': ["Tfap2c"], 29 | 30 | 'Cardiomyocytes': ["Tnnt2"], 31 | 'Blood prog.': ["Lmo2", ], 32 | 'Erythroid': ["Gypa"], 33 | } 34 | 35 | sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.5', rotation=90, dendrogram=True, show=False, save='markers.pdf') 36 | sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.5', color_map='Greens', dot_max=0.5, dendrogram=True, standard_scale='var', show=False, save='markers.pdf') 37 | sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.5', vmax=3, show=False, save='markers.pdf') 38 | 39 | for k in marker_genes_dict: 40 | sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k)) 41 | sc.pl.umap(adata, color=marker_genes_dict[k], color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save='markers-{0}.pdf'.format(k)) 42 | -------------------------------------------------------------------------------- /example/Figure3/5.marker_genes.py: -------------------------------------------------------------------------------- 1 | import logging, matplotlib, os, sys 2 | import scanpy as sc 3 | import numpy as np 4 | import scipy as sp 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from matplotlib import rcParams 8 | from matplotlib import colors 9 | import seaborn as sb 10 | from rpy2.robjects.packages import importr 11 | #from gprofiler import gprofiler 12 | plt.rcParams['figure.figsize']=(8,8) #rescale figures 13 | sc.settings.verbosity = 1 14 | sc.set_figure_params(dpi=200, dpi_save=300) 15 | 16 | sc.settings.figdir = 'markers' 17 | 18 | adata = sc.read('learned.h5ad') # You can skip the script 3 if using te 2b. 
19 | #sc.pp.log1p(adata) 20 | 21 | print(adata.var_names) 22 | 23 | oh = open('gene_names.all.tsv', 'w') 24 | for g in adata.var_names: 25 | oh.write('%s\n' % g) 26 | oh.close() 27 | 28 | marker_genes_dict = { 29 | 'Epiblast': ["Pou5f1", "Epcam"], 30 | 'Primitive streak': ["Eomes", "Nanog"], #Nanog?!?! 31 | 'Anterior primitive streak': ["Gsc", "Mixl1"], 32 | 'Notochord': ["Noto", "T"], 33 | 'Def. Endoderm': ["Cer1", "Sox7"], 34 | 'Nascent mesoderm': ["Mesp1", "Apela"], 35 | 'Caudal mesoderm': ["Cdx1", "Hes7"], 36 | 'Paraxial mesoderm': ["Tcf15", "Tbx1"], 37 | 'Somitic mesoderm': ["Tbx6", "Dll1"], 38 | 'Pharngyeal mesoderm': ["Tcf21", "Isl1"], 39 | 'Cardiomyocytes': ["Tnnt2", "Myl4"], 40 | 'Allantois': ["Tbx4", "Hoxa11"], 41 | 'Mesenchyme': ["Krt18", "Pmp22"], 42 | 'Hemandothelial prog.': ["Kdr", "Etv2"], 43 | 'Endothelium': ["Pecam1", "Anxa5"], 44 | 'Blood prog.': ["Runx1", "Lmo2"], 45 | 'Erythroid': ["Gata1", "Gypa"], 46 | 'Neuromesoderml prog.': ["Cdx4", "Epha5"], 47 | 'Neurectoderm': ["Six3", "Irx3"], 48 | 'Neural crest': ["Dlx2", "Sox10"], 49 | 'Brain': ["En1", "Pax2"], 50 | 'Spinal cord': ["Sox2", "Pax2"], 51 | 'Surface ectoderm': ["Trp63", "Grhl2"], 52 | 'Visceral endoderm': ["Dkk1", "Amot"], 53 | 'Exe endoderm': ["Ttr", "Apoa2"], 54 | 'Exe ectoderm': ["Tfap2c", "Elf5"], 55 | 'Parietal endoderm': ["Sparc", "Plat"], 56 | 'others': ['Fgf5', 'Lefty2'], 57 | } 58 | 59 | sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.5', rotation=90, dendrogram=True, show=False, save='markers.pdf') 60 | sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.5', dot_max=0.5, dendrogram=True, standard_scale='var', show=False, save='markers.pdf') 61 | sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.5', vmax=3, show=False, save='markers.pdf') 62 | 63 | for k in marker_genes_dict: 64 | sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k)) 65 | sc.pl.umap(adata, 
import gzip
import os

# Column layout handed to miniglbase's genelist() when loading the BED-like
# index files written below (chrom, left, right, annotation name).
form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}

# Width of the coordinate buckets used to speed up the gene/TE overlap test.
_BUCKET_SIZE = 10000


def _open_text(path):
    """Open *path* for text reading; ``.gz`` files are decompressed on the fly."""
    if path.endswith('.gz'):
        return gzip.open(path, 'rt', encoding='ascii')
    # NOTE: the historical 'rU' mode was removed in Python 3.11; plain 'r'
    # already gives universal-newline handling.
    return open(path, 'r')


def _buckets(left, right):
    """Return the bucket start coordinates spanned by ``left``..``right``."""
    # int(x / size) truncates toward zero, which keeps left == 0 in bucket 0.
    left_buck = int((left - 1) / _BUCKET_SIZE) * _BUCKET_SIZE
    right_buck = int(right / _BUCKET_SIZE) * _BUCKET_SIZE
    return range(left_buck, right_buck + _BUCKET_SIZE, _BUCKET_SIZE)


def _merge_intervals(intervals):
    """Merge half-open ``(left, right)`` intervals.

    Returns a sorted list of ``[first, last]`` pairs where ``last`` is the
    last covered position (``right - 1``). Overlapping and directly adjacent
    intervals are fused; empty intervals (``right <= left``) are ignored.
    """
    merged = []
    for left, right in sorted(iv for iv in intervals if iv[1] > iv[0]):
        if merged and left <= merged[-1][1]:
            if right > merged[-1][1]:
                merged[-1][1] = right
        else:
            merged.append([left, right])
    return [[left, right - 1] for left, right in merged]


def cleanexon(filename, genefilename, exons):
    """Collapse each gene's exons into non-overlapping blocks and write a BED.

    Parameters
    ----------
    filename : str
        Output prefix; the file is written under ``<filename>_scTEtmp/index``.
    genefilename : str
        Base name of the output file (``.bed.gz`` is appended).
    exons : dict
        Maps gene name -> list of ``[chrom, left, right]`` records. Each
        record covers positions ``left`` .. ``right - 1``.

    Notes
    -----
    One tab-separated line ``chrom, first, last, gene`` is written per merged
    block, genes in sorted order. As before, the chromosome written for a
    gene is the one of its last record.

    The previous implementation enumerated every covered base and failed
    (NameError, or silently reused a stale index from the previous gene)
    when a gene covered fewer than two bases; merging intervals directly
    produces the same blocks without that failure mode.
    """
    index_dir = '%s_scTEtmp/index' % filename
    os.makedirs(index_dir, exist_ok=True)

    with gzip.open('%s/%s.bed.gz' % (index_dir, genefilename), 'wt') as oh:
        for k in sorted(exons):
            records = exons[k]
            if not records:
                continue
            chrom = records[-1][0]
            for first, last in _merge_intervals((r[1], r[2]) for r in records):
                oh.write('%s\t%s\t%s\t%s\n' % (chrom, first, last, k))


def annoGtf(filename, genefile, tefile, mode):
    """Build a custom scTE annotation index from a gene GTF and a TE BED file.

    Parameters
    ----------
    filename : str
        Output prefix; everything is written under ``<filename>_scTEtmp/index``.
    genefile : str
        Gene annotation in GTF format, optionally gzip-compressed.
    tefile : str
        TE annotation as a 4-column BED (chrom, left, right, name),
        optionally gzip-compressed.
    mode : str
        ``'exclusive'`` drops TEs overlapping protein_coding/lincRNA exons
        before combining; ``'inclusive'`` keeps every TE.

    Returns
    -------
    str
        Path of the saved ``.glb`` annotation.

    Raises
    ------
    ValueError
        If *mode* is neither ``'exclusive'`` nor ``'inclusive'`` (this used
        to surface as an UnboundLocalError after all the work was done).
    """
    if mode not in ('exclusive', 'inclusive'):
        raise ValueError("mode must be 'exclusive' or 'inclusive', got %r" % (mode,))

    # Imported here so this block is self-contained.
    from scTE.miniglbase import genelist

    genefilename = genefile.split('/')[-1].replace('.gtf','').replace('.gz','')
    tefilename = tefile.split('/')[-1].replace('.bed','').replace('.gz','')

    # raw: every exon/UTR record; clean: only protein_coding / lincRNA ones.
    raw = {}
    clean = {}
    with _open_text(genefile) as fh:
        for l in fh:
            if l.startswith('#') or not l.strip():
                continue
            t = l.strip().split('\t')
            if t[2] not in ('exon', 'UTR'):
                continue
            chrom = t[0].replace('chr','')
            left = int(t[3])
            right = int(t[4])
            name = t[8].split('gene_name "')[1].split('";')[0]

            raw.setdefault(name, []).append([chrom, left, right])
            if 'protein_coding' in l or 'lincRNA' in l:
                clean.setdefault(name, []).append([chrom, left, right])

    cleanexon(filename, '%s.raw' % genefilename, raw)
    cleanexon(filename, '%s.clean' % genefilename, clean)

    raw_bed = '%s_scTEtmp/index/%s.raw.bed.gz' % (filename, genefilename)

    if mode == 'exclusive':
        # Index the clean exon blocks by chromosome and coordinate bucket.
        gene = {}
        clean_bed = '%s_scTEtmp/index/%s.clean.bed.gz' % (filename, genefilename)
        with gzip.open(clean_bed, 'rt', encoding='ascii') as fh:
            for l in fh:
                t = l.strip().split('\t')
                chrom = t[0].replace('chr','')
                left = int(t[1])
                right = int(t[2])
                by_bucket = gene.setdefault(chrom, {})
                for buck in _buckets(left, right):
                    by_bucket.setdefault(buck, []).append([left, right])

        noverlap = []
        with _open_text(tefile) as fh:
            for l in fh:
                if not l.strip():
                    continue
                t = l.strip().split('\t')
                chrom = t[0]
                left = int(t[1])
                right = int(t[2])
                record = '%s\t%s\t%s\t%s\n' % (chrom, left, right, t[3])

                # Gene keys have the 'chr' prefix stripped, so the lookup must
                # strip it too; otherwise a 'chr'-prefixed TE file is never
                # filtered. The output keeps the TE file's own chromosome name.
                by_bucket = gene.get(chrom.replace('chr',''))
                if by_bucket is None:
                    noverlap.append(record)
                    continue

                buckets = _buckets(left, right)
                if not buckets:
                    # right < left: malformed record, dropped as before.
                    continue

                overlaps = any(
                    left < g[1] and right > g[0]
                    for buck in buckets
                    for g in by_bucket.get(buck, ())
                )
                if not overlaps:
                    noverlap.append(record)

        exclusive_path = '%s_scTEtmp/index/%s.exclusive.gz' % (filename, tefilename)
        with gzip.open(exclusive_path, 'wt') as oh:
            oh.writelines(noverlap)

        genes = genelist(raw_bed, format=form, gzip=True)
        TEs = genelist(exclusive_path, format=form, gzip=True)
        print(genes)
        print(TEs)

        all_annot = genes + TEs
        annot = '%s_scTEtmp/index/custome.exclusive.glb' % filename
        all_annot.save(annot)

    else:  # mode == 'inclusive'
        genes = genelist(raw_bed, format=form, gzip=True)
        # Test the real path: tefilename has already had '.gz' stripped, so
        # the old check on it could never detect a compressed TE file.
        if tefile.endswith('.gz'):
            TEs = genelist(tefile, format=form, gzip=True)
        else:
            TEs = genelist(tefile, format=form)

        all_annot = genes + TEs
        annot = '%s_scTEtmp/index/custome.inclusive.glb' % filename
        all_annot.save(annot)

    return annot
def read_opts(parser):
    '''
    **Purpose**
        Parse the command line, validate the input format and attach the logging
        shortcuts and a printable parameter summary to the returned namespace.

    **Arguments**
        parser (Required)
            an object whose parse_args() returns the option namespace

    **Returns**
        The namespace, extended with .parser, .error/.warn/.debug/.info and .argtxt.
        Exits with status 1 when the format is neither "BAM" nor "SAM".
    '''
    args = parser.parse_args()

    if args.format not in ("BAM", "SAM"):
        logging.error("The input file must be SAM/BAM format: %s !\n" % (args.format))
        sys.exit(1)
    args.parser = args.format

    # Logging shortcuts carried around with the options.
    args.error = logging.critical
    args.warn = logging.warning
    args.debug = logging.debug
    args.info = logging.info

    summary = [
        "Parameter list:",
        "Sample = %s" % (args.out),
        "Reference annotation index = %s" % (args.annoglb[0]),
        "Minimum number of genes required = %s" % (args.genenumber),
        "Minimum number of counts required = %s" % (args.countnumber),
        "Number of threads = %s " % (args.thread),
    ]
    args.argtxt = "\n".join(summary)
    return args
or either give the annotation with -te and -gene option \n" ) 52 | # sys.exit(1) 53 | # all_annot = 'mm10.exclusive.glb' 54 | # allelement = set(glload(all_annot)['annot']) 55 | # 56 | # elif mode == 'inclusive': 57 | # if not os.path.exists('mm10.inclusive.glb'): 58 | # logging.error("Did not find the annotation index mm10.inclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" ) 59 | # sys.exit(1) 60 | # all_annot = 'mm10.inclusive.glb' 61 | # allelement = set(glload(all_annot)['annot']) 62 | # 63 | # elif genome == 'hg38': 64 | # chr_list = ['chr'+ str(i) for i in range(1,23) ] + [ 'chrX','chrY', 'chrM' ] 65 | # if mode == 'exclusive': 66 | # if not os.path.exists('hg38.exclusive.glb'): 67 | # logging.error("Did not find the annotation index hg38.exclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" ) 68 | # sys.exit(1) 69 | # all_annot = 'hg38.exclusive.glb' 70 | # allelement = set(glload(all_annot)['annot']) 71 | # 72 | # elif mode == 'inclusive': 73 | # if not os.path.exists('hg38.inclusive.glb'): 74 | # logging.error("Did not find the annotation index hg38.inclusive.glb, you can download it from scTE github (www....) 
def Readanno(filename, annoglb): #genome
    '''
    **Purpose**
        Load a prebuilt annotation index.

    **Arguments**
        filename (Required)
            not used here; kept so existing callers keep working

        annoglb (Required)
            path of the '.glb' index, handed to glload()

    **Returns**
        (set of all 'annot' names, list of distinct chromosome names, annoglb, loaded index)
    '''
    glannot = glload(annoglb)
    allelement = set(glannot['annot'])

    # Derive the chromosomes from the index itself, so custom chromosome names work.
    chr_list = list({k['chr'] for k in glannot['loc']})
    return(allelement, chr_list, annoglb, glannot)


def _count_tagged_reads(filename, tag, limit=100):
    '''Return (reads seen, reads containing "<tag>:Z:") for the first *limit* records of `samtools view`.'''
    marker = '%s:Z:' % tag
    seen = 0
    tagged = 0
    # Argument list instead of a shell string: no quoting problems with odd filenames.
    proc = subprocess.Popen(['samtools', 'view', filename], stdout=subprocess.PIPE, universal_newlines=True)
    try:
        for line in proc.stdout:
            seen += 1
            if marker in line:
                tagged += 1
            if seen >= limit:
                break
    finally:
        # We only need the head of the stream; stop samtools and reap it.
        proc.stdout.close()
        if proc.poll() is None:
            proc.terminate()
        proc.wait()
    return seen, tagged


def checkCBUMI(filename, out, CB, UMI):
    '''
    **Purpose**
        Check that the first reads of the input carry the requested barcode/UMI tags.
        Logs an error and exits with status 1 otherwise.

    **Arguments**
        filename (Required)
            BAM/SAM file readable by `samtools view`

        out (Required)
            not used any more (the counts are read straight from samtools instead of
            going through a temporary file); kept for interface compatibility

        CB (Required)
            'CR' or 'CB' to check that tag; any other value skips the check

        UMI (Required)
            'UR' or 'UB' to check that tag; any other value skips the check

    **Returns**
        None
    '''
    if CB in ('CR', 'CB'):
        seen, tagged = _count_tagged_reads(filename, CB)
        # Every inspected read must carry the tag (up to 100 reads are inspected,
        # so shorter inputs are judged on the reads they actually have).
        if seen == 0 or tagged < seen:
            logging.error("The input file %s has no cell barcodes information, plese make sure the aligner have add the cell barcode key, or set CB to False"%filename)
            sys.exit(1)

    if UMI in ('UR', 'UB'):
        seen, tagged = _count_tagged_reads(filename, UMI)
        if seen == 0 or tagged < seen:
            logging.error("The input file %s has no %s:Z information, plese make sure the aligner have add the UMI key, or set UMI to False" % (filename, UMI))
            sys.exit(1)
def Bam2bed(filename, CB, UMI, out, num_threads):
    '''
    **Purpose**
        Convert the alignments into '<out>_scTEtmp/o1/<out>.bed.gz' by piping
        `samtools view` through awk/sed/gzip. Each output line is
        chrom (leading 'chr' removed), start, start+100, barcode[, UMI].

    **Arguments**
        filename (Required)
            input BAM/SAM file

        CB (Required)
            'CR' or 'CB' to use that tag as the cell barcode, 'False' for no barcode

        UMI (Required)
            'UR' or 'UB' to use that tag as the UMI (requires a barcode tag),
            'False' for no UMI

        out (Required)
            output stub; also written into the barcode column when CB is 'False'

        num_threads (Required)
            value passed to `samtools view -@`

    **Returns**
        None. Unsupported CB/UMI combinations are logged and produce no file.
    '''
    os.makedirs('%s_scTEtmp/o1' % out, exist_ok=True)

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'

    barcode_tags = ('CR', 'CB')
    umi_tags = ('UR', 'UB')

    if UMI == 'False' and CB == 'False':
        tags = []
    elif UMI == 'False' and CB in barcode_tags:
        tags = [CB]
    elif UMI in umi_tags and CB in barcode_tags:
        tags = [CB, UMI]
    else:
        logging.warning('Bam2bed: unsupported combination CB=%s UMI=%s, nothing was converted' % (CB, UMI))
        return

    if not tags:
        # Put the sample name in the barcode slot
        awk_prog = '{OFS="\\t"}{print $3,$4,$4+100,"%s"}' % out
    else:
        # One field index per tag, reset for every record so that a read without
        # the tag is skipped instead of reusing the index found on an earlier read.
        slots = ['n', 'm'][:len(tags)]
        finders = ''.join(
            '{%s=0;for(i=12;i<=NF;i++)if($i~/^%s:Z:/)%s=i}' % (slot, tag, slot)
            for slot, tag in zip(slots, tags)
        )
        awk_prog = '{OFS="\\t"}%s%s{print $3,$4,$4+100,%s}' % (
            finders, '&&'.join(slots), ','.join('$' + slot for slot in slots))

    # NOTE(review): filename/out are interpolated into a shell command unquoted,
    # exactly as before; paths containing spaces or shell metacharacters will break.
    stages = ['samtools view -@ %s %s' % (num_threads, filename), "awk '%s'" % awk_prog]
    stages.extend("sed %s 's/%s:Z://g'" % (switch, tag) for tag in tags)
    stages.append("sed %s 's/^chr//g'" % switch)
    if len(tags) == 2:
        # keep the first line seen for every barcode+UMI pair
        stages.append("awk '!x[$4$5]++'")
    stages.append('gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (out, out))

    os.system(' | '.join(stages))
def Para_bam2bed(filename, CB, UMI, out):
    '''
    **Purpose**
        Per-file variant of the BAM to BED conversion: writes
        '<out>_scTEtmp/o0/<sample>.bed.gz', where <sample> is the input file name
        without directory and '.bam'. Each output line is
        chrom (leading 'chr' removed), start, start+100, barcode[, UMI].

    **Arguments**
        filename (Required)
            input BAM/SAM file

        CB (Required)
            'CR' or 'CB' to use that tag as the cell barcode, 'False' to put the
            sample name in the barcode column

        UMI (Required)
            'UR' or 'UB' to use that tag as the UMI (requires a barcode tag),
            'False' for no UMI

        out (Required)
            output stub used for the temporary directory

    **Returns**
        None. Unsupported CB/UMI combinations are logged and produce no file.
    '''
    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample = filename.split('/')[-1].replace('.bam','')

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'

    barcode_tags = ('CR', 'CB')
    umi_tags = ('UR', 'UB')

    if UMI == 'False' and CB == 'False':
        tags = []
    elif UMI == 'False' and CB in barcode_tags:
        tags = [CB]
    elif UMI in umi_tags and CB in barcode_tags:
        tags = [CB, UMI]
    else:
        logging.warning('Para_bam2bed: unsupported combination CB=%s UMI=%s, nothing was converted' % (CB, UMI))
        return

    if not tags:
        awk_prog = '{OFS="\\t"}{print $3,$4,$4+100,"%s"}' % sample
    else:
        # Only the tags that were actually looked up are printed: the barcode-only
        # case used to print an unset field index as well, which awk expands to the
        # whole input record. Indexes are reset per record so a read lacking a tag
        # is skipped rather than inheriting the index of an earlier read.
        slots = ['n', 'm'][:len(tags)]
        finders = ''.join(
            '{%s=0;for(i=12;i<=NF;i++)if($i~/^%s:Z:/)%s=i}' % (slot, tag, slot)
            for slot, tag in zip(slots, tags)
        )
        awk_prog = '{OFS="\\t"}%s%s{print $3,$4,$4+100,%s}' % (
            finders, '&&'.join(slots), ','.join('$' + slot for slot in slots))

    # NOTE(review): filename/out are interpolated into a shell command unquoted,
    # exactly as before; paths containing spaces or shell metacharacters will break.
    stages = ['samtools view %s' % filename, "awk '%s'" % awk_prog]
    stages.extend("sed %s 's/%s:Z://g'" % (switch, tag) for tag in tags)
    stages.append("sed %s 's/^chr//g'" % switch)
    if len(tags) == 2:
        # keep the first line seen for every barcode+UMI pair
        stages.append("awk '!x[$4$5]++'")
    stages.append('gzip > %s_scTEtmp/o0/%s.bed.gz' % (out, sample))

    os.system(' | '.join(stages))
def splitAllChrs(chromosome_list, filename, genenumber, countnumber, UMI=True):
    '''
    **Purpose**
        Split the data into separate beds, and count up all the times each barcode appears

        This variant uses more memory, but does it all at the same time and gets the filtered whitelist for free

    **Arguments**
        chromosome_list
            List of chromosome names (a 'chr' prefix is ignored)

        filename (Required)
            filename stub to use for tmp files

        genenumber (Required)
            Minimum number of genes expressed required for a cell to pass filtering.
            Only used when countnumber is not set: the threshold is then 2 * genenumber.

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.

        UMI (optional, default=True)
            use the UMI: when true, byte-identical input lines on the same chromosome
            are written and counted once only

    **Returns**
        The barcode whitelist
    '''
    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chromosome_list = set(c.replace('chr', '') for c in chromosome_list)

    CRs = defaultdict(int)
    uniques = {chrom: set() for chrom in chromosome_list} if UMI else None

    file_handles_out = {}
    try:
        for chrom in chromosome_list:
            file_handles_out[chrom] = gzip.open('%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chrom), 'wt')

        # Make a BED for each chromosome
        with gzip.open('%s_scTEtmp/o1/%s.bed.gz' % (filename, filename), 'rt') as file_handle_in:
            for line in file_handle_in:
                t = line.strip().split('\t')
                chrom = t[0].replace('chr', '') # strip chr

                # Force chrMT -> chrM
                if chrom == 'MT' and chrom not in chromosome_list:
                    chrom = 'M'
                # remove the unusual chromosomes (this also covers an annotation
                # without any chrM entry, which used to raise a KeyError below)
                if chrom not in chromosome_list:
                    continue

                if UMI:
                    if line in uniques[chrom]:
                        continue
                    uniques[chrom].add(line)

                CRs[t[3]] += 1
                file_handles_out[chrom].write(line)
    finally:
        for handle in file_handles_out.values():
            handle.close()

    mincounts = countnumber if countnumber else 2 * genenumber

    return [barcode for barcode, count in CRs.items() if count >= mincounts]
def filterCRs(filename, genenumber, countnumber):
    '''
    **Purpose**
        Sum the per-chromosome barcode counts found in
        '<filename>_scTEtmp/o2/<filename>*.count.gz' and keep the barcodes that
        reach the minimum count.

    **Arguments**
        filename (Required)
            filename stub used for the tmp files

        genenumber (Required)
            Only used when countnumber is not set: the threshold is then 2 * genenumber.

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.

    **Returns**
        The barcode whitelist
    '''
    CRs = defaultdict(int)
    for f in sorted(glob.glob('%s_scTEtmp/o2/%s*.count.gz'%(filename,filename))):
        logging.info('Reading %s '%os.path.split(f)[1])
        with gzip.open(f,'rt') as o:
            for l in o:
                t = l.strip().split('\t')
                CRs[t[0]] += int(t[1])

    mincounts = countnumber if countnumber else 2 * genenumber

    logging.info('Before filter %s'%len(CRs))
    kept = [barcode for barcode, count in CRs.items() if count >= mincounts]
    logging.info('After filter %s'%len(kept))

    return kept
def splitChr(chr, filename, CB, UMI):
    '''
    **Purpose**
        Extract the reads of one chromosome from '<filename>_scTEtmp/o1/<filename>.bed.gz'
        into '<filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz' and write the number of
        reads per barcode to the matching '.count.gz' file.

    **Arguments**
        chr (Required)
            chromosome name; 'chr' is removed before matching

        filename (Required)
            filename stub used for the tmp files

        CB, UMI (Required)
            accepted for interface compatibility; the extraction is the same for
            every barcode/UMI setting

    **Returns**
        None
    '''
    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chr = chr.replace('chr','')

    # Match the chromosome column exactly; a prefix match would also pull in
    # contigs such as '1_random' when asked for '1'. 'MT' is still accepted for 'M'.
    wanted = {'M', 'MT'} if chr == 'M' else {chr}

    CRs = defaultdict(int)
    source = '%s_scTEtmp/o1/%s.bed.gz' % (filename, filename)
    target = '%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chr)
    with gzip.open(source, 'rt') as reads, gzip.open(target, 'wt') as kept:
        for l in reads:
            t = l.strip().split('\t')
            if t[0] not in wanted:
                continue
            kept.write(l)
            CRs[t[3]] += 1

    with gzip.open('%s_scTEtmp/o2/%s.chr%s.count.gz' % (filename, filename, chr), 'wt') as o:
        for k in CRs:
            o.write('%s\t%s\n' % (k, CRs[k]))
def align(chr, filename, all_annot, glannot, whitelist): #CB
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

        This is the speed critical part.

    **Arguments**
        chr (Required)
            chromosome name without the 'chr' prefix

        filename (Required)
            filename stub used for the tmp files

        all_annot (Required)
            path of the annotation index; only loaded when glannot is empty

        glannot (Required)
            an already loaded index, or a false value to load all_annot here

        whitelist (Required)
            barcodes to keep; reads with any other barcode are ignored

    **Returns**
        None. Writes barcode, feature, count lines to
        '<filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz'.
    '''
    chr = 'chr' + chr

    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot: # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    buckets = glannot.buckets[chr.replace('chr', '')]
    all_annot = glannot.linearData

    # The whitelist is tested once per read; make sure that test is a hash lookup.
    if not isinstance(whitelist, (set, frozenset, dict)):
        whitelist = set(whitelist)

    res = {}
    with gzip.open('%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr), 'rt') as oh:
        for line in oh:
            t = line.strip().split('\t')
            barcode = t[3]
            if barcode not in whitelist:
                continue
            if barcode not in res:
                res[barcode] = defaultdict(int)

            left = int(t[1])
            rite = int(t[2])

            # 10 kb buckets touched by the read
            left_buck = ((left-1)//10000) * 10000
            right_buck = ((rite)//10000) * 10000

            # get the ids reqd.
            loc_ids = set()
            for buck in range(left_buck, right_buck+10000, 10000):
                if buck in buckets:
                    loc_ids.update(buckets[buck])

            counts = res[barcode]
            for index in loc_ids:
                item = all_annot[index]
                span = item['loc'].loc
                if rite >= span['left'] and left <= span['right']:
                    counts[item['annot']] += 1

    with gzip.open('%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr), 'wt') as oh:
        for bc in sorted(res):
            for gene in sorted(res[bc]):
                oh.write('%s\t%s\t%s\n' % (bc, gene, res[bc][gene]))
def Countexpression(filename, allelement, genenumber, cellnumber, hdf5):
    '''
    **Purpose**
        Build the final cell x feature count table from
        '<filename>_scTEtmp/o4/<filename>.bed.gz' (barcode, feature, count lines)
        and save it as '<filename>.csv' or '<filename>.h5ad'.

    **Arguments**
        filename (Required)
            filename stub used for the tmp files and the output file

        allelement (Required)
            all feature names; they become the (sorted) columns of the table

        genenumber (Required)
            minimum number of input lines a barcode needs to be kept

        cellnumber (Required)
            maximum number of barcodes kept (those with the most lines first)

        hdf5 (Required)
            false: write a csv file; true: write an AnnData h5ad file

    **Returns**
        (number of cells written, genenumber, filename)
    '''
    counts_path = '%s_scTEtmp/o4/%s.bed.gz' % (filename, filename)

    # Pass 1: how many lines (features) does each barcode have?
    whitelist = defaultdict(int)
    with gzip.open(counts_path, 'rt') as o:
        for l in o:
            whitelist[l.strip().split('\t')[0]] += 1

    CRlist = set()
    sortcb = sorted(whitelist.items(), key=lambda item:item[1], reverse=True)
    for n, k in enumerate(sortcb):
        if k[1] < genenumber or n >= cellnumber:
            break
        CRlist.add(k[0])

    # Pass 2: sum the counts of the selected barcodes.
    res = {}
    with gzip.open(counts_path, 'rt') as genes_oh:
        for l in genes_oh:
            t = l.strip().split('\t')
            if t[0] not in CRlist:
                continue
            cell = res.setdefault(t[0], {})
            cell[t[1]] = cell.get(t[1], 0) + int(t[2])

    # Save out the final file
    gene_seen = sorted(allelement) # Do the sort once;
    gene_index = {gene: idx for idx, gene in enumerate(gene_seen)}
    CBs = sorted(res)

    #==== save results =====
    if not hdf5: # save as csv
        with open('%s.csv'%filename, 'w') as res_oh:
            res_oh.write('barcodes,')
            res_oh.write('%s\n' % (','.join([str(i) for i in gene_seen])))

            for k in CBs:
                l = ["0"] * len(gene_seen) # Avoid all the appends
                for gene, count in res[k].items():
                    idx = gene_index.get(gene)
                    if idx is not None: # features outside allelement are not reported
                        l[idx] = str(count)
                res_oh.write('%s,%s\n' % (k, ','.join(l)))

    else: # save as hdf5
        # Imported here because `import scipy` alone does not guarantee that the
        # scipy.sparse submodule is loaded.
        from scipy import sparse

        # Fill the sparse matrix directly instead of going through a dense
        # cells x features table.
        rows = []
        cols = []
        vals = []
        for row, k in enumerate(CBs):
            for gene, count in res[k].items():
                idx = gene_index.get(gene)
                if idx is not None and count:
                    rows.append(row)
                    cols.append(idx)
                    vals.append(count)

        matrix = sparse.csr_matrix((vals, (rows, cols)), shape=(len(CBs), len(gene_seen)), dtype=int)
        obs = pd.DataFrame(index = CBs)
        var = pd.DataFrame(index = gene_seen)
        adata = ad.AnnData(matrix, var = var, obs = obs)
        adata.write('%s.h5ad'%filename)

    #========================

    return len(res), genenumber, filename
def timediff(timestart, timestop):
    '''
    **Purpose**
        Format the time elapsed between two datetime values.

    **Arguments**
        timestart, timestop (Required)
            values whose difference exposes .days, .seconds and .microseconds

    **Returns**
        A string of the form "<days>d <hours>h <minutes>m <seconds>s"
    '''
    elapsed = timestop - timestart

    # Whole seconds within the current day; the fractional part is dropped.
    whole_seconds = int(elapsed.seconds + elapsed.microseconds / 1000000)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    return "%dd %dh %dm %ds" % (elapsed.days, hours, minutes, seconds)
6 | 7 | You can find the full install here: 8 | 9 | https://github.com/oaxiom/glbase3 10 | 11 | == License == 12 | 13 | glbase is distributed under the MIT license: 14 | {{{ 15 | Copyright (C) 2009-2019 Andrew Hutchins 16 | 17 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 20 | 21 | Except as contained in this notice, the name(s) of the above copyright holders shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | }}} 25 | 26 | -------------------------------------------------------------------------------- /scTE/miniglbase/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Initialise glbase, import all the libraries, set up the environment etc. 
4 | 5 | Requires: 6 | * numpy 7 | * matplotlib 8 | * scipy 9 | * sklearn 10 | * h5py 11 | * networkx 12 | """ 13 | 14 | import sys, os 15 | 16 | #----------------------------------------------------------------------- 17 | # Load all of the global configuration options. 18 | try: 19 | from . import config 20 | except: 21 | print("Error: Fatal - glbase3 is not installed correctly, cannot find my own libraries") 22 | print(" Is the python 'sys.path' correct?") 23 | sys.exit() # no raise if I can't get errors, it's surely a fatal installation problem. 24 | 25 | # ---------------------------------------------------------------------- 26 | # Test for availability of the core non-standard libs. 27 | # These need to be available as the subsequent load/checking is weak/non-existent. 28 | 29 | try: 30 | import numpy 31 | config.NUMPY_AVAIL = True 32 | except Exception: 33 | raise LibraryNotFoundError("Fatal - Numpy is not available or not installed") 34 | 35 | try: 36 | import scipy 37 | config.SCIPY_AVAIL = True 38 | except Exception: 39 | raise LibraryNotFoundError("Fatal - Scipy is not available or not installed") 40 | 41 | # ---------------------------------------------------------------------- 42 | # Now import the rest of my libraries - assumes here they are available. 43 | # If I can get config and errors then these are probably available too. 44 | 45 | from .utils import glload 46 | from .location import location 47 | from .genelist import genelist 48 | 49 | # export all of the libraries, methods and helpers. 50 | __all__ = ["genelist", 51 | 'config', 52 | "location", 53 | "glload", 54 | ] 55 | -------------------------------------------------------------------------------- /scTE/miniglbase/base_genelist.py: -------------------------------------------------------------------------------- 1 | 2 | import copy, pickle, re 3 | from shlex import split as shlexsplit 4 | 5 | from . 
import config 6 | from .location import location 7 | 8 | class _base_genelist: 9 | def __init__(self): 10 | """ 11 | (Internal) 12 | This is the base derived class for all genelists. 13 | It contains methods available to all implementations of genelist. 14 | """ 15 | self.name = None 16 | self.linearData = None 17 | 18 | def __repr__(self): 19 | return("") 20 | 21 | def __in__(self, key): 22 | """ 23 | (Override) 24 | 25 | Confer: 26 | if "key" in genelist: 27 | """ 28 | return(key in list(self.keys())) 29 | 30 | def __bool__(self): 31 | """ 32 | Fixes: 33 | if genelist: # contains something 34 | True 35 | 36 | and fixes: 37 | 38 | len(genelist) = 0 39 | if genelist: # Would pass even if the genelist is empty 40 | False 41 | 42 | """ 43 | return(len(self) > 0) 44 | 45 | def __shallowcopy__(self): 46 | raise Exception("__shallowcopy__() is NOT supposrted for genelists, use gl.deepcopy() or gl.shallowcopy()") 47 | 48 | def __deepcopy__(self, fake_arg): 49 | raise Exception("__deepcopy__() is NOT supported for genelists, use gl.deepcopy() or gl.shallowcopy()") 50 | 51 | def deepcopy(self): 52 | """ 53 | Confer copy to mean a deepcopy as opposed to a shallowcopy. 54 | 55 | This is required as genelists are compound lists. 56 | """ 57 | return(pickle.loads(pickle.dumps(self, -1))) # This is 2-3x faster and presumably uses less memory 58 | 59 | def shallowcopy(self): 60 | """ 61 | (New) 62 | 63 | Some weird behaviour here, I know, this is so I can still get access to 64 | the shallow copy mechanism even though 90% of the operations are copies. 65 | """ 66 | return(copy.copy(self)) # But doesnt this just call __copy__() anyway? 67 | 68 | def __len__(self): 69 | """ 70 | (Override) 71 | get the length of the list 72 | """ 73 | return(len(self.linearData)) 74 | 75 | def __int__(self): 76 | """ 77 | (Override) 78 | get the length of the list 79 | NOTE: It's possible this is a bug/feature. 80 | I don't remove it at the moment as I'm not sure if it is used anywhere. 
81 | 82 | """ 83 | return(len(self.linearData)) 84 | 85 | def __iter__(self): 86 | """ 87 | (Override) 88 | make the geneList behave like a normal iterator (list) 89 | """ 90 | for n in self.linearData: 91 | yield n 92 | 93 | def __getitem__(self, index): 94 | """ 95 | (Override) 96 | confers a = geneList[0] behaviour 97 | 98 | This is a very slow way to access the data, and may be a little inconsistent in the things 99 | it returns. 100 | 101 | NOTE: 102 | a = genelist[0] # returns a single dict 103 | a = genelist[0:10] # returns a new 10 item normal python list. 104 | a = genelist["name"] returns a python list containing a vertical slice of all of the "name" keys 105 | 106 | """ 107 | newl = False 108 | if isinstance(index, int): 109 | # this should return a single dictionary. 110 | return(self.linearData[index]) 111 | elif isinstance(index, str): 112 | # returns all labels with that item. 113 | return(self._findAllLabelsByKey(index)) 114 | elif isinstance(index, slice): 115 | # returns a new genelist corresponding to the slice. 116 | newl = self.shallowcopy() 117 | newl.linearData = utils.qdeepcopy(self.linearData[index]) # separate the data so it can be modified. 118 | newl._optimiseData() 119 | return(newl) # deep copy the slice. 120 | 121 | def __setitem__(self, index, *args): 122 | """ 123 | (Override) 124 | Block key editing. 125 | """ 126 | raise AssertionError 127 | 128 | def __hash__(self): 129 | """ 130 | (Override) 131 | 132 | compute a sensible hash value 133 | """ 134 | try: 135 | return(hash(self.name + str(self[0]) + str(self[-1]) + str(len(self)))) # hash data for comparison. 136 | except Exception: 137 | try: 138 | return(hash(self.name + str(self[0]) + str(self[-1]))) # len() probably not available (delayedlist?). 139 | except Exception: # I bet the list is empty. 
140 | return(hash(self.name)) 141 | 142 | def __add__(self, gene_list): 143 | """ 144 | (Override) 145 | confer append like behaviour: c = a + b 146 | keeps duplicates (just concatenate's lists) 147 | """ 148 | mkeys = self._collectIdenticalKeys(gene_list) 149 | if not mkeys: # unable to match. 150 | config.log.warning("No matching keys, the resulting list would be meaningless") 151 | return(False) 152 | newl = self.deepcopy() 153 | newl.linearData.extend(copy.deepcopy(gene_list.linearData)) 154 | newl._optimiseData() 155 | return(newl) 156 | 157 | def __eq__(self, gene_list): 158 | """ 159 | (Internal) 160 | Are the lists equivalent? 161 | lists now, must only have one identical key. 162 | 163 | This is just testing the keys... 164 | Wrong... 165 | """ 166 | # check the hash's first to see if they are identical. 167 | # This is diabled as it can be very slow. 168 | #if self.__hash__() == gene_list.__hash__(): 169 | # return(True) 170 | 171 | for key in self.linearData[0]: 172 | if key in gene_list.linearData[0]: 173 | return(True) # just one key in common required. 174 | return(False) 175 | 176 | def __ne__(self, gene_list): 177 | """ 178 | (Internal) 179 | Are the lists equivalent? 180 | ie do they have the same keys? 181 | """ 182 | return(not self.__eq__(gene_list)) 183 | 184 | def keys(self): 185 | """ 186 | return a list of all the valid keys for this geneList 187 | """ 188 | return([key for key in self.linearData[0]]) # Not exhaustive 189 | 190 | def _guessDataType(self, value): 191 | """ 192 | (Internal) 193 | 194 | Take a guess at the most reasonable datatype to store value as. 195 | returns the resulting data type based on a list of logical cooercions 196 | (explain as I fail each cooercion). 197 | Used internally in _loadCSV() 198 | I expect this will get larger and larger with new datatypes, so it's here as 199 | as a separate function. 
200 | 201 | Datatype coercion preference: 202 | float > list > int > location > string 203 | """ 204 | 205 | try: # see if the element is a float() 206 | if "." in value: # if no decimal point, prefer to save as a int. 207 | return(float(value)) 208 | else: 209 | raise ValueError 210 | except ValueError: 211 | try: 212 | # Potential error here if it is a list of strings? 213 | if '[' in value and ']' in value and ',' in value and '.' in value: # Probably a Python list of floats 214 | return([float(i) for i in value.strip(']').strip('[').split(',')]) 215 | elif '[' in value and ']' in value and ',' in value: # Probably a Python list of ints 216 | return([int(i) for i in value.strip(']').strip('[').split(',')]) 217 | else: 218 | raise ValueError 219 | except ValueError: 220 | try: # see if it's actually an int? 221 | return(int(value)) 222 | except ValueError: 223 | try: # see if I can cooerce it into a location: 224 | return(location(loc=value)) 225 | except (TypeError, IndexError, AttributeError, AssertionError, ValueError): # this is not working, just store it as a string 226 | return(str(value).strip()) 227 | return("") # return an empty datatype. 228 | # I think it is possible to get here. If the exception at int() or float() returns something other than a 229 | # ValueError (Unlikely, Impossible?) 230 | 231 | def _processKey(self, format, column): 232 | """ 233 | (Internal) 234 | the inner part of _loadCSV() to determine what to do with the key. 235 | Better in here too for security. 236 | """ 237 | 238 | d = {} 239 | for key in format: 240 | if isinstance(format[key], str) and "location" in format[key]: 241 | # locations are very common, add support for them out of the box: 242 | d[key] = eval(format[key]) 243 | else: 244 | d[key] = self._guessDataType(column[format[key]]) 245 | 246 | return(d) 247 | 248 | def save(self, filename=None, compressed=False): 249 | """ 250 | **Purpose** 251 | 252 | Save the genelist as a binary representation. 
253 | This is guaranteed to be available for all geneList representations, with 254 | the only exception being the delayedlists. As that wouldn't 255 | make any sense as delayedlists are not copied into memory. 256 | 257 | You can use this method to cache the file. It's particularly useful for large files 258 | that get processed once but are then used a lot. 259 | 260 | loading the list back into memory is relatively quick. 261 | 262 | list = glload("path/to/filename.glb") 263 | 264 | I generally used extension is glb. Although you can use 265 | whatever you like. 266 | 267 | **Arguments** 268 | 269 | filename 270 | filename (and path, if you like) to save the file to 271 | 272 | compressed (Optional, default=False) 273 | use compression (not currently implemented) 274 | 275 | **Result** 276 | 277 | returns None 278 | Saves a binary representation of the geneList 279 | 280 | """ 281 | assert filename, "no filename specified" 282 | 283 | oh = open(filename, "wb") 284 | if compressed: 285 | config.log.warning("compression not currently implemented, saving anyway") 286 | pickle.dump(self, oh, -1) 287 | else: 288 | pickle.dump(self, oh, -1) 289 | oh.close() 290 | config.log.info("Saved binary version of list: '%s'" % filename) 291 | 292 | def from_pandas(self, pandas_data_frame): 293 | """ 294 | **Purpose** 295 | 296 | Convert a pandas dataFrame to a genelist 297 | 298 | NOTE: This is an INPLACE method that will REPLACE any exisiting data 299 | in the 300 | 301 | **Arguments** 302 | 303 | pandas_data_frame (Required) 304 | The pandas data frame to convert 305 | 306 | **Result** 307 | None 308 | The object is populated by 309 | 310 | """ 311 | if len(self) > 0: 312 | config.log.warning('genelist.from_pandas() will overwrite the existing data in the genelist') 313 | 314 | newl = [] 315 | key_names = pandas_data_frame.columns 316 | for index, row in pandas_data_frame.iterrows(): 317 | newitem = {} 318 | for k, item in zip(key_names, row): 319 | newitem[k] = item 320 | 
newl.append(newitem) 321 | self.linearData = newl 322 | self._optimiseData() 323 | 324 | config.log.info("genelist.from_pandas() imported dataFrame") 325 | -------------------------------------------------------------------------------- /scTE/miniglbase/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | config.py 3 | 4 | config must be imported before any other glbase library. 5 | 6 | """ 7 | 8 | import logging 9 | 10 | # -------------- Versioning data 11 | GLBASE_VERSION = "1.1105" 12 | 13 | # -------------- General options 14 | 15 | SILENT = False # set this to True to silence all glbase output. Only works at startup 16 | DEBUG = True 17 | do_logging = True 18 | 19 | # flags for the availability of libraries 20 | MATPLOTLIB_AVAIL = False # required 21 | NUMPY_AVAIL = False # required 22 | SCIPY_AVAIL = False # required 23 | SKLEARN_AVAIL = False # required 24 | H5PY_AVAIL = False # Optional. 25 | NETWORKX_AVAIL = False # optional 26 | PYDOT_AVAIL = False # optional 27 | NUMEXPR_AVAIL = False # Optional 28 | PYGRAPHVIZ_AVAIL = False # Optional 29 | 30 | # Some simple options for printing genelists 31 | NUM_ITEMS_TO_PRINT = 3 # number of items to print by default. 32 | PRINT_LAST_ITEM = True 33 | 34 | # size of buckets for collide() and overlap() 35 | # If this is changed then glload will not work correctly. 36 | bucket_size = 10000 # in bp - tested, seems a reasonable choice. 37 | 38 | # -------------- set up the logger here. 39 | logging.basicConfig(level=logging.DEBUG, 40 | format='%(levelname)-8s: %(message)s', 41 | datefmt='%m-%d %H:%M'), 42 | 43 | 44 | log = logging.getLogger('glbase3') 45 | log.setLevel(logging.INFO) 46 | -------------------------------------------------------------------------------- /scTE/miniglbase/location.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | location.py 4 | 5 | part of glbase. 
import copy
import pickle


class location:
    """
    A genomic interval: chromosome, left and right coordinate.

    The chromosome is stored without its leading "chr" and upper-cased in
    ``self.loc["chr"]``; the printable form ("chr1:100-200") is cached in
    ``self._loc_string``.

    A location can be built from a UCSC-style string
    (``location(loc="chr1:1,000-2,000")``), from another location (copied),
    or from the separate ``chr``/``left``/``right`` arguments.
    """

    def __init__(self, loc=None, chr=None, left=None, right=None):
        if isinstance(loc, location):
            # It's actually already a loc: copy it and leave.
            self.loc = copy.copy(loc.loc)
        elif loc:
            # ucsc includes commas, remove them so you can cut and paste
            text = loc.lower().replace(",", "")
            fields = text.split(":")
            span = fields[1].split("-")
            self.loc = {
                "chr": self._strip_chr(fields[0]).rstrip().upper(),
                "left": int(span[0]),
                "right": int(span[1]),
            }
        else:
            self.loc = {
                "chr": self._strip_chr(str(chr)).rstrip().upper(),
                "left": int(left),
                "right": int(right),
            }
        self.__update()  # make sure the locstring is valid

    @staticmethod
    def _strip_chr(name):
        """
        Remove one leading "chr" from a chromosome name.

        The original used str.strip("chr"), which removes any of the
        characters 'c', 'h', 'r' from BOTH ends, so e.g. "chr2r" became "2".
        """
        return name[3:] if name.startswith("chr") else name

    def __eq__(self, other):
        if not other:
            return False
        if isinstance(other, str):
            return str(self) == other.replace(",", "")  # use string comparison.

        other_loc = getattr(other, "loc", None)
        if not isinstance(other_loc, dict):
            # Not a location-like object: let Python fall back to its default
            # comparison instead of raising AttributeError.
            return NotImplemented
        return (
            self.loc["chr"] == other_loc["chr"]
            and self.loc["left"] == other_loc["left"]
            and self.loc["right"] == other_loc["right"]
        )

    def __lt__(self, other):
        # Make locations sortable: by chromosome name, then by left coordinate.
        return (self.loc["chr"], self.loc["left"]) < (other.loc["chr"], other.loc["left"])

    def __hash__(self):
        return hash(self._loc_string)

    def __deepcopy__(self, memo):
        # Pickle round trip of the instance state (faster than the generic
        # deepcopy machinery). Only the state is pickled, so the copy does not
        # depend on the class being importable by name.
        new = self.__class__.__new__(self.__class__)
        new.__dict__.update(pickle.loads(pickle.dumps(self.__dict__, -1)))
        return new

    def __bool__(self):
        # A location is always truthy, even when it spans zero base pairs
        # (otherwise __len__() would make point locations falsy).
        return True

    def __repr__(self):
        # The original format string was empty, so repr() raised TypeError.
        return "<location %s>" % (self._loc_string,)

    def __len__(self):
        # work out the span.
        return max(0, self.loc["right"] - self.loc["left"])

    def split(self, value=None):
        # ignores the 'value' argument completely and returns a three-ple
        return (self.loc["chr"], self.loc["left"], self.loc["right"])

    def __update(self):
        """Rebuild the cached "chrN:left-right" string from self.loc."""
        chrom = self.loc["chr"]
        if isinstance(chrom, str):
            chrom = self._strip_chr(chrom)
        self._loc_string = "chr%s:%s-%s" % (chrom, self.loc["left"], self.loc["right"])
        if not self._loc_string:  # failed to make a valid string...
            # The original raised a bare string, which is itself a TypeError
            # in Python 3.
            raise AssertionError("Bad location formatting")

    def __getitem__(self, key):
        if key == "string":
            self.__update()  # only update when accessed.
            return self._loc_string
        if key == "dict":
            return self.loc
        return self.loc[key]

    def __setitem__(self, key, value):
        self.loc[key] = value
        self.__update()

    def __str__(self):
        return self._loc_string

    # The methods below never modify this location: they return a modified copy.

    def _derive(self, left, right):
        """Return a copy of this location with new left/right coordinates."""
        new = copy.deepcopy(self)
        new.loc["left"] = left
        new.loc["right"] = right
        new.__update()
        return new

    def expand(self, base_pairs):
        return self._derive(self.loc["left"] - base_pairs, self.loc["right"] + base_pairs)

    def expandLeft(self, base_pairs):
        return self._derive(self.loc["left"] - base_pairs, self.loc["right"])

    def expandRight(self, base_pairs):
        return self._derive(self.loc["left"], self.loc["right"] + base_pairs)

    def shrink(self, base_pairs):
        return self._derive(self.loc["left"] + base_pairs, self.loc["right"] - base_pairs)

    def shrinkLeft(self, base_pairs):
        return self._derive(self.loc["left"] + base_pairs, self.loc["right"])

    def shrinkRight(self, base_pairs):
        return self._derive(self.loc["left"], self.loc["right"] - base_pairs)

    def pointLeft(self):
        """
        get a new location at the exact left of the coordinate
        """
        return self._derive(self.loc["left"], self.loc["left"])

    def pointRight(self):
        """
        get a new location at the exact right of the coordinate
        """
        return self._derive(self.loc["right"], self.loc["right"])

    def pointify(self):
        """
        get a new location at the centre of the coordinate
        """
        new = copy.deepcopy(self)
        centre = (self.loc["left"] + self.loc["right"]) // 2
        new.loc = {"chr": self.loc["chr"], "left": centre, "right": centre}
        new.__update()
        return new

    def collide(self, loc):
        """
        True if `loc` is on the same chromosome and overlaps (or touches)
        this location.
        """
        if loc["chr"] != self["chr"]:
            return False
        return self.qcollide(loc)

    def qcollide(self, loc):
        """
        **Purpose**
            perform a collision with another location object.
            This assumes you have already checked the locations are on the same chromosome.

        **Returns**
            True or False
        """
        return self.loc["right"] >= loc.loc["left"] and self.loc["left"] <= loc.loc["right"]

    def distance(self, loc):
        """
        **Purpose**
            calculate the distance between the centres of two locations.

        **Returns**
            an integer indicating the distance, note that
            the chromosomes should be the same or it will raise an
            AssertionError. distance() should not be used as a test for
            overlap. use collide() for that.
        """
        # Raised explicitly so the check survives `python -O`.
        if self["chr"] != loc["chr"]:
            raise AssertionError("chromosomes are not the same, %s vs %s" % (self, loc))
        return self.qdistance(loc)

    def qdistance(self, loc):
        """
        (Internal)
        distance() without the chromosome check.
        """
        centreA = (self.loc["left"] + self.loc["right"]) // 2
        centreB = (loc["left"] + loc["right"]) // 2
        return centreA - centreB

    def __sub__(self, loc):
        """
        **Purpose**
            Allow things like:

            distance = locA - locB
        """
        return self.distance(loc)

    def offset(self, base_pairs):
        """
        get a new point location `base_pairs` to the right of the left
        coordinate.
        """
        point = self.loc["left"] + base_pairs
        return self._derive(point, point)

    def keys(self):
        """
        Get the keys
        """
        return list(self.loc)
def glload(filename):
    """
    **Purpose**
        Load a glbase binary file
        (Actually a Python pickle)

        SECURITY NOTE: unpickling executes code embedded in the file.
        Only load files that come from a trusted source.

    **Arguments**
        filename (Required)
            the filename of the glbase binary file to load.

    **Returns**
        The glbase object previously saved as a binary file

    **Raises**
        AssertionError if the file does not exist.
        pickle.UnpicklingError if the file is not a valid pickle.
    """
    path = os.path.realpath(filename)
    # Raised explicitly (not with `assert`) so the check survives `python -O`.
    if not os.path.exists(path):
        raise AssertionError("File '%s' not found" % filename)

    try:
        # `with` closes the handle even when unpickling fails.
        with open(path, "rb") as oh:
            newl = pickle.load(oh)
    except pickle.UnpicklingError as err:
        # The original raised BadBinaryFileFormatError, a name that is not
        # defined in this module, so a corrupt file surfaced as a NameError.
        raise pickle.UnpicklingError("File '%s' is not a valid glbase binary file" % filename) from err

    # Recalculate the _optimiseData for old lists, and new features.
    # The attribute accesses below are only there to raise if the saved
    # object predates these features.
    try:
        getattr(newl, "qkeyfind")
        keys = list(newl.keys())
        if "loc" in keys or "tss_loc" in keys:
            # buckets are only present if a loc key is available.
            getattr(newl, "buckets")
    except Exception:
        config.log.warning("Old glb format, will rebuild buckets and/or qkeyfind, consider resaving")
        newl._optimiseData()

    try:
        cons = len(newl._conditions)  # expression-like object
    except AttributeError:
        config.log.info("Loaded '%s' binary file with %s items" % (filename, len(newl)))
    else:
        config.log.info("Loaded '%s' binary file with %s items, %s conditions" % (filename, len(newl), cons))
    return newl
import sys
import os
import gzip
import argparse
import logging
import dbm
import time
import random
import shlex

try:
    import pysam
except ImportError:
    pysam = None  # only parse_bam() needs pysam; it reports the missing dependency itself


def generate_mismatches(seq):
    """
    **Purpose**
        Generate all 1 bp mismatches for the sequence

    **Returns**
        A set of every sequence that differs from `seq` at exactly one
        position (A/C/G/T substitutions only). `seq` itself is not included.
    """
    # The original built the tail with seq[pos:-1], which keeps the base
    # being replaced and drops the last base, so most "mismatches" were not
    # neighbours of `seq` at all.
    variants = set()
    for pos, base in enumerate(seq):
        head, tail = seq[:pos], seq[pos + 1:]
        for alt in "ACGT":
            if alt != base:
                variants.add(head + alt + tail)
    return variants


def fastq(file_handle):
    """
    Generator object to parse a FASTQ file

    Yields one dict per record with the keys "name", "strand", "seq" and
    "qual" (each stripped of surrounding whitespace). Iteration stops at the
    first empty name line, i.e. at end of file.
    """
    while True:
        name = file_handle.readline().strip()
        if not name:
            # End of file. The original yielded one extra record made of
            # empty strings before stopping.
            return
        seq = file_handle.readline().strip()
        strand = file_handle.readline().strip()
        qual = file_handle.readline().strip()

        yield {"name": name, "strand": strand, "seq": seq, "qual": qual}


def library(args):
    """
    Sequence generator iterator

    `args` is a list of iterables of strings, one per position; yields every
    concatenation that takes one element from each position, with the first
    position varying slowest.
    """
    if not args:
        yield ""
        return
    for first in args[0]:
        for rest in library(args[1:]):
            yield first + rest


def _bam_to_bed_command(filename, fields, dest, noDup, bedpe=True):
    """Build the bamToBed | awk | sed [| awk] | gzip shell pipeline as a string."""
    # Mac OSX has BSD sed, which spells the extended-regex switch differently.
    switch = '-E' if sys.platform == 'darwin' else '-r'
    stages = [
        # Paths are shell-quoted so that names containing spaces or shell
        # metacharacters cannot break (or inject into) the command line.
        'bamToBed -i %s%s' % (shlex.quote(filename), ' -bedpe' if bedpe else ''),
        'awk -F ["\t":] \'{OFS="\t"}{print %s}\'' % fields,
        'sed %s \'s/^chr//g\'' % switch,
    ]
    if noDup:
        stages.append('awk \'!x[$0]++\'')  # keep only the first copy of each line
    stages.append('gzip -c > %s' % shlex.quote(dest))
    return ' | '.join(stages)


def atacBam2bed(filename, out, CB, UMI, noDup, num_threads):
    """
    Convert a paired-end BAM into `<out>_scTEtmp/o1/<out>.bed.gz` by running
    a bamToBed pipeline through the shell.

    When `CB` is false the sample name (the BAM file name without ".bam") is
    written in the barcode column; otherwise column 7 of the split bamToBed
    output is used. `noDup` removes duplicated lines. `UMI` and `num_threads`
    are accepted for interface compatibility but are not used.
    """
    sample = filename.split('/')[-1].replace('.bam', '')
    fields = '$1,$2,$6,$7' if CB else '$1,$2,$6,"%s"' % sample
    dest = '%s_scTEtmp/o1/%s.bed.gz' % (out, out)
    os.system(_bam_to_bed_command(filename, fields, dest, noDup))


def para_atacBam2bed(filename, CB, out, noDup):
    """
    Per-file variant of atacBam2bed(): creates `<out>_scTEtmp/o0` if needed
    and writes the converted BED there.
    """
    # The original tested for '%ss_scTEtmp/o0' (note the stray 's'), a path
    # that never matches the directory it then creates.
    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample = filename.split('/')[-1].replace('.bam', '')

    if not CB:
        # Put the sample name in the barcode slot
        cmd = _bam_to_bed_command(filename, '$1,$2,$6,"%s"' % sample,
            '%s_scTEtmp/o0/%s.bed.gz' % (out, sample), noDup)
    elif noDup:
        # NOTE(review): unlike every other branch this one runs bamToBed
        # without -bedpe, prints columns 3 and 4, and writes into o1/ (which
        # this function does not create). Kept exactly as found - confirm
        # whether this is intentional.
        cmd = _bam_to_bed_command(filename, '$1,$2,$3,$4',
            '%s_scTEtmp/o1/%s.bed.gz' % (out, out), True, bedpe=False)
    else:
        cmd = _bam_to_bed_command(filename, '$1,$2,$6,$7',
            '%s_scTEtmp/o0/%s.bed.gz' % (out, out), False)
    os.system(cmd)


def load_expected_whitelist(filename, logger):
    """
    **Purpose**
        Load the expected whitelist and output a set

        One barcode per line; blank lines are ignored.
    """
    with open(filename, 'rt') as oh:
        expected_whitelist = {line.strip() for line in oh if line.strip()}

    logger.info('Found {0:,} expected barcodes'.format(len(expected_whitelist)))

    return expected_whitelist


def build_barcode_dict(barcode_filename, save_whitelist=False, expected_whitelist=False,
    gzip_file=True, logger=False, ondisk=True):
    '''
    **Purpose**
        The BAM and the FASTQ are not guaranteed to be in the same order, so I need to make a look up for
        the read ID and the barcode

    **Arguments**
        barcode_filename (Required)
            FASTQ file holding the barcode reads.

        save_whitelist (Optional, default=False)
            filename to save the observed barcodes to (one per line, sorted).

        expected_whitelist (Optional, default=False)
            container of valid barcodes. Barcodes not in it are corrected to
            a whitelist barcode one mismatch away, or discarded.

        gzip_file (Optional, default=True)
            the FASTQ is gzip compressed.

        logger (Optional, default=False)
            logger to report progress to; a module logger is used if not given.

        ondisk (Optional, default=True)
            keep the lookup in a dbm database on disk instead of in memory.

    **Returns**
        (barcode_lookup, expected_whitelist, tmpfilename), where
        barcode_lookup maps <read name>:<barcode> and tmpfilename is the dbm
        file that was created (None when ondisk is False).
    '''
    if not barcode_filename:
        raise AssertionError('barcode_filename is required')

    if not logger:
        # The default logger=False would otherwise fail on the first log call.
        logger = logging.getLogger(__name__)

    if expected_whitelist:
        logger.info('Checking against the expected whitelist and correcting barcodes')
    else:
        logger.warning('Not checking the barcodes against an expected whitelist, barcodes will not be corrected')

    bad_barcodes = 0
    rescued_barcodes = 0
    n_reads = 0
    n_valid = 0
    # Barcodes are tracked separately: the dbm documentation only lists the
    # in operator, keys(), get() and setdefault(), not values(), and the
    # distinct barcodes are few compared to the reads.
    observed_barcodes = set()

    if ondisk:
        # Only the base name goes into the temporary file name: a FASTQ path
        # with directories would otherwise produce a path below './tpm_'
        # that does not exist.
        tmpfilename = './tpm_{0:}_{1:}_{2:}.dbm'.format(os.path.basename(barcode_filename), time.time(), random.randint(0, 10000))
        barcode_lookup = dbm.open(tmpfilename, 'n')
    else:
        tmpfilename = None
        barcode_lookup = {}

    opener = gzip.open if gzip_file else open
    with opener(barcode_filename, 'rt') as oh:
        for n_reads, fq in enumerate(fastq(oh), start=1):
            if n_reads % 10000000 == 0:
                logger.info('Processed: {:,} barcode reads'.format(n_reads))

            barcode = fq['seq']
            if 'N' in barcode:  # Discard this barcode
                bad_barcodes += 1
                continue

            if expected_whitelist and barcode not in expected_whitelist:
                # barcode not in the whitelist, see if we can rescue it.
                # Candidates are sorted so the choice is reproducible when
                # more than one whitelist barcode is one mismatch away.
                corrected = next((mm for mm in sorted(generate_mismatches(barcode)) if mm in expected_whitelist), None)
                if corrected is None:
                    bad_barcodes += 1  # unrecoverable
                    continue
                barcode = corrected
                rescued_barcodes += 1

            name = fq['name'].split(' ')[0].lstrip('@')  # Any other types seen?
            barcode_lookup[name] = barcode
            observed_barcodes.add(barcode)
            n_valid += 1

    logger.info('Processed: {:,} barcode reads from the FASTQ'.format(n_reads))
    logger.info('Bad reads with no barcode {:,} reads'.format(bad_barcodes))
    logger.info('Rescued {:,} reads'.format(rescued_barcodes))
    logger.info('Found {:,} valid reads'.format(n_valid))
    logger.info('Found {:,} valid barcodes'.format(len(observed_barcodes)))

    if save_whitelist:
        with open(save_whitelist, 'wt') as oh:
            for k in sorted(observed_barcodes):
                oh.write('%s\n' % (k))
        logger.info('Saved whitelist: {0}'.format(save_whitelist))

    return barcode_lookup, expected_whitelist, tmpfilename


def parse_bam(infile, barcode_lookup, outfile, barcode_corrector, logger):
    """
    **Purpose**
        Parse the BAM file `infile[0]`, tag each read with its barcode and
        write complete read pairs to the BAM file `outfile`.

        Reads are skipped when they are unpaired, when their
        query_alignment_length is above 1000, or when their name is not in
        `barcode_lookup`. `barcode_corrector` is accepted but not used.
    """
    if pysam is None:
        raise ImportError('pysam is required to parse BAM files')

    n_reads = 0
    not_paired = 0  # unpaired ATAC
    no_matching_barcode = 0  # No matching read:barcode pair
    pairs_too_far_apart = 0
    matched = 0

    # The BAM file is not guaranteed to be in order, so the first read of each
    # pair waits here until its mate turns up; pairs are written together.
    pending = {}

    inbam = pysam.AlignmentFile(infile[0], 'rb')
    try:
        outbam = pysam.AlignmentFile(outfile, 'wb', template=inbam)
        try:
            for n_reads, read in enumerate(inbam, start=1):
                if n_reads % 10000000 == 0:
                    logger.info('Processed: {:,} reads'.format(n_reads))

                if not read.is_paired:
                    not_paired += 1
                    continue

                # NOTE(review): this tests the aligned length of the read,
                # while the counter is reported as "pairs too far apart" -
                # confirm whether the template length was intended.
                if read.query_alignment_length > 1000:
                    pairs_too_far_apart += 1
                    continue

                # Add the barcode, if the read is in the lookup
                # (one lookup instead of a membership test plus a fetch).
                try:
                    barcode = barcode_lookup[read.query_name]
                except KeyError:
                    no_matching_barcode += 1
                    continue
                if isinstance(barcode, bytes):
                    # dbm databases hand their values back as bytes.
                    barcode = barcode.decode()
                # NOTE(review): tag name kept exactly as found - confirm
                # against the pysam set_tags() documentation.
                read.set_tags([('CR:Z', barcode),])
                matched += 1

                mate = pending.pop(read.query_name, None)
                if mate is None:
                    # no pair yet, store it for later
                    pending[read.query_name] = read
                else:
                    outbam.write(read)
                    outbam.write(mate)
        finally:
            outbam.close()
    finally:
        inbam.close()

    percent_matched = (matched / n_reads * 100.0) if n_reads else 0.0
    logger.info('Processed {:,} reads from the BAM'.format(n_reads))
    logger.info('{:,} reads were unpaired'.format(not_paired))
    logger.info('{:,} read pairs were too far apart'.format(pairs_too_far_apart))
    logger.info('Matched {0:,} ({1:.1f}%) reads to a barcode'.format(matched, percent_matched))
    if pending:
        logger.info('{:,} reads had no mate and were not written'.format(len(pending)))
    logger.info('Save BAM ouput file: {0}'.format(outfile))
    return
'scTE.miniglbase', 25 | ], 26 | platforms=[ 27 | 'Linux', 28 | 'MacOS' 29 | ], 30 | install_requires=[ 31 | 'argparse','scipy','pandas', 32 | 'numpy','anndata', 33 | ], 34 | include_package_data=True, 35 | zip_safe=False, 36 | scripts=[ 37 | 'bin/scTE', 38 | 'bin/scTE_build', 39 | 'bin/scTEATAC_build', 40 | 'bin/scTEATAC', 41 | ] 42 | ) 43 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | 2 | scTE_build -g mm10 -te Data/TE.bed -gene Data/Gene.gtf -o Data/test -m exclusive 3 | 4 | scTE -i Data/test.bam -p 12 --min_genes 1 -o out --genome mm10 -x Data/test.exclusive.idx 5 | 6 | #scTE_build -g mm10 -te Data/TE.bed -gene Data/Gene.gtf -o Data/test -m nointron 7 | 8 | --------------------------------------------------------------------------------