├── Data
├── Gene.gtf
├── TE.bed
├── test.bam
└── test.exclusive.idx
├── LICENSE
├── README.md
├── bin
├── scTE
├── scTEATAC
├── scTEATAC_build
└── scTE_build
├── docs
└── scTE.png
├── example
├── Figure3
│ ├── 0.cluster_scripts
│ │ ├── scte
│ │ │ ├── do_batch.sh
│ │ │ └── scte.sh
│ │ └── starsolo
│ │ │ ├── do_batch.sh
│ │ │ └── starsolo.sh
│ ├── 1.pack.py
│ ├── 2.norm_and_learn.py
│ ├── 3.diffexp.py
│ ├── 4.plots-allgenes.py
│ ├── 4.plots-alltes.py
│ ├── 4.plots-specific-tes.py
│ ├── 5.marker_genes-leiden-0.2.py
│ ├── 5.marker_genes-small-grp_cut.py
│ ├── 5.marker_genes-small.py
│ ├── 5.marker_genes.py
│ └── TE_genes_id.mm10.txt.gz
├── Figure4.ipynb
└── Figure6.ipynb
├── scTE
├── __init__.py
├── annotation.py
├── base.py
├── miniglbase
│ ├── README.md
│ ├── __init__.py
│ ├── base_genelist.py
│ ├── config.py
│ ├── genelist.py
│ ├── location.py
│ └── utils.py
└── scatacseq.py
├── setup.py
└── test.sh
/Data/Gene.gtf:
--------------------------------------------------------------------------------
1 | ##description: evidence-based annotation of the mouse genome (GRCm38), version M21 (Ensembl 96)
2 | ##provider: GENCODE
3 | ##contact: gencode-help@ebi.ac.uk
4 | ##format: gtf
5 | ##date: 2019-03-27
6 | chr1 HAVANA gene 3073253 3074322 . + . gene_id "ENSMUSG00000102693.1"; gene_type "TEC"; gene_name "4933401J01Rik"; level 2; havana_gene "OTTMUSG00000049935.1";
7 | chr1 HAVANA transcript 3073253 3074322 . + . gene_id "ENSMUSG00000102693.1"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1";
8 | chr1 HAVANA exon 3073253 3074322 . + . gene_id "ENSMUSG00000102693.1"; transcript_id "ENSMUST00000193812.1"; gene_type "TEC"; gene_name "4933401J01Rik"; transcript_type "TEC"; transcript_name "4933401J01Rik-201"; exon_number 1; exon_id "ENSMUSE00001343744.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049935.1"; havana_transcript "OTTMUST00000127109.1";
9 | chr1 ENSEMBL gene 3102016 3102125 . + . gene_id "ENSMUSG00000064842.1"; gene_type "snRNA"; gene_name "Gm26206"; level 3;
10 | chr1 ENSEMBL transcript 3102016 3102125 . + . gene_id "ENSMUSG00000064842.1"; transcript_id "ENSMUST00000082908.1"; gene_type "snRNA"; gene_name "Gm26206"; transcript_type "snRNA"; transcript_name "Gm26206-201"; level 3; transcript_support_level "NA"; tag "basic";
11 | chr1 ENSEMBL exon 3102016 3102125 . + . gene_id "ENSMUSG00000064842.1"; transcript_id "ENSMUST00000082908.1"; gene_type "snRNA"; gene_name "Gm26206"; transcript_type "snRNA"; transcript_name "Gm26206-201"; exon_number 1; exon_id "ENSMUSE00000522066.1"; level 3; transcript_support_level "NA"; tag "basic";
12 | chr1 HAVANA gene 3205901 3671498 . - . gene_id "ENSMUSG00000051951.5"; gene_type "protein_coding"; gene_name "Xkr4"; level 2; havana_gene "OTTMUSG00000026353.2";
13 | chr1 HAVANA transcript 3205901 3216344 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
14 | chr1 HAVANA exon 3213609 3216344 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; exon_number 1; exon_id "ENSMUSE00000858910.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
15 | chr1 HAVANA exon 3205901 3207317 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000162897.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-203"; exon_number 2; exon_id "ENSMUSE00000866652.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086625.1";
16 | chr1 HAVANA transcript 3206523 3215632 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
17 | chr1 HAVANA exon 3213439 3215632 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; exon_number 1; exon_id "ENSMUSE00000863980.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
18 | chr1 HAVANA exon 3206523 3207317 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000159265.1"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "processed_transcript"; transcript_name "Xkr4-202"; exon_number 2; exon_id "ENSMUSE00000867897.1"; level 2; transcript_support_level "1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000086624.1";
19 | chr1 HAVANA transcript 3214482 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
20 | chr1 HAVANA exon 3670552 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
21 | chr1 HAVANA CDS 3670552 3671348 . - 0 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
22 | chr1 HAVANA start_codon 3671346 3671348 . - 0 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
23 | chr1 HAVANA exon 3421702 3421901 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 2; exon_id "ENSMUSE00000449517.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
24 | chr1 HAVANA CDS 3421702 3421901 . - 1 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 2; exon_id "ENSMUSE00000449517.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
25 | chr1 HAVANA exon 3214482 3216968 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
26 | chr1 HAVANA CDS 3216025 3216968 . - 2 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
27 | chr1 HAVANA stop_codon 3216022 3216024 . - 0 gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
28 | chr1 HAVANA UTR 3671349 3671498 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 1; exon_id "ENSMUSE00000485541.3"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
29 | chr1 HAVANA UTR 3214482 3216024 . - . gene_id "ENSMUSG00000051951.5"; transcript_id "ENSMUST00000070533.4"; gene_type "protein_coding"; gene_name "Xkr4"; transcript_type "protein_coding"; transcript_name "Xkr4-201"; exon_number 3; exon_id "ENSMUSE00000448840.2"; level 2; protein_id "ENSMUSP00000070648.4"; transcript_support_level "1"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS14803.1"; havana_gene "OTTMUSG00000026353.2"; havana_transcript "OTTMUST00000065166.1";
30 | chr1 HAVANA gene 3252757 3253236 . + . gene_id "ENSMUSG00000102851.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; level 1; tag "pseudo_consens"; havana_gene "OTTMUSG00000049958.1";
31 | chr1 HAVANA transcript 3252757 3253236 . + . gene_id "ENSMUSG00000102851.1"; transcript_id "ENSMUST00000192857.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; transcript_type "processed_pseudogene"; transcript_name "Gm18956-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049958.1"; havana_transcript "OTTMUST00000127143.1";
32 | chr1 HAVANA exon 3252757 3253236 . + . gene_id "ENSMUSG00000102851.1"; transcript_id "ENSMUST00000192857.1"; gene_type "processed_pseudogene"; gene_name "Gm18956"; transcript_type "processed_pseudogene"; transcript_name "Gm18956-201"; exon_number 1; exon_id "ENSMUSE00001339323.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049958.1"; havana_transcript "OTTMUST00000127143.1";
33 | chr1 HAVANA gene 3365731 3368549 . - . gene_id "ENSMUSG00000103377.1"; gene_type "TEC"; gene_name "Gm37180"; level 2; havana_gene "OTTMUSG00000049960.1";
34 | chr1 HAVANA transcript 3365731 3368549 . - . gene_id "ENSMUSG00000103377.1"; transcript_id "ENSMUST00000195335.1"; gene_type "TEC"; gene_name "Gm37180"; transcript_type "TEC"; transcript_name "Gm37180-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049960.1"; havana_transcript "OTTMUST00000127145.1";
35 | chr1 HAVANA exon 3365731 3368549 . - . gene_id "ENSMUSG00000103377.1"; transcript_id "ENSMUST00000195335.1"; gene_type "TEC"; gene_name "Gm37180"; transcript_type "TEC"; transcript_name "Gm37180-201"; exon_number 1; exon_id "ENSMUSE00001343189.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049960.1"; havana_transcript "OTTMUST00000127145.1";
36 | chr1 HAVANA gene 3375556 3377788 . - . gene_id "ENSMUSG00000104017.1"; gene_type "TEC"; gene_name "Gm37363"; level 2; havana_gene "OTTMUSG00000049961.1";
37 | chr1 HAVANA transcript 3375556 3377788 . - . gene_id "ENSMUSG00000104017.1"; transcript_id "ENSMUST00000192336.1"; gene_type "TEC"; gene_name "Gm37363"; transcript_type "TEC"; transcript_name "Gm37363-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049961.1"; havana_transcript "OTTMUST00000127146.1";
38 | chr1 HAVANA exon 3375556 3377788 . - . gene_id "ENSMUSG00000104017.1"; transcript_id "ENSMUST00000192336.1"; gene_type "TEC"; gene_name "Gm37363"; transcript_type "TEC"; transcript_name "Gm37363-201"; exon_number 1; exon_id "ENSMUSE00001343686.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049961.1"; havana_transcript "OTTMUST00000127146.1";
39 | chr1 HAVANA gene 3464977 3467285 . - . gene_id "ENSMUSG00000103025.1"; gene_type "TEC"; gene_name "Gm37686"; level 2; havana_gene "OTTMUSG00000049930.1";
40 | chr1 HAVANA transcript 3464977 3467285 . - . gene_id "ENSMUSG00000103025.1"; transcript_id "ENSMUST00000194099.1"; gene_type "TEC"; gene_name "Gm37686"; transcript_type "TEC"; transcript_name "Gm37686-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049930.1"; havana_transcript "OTTMUST00000127101.1";
41 | chr1 HAVANA exon 3464977 3467285 . - . gene_id "ENSMUSG00000103025.1"; transcript_id "ENSMUST00000194099.1"; gene_type "TEC"; gene_name "Gm37686"; transcript_type "TEC"; transcript_name "Gm37686-201"; exon_number 1; exon_id "ENSMUSE00001337180.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049930.1"; havana_transcript "OTTMUST00000127101.1";
42 | chr1 HAVANA gene 3466587 3513553 . + . gene_id "ENSMUSG00000089699.1"; gene_type "antisense"; gene_name "Gm1992"; level 2; havana_gene "OTTMUSG00000026352.1";
43 | chr1 HAVANA transcript 3466587 3513553 . + . gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1";
44 | chr1 HAVANA exon 3466587 3466687 . + . gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; exon_number 1; exon_id "ENSMUSE00000869502.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1";
45 | chr1 HAVANA exon 3513405 3513553 . + . gene_id "ENSMUSG00000089699.1"; transcript_id "ENSMUST00000161581.1"; gene_type "antisense"; gene_name "Gm1992"; transcript_type "antisense"; transcript_name "Gm1992-201"; exon_number 2; exon_id "ENSMUSE00000864479.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000026352.1"; havana_transcript "OTTMUST00000065165.1";
46 | chr1 HAVANA gene 3512451 3514507 . - . gene_id "ENSMUSG00000103201.1"; gene_type "TEC"; gene_name "Gm37329"; level 2; havana_gene "OTTMUSG00000049929.1";
47 | chr1 HAVANA transcript 3512451 3514507 . - . gene_id "ENSMUSG00000103201.1"; transcript_id "ENSMUST00000192973.1"; gene_type "TEC"; gene_name "Gm37329"; transcript_type "TEC"; transcript_name "Gm37329-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049929.1"; havana_transcript "OTTMUST00000127100.1";
48 | chr1 HAVANA exon 3512451 3514507 . - . gene_id "ENSMUSG00000103201.1"; transcript_id "ENSMUST00000192973.1"; gene_type "TEC"; gene_name "Gm37329"; transcript_type "TEC"; transcript_name "Gm37329-201"; exon_number 1; exon_id "ENSMUSE00001345667.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049929.1"; havana_transcript "OTTMUST00000127100.1";
49 | chr1 HAVANA gene 3531795 3532720 . + . gene_id "ENSMUSG00000103147.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; level 1; tag "pseudo_consens"; havana_gene "OTTMUSG00000049921.1";
50 | chr1 HAVANA transcript 3531795 3532720 . + . gene_id "ENSMUSG00000103147.1"; transcript_id "ENSMUST00000192183.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; transcript_type "processed_pseudogene"; transcript_name "Gm7341-201"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049921.1"; havana_transcript "OTTMUST00000127089.1";
51 | chr1 HAVANA exon 3531795 3532720 . + . gene_id "ENSMUSG00000103147.1"; transcript_id "ENSMUST00000192183.1"; gene_type "processed_pseudogene"; gene_name "Gm7341"; transcript_type "processed_pseudogene"; transcript_name "Gm7341-201"; exon_number 1; exon_id "ENSMUSE00001343235.1"; level 1; transcript_support_level "NA"; ont "PGO:0000004"; tag "pseudo_consens"; tag "basic"; havana_gene "OTTMUSG00000049921.1"; havana_transcript "OTTMUST00000127089.1";
52 | chr1 HAVANA gene 3592892 3595903 . - . gene_id "ENSMUSG00000103161.1"; gene_type "TEC"; gene_name "Gm38148"; level 2; havana_gene "OTTMUSG00000049927.1";
53 | chr1 HAVANA transcript 3592892 3595903 . - . gene_id "ENSMUSG00000103161.1"; transcript_id "ENSMUST00000195166.1"; gene_type "TEC"; gene_name "Gm38148"; transcript_type "TEC"; transcript_name "Gm38148-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049927.1"; havana_transcript "OTTMUST00000127098.1";
54 | chr1 HAVANA exon 3592892 3595903 . - . gene_id "ENSMUSG00000103161.1"; transcript_id "ENSMUST00000195166.1"; gene_type "TEC"; gene_name "Gm38148"; transcript_type "TEC"; transcript_name "Gm38148-201"; exon_number 1; exon_id "ENSMUSE00001343966.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049927.1"; havana_transcript "OTTMUST00000127098.1";
55 | chr1 HAVANA gene 3647309 3658904 . - . gene_id "ENSMUSG00000102331.1"; gene_type "sense_intronic"; gene_name "Gm19938"; level 2; havana_gene "OTTMUSG00000049924.1";
56 | chr1 HAVANA transcript 3647309 3658904 . - . gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1";
57 | chr1 HAVANA exon 3658847 3658904 . - . gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; exon_number 1; exon_id "ENSMUSE00001337496.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1";
58 | chr1 HAVANA exon 3647309 3650509 . - . gene_id "ENSMUSG00000102331.1"; transcript_id "ENSMUST00000192692.1"; gene_type "sense_intronic"; gene_name "Gm19938"; transcript_type "sense_intronic"; transcript_name "Gm19938-201"; exon_number 2; exon_id "ENSMUSE00001339227.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049924.1"; havana_transcript "OTTMUST00000127092.1";
59 | chr1 HAVANA gene 3680155 3681788 . + . gene_id "ENSMUSG00000102348.1"; gene_type "TEC"; gene_name "Gm10568"; level 2; havana_gene "OTTMUSG00000049922.1";
60 | chr1 HAVANA transcript 3680155 3681788 . + . gene_id "ENSMUSG00000102348.1"; transcript_id "ENSMUST00000193244.1"; gene_type "TEC"; gene_name "Gm10568"; transcript_type "TEC"; transcript_name "Gm10568-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049922.1"; havana_transcript "OTTMUST00000127090.1";
61 | chr1 HAVANA exon 3680155 3681788 . + . gene_id "ENSMUSG00000102348.1"; transcript_id "ENSMUST00000193244.1"; gene_type "TEC"; gene_name "Gm10568"; transcript_type "TEC"; transcript_name "Gm10568-201"; exon_number 1; exon_id "ENSMUSE00001341983.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049922.1"; havana_transcript "OTTMUST00000127090.1";
62 | chr1 HAVANA gene 3752010 3754360 . + . gene_id "ENSMUSG00000102592.1"; gene_type "TEC"; gene_name "Gm38385"; level 2; havana_gene "OTTMUSG00000049923.1";
63 | chr1 HAVANA transcript 3752010 3754360 . + . gene_id "ENSMUSG00000102592.1"; transcript_id "ENSMUST00000194454.1"; gene_type "TEC"; gene_name "Gm38385"; transcript_type "TEC"; transcript_name "Gm38385-201"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049923.1"; havana_transcript "OTTMUST00000127091.1";
64 | chr1 HAVANA exon 3752010 3754360 . + . gene_id "ENSMUSG00000102592.1"; transcript_id "ENSMUST00000194454.1"; gene_type "TEC"; gene_name "Gm38385"; transcript_type "TEC"; transcript_name "Gm38385-201"; exon_number 1; exon_id "ENSMUSE00001342074.1"; level 2; transcript_support_level "NA"; tag "basic"; havana_gene "OTTMUSG00000049923.1"; havana_transcript "OTTMUST00000127091.1";
65 | chr1 ENSEMBL gene 3783876 3783933 . - . gene_id "ENSMUSG00000088333.2"; gene_type "snRNA"; gene_name "Gm27396"; level 3;
66 | chr1 ENSEMBL transcript 3783876 3783933 . - . gene_id "ENSMUSG00000088333.2"; transcript_id "ENSMUST00000157708.2"; gene_type "snRNA"; gene_name "Gm27396"; transcript_type "snRNA"; transcript_name "Gm27396-201"; level 3; transcript_support_level "NA"; tag "basic";
67 | chr1 ENSEMBL exon 3783876 3783933 . - . gene_id "ENSMUSG00000088333.2"; transcript_id "ENSMUST00000157708.2"; gene_type "snRNA"; gene_name "Gm27396"; transcript_type "snRNA"; transcript_name "Gm27396-201"; exon_number 1; exon_id "ENSMUSE00000846843.2"; level 3; transcript_support_level "NA"; tag "basic";
68 | chr1 HAVANA gene 3905739 3986215 . - . gene_id "ENSMUSG00000102343.1"; gene_type "lincRNA"; gene_name "Gm37381"; level 2; havana_gene "OTTMUSG00000049934.1";
69 | chr1 HAVANA transcript 3905739 3986215 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
70 | chr1 HAVANA exon 3986147 3986215 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 1; exon_id "ENSMUSE00001344134.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
71 | chr1 HAVANA exon 3985160 3985351 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 2; exon_id "ENSMUSE00001337703.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
72 | chr1 HAVANA exon 3905739 3906134 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000194643.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-202"; exon_number 3; exon_id "ENSMUSE00001345637.1"; level 2; transcript_support_level "3"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127107.1";
73 | chr1 HAVANA transcript 3984225 3985984 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1";
74 | chr1 HAVANA exon 3985160 3985984 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; exon_number 1; exon_id "ENSMUSE00001340315.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1";
75 | chr1 HAVANA exon 3984225 3984298 . - . gene_id "ENSMUSG00000102343.1"; transcript_id "ENSMUST00000192427.1"; gene_type "lincRNA"; gene_name "Gm37381"; transcript_type "lincRNA"; transcript_name "Gm37381-201"; exon_number 2; exon_id "ENSMUSE00001340468.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTMUSG00000049934.1"; havana_transcript "OTTMUST00000127108.1";
76 | chr1 HAVANA gene 3999557 4409241 . - . gene_id "ENSMUSG00000025900.12"; gene_type "protein_coding"; gene_name "Rp1"; level 2; tag "overlapping_locus"; havana_gene "OTTMUSG00000049985.3";
77 | chr1 HAVANA transcript 3999557 4409241 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
78 | chr1 HAVANA exon 4409170 4409241 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
79 | chr1 HAVANA CDS 4409170 4409187 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
80 | chr1 HAVANA start_codon 4409185 4409187 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 1; exon_id "ENSMUSE00001378580.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
81 | chr1 HAVANA exon 4352202 4352837 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 2; exon_id "ENSMUSE00001403780.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
82 | chr1 HAVANA CDS 4352202 4352837 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 2; exon_id "ENSMUSE00001403780.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
83 | chr1 HAVANA exon 4351910 4352081 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 3; exon_id "ENSMUSE00001396015.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
84 | chr1 HAVANA CDS 4351910 4352081 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 3; exon_id "ENSMUSE00001396015.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
85 | chr1 HAVANA exon 4311270 4311433 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 4; exon_id "ENSMUSE00001380053.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
86 | chr1 HAVANA CDS 4311270 4311433 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 4; exon_id "ENSMUSE00001380053.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
87 | chr1 HAVANA exon 4292926 4293012 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 5; exon_id "ENSMUSE00001377871.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
88 | chr1 HAVANA CDS 4292926 4293012 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 5; exon_id "ENSMUSE00001377871.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
89 | chr1 HAVANA exon 4284766 4284898 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 6; exon_id "ENSMUSE00001379434.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
90 | chr1 HAVANA CDS 4284766 4284898 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 6; exon_id "ENSMUSE00001379434.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
91 | chr1 HAVANA exon 4267469 4267620 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 7; exon_id "ENSMUSE00001379919.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
92 | chr1 HAVANA CDS 4267469 4267620 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 7; exon_id "ENSMUSE00001379919.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
93 | chr1 HAVANA exon 4261527 4261605 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 8; exon_id "ENSMUSE00001380048.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
94 | chr1 HAVANA CDS 4261527 4261605 . - 0 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 8; exon_id "ENSMUSE00001380048.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
95 | chr1 HAVANA exon 4245031 4245106 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 9; exon_id "ENSMUSE00001382043.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
96 | chr1 HAVANA CDS 4245031 4245106 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 9; exon_id "ENSMUSE00001382043.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
97 | chr1 HAVANA exon 4243543 4243619 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 10; exon_id "ENSMUSE00001379965.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
98 | chr1 HAVANA CDS 4243543 4243619 . - 1 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 10; exon_id "ENSMUSE00001379965.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
99 | chr1 HAVANA exon 4243417 4243448 . - . gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 11; exon_id "ENSMUSE00001379150.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
100 | chr1 HAVANA CDS 4243417 4243448 . - 2 gene_id "ENSMUSG00000025900.12"; transcript_id "ENSMUST00000208660.1"; gene_type "protein_coding"; gene_name "Rp1"; transcript_type "protein_coding"; transcript_name "Rp1-203"; exon_number 11; exon_id "ENSMUSE00001379150.1"; level 2; protein_id "ENSMUSP00000146439.1"; transcript_support_level "5"; tag "RNA_Seq_supported_only"; tag "basic"; tag "appris_principal_1"; havana_gene "OTTMUSG00000049985.3"; havana_transcript "OTTMUST00000145515.1";
101 |
--------------------------------------------------------------------------------
/Data/TE.bed:
--------------------------------------------------------------------------------
1 | chr3 144583200 144583342 B1_Mur4 0 -
2 | chr6 86389924 86389960 B2_Mm2 0 +
3 | chr7 5364171 5364232 PB1D10 0 +
4 | chr10 55902552 55902867 LTR80B 0 +
5 | chr12 56707313 56707382 B1F 0 +
6 | chr2 62000937 62001039 RMER15 0 +
7 | chr13 67837236 67837625 MTC 0 -
8 | chr13 97860467 97860597 ID_B1 0 -
9 | chr3 129323773 129323852 ID4_ 0 -
10 | chr15 53302093 53302237 B1_Mur4 0 -
11 | chr3 17544777 17545068 MTE2a 0 -
12 | chr14 114380245 114381362 Lx3A 0 -
13 | chr14 36135784 36136221 MLT1G1 0 -
14 | chr9 3382929 3383043 B2_Mm2 0 -
15 | chr2 23523042 23524033 L1Md_F2 0 +
16 | chr10 130416389 130416521 Lx7 0 +
17 | chr10 124812631 124812919 LTR16B 0 -
18 | chr8 121282143 121282358 ORR1G 0 -
19 | chrX 56261784 56261888 B4A 0 -
20 | chr12 19314026 19314159 L2a 0 -
21 | chr13 34470884 34476084 L1Md_A 0 +
22 | chr1 15430986 15431050 MLT1O 0 -
23 | chr11 97176772 97176823 B4 0 +
24 | chr6 120487970 120488131 B2_Mm2 0 -
25 | chr2 112370309 112370404 PB1D9 0 -
26 | chr14 11380848 11380988 L1MB7 0 +
27 | chr7 125706670 125706784 PB1D9 0 -
28 | chr1 119963513 119963866 Lx8 0 +
29 | chr14 121217593 121217684 RLTR20A4 0 -
30 | chr13 14527292 14527394 Lx8b 0 +
31 | chrX 113068169 113068313 B1_Mm 0 -
32 | chr7 21774699 21774922 RMER19B2 0 +
33 | chr3 104611578 104611728 B3A 0 -
34 | chr2 158183914 158183943 B1F1 0 +
35 | chrX 83091173 83091268 PB1D7 0 +
36 | chrY 18505375 18507434 L1_Mus3 0 -
37 | chrY 53460095 53460226 B1_Mus2 0 +
38 | chr18 56988834 56988941 L3 0 +
39 | chr15 46551396 46551807 MMERVK10C-int 0 -
40 | chr18 79506187 79506333 B1_Mm 0 -
41 | chr2 104648414 104648547 B1_Mur2 0 -
42 | chr7 109416903 109417032 Lx7 0 +
43 | chr1 33863431 33863563 ID_B1 0 -
44 | chr4 148585303 148585574 RLTR19-int 0 -
45 | chr2 164776167 164776283 B1_Mur2 0 +
46 | chr2 155889136 155889458 MLTR11B 0 +
47 | chr1 140608946 140609064 RMER13A2 0 -
48 | chr11 50474308 50474667 ORR1A2 0 +
49 | chr3 35549471 35549633 Lx7 0 -
50 | chr18 20885705 20885850 B1_Mus1 0 +
51 | chr9 98122822 98123031 URR1B 0 +
52 | chr5 145787688 145787824 RSINE1 0 +
53 | chr9 116910264 116910518 B4 0 +
54 | chr2 118982678 118982802 L1MB8 0 -
55 | chr1 74231577 74231701 ID_B1 0 -
56 | chr3 51388265 51388358 PB1D7 0 +
57 | chr1 78437903 78438016 ID_B1 0 +
58 | chr1 179450543 179450599 PB1D9 0 +
59 | chr11 106956412 106956506 B1F 0 -
60 | chr7 105070982 105071111 B1F 0 +
61 | chr14 55891766 55891869 B1F2 0 +
62 | chr3 95002315 95002463 B1_Mm 0 +
63 | chr14 123443243 123443788 L1_Mus1 0 +
64 | chr9 84553142 84553311 ID_B1 0 -
65 | chrX 74054421 74054609 B2_Mm2 0 -
66 | chr2 50599335 50599996 L1_Mur2 0 +
67 | chr11 10009054 10009447 RLTR47 0 +
68 | chr14 14575064 14575178 B2_Mm2 0 -
69 | chrX 66050795 66051345 L1Md_F2 0 +
70 | chr4 109302482 109302690 B3 0 +
71 | chr6 5823803 5823847 MLT1B 0 +
72 | chr9 94472366 94472513 B1_Mus1 0 -
73 | chr2 7172981 7173150 Tigger19a 0 +
74 | chr9 33581540 33581630 B3A 0 +
75 | chr1 60831307 60832014 L1_Mur3 0 -
76 | chr2 16821242 16821456 RMER15-int 0 -
77 | chr7 142943894 142944262 ORR1C2 0 +
78 | chr12 73440499 73440743 B4 0 -
79 | chrX 90113268 90113445 B3 0 +
80 | chr18 20618867 20619808 L1M3e 0 +
81 | chr9 114718823 114718968 B1_Mm 0 -
82 | chr11 12670894 12671016 MIR 0 -
83 | chr13 32387251 32387629 MLT1D 0 +
84 | chrX 97791970 97792192 URR1A 0 +
85 | chr13 76374166 76374333 ERVB4_1B-I_MM-int 0 -
86 | chr5 47907546 47907672 Lx10 0 +
87 | chr16 8837567 8837715 B1_Mus1 0 -
88 | chr4 150767884 150768026 B1_Mur4 0 +
89 | chr10 99491068 99491203 B1_Mur3 0 -
90 | chr17 90847952 90849043 L1_Mus3 0 +
91 | chr2 99125206 99126690 L1_Mur3 0 -
92 | chr13 47581815 47582027 B4A 0 +
93 | chr6 99194219 99194361 MER117 0 +
94 | chr14 30096250 30096453 B3 0 +
95 | chr13 24443830 24443959 PB1D10 0 -
96 | chr13 28396991 28397036 L1Md_F2 0 +
97 | chr1 165293822 165294036 B3 0 +
98 | chr7 17889030 17889430 RMER6BA 0 +
99 | chr9 25684255 25684348 MLT2D 0 -
100 | chr6 119394571 119394668 PB1D7 0 +
101 | chr19 119394571 119394668 test 0 +
102 | chrM 1193971 1193968 test 0 +
103 |
--------------------------------------------------------------------------------
/Data/test.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/Data/test.bam
--------------------------------------------------------------------------------
/Data/test.exclusive.idx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/Data/test.exclusive.idx
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Jiangping He, Andrew P. Hutchins & Jiekai Chen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | scTE
2 | ==============
3 |
4 | Quantifying transposable element (TEs) expression from single-cell sequencing data
5 | ----------------------------------------------------------------------
6 | [](https://zenodo.org/badge/190696033.svg)
7 |
8 | scTE takes as input:
9 |
10 | * Aligned sequence reads (BAM/SAM format)
11 | * The genomic location of TEs (BED format)
12 | * The genomic location of genes (GTF format)
13 |
14 |
15 | 
16 |
17 |
18 | Installation
19 | ------------
20 | scTE works with python >=3.6.
21 |
22 | ```bash
23 | $ git clone https://github.com/JiekaiLab/scTE.git
24 | $ cd scTE
25 | $ python setup.py install
26 | ```
27 |
28 | Usage
29 | -----
30 |
31 | **Building genome indices**
32 | scTE builds genome indices for the fast alignment of reads to genes and TEs. These indices can be automatically generated using the commands:
33 |
34 | ```bash
35 | $ scTE_build -g mm10 # Mouse
36 | $ scTE_build -g hg38 # Human
37 | $ scTE_build -g panTro6 # Chimpanzee
38 | $ scTE_build -g macFas5 # Macaca fascicularis
39 | $ scTE_build -g dm6 # Drosophila melanogaster
40 | $ scTE_build -g danRer11 # Zebrafish
41 | $ scTE_build -g xenTro9 # Xenopus tropicalis
42 | ```
43 |
44 | These scripts will automatically download the genome annotations, for mouse:
45 |
46 | ```bash
47 | $ ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz
48 | $ http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz
49 | ```
50 |
51 | Or for human:
52 |
53 | ```bash
54 | $ ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz
55 | $ http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz
56 | ```
57 |
58 | Or for Chimpanzee:
59 |
60 | ```bash
61 | $ http://ftp.ensembl.org/pub/release-103/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.103.gtf.gz
62 | $ https://hgdownload.soe.ucsc.edu/goldenPath/panTro6/database/rmsk.txt.gz
63 | ```
64 |
65 | Or for Macaca fascicularis:
66 |
67 | ```bash
68 | $ http://ftp.ensembl.org/pub/release-102/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_5.0.102.gtf.gz
69 | $ http://hgdownload.soe.ucsc.edu/goldenPath/macFas5/database/rmsk.txt.gz
70 | ```
71 |
72 | Or for Drosophila melanogaster:
73 |
74 | ```bash
75 | $ http://ftp.ensembl.org/pub/release-103/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.103.gtf.gz
76 | $ http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz
77 | ```
78 |
79 | Or for Zebrafish:
80 |
81 | ```bash
82 | $ http://ftp.ensembl.org/pub/release-103/gtf/danio_rerio/Danio_rerio.GRCz11.103.gtf.gz
83 | $ https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/database/rmsk.txt.gz
84 | ```
85 |
86 | Or for Xenopus tropicalis:
87 |
88 | ```bash
89 | $ http://ftp.ensembl.org/pub/release-103/gtf/xenopus_tropicalis/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.103.gtf.gz
90 | $ https://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/database/rmsk.txt.gz
91 | ```
92 |
93 | `mm10, hg38, panTro6, macFas5, dm6, danRer11, xenTro9` are the genome assembly versions.
94 | If you want to use a custom reference, you can use the `-te` and `-gene` options:
95 |
96 | ```
97 | scTE_build -te TEs.bed -gene Genes.gtf -o custome
98 |
99 | -te
100 | Six columns bed file for transposable elements annotation.
101 | -gene
102 | Gtf file for genes annotation.
103 | ```
104 | For more information about the BED and GTF formats, see the [UCSC format FAQ](https://genome.ucsc.edu/FAQ/FAQformat).
105 | These annotations are then processed and converted into genome indices. The scTE algorithm will allocate
106 | reads first to gene exons, and then to TEs by default. Hence TEs inside exon/UTR regions of genes annotated
107 | in GENCODE will only contribute to the gene, and not to the TE score. This feature can be changed by
108 | setting `--mode/-m inclusive` in scTE, which will instruct scTE to assign the reads to both TEs and genes
109 | if a read comes from a TE inside exon/UTR regions of genes. If you want to remove the TEs inside the introns
110 | of genes, you can set `--mode/-m nointron` in scTE.
111 |
112 | **Analysis of 10x style scRNA-seq data**
113 |
114 | scTE takes a BAM/SAM file as input; using an unfiltered alignment file is highly recommended.
115 |
116 | For a `bam` file generated by [STARsolo](https://github.com/alexdobin/STAR) etc., the cell barcode and UMI need to be stored in the read's 'CR:Z' and 'UR:Z' tags, as shown below:
117 |
118 | ```bash
119 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx --hdf5 True -CB CR -UMI UR
120 | ```
121 | ```bash
122 | $ samtools view test.bam
123 | A00269:12:H7YF2DMXX:2 0 chr10 55902580 255 50M * 0 0 GTTCTCTCCGTATGTGAGCATGGGAGATACATCCCAGAAAGGCAGAAGGG FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:CTAGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF
124 | A00269:13:H7YF2DMXX:2 0 chr10 55902784 255 50M * 0 0 ATAATCTTTGAGATCTCTGGTGAAAATAAGTAGCATAAAGGACAGAATCA FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:CTAGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF
125 | A00269:14:H7YF2DMXX:2 0 chr13 67837311 255 50M * 0 0 CTGTTCATTATTTGAGGAAATCAGGACAGGAAATCAAACATGGCAGAATC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:ATCGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF
126 | A00269:15:H7YF2DMXX:2 0 chr14 114380523 255 50M * 0 0 GATCCAGATTAATTGAGACTGTTGATCCTCCTACAGGGTCGCCCTTCTCC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:1 HI:i:1 AS:i:49 nM:i:0 CR:Z:CTAGAGTGTTTCGCTC CY:Z:FFFFFFFFFFFFFFFF UR:Z:TACATGACGC UY:Z:FFFFFFFFFF
127 | ```
128 |
129 | For a `bam` file generated by [Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) etc., the cell barcode and UMI need to be stored in the read's 'CB:Z' and 'UB:Z' tags, as shown below:
130 |
131 | ```bash
132 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx --hdf5 True -CB CB -UMI UB
133 | ```
134 | ```bash
135 | $ samtools view test.bam
136 | A00519:758:HTCCHDSXY:3:2535:21296:19774 16 chr1 14021 0 90M * 0 0 TGGATTTCTATCTCCCTGGCTTGGTGCCAGTTCCTCCAAGTCGATGGCACCTCCCTCCCTCTCAACCACTTGAGCAAACTCCAAGACATC ,FFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:F:FFFFFFFFFFFFFFFFFFF:FFFFF NH:i:5 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3 RE:A:I xf:i:0 CR:Z:CTCCCTCCACTGCGAC CY:Z:FFFFFFFFFFFFFFFF CB:Z:CTCCCTCCACTGCGAC-1 UR:Z:AAGGCGTAGTAG UY:Z:FFFFFFFFFFFF UB:Z:AAGGCGTAGTAG
137 | A00519:758:HTCCHDSXY:1:1355:17237:31720 0 chr1 14260 0 90M * 0 0 CTCCCTCTCATCCCAGAGAAACAGGTCAGCTGGGAGCTTCTGCCCCCACTGCCTAGGGACCAACAGGGGCAGGAGGCAGTCACTGACCCC FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:5 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:1 RE:A:I xf:i:0 CR:Z:TCGTCCACAGTATGAA CY:Z:FFFFFFFFFFFFFFFF CB:Z:TCGTCCACAGTATGAA-1 UR:Z:GACTTATTTTTT UY:Z:FFFFFFFFFFFF UB:Z:GACTTATTTTTT
138 | A00519:758:HTCCHDSXY:3:2227:16703:32080 16 chr1 14411 1 90M * 0 0 TCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG FFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFF:FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:3 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3 RE:A:I xf:i:0 CR:Z:TTGAGTGGTTGTGGCC CY:Z:FFFFFFFFFFFFFFFF CB:Z:TTGAGTGGTTGTGGCC-1 UR:Z:TATAATGCTCAG UY:Z:FFFFFFFFFFFF UB:Z:TATAATGCTCAG
139 | A00519:758:HTCCHDSXY:3:2563:23665:33802 16 chr1 14411 1 90M * 0 0 TCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGCCTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG FFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF NH:i:3 HI:i:1 AS:i:88 nM:i:0 RG:Z:SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K:0:1:HTCCHDSXY:3 RE:A:I xf:i:0 CR:Z:TGTTGAGAGGCAATGC CY:Z:FFFFFFFFFFFFFFFF CB:Z:TGTTGAGAGGCAATGC-1 UR:Z:ACGGGTGTGGAG UY:Z:FFFFFFFFFFFF UB:Z:ACGGGTGTGGAG
140 | ```
141 | ```
142 | -i
143 | Input file: BAM/SAM file from CellRanger or STARsolo
144 | -o
145 | Output file prefix
146 | -x
147 | The filename of the index for the reference genome annotation generated by scTE_build
148 | -p
149 | Number of threads to use, Default: 1. scTE takes ~10Gb memory each thread for human and mouse genome.
150 | --hdf5
151 | Save the output as .h5ad formatted file instead of csv file. Default: False
152 | ```
153 |
154 | scTE is most tuned to [STARsolo](https://github.com/alexdobin/STAR) or the [Cell Ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline outputs,
155 | and can accept BAM files produced by either of these two programs.
156 | For other aligners, the barcode should be stored in the `CR:Z` or `CB:Z` tag, and the UMI in the `UR:Z` or `UB:Z` tag in the BAM file
157 |
158 | **Analysis of C1 style scRNA-seq data**
159 | If the UMI is missing or not used in the scRNA-seq technology (for example on the Fluidigm C1 platform), it can be disabled with the `-UMI False`
160 | switch in scTE (the default is `UR`). If the barcode is missing it can be disabled with `-CB False` (the default is `CR`),
161 | and instead the cell barcodes will be taken from the names of the BAM files.
162 |
163 | ```bash
164 | $ scTE -i inp.bam -o out -x mm10.exclusive.idx -CB False -UMI False
165 | ```
166 | Multiple BAM files can be provided to scTE with the `-i` option:
167 | ```
168 | $ scTE -i *.bam -o out -x mm10.exclusive.idx -CB False -UMI False
169 | ```
170 | or
171 | ```
172 | $ scTE -i input1.bam,input2.bam,... -o out -x mm10.exclusive.idx -CB False -UMI False
173 | ```
174 |
175 | **Analysis of scATAC-seq data**
176 | The genome indices were prebuilt using:
177 | ```
178 | $ wget -c http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz -O mm10.te.txt.gz
179 | $ zcat mm10.te.txt.gz | grep -E 'LINE|SINE|LTR|Retroposon' | cut -f6-8,11 >mm10.te.bed
180 | $ scTEATAC_build -g mm10.te.bed -o mm10.te.atac
181 | ```
182 | Then the BAM file can be processed using scTE with the command:
183 | ```
184 | scTEATAC -i input.bam -x mm10.te.atac.idx
185 | ```
186 |
187 | **Citation**
188 | If scTE is useful for your research, consider citing [Nature Communications (2021)](https://www.nature.com/articles/s41467-021-21808-x)
189 |
190 |
191 |
--------------------------------------------------------------------------------
/bin/scTE:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pandas as pd
3 | import multiprocessing
4 | from functools import partial
5 | import logging
6 | import os, sys, glob, datetime, time, gzip
7 | import argparse
8 | import collections
9 | from math import log
10 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
11 | from scTE.miniglbase import genelist, glload, location
12 | from scTE.annotation import annoGtf
13 | from scTE.base import *
14 |
def prepare_parser():
    """Build the command-line parser for the ``scTE`` executable.

    The required arguments (``-i``, ``-x``, ``-o``) live in their own
    "required arguments" group, which is registered ahead of argparse's
    default option group so that it is listed first in ``--help``.

    Returns:
        argparse.ArgumentParser: the configured parser. All True/False style
        switches (``-CB``, ``-UMI``, ``--keeptmp``, ``--hdf5``) are parsed as
        strings; converting them is left to the caller.
    """
    desc = "Quantify transposable element (TE) expression from single-cell sequencing data."

    exmp = "Example: scTE <-i scRNA.sorted.bam> <-o out> [--min_genes 200] [--min_counts 400] [-p 4] <-x mm10.exclusive.idx>"

    parser = argparse.ArgumentParser(prog='scTE', description=desc, epilog=exmp)

    # Temporarily detach argparse's default option group; it is re-attached
    # after the "required" group so that required arguments are shown first.
    optional = parser._action_groups.pop()

    optional.add_argument('--min_genes', dest='genenumber', metavar='INT', type=int, default=200,
        help='Minimum number of genes expressed required for a cell to pass filtering. Default: 200')

    # No argparse default here: the value stays None unless given on the command line.
    optional.add_argument('--min_counts', dest='countnumber', metavar='INT', type=int,
        help='Minimum number of counts required for a cell to pass filtering. Default: 2*min_genes')

    optional.add_argument('--expect-cells', dest='cellnumber', metavar='INT', type=int, default=10000,
        help='Expected number of cells. Default: 10000')

    optional.add_argument('-f', '--format', metavar='input file format', dest='format', type=str, nargs='?', default='BAM', choices=['BAM', 'SAM'],
        help='Input file format: BAM or SAM. DEFAULT: BAM')

    optional.add_argument('-CB', dest='CB', type=str, nargs='?', default='CR', choices=['CR', 'CB', 'False'],
        help='Set to false to ignore for cell barcodes, it is useful for SMART-seq. If you set CB=False, it also will set UMI=False by default, Default: CR')

    # The help text used to advertise "Default: True", which is not even a
    # valid choice; the actual default is the 'UR' tag.
    optional.add_argument('-UMI', dest='UMI', type=str, nargs='?', default='UR', choices=['UR', 'UB', 'False'],
        help='Set to false to ignore for UMI, it is useful for SMART-seq. Default: UR')

    optional.add_argument('--keeptmp', dest='keeptmp', type=str, nargs='?', default='False', choices=['True', 'False'],
        help='Keep the _scTEtmp file, which is useful for debugging. Default: False')

    optional.add_argument('--hdf5', dest='hdf5', type=str, nargs='?', default='False', choices=['True', 'False'],
        help='Save the output as .h5ad formatted file instead of csv file. Default: False')

    optional.add_argument('-p', '--thread', metavar='INT', dest='thread', type=int, default=1,
        help='Number of threads to use, Default: 1')

    optional.add_argument('-v', '--version', action='version', version='%(prog)s 1.0')

    required = parser.add_argument_group('required arguments')

    required.add_argument('-i', '--input', dest='input', type=str, nargs='+', required=True,
        help='Input file: BAM/SAM file from CellRanger or STARsolo, the file must be sorted by chromosome position')

    required.add_argument('-x', dest='annoglb', nargs='+', required=True,
        help='The filename of the index for the reference genome annotation.')

    required.add_argument('-o', '--out', dest='out', nargs='?', required=True, help='Output file prefix')

    # Re-attach the option group after the required group. The original code
    # then registered a second, empty 'optional arguments' group that was
    # never used; that dead code has been dropped.
    parser._action_groups.append(optional)

    return parser
71 |
def main():
    """Run the scTE command-line pipeline from argument parsing to the final report.

    The stages, in the order they are executed:

    1. Parse the command line (``read_opts(prepare_parser())``).
    2. Load the annotation index with ``Readanno``.
    3. Validate every input file with ``checkCBUMI`` and convert the input(s)
       with ``Para_bam2bed`` (several inputs) or ``Bam2bed`` (one input).
    4. Split per chromosome and obtain the barcode whitelist
       (``splitAllChrs`` single-threaded, ``splitChr`` + ``filterCRs`` otherwise).
    5. Run ``align`` for every chromosome and concatenate the per-chromosome
       results into ``<out>_scTEtmp/o4``.
    6. Call ``Countexpression`` and report the number of cells detected.
    7. Delete ``<out>_scTEtmp`` unless ``--keeptmp True`` was given.

    Intermediate files are written beneath ``<basename of -o>_scTEtmp`` in the
    current working directory.
    """
    # Function-scope import: shutil is not among this script's top-level imports.
    import shutil

    def stamp():
        # Timestamp appended to every progress message.
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    timestart = datetime.datetime.now()
    args = read_opts(prepare_parser())

    # --hdf5 arrives as the string 'True'/'False'; turn it into a real boolean.
    # (-CB / -UMI are deliberately left as the tag name or the string 'False'.)
    args.hdf5 = (args.hdf5 == 'True')

    info = args.info

    assert sys.version_info >= (3, 6), 'Python >=3.6 is required'

    info(args.argtxt + "\n")

    # Only the last path component of the prefix names the temp directory.
    outname = args.out.split('/')[-1]
    tmpdir = '%s_scTEtmp' % outname

    info("Loading the genome annotation index... %s" % stamp())
    allelement, chr_list, all_annot, glannot = Readanno(filename=outname, annoglb=args.annoglb[0]) #genome=args.genome
    print(sorted(chr_list))
    info("Finished loading the genome annotation index... %s \n" % stamp())

    info("Processing BAM/SAM files ...%s" % stamp())

    # Accept "-i a.bam,b.bam" as an alternative to "-i a.bam b.bam".
    if len(args.input) == 1 and ',' in args.input[0]:
        args.input = args.input[0].split(',')

    # os.makedirs replaces the former `mkdir -p` shell call, which broke on
    # output names containing spaces or shell metacharacters.
    os.makedirs(os.path.join(tmpdir, 'o1'), exist_ok=True)

    for k in args.input:
        checkCBUMI(filename=k, out=outname, CB=args.CB, UMI=args.UMI)
    info("Input SAM/BAM file appears to be valid")

    if len(args.input) > 1:
        info('Using parabam2bed as more than 1 input BAM')
        partial_work = partial(Para_bam2bed, CB=args.CB, UMI=args.UMI, out=outname)
        # The context manager releases the worker processes once map() returns;
        # previously each pool was left open for the rest of the run.
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, args.input)
        # Needs a shell for the glob and the pipe.
        os.system('gunzip -c -f %s_scTEtmp/o0/*.bed.gz | gzip > %s_scTEtmp/o1/%s.bed.gz' % (outname,outname,outname))

    else:
        print(args.CB,args.UMI,'good\n')
        Bam2bed(args.input[0], args.CB, args.UMI, outname, args.thread)
    info("Done BAM/SAM files processing ...%s \n" % stamp())

    info("Splitting ...%s" % stamp())
    if args.thread == 1: #Single thread path, mainly
        # This is useful for testing optimsations, as the multiprocessing path the profile
        # Just gets locked up in {method 'acquire' of '_thread.lock' objects}
        info('Executing single thread path')
        whitelist = splitAllChrs(chr_list, filename=outname, genenumber=args.genenumber, countnumber=args.countnumber, UMI=args.UMI)
    else:
        info('Executing multiple thread path with %s threads' % args.thread)
        partial_work = partial(splitChr, filename=outname, CB=args.CB, UMI=args.UMI)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)
        whitelist = filterCRs(filename=outname, genenumber=args.genenumber, countnumber=args.countnumber)

    info("Finished processing sample files %s \n" % stamp())

    info("Fetching from the annotation index... %s" % stamp())
    if args.thread == 1: #Single thread path
        # The single-thread path hands align() the glannot object ...
        for chrom in chr_list:
            align(chr=chrom, filename=outname, all_annot=None, glannot=glannot, whitelist=whitelist) #CB=args.CB

    else: # Multiprocessing path:
        # ... while the worker processes receive all_annot instead.
        partial_work = partial(align, filename=outname, all_annot=all_annot, glannot=None, whitelist=whitelist) # send a copy of the index, CB=args.CB
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)

    os.makedirs(os.path.join(tmpdir, 'o4'), exist_ok=True)
    # Needs a shell for the glob and the pipe.
    os.system('gunzip -c -f %s_scTEtmp/o3/%s.*.bed.gz | gzip > %s_scTEtmp/o4/%s.bed.gz' % (outname,outname,outname,outname))
    info("Done fetching... %s \n" % stamp())

    info("Calculating expression... %s" % stamp())
    # Note: Countexpression is given the full args.out prefix, not the basename.
    len_res, genenumber, filename = Countexpression(filename=args.out, allelement=allelement, genenumber=args.genenumber, cellnumber=args.cellnumber, hdf5=args.hdf5)
    if args.hdf5:
        info('Detect {0} cells expressed at least {1} genes, results output to {2}.h5ad'.format(len_res, genenumber, filename))
    else:
        info('Detect {0} cells expressed at least {1} genes, results output to {2}.csv'.format(len_res, genenumber, filename))

    info("Finished calculating expression %s" % stamp())

    if args.keeptmp != 'True':
        # Equivalent of `rm -rf`, without going through a shell.
        shutil.rmtree(tmpdir, ignore_errors=True)

    timeend = datetime.datetime.now()
    info("Done with %s\n" % timediff(timestart,timeend))
170 |
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt !\n")
        # Exit non-zero (128 + SIGINT) so calling scripts and schedulers do not
        # mistake an aborted run, whose outputs are incomplete, for a success.
        sys.exit(130)
177 |
178 |
179 |
--------------------------------------------------------------------------------
/bin/scTEATAC:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | desc = '''
3 |
4 | The scATAC-seq data comes as three files, P1, P2 and the barcode, and there is no UMI
5 |
6 | You can just align P1 and P2 with your favourite aligner (we prefer STAR with these settings):
7 |
8 | ****
9 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic --outWigType wiggle --outWigNorm RPM'
10 | opts='--runRNGseed 42 --runThreadN 12 --readFilesCommand zcat '
11 |
12 | genome_mm10='--genomeDir mm10_gencode_vM21_starsolo/SAindex'
13 | genome_hg38='--genomeDir hg38_gencode_v30_starsolo/SAindex'
14 |
15 | # p1 = read
16 | # p2 = barcode and UMI
17 | # Make sure you set the correct genome index;
18 | STAR $opts $teopts $genome_hg38 --outFileNamePrefix ss.${out} --readFilesIn ${p1} ${p2}
19 | ****
20 |
21 | This script will then reprocess the BAM file, and put the BARCODE into CR SAM tag and spoof a UMI
22 |
23 | The UMI is generated by incrementing the sequence, so, each UMI is up to 4^14 (26 million).
24 | I guess there remains a change of a clash, but it should be so rare
25 | as to be basically impossible.
26 |
27 | Keep in mind though that downstream UMI statistics are inaccurate
28 |
29 | Require pysam
30 |
31 | '''
32 | import sys, os , time
33 | import gzip
34 | import argparse
35 | import logging
try:
    import pysam
except ImportError:
    print('pack_scatacseq requires pysam')
    # The sys module has no quit(); the old call raised AttributeError instead
    # of ending the script. Exit with a failure status instead.
    sys.exit(1)
41 |
42 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
43 | # from scTE.scatacseq import build_barcode_dict, parse_bam, load_expected_whitelist
44 | from scTE.scatacseq import atacBam2bed,para_atacBam2bed
45 | from scTE.base import *
46 |
47 | # Command-line options;
def prepare_parser():
    """Build the command-line parser for scTEATAC.

    Besides creating the parser this configures the root logger (DEBUG level)
    and attaches the 'scTE_scatacseq' logger to the parser as ``parser.log``.

    The boolean-like options (-CB, -UMI, --ignoreDuplicates, --keeptmp,
    --hdf5) are parsed as the strings 'True'/'False'; the caller converts them.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # The example must only use options that are actually defined below.
    exmp = 'scTEATAC -i input.bam -o out -x mm10.te.idx'

    description = 'Package the BAM and BARCODE for the scATAC-seq data to make it suitable for scTE main pipeline'

    parser = argparse.ArgumentParser(prog='scTE_scatacseq', description=description, epilog=exmp)

    # Pop the default group so that the "required arguments" group is listed
    # first in --help; it is appended again at the end.
    optional = parser._action_groups.pop()

    optional.add_argument('--ondisk', action='store_true', required=False,
        help='Do everything on disk (slower, but no memory requirement) instead of in memory (faster, but you will need a lot!)')

    optional.add_argument('--min_counts', dest='countnumber', metavar='INT', type=int, default=1000,
        help='Minimum number of counts required for a cell to pass filtering. Default: 1000')

    optional.add_argument('-CB', dest='CB', type=str, nargs='?', default='False', choices=['True','False'],
        help='Set to false to ignore for cell barcodes, Default: False')

    optional.add_argument('-UMI', dest='UMI', type=str, nargs='?', default='False', choices=['True','False'],
        help='Set to false to ignore for UMI. Default: False')

    optional.add_argument('--ignoreDuplicates', dest='noDup', type=str, nargs='?', default='True', choices=['True','False'],
        help='If set, reads that have the same orientation and start position will be considered only once. If reads are paired, the mate’s position also has to coincide to ignore a read. Default: True')

    optional.add_argument('--keeptmp', dest='keeptmp', type=str, nargs='?', default='False', choices=['True','False'],
        help='Keep the _scTEtmp file, which is useful for debugging. Default: False')

    optional.add_argument('-p', '--thread', metavar='INT', dest='thread', type=int, default=1,
        help='Number of threads to use, Default: 1')

    optional.add_argument('--hdf5', dest='hdf5', type=str, nargs='?', default='False', choices=['True','False'],
        help='Save the output as .h5ad formatted file instead of csv file. Default: False')

    required = parser.add_argument_group('required arguments')

    required.add_argument('-i', '--input', dest='input', type=str, nargs='+', required=True,
        help='Input file: BAM/SAM file')

    required.add_argument('-o', '--out', dest='out', nargs='?', required=True, help='Output file prefix')

    required.add_argument('-x', dest='annoglb', nargs='+', required=True,
        help='The filename of the indexed genome')

    parser._action_groups.append(optional)

    logging.basicConfig(level=logging.DEBUG,
                        format='%(levelname)-8s: %(message)s',
                        datefmt='%m-%d %H:%M')

    parser.log = logging.getLogger('scTE_scatacseq')

    return parser
110 |
def main():
    """Run the scTEATAC pipeline from the command line.

    Steps, in order: load the annotation index (Readanno), convert the input
    BAM/SAM file(s) to BED (atacBam2bed / para_atacBam2bed), split the reads
    per chromosome (splitAllChrs, or splitChr + filterCRs when threaded), look
    the reads up in the annotation index (align) and write the count table
    (Countexpression).

    Intermediate files are kept under '<out>_scTEtmp' and removed at the end
    unless '--keeptmp True' was given.
    """
    # Local import: shutil is not part of this script's top-level imports.
    import shutil

    assert sys.version_info >= (3, 6), 'Python >=3.6 is required'

    def stamp():
        """Current wall-clock time, formatted for the progress messages."""
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    timestart = datetime.datetime.now()

    parser = prepare_parser()
    args = parser.parse_args()
    info = logging.info
    logger = parser.log

    # These options arrive as the strings 'True'/'False' (or None when the
    # flag is given without a value); anything but 'True' means False.
    args.CB = args.CB == 'True'
    args.hdf5 = args.hdf5 == 'True'
    args.noDup = args.noDup == 'True'
    args.UMI = args.UMI == 'True'

    args.genenumber = 0
    args.cellnumber = 1e4

    logger.info('Arguments:')
    logger.info('out: %s' % args.out)
    logger.info('index: %s \n' % args.annoglb[0])
    logger.info("Minimum number of counts required = %s" % args.countnumber)
    logger.info("Number of threads = %s " % args.thread)

    # Temporary files are named after the last path component of the prefix.
    outname = args.out.split('/')[-1:][0]
    tmpdir = '%s_scTEtmp' % outname

    info("Loading the genome annotation index... %s" % stamp())
    allelement, chr_list, all_annot, glannot = Readanno(filename=outname, annoglb=args.annoglb[0])
    chr_list = [k for k in chr_list if k not in ['chrM']]
    info("Finished loading the genome annotation index... %s \n" % stamp())

    info("Processing BAM/SAM files ...%s" % stamp())

    # A single comma-separated value is treated as a list of inputs.
    if len(args.input) == 1 and ',' in args.input[0]:
        args.input = args.input[0].split(',')

    os.makedirs('%s/o1' % tmpdir, exist_ok=True)

    if len(args.input) > 1:
        info('Using para_atacBam2bed as more than 1 input BAM')
        partial_work = partial(para_atacBam2bed, CB=args.CB, out=outname, noDup=args.noDup)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, args.input)

        # Merge the per-input BED files; left to the shell because of the glob.
        os.system('gunzip -c -f %s_scTEtmp/o0/*.bed.gz | gzip > %s_scTEtmp/o1/%s.bed.gz' % (outname, outname, outname))
    else:
        atacBam2bed(args.input[0], outname, CB=args.CB, UMI=args.UMI, noDup=args.noDup, num_threads=args.thread)
    info("Done BAM/SAM files processing ...%s \n" % stamp())

    info("Splitting ...%s" % stamp())
    if args.thread == 1:
        # Single thread path: useful for profiling, as the multiprocessing
        # path just shows up as lock acquisition in the profile.
        info('Executing single thread path')
        whitelist = splitAllChrs(chr_list, filename=outname, genenumber=args.genenumber, countnumber=args.countnumber, UMI=args.UMI)
    else:
        info('Executing multiple thread path with %s threads' % args.thread)
        partial_work = partial(splitChr, filename=outname, CB=args.CB, UMI=args.UMI)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)
        whitelist = filterCRs(filename=outname, genenumber=args.genenumber, countnumber=args.countnumber)

    info("Finished processing sample files %s \n" % stamp())

    info("Fetching from the annotation index... %s" % stamp())
    if args.thread == 1:
        for chrom in chr_list:
            align(chr=chrom, filename=outname, all_annot=None, glannot=glannot, whitelist=whitelist)
    else:
        # Each worker receives its own copy of the index (all_annot).
        partial_work = partial(align, filename=outname, all_annot=all_annot, glannot=None, whitelist=whitelist)
        with multiprocessing.Pool(processes=args.thread) as pool:
            pool.map(partial_work, chr_list)

    os.makedirs('%s/o4' % tmpdir, exist_ok=True)
    os.system('gunzip -c -f %s_scTEtmp/o3/%s.*.bed.gz | gzip > %s_scTEtmp/o4/%s.bed.gz' % (outname, outname, outname, outname))
    info("Done fetching... %s \n" % stamp())

    info("Calculating expression... %s" % stamp())
    len_res, genenumber, filename = Countexpression(filename=args.out, allelement=allelement, genenumber=args.genenumber, cellnumber=args.cellnumber, hdf5=args.hdf5)
    info('Detect {0} cells expressed at least {1} genes, results output to {2}.csv'.format(len_res, genenumber, filename))
    info("Finished calculating expression %s" % stamp())

    if args.keeptmp != 'True':
        shutil.rmtree(tmpdir, ignore_errors=True)

    timeend = datetime.datetime.now()
    info("Done with %s\n" % timediff(timestart, timeend))

    # NOTE: --ondisk is accepted, but nothing in this function creates an
    # on-disk database. The old cleanup called os.remove() on a name that was
    # never defined and crashed with NameError after all results were written.
213 |
# Script entry point: run main() and turn Ctrl-C into a short message on
# stderr followed by exit status 0 instead of a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt\n")
        sys.exit(0)
220 |
--------------------------------------------------------------------------------
/bin/scTEATAC_build:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import multiprocessing
4 | from functools import partial
5 | import logging
6 | import os, sys, glob, datetime, time, gzip
7 | import argparse
8 | import collections
9 | from math import log
10 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
11 | from scTE.miniglbase import genelist, glload, location
12 |
# Chromosome names without the 'chr' prefix: '1'..'49' followed by X, Y and M.
chr_list = [str(number) for number in range(1, 50)] + ['X', 'Y', 'M']
14 |
def read_opts(parser):
    """Parse the command line with *parser* and attach the logging helper.

    Returns the argparse namespace, extended with an ``info`` attribute that
    is bound to ``logging.info``.
    """
    opts = parser.parse_args()
    setattr(opts, 'info', logging.info)
    return opts
30 |
def genomeIndex(genome, outname):
    """Build the window index from a BED file and save it as '<outname>.idx'.

    Args:
        genome: path to a tab-separated file; columns 0-2 are read as the
            location (chr, left, right) and column 3 as the annotation. Names
            ending in '.gz' are loaded through gzip.
        outname: output prefix. When it is empty or None (-o is optional on
            the command line) the input file name, without its directory and
            without a trailing '.gz', is used instead of writing 'None.idx'.
    """
    form = {'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}

    if not outname:
        outname = os.path.basename(genome)
        if outname.endswith('.gz'):
            outname = outname[:-len('.gz')]

    if genome.endswith('.gz'):
        windows = genelist(genome, format=form, gzip=True)
    else:
        windows = genelist(genome, format=form)

    windows.save('%s.idx' % outname)
41 |
def prepare_parser():
    """Build the command-line parser for scTEATAC_build.

    Returns:
        argparse.ArgumentParser with the required -g/--genome option (BED file
        of genome windows) and the optional -o/--out output prefix.
    """
    desc = "Build genome annotation index for scTE"

    exmp = "Example: scTEATAC_build -g Data/TE.bed -o mm10.te"

    # prog used to be copy-pasted from scTE_build, which made the usage line
    # name the wrong script.
    parser = argparse.ArgumentParser(prog='scTEATAC_build', description=desc, epilog=exmp)

    # Pop the default group so that "required arguments" is listed first in
    # --help; it is appended again below.
    optional = parser._action_groups.pop()

    required = parser.add_argument_group('required arguments')

    required.add_argument('-g', '--genome', metavar='genome', dest='genome', type=str, nargs='?', required=True,
        help='Bed file of the genome window')

    optional.add_argument('-o', '--out', dest='out', nargs='?', help='Output file prefix, Default: the genome name')

    parser._action_groups.append(optional)

    return parser
64 |
def main():
    """Entry point: parse the command line and build the scTEATAC window index."""
    # Check the interpreter before doing anything else.
    assert sys.version_info >= (3, 6), 'Python >=3.6 is required'

    args = read_opts(prepare_parser())
    info = args.info

    info("Building the scTE genome annotation index... %s" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    genomeIndex(args.genome, args.out)

    info("Done genome annotation index building... %s" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
80 |
# Script entry point: run main() and turn Ctrl-C into a short message on
# stderr followed by exit status 0 instead of a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt !\n")
        sys.exit(0)
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/bin/scTE_build:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import multiprocessing
4 | from functools import partial
5 | import logging
6 | import os, sys, glob, datetime, time, gzip
7 | import argparse
8 | import collections
9 | from math import log
10 | import numpy as np
11 | sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '../'))
12 | from scTE.miniglbase import genelist, glload, location
13 |
# Chromosome names without the 'chr' prefix: '1'..'49' followed by X, Y and M.
chr_list = [str(number) for number in range(1, 50)] + ['X', 'Y', 'M']
15 |
def read_opts(parser):
    """Parse the command line, validate mode and genome, attach the logger.

    An unsupported mode or genome is logged as an error, the help text is
    printed and the process exits with status 1. On success the namespace is
    returned with an extra ``info`` attribute bound to ``logging.info``.
    """
    opts = parser.parse_args()

    # (attribute, accepted values, error message), checked in this order.
    checks = (
        ('mode', ('inclusive', 'exclusive', 'nointron'),
         "Counting mode %s not supported\n"),
        ('genome', ('mm10', 'hg38', 'panTro6', 'macFas5', 'dm6', 'danRer11', 'xenTro9', 'other'),
         "Counting genome %s not supported\n"),
    )
    for attribute, accepted, message in checks:
        value = getattr(opts, attribute)
        if value in accepted:
            continue
        logging.error(message % (value))
        parser.print_help()
        sys.exit(1)

    opts.info = logging.info
    return opts
31 |
def cleanexon(exons):
    """Collapse each gene's records into merged, non-touching blocks.

    Args:
        exons: dict mapping a gene name to a list of [chr, left, right]
            records.

    Returns:
        A list of {'loc': location(...), 'annot': gene_name} dicts. Genes are
        emitted in sorted name order; within a gene the blocks are grouped by
        chromosome (first-seen order) and ascending by position.

    As before, a record covers the positions left .. right-1 (it used to be
    expanded with range(left, right)); records that overlap or touch are
    merged, and the block that is emitted ends on the last covered position.

    Differences from the previous per-position implementation:
      * a gene covering zero or one position no longer fails on an undefined
        (or stale) loop index; empty records are skipped;
      * blocks are labelled with the chromosome of their own records, not with
        the chromosome of whichever record happened to be read last;
      * intervals are merged directly instead of materialising every base.
    """
    merged = []
    for name in sorted(exons):
        # Group the non-empty spans of this gene by chromosome.
        spans_by_chrom = {}
        for chrom, left, right in exons[name]:
            left, right = int(left), int(right)
            if left < right:
                spans_by_chrom.setdefault(chrom, []).append((left, right))

        for chrom, spans in spans_by_chrom.items():
            spans.sort()
            cur_left, cur_right = spans[0]
            for left, right in spans[1:]:
                if left <= cur_right:
                    # Overlapping or directly adjacent: extend the open block.
                    cur_right = max(cur_right, right)
                else:
                    merged.append({'loc': location(chr=chrom, left=cur_left, right=cur_right - 1), 'annot': name})
                    cur_left, cur_right = left, right
            merged.append({'loc': location(chr=chrom, left=cur_left, right=cur_right - 1), 'annot': name})

    return merged
50 |
def readGtf(filename):
    """Collect exon/UTR intervals per gene name from a GTF file.

    Args:
        filename: path of the GTF file; read through gzip when the name ends
            with '.gz'.

    Returns:
        (raw, clean): two dicts mapping gene_name -> list of
        [chr, left, right]. ``raw`` holds every gene; ``clean`` only holds
        records whose line mentions 'protein_coding' or 'lincRNA'.

    Only 'exon' and 'UTR' features are used. Records on chromosomes that are
    not in the module-level ``chr_list`` (compared without the 'chr' prefix)
    and records without a gene_name attribute are skipped.
    """
    raw = {}
    clean = {}

    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt') as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 9:
                # Blank or truncated line: nothing to parse.
                continue
            if fields[2] not in ('exon', 'UTR'):
                continue

            # Normalise names such as '1' to 'chr1'. The old code computed
            # the prefixed name and then overwrote it with the raw column.
            chrom = fields[0]
            if 'chr' not in chrom:
                chrom = 'chr' + chrom
            if chrom.replace('chr', '') not in chr_list:
                continue

            left = int(fields[3])
            right = int(fields[4])

            if 'gene_name' not in fields[8]:
                continue
            name = fields[8].split('gene_name "')[1].split('";')[0]

            raw.setdefault(name, []).append([chrom, left, right])

            if 'protein_coding' in line or 'lincRNA' in line:
                clean.setdefault(name, []).append([chrom, left, right])

    return raw, clean
91 |
92 |
# Repeat classes kept when the TE annotation is a downloaded rmsk table.
_TE_CLASSES = ('DNA', 'LINE', 'LTR', 'SINE', 'Satellite', 'Retroposon')
# Width of the coordinate buckets used for the gene/TE overlap lookup.
_BUCKET_SIZE = 10000


def _open_annotation(path):
    """Open a plain or gzip-compressed ('.gz' suffix) annotation file as text."""
    if path.endswith('.gz'):
        return gzip.open(path, 'rt', encoding='ascii')
    return open(path, 'r')


def _buckets(left, right):
    """Return the bucket start coordinates touched by the span left..right."""
    first = ((left - 1) // _BUCKET_SIZE) * _BUCKET_SIZE
    last = (right // _BUCKET_SIZE) * _BUCKET_SIZE
    return range(first, last + _BUCKET_SIZE, _BUCKET_SIZE)


def _bucket_gene_blocks(blocks):
    """Index gene blocks as gene[chr][bucket] -> list of [left, right]."""
    gene = {}
    for block in blocks:
        chrom = block['loc'].loc['chr']
        if chrom not in chr_list:
            continue
        left = block['loc']['left']
        right = block['loc']['right']
        per_chrom = gene.setdefault(chrom, {})
        for buck in _buckets(left, right):
            per_chrom.setdefault(buck, []).append([left, right])
    return gene


def _tes_outside_genes(tefilename, is_rmsk, gene):
    """Return the TE records that overlap none of the bucketed gene blocks.

    is_rmsk selects the column layout: True for the downloaded rmsk table
    (chr=5, left=6, right=7, name=10, class=11, filtered by _TE_CLASSES),
    False for a user BED file (chr=0, left=1, right=2, name=3).
    """
    allowed = set(chr_list)
    noverlap = []
    with _open_annotation(tefilename) as handle:
        for line in handle:
            t = line.strip().split('\t')

            if is_rmsk:
                chrom = t[5].replace('chr', '')
                left = int(t[6])
                right = int(t[7])
                name = t[10]
                if t[11] not in _TE_CLASSES:
                    continue
            else:
                chrom = t[0].replace('chr', '')
                left = int(t[1])
                right = int(t[2])
                name = t[3]

            if chrom not in allowed:
                continue
            if chrom not in gene:  # Should be very rare
                noverlap.append({'loc': location(chr=chrom, left=left, right=right), 'annot': name})
                continue

            buckets = _buckets(left, right)
            if not buckets:
                continue
            per_chrom = gene[chrom]
            overlaps = any(
                left < block[1] and right > block[0]
                for buck in buckets
                for block in per_chrom.get(buck, ())
            )
            if not overlaps:
                noverlap.append({'loc': location(chr=chrom, left=left, right=right), 'annot': name})
    return noverlap


def _save_index(all_annot, mode, genome, outname):
    """Save the index as '<prefix>.<mode>.idx'; prefix is outname or genome."""
    prefix = outname if outname else genome
    all_annot.save('%s.%s.idx' % (prefix, mode))
    print('Done the index building, results output to %s.%s.idx \n' % (prefix, mode))


def _remove_download(path):
    """Delete a downloaded annotation file, warning when that is not possible."""
    try:
        os.remove(path)
    except OSError as err:
        logging.warning('Could not remove %s: %s', path, err)


def genomeIndex(genome, mode, tefile, genefile, outname, geneurls, teurls):
    """Build and save the scTE annotation index (genes plus TEs).

    Args:
        genome: genome label; used as the output prefix when outname is empty.
        mode: 'exclusive', 'inclusive' or 'nointron'.
            exclusive - drop TEs overlapping the merged exon/UTR blocks of the
                'clean' gene set returned by readGtf.
            inclusive - keep all TEs (downloaded tables are filtered by repeat
                class and chromosome).
            nointron  - drop TEs overlapping the full span of any gene.
        tefile: user TE BED file (chr, left, right, name), or None to download
            teurls.
        genefile: user GTF file, or None to download geneurls.
        outname: output prefix, or None to use genome.
        geneurls, teurls: download locations used when no file is given.

    Side effects: may extend the module-level chr_list with chromosome names
    found in tefile, downloads with wget when needed, writes
    '<prefix>.<mode>.idx' and removes the files it downloaded.
    """
    if not genefile:  # Download twice for double check, as sometimes wget may stop on the way
        os.system('wget -c -t 0 -T 5 %s' % geneurls)
        os.system('wget -c -t 0 -T 5 %s' % geneurls)
        genefilename = geneurls.split('/')[-1]
    else:
        genefilename = genefile

    raw_exons, clean_exons = readGtf(genefilename)
    raw = cleanexon(raw_exons)
    clean = cleanexon(clean_exons)

    if tefile:
        # Custom chromosomes: accept every name that occurs in the TE file.
        # NOTE(review): names are added with their 'chr' prefix while the TE
        # filter compares without it; kept as in the original, confirm intent.
        known = set(chr_list)
        with _open_annotation(tefile) as handle:
            for line in handle:
                chrom = line.strip().split('\t')[0]
                if chrom not in known:
                    known.add(chrom)
                    chr_list.append(chrom)
        tefilename = tefile
    else:
        os.system('wget -c -t 0 -T 5 %s' % teurls)
        os.system('wget -c -t 0 -T 5 %s' % teurls)
        tefilename = teurls.split('/')[-1]

    if mode in ('exclusive', 'nointron'):
        if mode == 'exclusive':
            blocks = clean
        else:
            # One span per gene, from its smallest to its largest coordinate.
            gene_spans = {}
            for name, records in raw_exons.items():
                if len(records) == 1:  # the gene only has one record
                    gene_spans[name] = [records[0]]
                else:
                    coords = [int(c) for record in records for c in record[1:]]
                    gene_spans[name] = [[records[0][0], min(coords), max(coords)]]
            blocks = cleanexon(gene_spans)

        gene = _bucket_gene_blocks(blocks)
        noverlap = _tes_outside_genes(tefilename, not tefile, gene)

        TEs = genelist()
        TEs.load_list(noverlap)

        genes = genelist()
        genes.load_list(raw)

        _save_index(genes + TEs, mode, genome, outname)

    elif mode == 'inclusive':
        genes = genelist()
        genes.load_list(raw)

        if not tefile:
            teform = {'force_tsv': True, 'loc': 'location(chr=column[5], left=column[6], right=column[7])', 'annot': 10, 'clas': 11}
            if tefilename.endswith('.gz'):
                TEs = genelist(tefilename, format=teform, gzip=True)
            else:
                TEs = genelist(tefilename, format=teform)

            keep = []
            for item in TEs:
                if item['clas'] not in _TE_CLASSES:
                    continue
                if item['loc']['chr'] not in chr_list:
                    continue
                kept = item.copy()
                del kept['clas']
                keep.append(kept)
            gls = genelist()
            gls.load_list(keep)
        else:
            teform = {'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
            if tefilename.endswith('.gz'):
                TEs = genelist(tefilename, format=teform, gzip=True)
            else:
                TEs = genelist(tefilename, format=teform)
            gls = TEs.deepcopy()

        _save_index(genes + gls, mode, genome, outname)

    # Only files downloaded by this function are removed.
    if not tefile:
        _remove_download(tefilename)
    if not genefile:
        _remove_download(genefilename)
362 |
def prepare_parser():
    """Build the command-line parser for scTE_build.

    Returns:
        argparse.ArgumentParser with the options -te, -gene, -m/--mode,
        -o/--out and -g/--genome. -te and -gene use nargs='+', so their values
        are lists.
    """
    desc = "Build genome annotation index for scTE"

    exmp = "Example: scTE_build -te Data/TE.bed -gene Data/Gene.gtf"

    parser = argparse.ArgumentParser(prog='scTE_build', description=desc, epilog=exmp)

    # All options are optional, so they go straight into the default group.
    optional = parser._action_groups.pop()

    optional.add_argument('-te', dest='tefile', nargs='+',
        help='Six columns bed file for transposable elements annotation. Need the -gene option.')

    optional.add_argument('-gene', dest='genefile', nargs='+',
        help='Gtf file for genes annotation. Need the -te option.')

    optional.add_argument('-m', '--mode', dest='mode', type=str, nargs='?', default='exclusive', choices=['inclusive','exclusive','nointron'],
        help='How to count TEs expression: inclusive (include all reads that can map to TEs), '
             'or exclusive (exclude the reads that can map to the exon of protein coding genes and lncRNAs), '
             'or nointron (exclude the reads that can map to the exons and intron of genes). '
             'DEFAULT: exclusive')

    optional.add_argument('-o', '--out', dest='out', nargs='?', help='Output file prefix, Default: the genome name')

    optional.add_argument('-g', '--genome', dest='genome', type=str, nargs='?', default='other', choices=['other','mm10','hg38','panTro6','macFas5','dm6','danRer11','xenTro9'],
        help='Possible Genomes: mm10 (mouse), hg38 (human), panTro6 (Chimpanzee), macFas5 (Macaca fascicularis), dm6 (Drosophila melanogaster), danRer11 (Zebrafish), xenTro9 (Xenopus tropicalis)')

    parser._action_groups.append(optional)

    return parser
398 |
def main():
    """Entry point: parse the command line and build the scTE annotation index.

    For the preset genomes, any annotation file not supplied with -te / -gene
    is downloaded from the URLs listed below. The default genome 'other' has
    no download locations, so both files must be given.
    """
    assert sys.version_info >= (3, 6), 'Python >=3.6 is required'

    # genome -> (gene annotation GTF URL, repeat annotation table URL)
    sources = {
        'mm10': (
            'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M21/gencode.vM21.annotation.gtf.gz',
            'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz'),
        'hg38': (
            'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz',
            'http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/rmsk.txt.gz'),
        'panTro6': (
            'http://ftp.ensembl.org/pub/release-103/gtf/pan_troglodytes/Pan_troglodytes.Pan_tro_3.0.103.gtf.gz',
            'https://hgdownload.soe.ucsc.edu/goldenPath/panTro6/database/rmsk.txt.gz'),
        'macFas5': (
            'http://ftp.ensembl.org/pub/release-102/gtf/macaca_fascicularis/Macaca_fascicularis.Macaca_fascicularis_5.0.102.gtf.gz',
            'http://hgdownload.soe.ucsc.edu/goldenPath/macFas5/database/rmsk.txt.gz'),
        'dm6': (
            'http://ftp.ensembl.org/pub/release-103/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.32.103.gtf.gz',
            'http://hgdownload.soe.ucsc.edu/goldenPath/dm6/database/rmsk.txt.gz'),
        'danRer11': (
            'http://ftp.ensembl.org/pub/release-103/gtf/danio_rerio/Danio_rerio.GRCz11.103.gtf.gz',
            'https://hgdownload.soe.ucsc.edu/goldenPath/danRer11/database/rmsk.txt.gz'),
        'xenTro9': (
            'http://ftp.ensembl.org/pub/release-103/gtf/xenopus_tropicalis/Xenopus_tropicalis.Xenopus_tropicalis_v9.1.103.gtf.gz',
            'https://hgdownload.soe.ucsc.edu/goldenPath/xenTro9/database/rmsk.txt.gz'),
        'other': ('No path', 'No path'),
    }

    parser = prepare_parser()
    args = read_opts(parser)

    print(args)

    info = args.info

    info("Building the scTE genome annotation index... %s" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    # -te / -gene are parsed with nargs='+'; only the first value is used.
    tefile = args.tefile[0] if args.tefile else None
    genefile = args.genefile[0] if args.genefile else None

    if args.genome == 'other' and not (tefile and genefile):
        # Without this check the build would try to download 'No path'.
        parser.error('genome "other" (the default) has no downloadable annotation: '
                     'both -te and -gene are required')

    if args.genome in sources:
        geneurl, teurl = sources[args.genome]
        genomeIndex(args.genome, args.mode, tefile, genefile, args.out, geneurl, teurl)

    info("Done genome annotation index building... %s" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
465 |
# Script entry point: run main() and turn Ctrl-C into a short message on
# stderr followed by exit status 0 instead of a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt !\n")
        sys.exit(0)
472 |
473 |
474 |
475 |
--------------------------------------------------------------------------------
/docs/scTE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/docs/scTE.png
--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/scte/do_batch.sh:
--------------------------------------------------------------------------------
1 |
2 |
# Submit one scTE job (scte.sh) for every STARsolo BAM that does not yet have
# a gzipped count table in the current directory.
for f in ../starsolo*/*.bam
do
    # When the glob matches nothing the loop sees the literal pattern; skip it.
    [ -e "$f" ] || continue

    root=$(basename "$f")

    # Sample prefix: the file name without the STAR suffix (or a bare .bam).
    bf=$(echo "$root" | sed -r 's#\.Aligned\.sortedByCoord\.out\.bam##g' | sed 's#\.bam##g')
    tt="$bf.csv.gz" # outfile
    if [ ! -f "$tt" ] # Check not already done
    then
        echo scTE "$tt"
        qsub -N "scte.$bf" -v in="$f",out="$bf" scte.sh
        sleep 1
    fi
done
17 |
18 |
--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/scte/scte.sh:
--------------------------------------------------------------------------------
#PBS -l nodes=1:ppn=2,mem=64gb
#PBS -j oe
#PBS -o ${out}.out
#PBS -q batch
#PBS -V
# Run scTE on one BAM file. Expects two variables passed with `qsub -v`:
#   in  - path of the input BAM
#   out - output prefix
cd "$PBS_O_WORKDIR" || exit 1

genome_mm10='/data3/lab-andrew/scTE/scte_indeces/mm10.exclusive.idx'
genome_hg38='/data3/lab-andrew/scTE/scte_indeces/hg38.exclusive.idx' # for human samples: use this for -x and change -g to match

# Compress only after a successful run: the batch submitter treats an existing
# <out>.csv.gz as "already done".
python3 /share/apps/genomics/unstable/scTE/bin/scTE -i "${in}" -x "$genome_mm10" -g mm10 -p 1 -o "${out}" \
    && gzip "${out}.csv"
14 |
--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/starsolo/do_batch.sh:
--------------------------------------------------------------------------------
1 |
2 |
# Submit one STARsolo job (starsolo.sh) for every read-1 FASTQ whose aligned
# BAM does not exist yet in the current directory.
for f in ../fqs/*.p1.fq.gz
do
    # When the glob matches nothing the loop sees the literal pattern; skip it.
    [ -e "$f" ] || continue

    root=$(basename "$f")

    bf=$(echo "$root" | sed -r 's#\.p1\.fq\.gz##g')
    p2=$(echo "$f" | sed 's#\.p1\.fq\.gz#.p2.fq.gz#g')
    tt="ss.$bf.Aligned.sortedByCoord.out.bam" # outfile
    if [ ! -f "$tt" ] # Check not already done
    then
        echo STARsolo "$tt"
        # The trailing dot in out= is deliberate: starsolo.sh uses ss.${out}
        # as the STAR output prefix, giving ss.<sample>.Aligned...bam
        qsub -N "solo.$bf" -v p1="$f",p2="$p2",out="$bf." starsolo.sh
        sleep 2
    fi
done
18 |
19 |
--------------------------------------------------------------------------------
/example/Figure3/0.cluster_scripts/starsolo/starsolo.sh:
--------------------------------------------------------------------------------
#PBS -N ss.${out}.starsolo
#PBS -l nodes=1:ppn=32
#PBS -l mem=32gb
#PBS -j oe
#PBS -o ss.${out}.out
#PBS -q batch
#PBS -V
# Align one sample with STARsolo. Expects three variables passed with `qsub -v`:
#   p1  - FASTQ with the reads
#   p2  - FASTQ with the barcode and UMI
#   out - sample name, used in the output prefix ss.${out}
cd "$PBS_O_WORKDIR" || exit 1

ulimit -n 2000

whitelist='--soloCBwhitelist /data3/lab-andrew/scTE/scrnaseq_barcodes/version1.txt' # Make sure you get the right barcode version

# Required arguments;
mods='--soloType Droplet --soloFeatures Gene --soloBarcodeReadLength 1 --soloCBlen 14 --soloUMIstart 15 '
teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic'
opts='--runRNGseed 42 --runThreadN 32 --readFilesCommand zcat '

# required for scTE:
sam_att='--outSAMattributes NH HI AS nM CR CY UR UY'

genome_mm10='--genomeDir /data3/lab-andrew/scTE/custom_indeces/mm10_gencode_vM21_starsolo/SAindex'
genome_hg38='--genomeDir /data3/lab-andrew/scTE/custom_indeces/hg38_gencode_v30_starsolo/SAindex'

# p1 = read
# p2 = barcode and UMI
# Make sure you set the correct genome index;
# The option variables are left unquoted on purpose so that each one expands
# into several arguments; only the file names and the prefix are quoted.
STAR $opts $teopts $mods $whitelist $sam_att $genome_mm10 --outFileNamePrefix "ss.${out}" --readFilesIn "${p1}" "${p2}"

rm -r "ss.${out}_STARgenome"
rm -r "ss.${out}_STARpass1"
rm -r "ss.${out}_STARtmp"
33 |
--------------------------------------------------------------------------------
/example/Figure3/1.pack.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Pack the scRNA-seq data using scanpy, prep for scran normalisation
4 |
5 | """
6 |
7 | import logging, matplotlib, os, sys
8 | import scanpy as sc
9 | import numpy as np
10 | import scipy as sp
11 | import pandas as pd
12 | import matplotlib.pyplot as plt
13 | from anndata import AnnData
14 | from matplotlib import rcParams
15 | from matplotlib import colors
16 | import seaborn as sb
17 | from rpy2.robjects.packages import importr
18 | plt.rcParams['figure.figsize'] = (8,8)
19 | sc.settings.verbosity = 3
20 | sc.set_figure_params(dpi=200, dpi_save=200)
21 | matplotlib.rcParams['pdf.fonttype'] = 42
22 | matplotlib.rcParams['font.size'] = 10
23 | sc.settings.autoshow = False
24 |
def sparsify(filename):
    """Load a count table from CSV and return it as a sparse AnnData.

    The CSV is read with its first column as the row index and its first
    row as the header; rows become observations (cells) and columns become
    variables (genes).

    Parameters
    ----------
    filename : str
        Path handed to ``pandas.read_csv``.

    Returns
    -------
    AnnData
        ``X`` is a float32 CSR matrix, ``obs_names`` are the CSV row labels
        and ``var_names`` are the CSV column labels.
    """
    table = pd.read_csv(filename, index_col=0, header=0)
    genes = table.columns
    cells = table.index

    # astype() returns a new matrix rather than converting in place, so the
    # result has to be kept for the float32 cast to take effect.
    data = sp.sparse.csr_matrix(table.to_numpy()).astype('float32')
    del table  # the dense table is no longer needed

    print('Loaded {0}'.format(filename))
    return AnnData(data, obs={'obs_names': cells}, var={'var_names': genes})
43 |
# One entry per library, in the order the libraries are concatenated:
# (stage label, SamNN number used in the file name, replicate label).
# Sam18 (E6.5-3), Sam7 (E6.75-1) and Sam15 (E7.0-3) were commented out of the
# original sample list and stay excluded here.
SAMPLE_TABLE = [
    ("E6.5", 1, "E6.5-1"),
    ("E6.5", 5, "E6.5-2"),
    ("E7.0", 10, "E7.0-1"),
    ("E7.0", 30, "E7.0-4"),
    ("E7.0", 31, "E7.0-5"),
    ("E7.0", 32, "E7.0-6"),
    ("E7.25", 23, "E7.25-2"),
    ("E7.25", 26, "E7.25-3"),
    ("E7.25", 27, "E7.25-4"),
    ("E7.5", 2, "E7.5-1"),
    ("E7.5", 3, "E7.5-2"),
    ("E7.5", 4, "E7.5-3"),
    ("E7.5", 6, "E7.5-4"),
    ("E7.5", 19, "E7.5-5"),
    ("E7.5", 20, "E7.5-6"),
    ("E7.75", 8, "E7.75-1"),
    ("E7.75", 9, "E7.75-2"),
    ("E7.75", 12, "E7.75-3"),
    ("E7.75", 13, "E7.75-4"),
    ("E8.0", 16, "E8.0-1"),
    ("E8.0", 33, "E8.0-2"),
    ("E8.0", 34, "E8.0-3"),
    ("E8.0", 35, "E8.0-4"),
    ("E8.25", 24, "E8.25-1"),
    ("E8.25", 25, "E8.25-2"),
    ("E8.25", 28, "E8.25-3"),
    ("E8.5", 17, "E8.5-1"),
    ("E8.5", 29, "E8.5-2"),
    ("E8.5", 36, "E8.5-3"),
    ("E8.5", 37, "E8.5-4"),
    ("mixed", 21, "mixed-1"),
    ("mixed", 22, "mixed-2"),
]


def _save_hist(values, path, **hist_kwargs):
    """Draw one histogram of *values* on its own figure and save it to *path*.

    A fresh figure is opened for every plot; without it each distplot call
    draws onto the current axes and every saved PDF would also contain the
    histograms drawn before it.
    """
    fig = plt.figure()
    ax = sb.distplot(values, kde=False, **hist_kwargs)
    ax.get_figure().savefig(path)
    plt.close(fig)


samples = []
for stage, sam_no, replicate in SAMPLE_TABLE:
    sam = sparsify("../scte_data/ss.gastrulation_{0}_Sam{1}.csv.gz".format(stage, sam_no))
    sam.obs['stage'] = stage
    sam.obs['replicate'] = replicate
    samples.append(sam)

print('Loaded Samples...')

# Do very simple prefiltering:
# Quick pre-filtering, these should be low, otherwise it can mess up downstream analysis, but also can get rid of trivial uninteresting things
for sam in samples:
    sc.pp.filter_cells(sam, min_genes=2000)
    sc.pp.filter_cells(sam, max_counts=100000)
    sc.pp.filter_cells(sam, min_counts=5000)
# Do not filter genes here; concatenate keeps only the genes shared by all
# samples (the intersection), so a gene that fails in a single sample would
# also be deleted from all other samples;

print('Concatenating')
adata = samples[0].concatenate(samples[1:])

# Drop every remaining reference to the per-sample objects so their memory
# can be reclaimed.
del samples, sam

adata.X = adata.X.astype('float32')

print(adata)

sc.pl.violin(adata, ['n_genes', 'n_counts'], groupby='replicate', size=0, log=False, cut=0, show=False, save='qc1-pre-norm-replicates.pdf')

# Base filtering for trivial QC failures:
sc.pp.filter_cells(adata, min_genes=3000)
sc.pp.filter_cells(adata, min_counts=8000)
sc.pp.filter_cells(adata, max_counts=100000)
sc.pp.filter_genes(adata, min_cells=50) # Only filter genes here;

print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

#sc.pl.violin(adata, ['n_genes','n_counts'], groupby='stage', size=0, log=False, cut=0, show=False, save='qc1.pdf')
sc.pl.violin(adata, ['n_genes','n_counts'], groupby='replicate', size=0, log=False, cut=0, show=False, save='qc1-replicates.pdf')

# Thresholding decision: counts
# NOTE(review): the "<4000" and "<2000" views are taken after the
# min_counts=8000 / min_genes=3000 filters above, so they look like they will
# be empty - confirm whether they should be drawn before filtering.
n_counts = adata.obs['n_counts']
_save_hist(n_counts, 'figures/distplot_ncounts1.pdf')
_save_hist(n_counts[n_counts < 4000], 'figures/distplot_ncounts2.pdf', bins=60)
_save_hist(n_counts[n_counts > 10000], 'figures/distplot_ncounts3.pdf', bins=60)
#Thresholding decision: genes
n_genes = adata.obs['n_genes']
_save_hist(n_genes, 'figures/distplot_ngenes1.pdf', bins=60)
_save_hist(n_genes[n_genes < 2000], 'figures/distplot_ngenes2.pdf', bins=60)

print('Total number of cells: {:d}'.format(adata.n_obs))
print('Total number of genes: {:d}'.format(adata.n_vars))

adata.write('./raw_data.h5ad')
136 |
--------------------------------------------------------------------------------
/example/Figure3/2.norm_and_learn.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import anndata
3 | import scanpy as sc
4 | import numpy as np
5 | import scipy as sp
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | from matplotlib import rcParams
9 | from matplotlib import colors
10 | import seaborn as sb
# Figure defaults for everything scanpy/matplotlib draws below.
plt.rcParams['figure.figsize'] = (8, 8)
sc.settings.verbosity = 3
sc.set_figure_params(dpi=200, dpi_save=300)

# (resolution, obs column) for every Leiden clustering that is computed.
LEIDEN_RUNS = (
    (1.0, 'leiden_r1'),
    (0.5, 'leiden_r0.5'),
    (0.4, 'leiden_r0.4'),
    (0.35, 'leiden_r0.35'),
    (0.3, 'leiden_r0.3'),
    (0.25, 'leiden_r0.25'),
    (0.2, 'leiden_r0.2'),
    (0.1, 'leiden_r0.1'),
)

# Load the packed counts, then library-size normalise and log-transform.
adata = sc.read('raw_data.h5ad')
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
print(adata)

print('Number of cells: {:d}'.format(adata.n_obs))

sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=2000)
sc.pl.highly_variable_genes(adata, show=False, save='highly_variable.pdf')

# Embeddings: PCA on the highly variable genes (PC=20 from Nature paper),
# then the neighbour graph and the t-SNE / UMAP / diffusion-map layouts.
sc.pp.pca(adata, n_comps=20, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.tsne(adata, n_jobs=3)
sc.tl.umap(adata, min_dist=0.6)
sc.tl.diffmap(adata)

sc.pl.pca_variance_ratio(adata, log=True, show=False, save='pca_variance.pdf')

# Cluster at each resolution, storing the labels under their own obs column.
for resolution, obs_key in LEIDEN_RUNS:
    sc.tl.leiden(adata, resolution=resolution, key_added=obs_key)

adata.write('./learned.h5ad')

# Colour the embeddings by every clustering and by replicate, to see how the
# clusters relate to the technical covariate.
todraw = [obs_key for _, obs_key in LEIDEN_RUNS] + ['replicate']

sc.pl.tsne(adata, color=todraw, size=10, legend_loc='on data', show=False, save='tsne.pdf')
sc.pl.umap(adata, color=todraw, size=10, legend_loc='on data', show=False, save='umap.pdf')
51 |
52 |
--------------------------------------------------------------------------------
/example/Figure3/3.diffexp.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import matplotlib.pyplot as plt
4 | from matplotlib import rcParams
5 | from matplotlib import colors
6 | import pandas as pd
7 | from glbase3 import genelist
plt.rcParams['figure.figsize']=(8,8)
sc.settings.verbosity = 3
sc.set_figure_params(dpi=200, dpi_save=200)
matplotlib.rcParams['pdf.fonttype']=42
matplotlib.rcParams['font.size']=10

sc.settings.figdir = 'diffexp'

adata = sc.read('./learned.h5ad')

# Rank genes per leiden_r0.5 cluster and keep a copy of the result on disk.
sc.tl.rank_genes_groups(adata, 'leiden_r0.5', method='wilcoxon', n_genes=3000)
adata.write('./de.h5ad')

adata = sc.read('./de.h5ad')

sc.pl.rank_genes_groups(adata, n_genes=25, sharey=True, show=False, save='genes-top25.pdf')
sc.pl.rank_genes_groups(adata, key='rank_genes_groups', show=False, save='genes.pdf')
sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups', show=False, save='genes-top25.pdf')

# One column per cluster, one row per rank, for the names and their statistics.
ranked = adata.uns['rank_genes_groups']
topall = pd.DataFrame(ranked['names']) # get all;
fcs = pd.DataFrame(ranked['logfoldchanges'])
padj = pd.DataFrame(ranked['pvals_adj'])

print(topall)
print()

# The file name is historical; it holds every ranked gene, not just 100.
topall.to_csv('top100.csv')

# Go through and trim the TEs:

TEs = set(genelist(filename='../../TE_genes_id.mm10.txt', format={'name': 0, 'force_tsv': True})['name'])

newcols = {}

groups = list(topall.columns.values)

for group in groups:
    print('Group: {0}'.format(group))

    kept = []
    for name, fold_change, p_adjusted in zip(topall[group], fcs[group], padj[group]):
        if abs(fold_change) < 1: # fold change
            continue
        if p_adjusted > 0.01: # just in case
            continue
        if name in TEs:
            kept.append(name)
    newcols[group] = kept

# join all and draw a dotplot:
# dict.fromkeys removes duplicates while preserving first-seen order.
joined = list(dict.fromkeys(name for group in groups for name in newcols[group]))

print(joined)

if not joined:
    # Nothing passed the thresholds, so there is nothing to plot.
    print('No differentially expressed TEs found; skipping plots')
else:
    sc.pl.dotplot(adata, joined, groupby='leiden_r0.5', dot_max=0.7, dendrogram=True, standard_scale='var', show=False, save='de-tes.pdf')
    sc.pl.matrixplot(adata, joined, groupby='leiden_r0.5', dendrogram=True, standard_scale='var', show=False, save='de-tes.pdf')

    for k in joined:
        sc.pl.tsne(adata, color=[k,k], size=15, legend_loc='on data', vmax=2, show=False, save='markers-{0}.pdf'.format(k))
        sc.pl.umap(adata, color=[k,k], size=15, legend_loc='on data', vmax=2, show=False, save='markers-{0}.pdf'.format(k))
84 |
--------------------------------------------------------------------------------
/example/Figure3/4.plots-allgenes.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import matplotlib.pyplot as plt
4 | from matplotlib import rcParams
5 | from matplotlib import colors
6 |
7 | from glbase3 import *
8 |
plt.rcParams['figure.figsize']=(8,8)
sc.settings.verbosity = 3
sc.set_figure_params(dpi=200, dpi_save=200)
matplotlib.rcParams['pdf.fonttype']=42
matplotlib.rcParams['font.size']=10

sc.settings.figdir = 'genes'

adata = sc.read('./learned.h5ad')
print(adata)
all_genes = adata.var['n_cells'].index # gene names are stored in the index

TEs = genelist(filename='../../TE_genes_id.mm10.txt', format={'name': 0, 'force_tsv': True})['name']

print(TEs)

# Membership is tested once per gene below; a set keeps each test constant
# time instead of scanning the whole TE name list.
te_names = set(TEs)

# Draw one UMAP per feature that is neither a listed TE nor has '(' in its name.
for g in all_genes:
    if g not in te_names and '(' not in g:
        print(g)
        sc.pl.umap(adata, color=[g], size=6, legend_loc='on data', color_map='plasma', show=False, save='-{0}.pdf'.format(g), vmin=0, vmax=3)
29 |
30 |
31 |
--------------------------------------------------------------------------------
/example/Figure3/4.plots-alltes.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import matplotlib.pyplot as plt
4 | from matplotlib import rcParams
5 | from matplotlib import colors
6 |
7 | from glbase3 import *
8 |
# Plot defaults.
plt.rcParams['figure.figsize'] = (8, 8)
sc.settings.verbosity = 3
sc.set_figure_params(dpi=200, dpi_save=200)
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['font.size'] = 10

sc.settings.figdir = 'tes'

adata = sc.read('./learned.h5ad')
print(adata)
# Feature names live in the index of adata.var.
all_genes = adata.var['n_cells'].index

TEs = genelist(filename='TE_genes_id.mm10.txt.gz', format={'name': 0, 'force_tsv': True}, gzip=True)

#merker_tes = ['ID2', 'MER5C1', 'MER34B-int', 'MER63D', 'MT2A']
#sc.pl.stacked_violin(adata, var_names=merker_tes, groupby='leiden_r0.2', rotation=90, show=False, save='tes.pdf')

# One two-panel UMAP per listed TE that is present in the dataset.
for entry in TEs:
    te_name = entry['name']
    print(te_name)
    if te_name not in all_genes:
        continue
    sc.pl.umap(
        adata,
        color=[te_name, te_name],
        size=10,
        legend_loc='on data',
        show=False,
        save='TE-{0}.pdf'.format(te_name),
        vmin=0,
        vmax=3,
    )
30 |
31 |
32 |
--------------------------------------------------------------------------------
/example/Figure3/4.plots-specific-tes.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import matplotlib.pyplot as plt
4 | from matplotlib import rcParams
5 | from matplotlib import colors
6 |
7 | from glbase3 import *
8 |
# Plot defaults (small font for the dense dot plots).
plt.rcParams['figure.figsize'] = (8, 8)
sc.settings.verbosity = 3
sc.set_figure_params(dpi=200, dpi_save=200)
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['font.size'] = 6

sc.settings.figdir = 'specific-tes'

adata = sc.read('./learned.h5ad')

# Options shared by both dot plots; only dot_max and the file name differ.
dotplot_common = dict(
    groupby='leiden_r0.5',
    dendrogram=True,
    standard_scale='var',
    vmax=1,
    show=False,
)

# high, few: Expressed rarely, but very high in the cells that they are expressed in
marker_genes_dictB = {
    #"Epiblast": ["MTEb-int"],
    "Primitive streak": ["RLTR1D2_MM"],
    #"Endothelium": ["ERVB7_2B-LTR_MM"],
    #"Ectoderms": ["MamRep137"],
    #"Endoderms": ["MLT1I"],
    "Mesoendoderm": ["RLTR48A", "IAPEY4_LTR", "ORR1F-int"],
    "Extraembryonic": ["LTR16A"],
    "Exe. endoderm": ["MER5C", "RLTR6B_Mm"],
    #"Exe. ectoderm": ["ERVB4_2-LTR_MM"],
    "Cardiomyocyte": ["L1ME3D", "RLTR13A2", "ERVB2_1A-I_MM-int", "RLTR16"],
}
sc.pl.dotplot(adata, marker_genes_dictB, dot_max=0.3, save='markersB.pdf', **dotplot_common)

# Super-specific
marker_genes_dictC = {
    #"Primitive streak": [],
    "Mesoendoderm": ["ERVB4_1C-LTR_Mm", "ETnERV3-int"],
    #"others": ["MuRRS4-int"],
    "Exe. endoderm": ["MER46C", "MuRRS4-int", "RLTR20B3", "RLTR1B-int", "LTRIS2"],
    "Exe. ectoderm": ["RLTR45", "RLTR45-int", "IAPLTR1_Mm"],
    #"Cardiomyocyte": ["ETnERV3-int", "L1ME3D", "RLTR13A2", "ERVB2_1A-I_MM-int"],
    "Erythroid": ["RLTR10F", "L1_Mur1"],
}
sc.pl.dotplot(adata, marker_genes_dictC, dot_max=0.7, save='markersC.pdf', **dotplot_common)
46 |
--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes-leiden-0.2.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import numpy as np
4 | import scipy as sp
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | from matplotlib import rcParams
8 | from matplotlib import colors
9 | import seaborn as sb
10 | from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

sc.settings.figdir = 'markers-leiden0.2'

adata = sc.read('learned.h5ad') #
#sc.pp.log1p(adata)

print(adata.var_names)

# Dump every feature name, one per line. The context manager closes the file
# even if a write fails.
with open('gene_names.all.tsv', 'w') as oh:
    oh.writelines('%s\n' % g for g in adata.var_names)

# Cell type label -> marker genes drawn for it.
marker_genes_dict = {
    'Epiblast': ["Pou5f1"], # Done
    'Primitive Streak': ['Mixl1'], # Done
    'Meso/endoderm': ['Eomes', 'T'], # Done
    'Endoderm': ['Sox17'], # Done
    'Mesoderm': ['Tbx6'], # Done
    'Ectoderm': ['Nr2f1', 'Pax6'],
    'Exe. endoderm': ["Apoa2"], # Done
    'Exe. ectoderm': ["Tfap2c"], # Done
    'Mesenchyme': ['Pmp22'], # Done
    'Blood progenitors': ['Runx1'], # Done
    'Erythroid': ['Gata1'], # Done
    }

sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.2', vmax=3, rotation=90, dendrogram=False, show=False, save='markers.pdf')
sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.2', dot_max=0.5, dendrogram=False, standard_scale='var', show=False, save='markers.pdf')
sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.2', vmax=3, show=False, save='markers.pdf')

# Per-cell-type embeddings are disabled in this variant of the script:
#for k in marker_genes_dict:
#    sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k))
#    sc.pl.umap(adata, color=marker_genes_dict[k], color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save='markers-{0}.pdf'.format(k))
51 |
--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes-small-grp_cut.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import numpy as np
4 | import scipy as sp
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | from matplotlib import rcParams
8 | from matplotlib import colors
9 | import seaborn as sb
10 | #from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
# Plot defaults.
plt.rcParams['figure.figsize'] = (8, 8)
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

#matplotlib.rcParams['pdf.fonttype']=42
#matplotlib.rcParams['font.size']=6

# obs column holding the clustering that every plot below is grouped by.
todo = 'leiden_r0.3'

sc.settings.figdir = 'markers-{0}'.format(todo)

adata = sc.read('learned.h5ad')

# Options shared by the three TE dot plots at the bottom.
te_dotplot_common = dict(
    groupby=todo,
    dendrogram=True,
    standard_scale='var',
    vmax=1,
    show=False,
)

# Cell type label -> marker genes.
marker_genes_dict = {
    "Epiblast": ["Pou5f1"],
    "Primitive streak": ["Mixl1"],  # Nanog?
    "Endoderms": ["Cer1", "Sox7"],
    "Mesoderms": ["T", "Cdx1"],
    "Ectoderms": ["Six3"],  # And Grhl2
    "Exe endoderm": ["Apoa2"],
    "Exe ectoderm": ["Tfap2c"],
    "Cardiomyocytes": ["Tnnt2"],
    "Blood prog.": ["Lmo2"],
    "Erythroid": ["Gypa"],
}

sc.pl.stacked_violin(
    adata, marker_genes_dict,
    groupby=todo, rotation=90, dendrogram=True, show=False, save='markers.pdf',
)
sc.pl.dotplot(
    adata, marker_genes_dict,
    groupby=todo, color_map='Greens', dot_max=0.7, dendrogram=True,
    standard_scale='var', show=False, save='markers.pdf',
)
sc.pl.heatmap(
    adata, marker_genes_dict,
    groupby=todo, vmax=3, show=False, save='markers.pdf',
)

# high, few: Expressed rarely, but very high in the cells that they are expressed in
marker_genes_dictB = {
    #"Epiblast": ["MTEb-int"],
    #"Primitive streak": ["RLTR1D2_MM"],
    #"Endothelium": ["ERVB7_2B-LTR_MM"],
    #"Ectoderms": ["MamRep137"],
    #"Endoderms": ["MLT1I"],
    "Mesoendoderm": ["RLTR48A", "IAPEY4_LTR", "ORR1F-int"],
    "Extraembryonic": ["LTR16A"],
    "Exe. endoderm": ["MER5C", "RLTR6B_Mm"],
    #"Exe. ectoderm": ["ERVB4_2-LTR_MM"],
    "Cardiomyocyte": ["L1ME3D", "RLTR13A2", "ERVB2_1A-I_MM-int", "RLTR16"],
}
sc.pl.dotplot(adata, marker_genes_dictB, dot_max=0.3, save='markersB.pdf', **te_dotplot_common)

# Super-specific
marker_genes_dictC = {
    #"Primitive streak": [],
    "Mesoendoderm": ["ERVB4_1C-LTR_Mm", "ETnERV3-int"],
    #"others": ["MuRRS4-int"],
    "Exe. endoderm": ["MER46C", "MuRRS4-int", "RLTR20B3", "RLTR1B-int", "LTRIS2"],
    "Exe. ectoderm": ["RLTR45", "RLTR45-int", "IAPLTR1_Mm"],
    #"Cardiomyocyte": ["ETnERV3-int", "L1ME3D", "RLTR13A2", "ERVB2_1A-I_MM-int"],
    "Erythroid": ["RLTR10F", "L1_Mur1"],
}
sc.pl.dotplot(adata, marker_genes_dictC, dot_max=0.7, save='markersC.pdf', **te_dotplot_common)
71 |
--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes-small.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import numpy as np
4 | import scipy as sp
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | from matplotlib import rcParams
8 | from matplotlib import colors
9 | import seaborn as sb
10 | from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
# Plot defaults.
plt.rcParams['figure.figsize'] = (8, 8)
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

sc.settings.figdir = 'markers-small'

adata = sc.read('learned.h5ad')

# Cell type label -> marker genes.
marker_genes_dict = {
    "Epiblast": ["Pou5f1"],
    "Primitive streak": ["Eomes", "Mixl1"],  # Nanog?
    "Endoderms": ["Cer1", "Sox7"],
    "Mesoderms": ["T", "Cdx1"],
    "Ectoderms": ["Grhl2", "Six3"],
    "Exe endoderm": ["Apoa2"],
    "Exe ectoderm": ["Tfap2c"],
    "Cardiomyocytes": ["Tnnt2"],
    "Blood prog.": ["Lmo2"],
    "Erythroid": ["Gypa"],
}

# Summary plots across the leiden_r0.5 clusters.
sc.pl.stacked_violin(
    adata, marker_genes_dict,
    groupby='leiden_r0.5', rotation=90, dendrogram=True, show=False, save='markers.pdf',
)
sc.pl.dotplot(
    adata, marker_genes_dict,
    groupby='leiden_r0.5', color_map='Greens', dot_max=0.5, dendrogram=True,
    standard_scale='var', show=False, save='markers.pdf',
)
sc.pl.heatmap(
    adata, marker_genes_dict,
    groupby='leiden_r0.5', vmax=3, show=False, save='markers.pdf',
)

# One t-SNE and one UMAP figure per cell type, coloured by its marker genes.
for cell_type, genes in marker_genes_dict.items():
    figure_name = 'markers-{0}.pdf'.format(cell_type)
    sc.pl.tsne(adata, color=genes, size=10, legend_loc='on data', vmax=3, show=False, save=figure_name)
    sc.pl.umap(adata, color=genes, color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save=figure_name)
42 |
--------------------------------------------------------------------------------
/example/Figure3/5.marker_genes.py:
--------------------------------------------------------------------------------
1 | import logging, matplotlib, os, sys
2 | import scanpy as sc
3 | import numpy as np
4 | import scipy as sp
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | from matplotlib import rcParams
8 | from matplotlib import colors
9 | import seaborn as sb
10 | from rpy2.robjects.packages import importr
11 | #from gprofiler import gprofiler
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 1
sc.set_figure_params(dpi=200, dpi_save=300)

sc.settings.figdir = 'markers'

adata = sc.read('learned.h5ad') # You can skip the script 3 if using te 2b.
#sc.pp.log1p(adata)

print(adata.var_names)

# Dump every feature name, one per line. The context manager closes the file
# even if a write fails.
with open('gene_names.all.tsv', 'w') as oh:
    oh.writelines('%s\n' % g for g in adata.var_names)

# Cell type label -> marker genes. The labels are also used in the output
# file names of the per-cell-type embeddings below.
marker_genes_dict = {
    'Epiblast': ["Pou5f1", "Epcam"],
    'Primitive streak': ["Eomes", "Nanog"], #Nanog?!?!
    'Anterior primitive streak': ["Gsc", "Mixl1"],
    'Notochord': ["Noto", "T"],
    'Def. Endoderm': ["Cer1", "Sox7"],
    'Nascent mesoderm': ["Mesp1", "Apela"],
    'Caudal mesoderm': ["Cdx1", "Hes7"],
    'Paraxial mesoderm': ["Tcf15", "Tbx1"],
    'Somitic mesoderm': ["Tbx6", "Dll1"],
    'Pharngyeal mesoderm': ["Tcf21", "Isl1"],
    'Cardiomyocytes': ["Tnnt2", "Myl4"],
    'Allantois': ["Tbx4", "Hoxa11"],
    'Mesenchyme': ["Krt18", "Pmp22"],
    'Hemandothelial prog.': ["Kdr", "Etv2"],
    'Endothelium': ["Pecam1", "Anxa5"],
    'Blood prog.': ["Runx1", "Lmo2"],
    'Erythroid': ["Gata1", "Gypa"],
    'Neuromesoderml prog.': ["Cdx4", "Epha5"],
    'Neurectoderm': ["Six3", "Irx3"],
    'Neural crest': ["Dlx2", "Sox10"],
    'Brain': ["En1", "Pax2"],
    'Spinal cord': ["Sox2", "Pax2"],
    'Surface ectoderm': ["Trp63", "Grhl2"],
    'Visceral endoderm': ["Dkk1", "Amot"],
    'Exe endoderm': ["Ttr", "Apoa2"],
    'Exe ectoderm': ["Tfap2c", "Elf5"],
    'Parietal endoderm': ["Sparc", "Plat"],
    'others': ['Fgf5', 'Lefty2'],
    }

sc.pl.stacked_violin(adata, marker_genes_dict, groupby='leiden_r0.5', rotation=90, dendrogram=True, show=False, save='markers.pdf')
sc.pl.dotplot(adata, marker_genes_dict, groupby='leiden_r0.5', dot_max=0.5, dendrogram=True, standard_scale='var', show=False, save='markers.pdf')
sc.pl.heatmap(adata, marker_genes_dict, groupby='leiden_r0.5', vmax=3, show=False, save='markers.pdf')

# One t-SNE and one UMAP figure per cell type, coloured by its marker genes.
for k in marker_genes_dict:
    sc.pl.tsne(adata, color=marker_genes_dict[k], size=10, legend_loc='on data', vmax=3, show=False, save='markers-{0}.pdf'.format(k))
    sc.pl.umap(adata, color=marker_genes_dict[k], color_map='plasma', size=10, vmax=3, legend_loc='on data', show=False, save='markers-{0}.pdf'.format(k))
66 |
67 |
--------------------------------------------------------------------------------
/example/Figure3/TE_genes_id.mm10.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiekaiLab/scTE/566f6ab3baaf76cd006ab965edc08e4576eb73c9/example/Figure3/TE_genes_id.mm10.txt.gz
--------------------------------------------------------------------------------
/scTE/__init__.py:
--------------------------------------------------------------------------------
1 |
__version__ = "1.0"

# The package-level re-exports are currently disabled; import these names
# from the subpackage instead (scTE.miniglbase).
# from .miniglbase import genelist, location, glload
# import .miniglbase

# __all__ may only list names that exist on the package: with the imports
# above disabled, listing genelist/location/glload made
# ``from scTE import *`` fail with AttributeError.
__all__ = ["__version__"]
8 |
--------------------------------------------------------------------------------
/scTE/annotation.py:
--------------------------------------------------------------------------------
1 | import os,sys,gzip,time
2 | import numpy as np
3 | from scTE.miniglbase import genelist, glload, location
4 |
5 | form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
6 |
def _merge_intervals(intervals):
    """Merge half-open (left, right) integer intervals that overlap or touch."""
    merged = []
    for left, right in sorted(intervals):
        if merged and left <= merged[-1][1]:
            # Overlapping or directly adjacent: extend the current run.
            if right > merged[-1][1]:
                merged[-1][1] = right
        else:
            merged.append([left, right])
    return merged


def cleanexon(filename, genefilename, exons):
    """Collapse each gene's exons into non-overlapping blocks and write them as BED.

    Output goes to ``<filename>_scTEtmp/index/<genefilename>.bed.gz`` (the
    directory is created if needed), one tab-separated line per block:
    chromosome, first position, last position, gene name. Genes are written
    in sorted name order.

    Parameters
    ----------
    filename : str
        Prefix of the ``_scTEtmp`` working directory.
    genefilename : str
        Base name of the BED file to write.
    exons : dict
        Gene name -> list of ``[chrom, left, right]`` records. Each record
        covers the integer positions ``left .. right-1``; records with
        ``right <= left`` cover nothing and are ignored.

    Notes
    -----
    Blocks are merged per chromosome, so a gene name that occurs on several
    chromosomes yields separate blocks for each of them. A gene whose records
    cover a single position is written as a one-position block, and a gene
    whose records cover nothing produces no line.
    """
    outdir = '%s_scTEtmp/index' % filename
    # makedirs avoids handing the (caller-supplied) path to a shell.
    os.makedirs(outdir, exist_ok=True)

    with gzip.open('%s/%s.bed.gz' % (outdir, genefilename), 'wt') as oh:
        for name in sorted(exons):
            # Group by chromosome, keeping first-seen chromosome order.
            by_chrom = {}
            for chrom, left, right in exons[name]:
                if right > left:
                    by_chrom.setdefault(chrom, []).append((left, right))

            for chrom, spans in by_chrom.items():
                for start, end in _merge_intervals(spans):
                    # The written end is the last covered position (end - 1).
                    oh.write('%s\t%s\t%s\t%s\n' % (chrom, start, end - 1, name))
30 |
def _open_annotation(path):
    """Open an annotation file as text, transparently handling gzip (``.gz``) files."""
    if path.endswith('.gz'):
        return gzip.open(path, 'rt')
    # Plain 'r' already gives universal newlines; the old 'rU' mode was removed in Python 3.11
    return open(path, 'r')


def _bucket_starts(left, rite):
    """Return the starts of the 10 kb buckets spanned by the interval [left, rite]."""
    left_buck = int((left - 1) / 10000) * 10000
    right_buck = int(rite / 10000) * 10000
    return range(left_buck, right_buck + 10000, 10000)


def annoGtf(filename, genefile, tefile, mode):
    '''
    **Purpose**
        Build the combined gene + TE annotation index from a GTF and a TE BED file.

    **Arguments**
        filename (Required)
            filename stub; all files are written below <filename>_scTEtmp/index

        genefile (Required)
            gene annotation in GTF format (plain or .gz). Only 'exon' and 'UTR'
            records are used, keyed by their gene_name attribute.

        tefile (Required)
            TE annotation as a tab-separated BED: chrom, left, right, name (plain or .gz)

        mode (Required)
            'exclusive': drop every TE that overlaps an exon/UTR of a record whose
                         line mentions protein_coding or lincRNA
            'inclusive': keep all TEs

    **Returns**
        The path of the saved .glb index

    **Raises**
        ValueError if mode is neither 'exclusive' nor 'inclusive'
    '''
    if mode not in ('exclusive', 'inclusive'):
        raise ValueError("mode must be 'exclusive' or 'inclusive', got %r" % (mode,))

    genefilename = genefile.split('/')[-1].replace('.gtf', '').replace('.gz', '')
    tefilename = tefile.split('/')[-1].replace('.bed', '').replace('.gz', '')
    index_dir = '%s_scTEtmp/index' % filename

    raw = {}    # every exon/UTR, per gene name
    clean = {}  # only exons/UTRs from protein_coding / lincRNA lines
    with _open_annotation(genefile) as handle:
        for line in handle:
            if line.startswith('#') or not line.strip():
                continue
            t = line.strip().split('\t')
            if t[2] == 'exon' or t[2] == 'UTR':
                chrom = t[0].replace('chr', '')
                left = int(t[3])
                rite = int(t[4])
                name = t[8].split('gene_name "')[1].split('";')[0]

                raw.setdefault(name, []).append([chrom, left, rite])

                if 'protein_coding' not in line and 'lincRNA' not in line:
                    continue
                clean.setdefault(name, []).append([chrom, left, rite])

    cleanexon(filename, '%s.raw' % genefilename, raw)
    cleanexon(filename, '%s.clean' % genefilename, clean)

    raw_bed = '%s/%s.raw.bed.gz' % (index_dir, genefilename)

    if mode == 'exclusive':
        # Bucket the clean exons per chromosome so each TE is only compared to nearby exons
        gene = {}
        with gzip.open('%s/%s.clean.bed.gz' % (index_dir, genefilename), 'rt') as handle:
            for line in handle:
                t = line.strip().split('\t')
                chrom = t[0].replace('chr', '')
                left = int(t[1])
                rite = int(t[2])

                chrom_buckets = gene.setdefault(chrom, {})
                for buck in _bucket_starts(left, rite):
                    chrom_buckets.setdefault(buck, []).append([left, rite])

        noverlap = []
        with _open_annotation(tefile) as handle:
            for line in handle:
                if not line.strip():
                    continue
                t = line.strip().split('\t')
                chrom = t[0]
                left = int(t[1])
                rite = int(t[2])
                record = '%s\t%s\t%s\t%s\n' % (chrom, left, rite, t[3])

                # The gene buckets are keyed without the 'chr' prefix, so strip it for the
                # lookup; otherwise 'chr'-prefixed TE files would never be filtered at all.
                chrom_buckets = gene.get(chrom.replace('chr', ''))
                if chrom_buckets is None:
                    noverlap.append(record)
                    continue

                buckets_reqd = _bucket_starts(left, rite)
                if not buckets_reqd:
                    continue

                overlaps = any(
                    left < exon[1] and rite > exon[0]
                    for buck in buckets_reqd
                    for exon in chrom_buckets.get(buck, ())
                )
                if not overlaps:
                    noverlap.append(record)

        te_bed = '%s/%s.exclusive.gz' % (index_dir, tefilename)
        with gzip.open(te_bed, 'wt') as oh:
            oh.writelines(noverlap)

        genes = genelist(raw_bed, format=form, gzip=True)
        TEs = genelist(te_bed, format=form, gzip=True)
        print(genes)
        print(TEs)

        annot = '%s/custome.exclusive.glb' % index_dir

    else: # inclusive
        genes = genelist(raw_bed, format=form, gzip=True)
        # Test the real path: tefilename already had '.gz' stripped, so it can never end in '.gz'
        if tefile.endswith('.gz'):
            TEs = genelist(tefile, format=form, gzip=True)
        else:
            TEs = genelist(tefile, format=form)

        annot = '%s/custome.inclusive.glb' % index_dir

    all_annot = genes + TEs
    all_annot.save(annot)

    return annot
153 |
154 |
155 |
156 |
157 |
158 |
--------------------------------------------------------------------------------
/scTE/base.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import multiprocessing
3 | import argparse
4 | from functools import partial
5 | import logging
6 | import os, sys, glob, datetime, time, gzip
7 | import collections
8 | from collections import defaultdict
9 | from math import log
10 | from scTE.miniglbase import genelist, glload, location
11 | from scTE.annotation import annoGtf
12 | import subprocess
13 |
14 | import numpy as np
15 | import scipy
16 | import anndata as ad
17 |
def read_opts(parser):
    '''
    **Purpose**
        Parse the command line and decorate the resulting namespace.

    **Arguments**
        parser (Required)
            an argparse parser providing format, out, annoglb, genenumber,
            countnumber and thread

    **Returns**
        The parsed namespace, extended with:
        parser (copy of the validated format), the logging shortcuts
        error/warn/debug/info, and argtxt (a printable parameter summary).
        Exits with status 1 when format is neither BAM nor SAM.
    '''
    args = parser.parse_args()

    if args.format != "BAM" and args.format != "SAM":
        logging.error("The input file must be SAM/BAM format: %s !\n" % (args.format))
        sys.exit(1)
    args.parser = "BAM" if args.format == "BAM" else "SAM"

    # Logging shortcuts carried around on the namespace
    shortcuts = (
        ('error', logging.critical),
        ('warn', logging.warning),
        ('debug', logging.debug),
        ('info', logging.info),
    )
    for attribute, function in shortcuts:
        setattr(args, attribute, function)

    summary = [
        "Parameter list:",
        "Sample = %s" % (args.out),
        "Reference annotation index = %s" % (args.annoglb[0]),
        "Minimum number of genes required = %s" % (args.genenumber),
        "Minimum number of counts required = %s" % (args.countnumber),
        "Number of threads = %s " % (args.thread),
    ]
    args.argtxt = "\n".join(summary)
    return args
42 |
43 | # def getanno(filename, genefile, tefile, genome, mode):
44 | # form ={'force_tsv': True, 'loc': 'location(chr=column[0], left=column[1], right=column[2])', 'annot': 3}
45 | #
46 | # if genefile == 'default' and tefile == 'default':
47 | # if genome == 'mm10':
48 | # chr_list = ['chr'+ str(i) for i in range(1,20) ] + [ 'chrX','chrY', 'chrM' ]
49 | # if mode == 'exclusive':
50 | # if not os.path.exists('mm10.exclusive.glb'):
51 | # logging.error("Did not find the annotation index mm10.exclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" )
52 | # sys.exit(1)
53 | # all_annot = 'mm10.exclusive.glb'
54 | # allelement = set(glload(all_annot)['annot'])
55 | #
56 | # elif mode == 'inclusive':
57 | # if not os.path.exists('mm10.inclusive.glb'):
58 | # logging.error("Did not find the annotation index mm10.inclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" )
59 | # sys.exit(1)
60 | # all_annot = 'mm10.inclusive.glb'
61 | # allelement = set(glload(all_annot)['annot'])
62 | #
63 | # elif genome == 'hg38':
64 | # chr_list = ['chr'+ str(i) for i in range(1,23) ] + [ 'chrX','chrY', 'chrM' ]
65 | # if mode == 'exclusive':
66 | # if not os.path.exists('hg38.exclusive.glb'):
67 | # logging.error("Did not find the annotation index hg38.exclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n" )
68 | # sys.exit(1)
69 | # all_annot = 'hg38.exclusive.glb'
70 | # allelement = set(glload(all_annot)['annot'])
71 | #
72 | # elif mode == 'inclusive':
73 | # if not os.path.exists('hg38.inclusive.glb'):
74 | # logging.error("Did not find the annotation index hg38.inclusive.glb, you can download it from scTE github (www....) or either give the annotation with -te and -gene option \n")
75 | # sys.exit(1)
76 | # all_annot = 'hg38.inclusive.glb'
77 | # allelement = set(glload(all_annot)['annot'])
78 | # else:
79 | # if genome in ['hg38']:
80 | # chr_list = ['chr'+ str(i) for i in range(1,23) ] + [ 'chrX','chrY', 'chrM' ]
81 | #
82 | # elif genome in ['mm10']:
83 | # chr_list = ['chr'+ str(i) for i in range(1,20) ] + [ 'chrX','chrY', 'chrM' ]
84 | #
85 | # if not os.path.isfile(tefile) :
86 | # logging.error("No such file: %s !\n" %(tefile))
87 | # sys.exit(1)
88 | #
89 | # if not os.path.isfile(genefile) :
90 | # logging.error("No such file: %s !\n" % (genefile))
91 | # sys.exit(1)
92 | #
93 | # all_annot = annoGtf(filename, genefile=genefile, tefile=tefile, mode=mode)
94 | # allelement = set(glload(all_annot)['annot'])
95 | #
96 | # return(allelement,chr_list,all_annot)
97 |
def Readanno(filename, annoglb): #genome
    '''
    **Purpose**
        Load a prebuilt annotation index and summarise its contents.

    **Arguments**
        filename
            not used by this function

        annoglb (Required)
            path of the .glb annotation index, passed to glload()

    **Returns**
        (set of all 'annot' values, list of chromosome names found in the index,
        annoglb unchanged, the loaded index)
    '''
    glannot = glload(annoglb)
    allelement = set(glannot['annot'])

    # Take the chromosomes from the index itself, so custom chromosome names work too
    chr_list = list({entry['chr'] for entry in glannot['loc']})

    return allelement, chr_list, annoglb, glannot
108 |
def _count_tagged_reads(filename, out, tag, report):
    """Count how many of the first 100 alignments of *filename* contain '<tag>:Z:'.

    The count is written by the shell to <out>_scTEtmp/o1/<report> and read back;
    one integer is returned per line of that report.
    """
    report_path = '%s_scTEtmp/o1/%s' % (out, report)
    # subprocess.run() blocks until the shell pipeline has finished, so the report
    # is complete on return and no sleep is needed before reading it.
    subprocess.run('samtools view %s | head -100| grep "%s:Z:" | wc -l > %s' % (filename, tag, report_path), shell=True)
    with open(report_path, 'r') as handle:
        return [int(line.strip()) for line in handle]


def checkCBUMI(filename, out, CB, UMI):
    '''
    **Purpose**
        Check that the alignment file carries the requested cell barcode / UMI tags.

        The first 100 alignments are inspected; if fewer than 100 of them carry the
        tag an error is logged and the program exits with status 1.
        NOTE(review): a file with fewer than 100 alignments therefore always fails.

    **Arguments**
        filename (Required)
            the BAM/SAM file to inspect (read through samtools view)

        out (Required)
            filename stub; reports are written to <out>_scTEtmp/o1, which must exist

        CB (Required)
            'CR' or 'CB' to check that tag, anything else skips the barcode check

        UMI (Required)
            'UR' or 'UB' to check that tag, anything else skips the UMI check

    **Returns**
        None
    '''
    if CB in ('CR', 'CB'):
        if any(n < 100 for n in _count_tagged_reads(filename, out, CB, 'testCR.txt')):
            logging.error("The input file %s has no cell barcodes information, plese make sure the aligner have add the cell barcode key, or set CB to False"%filename)
            sys.exit(1)

    if UMI in ('UR', 'UB'):
        if any(n < 100 for n in _count_tagged_reads(filename, out, UMI, 'testUMI.txt')):
            logging.error("The input file %s has no %s:Z information, plese make sure the aligner have add the UMI key, or set UMI to False" % (filename, UMI))
            sys.exit(1)
147 |
def _bam2bed_command(filename, CB, UMI, label, dest, num_threads=None):
    """Build the shell pipeline that turns an alignment file into a gzipped BED.

    Each alignment becomes: chrom (without 'chr'), start, start+100, barcode[, UMI].
    Without a barcode tag, *label* is written in the barcode column. With a UMI tag,
    only the first row of every barcode+UMI pair is kept.

    Raises ValueError for a CB/UMI combination no pipeline exists for.
    """
    if CB not in ('False', 'CR', 'CB') or UMI not in ('False', 'UR', 'UB') \
            or (CB == 'False' and UMI != 'False'):
        raise ValueError('Unsupported CB/UMI combination: CB=%s, UMI=%s' % (CB, UMI))

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'

    if num_threads is None:
        stages = ['samtools view %s' % filename]
    else:
        stages = ['samtools view -@ %s %s' % (num_threads, filename)]

    if CB == 'False':
        # Put the sample name in the barcode slot
        stages.append('awk \'{OFS="\t"}{print $3,$4,$4+100,"%s"}\'' % label)
    else:
        # Locate the optional SAM fields (column 12 onwards) holding the tags
        finders = '{for(i=12;i<=NF;i++)if($i~/%s:Z:/)n=i}' % CB
        columns = '$3,$4,$4+100,$n'
        if UMI != 'False':
            finders += '{for(i=12;i<=NF;i++)if($i~/%s:Z:/)m=i}' % UMI
            # Only print $m when m was set above: an unset m makes awk print $0,
            # i.e. the entire SAM record
            columns += ',$m'
        stages.append('awk \'{OFS="\t"}%s{print %s}\'' % (finders, columns))
        stages.append("sed %s 's/%s:Z://g'" % (switch, CB))
        if UMI != 'False':
            stages.append("sed %s 's/%s:Z://g'" % (switch, UMI))

    stages.append("sed %s 's/^chr//g'" % switch)
    if UMI != 'False':
        stages.append("awk '!x[$4$5]++'")
    stages.append('gzip -c > %s' % dest)
    return ' | '.join(stages)


def Bam2bed(filename, CB, UMI, out, num_threads):
    '''
    **Purpose**
        Convert one alignment file into <out>_scTEtmp/o1/<out>.bed.gz

    **Arguments**
        filename (Required)
            the BAM/SAM file, read through samtools view

        CB (Required)
            'CR' or 'CB' for the barcode tag to use, 'False' to write out as the barcode

        UMI (Required)
            'UR' or 'UB' for the UMI tag to use, 'False' for no UMI (requires a barcode tag
            unless CB is also 'False')

        out (Required)
            filename stub for the tmp files

        num_threads (Required)
            passed to samtools view -@

    **Raises**
        ValueError for an unsupported CB/UMI combination (previously a silent no-op)
    '''
    command = _bam2bed_command(filename, CB, UMI, out, '%s_scTEtmp/o1/%s.bed.gz' % (out, out), num_threads)
    os.makedirs('%s_scTEtmp/o1' % out, exist_ok=True)
    os.system(command)


def Para_bam2bed(filename, CB, UMI, out):
    '''
    **Purpose**
        Convert one alignment file into <out>_scTEtmp/o0/<sample>.bed.gz, where
        sample is the base name of filename without '.bam'

    **Arguments**
        filename (Required)
            the BAM/SAM file, read through samtools view

        CB (Required)
            'CR' or 'CB' for the barcode tag to use, 'False' to write the sample
            name as the barcode

        UMI (Required)
            'UR' or 'UB' for the UMI tag to use, 'False' for no UMI

        out (Required)
            filename stub for the tmp files

    **Raises**
        ValueError for an unsupported CB/UMI combination (previously a silent no-op)
    '''
    sample = filename.split('/')[-1].replace('.bam', '')
    command = _bam2bed_command(filename, CB, UMI, sample, '%s_scTEtmp/o0/%s.bed.gz' % (out, sample))
    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)
    os.system(command)
205 |
def splitAllChrs(chromosome_list, filename, genenumber, countnumber, UMI=True):
    '''
    **Purpose**
        Split the data into separate beds, and count up all the times each barcode appears

        This variant uses more memory, but does it all at the same time and gets the filtered whitelist for free

    **Arguments**
        chromosome_list
            List of chromosome names (with or without a leading 'chr')

        filename (Required)
            filename stub to use for tmp files

        genenumber (Required)
            Minimum number of genes expressed required for a cell to pass filtering.
            Only used when countnumber is not set: the threshold is then 2 * genenumber

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.

        UMI (optional, default=True)
            use the UMI: identical input lines on a chromosome are only kept once

    **Returns**
        The barcode whitelist
    '''
    os.makedirs('%s_scTEtmp/o2' % filename, exist_ok=True)

    chromosome_list = set(c.replace('chr', '') for c in chromosome_list)

    CRs = defaultdict(int)
    uniques = {chrom: set() for chrom in chromosome_list} if UMI else None
    file_handles_out = {}

    with gzip.open('%s_scTEtmp/o1/%s.bed.gz' % (filename, filename), 'rt') as file_handle_in:
        try:
            for chrom in chromosome_list:
                file_handles_out[chrom] = gzip.open('%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chrom), 'wt')

            # Make a BED for each chromosome
            for line in file_handle_in:
                t = line.strip().split('\t')
                chrom = t[0].replace('chr', '') # strip chr

                if chrom not in chromosome_list:
                    # Force chrMT -> chrM
                    if chrom == 'MT':
                        chrom = 'M'
                    # Re-check after the remap: 'M' may not be indexed either
                    if chrom not in chromosome_list: # remove the unusual chromosomes
                        continue

                if UMI:
                    if line in uniques[chrom]:
                        continue
                    uniques[chrom].add(line)

                CRs[t[3]] += 1
                file_handles_out[chrom].write(line)
        finally:
            # Close every output that was opened, even when a line could not be processed
            for handle in file_handles_out.values():
                handle.close()

    mincounts = countnumber if countnumber else 2 * genenumber

    return [barcode for barcode, count in CRs.items() if count >= mincounts]
279 |
def filterCRs(filename, genenumber, countnumber):
    '''
    **Purpose**
        Sum the per-chromosome barcode counts and keep the barcodes with enough counts

    **Arguments**
        filename (Required)
            filename stub; reads every <filename>_scTEtmp/o2/<filename>*.count.gz
            (tab-separated: barcode, count)

        genenumber (Required)
            Only used when countnumber is not set: the threshold is then 2 * genenumber

        countnumber (Required)
            Minimum number of counts required for a cell to pass filtering.

    **Returns**
        The barcode whitelist
    '''
    CRs = defaultdict(int)
    for f in sorted(glob.glob('%s_scTEtmp/o2/%s*.count.gz'%(filename,filename))):
        logging.info('Reading %s '%os.path.split(f)[1])
        with gzip.open(f, 'rt') as handle:
            for line in handle:
                t = line.strip().split('\t')
                CRs[t[0]] += int(t[1])

    mincounts = countnumber if countnumber else 2 * genenumber

    logging.info('Before filter %s'%len(CRs))
    CRs = {k: v for k, v in CRs.items() if v >= mincounts}
    logging.info('After filter %s'%len(CRs))

    return list(CRs.keys())
300 |
def splitChr(chr, filename, CB, UMI):
    '''
    **Purpose**
        Extract the reads of one chromosome from <filename>_scTEtmp/o1/<filename>.bed.gz
        into <filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz and write the number of
        reads per barcode next to it as <filename>.chr<chr>.count.gz

    **Arguments**
        chr (Required)
            chromosome name, with or without a leading 'chr'

        filename (Required)
            filename stub to use for tmp files

        CB, UMI (Required)
            accepted for interface compatibility; the same commands are run
            whatever their values

    **Returns**
        None
    '''
    if not os.path.exists('%s_scTEtmp/o2'%filename):
        os.system('mkdir -p %s_scTEtmp/o2'%filename)

    chrom = chr.replace('chr','')
    source = '%s_scTEtmp/o1/%s.bed.gz' % (filename, filename)
    target = '%s_scTEtmp/o2/%s.chr%s.bed.gz' % (filename, filename, chrom)

    # Rows are selected with a prefix grep on the chromosome column.
    # NOTE(review): only for 1, 2 and 3 are the two-digit chromosomes sharing the
    # first digit removed beforehand; any other name matches by prefix alone.
    if chrom in ('1', '2', '3'):
        command = 'gunzip -c -f %s | grep -v ^%s\'[0-9]\' | grep ^%s | gzip -c > %s' % (source, chrom, chrom, target)
    else:
        command = 'gunzip -c -f %s | grep ^%s | gzip -c > %s' % (source, chrom, target)
    os.system(command)

    # Count the reads of every barcode (4th column) on this chromosome
    tally = defaultdict(int)
    with gzip.open(target, 'rt') as bed:
        for record in bed:
            tally[record.strip().split('\t')[3]] += 1

    with gzip.open('%s_scTEtmp/o2/%s.chr%s.count.gz' % (filename, filename, chrom), 'wt') as counts:
        for barcode, number in tally.items():
            counts.write('%s\t%s\n' % (barcode, number))
361 |
def align(chr, filename, all_annot, glannot, whitelist): #CB
    '''
    **Purpose**
        For each read, align it to the index and assign a TE, gene.

        This is the speed critical part.

    **Arguments**
        chr (Required)
            chromosome name WITHOUT the leading 'chr'

        filename (Required)
            filename stub; reads <filename>_scTEtmp/o2/<filename>.chr<chr>.bed.gz and
            writes <filename>_scTEtmp/o3/<filename>.chr<chr>.bed.gz

        all_annot (Required)
            path of the annotation index, only loaded when glannot is empty/None

        glannot (Required)
            the loaded annotation index (needs .buckets and .linearData), or a false
            value to load all_annot here

        whitelist (Required)
            the barcodes to keep; any iterable of barcodes

    **Returns**
        None. The output rows are: barcode, annotation name, number of reads
    '''
    chr = 'chr' + chr

    os.makedirs('%s_scTEtmp/o3' % filename, exist_ok=True)

    if not glannot: # Load separately for the multicore pipeline, share the index for the single core pipeline
        glannot = glload(all_annot)

    # Only keep the glbase parts we need.
    buckets = glannot.buckets[chr.replace('chr', '')]
    annotations = glannot.linearData

    # The membership test below runs once per read; make sure it is a hash lookup
    # even when the caller hands over a list.
    if not isinstance(whitelist, (set, frozenset, dict)):
        whitelist = set(whitelist)

    res = {}
    with gzip.open('%s_scTEtmp/o2/%s.%s.bed.gz' % (filename, filename, chr), 'rt') as oh:
        for line in oh:
            t = line.strip().split('\t')
            barcode = t[3]
            if barcode not in whitelist:
                continue
            if barcode not in res:
                res[barcode] = defaultdict(int)

            # No chromosome check needed: each file is already split per chromosome
            left = int(t[1])
            rite = int(t[2])

            left_buck = ((left-1)//10000) * 10000
            right_buck = ((rite)//10000) * 10000

            # Collect the ids of every annotation stored in the buckets the read touches
            loc_ids = set()
            for buck in range(left_buck, right_buck+10000, 10000):
                if buck in buckets:
                    loc_ids.update(buckets[buck])

            counts = res[barcode]
            for index in loc_ids:
                item = annotations[index]
                span = item['loc'].loc
                if rite >= span['left'] and left <= span['right']:
                    counts[item['annot']] += 1

    with gzip.open('%s_scTEtmp/o3/%s.%s.bed.gz' % (filename, filename, chr), 'wt') as oh:
        for bc in sorted(res):
            for gene in sorted(res[bc]):
                oh.write('%s\t%s\t%s\n' % (bc, gene, res[bc][gene]))
422 |
def Countexpression(filename, allelement, genenumber, cellnumber, hdf5):
    '''
    **Purpose**
        Build the final barcode x gene/TE count matrix from
        <filename>_scTEtmp/o4/<filename>.bed.gz (rows: barcode, name, count)
        and save it as <filename>.csv or <filename>.h5ad

    **Arguments**
        filename (Required)
            filename stub for the tmp files and the output

        allelement (Required)
            every gene/TE name of the annotation; these are the output columns (sorted)

        genenumber (Required)
            barcodes with fewer rows than this in the input are dropped

        cellnumber (Required)
            keep at most this many barcodes, those with the most rows first

        hdf5 (Required)
            true to save an AnnData .h5ad with a sparse matrix, false to save a csv

    **Returns**
        (number of barcodes saved, genenumber, filename)
    '''
    merged_bed = '%s_scTEtmp/o4/%s.bed.gz' % (filename, filename)

    # Pass 1: number of rows per barcode, used to rank the barcodes
    rows_per_barcode = defaultdict(int)
    with gzip.open(merged_bed, 'rt') as handle:
        for line in handle:
            rows_per_barcode[line.strip().split('\t')[0]] += 1

    CRlist = set()
    ranked = sorted(rows_per_barcode.items(), key=lambda item: item[1], reverse=True)
    for n, (barcode, nrows) in enumerate(ranked):
        if nrows < genenumber or n >= cellnumber:
            break
        CRlist.add(barcode)

    # Pass 2: sum the counts of the kept barcodes
    res = {}
    with gzip.open(merged_bed, 'rt') as handle:
        for line in handle:
            t = line.strip().split('\t')
            if t[0] not in CRlist:
                continue
            cell = res.setdefault(t[0], {})
            cell[t[1]] = cell.get(t[1], 0) + int(t[2])

    gene_seen = sorted(allelement) # Do the sort once;

    #==== save results =====
    if not hdf5: # save as csv
        with open('%s.csv' % filename, 'w') as res_oh:
            res_oh.write('barcodes,')
            res_oh.write('%s\n' % (','.join([str(i) for i in gene_seen])))

            for barcode in sorted(res):
                cell = res[barcode]
                res_oh.write('%s,%s\n' % (barcode, ','.join([str(cell.get(gene, 0)) for gene in gene_seen])))

    else: # save as hdf5
        import scipy.sparse

        # Fill the sparse matrix straight from the non-zero counts; a dense
        # barcodes x genes table is never materialised.
        CBs = sorted(res)
        column_of = {gene: idx for idx, gene in enumerate(gene_seen)}
        rows, cols, vals = [], [], []
        for row, barcode in enumerate(CBs):
            for gene, count in res[barcode].items():
                col = column_of.get(gene)
                if col is None: # not an output column, ignored as in the csv
                    continue
                rows.append(row)
                cols.append(col)
                vals.append(count)

        matrix = scipy.sparse.csr_matrix(
            (np.asarray(vals, dtype=int), (np.asarray(rows, dtype=int), np.asarray(cols, dtype=int))),
            shape=(len(CBs), len(gene_seen)),
        )

        obs = pd.DataFrame(index = CBs)
        var = pd.DataFrame(index = gene_seen)
        adata = ad.AnnData(matrix, var = var, obs = obs)
        adata.write('%s.h5ad'%filename)

    #========================

    return len(res), genenumber, filename
501 |
def timediff(timestart, timestop):
    '''
    **Purpose**
        Format the time elapsed between two datetimes

    **Arguments**
        timestart, timestop (Required)
            two values whose difference has days, seconds and microseconds
            (e.g. datetime.datetime objects)

    **Returns**
        A string like "0d 1h 2m 3s"; fractions of a second are dropped
    '''
    elapsed = timestop - timestart
    whole_seconds = int(elapsed.seconds + elapsed.microseconds / 1000000)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "%dd %dh %dm %ds" % (elapsed.days, hours, minutes, seconds)
513 |
--------------------------------------------------------------------------------
/scTE/miniglbase/README.md:
--------------------------------------------------------------------------------
1 | # README #
2 |
3 | ### What is glbase3? ###
4 |
5 | This is a staged mini version of glbase.
6 |
7 | You can find the full install here:
8 |
9 | https://github.com/oaxiom/glbase3
10 |
11 | ### License ###
12 |
13 | glbase is distributed under the MIT license:
14 | {{{
15 | Copyright (C) 2009-2019 Andrew Hutchins
16 |
17 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
18 |
19 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
20 |
21 | Except as contained in this notice, the name(s) of the above copyright holders shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization.
22 |
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 | }}}
25 |
26 |
--------------------------------------------------------------------------------
/scTE/miniglbase/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Initialise glbase, import all the libraries, set up the environment etc.
4 |
5 | Requires:
6 | * numpy
7 | * matplotlib
8 | * scipy
9 | * sklearn
10 | * h5py
11 | * networkx
12 | """
13 |
14 | import sys, os
15 |
#-----------------------------------------------------------------------
# Load all of the global configuration options.
try:
    from . import config
except Exception:
    print("Error: Fatal - glbase3 is not installed correctly, cannot find my own libraries")
    print("       Is the python 'sys.path' correct?")
    sys.exit() # no raise if I can't get errors, it's surely a fatal installation problem.

# ----------------------------------------------------------------------
# Test for availability of the core non-standard libs.
# These need to be available as the subsequent load/checking is weak/non-existent.
# Only numpy and scipy are checked here. A failure is reported as an ImportError
# chained to the original error (no LibraryNotFoundError is defined or imported
# in this module, so raising that name would only produce a NameError).

try:
    import numpy
    config.NUMPY_AVAIL = True
except Exception as err:
    raise ImportError("Fatal - Numpy is not available or not installed") from err

try:
    import scipy
    config.SCIPY_AVAIL = True
except Exception as err:
    raise ImportError("Fatal - Scipy is not available or not installed") from err
40 |
41 | # ----------------------------------------------------------------------
42 | # Now import the rest of my libraries - assumes here they are available.
43 | # If I can get config and errors then these are probably available too.
44 |
45 | from .utils import glload
46 | from .location import location
47 | from .genelist import genelist
48 |
49 | # export all of the libraries, methods and helpers.
50 | __all__ = ["genelist",
51 | 'config',
52 | "location",
53 | "glload",
54 | ]
55 |
--------------------------------------------------------------------------------
/scTE/miniglbase/base_genelist.py:
--------------------------------------------------------------------------------
1 |
2 | import copy, pickle, re
3 | from shlex import split as shlexsplit
4 |
5 | from . import config
6 | from .location import location
7 |
8 | class _base_genelist:
9 | def __init__(self):
10 | """
11 | (Internal)
12 | This is the base derived class for all genelists.
13 | It contains methods available to all implementations of genelist.
14 | """
15 | self.name = None
16 | self.linearData = None
17 |
18 | def __repr__(self):
19 | return("")
20 |
21 | def __in__(self, key):
22 | """
23 | (Override)
24 |
25 | Confer:
26 | if "key" in genelist:
27 | """
28 | return(key in list(self.keys()))
29 |
30 | def __bool__(self):
31 | """
32 | Fixes:
33 | if genelist: # contains something
34 | True
35 |
36 | and fixes:
37 |
38 | len(genelist) = 0
39 | if genelist: # Would pass even if the genelist is empty
40 | False
41 |
42 | """
43 | return(len(self) > 0)
44 |
45 | def __shallowcopy__(self):
46 | raise Exception("__shallowcopy__() is NOT supposrted for genelists, use gl.deepcopy() or gl.shallowcopy()")
47 |
48 | def __deepcopy__(self, fake_arg):
49 | raise Exception("__deepcopy__() is NOT supported for genelists, use gl.deepcopy() or gl.shallowcopy()")
50 |
51 | def deepcopy(self):
52 | """
53 | Confer copy to mean a deepcopy as opposed to a shallowcopy.
54 |
55 | This is required as genelists are compound lists.
56 | """
57 | return(pickle.loads(pickle.dumps(self, -1))) # This is 2-3x faster and presumably uses less memory
58 |
59 | def shallowcopy(self):
60 | """
61 | (New)
62 |
63 | Some weird behaviour here, I know, this is so I can still get access to
64 | the shallow copy mechanism even though 90% of the operations are copies.
65 | """
66 | return(copy.copy(self)) # But doesnt this just call __copy__() anyway?
67 |
68 | def __len__(self):
69 | """
70 | (Override)
71 | get the length of the list
72 | """
73 | return(len(self.linearData))
74 |
75 | def __int__(self):
76 | """
77 | (Override)
78 | get the length of the list
79 | NOTE: It's possible this is a bug/feature.
80 | I don't remove it at the moment as I'm not sure if it is used anywhere.
81 |
82 | """
83 | return(len(self.linearData))
84 |
85 | def __iter__(self):
86 | """
87 | (Override)
88 | make the geneList behave like a normal iterator (list)
89 | """
90 | for n in self.linearData:
91 | yield n
92 |
93 | def __getitem__(self, index):
94 | """
95 | (Override)
96 | confers a = geneList[0] behaviour
97 |
98 | This is a very slow way to access the data, and may be a little inconsistent in the things
99 | it returns.
100 |
101 | NOTE:
102 | a = genelist[0] # returns a single dict
103 | a = genelist[0:10] # returns a new 10 item normal python list.
104 | a = genelist["name"] returns a python list containing a vertical slice of all of the "name" keys
105 |
106 | """
107 | newl = False
108 | if isinstance(index, int):
109 | # this should return a single dictionary.
110 | return(self.linearData[index])
111 | elif isinstance(index, str):
112 | # returns all labels with that item.
113 | return(self._findAllLabelsByKey(index))
114 | elif isinstance(index, slice):
115 | # returns a new genelist corresponding to the slice.
116 | newl = self.shallowcopy()
117 | newl.linearData = utils.qdeepcopy(self.linearData[index]) # separate the data so it can be modified.
118 | newl._optimiseData()
119 | return(newl) # deep copy the slice.
120 |
121 | def __setitem__(self, index, *args):
122 | """
123 | (Override)
124 | Block key editing.
125 | """
126 | raise AssertionError
127 |
128 | def __hash__(self):
129 | """
130 | (Override)
131 |
132 | compute a sensible hash value
133 | """
134 | try:
135 | return(hash(self.name + str(self[0]) + str(self[-1]) + str(len(self)))) # hash data for comparison.
136 | except Exception:
137 | try:
138 | return(hash(self.name + str(self[0]) + str(self[-1]))) # len() probably not available (delayedlist?).
139 | except Exception: # I bet the list is empty.
140 | return(hash(self.name))
141 |
142 | def __add__(self, gene_list):
143 | """
144 | (Override)
145 | confer append like behaviour: c = a + b
146 | keeps duplicates (just concatenate's lists)
147 | """
148 | mkeys = self._collectIdenticalKeys(gene_list)
149 | if not mkeys: # unable to match.
150 | config.log.warning("No matching keys, the resulting list would be meaningless")
151 | return(False)
152 | newl = self.deepcopy()
153 | newl.linearData.extend(copy.deepcopy(gene_list.linearData))
154 | newl._optimiseData()
155 | return(newl)
156 |
157 | def __eq__(self, gene_list):
158 | """
159 | (Internal)
160 | Are the lists equivalent?
161 | lists now, must only have one identical key.
162 |
163 | This is just testing the keys...
164 | Wrong...
165 | """
166 | # check the hash's first to see if they are identical.
167 | # This is diabled as it can be very slow.
168 | #if self.__hash__() == gene_list.__hash__():
169 | # return(True)
170 |
171 | for key in self.linearData[0]:
172 | if key in gene_list.linearData[0]:
173 | return(True) # just one key in common required.
174 | return(False)
175 |
176 | def __ne__(self, gene_list):
177 | """
178 | (Internal)
179 | Are the lists equivalent?
180 | ie do they have the same keys?
181 | """
182 | return(not self.__eq__(gene_list))
183 |
def keys(self):
    """
    Return a list of the keys of the first item in this geneList.

    Not exhaustive: keys that only appear in later items are not reported.
    """
    first_item = self.linearData[0]
    return list(first_item)
189 |
190 | def _guessDataType(self, value):
191 | """
192 | (Internal)
193 |
194 | Take a guess at the most reasonable datatype to store value as.
195 | returns the resulting data type based on a list of logical cooercions
196 | (explain as I fail each cooercion).
197 | Used internally in _loadCSV()
198 | I expect this will get larger and larger with new datatypes, so it's here as
199 | as a separate function.
200 |
201 | Datatype coercion preference:
202 | float > list > int > location > string
203 | """
204 |
205 | try: # see if the element is a float()
206 | if "." in value: # if no decimal point, prefer to save as a int.
207 | return(float(value))
208 | else:
209 | raise ValueError
210 | except ValueError:
211 | try:
212 | # Potential error here if it is a list of strings?
213 | if '[' in value and ']' in value and ',' in value and '.' in value: # Probably a Python list of floats
214 | return([float(i) for i in value.strip(']').strip('[').split(',')])
215 | elif '[' in value and ']' in value and ',' in value: # Probably a Python list of ints
216 | return([int(i) for i in value.strip(']').strip('[').split(',')])
217 | else:
218 | raise ValueError
219 | except ValueError:
220 | try: # see if it's actually an int?
221 | return(int(value))
222 | except ValueError:
223 | try: # see if I can cooerce it into a location:
224 | return(location(loc=value))
225 | except (TypeError, IndexError, AttributeError, AssertionError, ValueError): # this is not working, just store it as a string
226 | return(str(value).strip())
227 | return("") # return an empty datatype.
228 | # I think it is possible to get here. If the exception at int() or float() returns something other than a
229 | # ValueError (Unlikely, Impossible?)
230 |
def _processKey(self, format, column):
    """
    (Internal)
    the inner part of _loadCSV() to determine what to do with the key.
    Better in here too for security.

    Builds and returns one item dict. For every key in `format`:
    - if the spec (format[key]) is a string containing "location", the spec
      is evaluated as Python code and the result stored;
    - otherwise the spec is used to index `column` and the value found there
      is passed through _guessDataType().
    """

    d = {}
    for key in format:
        if isinstance(format[key], str) and "location" in format[key]:
            # locations are very common, add support for them out of the box:
            # NOTE(review): eval() runs the spec as code and can see the local
            # names of this method (e.g. `column`) - only safe while format
            # specifiers come from trusted code, never from user input.
            d[key] = eval(format[key])
        else:
            d[key] = self._guessDataType(column[format[key]])

    return(d)
247 |
def save(self, filename=None, compressed=False):
    """
    **Purpose**

        Save the genelist as a binary representation (a Python pickle of
        this object).

        You can use this method to cache the list. It's particularly useful
        for large files that get processed once but are then used a lot;
        loading the list back into memory is relatively quick:

        list = glload("path/to/filename.glb")

        The generally used extension is glb, although you can use whatever
        you like.

    **Arguments**

        filename
            filename (and path, if you like) to save the file to

        compressed (Optional, default=False)
            use compression (not currently implemented; a warning is logged
            and the list is saved uncompressed)

    **Result**

        returns None
        Saves a binary representation of the geneList

        Raises AssertionError if no filename is given.
    """
    assert filename, "no filename specified"

    if compressed:
        config.log.warning("compression not currently implemented, saving anyway")

    # The context manager closes the handle even if pickling fails part-way.
    with open(filename, "wb") as oh:
        pickle.dump(self, oh, -1) # -1 = highest pickle protocol available

    config.log.info("Saved binary version of list: '%s'" % filename)
291 |
def from_pandas(self, pandas_data_frame):
    """
    **Purpose**

        Convert a pandas dataFrame to a genelist

        NOTE: This is an INPLACE method that will REPLACE any existing data
        in the genelist.

    **Arguments**

        pandas_data_frame (Required)
            The pandas data frame to convert. Each row becomes one item
            (a dict) keyed by the data frame's column names.

    **Result**
        None
        The object is populated with the rows of the data frame.

    """
    # linearData may still be None on a freshly constructed object; only warn
    # when there really is data that is about to be replaced.
    if self.linearData is not None and len(self) > 0:
        config.log.warning('genelist.from_pandas() will overwrite the existing data in the genelist')

    key_names = pandas_data_frame.columns
    self.linearData = [dict(zip(key_names, row)) for _, row in pandas_data_frame.iterrows()]
    self._optimiseData()

    config.log.info("genelist.from_pandas() imported dataFrame")
325 |
--------------------------------------------------------------------------------
/scTE/miniglbase/config.py:
--------------------------------------------------------------------------------
1 | """
2 | config.py
3 |
4 | config must be imported before any other glbase library.
5 |
6 | """
7 |
8 | import logging
9 |
# -------------- Versioning data
GLBASE_VERSION = "1.1105"

# -------------- General options

SILENT = False # set this to True to silence all glbase output. Only works at startup
DEBUG = True
do_logging = True

# flags for the availability of libraries
MATPLOTLIB_AVAIL = False # required
NUMPY_AVAIL = False # required
SCIPY_AVAIL = False # required
SKLEARN_AVAIL = False # required
H5PY_AVAIL = False # Optional.
NETWORKX_AVAIL = False # optional
PYDOT_AVAIL = False # optional
NUMEXPR_AVAIL = False # Optional
PYGRAPHVIZ_AVAIL = False # Optional

# Some simple options for printing genelists
NUM_ITEMS_TO_PRINT = 3 # number of items to print by default.
PRINT_LAST_ITEM = True

# size of buckets for collide() and overlap()
# If this is changed then glload will not work correctly.
bucket_size = 10000 # in bp - tested, seems a reasonable choice.

# -------------- set up the logger here.
# The root logger is configured at DEBUG; the package logger below is then
# restricted to INFO and above.
logging.basicConfig(level=logging.DEBUG,
    format='%(levelname)-8s: %(message)s',
    datefmt='%m-%d %H:%M')

log = logging.getLogger('glbase3')
log.setLevel(logging.INFO)
46 |
--------------------------------------------------------------------------------
/scTE/miniglbase/location.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | location.py
4 |
5 | part of glbase.
6 |
7 | This class is an internal class that implements a more convenient way to manipulate
8 | genomic coordinates.
9 |
10 | TODO:
11 | . add a 'in' code clause e.g.:
12 | if 1000 in location: (see if 1000 > left & < right)
13 | if a_location in b_location: (execute a collide())
14 |
15 | """
16 |
17 | import copy, pickle
18 |
class location:
    """
    A genomic interval: a chromosome plus left and right coordinates.

    The interval is held in self.loc as {"chr": str, "left": int, "right": int}
    and a cached "chrN:left-right" string is kept in self._loc_string.

    The modifying helpers (expand(), shrink(), pointify(), offset(), ...)
    never change the object in place; they return a modified copy.
    """
    def __init__(self, loc=None, chr=None, left=None, right=None):
        """
        Build a location from another location object, from a
        "chr1:1000-2000" style string (`loc`), or from separate `chr`,
        `left` and `right` values.
        """
        if isinstance(loc, location):
            # It's actually already a loc: copy its coordinates and leave.
            self.loc = copy.copy(loc.loc)
        elif loc:
            # ucsc includes commas, remove them so you can cut and paste
            t = loc.lower().replace(",", "").split(":")
            span = t[1].split("-")
            # NOTE(review): strip("chr") removes any leading/trailing 'c', 'h'
            # or 'r' characters, not just a "chr" prefix - confirm this is fine
            # for every chromosome/contig name in use.
            self.loc = {"chr": t[0].strip("chr").rstrip().upper(), "left": int(span[0]), "right": int(span[1])}
        else:
            self.loc = {"chr": str(chr).strip("chr").rstrip().upper(), "left": int(left), "right": int(right)}
        self.__update() # make sure the locstring is valid:

    def __eq__(self, other):
        """
        Compare against another location (chr, left and right must all match)
        or against a location string (commas ignored). Anything falsy
        compares unequal.
        """
        if other:
            if isinstance(other, str):
                return str(self) == str(other.replace(",", "")) # use string comparison.

            # Field by field comparison; raises if `other` has no .loc, as it
            # is then probably not a location.
            return (self.loc["chr"] == other.loc["chr"]
                and self.loc["left"] == other.loc["left"]
                and self.loc["right"] == other.loc["right"])
        return False

    def __lt__(self, other):
        """Make locations sortable: order by chromosome, then by left coordinate."""
        return (self.loc['chr'], self.loc['left']) < (other.loc['chr'], other.loc['left'])

    def __hash__(self):
        return hash(self._loc_string)

    def __deepcopy__(self, memo):
        return pickle.loads(pickle.dumps(self, -1)) # This is 2-3x faster and presumably uses less memory

    def __bool__(self):
        # A location is always truthy, even when its span (len()) is zero.
        return True

    def __repr__(self):
        return "<location %s>" % (self._loc_string)

    def __len__(self):
        """Span of the interval (right - left), never negative."""
        return max([0, self.loc["right"] - self.loc["left"]])

    def split(self, value=None):
        """Return (chr, left, right); the `value` argument is ignored completely."""
        return (self.loc["chr"], self.loc["left"], self.loc["right"])

    def __update(self):
        """Rebuild the cached "chrN:left-right" string from self.loc."""
        self._loc_string = None
        try:
            self._loc_string = "chr%s:%s-%s" % (self.loc["chr"].strip("chr"), self.loc["left"], self.loc["right"])
        except Exception: # chr possibly sets of strings ... etc.
            self._loc_string = "chr%s:%s-%s" % (self.loc["chr"], self.loc["left"], self.loc["right"])
        if not self._loc_string: # failed to make a valid string...
            raise ValueError("Bad location formatting")

    def __getitem__(self, key):
        """
        loc["string"] gives the location string, loc["dict"] the underlying
        dict; any other key ("chr", "left", "right") is looked up in that dict.
        """
        if key == "string":
            self.__update() # only update when accessed.
            return self._loc_string
        elif key == "dict":
            return self.loc
        return self.loc[key]

    def __setitem__(self, key, value):
        self.loc[key] = value
        self.__update()

    def __str__(self):
        return self._loc_string

    def _adjusted(self, left=0, right=0):
        """(Internal) Return a copy with `left`/`right` added to the respective coordinates."""
        new = copy.deepcopy(self)
        new.loc["left"] += left
        new.loc["right"] += right
        new.__update()
        return new

    # The methods below copy the location and send a modified version back.

    def expand(self, base_pairs):
        """Return a copy grown by base_pairs on both sides."""
        return self._adjusted(left=-base_pairs, right=base_pairs)

    def expandLeft(self, base_pairs):
        """Return a copy with the left coordinate moved out by base_pairs."""
        return self._adjusted(left=-base_pairs)

    def expandRight(self, base_pairs):
        """Return a copy with the right coordinate moved out by base_pairs."""
        return self._adjusted(right=base_pairs)

    def shrink(self, base_pairs):
        """Return a copy shrunk by base_pairs on both sides."""
        return self._adjusted(left=base_pairs, right=-base_pairs)

    def shrinkLeft(self, base_pairs):
        """Return a copy with the left coordinate moved in by base_pairs."""
        return self._adjusted(left=base_pairs)

    def shrinkRight(self, base_pairs):
        """Return a copy with the right coordinate moved in by base_pairs."""
        return self._adjusted(right=-base_pairs)

    def pointLeft(self):
        """
        get a new location at the exact left of the coordinate
        """
        new = copy.deepcopy(self)
        new.loc["right"] = new.loc["left"]
        new.__update()
        return new

    def pointRight(self):
        """
        get a new location at the exact right of the coordinate
        """
        new = copy.deepcopy(self)
        new.loc["left"] = new.loc["right"]
        new.__update()
        return new

    def pointify(self):
        """Return a new zero-width location at the centre (integer midpoint) of this one."""
        new = copy.deepcopy(self)
        centre = (self.loc["left"] + self.loc["right"]) // 2
        new.loc = {"chr": self.loc["chr"], "left": centre, "right": centre}
        new.__update()
        return new

    def collide(self, loc):
        """True if `loc` is on the same chromosome and the two intervals overlap or touch."""
        if loc["chr"] != self["chr"]:
            return False
        return self.qcollide(loc)

    def qcollide(self, loc):
        """
        **Purpose**
            perform a collision with another location object.
            This assumes you have already checked the locations are on the same chromosome.

        **Returns**
            True or False
        """
        return self.loc["right"] >= loc.loc["left"] and self.loc["left"] <= loc.loc["right"]

    def distance(self, loc):
        """
        **Purpose**
            calculate the distance between two locations.

        **Returns**
            an integer: this location's centre minus the other location's
            centre (so it may be negative). The chromosomes must be the same
            or an AssertionError is raised. distance() should not be used as
            a test for overlap. use collide() for that.
        """
        assert self["chr"] == loc["chr"], "chromosomes are not the same, %s vs %s" % (self, loc)
        return self.qdistance(loc)

    def qdistance(self, loc):
        """
        (Internal)
        distance() without the same-chromosome assert.
        """
        centreA = (self.loc["left"] + self.loc["right"]) // 2
        centreB = (loc["left"] + loc["right"]) // 2
        return centreA - centreB

    def __sub__(self, loc):
        """
        **Purpose**
            Allow things like:

            distance = locA - locB
        """
        return self.distance(loc)

    def offset(self, base_pairs):
        """
        get a new location offset from the left coordinate by n base pairs
        returns a point location.
        """
        new = copy.deepcopy(self)
        new.loc["left"] += base_pairs
        new.loc["right"] = new.loc["left"]
        new.__update()
        return new

    def keys(self):
        """
        Get the keys
        """
        return list(self.loc)
234 |
if __name__ == "__main__":
    # Micro-benchmark, only run when this file is executed as a script:
    # times building a location from a string and calling pointify() on it.
    import timeit

    s = "a = location(loc='chr1:1000-2000').pointify()"
    # The setup statement imports the class by module name, so this expects
    # `location` to be importable as a top-level module.
    t = timeit.Timer(s, "from location import location")
    # 100,000 passes; report the mean time per pass in microseconds.
    print("%.2f usec/pass" % (1000000 * t.timeit(number=100000)/100000))
--------------------------------------------------------------------------------
/scTE/miniglbase/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities
3 |
4 | Various utilities to support the genome scanning scripts.
5 |
6 | Many of these predate glbase3, but are a little tricky to remove as I am not sure where
7 | they are used (if at all).
8 |
9 | So excuse the terrible code in places. I will deprecate occasional functions from this.
10 |
11 | R=[AG], Y=[CT], K=[GT], M=[AC], S=[GC], W=[AT], and the four-fold
12 | degenerate character N=[ATCG]
13 | 3-fold degenerate motifs are not used like the Lander paper.
14 |
15 | """
16 |
17 | import sys, os, pickle
18 |
19 | from . import config
20 |
def glload(filename):
    """
    **Purpose**
        Load a glbase binary file
        (Actually a Python pickle)

        NOTE: unpickling can execute arbitrary code, so only load files that
        come from a trusted source.

    **Arguments**
        filename (Required)
            the filename of the glbase binary file to load.

    **Returns**
        The glbase object previously saved as a binary file

    **Raises**
        AssertionError if the file does not exist.
        pickle.UnpicklingError if the file is not a valid pickle.
    """
    path = os.path.realpath(filename)
    assert os.path.exists(path), "File '%s' not found" % filename

    try:
        # The context manager closes the handle even when unpickling fails.
        with open(path, "rb") as oh:
            newl = pickle.load(oh)
    except pickle.UnpicklingError as err:
        raise pickle.UnpicklingError("File '%s' is not a valid glbase binary file" % filename) from err

    # Recalculate the _optimiseData for old lists, and new features.
    # Any failure while probing the expected attributes means the file
    # predates them, so the lookup structures are rebuilt.
    try:
        if newl.qkeyfind:
            pass
        keys = list(newl.keys())
        if "loc" in keys or "tss_loc" in keys: # buckets are only present if a loc key is available.
            if newl.buckets: # added in 0.381, only in objects with tss_loc or loc key.
                pass
    except Exception:
        config.log.warning("Old glb format, will rebuild buckets and/or qkeyfind, consider resaving")
        newl._optimiseData()

    try:
        cons = len(newl._conditions) # expression-like object
        config.log.info("Loaded '%s' binary file with %s items, %s conditions" % (filename, len(newl), cons))
    except AttributeError:
        config.log.info("Loaded '%s' binary file with %s items" % (filename, len(newl)))
    return newl
60 |
--------------------------------------------------------------------------------
/scTE/scatacseq.py:
--------------------------------------------------------------------------------
1 | '''
2 |
3 | The scATAC-seq data comes as three files, P1, P2 and the barcode, and there is no UMI
4 |
5 | You can just align P1 and P2 with your favourite aligner (we prefer STAR with these settings):
6 |
7 | ****
8 | teopts=' --outFilterMultimapNmax 100 --winAnchorMultimapNmax 100 --outSAMmultNmax 1 --outSAMtype BAM SortedByCoordinate --twopassMode Basic --outWigType wiggle --outWigNorm RPM'
9 | opts='--runRNGseed 42 --runThreadN 12 --readFilesCommand zcat '
10 |
11 | genome_mm10='--genomeDir mm10_gencode_vM21_starsolo/SAindex'
12 | genome_hg38='--genomeDir hg38_gencode_v30_starsolo/SAindex'
13 |
14 | # p1 = read
15 | # p2 = barcode and UMI
16 | # Make sure you set the correct genome index;
17 | STAR $opts $teopts $genome_hg38 --outFileNamePrefix ss.${out} --readFilesIn ${p1} ${p2}
18 | ****
19 |
20 | This script will then reprocess the BAM file, and put the BARCODE into CR SAM tag and spoof a UMI
21 |
22 | The UMI is generated by incrementing the sequence, so there are up to 4^14 (~268 million) distinct UMIs.
23 | I guess there remains a chance of a clash, but it should be so rare as to be basically impossible.
24 |
25 | Require pysam
26 |
27 |
28 | See also: bin/pack_scatacseq
29 |
30 | '''
31 |
32 | import sys,os
33 | import gzip
34 | import argparse
35 | import logging
36 | import dbm
37 | import time
38 | import random
39 |
40 | try:
41 | import pysam
42 | except ImportError:
43 | pass # fail silently
44 |
def generate_mismatches(seq):
    """
    **Purpose**
        Generate all 1 bp mismatches for the sequence

    **Arguments**
        seq (Required)
            the sequence (string) to mutate

    **Returns**
        A set of strings, each the same length as seq, with exactly one
        position replaced by each of A, C, G and T in turn. Because each
        position is also "replaced" by its own base, seq itself is part of
        the set when it only contains A/C/G/T. An empty seq gives an empty
        set.
    """
    # seq[pos+1:] skips the substituted base, so the rest of the sequence is
    # kept unshifted.
    return {
        seq[:pos] + base + seq[pos+1:]
        for pos in range(len(seq))
        for base in "ACGT"
    }
56 |
def fastq(file_handle):
    """
    Generator object to parse a FASTQ file

    Reads the (text mode) handle four lines at a time and yields one dict per
    record with the keys "name", "seq", "strand" and "qual", each stripped of
    surrounding whitespace.

    Iteration stops at the first empty name line, i.e. at the end of the
    file or at a blank line; no record is yielded for it.
    """
    while True:
        name = file_handle.readline().strip()
        if name == "":
            return # end of file (or a blank line): nothing more to yield

        seq = file_handle.readline().strip()
        strand = file_handle.readline().strip()
        qual = file_handle.readline().strip()

        yield {"name": name, "strand": strand, "seq": seq, "qual": qual}
71 |
def library(args):
    """
    Sequence generator iterator

    `args` is a sequence of "choices", one per position (each an iterable of
    strings, e.g. "ACGT" or ["A"]). Yields every string that can be made by
    picking one choice per position, varying the last position fastest.
    An empty `args` yields a single empty string.
    """
    if not args:
        yield ""
        return

    choices, remaining = args[0], args[1:]
    for lead in choices:
        yield from (lead + tail for tail in library(remaining))
84 |
def atacBam2bed(filename, out, CB, UMI, noDup, num_threads):
    """
    Convert a BAM file to a gzipped BED-like file by running a shell pipeline
    (bamToBed -bedpe | awk | sed | gzip) through os.system(). The result is
    written to <out>_scTEtmp/o1/<out>.bed.gz.

    filename
        BAM file to convert; also supplies the sample name (base name
        without '.bam').
    out
        prefix of the output directory and output file.
    CB
        if false, the sample name is written in the fourth column; otherwise
        the seventh awk field is.
    noDup
        if true, duplicated output lines are removed (awk '!x[$0]++').
    UMI, num_threads
        not used by this function.

    NOTE(review): the arguments are interpolated into a shell command without
    quoting - only safe for trusted file names.
    """
    sample = filename.split('/')[-1].replace('.bam','')

    # Mac OSX has BSD sed
    switch = '-E' if sys.platform == 'darwin' else '-r'

    if not CB and noDup:
        # Put the sample name in the barcode slot
        command = 'bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, sample,switch, out, out)
    elif not CB:
        command = 'bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, sample,switch, out, out)
    elif noDup:
        command = 'bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,$7}\' | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out)
    else:
        command = 'bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,$7}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out)

    os.system(command)
106 |
def para_atacBam2bed(filename, CB, out, noDup):
    """
    Per-file variant of the BAM to BED conversion: makes sure
    <out>_scTEtmp/o0 exists, then runs a bamToBed | awk | sed | gzip shell
    pipeline through os.system().

    filename
        BAM file to convert; also supplies the sample name (base name
        without '.bam').
    CB
        if false, the sample name is written in the fourth column and the
        output file is <out>_scTEtmp/o0/<sample>.bed.gz.
    out
        prefix of the output directory (and, when CB is set, of the output file).
    noDup
        if true, duplicated output lines are removed (awk '!x[$0]++').

    NOTE(review): the arguments are interpolated into a shell command without
    quoting - only safe for trusted file names.
    """
    # Create the output directory (no error if it already exists).
    os.makedirs('%s_scTEtmp/o0' % out, exist_ok=True)

    sample=filename.split('/')[-1].replace('.bam','')

    if sys.platform == 'darwin': # Mac OSX has BSD sed
        switch = '-E'
    else:
        switch = '-r'

    if not CB:
        if noDup:
            os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o0/%s.bed.gz' %(filename, sample, switch, out, sample))
        else:
            os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,"%s"}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o0/%s.bed.gz' %(filename, sample, switch, out, sample))
    else:
        if noDup:
            # NOTE(review): unlike the other branches this one runs bamToBed
            # without -bedpe, prints fields 1-4 and writes into o1/ (which
            # this function does not create) - confirm this is intended.
            os.system('bamToBed -i %s | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$3,$4}\' | sed %s \'s/^chr//g\' | awk \'!x[$0]++\' | gzip -c > %s_scTEtmp/o1/%s.bed.gz' % (filename, switch, out, out))
        else:
            os.system('bamToBed -i %s -bedpe | awk -F ["\t":] \'{OFS="\t"}{print $1,$2,$6,$7}\' | sed %s \'s/^chr//g\' | gzip -c > %s_scTEtmp/o0/%s.bed.gz' % (filename, switch, out, out))
129 |
def load_expected_whitelist(filename, logger):
    """
    **Purpose**
        Load the expected whitelist and output a set

    **Arguments**
        filename (Required)
            text file with one barcode per line

        logger (Required)
            logger used to report how many barcodes were found

    **Returns**
        A set of the (whitespace-stripped) lines of the file
    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'rt') as oh:
        expected_whitelist = {line.strip() for line in oh}

    logger.info('Found {0:,} expected barcodes'.format(len(expected_whitelist)))

    return expected_whitelist
147 |
def build_barcode_dict(barcode_filename, save_whitelist=False, expected_whitelist=False,
    gzip_file=True, logger=False, ondisk=True):
    '''
    **Purpose**
        The BAM and the FASTQ are not guaranteed to be in the same order, so I need to make a look up for
        the read ID and the barcode

    **Arguments**
        barcode_filename (Required)
            FASTQ file holding the barcode reads

        save_whitelist (Optional, default=False)
            filename to save out the whitelist of barcodes (i.e. the ones actually observed)

        expected_whitelist (Optional, default=False)
            collection of expected barcodes. If given, barcodes not in it are
            corrected to a whitelisted barcode one mismatch away (the first
            one found), or discarded if there is none.

        gzip_file (Optional, default=True)
            open barcode_filename with gzip

        logger (Optional, default=False)
            logger for progress messages; a module logger is used if omitted

        ondisk (Optional, default=True)
            keep the lookup in a temporary dbm file in the current directory
            instead of an in-memory dict. NOTE: dbm returns keys and values
            as bytes.

    **Returns**
        A tuple (barcode_lookup, expected_whitelist, tmpfilename):
        barcode_lookup maps read name : barcode, tmpfilename is the name
        passed to dbm.open() (None when ondisk=False).
    '''
    assert barcode_filename, 'barcode_filename is required'

    if not logger:
        logger = logging.getLogger(__name__)

    def as_text(value):
        # dbm hands back bytes, the in-memory dict hands back str.
        return value.decode() if isinstance(value, bytes) else value

    if expected_whitelist:
        logger.info('Checking against the expected whitelist and correcting barcodes')
    else:
        logger.warning('Not checking the barcodes against an expected whitelist, barcodes will not be corrected')

    bad_barcodes = 0
    rescued_barcodes = 0
    reads_seen = 0

    if ondisk:
        # Only the base name goes into the temporary name, so an input path
        # with directories still gives a valid file name in the current directory.
        tmpfilename = './tpm_{0:}_{1:}_{2:}.dbm'.format(os.path.basename(barcode_filename), time.time(), random.randint(0, 10000))
        barcode_lookup = dbm.open(tmpfilename, 'n')
    else:
        tmpfilename = None
        barcode_lookup = {}

    opener = gzip.open if gzip_file else open

    with opener(barcode_filename, 'rt') as oh:
        for fq in fastq(oh):
            if not fq['name']:
                continue # not a real record (e.g. an empty record at the end of the file)

            reads_seen += 1
            if reads_seen % 10000000 == 0:
                logger.info('Processed: {:,} barcode reads'.format(reads_seen))

            barcode = fq['seq']
            if 'N' in barcode: # Discard this barcode
                bad_barcodes += 1
                continue

            if expected_whitelist and barcode not in expected_whitelist:
                # barcode not in the whitelist
                # see if we can rescue it:
                corrected = next((mm for mm in generate_mismatches(barcode) if mm in expected_whitelist), None)
                if corrected is None:
                    bad_barcodes += 1 # unrecoverable
                    continue
                barcode = corrected
                rescued_barcodes += 1

            name = fq['name'].split(' ')[0].lstrip('@') # Any other types seen?
            barcode_lookup[name] = barcode

    # Not every dbm backend provides .values(), so the barcodes are collected
    # by key.
    read_names = barcode_lookup.keys()
    valid_barcodes = {as_text(barcode_lookup[k]) for k in read_names}

    logger.info('Processed: {:,} barcode reads from the FASTQ'.format(reads_seen))
    logger.info('Bad reads with no barcode {:,} reads'.format(bad_barcodes))
    logger.info('Rescued {:,} reads'.format(rescued_barcodes))
    logger.info('Found {:,} valid reads'.format(len(read_names)))
    logger.info('Found {:,} valid barcodes'.format(len(valid_barcodes)))

    if save_whitelist:
        logger.info('Saved whitelist: {0}'.format(save_whitelist))
        with open(save_whitelist, 'wt') as whitelist_out:
            for k in sorted(valid_barcodes):
                whitelist_out.write('%s\n' % (k))

    return barcode_lookup, expected_whitelist, tmpfilename
230 |
def parse_bam(infile, barcode_lookup, outfile, barcode_corrector, logger):
    """
    **Purpose**
        Parse the BAM file and insert the CR tag (the cell barcode looked up
        by read name) into every read, writing reads out in pairs.

    **Arguments**
        infile (Required)
            sequence whose first element is the input BAM filename

        barcode_lookup (Required)
            mapping of read name : barcode (see build_barcode_dict())

        outfile (Required)
            filename of the BAM file to write

        barcode_corrector
            not used by this function

        logger (Required)
            logger for progress and summary messages

    **Returns**
        None

    Reads are skipped when they are unpaired, when their aligned length is
    over 1000, or when their name is not in barcode_lookup. A read is only
    written once its mate has been seen too; reads whose mate never turns up
    are not written.
    """
    inbam = pysam.AlignmentFile(infile[0], 'rb')
    outbam = pysam.AlignmentFile(outfile, 'wb', template=inbam)

    total_reads = 0
    not_paired = 0 # unpaired ATAC
    no_matching_barcode = 0 # No matching read:barcode pair
    pairs_too_far_apart = 0
    matched = 0 # reads that received a barcode

    quick_lookup = {}

    try:
        for read in inbam:
            total_reads += 1
            if total_reads % 10000000 == 0:
                logger.info('Processed: {:,} reads'.format(total_reads))

            if not read.is_paired:
                not_paired += 1
                continue

            # NOTE(review): this tests the aligned length of the read itself,
            # although the counter is reported as "pairs too far apart" - confirm.
            if read.query_alignment_length > 1000:
                pairs_too_far_apart += 1
                continue

            # Add the barcode:
            # See if the read is in the lookup:
            if read.query_name not in barcode_lookup:
                no_matching_barcode += 1
                continue

            barcode = barcode_lookup[read.query_name]
            if isinstance(barcode, bytes): # an on-disk (dbm) lookup returns bytes
                barcode = barcode.decode()
            # NOTE(review): the tag is named 'CR:Z' here and set_tags() is
            # given only this one tag - confirm against the pysam documentation
            # how that name is interpreted and what happens to the read's other tags.
            read.set_tags([('CR:Z', barcode),])
            matched += 1

            # The BAM file is not guaranteed to be in order, but the pairs should be pretty close, so I just need to check for the other pair on a simple lookup list
            # and only write out the pairs once I got two
            mate = quick_lookup.pop(read.query_name, None)
            if mate is None:
                # no pair yet, store it for later
                quick_lookup[read.query_name] = read
            else:
                outbam.write(read)
                outbam.write(mate)
    finally:
        inbam.close()
        outbam.close()

    matched_percent = (matched / total_reads * 100.0) if total_reads else 0.0

    logger.info('Processed {:,} reads from the BAM'.format(total_reads))
    logger.info('{:,} reads were unpaired'.format(not_paired))
    logger.info('{:,} read pairs were too far apart'.format(pairs_too_far_apart))
    logger.info('Matched {0:,} ({1:.1f}%) reads to a barcode'.format(matched, matched_percent))
    logger.info('Save BAM ouput file: {0}'.format(outfile))
    return
294 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import glob,os
3 |
def readme():
    """Return the full text of README.md (read as UTF-8, path relative to the working directory)."""
    handle = open('README.md', encoding="utf-8")
    try:
        return handle.read()
    finally:
        handle.close()
7 |
# Package definition: installs the scTE package (with its bundled miniglbase
# sub-package) together with the command line scripts from bin/.
setup(name='scTE',
    version='1.0',
    description='Tool for estimating differential enrichment of Transposable Elements and other highly repetitive regions in single-cell data',
    long_description=readme(), # README.md is used verbatim as the long description
    classifiers=[
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ],
    python_requires=">=3.6",
    keywords='..',
    url='..',
    author='..',
    author_email='he_jiangping@grmh-gdl.cn; andrewh@sustech.edu.cn',
    license='..',
    packages=[
        'scTE',
        'scTE.miniglbase',
    ],
    platforms=[
        'Linux',
        'MacOS'
    ],
    # NOTE(review): argparse is part of the standard library for the Python
    # versions accepted above - confirm it still needs to be listed.
    install_requires=[
        'argparse','scipy','pandas',
        'numpy','anndata',
    ],
    include_package_data=True,
    zip_safe=False,
    # Executables installed alongside the package.
    scripts=[
        'bin/scTE',
        'bin/scTE_build',
        'bin/scTEATAC_build',
        'bin/scTEATAC',
    ]
)
43 |
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 |
# Step 1: build an index for mm10 from the bundled TE and gene annotations (mode: exclusive).
scTE_build -g mm10 -te Data/TE.bed -gene Data/Gene.gtf -o Data/test -m exclusive

# Step 2: run scTE on the bundled BAM with 12 threads, using Data/test.exclusive.idx
# (presumably the index written by step 1 - confirm).
scTE -i Data/test.bam -p 12 --min_genes 1 -o out --genome mm10 -x Data/test.exclusive.idx

# Alternative index build with mode nointron (disabled).
#scTE_build -g mm10 -te Data/TE.bed -gene Data/Gene.gtf -o Data/test -m nointron
7 |
8 |
--------------------------------------------------------------------------------